# Introduction

Hyperparameter tuning has always been an iterative process that can take hours, and might even take weeks for a beginner. For example, scikit-learn's GradientBoostingClassifier has more than 13 parameters to tweak. Without enough experience, things can get dreary pretty quickly.

In this notebook, we will implement a genetic evolutionary algorithm (or, in short, an evolutionary algorithm) to tune the hyperparameters of a GradientBoostingClassifier model, making model training less of a manual, iterative process and more of an automatic one.

There are 4 types of search methods, namely:
1. Manual search
2. Grid search
3. Random search
4. Genetic search

Benefits of genetic search over the rest of the search methods are:
1. Faster and completely automatic convergence towards a global minimum
2. Better scores due to iterating over a denser search space
3. Fewer parameters to handle, reducing the search space

During manual tuning, you will normally only set hyperparameters to 1 or 2 decimal places, like learning_rate=0.01, but with genetic algorithms you will be able to train with more precise values, like 0.0123, that might result in better model performance.

Drawbacks of genetic search:
1. Possibility of premature convergence to a local minimum
2. A large population might take as long as grid search
3. Takes more code to implement

In [None]:
import pandas as pd
import numpy as np

import re, os

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm

# Competition input directory; the three CSV files below are read from it
PATH = '../input/tabular-playground-series-apr-2021'
train, test, submission = (
    pd.read_csv(os.path.join(PATH, file_name))
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Preprocessing

In [None]:
def ageImputer(Id):
    """Return the age for row ``Id`` of ``data``, imputing it when missing.

    Reads the module-level frames ``data`` (the frame being cleaned) and
    ``train`` (the reference frame for the statistics). An ``Age`` of 0 is
    the placeholder for a missing value.

    Parameters
    ----------
    Id : label used to index ``data['Age']``, ``data['Pclass']`` and
        ``data['Sex']``.

    Returns
    -------
    The stored age when it is not 0; otherwise the median ``Age`` of the
    ``train`` rows that share the row's ``Pclass`` and ``Sex``. The result is
    NaN when no such row has a known age.
    """
    age = data['Age'][Id]
    if age != 0:
        return age

    pclass = data['Pclass'][Id]
    sex = data['Sex'][Id]
    # Use the row's own class/sex group (third class used to borrow the
    # first-class medians) and leave the 0 placeholders out of the median so
    # missing ages do not drag it down.
    same_group = (
        (train['Pclass'] == pclass)
        & (train['Sex'] == sex)
        & (train['Age'] != 0)
    )
    return train.loc[same_group, 'Age'].median()

def fareImputer(Id):
    """Return the fare for row ``Id`` of ``data``, imputing it when missing.

    Reads the module-level frames ``data`` (the frame being cleaned) and
    ``train`` (the reference frame for the statistics). A ``Fare`` of 0 is
    the placeholder for a missing value.

    Parameters
    ----------
    Id : label used to index ``data['Fare']`` and ``data['Pclass']``.

    Returns
    -------
    The stored fare when it is not 0; otherwise the mean ``Fare`` of the
    ``train`` rows in the same ``Pclass``. The result is NaN when no such row
    has a known fare.
    """
    fare = data['Fare'][Id]
    if fare != 0:
        return fare

    pclass = data['Pclass'][Id]
    # Leave the 0 placeholders out of the mean so that missing fares do not
    # pull the imputed value down.
    same_class = (train['Pclass'] == pclass) & (train['Fare'] != 0)
    return train.loc[same_class, 'Fare'].mean()

def cabin_map(x):
    """Map the cabin letters "T", "G" and "F" to "E"; return anything else unchanged."""
    return "E" if x in ("T", "G", "F") else x

# Ticket discretization rules, tried in order: the first pattern found anywhere
# in the raw ticket text decides the label. The prefix patterns are kept
# verbatim (including their unescaped dots) so tickets fall into the same
# categories as before.
_TICKET_PREFIX_RULES = [
    (r"^(PC)", "PC"),
    (r"(A.|A/5.|A/4|A/4.|A4)", "A"),
    (r"(STON|SOTON)", "SOTON"),
    (r"(W./C.|W/C)", "WC"),
    (r"(F.C.)", "FCC"),
    (r"(SC|C|SO/C|S.O.C.|SCO)", "SOC"),
    (r"(S.W.|SW)", "SW"),
    (r"(S.O./P|S.P|S.O.P.|PP)", "SOP"),
    (r"(WE|W.E.P.)", "WEP"),
    (r"(Fa|FA)", "FA"),
    (r"(LP|Lp)", "LP"),
]

# Purely numeric tickets: leading digit -> how many digits may follow it.
# The label is the leading digit plus one "x" per following digit.
_TICKET_NUMERIC_TAILS = {
    1: (4, 5),
    2: (4, 5),
    3: (3, 4, 5, 6),
    4: (3, 4, 5, 6),
    5: (3, 4, 5),
    6: (3, 4),
    7: (3, 4),
    8: (3,),
    9: (3,),
}

# Compile every rule once instead of once per row.
_TICKET_RULES = [
    (re.compile(pattern), label) for pattern, label in _TICKET_PREFIX_RULES
]
for _lead, _tails in _TICKET_NUMERIC_TAILS.items():
    for _n in _tails:
        _TICKET_RULES.append(
            (re.compile(r"^[%d][\d]{%d}$" % (_lead, _n)), f"{_lead}{'x' * _n}")
        )


def _discretize_ticket(text):
    """Return the category label of one raw ticket string.

    The text is returned unchanged when no rule matches (this includes the
    "M" placeholder used for missing tickets).
    """
    for pattern, label in _TICKET_RULES:
        if pattern.search(text):
            return label
    return text


ticket_bins = [0, 2500, 5000, 7500, 10000, 12500, 15000, 17500]

# Clean both frames in place. `data` must stay a module-level name because
# ageImputer / fareImputer read it.
for name, data in zip(['train', 'test'], [train, test]):
    # Sex
    data['Sex'] = pd.Categorical(data['Sex']).codes
    # Pclass
    data['Pclass'] = data['Pclass'].astype(np.uint8)
    # Age: 0 marks a missing value for ageImputer
    data['Age'] = data['Age'].fillna(0)
    tqdm.pandas(desc=f"{name} Age Imputation")
    data['Age'] = data.reset_index()['index'].progress_apply(ageImputer)
    # Parch/SibSp
    data['familySize'] = data['Parch'] + data['SibSp'] + 1
    # familySize counts the passenger too, so travelling alone means size 1
    # (the previous `== 0` test could never be true).
    data['isAlone'] = (data['familySize'] == 1).astype(np.int8)
    # Fare: 0 marks a missing value for fareImputer. The log transform is
    # applied after this loop so that the test fares are imputed from raw
    # train fares rather than from already log-transformed ones.
    data['Fare'] = data['Fare'].fillna(0)
    tqdm.pandas(desc=f"{name} Fare Imputation")
    data['Fare'] = data.reset_index()['index'].progress_apply(fareImputer)
    # Cabin
    data['Cabin'] = data['Cabin'].fillna("M").str[0]
    data['Cabin'] = data['Cabin'].apply(cabin_map)
    # Embarked
    data['Embarked'] = data['Embarked'].fillna(train['Embarked'].mode()[0])
    # Name
    data['Name'] = data['Name'].str.split(', ', expand=True)[1]
    # Ticket: classify every raw ticket exactly once. The previous per-row
    # Series.replace was quadratic and re-classified labels it had already
    # written (e.g. "WC" and "FCC" were later turned into "SOC").
    data['Ticket'] = data['Ticket'].fillna('M').astype(str)
    tqdm.pandas(desc=f"{name} Ticket Discretization")
    data['Ticket'] = data['Ticket'].progress_apply(_discretize_ticket)
    # Bin how often each ticket category occurs within this frame
    data['TicketFreq'] = data['Ticket'].map(data['Ticket'].value_counts().to_dict())
    data['TicketFreq'] = pd.cut(data['TicketFreq'], ticket_bins, labels=range(len(ticket_bins)-1))

# Log-transform the fares only once both frames have been imputed.
for data in (train, test):
    data['Fare'] = np.log1p(data['Fare'])

# Outliers: keep only the training rows whose Fare value is at least 1
train = train[train['Fare'] >= 1]

# Bin Age on both frames into ten-unit intervals labelled 0..8
age_bins = list(range(0, 100, 10))
age_labels = range(len(age_bins) - 1)
for data in (train, test):
    data['Age'] = pd.cut(data['Age'], age_bins, labels=age_labels)

# Stack train on top of test so both frames are encoded together;
# the first n_train rows of the stacked frame are the training rows.
n_train = train.shape[0]
data = pd.concat([train, test], axis=0).reset_index(drop=True)
data = data.drop(columns=['PassengerId'])

# Label encode
for feature in ("Cabin", "Embarked", "Name"):
    data[feature] = pd.Categorical(data[feature]).codes

# One-hot encode
data = pd.get_dummies(data, columns=["Sex", "Cabin", 'Ticket', "Embarked"])

# Split back into the two frames; the test part carries no target column
train = data.iloc[:n_train, :]
test = data.iloc[n_train:, :].reset_index(drop=True).drop(columns=["Survived"])

print("train shape:", train.shape)
print("test shape:", test.shape)

# Persist the cleaned frames (train_cleaned.csv is read back by the tuning class)
train.to_csv("train_cleaned.csv", index=False)
test.to_csv("test_cleaned.csv", index=False)

In [None]:
# One histogram per feature on a 3x2 grid, coloured by the Survived column
plotted_features = ['Age', 'Fare', 'familySize', 'Pclass', 'isAlone', 'TicketFreq']
plt.figure(figsize=(10, 10))
for position, feature in enumerate(plotted_features, start=1):
    plt.subplot(3, 2, position)
    sns.histplot(x=train[feature], hue=train['Survived'], discrete=True)

# Hyperparameter Tuning

#### Gradient Boosting Classifier (scikit-learn `GradientBoostingClassifier`)

|Features|Options|Default|Bounds|
|:-|:-:|:-:|:-:|
|loss|{'deviance', 'exponential'}|'deviance'|0.0 to 1.0|
|learning_rate|float|0.1|0.1 to 0.5|
|n_estimators|Int|100|100 to 250|
|subsample|float|1.0|0.1 to 1.0|
|criterion|{"friedman_mse", "mse"}|"friedman_mse"|0.0 to 1.0|
|min_samples_split|Int or float|2|2 to 10|
|min_samples_leaf|Int or float|1|1 to 10|
|min_weight_fraction_leaf|float|0.0|0.0 to 0.5|
|max_depth|Int|3|2 to 100|
|min_impurity_decrease|Float|0.0|0.0 to 1.0|
|max_features|{"auto", "sqrt", "log2", None}|None|0.0 to 3.0|
|max_leaf_nodes|Int|None|2 to 100|
|ccp_alpha|Float|0.0|0.0 to 1.0|

## Tuning functions

For this module, we will be designing our individuals that will make up our simulated population.

Our individuals will have chromosomes that are made up of an array of random float numbers. These float numbers are the settings of the hyperparameters that we will be testing with.

We have also incorporated 5-fold stratified cross-validation, and we will be using the mean accuracy score across the folds as our fitness function.

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

class XGBGeneticTuning:
    """Score GradientBoostingClassifier hyperparameters with cross-validation.

    A candidate is a flat sequence of 13 numbers. ``convertParams`` decodes it
    into estimator settings and ``getAccuracy`` returns the mean accuracy over
    ``NUM_FOLDS`` stratified folds of the data in ``./train_cleaned.csv``.
    """

    NUM_FOLDS = 5

    # Estimator keyword names, in the order convertParams returns their values
    _PARAM_NAMES = (
        "loss",
        "learning_rate",
        "n_estimators",
        "subsample",
        "criterion",
        "min_samples_split",
        "min_samples_leaf",
        "min_weight_fraction_leaf",
        "max_depth",
        "min_impurity_decrease",
        "max_features",
        "max_leaf_nodes",
        "ccp_alpha",
    )

    def __init__(self, randomSeed):
        """Load the training data and build the shuffled, seeded fold splitter."""
        self.randomSeed = randomSeed
        self.init_data()
        self.skf = StratifiedKFold(
            n_splits=self.NUM_FOLDS,
            shuffle=True,
            random_state=self.randomSeed,
        )

    def init_data(self):
        """Read ./train_cleaned.csv; the first column becomes y, the rest X."""
        frame = pd.read_csv("./train_cleaned.csv")
        self.data = frame
        self.X = frame.iloc[:, 1:]
        self.y = frame.iloc[:, 0]

    def convertParams(self, params):
        """Decode a 13-element candidate into a tuple of estimator settings.

        Positions 0, 4 and 10 are rounded and used as indices into option
        lists; positions 2, 5, 6, 8 and 11 are rounded to integers; the
        remaining positions are passed through unchanged.
        """
        # NOTE(review): the option strings must be ones the installed
        # scikit-learn accepts -- verify against its documentation.
        def pick(options, gene):
            return options[round(gene)]

        return (
            pick(["deviance", "exponential"], params[0]),
            params[1],
            round(params[2]),
            params[3],
            pick(["friedman_mse", "mse", "mae"], params[4]),
            round(params[5]),
            round(params[6]),
            params[7],
            round(params[8]),
            params[9],
            pick(["auto", "sqrt", "log2", None], params[10]),
            round(params[11]),
            params[12],
        )

    def getAccuracy(self, params):
        """Return the mean cross-validated accuracy of the decoded candidate.

        The classifier built from ``params`` is stored on ``self.classifier``
        and refitted on every fold.
        """
        settings = dict(zip(self._PARAM_NAMES, self.convertParams(params)))
        self.classifier = GradientBoostingClassifier(
            random_state=self.randomSeed,
            **settings,
        )

        fold_scores = []
        for fit_rows, held_out_rows in self.skf.split(self.X, self.y):
            self.classifier.fit(self.X.iloc[fit_rows, :], self.y.iloc[fit_rows])
            predictions = self.classifier.predict(self.X.iloc[held_out_rows, :])
            fold_scores.append(
                accuracy_score(self.y.iloc[held_out_rows], predictions)
            )

        return np.mean(fold_scores)

    def formatParams(self, params):
        """Return the decoded candidate as one human-readable line."""
        template = "'loss'=%s, 'learning_rate'=%1.3f, 'n_estimators'=%3d, 'subsample'=%1.3f, 'criterion'=%s, 'min_samples_split'=%3d, 'min_samples_leaf'=%3d, 'min_weight_fraction_leaf'=%1.3f, 'max_depth'=%3d, 'min_impurity_decrease'=%1.3f, 'max_features'=%s, 'max_leaf_nodes'=%3d, 'ccp_alpha'=%1.3f"
        return template % self.convertParams(params)

## Genetic Operators

In this section, we will be designing the genetic flow using DEAP framework. It mainly consists of 5 stages; 

1. Population Generator
2. Selection
3. Crossover/Mating
4. Mutation
5. Evaluation

Stage 1 is where we initialize the population with random individuals. They will then be evaluated for their fitness, followed by a selection phase in stage 2 to obtain the best candidates (parents) for crossover/mating. Afterwards, in stage 3, the parents are crossed over to produce offspring that combine the traits of their parents, and in stage 4 those offspring are further mutated. Lastly, in stage 5, the newly generated population will be evaluated for its fitness.

This sums up the genetic flow for a single generation and will be repeated for a number of generations.

We will be using a tournament mechanism where n-individuals will be put into a survival-of-the-fittest setting where the best individuals will survive till the next generation. The individuals will be evaluated using their computed fitness functions while weak individuals will be dropped off at the end of each bout.

In [None]:
from deap import base, creator, tools, algorithms

import random

# Shared DEAP toolbox that the operators below are registered on
toolbox = base.Toolbox()

# Evolution settings
POPULATION_SIZE = 30
MAX_GENERATIONS = 30
HALL_OF_FAME_SIZE = 5
P_CROSSOVER = 0.9
P_MUTATION = 0.5
CROWDING_FACTOR = 20.0

# Search space: one (low, high) pair per gene, in the order
# XGBGeneticTuning.convertParams reads them
_GENE_BOUNDS = [
    (0.0, 1.0),   # loss (rounded to an option index)
    (0.1, 0.5),   # learning_rate
    (100, 250),   # n_estimators
    (0.1, 1.0),   # subsample
    (0.0, 1.0),   # criterion (rounded to an option index)
    (2, 10),      # min_samples_split
    (1, 10),      # min_samples_leaf
    (0.0, 0.5),   # min_weight_fraction_leaf
    (2, 100),     # max_depth
    (0.0, 1.0),   # min_impurity_decrease
    (0.0, 3.0),   # max_features (rounded to an option index)
    (2, 100),     # max_leaf_nodes
    (0.0, 1.0),   # ccp_alpha
]
BOUNDS_LOW = [low for low, _ in _GENE_BOUNDS]
BOUNDS_HIGH = [high for _, high in _GENE_BOUNDS]
NUM_OF_PARAMS = len(BOUNDS_HIGH)

# Seed Python's random module before building the tuner
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
tuning = XGBGeneticTuning(RANDOM_SEED)

In [None]:
# Fitness strategy: a single objective with weight +1.0
creator.create("FitnessMax", base.Fitness, weights=(1.0,))

# Individuals are plain lists that carry a FitnessMax fitness
creator.create("Individual", list, fitness=creator.FitnessMax)

# Register one uniform random generator per hyperparameter, bounded by
# BOUNDS_LOW/BOUNDS_HIGH, and collect the registered generators in order
hyperparameters = ()
for i in range(NUM_OF_PARAMS):
    generator_name = "hyperparameter_" + str(i)
    toolbox.register(
        generator_name,
        random.uniform,
        a=BOUNDS_LOW[i],
        b=BOUNDS_HIGH[i],
    )
    hyperparameters += (getattr(toolbox, generator_name),)

# Population Generator: an individual calls every generator once, in order
toolbox.register(
    "individualCreator",
    tools.initCycle,
    creator.Individual,
    hyperparameters,
    n=1,
)

# A population is a list of individuals; its size is given at call time
toolbox.register(
    "populationCreator",
    tools.initRepeat,
    list,
    toolbox.individualCreator,
)

# Evaluation operator
def classificationAccuracy(individual):
    """Return the individual's cross-validated accuracy as a one-element tuple."""
    accuracy = tuning.getAccuracy(individual)
    return (accuracy,)

toolbox.register("evaluate", classificationAccuracy)

# Selection operator: tournaments of three individuals
toolbox.register("select", tools.selTournament, tournsize=3)

# Crossover and mutation share the same bounds and crowding factor
bounded_operator_args = dict(
    low=BOUNDS_LOW,
    up=BOUNDS_HIGH,
    eta=CROWDING_FACTOR,
)

# Crossover operator
toolbox.register("mate", tools.cxSimulatedBinaryBounded, **bounded_operator_args)

# Mutation operator: each gene is mutated with probability 1/NUM_OF_PARAMS
toolbox.register(
    "mutate",
    tools.mutPolynomialBounded,
    indpb=1.0/NUM_OF_PARAMS,
    **bounded_operator_args,
)

## Genetic Search

### Simple Evolutionary Algorithm

In [None]:
def GAsimple():
    """Run DEAP's ``eaSimple`` search and return the best decoded settings.

    Creates a population of ``POPULATION_SIZE`` individuals, evolves it for
    ``MAX_GENERATIONS`` generations while logging the max and mean fitness,
    prints the best individual kept in the hall of fame, plots both fitness
    curves, and returns ``tuning.convertParams`` of that best individual.
    """
    initial_population = toolbox.populationCreator(n=POPULATION_SIZE)

    # Per-generation statistics over the individuals' fitness values
    fitness_stats = tools.Statistics(lambda ind: ind.fitness.values)
    for stat_name, reducer in (("max", np.max), ("avg", np.mean)):
        fitness_stats.register(stat_name, reducer)

    hall_of_fame = tools.HallOfFame(HALL_OF_FAME_SIZE)

    print("Training...")
    _, logbook = algorithms.eaSimple(
        population=initial_population,
        toolbox=toolbox,
        cxpb=P_CROSSOVER,
        mutpb=P_MUTATION,
        ngen=MAX_GENERATIONS,
        stats=fitness_stats,
        halloffame=hall_of_fame,
        verbose=True,
    )

    # The first hall-of-fame entry is reported as the best solution
    champion = hall_of_fame.items[0]
    print("- Best solution:", tuning.formatParams(champion))
    print("- Best accuracy:", champion.fitness.values[0])

    max_per_generation, avg_per_generation = logbook.select("max", "avg")

    sns.set_style("whitegrid")
    curves = (
        (max_per_generation, 'red', 'max'),
        (avg_per_generation, 'green', 'avg'),
    )
    for values, colour, curve_label in curves:
        plt.plot(values, color=colour, label=curve_label)
    plt.xlabel("Generation")
    plt.ylabel("Fitness")
    plt.title("Max/Avg Fitness over Generations")
    plt.legend()
    plt.show()

    return tuning.convertParams(champion)

In [None]:
# Run the genetic search; best_params is the tuple of decoded estimator
# settings that GAsimple returns for the top hall-of-fame individual.
best_params = GAsimple()

## Submission

We will refit the model with all the data with the best found parameters.

In [None]:
# The first column of the cleaned train frame is the target, the rest are features
X = train.iloc[:, 1:]
y = train.iloc[:, 0]

# best_params is positional; pair every value with its estimator keyword
best_param_names = (
    "loss",
    "learning_rate",
    "n_estimators",
    "subsample",
    "criterion",
    "min_samples_split",
    "min_samples_leaf",
    "min_weight_fraction_leaf",
    "max_depth",
    "min_impurity_decrease",
    "max_features",
    "max_leaf_nodes",
    "ccp_alpha",
)
best_settings = dict(zip(best_param_names, best_params))

xgb = GradientBoostingClassifier(random_state=RANDOM_SEED, **best_settings)
xgb = xgb.fit(X, y)

submission['Survived'] = xgb.predict(test).astype(int)
submission

In [None]:
# Write the predictions to submission.csv without the DataFrame index column
submission.to_csv('submission.csv', index=False)