Hi all, 

This is part of a homework project I am doing for school.
I am building a pypi package (pip install galearn) to do hyperparameter tuning compatible with the sklearn api.
It's a work in progress, but here is what I've got so far for the Titanic set (I'll upload one for Boston Housing as well).
It performs better than randomized search in most of my test cases and is fairly fast.

I would be very grateful if you try it out for your own problems and share your experiences with me so I may improve it (lots of work still to be done). 
You don't need to install the package but can also just copy paste any functions from here.

If anything is unclear or you have some questions please ask, any input is greatly appreciated.

cheers!

Oli


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score
import pandas as pd
import bqplot
import matplotlib.pyplot as plt
import time
import numpy as np
from tqdm.notebook import tqdm, trange
import seaborn as sns
import optuna
import ipywidgets as widgets
sns.set({'figure.figsize': (12, 8)})
sns.set_color_codes("pastel")
rng = np.random.default_rng()

# Genetic Algorithms for Model Selection

###  *Two major options for improving performance on ML problems*


<ol>
<li>Feature Engineering</li>
<li>Hyperparameter Tuning</li>
</ol>

Genetic Algorithms can be applied to both, feature selection in Feature Engineering and Hyperparameter Search in Model Selection, or fine tuning.

Here, I will focus on Hyperparameter Tuning.

There are many existing solutions for this problem: sklearn has GridSearchCV and RandomizedSearchCV built in, and there are countless companies and open source libraries like H2O, Optuna, Hyperopt and Ray Tune.

They all use different algorithms: brute force, randomization, AI and Bayesian optimization.

Brute force is not feasible for large search spaces, and even the latter two are quite slow.

Even a simple model like Logistic Regression with only two continuous parameters can have a huge search space.

In [None]:
from IPython.display import display
# Two integer sliders (10 to 10000 in steps of 10, starting at 100). Their
# values are later used as the number of candidate values generated for the
# C and l1_ratio hyperparameters.
C = widgets.IntSlider(min=10, max=10000, step=10, value=100)
l1_ratio = widgets.IntSlider(min=10, max=10000, step=10, value=100)

In [None]:
def mul(a, b):
    """Print how many combinations a grid of ``a`` by ``b`` values contains."""
    combinations = a * b
    print(f"search space has {combinations} parameters")

In [None]:
# IPython help syntax (notebook only, not valid plain Python): shows the
# documentation of LogisticRegression.
?LogisticRegression

In [None]:
# Connect the two sliders to mul so the printed search-space size follows the
# slider values; the trailing semicolon suppresses the cell's echoed result.
widgets.interact(mul, a = C, b = l1_ratio);

*I picked discrete values because I feel it's more in line with an actual gene pool, i.e. we have 20 amino acids as building blocks, and those aren't continuous.
Everything could easily be changed to search over continuous values with arithmetic.*

In [None]:
# Create the gene pool: each key is a LogisticRegression keyword argument and
# each value holds the candidate settings for it. Only C and l1_ratio vary;
# the number of combinations is C.value * l1_ratio.value (the slider values).
params = dict()
params["penalty"] = ["elasticnet"]
params["C"] = np.linspace(0.001, 100, C.value)
params["l1_ratio"] = np.linspace(0.0001, 1, l1_ratio.value)
params["solver"] = ['saga']
params["random_state"] = [42]

This makes it very hard to pick the right model from the crowd

In [None]:
# Show the candidate values generated for C.
params['C']

### Hmmm, ok, you!



![display image](https://thumbs.gfycat.com/DazzlingPresentAlbacoretuna-size_restricted.gif)

So now that we have a gene pool defined (the parameter space), lets create some individuals by random choice.

The functions below do the basics and are here just to test that everything works; they are defined again in the Individual class.

In [None]:
# Build one random individual from the gene pool.
def generate_parent(gene_pool):
    """Draw one value per gene at random and wrap the result in an Individual.

    Relies on the module-level ``rng``, ``estimator`` and ``fitness_function``
    to sample the genes and to score the resulting estimator.
    """
    chosen = {name: rng.choice(gene_pool[name]) for name in gene_pool}
    score = get_fitness(estimator(**chosen), fitness_function)
    return Individual(chosen, score)

In [None]:
def mutate(parent, gene_pool):
    """Return a copy of ``parent`` with one randomly chosen gene re-drawn.

    Parameters:
        parent: dict mapping gene names to their current values.
        gene_pool: dict mapping gene names to the candidate values for them.

    Returns:
        A new dict; ``parent`` itself is left untouched.
    """
    # Pick the gene from the pool that was passed in, not from the global
    # ``params``, so the function works for any gene pool.
    gene = rng.choice(list(gene_pool))
    child = parent.copy()
    # Draw two candidates so a second value is available when the first one
    # equals the current value; this helps make sure the gene really changes.
    new_gene, alternate = rng.choice(gene_pool[gene], 2)
    child[gene] = alternate if new_gene == child[gene] else new_gene
    return child

In [None]:
class Individual:
    """One candidate solution: a set of genes (hyperparameters) and its fitness.

    ``genes`` maps estimator keyword names to the chosen values. Ordering
    comparisons (``<`` / ``>``) use the fitness; equality compares genes only.

    ``set_fitness``, ``mutate``, ``get_gene_from_window`` and ``get_estimator``
    read module-level globals (``estimator``, ``fitness_function``,
    ``gene_pool``, ``restrict_gnp``, ``gnp_window``, ``p_outlier``, ``rng``)
    which ``simulate`` assigns before the search starts.
    """

    def __init__(self, genes, fitness):
        # Keep a private copy: changing this individual's genes must never
        # alter the dict that was passed in (e.g. a parent's genes).
        self._genes = dict(genes)
        self._fitness = fitness
        self._fp = 0 #fitness_proportion to be used when selection == fp

    def __eq__(self, other):
        # Returning NotImplemented lets Python fall back to its default
        # comparison instead of raising AttributeError on foreign objects.
        if not isinstance(other, Individual):
            return NotImplemented
        return self.genes == other.genes

    def __lt__(self, other):
        return self.fitness < other.fitness

    def __gt__(self, other):
        return self.fitness > other.fitness

    def __str__(self):
        return f"Individual with genes: {self._genes} and fitness:{self._fitness}"

    @property
    def genes(self):
        """The gene dict of this individual (the live dict, not a copy)."""
        return self._genes

    @property
    def fitness(self):
        """The fitness value last stored for this individual."""
        return self._fitness

    #may add cv = cv/skf as an option
    def set_fitness(self):
        """Re-score the current genes and store the result as the fitness."""
        self._fitness = get_fitness(estimator(**self.genes), fitness_function)

    def set_gene(self, gene, value):
        """Overwrite a single gene; the fitness is NOT updated."""
        self._genes[gene] = value

    @staticmethod
    def _is_continuous(pool):
        """Return True when the candidate values of a gene are floats."""
        return np.asarray(pool).dtype.kind == "f"

    def get_gene_from_window(self, gene):
        """Draw two candidates (with replacement) close to the current value.

        The window is centred on the current value; its half-width is the
        distance to the nearer end of the pool, scaled by ``gnp_window``.
        """
        pool = np.asarray(gene_pool[gene])
        current = self._genes[gene]
        dist = min(current - pool.min(), pool.max() - current) * gnp_window
        window = pool[(pool >= current - dist) & (pool <= current + dist)]
        # On the edge of the pool the window collapses to the current value
        # only, which would turn the mutation into a no-op. Fall back to the
        # whole pool in that case so the gene can still move.
        if np.unique(window).size < 2:
            window = pool
        new_gene, alternate = rng.choice(window, 2)
        return new_gene, alternate

    def mutate(self):
        """Re-draw one randomly chosen gene in place; fitness is NOT updated."""
        gene = rng.choice(list(self._genes))
        # The pool itself is an array/list, never a float, so test the dtype
        # of its values to detect continuous genes.
        if restrict_gnp and self._is_continuous(gene_pool[gene]):
            #give chance of diversity = 1-p_mutate until 10% chance
            if rng.random() < p_outlier:
                print("got an outlier")
                new_gene, alternate = rng.choice(gene_pool[gene], 2)
            else:
                new_gene, alternate = self.get_gene_from_window(gene)
        else:
            new_gene, alternate = rng.choice(gene_pool[gene], 2)
        #help make sure the gene gets mutated
        self._genes[gene] = alternate if new_gene == self._genes[gene] else new_gene
        return

    def get_estimator(self):
        """Instantiate the global ``estimator`` with this individual's genes."""
        return estimator(**self._genes)

In [None]:
# Build `size` random individuals from gene_pool, best one first.
def create_population(gene_pool, size = 10):
    """Return a list of ``size`` random individuals sorted by descending fitness."""
    individuals = [generate_parent(gene_pool) for _ in range(size)]
    individuals.sort(reverse = True)
    return individuals

In [None]:
class Population:
    """A generation of individuals, kept sorted with the fittest one first."""

    def __init__(self, gene_pool, size = 10):
        # The initial generation is drawn at random from the gene pool.
        self._population = create_population(gene_pool, size)
        self._size = size

    @property
    def population(self):
        """The list of individuals, best first."""
        return self._population

    @property
    def size(self):
        """The size requested at construction time."""
        return self._size

    # When several individuals share the best fitness, whichever of them ends
    # up first in the sorted list is the one returned.
    @property
    def best_individual(self):
        """The individual at the head of the sorted list."""
        return self._population[0]

    @property
    def best_fitness(self):
        """Fitness of the individual at the head of the sorted list."""
        return self.best_individual.fitness

    def replace_generation(self, new_gen):
        """Adopt ``new_gen`` as the population; the list is sorted in place."""
        self._population = new_gen
        self._population.sort(reverse = True)

## Fitness

How do we know that one individual is better than another?

We use a fitness function. 
For our models there are many different metrics that qualify, depending on your problem.
MSE, RMSE, MAE for regression, AUC, F1, Precision, Accuracy etc. for Classification.

Each individual gets a score via cross validation on the fitness function.
Cross validation actually has a nice parallel to actual biology.

If an individual doesn't hunt gazelles well, maybe it catches fish better. (I.e. get different chances on different subsets of the training data!) So the one who does best overall gets picked as the fittest.

In [None]:
def get_fitness(individual, fitness_function, cv = 3):
    """Return the mean cross-validated score of an estimator.

    ``individual`` is the estimator to evaluate, ``fitness_function`` is
    passed to ``cross_val_score`` as ``scoring``, and the data comes from the
    module-level ``X_train`` / ``y_train``.
    """
    fold_scores = cross_val_score(
        individual,
        X_train,
        y_train,
        scoring = fitness_function,
        cv = cv,
    )
    return fold_scores.mean()

Alright, we have our population, we let them loose in the wild. And now they get to pass on their genes to the next generation!

The population automatically sorts its individuals by fitness; depending on the selection process, this ordering is used in one way or another to produce the new generation.


There is recombination with more than two individuals and those are quite interesting implementations. 

For my project, I use just two parents, which simply produce two children.

Children are first direct copies of their parents and then there is a possibility of crossover/recombination and mutation to produce new individuals

In [None]:
def select_breeding(population, selection = 'truncation', frac = 0.75):
    """Pick the individuals that are allowed to breed.

    Parameters:
        population: object exposing ``population`` (list, best first) and ``size``.
        selection: one of ``'truncation'``, ``'fitness_proportionate'`` / ``'fp'``,
            ``'tournament'`` or ``'sus'``.
        frac: fraction of the population to select.

    Returns:
        The list of selected individuals.

    Raises:
        ValueError: if ``selection`` is not a known strategy. Previously this
            returned ``None`` silently and only failed later during breeding.
    """
    if selection == 'truncation':
        # Keep the leading (fittest) part of the sorted population.
        cut = int(len(population.population)*frac)
        return population.population[:cut]

    size = int(population.size * frac)
    if selection in ('fitness_proportionate', 'fp'):
        return fp_selection(population, size)
    if selection == 'tournament':
        return tournament_selection(population, size)
    if selection == 'sus':
        return sus_selection(population, size)
    raise ValueError(
        f"unknown selection {selection!r}; expected 'truncation', "
        "'fitness_proportionate', 'fp', 'tournament' or 'sus'"
    )

In [None]:
def fp_selection(pop, size):
    """Fitness-proportionate (roulette wheel) selection, with replacement.

    Parameters:
        pop: object exposing ``population`` (list of individuals with ``fitness``).
        size: number of individuals to draw.

    Returns:
        A list of ``size`` individuals; fitter ones are more likely to appear
        and the same individual can be drawn more than once.
    """
    individuals = pop.population
    weights = np.array([ind.fitness for ind in individuals], dtype = float)
    # Negative scores (e.g. sklearn's ``neg_*`` scorers) would produce invalid
    # probabilities; shift them so the weakest individual gets weight 0.
    if weights.min() < 0:
        weights = weights - weights.min()
    total_fitness = weights.sum()
    # If every weight is 0 there is nothing to be proportional to: p = None
    # makes the draw uniform instead of dividing by zero.
    p = weights / total_fitness if total_fitness > 0 else None
    #p = np.cumsum(p) nice alternative solution
    # Draw positions rather than the objects themselves so numpy never has to
    # pack the individuals into an array.
    picks = rng.choice(len(individuals), size = size, p = p)
    return [individuals[i] for i in picks]

Stochastic Universal Sampling is super weird, sometimes you can get so stuck that there will be zero improvements after the first iteration


In [None]:
#stochastic universal sampling
def sus_selection(pop, size):
    """Select ``size`` individuals with evenly spaced pointers over the fitness sum.

    One random offset is drawn; the pointers are then ``step`` apart, where
    ``step`` is the total fitness divided by ``size``. Each pointer selects
    the first individual whose cumulative fitness reaches it.

    Assumes fitness values are non-negative, so that the cumulative sum is
    non-decreasing.

    Parameters:
        pop: object exposing ``population`` (list of individuals with ``fitness``).
        size: number of individuals to select.

    Returns:
        A list with exactly ``size`` individuals (empty if ``size`` is not
        positive or the population is empty).
    """
    individuals = pop.population
    if size <= 0 or not individuals:
        return []
    p = np.array([ind.fitness for ind in individuals]).cumsum()
    total_fitness = p[-1]
    step = total_fitness / size
    start = rng.uniform(0, step)
    steps = [(start + i*step) for i in range(size)]
    last = len(individuals) - 1
    i = 0
    breeding = []
    for s in steps:
        # Advance to the individual this pointer falls on; ``i < last`` guards
        # against running off the end through floating point round-off.
        while i < last and p[i] < s:
            i = i + 1
        # Exactly one pick per pointer. Appending inside the while loop
        # instead collected every individual that was skipped over.
        breeding.append(individuals[i])
    return breeding

In [None]:
#add requirement size and elitism have to be even!
#also elitism is almost unnecessary if tournament, almost!
def tournament_selection(pop, size):
    """Select up to ``size`` individuals through pairwise tournaments.

    Each round draws two different participants; the one with the higher
    fitness joins the breeding list and leaves the pool of participants
    (on a tie the second one drawn wins).

    Parameters:
        pop: object exposing ``population`` (list of comparable individuals).
        size: number of tournament rounds.

    Returns:
        The list of winners. It is shorter than ``size`` only when the
        population runs out of participants.
    """
    participants = list(pop.population)
    breeding = []
    #could implement different rounds here
    #but I think that's almost the same as calling tournament different times with smaller sizes
    for _ in range(size):
        if not participants:
            break
        if len(participants) == 1:
            # Nobody left to compete against: the last one wins by default.
            breeding.append(participants.pop())
            continue
        # replace=False so an individual never competes against itself, which
        # would turn the round into a plain random pick.
        first, second = rng.choice(len(participants), 2, replace = False)
        winner = first if participants[first] > participants[second] else second
        # Remove by position, so exactly the winner leaves the pool even when
        # another participant carries equal genes.
        breeding.append(participants.pop(winner))
    return breeding
        
    
#reverse tournament, eliminates need for elitism
#could use with parallelism
def rev_tournament_selection(pop, size):
    """Shrink the population to ``size`` by eliminating tournament losers.

    Each round draws two different individuals and removes the one that is
    not fitter (on a tie the first one drawn is removed).

    Parameters:
        pop: object exposing ``population`` (list of comparable individuals).
        size: number of individuals to keep.

    Returns:
        The list of survivors.
    """
    breeding = list(pop.population)
    num_eliminated = len(breeding) - size
    for _ in range(num_eliminated):
        if len(breeding) < 2:
            # No pair left to compare; drop whatever remains.
            breeding.clear()
            break
        # Draw from the survivors themselves (the original referenced an
        # undefined name here), and never pair an individual with itself.
        first, second = rng.choice(len(breeding), 2, replace = False)
        loser = second if breeding[first] > breeding[second] else first
        del breeding[loser]
    return breeding
        

Alright, let's get ready to have natural selection do its thing.

![display image](https://www.publicdomainpictures.net/pictures/10000/nahled/1-1276250040fO7C.jpg)

In [None]:
def breed(parent_1, parent_2, p_cross, p_mutate):
    """Produce two children from two parents.

    The children start as copies of the parents. With probability ``p_cross``
    they go through ``crossover``; afterwards each child is mutated
    independently with probability ``p_mutate``.

    Parameters:
        parent_1, parent_2: the parent individuals (left unmodified).
        p_cross: probability that crossover is applied.
        p_mutate: per-child probability of a mutation.

    Returns:
        ``(child_1, child_2)``. The fitness of a child is re-evaluated
        whenever its genes may have changed; otherwise it keeps the fitness
        of the parent it was copied from.
    """
    # Copy the gene dicts: sharing them would let crossover/mutation of a
    # child silently rewrite its parent (including elite individuals).
    child_1 = Individual(dict(parent_1.genes), parent_1.fitness)
    child_2 = Individual(dict(parent_2.genes), parent_2.fitness)
    changed_1 = changed_2 = False

    # rand() is below p_cross with probability p_cross.
    if np.random.rand() < p_cross:
        child_1, child_2 = crossover(parent_1, parent_2, child_1, child_2)
        changed_1 = changed_2 = True
    if np.random.rand() < p_mutate:
        child_1.mutate()
        changed_1 = True
    if np.random.rand() < p_mutate:
        child_2.mutate()
        changed_2 = True

    # Re-score every child that was touched. Previously this only happened
    # when child_2 mutated, leaving stale fitness values in all other cases.
    if changed_1:
        child_1.set_fitness()
    if changed_2:
        child_2.set_fitness()
    return child_1, child_2

In [None]:
# crossover two parents to create two children
# should not be called by itself because it doesn't set fitness 
def crossover(parent_1, parent_2, child_1, child_2):
    """Exchange a random run of genes between two children.

    A segment ``genes[start:cut]`` is chosen at random. For genes whose pool
    holds float values, each child receives a random pool value lying between
    the two parents' values; for all other genes the children swap the
    parents' values.

    Parameters:
        parent_1, parent_2: individuals providing the gene values.
        child_1, child_2: individuals that are modified in place.

    Returns:
        ``(child_1, child_2)``, always as a tuple. The fitness of the
        children is not updated here.
    """
    genes = list(child_1.genes) #make global to make more efficient!
    if len(genes) < 2:
        # Nothing to recombine with fewer than two genes.
        return child_1, child_2
    # Segment start: any position except the last one.
    start = rng.integers(0, len(genes) - 1)
    # Segment end (exclusive) may reach len(genes), so the last gene can take
    # part in crossover as well; cut == start means no crossover happens.
    cut = rng.integers(start, len(genes) + 1)
    for gene in genes[start:cut]:
        pool = np.asarray(gene_pool[gene])
        # The pool is an array/list, so inspect the dtype of its values to
        # detect continuous genes.
        if pool.dtype.kind == 'f':
            #introduce more diversity by modified crossover for continuous values
            #could also solve this with algebra, but I like using the predefined gene_pool
            lower, higher = sorted((parent_1.genes[gene], parent_2.genes[gene]))
            candidates = pool[(pool >= lower) & (pool <= higher)]
            if candidates.size:
                new_gene_1, new_gene_2 = rng.choice(candidates, 2)
                child_1.set_gene(gene, new_gene_1)
                child_2.set_gene(gene, new_gene_2)
                continue
        # Discrete gene (or no pool value between the parents): plain swap.
        child_1.set_gene(gene, parent_2.genes[gene])
        child_2.set_gene(gene, parent_1.genes[gene])

    return child_1, child_2

In [None]:
# Load the Titanic data. Only `train` is used below; `test` and `gs` are read
# but not referenced in the visible part of this notebook.
train = pd.read_csv("../input/titanic/train.csv")
test = pd.read_csv("../input/titanic/test.csv")
gs = pd.read_csv("../input/titanic/gender_submission.csv")
# Features: everything except the target and four columns that are dropped;
# the target is the "Survived" column.
X, y = train.drop(["Survived", "PassengerId", "Name", "Cabin", "Ticket"], axis=1).copy(), train["Survived"].copy()
# The same encoder object is re-fitted for each of the two columns.
enc = LabelEncoder()
X.Embarked = enc.fit_transform(X.Embarked)
X.Sex = enc.fit_transform(X.Sex)
# NOTE(review): no random_state is passed, so the split presumably differs
# between runs - confirm whether reproducibility matters here.
X_train, X_test, y_train, y_test = train_test_split(X, y)
# The scaler is fitted on the training split only and then applied to both.
scaler = StandardScaler()
X_train[X.columns] = scaler.fit_transform(X_train)
X_test[X.columns] = scaler.transform(X_test)
# Missing values are filled with 0 AFTER scaling; the order of these steps
# matters. Presumably this amounts to imputing the training mean - TODO confirm.
X_train.fillna(0, inplace = True)
X_test.fillna(0, inplace = True)

In [None]:
def simulate(params,
             scorer,
             iterations,
             model,
             train_set,
             train_labels,
             selection = 'fp',
             p_cross = 1.0,
             cv = 3,
             p_mutate = 1.0,
             sim_ann = True, 
             restrict_gene_pool = True, #narrow genes i.e. finetune
             gene_pool_window = 1.0, #initial size of window
             decay = None,
             elitism = 2):
    """Run the genetic search and return the best individual found.

    Parameters:
        params: gene pool, a dict mapping estimator keyword names to the
            candidate values for each.
        scorer: scoring passed on to ``cross_val_score`` by ``get_fitness``.
        iterations: number of generations to run.
        model: estimator class; it is instantiated with an individual's genes.
        train_set, train_labels: data used for the cross-validated fitness.
        selection: strategy name handed to ``select_breeding``.
        p_cross: initial crossover probability.
        cv: accepted for API compatibility but currently not used by this
            function; ``get_fitness`` is called with its own default.
        p_mutate: initial per-child mutation probability.
        sim_ann: if True, ``p_cross`` and ``p_mutate`` shrink by the factor
            ``(1 - decay)`` after every generation.
        restrict_gene_pool: stored in the global ``restrict_gnp`` that
            ``Individual.mutate`` reads.
        gene_pool_window: stored in the global ``gnp_window`` that
            ``Individual.get_gene_from_window`` reads.
        decay: per-generation decay rate; defaults to ``1 / iterations``.
        elitism: number of top individuals copied unchanged into each new
            generation.

    Returns:
        ``(best_individual, (generations, fitness_prog))`` where
        ``fitness_prog[i]`` is the best fitness known at the START of
        generation ``i``.

    Side effects:
        Assigns the module-level globals listed below, which the helper
        functions and the Individual class read.
    """
    global X_train, y_train, estimator, fitness_function, gene_pool, restrict_gnp, gnp_window, rng
    global p_outlier
    max_p_outlier = 0.1
    generations = np.arange(0, iterations)
    fitness_prog = []
    p_outlier = 1 - p_mutate
    rng = np.random.default_rng()
    X_train, y_train = train_set, train_labels
    fitness_function = scorer
    estimator = model
    gene_pool = params
    restrict_gnp = restrict_gene_pool
    gnp_window = gene_pool_window
    # The population is created with Population's default size.
    population = Population(gene_pool)
    best_fitness = population.best_fitness
    if decay is None:
        # Guard against iterations == 0, which would divide by zero.
        decay = 1 / iterations if iterations else 0.0
    print(f"best initial fitness: {population.best_fitness}")
    for i in trange(iterations):
        fitness_prog.append(best_fitness)
        new_gen = []
        breeding = select_breeding(population, selection)
        # Elitism: the current top individuals survive unchanged.
        for elite in range(elitism):
            new_gen.append(population.population[elite])

        # Children are added in pairs, so the new generation can end up one
        # individual larger than population.size.
        while(len(new_gen) < population.size): #let population size oscillate +1 -1?
            parent_1, parent_2 = rng.choice(breeding, 2) #possibility of selecting the same individual
            child_1, child_2 = breed(parent_1, parent_2, p_cross, p_mutate)
            new_gen.append(child_1)
            new_gen.append(child_2)
        #replace the previous generation
        population.replace_generation(new_gen)
        #are you better than the last?
        if (best_fitness < population.best_fitness):
            diff = population.best_fitness - best_fitness
            best_fitness = population.best_fitness
            display(f"child {population.best_individual} with fitness {population.best_fitness}, which is {diff} better than before")
        if sim_ann:
            p_cross = p_cross - p_cross*decay
            p_mutate = p_mutate - p_mutate*decay
            # Let the outlier chance grow as 1 - p_mutate until it reaches
            # 10%. The original comparison was inverted (`> 0.1`), so with
            # the default p_mutate = 1.0 it stayed at 0 forever.
            if p_outlier < max_p_outlier:
                p_outlier = min(1 - p_mutate, max_p_outlier)
    #note if several individuals have same fitness anyone of them is returned
    return population.best_individual, (generations, fitness_prog)

In [None]:
# Baseline: LogisticRegression with default hyperparameters.
reg = LogisticRegression()
reg.fit(X_train, y_train)
# ROC AUC measures how well the scores rank the samples, so feed it the
# predicted probability of the positive class rather than hard 0/1 labels.
# This is also what the 'roc_auc' scorer used by the searches below evaluates.
print(f"baseline score: {roc_auc_score(y_test, reg.predict_proba(X_test)[:, 1])}")

In [None]:
# Run the genetic search for 1000 generations with tournament selection and
# record the wall-clock time in ga_time.
start = time.time()
best, history= simulate(params, 'roc_auc', 1000, LogisticRegression, X_train, y_train, selection = 'tournament')
end = time.time()
ga_time = end - start

In [None]:
# Show the genes and fitness of the best individual returned by simulate.
print(best)

In [None]:
# history is (generations, fitness_prog) as returned by simulate: plot the
# best fitness against the generation number.
sns.regplot(x = history[0], y = history[1])
plt.xlabel('generation')
plt.ylabel('fitness')
plt.show()

In [None]:
# Randomized search over the same params for comparison; the wall-clock time
# is stored in random_search_time.
# NOTE(review): with the default slider values params spans 100 x 100 = 10000
# combinations, so n_iter = 10000 presumably covers the whole grid - confirm
# that this is the intended comparison.
start = time.time()
ran_search = RandomizedSearchCV(LogisticRegression(), params, scoring = 'roc_auc', random_state = 42, n_iter =10000, cv = 3)
ran_search.fit(X_train, y_train)
end = time.time()
random_search_time = end - start

In [None]:
# Best cross-validated score found by the randomized search.
ran_search.best_score_

In [None]:
# Estimator (with its hyperparameters) that achieved that best score.
ran_search.best_estimator_

In [None]:
# Adjust Optuna's logging level before running the study - presumably to
# quiet the per-trial output; TODO confirm what level 0 maps to.
optuna.logging.set_verbosity(0)

In [None]:
# 1. Define an objective function to be maximized.
def objective(trial):
    """Return the mean 3-fold ROC AUC for the hyperparameters of one trial.

    The search space matches the GA's ``params``: C in [0.001, 100] and
    l1_ratio in [0.0001, 1], with the same penalty, solver and random_state.
    """
    # 2. Suggest values for the hyperparameters using a trial object.
    parameters = {
        "penalty": "elasticnet",
        # C must be strictly positive, so the range starts at 0.001, not 0.
        "C": trial.suggest_float('C', 0.001, 100),
        # suggest_float replaces the deprecated suggest_uniform.
        "l1_ratio": trial.suggest_float('l1_ratio', 0.0001, 1),
        "solver": 'saga',
        # Same seed as in params, so all three searches fit the same model.
        "random_state": 42,
    }
    reg = LogisticRegression(**parameters)
    score = cross_val_score(reg, X_train, y_train, n_jobs=-1, cv=3, scoring = 'roc_auc')
    roc = score.mean()
    return roc

# 3. Create a study object and optimize the objective function.
# The study runs 100 trials; the wall-clock time is stored in optuna_time.
start = time.time()
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, gc_after_trial = True)
end = time.time()
optuna_time = end - start

In [None]:
# Objective value (mean ROC AUC) of the best Optuna trial.
print(study.best_trial.value)

In [None]:
# Collect the three wall-clock timings, in the same order as the labels in x.
times = np.array([ga_time, random_search_time, optuna_time])
x = ['GA', 'Random', 'Optuna']

In [None]:
# Bar chart comparing the run time of the three search methods.
sns.barplot(x =x , y = times)
plt.show()