## Feature Selection in Machine Learning
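We will be looking at the Friedman #1 dataset, a synthetic regression problem generated with scikit-learn's `make_friedman1`. It is a popular benchmark for comparing feature selection methods because only the first five features are used to compute the target; any additional features are independent of it, so a good selection method should learn to discard them.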

### Step 1: Defining a class for the problem

In [166]:
# Select matplotlib's "notebook" backend for the figures drawn in this notebook.
%matplotlib notebook

In [148]:
from sklearn import datasets
from sklearn import model_selection
from sklearn import ensemble
from sklearn import metrics
import numpy as np

In [149]:
class Problem:
    """Feature-selection benchmark built on the synthetic Friedman-1 regression data.

    On construction the data is generated with ``datasets.make_friedman1``,
    split into training and validation parts, and a
    ``GradientBoostingRegressor`` is created. ``get_MSE`` then scores any
    candidate feature subset by fitting that regressor on the selected columns.
    """

    def __init__(self, num_samples, num_features, noise, random_seed, test_split):
        """Generate the dataset, split it, and create the regressor.

        Parameters
        ----------
        num_samples : int
            Forwarded to ``make_friedman1`` as ``n_samples``.
        num_features : int
            Forwarded to ``make_friedman1`` as ``n_features``.
        noise : float
            Forwarded to ``make_friedman1`` as ``noise``.
        random_seed : int
            Used as ``random_state`` for data generation, the train/validation
            split and the regressor.
        test_split : float
            Forwarded to ``train_test_split`` as ``test_size``.
        """
        self.num_samples = num_samples
        self.num_features = num_features
        self.noise = noise
        self.random_seed = random_seed
        self.test_split = test_split

        self.X, self.Y = datasets.make_friedman1(
            n_samples=self.num_samples,
            n_features=self.num_features,
            noise=self.noise,
            random_state=self.random_seed,
        )
        (self.X_train, self.X_validation,
         self.Y_train, self.Y_validation) = model_selection.train_test_split(
            self.X, self.Y, test_size=self.test_split, random_state=self.random_seed
        )

        self.regressor = ensemble.GradientBoostingRegressor(random_state=self.random_seed)

    def get_MSE(self, binary_list):
        """Return the validation MSE obtained with the selected feature columns.

        Parameters
        ----------
        binary_list : sequence
            Feature mask; every position whose value equals 0 has its column
            removed from the training and validation matrices.

        Returns
        -------
        float
            Mean squared error on the validation targets. When the mask leaves
            no columns at all, the regressor cannot be fitted, so the MSE of
            the constant predictor ``mean(Y_train)`` is returned instead. This
            keeps an all-zero chromosome from aborting the search while still
            scoring it as a model that uses no features.
        """
        zeroIndices = [i for i, n in enumerate(binary_list) if n == 0]
        currentX_train = np.delete(self.X_train, zeroIndices, 1)
        currentX_validation = np.delete(self.X_validation, zeroIndices, 1)

        # No feature left: fitting on a 0-column matrix raises, so fall back
        # to the feature-free baseline (predict the training mean everywhere).
        if currentX_train.shape[1] == 0:
            baseline = np.mean(self.Y_train)
            residuals = np.asarray(self.Y_validation) - baseline
            return float(np.mean(residuals ** 2))

        self.regressor.fit(currentX_train, self.Y_train)

        predictions = self.regressor.predict(currentX_validation)

        return metrics.mean_squared_error(self.Y_validation, predictions)

### Step 2: Problem representation and Parameters of Genetic algorithm

In [150]:
import os
import random

import matplotlib.pyplot as plt
import seaborn as sns
from deap import base
from deap import creator
from deap import tools

import elitism

In [159]:
# --- Dataset / model settings (passed to Problem) ---
RANDOM_SEED = 42
NUM_SAMPLES = 60
NUM_FEATURES = 15
NOISE = 1.0
TEST_SPLIT = 0.2

# --- Genetic algorithm settings ---
POPULATION_SIZE = 30
MAX_GENERATIONS = 30
HALL_OF_FAME_SIZE = 5
P_CROSSOVER = 0.9  # passed as cxpb to the evolutionary loop
P_MUTATION = 0.3   # passed as mutpb to the evolutionary loop

In [161]:
# Single-objective fitness class; the negative weight marks the objective as
# one to be minimised.
creator.create(
    "FitnessMin",
    base.Fitness,
    weights=(-1.0,),
)

# Individual class: a plain list that carries a FitnessMin as its `fitness`.
creator.create(
    "individual",
    list,
    fitness=creator.FitnessMin,
)

In [162]:
# Problem instance used to score candidate feature masks.
friedman = Problem(
    num_samples=NUM_SAMPLES,
    num_features=NUM_FEATURES,
    noise=NOISE,
    random_seed=RANDOM_SEED,
    test_split=TEST_SPLIT,
)

toolbox = base.Toolbox()

# Genetic operators: bit-flip mutation, two-point crossover, binary tournament.
toolbox.register("mutate", tools.mutFlipBit, indpb=1.0 / NUM_FEATURES)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("select", tools.selTournament, tournsize=2)

# Factories, built bottom-up: one random bit -> one individual -> a population.
toolbox.register("zeroOrone", random.randint, 0, 1)
toolbox.register(
    "individualCreator",
    tools.initRepeat,
    creator.individual,
    toolbox.zeroOrone,
    NUM_FEATURES,
)
toolbox.register(
    "populationCreator",
    tools.initRepeat,
    list,
    toolbox.individualCreator,
)

def get_MSE(individual):
    """Fitness function: score the feature mask `individual` on the Friedman problem.

    Returns a one-element tuple holding the value of ``friedman.get_MSE``,
    matching the single-weight fitness declared for the individuals.
    """
    mse = friedman.get_MSE(individual)
    return (mse,)


# Expose the fitness function on the toolbox under the name "evaluate".
toolbox.register("evaluate", get_MSE)

### Step 3: Genetic algorithm flow

In [None]:
# Seed Python's `random` module, which creates the initial bits, so that
# re-running this cell reproduces the same run.
random.seed(RANDOM_SEED)

# Initial population of random feature masks.
population = toolbox.populationCreator(POPULATION_SIZE)

# Per-generation statistics over the individuals' fitness values.
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("min", np.min)
stats.register("avg", np.mean)

# Keeps the best individuals seen during the run.
hof = tools.HallOfFame(HALL_OF_FAME_SIZE)

population, logbook = elitism.eaSimpleWithElitism(
    population, toolbox, cxpb=P_CROSSOVER, mutpb=P_MUTATION,
    ngen=MAX_GENERATIONS, stats=stats, halloffame=hof, verbose=True,
)

best = hof.items[0]
print("-- Best Ever Individual = ", best)
print("-- Best Ever Fitness = ", best.fitness.values[0])

minFitnessValues, meanFitnessValues = logbook.select("min", "avg")

# Draw on a fresh, explicit figure so re-running the cell does not stack
# curves onto a previous plot.
sns.set_style("whitegrid")
fig, ax = plt.subplots()
ax.plot(minFitnessValues, color='red', label='Min')
ax.plot(meanFitnessValues, color='green', label='Average')
ax.set_xlabel('Generation')
ax.set_ylabel('Min / Average Fitness')
ax.set_title('Min and Average fitness over Generations')
ax.legend()

# savefig does not create missing directories; make sure the target exists.
os.makedirs("Images", exist_ok=True)
fig.savefig("Images/ML_Plot.png")
plt.show()

gen	nevals	min    	avg    
0  	30    	12.2102	25.8835
1  	24    	11.37  	21.2714
2  	22    	9.67989	16.5366
3  	24    	9.67989	15.7351
4  	22    	9.67989	12.8944
5  	24    	8.2708 	11.6992
6  	22    	7.45319	10.6374
7  	24    	7.45319	10.2147
8  	21    	7.45319	10.4654
9  	24    	7.45319	9.50906
10 	25    	7.45319	10.0254
11 	24    	7.40186	10.1017
12 	24    	7.40186	8.80597
13 	23    	6.70267	8.14531
14 	21    	6.70267	8.29383
15 	22    	6.70267	8.32698
16 	23    	6.70267	8.29836
17 	25    	6.70267	8.19991
18 	21    	6.70267	7.53792
19 	23    	6.70267	7.78032
20 	21    	6.70267	8.58631
21 	22    	6.70267	7.57305
22 	22    	6.70267	7.75474
23 	23    	6.70267	8.00354
24 	25    	6.70267	8.02115
25 	23    	6.70267	7.80624
26 	20    	6.70267	7.45505
27 	22    	6.70267	8.0233 
