In [1]:
from sklearn.model_selection import StratifiedKFold, KFold
import numpy as np
from sklearn.metrics import f1_score, r2_score
from sklearn import preprocessing
from sklearn import utils
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score

class FitenessFunction:
    """Fitness evaluator: cross-validated accuracy of a model on a feature subset."""

    def __init__(self,n_splits = 5,*args,**kwargs):
        """
            Parameters
            -----------
            n_splits : int,
                Number of splits for cv; passed as ``cv`` to ``cross_val_predict``.

            *args, **kwargs :
                Accepted for backward compatibility; not used.
        """
        self.n_splits = n_splits


    def calculate_fitness(self,model,x,y):
        """
            Score ``model`` on ``(x, y)`` with out-of-fold predictions.

            Parameters
            -----------
            model : estimator handed to ``cross_val_predict``
            x     : feature matrix (already restricted to the selected columns by the caller)
            y     : target values

            Returns
            --------
            Accuracy of the cross-validated predictions against ``y``.
        """
        # Use the configured number of folds rather than a hard-coded 5 so that
        # the value given to __init__ actually takes effect.
        y_pred = cross_val_predict(model, x, y, cv=self.n_splits)
        return accuracy_score(y, y_pred)


In [2]:
from deap import base, creator
import random
import numpy as np
from deap import tools
#import fitness_function as ff


class Feature_Selection_GA:
    """
        FeaturesSelectionGA
        This class uses Genetic Algorithm to find out the best features for an input model
        using Distributed Evolutionary Algorithms in Python(DEAP) package. Default toolbox is
        used for GA but it can be changed accordingly.

        An individual is a list of 0/1 genes, one per feature column; a 1 means the
        column is kept when the model is scored.
    """
    def __init__(self,model,x,y,cv_split=5,verbose=1):
        """
            Parameters
            -----------
            model : scikit-learn supported model,
                x :  {array-like}, shape = [n_samples, n_features]
                     Training vectors, where n_samples is the number of samples
                     and n_features is the number of features. Must support
                     ``x[:, idx]`` indexing (see ``evaluate``).

                y  : {array-like}, shape = [n_samples]
                     Target Values
            cv_split: int
                     Number of splits for cross_validation to calculate fitness.

            verbose: 0 or 1
        """
        self.model =  model
        self.n_features = x.shape[1]
        self.toolbox = None
        self.creator = self._create()
        self.cv_split = cv_split
        self.x = x
        self.y = y
        self.verbose = verbose
        if self.verbose==1:
            print("Model {} will select best features among {} features using cv_split :{}.".format(model,x.shape[1],cv_split))
            print("Shape od train_x: {} and target: {}".format(x.shape,y.shape))
        self.final_fitness = []
        self.fitness_in_generation = {}
        self.best_ind = None

    def evaluate(self,individual):
        """
            Fitness of one individual.

            Parameters
            -----------
                individual : sequence of 0/1 genes, one per feature
            Returns
            --------
                One-element tuple ``(fitness,)``. An individual that selects no
                feature scores 0.0 without running the model.
        """
        fit_obj = FitenessFunction(self.cv_split)
        np_ind = np.asarray(individual)
        if np.sum(np_ind) == 0:
            fitness = 0.0
        else:
            feature_idx = np.where(np_ind==1)[0]
            fitness = fit_obj.calculate_fitness(self.model,self.x[:,feature_idx],self.y)

        if self.verbose == 1:
            print("Individual: {}  Fitness_score: {} ".format(individual,fitness))

        return fitness,


    def _create(self):
        """Create the DEAP fitness (maximising, weight 1.0) and individual classes."""
        creator.create("FeatureSelect", base.Fitness, weights=(1.0,))
        creator.create("Individual", list, fitness=creator.FeatureSelect)
        return creator

    def create_toolbox(self):
        """
            Custom creation of toolbox.
            Parameters
            -----------
                self
            Returns
            --------
                Initialized toolbox (individual/population initializers only;
                mate, mutate and select still have to be registered by the caller)
        """
        # Return the toolbox that _init_toolbox builds; previously the result was
        # dropped and an undefined name was returned.
        return self._init_toolbox()

    def register_toolbox(self,toolbox):
        """
            Register custom created toolbox. Evaluate function will be registered
            in this method.
            Parameters
            -----------
                Registered toolbox with crossover,mutate,select tools except evaluate
            Returns
            --------
                None; the toolbox is stored on ``self.toolbox``
        """
        toolbox.register("evaluate", self.evaluate)
        self.toolbox = toolbox


    def _init_toolbox(self):
        """Build a toolbox holding only the gene, individual and population initializers."""
        toolbox = base.Toolbox()
        toolbox.register("attr_bool", random.randint, 0, 1)
        # Structure initializers
        toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, self.n_features)
        toolbox.register("population", tools.initRepeat, list, toolbox.individual)
        return toolbox


    def _default_toolbox(self):
        """Default GA operators: two-point crossover, bit-flip mutation, tournament selection."""
        toolbox = self._init_toolbox()
        toolbox.register("mate", tools.cxTwoPoint)
        toolbox.register("mutate", tools.mutFlipBit, indpb=0.1)
        toolbox.register("select", tools.selTournament, tournsize=3)
        toolbox.register("evaluate", self.evaluate)
        return toolbox

    def get_final_scores(self,pop,fits):
        """Store ``(individual, fitness)`` pairs on ``self.final_fitness``."""
        self.final_fitness = list(zip(pop,fits))



    def generate(self,n_pop,cxpb = 0.5,mutxpb = 0.2,ngen=20,set_toolbox = False):

        """
            Generate evolved population
            Parameters
            -----------
                n_pop : {int}
                        population size
                cxpb  : {float}
                        crossover probablity
                mutxpb: {float}
                        mutation probablity
                ngen  : {int}
                        number of generations
                set_toolbox : {boolean}
                              If True then you have to create custom toolbox before calling
                              method. If False use default toolbox.
            Returns
            --------
                Fittest population
            Raises
            --------
                Exception
                    If set_toolbox is True but no toolbox has been registered.
            Side effects
            --------
                Fills ``self.fitness_in_generation`` (best fitness of the population
                entering each generation) and sets ``self.best_ind``.
        """

        if self.verbose==1:
            print("Population: {}, crossover_probablity: {}, mutation_probablity: {}, total generations: {}".format(n_pop,cxpb,mutxpb,ngen))

        if not set_toolbox:
            self.toolbox = self._default_toolbox()
        elif self.toolbox is None:
            # Only complain when the caller asked for a custom toolbox but never
            # registered one; a registered toolbox is used as-is.
            raise Exception("Please create a toolbox.Use create_toolbox to create and register_toolbox to register. Else set set_toolbox = False to use defualt toolbox")
        pop = self.toolbox.population(n_pop)
        CXPB, MUTPB, NGEN = cxpb,mutxpb,ngen

        # Evaluate the entire population
        print("EVOLVING.......")
        fitnesses = list(map(self.toolbox.evaluate, pop))

        for ind, fit in zip(pop, fitnesses):
            ind.fitness.values = fit

        fit_ls = []
        for g in range(NGEN):
            print("-- GENERATION {} --".format(g+1))
            offspring = self.toolbox.select(pop, len(pop))
            self.fitness_in_generation[str(g+1)] = max([ind.fitness.values[0] for ind in pop])
            # Clone the selected individuals
            offspring = list(map(self.toolbox.clone, offspring))

            # Apply crossover and mutation on the offspring
            for child1, child2 in zip(offspring[::2], offspring[1::2]):
                if random.random() < CXPB:
                    self.toolbox.mate(child1, child2)
                    del child1.fitness.values
                    del child2.fitness.values

            for mutant in offspring:
                if random.random() < MUTPB:
                    self.toolbox.mutate(mutant)
                    del mutant.fitness.values

            # Evaluate the individuals with an invalid fitness
            weak_ind = [ind for ind in offspring if not ind.fitness.valid]
            fitnesses = list(map(self.toolbox.evaluate, weak_ind))
            print('here-->')
            print(fitnesses)

            if fitnesses:
                # Best score among the individuals re-evaluated this generation.
                fit_ls.append(max(fit[0] for fit in fitnesses))
            else:
                # Nothing was crossed or mutated, so every offspring still carries
                # a valid fitness; use the best of those instead of max() on an
                # empty list, which would raise ValueError.
                fit_ls.append(max(ind.fitness.values[0] for ind in offspring))

            for ind, fit in zip(weak_ind, fitnesses):
                ind.fitness.values = fit
            print("Evaluated %i individuals" % len(weak_ind))

            # The population is entirely replaced by the offspring
            pop[:] = offspring

        # Gather all the fitnesses in one list and print the stats
        fits = [ind.fitness.values[0] for ind in pop]

        length = len(pop)
        mean = sum(fits) / length
        sum2 = sum(x*x for x in fits)
        # abs() guards against a tiny negative variance from floating-point rounding
        std = abs(sum2 / length - mean**2)**0.5
        if self.verbose==1:
            print("  Min %s" % min(fits))
            print("  Max %s" % max(fits))
            print("  Avg %s" % mean)
            print("  Std %s" % std)

        print("-- Only the fittest survives --")

        best_ind = tools.selBest(pop, 1)[0]
        # Keep the winner on the instance so callers can read it after the run.
        self.best_ind = best_ind
        print("Best individual is %s, %s" % (best_ind, best_ind.fitness.values))

        print(fits)

        print('fitness evolution :')
        print(fit_ls)

        return pop
    
   
    
    


In [3]:
import pandas as pd

# Read the dataset CSV into a DataFrame (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='../input/datathonset/final_dataframe.csv')

In [4]:
# Encode each activity label as an integer class id, numbered in the order listed.
cleanup_nums = {
    "activity": {
        label: code
        for code, label in enumerate(
            ['Walking', 'Jogging', 'Upstairs', 'Downstairs', 'Sitting', 'Standing']
        )
    }
}
# Nested dict form: only values in the "activity" column are replaced.
df.replace(to_replace=cleanup_nums, inplace=True)
df.head()

Unnamed: 0,id,activity,time_min,time_max,time_average,x_min,y_min,z_min,x_std,y_std,z_std,x_var,y_var,z_var,x_median,y_median,z_median,x_mean,y_mean,z_mean,x_max,y_max,z_max
0,1,0,4991920000000.0,5974870000000.0,5472666000000.0,-19.61,-5.79,-15.09,6.967989,5.39616,3.220753,48.552864,29.118539,10.373247,-1.23,9.62,-0.72,-1.000041,9.455536,-0.343607,19.57,19.57,13.44
1,1,1,5374660000000.0,6298490000000.0,5854316000000.0,-19.61,-19.61,-17.2,10.706282,9.483499,5.653878,114.624479,89.936758,31.966332,-0.04,0.42,0.57,-0.208456,0.197931,1.164557,19.57,19.57,19.0
2,1,2,6489310000000.0,6848770000000.0,6668160000000.0,-19.61,-13.95,-16.44,7.779192,6.492359,4.75123,60.515836,42.150722,22.574183,-6.7,5.09,0.15,-6.349433,5.332679,0.63366,16.55,19.57,18.58
3,1,3,6552940000000.0,6895550000000.0,6726542000000.0,-19.61,-5.01,-10.15,8.061251,4.761467,3.978546,64.98377,22.671565,15.828829,-0.84,8.54,-0.11,-1.033366,8.54287,0.575641,19.57,19.57,17.05
4,2,0,7981270000000.0,10012500000000.0,9028226000000.0,-19.5,-3.21,-17.27,3.15126,3.480465,3.433426,9.93044,12.113639,11.788414,-4.21,8.77,-0.5,-4.281787,8.76406,-0.571448,10.92,19.57,11.14


In [5]:
## convert into matrix
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values returns the same NumPy array and exists on every pandas version.
df = df.values

  


In [6]:
# Column 1 is the target, columns 2 onward are the features.
# NOTE(review): presumably column 0 is `id` and column 1 is `activity` -- confirm against df.head().
X, y = df[:, 2:], df[:, 1]

In [7]:
# ## import necessary libraries
# import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt

# ## Read Input files
# #df = pd.read_excel('../input/ppg-data/Normal_dataset.xlsx')
# df = pd.read_csv('../input/cxf286/CXR_feature_IP_CXF30_HOG_24k.csv')

# ## Show first 5 subjects
# print(df.head())

# ## Show the shape of df
# print(df.shape)

# ## convert into matrix
# df = df.as_matrix()

# ### split Input feature and Labels
# #y = df[:,[1]] ## label
# #X = df[:,[0,2,3,4,5,6,7,8,9,10,11,12,13,14]]  ## input feature 

# ### split Input feature and Labels
# y = df[:,287]## label
# X = df[:,64:128]  ## input feature

# ## split the dataset with train and test set
# ##========== If you want to take first 80% values into train set then follow (1) otherwise follow (2) 
# ############ random selection
# #####(1) 
# train_size = int(X.shape[0])
# X_train, X_test, y_train, y_test = X[0:train_size], X[train_size:], y[0:train_size], y[train_size:]

In [8]:

    
####(2)
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Now, You are ready for applying GA on your datset for feature selection.

### import GA's files
#from feature_selection_ga import *
#from fitness_function import *

#### Seed
import random
seed = 42
# The GA draws genes and crossover/mutation decisions from the `random` module,
# so seeding it here fixes the evolution path.
random.seed(seed)

### Now run
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
#model = LogisticRegression()
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=64,
    random_state=0,
    n_jobs=-1,
)
fsga = Feature_Selection_GA(model=model, x=X, y=y)
pop = fsga.generate(n_pop=10) ## population size = 10


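#####================ NOTE: result kept from an earlier run -- R^2 = 0.7810365329522431
## Result (stale: that run scored with R^2 on 14 features; the current fitness is
## cross-validated accuracy and the run below uses 21 features)
###Best individual is [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0], (0.7810365329522431,)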


Model RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=64, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False) will select best features among 21 features using cv_split :5.
Shape od train_x: (179, 21) and target: (179,)
Population: 10, crossover_probablity: 0.5, mutation_probablity: 0.2, total generations: 20
EVOLVING.......
Individual: [0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0]  Fitness_score: 0.6312849162011173 
Individual: [0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1]  Fitness_score: 0.5698324022346368 
Individual: [0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0]  Fitness_score: 0.6983240223463687 
Individual: [1, 1, 1, 0, 1, 0, 0, 0