In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from deap import algorithms, base, creator, tools
import random
import pickle

In [None]:
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

In [None]:
df = pd.read_csv("C:\Feature Gap\prime-india_diabates.csv", names = names)

In [None]:
X = df.iloc[:, : -1]
y = df.iloc[:, -1]

In [None]:
X.select_dtypes(include=[object]).shape[1]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [None]:
clf = RandomForestClassifier(n_estimators=100)

In [None]:
clf.fit(X_train,y_train)

In [None]:
y_pred = clf.predict(X_test)

In [123]:
f1_score(y_test,y_pred)

0.6211180124223602

In [None]:
filename = 'finalized_rf_model.sav'
pickle.dump(clf, open(filename, 'wb'))

In [None]:
model =  pickle.load(open(filename, 'rb'))  # Need to load from client model
n_features = X.shape[1]
n_pops = 20

In [None]:
 def calculate_fitness(model,x,y, n_splits):
        cv_set = np.repeat(-1.,x.shape[0])
        skf = StratifiedKFold(n_splits = n_splits)
        for train_index,test_index in skf.split(x,y):
            x_train,x_test = x.iloc[train_index],x.iloc[test_index]
            y_train,y_test = y.iloc[train_index],y.iloc[test_index]
            if x_train.shape[0] != y_train.shape[0]:
                raise Exception()
            model.fit(x_train,y_train)
            predicted_y = model.predict(x_test)
            cv_set[test_index] = predicted_y
        return f1_score(y,cv_set)

In [None]:
 def evaluate_fitness(individual):
        np_ind = np.asarray(individual)
        if np.sum(np_ind) == 0:
            fitness = 0.0
        else:
            feature_idx = np.where(np_ind==1)[0]
            fitness = calculate_fitness(model,X,y, n_splits = 10)
        return fitness,

In [None]:
creator.create("FeatureSelect", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FeatureSelect)

In [None]:
toolbox = base.Toolbox()
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n = n_features)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

In [None]:
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.1)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("evaluate", evaluate_fitness)

In [None]:
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("avg", np.mean)
stats.register("std", np.std)
stats.register("min", np.min)
stats.register("max", np.max)

In [None]:
hof = tools.HallOfFame(1)

In [None]:
pop = toolbox.population(n_pops)

In [None]:
pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=20, stats=stats, halloffame=hof, verbose=True)

In [None]:
m = tools.selBest(pop, k=1)

In [113]:
Columns_Selected = X.columns[[i == 1 for i in list(*m)]]

In [117]:
Columns_Selected

Index(['preg', 'plas', 'pedi'], dtype='object')

In [119]:
X_new = X[Columns_Selected]

In [120]:
X_new_train, X_new_test, y_new_train, y_new_test = train_test_split(X_new, y, test_size = 0.3, random_state = 42)

In [121]:
clf.fit(X_new_train,y_new_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [122]:
y_new_pred = clf.predict(X_new_test)

In [124]:
f1_score(y_new_test,y_new_pred )

0.6000000000000001

In [125]:
df.shape[0]

768

In [4]:
import numpy as np
a = np.random.randn(3, 3) # a.shape = (4, 3)
b = np.random.randn(3, 1) # b.shape = (3, 2)
c = a*b

In [5]:
c.shape

(3, 3)