In [1]:
import pandas as pd
import numpy as np
import os,time,sys
import pickle
from sklearn.model_selection import train_test_split

In [2]:
from preprocessing.Normalize import Normalize
import helper.SeriesHelper as series_helper

In [3]:
normal_matrix = Normalize().get_normalized_data()

In [4]:
cols = normal_matrix.columns
index = normal_matrix.index
X = normal_matrix.to_numpy()
y = series_helper.get_relapse_value_from_series_matrix(normal_matrix)

In [5]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)

In [6]:
from sklearn.feature_selection import SelectKBest, SelectPercentile, SelectFdr, SelectFpr, SelectFwe
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE,RFECV
from sklearn.feature_selection import chi2,f_classif, mutual_info_classif
from sklearn.model_selection import ParameterGrid, GridSearchCV
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.linear_model import LogisticRegression,LassoCV, Lasso
from sklearn.ensemble import ExtraTreesClassifier, BaggingClassifier
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA, NMF
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB
from xgboost import XGBClassifier
from tpot import TPOTClassifier

In [7]:
score_func = [chi2,f_classif,mutual_info_classif]
k = [num for num in range(100,1050,100)]
percentile = [perc for perc in range(5,15,3)]
alpha = [alpha/1000 for alpha in range(0,55,5)]
penalty = ['l1','l2']
neighbors = [x for x in range(5,50,10)]
estimator = []
# estimator += [KNeighborsClassifier(n_neighbors = neighbor) for neighbor in neighbors]
# estimator += [LinearSVC(penalty=x,dual=False) for x in penalty] 
# estimator += [NuSVC(),SVC()]
# estimator += [SGDClassifier(penalty=x) for x in penalty] 
estimator += [LogisticRegression(penalty=x,dual=False) for x in penalty] 
# estimator += [GaussianNB(), MultinomialNB(), ComplementNB(), BernoulliNB()]
# estimator += [LassoCV(cv=5,n_jobs=7)]
estimator += [ExtraTreesClassifier(bootstrap=True,n_jobs=4)]
estimator += [ExtraTreesClassifier(n_jobs=4)]
estimator += [XGBClassifier()]
# estimator += [BaggingClassifier(bootstrap=True,n_jobs=4)]
# estimator += [BaggingClassifier(n_jobs=4)]

In [8]:
# feature selection
select_k_best_grid = [{'score_func': score_func,'k': k}]
select_percentile_grid = [{'score_func': score_func,'percentile': percentile}]
select_fdr_grid = [{'score_func': score_func,'alpha': alpha}]
select_fpr_grid = [{'score_func': score_func,'alpha': alpha}]
select_fwe_grid = [{'score_func': score_func,'alpha': alpha}]
# doesnot goes with the GridSearchCV, cross_val_score
select_from_model_grid = [{'estimator' : estimator}]
rfe_grid  = [{'estimator' : estimator,'n_features_to_select' : k, 'step' : [50]}]
rfecv_grid = [{'estimator' : estimator, 'min_features_to_select' : [100], 'cv' : [10],'n_jobs' : [4]}]

In [9]:
param_grid = [
{
    'feature_selection': [PCA(iterated_power=7)],
    'feature_selection__n_components': [100,200,300,400,500,600],
    'model' : estimator
},
{
    'feature_selection': [NMF()],
    'feature_selection__n_components': [100],
    'model' : estimator
},
{
    'feature_selection': [SelectKBest()],
    'feature_selection__score_func': select_k_best_grid[0]['score_func'],
    'feature_selection__k' : select_k_best_grid[0]['k'], 
    'model' : estimator
},
{
    'feature_selection': [SelectPercentile()],
    'feature_selection__score_func': select_percentile_grid[0]['score_func'],
    'feature_selection__percentile' : select_percentile_grid[0]['percentile'], 
    'model' : estimator  
},

]

In [10]:
pipeline = Pipeline(steps = [('feature_selection',SelectKBest()),('model',estimator[0])])

In [11]:
parameter = ParameterGrid(param_grid)

In [12]:
grid_search = GridSearchCV(pipeline,param_grid=param_grid,n_jobs = -1,cv=5,return_train_score=True,verbose=20)

In [13]:
# grid_search.fit(X_train,y_train)

In [14]:
tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, random_state=42,n_jobs=7)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_.py')

Optimization Progress:  23%|██▎       | 68/300 [28:06<5:50:04, 90.54s/pipeline]

In [27]:
df = pd.DataFrame(columns=['Model','dimension','train_score','test_score'])
for model in estimator:
    data = [model]
    pca = PCA().fit(X_train,y_train)
    select = SelectFromModel(model).fit(pca.transform(X_train),y_train)
    temp_train = select.transform(pca.transform(X_train))
    data.append(temp_train.shape)
    temp_test = select.transform(pca.transform(X_test))
    model.fit(temp_train,y_train)
    data.append(model.score(temp_train,y_train))
    data.append(model.score(temp_test,y_test))
    df = df.append(pd.Series(data),ignore_index=True)
    print(data)
    
    
    

[LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False), (837, 439), 0.996415770609319, 0.6166666666666667]
[LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False), (837, 342), 1.0, 0.6166666666666667]
[ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_sample

In [12]:
def add_pickle(oobject,filename):
        with open(filename+'.pickle', 'wb') as f:
            pickle.dump(oobject, f)