In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE 
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
import warnings
warnings.simplefilter(action='ignore', category=Warning)
%matplotlib inline



# Load the pre-split train/test features and labels and hand them on as plain
# NumPy arrays (the DataFrame column labels are dropped by `to_numpy()`).
X_train = pd.read_csv('~/reproducible-research-IA369Z/data/X_train.csv').to_numpy()
X_test = pd.read_csv('~/reproducible-research-IA369Z/data/X_test.csv').to_numpy()
y_train = pd.read_csv('~/reproducible-research-IA369Z/data/y_train.csv').to_numpy()
y_test = pd.read_csv('~/reproducible-research-IA369Z/data/y_test.csv').to_numpy()



#X_train, X_test,y_train, y_test = train_test_split( X, y, test_size=0.1, random_state=42)

###############################################################
# Hyper-parameter search (5-fold grid search) for seven classifiers
###############################################################

# One seed shared by every stochastic estimator below, so that re-running the
# cell reproduces the same search results and the same selected models.
seed = 42


# --- parameter grids -------------------------------------------------------
# Grids are appended in the same order as the estimators in `models` below;
# the two lists are paired positionally by the search loop.
param_grids = []

## Logistic regression
# The L1 penalty is paired with an explicit L1-capable solver: when the
# estimator's default solver does not support L1, every 'l1' candidate fails
# to fit and is silently scored as NaN (warnings are suppressed in this
# notebook). The L2 candidate keeps the estimator's default solver.
param_grids_0 = [
    {'penalty': ['l1'], 'solver': ['liblinear']},
    {'penalty': ['l2']},
]
param_grids.append(param_grids_0)

## k-nearest neighbours
param_grids_1 = {
    'n_neighbors': [3, 5, 7, 10, 25]
}
param_grids.append(param_grids_1)

## Decision tree
param_grids_2 = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 10, 50, None],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'class_weight': ['balanced', None]
}
param_grids.append(param_grids_2)

## Gaussian naive Bayes
# Single-point grid: nothing is tuned, the model only goes through the same
# search machinery as the others.
param_grids_3 = {
    'priors': [None],
    'var_smoothing': [1e-09]
}
param_grids.append(param_grids_3)

## SVM
# 'degree' is not searched; the polynomial kernel uses SVC's default degree.
param_grids_4 = {
    'gamma': ['scale'],
    'kernel': ['rbf', 'poly'],
    'C': [0.01, 0.1, 1]
}
param_grids.append(param_grids_4)

## Random forest
param_grids_5 = {
    'bootstrap': [True],
    'max_depth': [3, 10, 50, 90, 100],
    'max_features': [3],
    'n_estimators': [100, 200, 500],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'class_weight': ['balanced', None]
}
param_grids.append(param_grids_5)

## Multi-layer perceptron
param_grids_6 = {
    'hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100,)],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['sgd', 'adam', 'lbfgs'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant', 'adaptive'],
}
param_grids.append(param_grids_6)


# --- estimators ------------------------------------------------------------
# Same order as `param_grids`. Every estimator that draws random numbers gets
# `random_state=seed`; KNN and Gaussian NB are deterministic.
models = [
    LogisticRegression(random_state=seed),
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state=seed),
    GaussianNB(),
    SVC(),
    RandomForestClassifier(random_state=seed),
    MLPClassifier(random_state=seed),
]

# zip() below would silently drop unmatched entries, so fail loudly instead.
assert len(models) == len(param_grids), 'each model needs exactly one parameter grid'


# --- search ----------------------------------------------------------------
# `best_estimator[k]` is the tuned version of `models[k]`, already refitted on
# the whole training set by GridSearchCV (refit is on by default).
best_estimator = []

for model, param_grid in zip(models, param_grids):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5,
                               n_jobs=-1, verbose=0, return_train_score=True)
    grid_search.fit(X_train, y_train)
    best_estimator.append(grid_search.best_estimator_)

print('----------------------------------------')
print('Best parameters for seven algorithms')
print('----------------------------------------')
print()

for estimator in best_estimator:
    print(estimator)
    print('\n')
    




In [None]:
import warnings
warnings.filterwarnings('ignore')
import pickle

print('----------------------------------------')
print('Box plot of all model cross validation accuracy')
print('----------------------------------------')
print()

# Short labels for the tuned estimators, in the order in which the
# grid-search cell appended them to `best_estimator`.
model_names = ['LR', 'KNN', 'DTC', 'GNB', 'SVM', 'RFC', 'MLP']
assert len(best_estimator) == len(model_names), 'expected one tuned estimator per label'
models = list(zip(model_names, best_estimator))

# One splitter shared by all models so every model is scored on identical
# folds. The folds are contiguous and unshuffled, hence deterministic without
# a seed; passing `random_state` while `shuffle` is False has no effect and is
# rejected with a ValueError by recent scikit-learn releases.
scoring = 'accuracy'
kfold = model_selection.KFold(n_splits=10)

# evaluate each model in turn
results = []
names = []
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# boxplot algorithm comparison: one box per model over its 10 fold scores
fig, ax = plt.subplots()
fig.suptitle('Algorithms Accuracy Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
ax.set_ylabel('Accuracy')
fig.savefig('box_plot.pdf')
plt.show()

## save the random forest model with its tuned parameters
# Looked up by label rather than by position so it cannot drift if the model
# list is reordered.
rfc_model = dict(models)['RFC']
rfc_model.fit(X_train, y_train)
# save the model to disk; the context manager guarantees the file is flushed
# and closed even if pickling fails
with open('rfc', 'wb') as model_file:
    pickle.dump(rfc_model, model_file)