In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

from sklearn.datasets import load_digits

 # Dataset

In [2]:
# Load the digits dataset through sklearn.datasets.load_digits and list the
# fields that the returned object exposes.
digits = load_digits()
digits.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'images', 'DESCR'])

In [3]:
# Wrap the raw feature matrix in a DataFrame whose columns carry the
# dataset's own feature names, then preview the first rows.
df = pd.DataFrame(
    data=digits.data,
    columns=digits.feature_names,
)
df.head()

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0


In [4]:
# (rows, columns) of the feature frame built from digits.data.
df.shape

(1797, 64)

# Normalization

In [5]:
# Rescale every feature column with a MinMaxScaler and keep the original
# column labels on the resulting frame.
# NOTE(review): the scaler is fit on all rows of df, i.e. before any
# train/test split is made later in the notebook.
n_scaler = MinMaxScaler()

df2 = pd.DataFrame(
    data=n_scaler.fit_transform(df),
    columns=df.columns,
)
df2.head()

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
0,0.0,0.0,0.3125,0.8125,0.5625,0.0625,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.375,0.8125,0.625,0.0,0.0,0.0
1,0.0,0.0,0.0,0.75,0.8125,0.3125,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.6875,1.0,0.625,0.0,0.0
2,0.0,0.0,0.0,0.25,0.9375,0.75,0.0,0.0,0.0,0.0,...,0.3125,0.0,0.0,0.0,0.0,0.1875,0.6875,1.0,0.5625,0.0
3,0.0,0.0,0.4375,0.9375,0.8125,0.0625,0.0,0.0,0.0,0.5,...,0.5625,0.0,0.0,0.0,0.4375,0.8125,0.8125,0.5625,0.0,0.0
4,0.0,0.0,0.0,0.0625,0.6875,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.125,1.0,0.25,0.0,0.0


# Normal Train Test Split 
- Problem: the train and test sets are re-drawn at random on every execution, so each model's score changes from run to run

In [6]:
# Single hold-out split: 80% of the rows go to training, the remainder to testing.
# No random_state is passed, so the partition is drawn afresh on every run.
# NOTE(review): df2 comes from a MinMaxScaler fit on all rows, so the test
# rows already influenced the scaling; confirm this is acceptable.
x_train, x_test, y_train, y_test = train_test_split(df2,digits.target,train_size=0.8)

In [7]:
# driver function for getting score

def get_score(model,x_train,x_test,y_train,y_test):
    """Train ``model`` on the training split and report its test-split score.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``score(X, y)``
        Fitted in place by this call.
    x_train, y_train : training features and labels handed to ``model.fit``.
    x_test, y_test : held-out features and labels handed to ``model.score``.

    Returns
    -------
    Whatever ``model.score(x_test, y_test)`` returns.
    """
    model.fit(x_train, y_train)
    test_score = model.score(x_test, y_test)
    return test_score

In [8]:
# Logistic Regression (max_iter=1000) scored on the hold-out split
get_score(
    model=LogisticRegression(max_iter=1000),
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)

0.9638888888888889

In [9]:
# SVM (default SVC settings) scored on the hold-out split
get_score(
    model=SVC(),
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)

0.9861111111111112

In [10]:
# Decision Tree (default settings) scored on the hold-out split
get_score(
    model=DecisionTreeClassifier(),
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)

0.825

In [11]:
# Random Forest (default settings) scored on the hold-out split
get_score(
    model=RandomForestClassifier(),
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)

0.9666666666666667

In [12]:
# Gaussian NB (default settings) scored on the hold-out split
get_score(
    model=GaussianNB(),
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)

0.8388888888888889

In [13]:
# Multinomial NB (default settings) scored on the hold-out split
get_score(
    model=MultinomialNB(),
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)

0.8888888888888888

In [14]:
# Bernoulli NB (default settings) scored on the hold-out split
get_score(
    model=BernoulliNB(),
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)

0.8416666666666667

# K-Fold Cross validation
- `KFold.split` yields the row indices of the train and test partitions for each fold

In [15]:
from sklearn.model_selection import KFold

In [16]:
# 5-fold splitter; no shuffle or seed argument is passed here.
kf = KFold(n_splits = 5)

In [17]:
# New frame holding the scaled features plus a 'target' column with the
# labels; df2 itself is left untouched.
df3 = df2.assign(target=digits.target)

In [18]:
# One list of per-fold scores for each classifier. The names are kept as-is
# because the summary cell that follows reads them.
log_reg_scores = []
svm_scores = []
decision_tree_scores = []
random_forest_scores = []
gaussian_NB_scores = []
multinomial_NB_scores = []
bernaulli_NB_scores = []

# Pair every score list with a factory so that each fold trains a brand-new,
# unfitted estimator. The order matches the original one-call-per-line code.
fold_models = [
    (log_reg_scores, lambda: LogisticRegression(max_iter=1000)),
    (svm_scores, SVC),
    (decision_tree_scores, DecisionTreeClassifier),
    (random_forest_scores, RandomForestClassifier),
    (gaussian_NB_scores, GaussianNB),
    (multinomial_NB_scores, MultinomialNB),
    (bernaulli_NB_scores, BernoulliNB),
]

# driver code

for train_index, test_index in kf.split(df2):

    train = df3.iloc[train_index]
    test = df3.iloc[test_index]

    # Separate features from labels once per fold instead of once per model
    # (the original rebuilt these frames 14 times in every iteration).
    fold_x_train = train.drop('target', axis=1)
    fold_x_test = test.drop('target', axis=1)
    fold_y_train = train.target
    fold_y_test = test.target

    for fold_scores, make_model in fold_models:
        fold_scores.append(
            get_score(make_model(), fold_x_train, fold_x_test, fold_y_train, fold_y_test)
        )

In [19]:
# Row labels and the matching per-fold score lists, in the same order.
classifire_names = ['Logistic_Regression','SVM','Decision_tree','Random_Forest','Gaussian_NB','Multinomial_NB','Bernaulli_NB']
classifires = [log_reg_scores,svm_scores,decision_tree_scores,random_forest_scores,gaussian_NB_scores,multinomial_NB_scores,bernaulli_NB_scores]

# Take the fold count from the collected scores rather than hard-coding it,
# so the column labels stay valid if the splitter's n_splits is changed.
n_folds = len(classifires[0])

scores = pd.DataFrame(classifires, columns=['iteration_' + str(i + 1) for i in range(n_folds)])
algorithm_names = pd.DataFrame(classifire_names, columns=['Algorithm'])
mean_scores = pd.DataFrame([np.mean(i) for i in classifires], columns=['mean_scores'])

# Side-by-side table: algorithm name, per-fold scores, mean score.
pd.concat([algorithm_names,scores,mean_scores], axis=1)

Unnamed: 0,Algorithm,iteration_1,iteration_2,iteration_3,iteration_4,iteration_5,mean_scores
0,Logistic_Regression,0.941667,0.897222,0.949861,0.963788,0.902507,0.931009
1,SVM,0.969444,0.944444,0.983287,0.988858,0.941504,0.965508
2,Decision_tree,0.775,0.738889,0.81337,0.832869,0.799443,0.791914
3,Random_Forest,0.930556,0.905556,0.961003,0.961003,0.927577,0.937139
4,Gaussian_NB,0.777778,0.777778,0.793872,0.849582,0.796657,0.799133
5,Multinomial_NB,0.888889,0.838889,0.857939,0.938719,0.844011,0.873689
6,Bernaulli_NB,0.858333,0.752778,0.802228,0.916435,0.793872,0.824729


- SVM achieves the highest mean score of all the algorithms, so we are going to use it

# Stratified K Fold
- Example: suppose there are 16 data points, 12 of which belong to class 1 and the remaining 4 to class 0; this is an imbalanced class distribution.
- KFold does not take this into consideration.
- Therefore, in classification tasks with imbalanced class distributions, we should prefer StratifiedKFold over KFold.

In [20]:
from sklearn.model_selection import StratifiedKFold

In [21]:
# 10-fold stratified splitter; the class labels are passed to split() where it is used.
S_kfold = StratifiedKFold(n_splits=10)

In [22]:
# One list of per-fold scores for each classifier. The names are kept as-is
# because the summary code further down in this cell reads them.
log_reg_scores = []
svm_scores = []
decision_tree_scores = []
random_forest_scores = []
gaussian_NB_scores = []
multinomial_NB_scores = []
bernaulli_NB_scores = []

# Pair every score list with a factory so that each fold trains a brand-new,
# unfitted estimator. The order matches the original one-call-per-line code.
fold_models = [
    (log_reg_scores, lambda: LogisticRegression(max_iter=1000)),
    (svm_scores, SVC),
    (decision_tree_scores, DecisionTreeClassifier),
    (random_forest_scores, RandomForestClassifier),
    (gaussian_NB_scores, GaussianNB),
    (multinomial_NB_scores, MultinomialNB),
    (bernaulli_NB_scores, BernoulliNB),
]

# driver code

for train_index, test_index in S_kfold.split(df2, digits.target):

    train = df3.iloc[train_index]
    test = df3.iloc[test_index]

    # Separate features from labels once per fold instead of once per model
    # (the original rebuilt these frames 14 times in every iteration).
    fold_x_train = train.drop('target', axis=1)
    fold_x_test = test.drop('target', axis=1)
    fold_y_train = train.target
    fold_y_test = test.target

    for fold_scores, make_model in fold_models:
        fold_scores.append(
            get_score(make_model(), fold_x_train, fold_x_test, fold_y_train, fold_y_test)
        )
    
    

# Row labels and the matching per-fold score lists, in the same order.
classifire_names = ['Logistic_Regression','SVM','Decision_tree','Random_Forest','Gaussian_NB','Multinomial_NB','Bernaulli_NB']
classifires = [log_reg_scores,svm_scores,decision_tree_scores,random_forest_scores,gaussian_NB_scores,multinomial_NB_scores,bernaulli_NB_scores]

# Take the fold count from the collected scores rather than hard-coding it,
# so the column labels stay valid if the splitter's n_splits is changed.
n_folds = len(classifires[0])

scores = pd.DataFrame(classifires, columns=['iteration_' + str(i + 1) for i in range(n_folds)])
algorithm_names = pd.DataFrame(classifire_names, columns=['Algorithm'])
mean_scores = pd.DataFrame([np.mean(i) for i in classifires], columns=['mean_scores'])

# Side-by-side table: algorithm name, per-fold scores, mean score.
pd.concat([algorithm_names,scores,mean_scores], axis=1)

Unnamed: 0,Algorithm,iteration_1,iteration_2,iteration_3,iteration_4,iteration_5,iteration_6,iteration_7,iteration_8,iteration_9,iteration_10,mean_scores
0,Logistic_Regression,0.911111,0.972222,0.9,0.933333,0.961111,0.972222,0.977778,0.960894,0.882682,0.938547,0.94099
1,SVM,0.944444,0.994444,0.933333,0.966667,0.983333,0.988889,0.988889,0.994413,0.96648,0.955307,0.97162
2,Decision_tree,0.788889,0.844444,0.833333,0.827778,0.783333,0.877778,0.888889,0.821229,0.815642,0.810056,0.829137
3,Random_Forest,0.911111,0.983333,0.95,0.927778,0.961111,0.972222,0.983333,0.960894,0.932961,0.938547,0.952129
4,Gaussian_NB,0.766667,0.788889,0.883333,0.672222,0.711111,0.833333,0.805556,0.826816,0.804469,0.804469,0.789687
5,Multinomial_NB,0.844444,0.927778,0.911111,0.783333,0.866667,0.85,0.961111,0.938547,0.804469,0.877095,0.876456
6,Bernaulli_NB,0.811111,0.916667,0.838889,0.688889,0.783333,0.822222,0.933333,0.938547,0.798883,0.793296,0.832517


# Cross Val Score
- Used to simplify K-Fold / Stratified K-Fold cross-validation
- We do not have to write driver code just to use the indices returned by K-Fold
- We use `cv` to specify the number of folds we want

In [26]:
from sklearn.model_selection import cross_val_score

In [46]:
classifire_names = ['Logistic_Regression','SVM','Decision_tree','Random_Forest','Gaussian_NB','Multinomial_NB','Bernaulli_NB']
algorithm_names = pd.DataFrame(classifire_names, columns=['Algorithm'])

# Single source of truth for the fold count: it drives both the cv argument
# and the column labels, which were previously eight separate literal 4s.
cv_folds = 4

# Estimators in the same order as classifire_names.
cv_models = [
    LogisticRegression(max_iter=1000),
    SVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GaussianNB(),
    MultinomialNB(),
    BernoulliNB(),
]

# One row of per-fold scores per estimator.
# NOTE: per the scikit-learn docs, an integer cv with a classifier is
# expected to use stratified folds; verify against the installed version.
scores = pd.DataFrame(
    [cross_val_score(model, df2, digits.target, cv=cv_folds) for model in cv_models],
    columns=['iteration' + str(i + 1) for i in range(cv_folds)],
)

# Row-wise mean over the fold columns (replaces the manual index loop).
mean_scores = scores.mean(axis=1).tolist()

mean_scores_df = pd.DataFrame(mean_scores,columns=['mean_scores'])

In [47]:
# Side-by-side table: algorithm name, per-fold scores, mean score.
pd.concat([algorithm_names,scores,mean_scores_df], axis=1)

Unnamed: 0,Algorithm,iteration1,iteration2,iteration3,iteration4,mean_scores
0,Logistic_Regression,0.946667,0.917595,0.966592,0.917595,0.937112
1,SVM,0.96,0.966592,0.986637,0.951002,0.966058
2,Decision_tree,0.806667,0.739421,0.839644,0.790646,0.794094
3,Random_Forest,0.933333,0.91314,0.96882,0.930958,0.936563
4,Gaussian_NB,0.8,0.734967,0.806236,0.812918,0.78853
5,Multinomial_NB,0.904444,0.832962,0.910913,0.855234,0.875888
6,Bernaulli_NB,0.86,0.750557,0.877506,0.812918,0.825245
