# Models with LDA (2 class, (n=1)components)

## Import library and dataset

In [6]:
# Import libraries
import numpy as np
import pandas as pd
import pandas_profiling
from pandas_profiling import ProfileReport
#import tkinter
from matplotlib import pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import scikitplot as skplt
import seaborn as sns
sns.set(style="whitegrid")

#from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from sklearn.pipeline import Pipeline
from sklearn import model_selection

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

import matplotlib
matplotlib.use('TKAgg')
%matplotlib inline

## Load Dataset, Replace Class, Scaling

In [2]:
# Load 2 datasets
dfC = pd.read_csv("MeterC",sep='\t',header=None, names=['flatr','symm','crossf','v1','v2','v3','v4','sos1','sos2','sos3','sos4','ss11','ss12','ss21','ss22','ss31','ss32','ss41','ss42','sq11','sq12','sq21','sq22','sq31','sq32','sq41','sq42','gain11','gain12','gain21','gain22','gain31','gain32','gain41','gain42','tt11','tt12','tt21','tt22','tt31','tt32','tt41','tt42','class'])
dfD = pd.read_csv("MeterD",sep='\t',header=None, names=['flatr','symm','crossf','v1','v2','v3','v4','sos1','sos2','sos3','sos4','ss11','ss12','ss21','ss22','ss31','ss32','ss41','ss42','sq11','sq12','sq21','sq22','sq31','sq32','sq41','sq42','gain11','gain12','gain21','gain22','gain31','gain32','gain41','gain42','tt11','tt12','tt21','tt22','tt31','tt32','tt41','tt42','class'])

# replace class
dfC['class'].replace(1, 0,inplace=True)
dfC['class'].replace(2, 1,inplace=True)
dfC['class'].replace(3, 1,inplace=True)
dfC['class'].replace(4, 1,inplace=True)
dfD['class'].replace(1, 0,inplace=True)
dfD['class'].replace(2, 1,inplace=True)
dfD['class'].replace(3, 1,inplace=True)
dfD['class'].replace(4, 1,inplace=True)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
features = ['flatr','symm','crossf','v1','v2','v3','v4','sos1','sos2','sos3','sos4','ss11','ss12','ss21','ss22','ss31','ss32','ss41','ss42','sq11','sq12','sq21','sq22','sq31','sq32','sq41','sq42','gain11','gain12','gain21','gain22','gain31','gain32','gain41','gain42','tt11','tt12','tt21','tt22','tt31','tt32','tt41','tt42']

X = dfC.loc[:, features].values
y = dfC.loc[:,['class']].values
X = scaler.fit_transform(X)

#process for dataset D
Xd = dfD.loc[:, features].values
yd = dfD.loc[:,['class']].values
Xd = scaler.fit_transform(Xd)

#change the shape of y from column array to horizontal array or (n_samples, ) using ravel().
y = y.ravel()
print('y class',dfC['class'].unique())
yd = yd.ravel()
print('yd class',dfD['class'].unique())

y class [0 1]
yd class [0 1]


## Apply LDA

In [3]:
lda = LDA(n_components=1)
X = lda.fit_transform(X,y)

lda1 = LDA(n_components=1)
Xd = lda1.fit_transform(Xd,yd)

print(np.cumsum(lda.explained_variance_ratio_))
print(np.cumsum(lda1.explained_variance_ratio_))

[1.]
[1.]


In [18]:
dfX = pd.DataFrame(X)
dfX['class']=y
dfX.columns = ['LDA1','class']
dfX.tail()

Unnamed: 0,LDA1,class
176,0.867333,1
177,0.854629,1
178,1.314498,1
179,1.320894,1
180,1.550675,1


## Evaluate using K-fold validation

In [4]:
#classifier models
clfNB = GaussianNB()
clfKNN = KNeighborsClassifier(n_neighbors=5)
clfSVM = svm.SVC(kernel='linear', C=1)
clfRF = RandomForestClassifier(max_depth=2, random_state=0)
clfNN = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15,), random_state=1, max_iter=2500)

n_folds = [5,10]
targets = [clfNB,clfKNN,clfSVM,clfRF,clfNN]
models = ['Bayes','KNN','SVM','RF','ANN']
# Computing training scores to get insights on how different parameter settings impact the overfitting/underfitting trade-off. 
# However computing the scores on the training set can be computationally expensive and is not strictly required 
# to select the parameters that yield the best generalization performance

print("\n{:>10} | {:>6} | {:>7} | {:>14} | {:>14} | {:>9},{:>9},{:>7} | {:>9},{:>9},{:>8}".format('Classifier', '5fold', '10fold', 'Accuracy_5fold','Accuracy_10fold','F1micro5','F1macro5','F1avg5','F1micro10','F1macro10','F1avg10'))
print('------------------------------------------------------------------------------------------------------------------------------')
for i,j in zip(targets,models):
    # K-fold cross validation for the training dataset (Dataset C)
    cv5 = cross_validate(i, X, y, cv=5, return_train_score=True) # == cvScore = cross_val_score(i, X, y, cv=5)
    cv10 = cross_validate(i, X, y, cv=10, return_train_score=True)    
    cv5mean = np.round(cv5['test_score'].mean(),decimals=3)
    cv10mean = np.round(cv10['test_score'].mean(),decimals=3)
    # K-fold cross validation and predict for the validation dataset (Dataset D)
    y_pred5 = cross_val_predict(i, Xd, yd, cv=5)
    y_pred10 = cross_val_predict(i, Xd, yd, cv=10)
    # Evaluate performance for validation score
    accuracy5 = np.round(accuracy_score(yd, y_pred5),decimals=3)
    accuracy10 = np.round(accuracy_score(yd, y_pred10),decimals=3)
    f1micro5 = np.round(f1_score(yd, y_pred5, average='macro'),decimals=3)
    f1macro5 = np.round(f1_score(yd, y_pred5, average='micro'),decimals=3)
    f1avg5 = np.round(f1_score(yd, y_pred5, average='weighted'),decimals=3)
    f1micro10 = np.round(f1_score(yd, y_pred10, average='macro'),decimals=3)
    f1macro10 = np.round(f1_score(yd, y_pred10, average='micro'),decimals=3)
    f1avg10 = np.round(f1_score(yd, y_pred10, average='weighted'),decimals=3)
    print("{:>10} | {:>6} | {:>7} | {:>14} | {:>14}  | {:>9}{:>9}{:>9} | {:>9}{:>9}{:>9}".format(j,cv5mean,cv10mean,accuracy5,accuracy10,f1micro5,f1macro5,f1avg5,f1micro10,f1macro10,f1avg10))



Classifier |  5fold |  10fold | Accuracy_5fold | Accuracy_10fold |  F1micro5, F1macro5, F1avg5 | F1micro10,F1macro10, F1avg10
------------------------------------------------------------------------------------------------------------------------------
     Bayes |   0.84 |    0.84 |          0.889 |          0.889  |     0.854    0.889    0.885 |     0.856    0.889    0.886
       KNN |  0.818 |   0.818 |          0.839 |          0.856  |     0.798    0.839    0.837 |     0.818    0.856    0.854
       SVM |  0.851 |   0.856 |          0.894 |          0.894  |     0.858    0.894    0.889 |      0.86    0.894     0.89
        RF |  0.824 |   0.835 |          0.839 |          0.867  |     0.795    0.839    0.836 |     0.832    0.867    0.865
       ANN |   0.84 |    0.84 |          0.833 |          0.867  |      0.79    0.833    0.831 |     0.832    0.867    0.865


## Evaluate using Train and Test set

In [9]:
#split dataset into train and test set
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, y, test_size=0.20, random_state=100)

In [10]:
print("\n{:>10} | {:>10} | {:>10} | {:>9},{:>9},{:>7} | {:>9},{:>9},{:>8}".format('Classifier', 'AccuracyC','AccuracyD','F1microC','F1macroC','F1avgC','F1microD','F1macroD','F1avgD'))
print('-------------------------------------------------------------------------------------------------')
for i,j in zip(targets,models):
    i.fit(X_train, Y_train) # train the model
    accuracyC = np.round(i.score(X_test, Y_test),decimals=3) # Calc accuracy score for the training dataset (Dataset C)
    accuracyD = np.round(i.score(Xd, yd),decimals=3) # Calc accuracy score for the validation dataset (Dataset D)
    # Evaluate performance for validation score
    y_predC = i.predict(X_test)
    y_predD = i.predict(Xd)
    f1microC = np.round(f1_score(Y_test, y_predC, average='macro'),decimals=3)
    f1macroC = np.round(f1_score(Y_test, y_predC, average='micro'),decimals=3)
    f1avgC = np.round(f1_score(Y_test, y_predC, average='weighted'),decimals=3)
    f1microD = np.round(f1_score(yd, y_predD, average='macro'),decimals=3)
    f1macroD = np.round(f1_score(yd, y_predD, average='micro'),decimals=3)
    f1avgD = np.round(f1_score(yd, y_predD, average='weighted'),decimals=3)
    print("{:>10} | {:>10} | {:>10} | {:>9}{:>9}{:>9} | {:>9}{:>9}{:>9}".format(j,accuracyC,accuracyD,f1microC,f1macroC,f1avgC,f1microD,f1macroD,f1avgD))



Classifier |  AccuracyC |  AccuracyD |  F1microC, F1macroC, F1avgC |  F1microD, F1macroD,  F1avgD
-------------------------------------------------------------------------------------------------
     Bayes |      0.892 |      0.889 |     0.801    0.892    0.877 |      0.86    0.889    0.887
       KNN |      0.811 |      0.861 |     0.733    0.811    0.815 |     0.836    0.861    0.864
       SVM |      0.892 |        0.9 |     0.801    0.892    0.877 |     0.877      0.9      0.9
        RF |      0.892 |        0.9 |     0.801    0.892    0.877 |     0.878      0.9    0.901
       ANN |      0.892 |        0.9 |     0.801    0.892    0.877 |     0.877      0.9      0.9


# Confusion Matrix

In [5]:
for i,j in zip(targets,models):
    # K-fold cross validation for the training dataset (Dataset C)
    y_predC5 = cross_val_predict(i, X, y, cv=5)
   # print(j,'Confusion Matrix for Test Set (5-fold):\n',confusion_matrix(y, y_predC5))
    print('\n',j,'Confusion Matrix for Test Set (5-fold):\n',confusion_matrix(y, y_predC5))
    
print('--------------------------------------------------------')
      
for i,j in zip(targets,models):
    # K-fold cross validation and predict for the validation dataset (Dataset D)
    y_predD5 = cross_val_predict(i, Xd, yd, cv=5)
   # print(j,'Confusion Matrix for Test Set (5-fold):\n',confusion_matrix(y, y_predC5))
    print('\n',j,'Confusion Matrix for Validation Set (5-fold):\n',confusion_matrix(yd, y_predD5))


 Bayes Confusion Matrix for Test Set (5-fold):
 [[ 31  23]
 [  6 121]]

 KNN Confusion Matrix for Test Set (5-fold):
 [[ 35  19]
 [ 14 113]]

 SVM Confusion Matrix for Test Set (5-fold):
 [[ 34  20]
 [  7 120]]

 RF Confusion Matrix for Test Set (5-fold):
 [[ 35  19]
 [ 13 114]]

 ANN Confusion Matrix for Test Set (5-fold):
 [[ 35  19]
 [ 10 117]]
--------------------------------------------------------

 Bayes Confusion Matrix for Validation Set (5-fold):
 [[ 36  15]
 [  5 124]]

 KNN Confusion Matrix for Validation Set (5-fold):
 [[ 35  16]
 [ 13 116]]

 SVM Confusion Matrix for Validation Set (5-fold):
 [[ 35  16]
 [  3 126]]

 RF Confusion Matrix for Validation Set (5-fold):
 [[ 34  17]
 [ 12 117]]

 ANN Confusion Matrix for Validation Set (5-fold):
 [[ 34  17]
 [ 13 116]]
