Importing Relevant Libraries

In [2]:
import csv
import random
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import data_file_reader

Calling File reader and splitting train and test set from the overall data

In [3]:
filename='features_train.csv'
data,features,labels=data_file_reader.file_reader(filename)
train_data_features, test_data_features, train_data_labels, test_data_labels = train_test_split(features, labels, test_size=0.2, random_state=10)

Defining the classifiers to be used

In [4]:
svc=SVC(kernel='linear', C=1)
rf=RandomForestClassifier(n_estimators=50, random_state=1)
knn=KNeighborsClassifier(n_neighbors=3)
mv=VotingClassifier(estimators=[('rf', rf),('knn',knn),('svc',svc)], voting='hard')

Performing cross validation on the classfiers to gauge performance

In [5]:
cv_scores_svc=cross_val_score(svc, train_data_features,np.ravel(train_data_labels), cv=10)
accur_crossval_svc=cv_scores_svc.mean()*100
std_crossval_svc=cv_scores_svc.std()*2
print('The Accuracy of the Support Vector Machine Classifier with 10-fold Cross Validation is : %f'%accur_crossval_svc+'%'+' (+/- %0.2f)'%std_crossval_svc)

cv_scores_rf=cross_val_score(rf, train_data_features,np.ravel(train_data_labels), cv=10)
accur_crossval_rf=cv_scores_rf.mean()*100
std_crossval_rf=cv_scores_rf.std()*2
print('The Accuracy of the Random Forest Classifier with 10-fold Cross Validation is : %f'%accur_crossval_rf+'%'+' (+/- %0.2f)'%std_crossval_rf)

cv_scores_knn=cross_val_score(knn, train_data_features,np.ravel(train_data_labels), cv=10)
accur_crossval_knn=cv_scores_knn.mean()*100
std_crossval_knn=cv_scores_knn.std()*2
print('The Accuracy of the K-Nearest Neighbour Classifier with 10-fold Cross Validation is : %f'%accur_crossval_knn+'%'+' (+/- %0.2f)'%std_crossval_knn)

cv_scores_mv=cross_val_score(mv, train_data_features,np.ravel(train_data_labels), cv=10)
accur_crossval_mv=cv_scores_mv.mean()*100
std_crossval_mv=cv_scores_mv.std()*2
print('The Accuracy of the Majority Voting Classifier with 10-fold Cross Validation is : %f'%accur_crossval_mv+'%'+' (+/- %0.2f)'%std_crossval_mv)


The Accuracy of the Support Vector Machine Classifier with 10-fold Cross Validation is : 81.593353% (+/- 0.12)
The Accuracy of the Random Forest Classifier with 10-fold Cross Validation is : 85.141740% (+/- 0.12)
The Accuracy of the K-Nearest Neighbour Classifier with 10-fold Cross Validation is : 82.903226% (+/- 0.10)
The Accuracy of the Majority Voting Classifier with 10-fold Cross Validation is : 86.999022% (+/- 0.10)


Test Accuracy

In [12]:
rf_clf=rf.fit(train_data_features,np.ravel(train_data_labels))
prediction_rf=rf_lda.predict(test_data_features)
rf_score=accuracy_score(test_data_labels,prediction_rf)
print('The Random Forest Classifier Accuracy is: %f'%(rf_score*100)+'%')

The Random Forest Classifier Accuracy is: 85.000000%


Following are two procedures to perform PCA and LDA on the data to perform feature reduction. 

In [6]:
# Scale the input data (Good Practice when performing PCA)
sc=StandardScaler()
train_set=sc.fit_transform(train_data_features)
test_set=sc.fit_transform(test_data_features)

#Perform PCA on the input data reducing the input from 4 dimensions to 2 dimensions
pca=PCA(n_components=80)
pca_train_set= pca.fit_transform(train_set) 
pca_test_set=pca.fit_transform(test_set)
print(pca.explained_variance_ratio_)  

[8.91869850e-01 4.91621793e-02 1.85044092e-02 1.45781479e-02
 8.88418884e-03 6.31858412e-03 2.76583588e-03 2.02294241e-03
 1.02585962e-03 8.07986733e-04 7.46700187e-04 5.76930830e-04
 4.78889280e-04 3.43817259e-04 3.03797944e-04 2.41133337e-04
 1.92414996e-04 1.77777417e-04 1.36745437e-04 1.16583190e-04
 9.85832456e-05 9.79087906e-05 7.33971433e-05 7.11634636e-05
 4.63349944e-05 4.29096102e-05 3.43062725e-05 3.17417216e-05
 2.61825841e-05 2.50970413e-05 2.27608488e-05 1.90762189e-05
 1.81383716e-05 1.55151545e-05 1.35455445e-05 1.16894752e-05
 9.88551888e-06 8.74717993e-06 8.41672381e-06 7.66473103e-06
 7.22634033e-06 5.84993251e-06 5.23212950e-06 4.61766127e-06
 4.08501160e-06 3.62449165e-06 3.38947429e-06 2.84218091e-06
 2.76364051e-06 2.35707431e-06 2.19047958e-06 2.08721952e-06
 1.79775123e-06 1.54614873e-06 1.32165465e-06 1.20238737e-06
 9.91387462e-07 9.30946760e-07 8.94360507e-07 8.62148025e-07
 7.26822664e-07 6.94351304e-07 5.72216854e-07 5.37872172e-07
 4.88172609e-07 4.554736

In [16]:
rf_pca=rf.fit(pca_train_set,np.ravel(train_data_labels))
prediction_pca=rf_pca.predict(pca_test_set)
rf_pca_score=accuracy_score(test_data_labels,prediction_pca)
print('The Random Forest Classifier Accuracy with PCA is: %f'%(rf_pca_score*100)+'%')

The Random Forest Classifier Accuracy with PCA is: 78.750000%


In [17]:
lda=LDA(n_components=200)
lda_train_set=lda.fit_transform(train_data_features,np.ravel(train_data_labels))
lda_test_set = lda.transform(test_data_features)

rf_lda=rf.fit(lda_train_set,np.ravel(train_data_labels))
prediction_lda=rf_lda.predict(lda_test_set)
rf_lda_score=accuracy_score(test_data_labels,prediction_lda)
print('The Random Forest Classifier Accuracy with LDA is: %f'%(rf_lda_score*100)+'%')

The Random Forest Classifier Accuracy with LDA is: 80.000000%


