In [261]:
print("Comparing the performance of various classifiers")

Comparing the performance of various classifiers


In [262]:
#imports
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn import tree
from sklearn import svm
from sklearn.model_selection import KFold
from statistics import mean
import warnings
warnings.filterwarnings('ignore')

In [263]:
#load dataset
#read the data from the url
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data"
headers = ['buying','maint','doors','persons','lug_boot','safety','class_label']
df = pd.read_csv(url, names=headers)

In [264]:
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class_label
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [265]:
#some insights
df.describe()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class_label
count,1728,1728,1728,1728,1728,1728,1728
unique,4,4,4,3,3,3,4
top,med,med,3,4,med,med,unacc
freq,432,432,432,576,576,576,1210


In [266]:
#raw dataframe
df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class_label
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
5,vhigh,vhigh,2,2,med,high,unacc
6,vhigh,vhigh,2,2,big,low,unacc
7,vhigh,vhigh,2,2,big,med,unacc
8,vhigh,vhigh,2,2,big,high,unacc
9,vhigh,vhigh,2,4,small,low,unacc


In [267]:
#let's preprocess
labelEncoder = preprocessing.LabelEncoder()

labelEncoder.fit(df['buying'])
labelEncoder.fit(df['maint'])
labelEncoder.fit(df['doors'])
labelEncoder.fit(df['persons'])
labelEncoder.fit(df['lug_boot'])
labelEncoder.fit(df['safety'])
labelEncoder.fit(df['class_label'])

df['buying'] = labelEncoder.fit_transform(df['buying'])
df['maint'] = labelEncoder.fit_transform(df['maint'])
df['doors'] = labelEncoder.fit_transform(df['doors'])
df['persons'] = labelEncoder.fit_transform(df['persons'])
df['lug_boot'] = labelEncoder.fit_transform(df['lug_boot'])
df['safety'] = labelEncoder.fit_transform(df['safety'])
df['class_label'] = labelEncoder.fit_transform(df['class_label'])

#preprocessing done
#processed data frame
df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class_label
0,3,3,0,0,2,1,2
1,3,3,0,0,2,2,2
2,3,3,0,0,2,0,2
3,3,3,0,0,1,1,2
4,3,3,0,0,1,2,2
5,3,3,0,0,1,0,2
6,3,3,0,0,0,1,2
7,3,3,0,0,0,2,2
8,3,3,0,0,0,0,2
9,3,3,0,1,2,1,2


In [268]:
#split instances and class labels
X = df.iloc[:, :-1]
y = df['class_label']

In [269]:
#function for Decision Tree

def getAccuracyClassifier(classifier, X, y, k):
    kf = KFold(n_splits = k, shuffle = True, random_state = 50)
    accuracy = []
    precision = []
    recall = []
    #auc_roc = []
        
    for train_index, test_index in kf.split(X):
        #print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
    
        classifier.fit(X_train, y_train)
        y_predict = classifier.predict(X_test)
    
        accuracy.append(accuracy_score(y_test, y_predict)*100)
        precision.append(precision_score(y_test, y_predict, average='weighted')*100)
        recall.append(recall_score(y_test, y_predict, average='macro')*100)
        #auc_roc.append(roc_auc_score(y_test, y_predict))
     
    print("Accuracy:", mean(accuracy))
    print("Precision:", mean(precision))
    print("Recall:", mean(recall))
    #print("Area under ROC:", mean(auc_roc))


In [270]:
depth = 10
maxFeatures = 6
decisionTree = DecisionTreeClassifier(criterion = "entropy", random_state = 50, max_depth = depth, max_features = maxFeatures, min_samples_leaf=5)
print("Max Depth:",decisionTree.max_depth,",", "Max features", decisionTree.max_features)
getAccuracyClassifier(decisionTree, X, y, 10)

Max Depth: 10 , Max features 6
Accuracy: 95.3690012098
Precision: 95.7500376418
Recall: 90.1481533073


In [272]:
perceptron = Perceptron(penalty=None, alpha=.0001, fit_intercept=True, max_iter=10, tol=None, shuffle=True, verbose=0, eta0=1.0, n_jobs=1, random_state=0, class_weight=None, warm_start=False, n_iter=None)
print("Alpha:", a, "Max Iterations:", i)
getAccuracyClassifier(perceptron, X, y, 10)
print()

Alpha: 0.001 Max Iterations: 200
Accuracy: 66.6003495093
Precision: 61.8666534994
Recall: 37.1974810186



In [273]:
neuralNetwork = MLPClassifier(solver='lbfgs', alpha=1e-5, max_iter=200,hidden_layer_sizes=(20, 10), random_state=1)
getAccuracyClassifier(neuralNetwork, X, y, 10)

Accuracy: 97.2227449926
Precision: 97.4027231794
Recall: 93.6860431463


In [274]:
deepLearner = MLPClassifier(activation='tanh', solver='lbfgs', hidden_layer_sizes=(25,40,10,50,69,10,20),
                            learning_rate='constant',  max_iter=400, shuffle=True, random_state=400)
getAccuracyClassifier(deepLearner, X, y, 10)

Accuracy: 98.2057400188
Precision: 98.352946551
Recall: 97.0921561154


In [275]:
svm = LinearSVC(max_iter=100, multi_class ="crammer_singer", random_state=100, loss="squared_hinge") 
getAccuracyClassifier(svm, X, y, 10)

Accuracy: 70.1915580051
Precision: 50.7276487104
Recall: 27.4467266013


In [284]:
naiveBayes = GaussianNB(priors=None) 
getAccuracyClassifier(naiveBayes, X, y, 10)

Accuracy: 62.5527624681
Precision: 73.773232932
Recall: 47.64969686


In [290]:
multiNomialNB = MultinomialNB(alpha=2, class_prior=None, fit_prior=True)
getAccuracyClassifier(multiNomialNB, X, y, 10)

Accuracy: 70.1912219384
Precision: 54.7395413449
Recall: 25.1940653264


In [291]:
logisticRegression = LogisticRegression(max_iter=100, multi_class='ovr', n_jobs=1,
                                        random_state=50, solver='lbfgs')
getAccuracyClassifier(logisticRegression, X, y, 10)

Accuracy: 69.3846619169
Precision: 61.3514770449
Recall: 29.3545007382


In [292]:
KNearestNeighbors = KNeighborsClassifier(algorithm='auto',n_neighbors=8, n_jobs=1, metric='minkowski')
getAccuracyClassifier(KNearestNeighbors, X, y, 10)

Accuracy: 93.9222341713
Precision: 94.0645175651
Recall: 81.5580590405


In [305]:
Bagging = BaggingClassifier(bootstrap=False, max_features=6, max_samples=1, n_jobs=1, random_state=150)
getAccuracyClassifier(Bagging, X, y, 10)

Accuracy: 70.0171394005
Precision: 49.1657283356
Recall: 25.0


In [306]:
randomForest = RandomForestClassifier(bootstrap=True,criterion='gini', max_features = 6, random_state = 100, max_depth=30, max_leaf_nodes=200)
getAccuracyClassifier(randomForest, X, y, 10)

Accuracy: 97.6280414034
Precision: 97.7778936139
Recall: 93.5765006676


In [304]:
AdaBoost = AdaBoostClassifier(algorithm='SAMME',learning_rate=0.5, n_estimators=400, random_state=100 )
getAccuracyClassifier(AdaBoost, X, y, 10)

Accuracy: 85.3565667428
Precision: 83.3071829571
Recall: 60.4620778757


In [303]:
GradientBoosting = GradientBoostingClassifier(criterion='friedman_mse', learning_rate=0.4, max_depth=10, max_features=5, max_leaf_nodes=10, random_state=100)
getAccuracyClassifier(GradientBoosting, X, y, 10)

Accuracy: 99.7106465923
Precision: 99.7247681552
Recall: 99.1282051282


In [148]:
name_classifiers = ['decisionTree', 'perceptron', 'neuralNetwork', 'deepLearner', 'svm', 'naiveBayes', 'multiNomialNB', 'logisticRegression', 'KNearestNeighbors', 'Bagging', 'randomForest', 'AdaBoost', 'GradientBoosting']


In [307]:
#Fair evaluation
numFolds = 10
name_classifiers = ['decisionTree', 'perceptron', 'neuralNetwork', 'deepLearner', 'svm', 'naiveBayes', 'multiNomialNB', 'logisticRegression', 'KNearestNeighbors', 'Bagging', 'randomForest', 'AdaBoost', 'GradientBoosting']
classifiers = [decisionTree, perceptron, neuralNetwork, deepLearner, svm, naiveBayes, multiNomialNB, logisticRegression, KNearestNeighbors, Bagging, randomForest, AdaBoost, GradientBoosting]
kf = KFold(n_splits = numFolds, shuffle = True, random_state = 60)

accuracy = [[] for clf in classifiers]
precision = [[] for clf in classifiers]
recall = [[] for clf in classifiers]

for train_index, test_index in kf.split(X):
        #print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        for clf in classifiers:
            clf.fit(X_train, y_train)
            y_predict = clf.predict(X_test)
    
            accuracy[classifiers.index(clf)].append(accuracy_score(y_test, y_predict)*100)
            precision[classifiers.index(clf)].append(precision_score(y_test, y_predict, average='weighted')*100)
            recall[classifiers.index(clf)].append(recall_score(y_test, y_predict, average='macro')*100)
        

In [308]:
listAccuracy = []
for accuracy_clf in accuracy:
    listAccuracy.append(mean(accuracy_clf))
print(name_classifiers[listAccuracy.index(max(listAccuracy))], "has the best accuracy of", max(listAccuracy))

GradientBoosting has the best accuracy of 99.3056862482


In [309]:
listPrecision = []
for precision_clf in precision:
    listPrecision.append(mean(precision_clf))
print(name_classifiers[listPrecision.index(max(listPrecision))], "has the best precision of", max(listPrecision))

GradientBoosting has the best precision of 99.4335065432


In [310]:
listRecall = []
for recall_clf in recall:
    listRecall.append(mean(recall_clf))
print(name_classifiers[listRecall.index(max(listRecall))], "has the best recall of", max(listRecall))

GradientBoosting has the best recall of 98.901203064


In [320]:
# listAccuracy

In [321]:
# listPrecision

In [322]:
# listRecall

In [323]:
# for i in range(0,len(name_classifiers)):
#     print(name_classifiers[i]+",",str(listAccuracy[i])+",",str(listPrecision[i])+",", str(listRecall[i]))