# Comparison of Classifiers

Here we evaluate the performance of some of the popular machine learning algorithms in identifying the species of leaves.

In [6]:
__author__ = "rohitravishankar"

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler 
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, log_loss

import warnings


def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accepts any arguments and ignores them."""
    return None


# Swap the module-level hook so that any later call to warnings.warn(...)
# does nothing. This silences every warning routed through that function
# for the rest of the session, not just the ones raised by this notebook.
warnings.warn = warn

### Training Data 

In [2]:
# Read the labelled training table.
train_df = pd.read_csv('../train.csv')

# Fit an encoder on the species column and turn every species name into an
# integer class id. The fitted encoder stays available as `label_encoder`.
species_column = train_df['species']
label_encoder = LabelEncoder()
label_encoder.fit(species_column)
labels = label_encoder.transform(species_column)

# Remove the row identifier and the target; what is left in train_df is used
# as the model input matrix by the following cells.
non_feature_columns = ['id', 'species']
train_df = train_df.drop(non_feature_columns, axis=1)

### Stratified Split

A stratified split preserves the class proportions of the full dataset, which ensures that every species (class) is represented in both the training and the test sets.

In [3]:
# Use the maintained splitter. `sklearn.cross_validation` was deprecated in
# scikit-learn 0.18 and removed in 0.20; its replacement in
# `sklearn.model_selection` takes only the split configuration in the
# constructor and receives the data through .split(X, y).
# NOTE(review): the legacy `from sklearn.cross_validation import ...` line in
# the import cell should be dropped once this cell is adopted.
from sklearn.model_selection import StratifiedShuffleSplit

# Hoisted out of the loop: the feature matrix is the same for every split.
feature_matrix = train_df.values

# 10 random stratified splits, each holding out 20% of the rows; the fixed
# random_state makes the sequence of splits repeatable.
stratified_shuffle_split = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=23)

# Every iteration rebinds the same four names, so only the LAST of the 10
# splits is what the following cells actually train and score on.
for train_index, test_index in stratified_shuffle_split.split(feature_matrix, labels):
    X_train, X_test = feature_matrix[train_index], feature_matrix[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

### Classifier Comparison

In [4]:
# One seed for every estimator that draws random numbers, so that a fresh
# "Restart & Run All" reproduces the same scores. 23 is the seed the
# stratified split cell already uses.
SEED = 23

# Candidate models, fitted and scored in list order on the same hold-out split.
classifiers = [
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    KNeighborsClassifier(3),
    # probability=True enables predict_proba on the SVMs, which the log-loss
    # computation below requires.
    SVC(kernel="rbf", C=0.025, probability=True, random_state=SEED),
    NuSVC(probability=True, random_state=SEED),
    AdaBoostClassifier(random_state=SEED),
    GradientBoostingClassifier(random_state=SEED),
    RandomForestClassifier(random_state=SEED),
    DecisionTreeClassifier(random_state=SEED),
    GaussianNB()
]

for clf in classifiers:
    clf.fit(X_train, y_train)

    print(clf.__class__.__name__)

    # Accuracy of the model on the hold-out split, reported as a percentage.
    predictions = clf.predict(X_test)
    accuracy = accuracy_score(y_test, predictions) * 100
    print("Accuracy: " + str(accuracy) + "%")

    # Log loss of the model, computed from the predicted class probabilities
    # (note: `predictions` is rebound from class labels to probabilities here).
    predictions = clf.predict_proba(X_test)
    logloss = log_loss(y_test, predictions)
    print("Log Loss: " + str(logloss) + "\n\n")


LinearDiscriminantAnalysis
Accuracy: 97.9797979798%
Log Loss: 0.930197776314


QuadraticDiscriminantAnalysis
Accuracy: 4.0404040404%
Log Loss: 33.1432702779


KNeighborsClassifier
Accuracy: 88.8888888889%
Log Loss: 1.57550751299


SVC
Accuracy: 81.8181818182%
Log Loss: 4.59512196836


NuSVC
Accuracy: 88.3838383838%
Log Loss: 2.48032192867


AdaBoostClassifier
Accuracy: 4.54545454545%
Log Loss: 4.20721577649


GradientBoostingClassifier
Accuracy: 57.0707070707%
Log Loss: 2.41913205807


RandomForestClassifier
Accuracy: 87.3737373737%
Log Loss: 1.47341238195


DecisionTreeClassifier
Accuracy: 66.6666666667%
Log Loss: 11.512925465


GaussianNB
Accuracy: 57.0707070707%
Log Loss: 14.8272524928




### Test Data

In [5]:
# Read the test table.
test_df = pd.read_csv('../test.csv')

# Set the 'id' column aside and remove it from the frame in place
# (the same effect as DataFrame.pop('id')).
test_id = test_df['id']
del test_df['id']

# Remaining columns as an array. Note the lowercase name: `x_test` is a
# different object from `X_test`, the hold-out part of the training data.
x_test = test_df.values