In [8]:
# Importing necessary dependencies. 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and discards them."""
    return None


# Swap the no-op in for warnings.warn so that every later call made through
# `warnings.warn` (including calls made by imported libraries) is silenced.
warnings.warn = warn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
import sys
import os
import zipfile

In [17]:
# Data directory (relative to the notebook) and the zip archives expected inside it.
datasets_path = os.path.join("..", "data")
zip_file_path, train_file_path, test_file_path = (
    os.path.join(datasets_path, archive_name)
    for archive_name in ("leaf-classification.zip", "train.csv.zip", "test.csv.zip")
)

In [18]:
# Unpack the archives into the data directory, in order: the main download
# first, then the train and test archives. The order matters if the latter two
# are produced by extracting the first one.
for archive_path in (zip_file_path, train_file_path, test_file_path):
    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall(datasets_path)

In [22]:
# Load the extracted train and test CSVs from the data directory.
train, test = (
    pd.read_csv(os.path.join(datasets_path, csv_name))
    for csv_name in ("train.csv", "test.csv")
)

### Objective : Predict species based on available features
+ It's a multiclass classification problem with 99 classes (species) and 10 training samples per class



In [26]:
# Count the training rows per species to see how many classes there are and how balanced they are.
train["species"].value_counts()

Prunus_X_Shmittii              10
Rhododendron_x_Russellianum    10
Populus_Adenopoda              10
Acer_Rufinerve                 10
Acer_Palmatum                  10
                               ..
Quercus_Hartwissiana           10
Quercus_Infectoria_sub         10
Quercus_Pyrenaica              10
Quercus_Afares                 10
Quercus_x_Hispanica            10
Name: species, Length: 99, dtype: int64

In [30]:
# Label encoding: map each species name to an integer class id.
le = LabelEncoder()
labels = le.fit_transform(train["species"])
classes = list(le.classes_)

# Keep the test ids aside before stripping the non-feature columns.
test_ids = test["id"]

# NOTE: `train` and `test` are reassigned here, so this cell is not idempotent;
# re-running it alone fails because `train` no longer has a "species" column.
train = train.drop(columns=["species", "id"])
test = test.drop(columns=["id"])

In [98]:
# Stratified shuffle split: each validation set keeps a class distribution
# similar to the full label set. The fixed random_state makes the splits repeatable.
sss = StratifiedShuffleSplit(
    n_splits=10,
    test_size=0.2,
    random_state=23,
)

In [60]:
# The splitter yields several (train, validation) index pairs; only the final
# pair is kept, which is what overwriting the variables on every loop pass did.
train_indices, valid_indices = list(sss.split(train, labels))[-1]
X_train = train.values[train_indices]
X_valid = train.values[valid_indices]
y_train = labels[train_indices]
y_valid = labels[valid_indices]

In [83]:
# Sanity check: the label vector and the feature table should have the same number of rows.
labels.shape, train.shape

((990,), (990, 192))

In [78]:
# Sanity check: sizes of the training and validation partitions produced by the split.
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((792, 192), (198, 192), (792,), (198,))

In [70]:
# Import Sklearn Models 
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC,NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

In [72]:
# Candidate classifiers to compare. This is a plain list that is iterated over,
# not an sklearn Pipeline. The SVM variants enable probability estimates so
# that predict_proba is available for the log-loss computation.
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    SVC(kernel="rbf", C=0.025, probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
]

In [74]:
# Column layout of the results table, and an empty table with those columns.
log_cols = ["Classifier", "Accuracy", "Log Loss"]
log = pd.DataFrame(data=None, columns=log_cols)

In [97]:
# Fit every candidate on the training split and score it on the validation split.
#
# Result rows are collected in a plain list and the `log` table is built once
# after the loop:
#   * DataFrame.append was removed in pandas 2.0, so appending inside the loop
#     no longer works on current pandas;
#   * building the frame once gives a clean 0..n-1 index instead of every row
#     carrying index 0;
#   * `log` is rebuilt from scratch, so re-running this cell does not stack
#     duplicate rows.
log_rows = []
for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__
    print('='*30)
    print(name)
    print('***Results****')

    # Hard class predictions on the held-out validation rows.
    valid_predictions = clf.predict(X_valid)
    acc = accuracy_score(y_valid, valid_predictions)
    print(f"Accuracy {acc}")

    # Class-probability predictions for the log-loss metric. Passing the
    # classifier's own class order keeps the probability columns aligned with
    # the labels even if a class happens to be absent from y_valid.
    predicted_probabilities = clf.predict_proba(X_valid)
    ll = log_loss(y_valid, predicted_probabilities, labels=clf.classes_)
    print(f"Log Loss {ll}")

    # Accuracy is stored as a percentage in the results table.
    log_rows.append([name, acc*100, ll])
    print("="*38)

log = pd.DataFrame(log_rows, columns=log_cols)

KNeighborsClassifier
***Results****
Accuracy 0.8888888888888888
Log Loss 1.5755075129933762
SVC
***Results****
Accuracy 0.8585858585858586
Log Loss 4.676194291770735
NuSVC
***Results****
Accuracy 0.9292929292929293
Log Loss 2.399257243610487
DecisionTreeClassifier
***Results****
Accuracy 0.6262626262626263
Log Loss 12.908431581936412
RandomForestClassifier
***Results****
Accuracy 0.9747474747474747
Log Loss 0.7754982522507964
AdaBoostClassifier
***Results****
Accuracy 0.045454545454545456
Log Loss 4.200034383188024
GradientBoostingClassifier
***Results****
Accuracy 0.601010101010101
Log Loss 2.4294833318104625
GaussianNB
***Results****
Accuracy 0.5707070707070707
Log Loss 14.827252492813216
LinearDiscriminantAnalysis
***Results****
Accuracy 0.9797979797979798
Log Loss 0.2299344821365334
QuadraticDiscriminantAnalysis
***Results****
Accuracy 0.04040404040404041
Log Loss 33.14327027794469


In [92]:
# NOTE(review): `clf` is not defined in this cell; it is whatever the loop
# variable of the classifier-evaluation loop was last bound to, so this cell
# only works after that loop has run.
predicted_probabilities = clf.predict_proba(X_valid)

In [104]:
# Inspect the validation-set class-probability matrix computed by the most recent predict_proba call.
predicted_probabilities

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [99]:
# Display the results table (classifier name, accuracy in percent, log loss).
log

Unnamed: 0,Classifier,Accuracy,Log Loss
0,KNeighborsClassifier,88.888889,1.575508
0,SVC,85.858586,4.676194
0,NuSVC,92.929293,2.399257
0,DecisionTreeClassifier,62.626263,12.908432
0,RandomForestClassifier,97.474747,0.775498
0,AdaBoostClassifier,4.545455,4.200034
0,GradientBoostingClassifier,60.10101,2.429483
0,GaussianNB,57.070707,14.827252
0,LinearDiscriminantAnalysis,97.979798,0.229934
0,QuadraticDiscriminantAnalysis,4.040404,33.14327


## References
Original implementation on Kaggle: https://www.kaggle.com/jeffd23/10-classifier-showdown-in-scikit-learn