## Dataset Description

The dataset consists of approximately 1,584 images of leaf specimens (16 samples each of 99 species) which have been converted to binary black leaves against white backgrounds. Three sets of features are also provided per image (each a vector of 64 attributes): a shape contiguous descriptor, an interior texture histogram, and a fine-scale margin histogram.

Our task is to train a model to predict species given shape, texture and margin.

In [None]:
import warnings

import numpy as np
import pandas as pd

# Silence warnings before importing scikit-learn so import-time deprecation
# notices are suppressed as well.
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder

# sklearn.cross_validation was removed in scikit-learn 0.20. Fall back to
# sklearn.model_selection (available since 0.18) so this cell still imports
# on current releases. NOTE: the two classes differ in API -- the
# model_selection splitter takes n_splits/test_size in its constructor and
# yields index pairs from .split(X, y).
try:
    from sklearn.cross_validation import StratifiedShuffleSplit
except ImportError:
    from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Load the training table; besides the feature columns it carries an `id`
# column and the `species` label column used further down.
train_csv = 'data/leaf/train.csv'
data = pd.read_csv(train_csv)

In [None]:
# Number of distinct species labels in the training table (a missing value,
# if present, is counted once -- the same as taking len() of .unique()).
data['species'].nunique(dropna=False)

In [None]:
# Learn the species -> integer mapping and encode the column in one step;
# `le` stays available as the fitted encoder for mapping codes back to names.
le = LabelEncoder()
labels = le.fit_transform(data.species)
# Peek at a few encoded labels (positions 1 through 9).
labels[1:10]

In [None]:
# Keep only the feature columns: the target (`species`) and the row
# identifier (`id`) must not be fed to the classifiers.
data = data.drop(labels=['species', 'id'], axis='columns')

In [None]:
# Build the stratified train/test index pairs with the model_selection API.
# The old sklearn.cross_validation splitter, which took the labels in its
# constructor, was removed in scikit-learn 0.20.
from sklearn.model_selection import StratifiedShuffleSplit

# 10 shuffled splits, each holding out 10% of the rows -- the defaults the
# former StratifiedShuffleSplit(labels) call relied on. Pass
# random_state=<int> here if a reproducible split is wanted.
splitter = StratifiedShuffleSplit(n_splits=10, test_size=0.1)

# Materialise the (train_idx, test_idx) pairs so `indices` can be iterated
# more than once. Stratification is driven by `labels`; the first argument
# only supplies the number of samples.
indices = list(splitter.split(data.values, labels))

In [None]:
# Walk every stratified split. Each pass overwrites the previous assignment,
# so after the loop the train/test arrays hold the rows selected by the
# final (train, test) index pair in `indices`.
features = data.values
for tr_index, te_index in indices:
    X_train, y_train = features[tr_index], labels[tr_index]
    X_test, y_test = features[te_index], labels[te_index]

Let's try 10 out-of-the-box classifiers and see how they perform. In practice, they will perform better once we tune their hyperparameters, but this will give you ballpark figures.

In [None]:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Ten scikit-learn estimators, untuned apart from the arguments spelled out
# below. Order matters: results are printed and stored in this sequence.
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    # probability=True is required for the SVMs to expose predict_proba.
    SVC(C=0.025, kernel="rbf", probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
]

In [None]:
from sklearn.metrics import accuracy_score, log_loss

# Fit every candidate on the training split and score it on the held-out
# split. `accuracy` and `losses` are filled in the same order as
# `classifiers`, one entry per estimator.
accuracy, losses = [], []
for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__

    print("="*30)
    print(name)

    print('****Results****')
    # Hard class predictions on the held-out rows -> accuracy.
    test_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, test_predictions)
    print("Accuracy: {:.4%}".format(acc))
    accuracy.append(acc)

    # Class-probability predictions -> multi-class log loss. The columns of
    # predict_proba follow clf.classes_, so pass that label set explicitly;
    # otherwise log_loss infers the classes from y_test alone and fails when
    # a class is missing from the held-out rows.
    test_probabilities = clf.predict_proba(X_test)
    ll = log_loss(y_test, test_probabilities, labels=clf.classes_)
    print("Log Loss: {}".format(ll))
    losses.append(ll)

print("="*30)