## SVM on Leaf Classification Data Set

In [65]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.model_selection import GridSearchCV, train_test_split

# def warn(*args, **kwargs): pass
# import warnings
# warnings.warn = warn

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

# Read the labelled leaf data from train.csv (path is relative to the
# working directory).
leaf = pd.read_csv(filepath_or_buffer='train.csv')
# test.csv is not loaded in this section; the call is kept for reference.
#test = pd.read_csv('test.csv')

### Data Preparation

Class labels need to be numerically encoded before fitting the model. We also remove `species` and `id` from the feature set so that the model does not fit to those values. We use stratification when splitting the data into training and test sets because of the large number of classes (100) in a relatively small dataset (990 observations).

In [88]:
# Reload the raw data so this cell is idempotent: it overwrites `leaf`
# below, and re-running it must start from the untouched CSV.
leaf = pd.read_csv('train.csv')

# Encode the species strings as integer class labels; `le` is kept so the
# integer predictions can be mapped back with `le.inverse_transform`.
le = LabelEncoder().fit(leaf.species)
labels = le.transform(leaf.species)           # encode species strings

# Drop the target and the row identifier so neither is used as a feature.
leaf = leaf.drop(columns=['species', 'id'])

# Stratify on the labels so every class appears in both partitions, and fix
# the seed so the split (and all downstream scores) is reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    leaf,
    labels,
    test_size=0.3,
    stratify=labels,
    random_state=42,
)

X_train.head()

Unnamed: 0,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,margin9,margin10,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
524,0.019531,0.029297,0.041016,0.001953,0.005859,0.021484,0.007812,0.0,0.007812,0.013672,...,0.0,0.003906,0.008789,0.019531,0.012695,0.003906,0.0,0.009766,0.0,0.000977
469,0.009766,0.023438,0.029297,0.056641,0.037109,0.013672,0.011719,0.0,0.003906,0.013672,...,0.0,0.0,0.007812,0.001953,0.089844,0.0,0.0,0.001953,0.020508,0.0
458,0.021484,0.056641,0.005859,0.048828,0.0,0.12891,0.0,0.0,0.007812,0.005859,...,0.057617,0.0,0.02832,0.0,0.014648,0.0,0.0,0.006836,0.0,0.041992
954,0.0,0.007812,0.023438,0.046875,0.011719,0.0,0.005859,0.0,0.005859,0.013672,...,0.013672,0.0,0.022461,0.0,0.017578,0.0,0.035156,0.0,0.006836,0.00293
229,0.003906,0.009766,0.066406,0.033203,0.025391,0.0,0.019531,0.005859,0.005859,0.013672,...,0.0,0.001953,0.013672,0.003906,0.001953,0.001953,0.0,0.00293,0.0,0.011719


### Parameter Tuning

First we'll use a grid search to determine the ideal SVM model based on the training data. The tuned parameters are:
* C: Penalty parameter C of the error term. This is used to regularize the model and controls the tradeoff between a smooth decision boundary and fitting the training data exactly.
* kernel: kernel type for the algorithm, must be one of ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’
* gamma: Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’. A higher gamma value makes the model fit more to the training data
* degree: Only used for polynomial SVM. Determines the degree of the polynomial when making the hyperplane

In [98]:
# Candidate values for each tuned hyperparameter.
Cs = [0.001, 0.01, 0.1, 1, 10, 25, 50, 100, 1000]
kernels = ["linear", "rbf", "sigmoid", "poly"]
gammas = [0.0001, 0.001, 0.01, 0.1, 1]
degrees = list(range(2, 9))  # 2..8; degree = 1 is identical to linear kernel

# One sub-grid per kernel family, so parameters a kernel ignores (gamma for
# linear, degree for everything except poly) are not searched redundantly.
param_grid = [
    dict(kernel=["linear"], C=Cs),
    dict(kernel=["rbf", "sigmoid"], C=Cs, gamma=gammas),
    dict(kernel=["poly"], C=Cs, gamma=gammas, degree=degrees),
]

# 5-fold cross-validated exhaustive search over all sub-grids.
grid_search = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'kernel': ['linear'], 'C': [0.001, 0.01, 0.1, 1, 10, 25, 50, 100, 1000]}, {'kernel': ['rbf', 'sigmoid'], 'C': [0.001, 0.01, 0.1, 1, 10, 25, 50, 100, 1000], 'gamma': [0.0001, 0.001, 0.01, 0.1, 1]}, {'kernel': ['poly'], 'C': [0.001, 0.01, 0.1, 1, 10, 25, 50, 100, 1000], 'gamma': [0.0001, 0.001, 0.01, 0.1, 1], 'degree': [2, 3, 4, 5, 6, 7, 8]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [93]:
# Report the winning parameter set and its mean cross-validated accuracy,
# followed by the mean (+/- two standard deviations) for every candidate.
print("Best parameter set from grid search:\n")
# A bare expression in the middle of a cell is not displayed, so print it.
print(grid_search.best_params_)
# best_score_ is a float; format it instead of concatenating it to a str.
print("Best Accuracy: %0.3f" % grid_search.best_score_)
print()

means = grid_search.cv_results_['mean_test_score']
sds = grid_search.cv_results_['std_test_score']
for mean, sd, params in zip(means, sds, grid_search.cv_results_['params']):
    # Use the loop variable `sd`; the previous `std` was an undefined name.
    print("%0.3f (+/-%0.03f) for %r"
            % (mean, sd * 2, params))
print()

Best parameters set found on development set:

{'C': 50, 'degree': 2, 'gamma': 0.0001, 'kernel': 'linear'}

Grid scores on development set:

0.788 (+/-0.035) for {'C': 0.001, 'degree': 2, 'gamma': 0.0001, 'kernel': 'linear'}
0.742 (+/-0.052) for {'C': 0.001, 'degree': 2, 'gamma': 0.0001, 'kernel': 'poly'}
0.788 (+/-0.035) for {'C': 0.001, 'degree': 2, 'gamma': 0.0001, 'kernel': 'rbf'}
0.788 (+/-0.035) for {'C': 0.001, 'degree': 2, 'gamma': 0.0001, 'kernel': 'sigmoid'}
0.788 (+/-0.035) for {'C': 0.001, 'degree': 2, 'gamma': 0.001, 'kernel': 'linear'}
0.742 (+/-0.052) for {'C': 0.001, 'degree': 2, 'gamma': 0.001, 'kernel': 'poly'}
0.788 (+/-0.035) for {'C': 0.001, 'degree': 2, 'gamma': 0.001, 'kernel': 'rbf'}
0.788 (+/-0.035) for {'C': 0.001, 'degree': 2, 'gamma': 0.001, 'kernel': 'sigmoid'}
0.788 (+/-0.035) for {'C': 0.001, 'degree': 2, 'gamma': 0.01, 'kernel': 'linear'}
0.742 (+/-0.052) for {'C': 0.001, 'degree': 2, 'gamma': 0.01, 'kernel': 'poly'}
0.788 (+/-0.035) for {'C': 0.001, 'de

In [None]:
# Build the final classifier from the best grid-search parameters. The
# fitted attribute is `best_params_` (trailing underscore), and the dict
# must be unpacked into keyword arguments rather than passed positionally.
svm = SVC(**grid_search.best_params_, probability=True)