### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, log_loss

#### Reading data

In [2]:
df_train= pd.read_csv("D:/Data Science/Machine Learning Track/SVM/train.csv")
df_test = pd.read_csv("D:/Data Science/Machine Learning Track/SVM/test.csv")

In [3]:
df_train.head()

Unnamed: 0,id,species,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,1,Acer_Opalus,0.007812,0.023438,0.023438,0.003906,0.011719,0.009766,0.027344,0.0,...,0.007812,0.0,0.00293,0.00293,0.035156,0.0,0.0,0.004883,0.0,0.025391
1,2,Pterocarya_Stenoptera,0.005859,0.0,0.03125,0.015625,0.025391,0.001953,0.019531,0.0,...,0.000977,0.0,0.0,0.000977,0.023438,0.0,0.0,0.000977,0.039062,0.022461
2,3,Quercus_Hartwissiana,0.005859,0.009766,0.019531,0.007812,0.003906,0.005859,0.068359,0.0,...,0.1543,0.0,0.005859,0.000977,0.007812,0.0,0.0,0.0,0.020508,0.00293
3,5,Tilia_Tomentosa,0.0,0.003906,0.023438,0.005859,0.021484,0.019531,0.023438,0.0,...,0.0,0.000977,0.0,0.0,0.020508,0.0,0.0,0.017578,0.0,0.047852
4,6,Quercus_Variabilis,0.005859,0.003906,0.048828,0.009766,0.013672,0.015625,0.005859,0.0,...,0.09668,0.0,0.021484,0.0,0.0,0.0,0.0,0.0,0.0,0.03125


In [4]:
df_train.shape

(990, 194)

In [5]:
df_test.shape

(594, 193)

In [6]:
# labelencoding the species column in train dataset
labelencoder = LabelEncoder()
df_train.species = labelencoder.fit_transform(df_train.species)


In [7]:
#Saving column names
classes = list(labelencoder.classes_)
# Saving test ids
test_ids = df_test.id

In [8]:
df_train.species

0       3
1      49
2      65
3      94
4      84
5      40
6      54
7      78
8      53
9      89
10     98
11     16
12     74
13     50
14     58
15     31
16     43
17      4
18     75
19     44
20     83
21     84
22     13
23     66
24     15
25      6
26     73
27     22
28     73
29     31
       ..
960    85
961    89
962    94
963    45
964    48
965    86
966    81
967    14
968     4
969    77
970    56
971    82
972     2
973    85
974    70
975    88
976     0
977    75
978    14
979    86
980    81
981    97
982    70
983    72
984    34
985    40
986     5
987    11
988    78
989    50
Name: species, Length: 990, dtype: int64

#### Defining dependent and independent variables

In [9]:
x_train = df_train.iloc[:, 2:194]

In [10]:
x_train.head(1)

Unnamed: 0,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,margin9,margin10,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,0.007812,0.023438,0.023438,0.003906,0.011719,0.009766,0.027344,0.0,0.001953,0.033203,...,0.007812,0.0,0.00293,0.00293,0.035156,0.0,0.0,0.004883,0.0,0.025391


In [11]:
y_train = df_train.iloc[:,1:2]

In [12]:
y_train.head(1)

Unnamed: 0,species
0,3


In [13]:
x_test = df_test.iloc[:,1:194]

In [14]:
x_train.head(1)

Unnamed: 0,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,margin9,margin10,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,0.007812,0.023438,0.023438,0.003906,0.011719,0.009766,0.027344,0.0,0.001953,0.033203,...,0.007812,0.0,0.00293,0.00293,0.035156,0.0,0.0,0.004883,0.0,0.025391


#### Let's apply cross validation to find best values for Kernel,c and gamma.

In [15]:
parameters = {'kernel':('linear', 'rbf', 'poly'), 'C':[1, 10], 'gamma':[.1, .01, .001]}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(X=x_train,y=y_train.values.ravel())

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'gamma': [0.1, 0.01, 0.001], 'kernel': ('linear', 'rbf', 'poly'), 'C': [1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [16]:
print('Best score for data1:', clf.best_score_)

Best score for data1: 0.8686868686868687


In [17]:
print('Best C:',clf.best_estimator_.C) 
print('Best Kernel:',clf.best_estimator_.kernel)
print('Best Gamma:',clf.best_estimator_.gamma)

Best C: 10
Best Kernel: linear
Best Gamma: 0.1


#### Let's retrain our model using the bext values of  Kernel, C and gamma

In [18]:
clf = svm.SVC(C = 10,kernel='linear',gamma= 0.1,probability= True)
clf.fit(X=x_train,y=y_train.values.ravel())

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [19]:
# predicting from training dataset to calculate Accuracy and loss
y_pred = clf.predict(x_train)
acc = accuracy_score(y_train, y_pred)
print("Accuracy: {:.4%}".format(acc))
y_pred = clf.predict_proba(x_train)
ll = log_loss(y_train, y_pred)
print("Log Loss: {}".format(ll))

Accuracy: 96.2626%
Log Loss: 1.9717195096850941


In [20]:
# predicting from test set
y_pred = clf.predict_proba(x_test)

In [21]:
# Format DataFrame
submission = pd.DataFrame(y_pred, columns=classes)
submission.insert(0, 'id', test_ids)
submission.reset_index()

# Export Submission
submission.to_csv('D:/Data Science/Machine Learning Track/SVM/submission.csv', index = False)
submission.tail()

Unnamed: 0,id,Acer_Capillipes,Acer_Circinatum,Acer_Mono,Acer_Opalus,Acer_Palmatum,Acer_Pictum,Acer_Platanoids,Acer_Rubrum,Acer_Rufinerve,...,Salix_Fragilis,Salix_Intergra,Sorbus_Aria,Tilia_Oliveri,Tilia_Platyphyllos,Tilia_Tomentosa,Ulmus_Bergmanniana,Viburnum_Tinus,Viburnum_x_Rhytidophylloides,Zelkova_Serrata
589,1576,0.015964,0.14985,0.011904,0.010249,0.016284,0.007497,0.008531,0.002581,0.058582,...,0.007016,0.006019,0.003726,0.00475,0.0093,0.011523,0.007773,0.005925,0.00301,0.030728
590,1577,0.009882,0.012435,0.00282,0.013876,0.001962,0.002701,0.006924,0.002712,0.021834,...,0.005945,0.004528,0.006146,0.003829,0.035779,0.03328,0.015109,0.006395,0.004347,0.026106
591,1579,0.018648,0.008921,0.007185,0.008385,0.002913,0.008499,0.009685,0.001868,0.008935,...,0.002677,0.005911,0.00255,0.004052,0.005694,0.004072,0.002632,0.011071,0.003024,0.014088
592,1580,0.004388,0.004231,0.010901,0.014009,0.002169,0.004622,0.0085,0.010688,0.004563,...,0.007655,0.00922,0.002125,0.034981,0.005305,0.011695,0.002937,0.008284,0.0031,0.008947
593,1583,0.002161,0.004999,0.011145,0.0062,0.001581,0.011467,0.012776,0.001644,0.004041,...,0.00189,0.005486,0.001677,0.002662,0.002942,0.005194,0.002047,0.009938,0.003515,0.00812
