## 43-Support Vector Machine Model

### import packages and load datasets

In [None]:
# import needed packages
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import Lasso
from sklearn.feature_selection import RFECV

In [None]:
# load the full, training and test feature tables (read in that order)
full_data, train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/cumulative_data_fe.csv',
        'data/train_data_fe.csv',
        'data/test_data_fe.csv',
    )
)

### feature elimination with LASSO-based recursive feature elimination (RFECV)

In [None]:
# split the training table into the target (y) and the predictors (x);
# identifier columns and both label columns are excluded from the predictors
train_non_feature_columns = ['id', 'img_id', 'stone_soil', 'stone_soil_enc']
y_train_data = train_data['stone_soil_enc'].to_frame()
x_train_data = train_data.drop(columns = train_non_feature_columns)

In [None]:
# list the candidate predictor columns that remain after dropping ids and labels
x_train_data.columns

Index(['da', 'dp', 'fwidth', 'flength', 'fthickness', 'elength', 'ethickness',
       'ewidth', 'volume', 'area', 'perimeter', 'chull_area',
       'chull_perimeter', 'sphericity', 'l_t_ratio', 't_l_aspect_ratio',
       'compactness', 'roundness', 'ellipse_ratio', 'circularity', 'solidity',
       'concavity', 'convexity', 'extent', 'hash', 'transparency', 'curvature',
       'surface_area', 'l_w_ratio', 'w_l_ratio', 'w_t_ratio', 't_w_ratio',
       'chull_surface_area', 'sieve', 'angularity', 'ellipticity',
       'fiber_length', 'fiber_width', 'krumbein_rnd', 'thick_vol_prod',
       'thick_perm_prod', 'thick_trans_prod', 'rnd_ell_prod'],
      dtype='object')

In [None]:
# use LASSO inside cross-validated recursive feature elimination (RFECV)
# to rank the features for feature elimination
# NOTE(review): Lasso is a regressor and is fitted here on the encoded class
# label, so RFECV scores candidates with the regressor's default score.
model_rfe = RFECV(Lasso(alpha = 0.001), cv = 5)
model_rfe.fit(x_train_data, y_train_data)
# ranking_ has one entry per input column, in column order
rfe = model_rfe.ranking_
# take the names straight from the training frame so they always line up
# with ranking_, instead of maintaining a hand-copied list of column names
features = list(x_train_data.columns)
rfe_df = pd.DataFrame({'features': features, 'rfe_rank': rfe}, index = np.arange(len(features)))
# NOTE(review): the ranking displayed below follows column order exactly
# (the last column is ranked 1). That is what RFE yields when the
# coefficients tie, e.g. if alpha shrinks them all to zero; verify that the
# fitted coefficients are non-zero before trusting this selection.



In [None]:
# show the features whose RFE rank is 10 or better (rank 1 is the best rank)
rfe_df.loc[rfe_df['rfe_rank'].le(10)]

Unnamed: 0,features,rfe_rank
33,sieve,10
34,angularity,9
35,ellipticity,8
36,fiber_length,7
37,fiber_width,6
38,krumbein_rnd,5
39,thick_vol_prod,4
40,thick_perm_prod,3
41,thick_trans_prod,2
42,rnd_ell_prod,1


In [None]:
# define the new independent variables dataset:
# keep only the training columns whose RFE rank is 10 or better
column = rfe_df[rfe_df['rfe_rank'] <= 10]
x_train_data1 = x_train_data[column['features'].values]
# call head() so the first rows are displayed; the bare attribute only
# showed the bound-method repr followed by a dump of the frame
x_train_data1.head()

<bound method NDFrame.head of           sieve  angularity  ellipticity  fiber_length  fiber_width  \
0      0.005423    0.000000     0.035570      0.000000     0.000000   
1      0.008980    0.498113     0.029433      0.000000     0.000000   
2      0.006473    0.513208     0.024371      0.000000     0.000000   
3      0.009038    0.513208     0.033913      0.004139     0.009476   
4      0.005423    0.000000     0.035570      0.000000     0.000000   
...         ...         ...          ...           ...          ...   
62885  0.007872    0.513208     0.063838      0.006917     0.006317   
62886  0.350808    0.155660     0.183362      0.568617     0.212389   
62887  0.005831    0.622642     0.035705      0.000000     0.000000   
62888  0.013470    0.343819     0.156886      0.015816     0.008189   
62889  0.004665    0.513208     0.068408      0.000000     0.000000   

       krumbein_rnd  thick_vol_prod  thick_perm_prod  thick_trans_prod  \
0           1.00000    1.845767e-09        

### an example of constructing support vector machine model

In [None]:
# fit one example RBF-kernel SVM on the selected training features
example_svm_params = dict(C=0.8, kernel='rbf', gamma=2, max_iter=-1, decision_function_shape='ovr')
clf = SVC(**example_svm_params)
clf.fit(x_train_data1, np.ravel(y_train_data.values))

SVC(C=0.8, gamma=2)

In [None]:
# helper that reports how often predictions agree with the true labels
def show_accuracy(a, b):
    """Print the fraction of predictions in ``a`` that equal the labels in ``b``.

    a: array-like of predicted labels.
    b: pandas object holding the true labels; it is flattened to 1-D
       through ``.values.ravel()`` before the comparison.

    Prints ``Accuracy:<value>`` with three decimals and returns nothing.
    """
    true_labels = b.values.ravel()
    hit_rate = np.mean(a == true_labels)
    print('Accuracy:%.3f' % hit_rate)

In [None]:
# score the example SVM on the same data it was trained on (training accuracy)
y_train_predict = clf.predict(x_train_data1)
show_accuracy(a = y_train_predict, b = y_train_data)

Accuracy:0.934


### use cross validation to choose the relatively optimal SVM model

In [None]:
# grid-search C and gamma with 5-fold cross validation.
# parameter1 / parameter2 hold the C / gamma of each candidate and accuracy1
# holds the matching mean score; the score is F1 (scoring = 'f1'), not accuracy
parameter1 = []
parameter2 = []
accuracy1 = []
candidate_grid = [
    (c_step / 10, gamma_step / 10)
    for c_step in range(5, 10, 1)
    for gamma_step in range(15, 25, 5)
]
for c_value, gamma_value in candidate_grid:
    candidate_svm = SVC(C = c_value, kernel = 'rbf', gamma = gamma_value, max_iter = -1, decision_function_shape = 'ovr')
    fold_scores = cross_val_score(candidate_svm, x_train_data1, y_train_data.values.ravel(), cv = 5, scoring = 'f1')
    parameter1.append(c_value)
    parameter2.append(gamma_value)
    accuracy1.append(fold_scores.mean())

In [None]:
# show the mean cross-validated score of each (C, gamma) candidate;
# these are F1 scores (scoring = 'f1' in the loop above), not accuracies
print(accuracy1)

[0.02528298725106922, 0.029433952860791434, 0.02713026249391446, 0.031276416255574015, 0.027593794112786618, 0.032196803571588675, 0.02897871261417877, 0.032658613834952, 0.03036039653232566, 0.033115066714178784]


In [None]:
# pick the candidate with the highest mean cross-validated score
# (the first one wins on ties) and read back its C and gamma
max_accuracy_index = max(range(len(accuracy1)), key = accuracy1.__getitem__)
optimal_C, optimal_gamma = parameter1[max_accuracy_index], parameter2[max_accuracy_index]

In [None]:
# split the test table into target (y) and predictors (x), then keep the
# same selected feature columns that were used for training
test_non_feature_columns = ['id', 'img_id', 'stone_soil', 'stone_soil_enc']
y_test_data = test_data['stone_soil_enc'].to_frame()
x_test_data = test_data.drop(columns = test_non_feature_columns)
x_test_data1 = x_test_data.loc[:, column['features'].values]

In [None]:
# apply the relatively optimal model on test data
# Build the SVM with the cross-validated C and gamma and fit it on the
# TRAINING data. The earlier version re-fitted the old example model `clf`
# on the test set and left `clf1` unfitted, so the reported "test accuracy"
# came from a model trained on the very rows it was scored on.
clf1 = SVC(C = optimal_C, kernel = 'rbf', gamma = optimal_gamma, max_iter = -1, decision_function_shape = 'ovr')
clf1.fit(x_train_data1, y_train_data.values.ravel())

# calculate the accuracy of the tuned SVM model on the held-out test data
# (fit and evaluation are kept in one cell so the model that is scored is
# always the one fitted just above)
y_test_predict = clf1.predict(x_test_data1)
show_accuracy(y_test_predict, y_test_data)

Accuracy:0.932


We use the Support Vector Machine model with C = 0.8 and gamma = 2 as the final SVM model; its accuracy on the test dataset is 0.932.