# Support Vector Machine

## Load Packages and Prep Data

In [57]:
# custom utils
import utils

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.svm import SVC

In [46]:
SVC?

[0;31mInit signature:[0m
[0mSVC[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mC[0m[0;34m=[0m[0;36m1.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mkernel[0m[0;34m=[0m[0;34m'rbf'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdegree[0m[0;34m=[0m[0;36m3[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mgamma[0m[0;34m=[0m[0;34m'scale'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcoef0[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mshrinking[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mprobability[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtol[0m[0;34m=[0m[0;36m0.001[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcache_size[0m[0;34m=[0m[0;36m200[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mclass_weight[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverbose[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[

In [33]:
# load data
X_train, y_train, X_test, y_test = utils.load_data()

(62889, 46)
(15723, 46)


## Model 1
- Defaults

In [37]:
# fit SVM model
svm_1 = SVC()
x = svm_1.fit(X_train, y_train)

In [38]:
# cross validation with f1 scoring
score = utils.f1_cv(svm_1, X_train, y_train)

[0.5392 0.5437 0.5437 0.5471 0.5444]
0.5436


## Model 2
- Regularize by removing correlated features

### Feature Selection

In [41]:
# drop correlated features

correlated_features = set()
correlation_matrix = X_train.corr()

for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.8:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)

print('Number of redundant features:',len(correlated_features))
print('Removed features:',correlated_features)
print()
X_train_selected = X_train.drop(columns = correlated_features)
print('Remaining features:',list(X_train_selected.columns))

Number of redundant features: 31
Removed features: {'fwidth', 'w_l_ratio', 'roundness', 'surface_area', 'thick_vol_prod', 'rnd_ell_prod', 'ellipticity', 'chull_area', 'concavity', 'ellipse_ratio', 'dp', 'circularity', 'sieve', 'chull_perimeter', 'fiber_width', 'compactness', 'fthickness', 'chull_surface_area', 'ethickness', 't_l_aspect_ratio', 'thick_perm_prod', 'l_w_ratio', 'l_t_ratio', 'angularity', 'elength', 't_w_ratio', 'ewidth', 'perimeter', 'flength', 'area', 'fiber_length'}

Remaining features: ['da', 'volume', 'sphericity', 'solidity', 'convexity', 'extent', 'transparency', 'curvature', 'w_t_ratio', 'krumbein_rnd', 'thick_trans_prod']


### Fit Model

In [44]:
# fit SVM model
svm_2 = SVC()
x = svm_2.fit(X_train_selected, y_train)

### Cross-Validation

In [45]:
# cross validation with f1 scoring
score = utils.f1_cv(svm_2, X_train, y_train)

[0.4885 0.4842 0.493  0.4963 0.4795]
0.4883


## Model 3
- Hyperparameter tuning by grid search (could try `RandomizedSearchCV`)

### Grid Search

In [47]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C':[1.0, 10.0, 100.0, 1000.0], 'gamma':[0.001, 0.0001], 'kernel':['rbf', 'linear']}

In [48]:
svm = SVC()

In [None]:
grid_search = GridSearchCV(svm, param_grid, cv = 5, scoring = 'f1',return_train_score=True)
x = grid_search.fit(X_train_selected,y_train)

In [52]:
# scores with each set of parameters
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

0.0 {'C': 1.0, 'gamma': 0.001, 'kernel': 'rbf'}
0.07170820949513906 {'C': 1.0, 'gamma': 0.001, 'kernel': 'linear'}
0.0 {'C': 1.0, 'gamma': 0.0001, 'kernel': 'rbf'}
0.07170820949513906 {'C': 1.0, 'gamma': 0.0001, 'kernel': 'linear'}
0.0 {'C': 10.0, 'gamma': 0.001, 'kernel': 'rbf'}
0.09285023017898461 {'C': 10.0, 'gamma': 0.001, 'kernel': 'linear'}
0.0 {'C': 10.0, 'gamma': 0.0001, 'kernel': 'rbf'}
0.09285023017898461 {'C': 10.0, 'gamma': 0.0001, 'kernel': 'linear'}
0.026202303794624328 {'C': 100.0, 'gamma': 0.001, 'kernel': 'rbf'}
0.0937138866653483 {'C': 100.0, 'gamma': 0.001, 'kernel': 'linear'}
0.0 {'C': 100.0, 'gamma': 0.0001, 'kernel': 'rbf'}
0.0937138866653483 {'C': 100.0, 'gamma': 0.0001, 'kernel': 'linear'}
0.08044069072199443 {'C': 1000.0, 'gamma': 0.001, 'kernel': 'rbf'}
0.09369123866063753 {'C': 1000.0, 'gamma': 0.001, 'kernel': 'linear'}
0.026202303794624328 {'C': 1000.0, 'gamma': 0.0001, 'kernel': 'rbf'}
0.09369123866063753 {'C': 1000.0, 'gamma': 0.0001, 'kernel': 'linear'}


In [53]:
# best parameters
grid_search.best_params_

{'C': 100.0, 'gamma': 0.001, 'kernel': 'linear'}

In [54]:
# store best model
svm_tuned = grid_search.best_estimator_

In [58]:
# cross validation with f1 scoring
score = utils.f1_cv(svm_tuned, X_train, y_train)

[0.1009 0.0881 0.0902 0.101  0.0883]
0.0937


## Test

In [56]:
# test the performance of the selected model
y_pred = svm_tuned.predict(X_test.drop(columns = correlated_features))

# scores
utils.pred_metrics(y_test, y_pred)

# confusion matrix
utils.cm_plot(y_test,y_pred)

Accuracy:	0.9335368568339375
Precision:	0.8225806451612904
Recall:		0.047004608294930875
F1:		0.08892763731473408
