## Data Preparation

In [3]:
# Import packages
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.stats import reciprocal, uniform

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

import random
random.seed(666)

In [19]:
# Load data and extract Xs and Ys
df = pd.read_excel('dataclus1.xlsx')

X = df.iloc[:,1:5]



for col in df.columns[6:]:
  df[col] = df[col].astype('category')


y2 = df.iloc[:,5]
y3 = df.iloc[:,6]
y4 = df.iloc[:,7]
y5 = df.iloc[:,8]
y6 = df.iloc[:,9]
y7 = df.iloc[:,10]
y_all = [y2,y3,y4,y5,y6,y7]

## Implement algorithm

In [14]:
# Train and test split
X_train, X_test, y_train, y_test = train_test_split(X,y4,test_size = 0.30, random_state = 666)

# Train the model on train set
model = SVC()
model.fit(X_train, y_train)
 
# print prediction results
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           1       0.87      0.58      0.69       151
           2       0.92      0.95      0.93       523
           3       0.79      0.88      0.83        56
           4       0.81      0.90      0.86       280

    accuracy                           0.88      1010
   macro avg       0.85      0.83      0.83      1010
weighted avg       0.88      0.88      0.87      1010



## Hyperparameter Tuning

### 1. Grid Search

In [15]:
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear','poly','rbf','sigmoid']}
 
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 1)
 
# fitting the model for grid search
grid.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed: 57.2min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [16]:
# print best parameter after tuning
print(grid.best_params_)
 
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

grid_predictions = grid.predict(X_test)
 
# print classification report
print(classification_report(y_test, grid_predictions))

{'C': 1000, 'gamma': 0.1, 'kernel': 'rbf'}
SVC(C=1000, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
              precision    recall  f1-score   support

           1       0.86      0.91      0.88       151
           2       0.99      0.97      0.98       523
           3       0.93      0.91      0.92        56
           4       0.94      0.95      0.95       280

    accuracy                           0.95      1010
   macro avg       0.93      0.94      0.93      1010
weighted avg       0.95      0.95      0.95      1010



### 2. Randomized Search

In [20]:
# defining parameter range
param_distributions = {"gamma": reciprocal(0.001, 0.1), "C": uniform(1, 1000),'kernel': ['linear','poly','rbf','sigmoid']}
 
grid = RandomizedSearchCV(SVC(), param_distributions, refit = True, verbose = 1)
 
# fitting the model for grid search
grid.fit(X_train, y_train)

# print best parameter after tuning
print(grid.best_params_)
 
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

grid_predictions = grid.predict(X_test)
 
# print classification report
print(classification_report(y_test, grid_predictions))

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


{'C': 968.9793147036413, 'gamma': 0.008726839750960865, 'kernel': 'rbf'}
SVC(C=968.9793147036413, break_ties=False, cache_size=200, class_weight=None,
    coef0=0.0, decision_function_shape='ovr', degree=3,
    gamma=0.008726839750960865, kernel='rbf', max_iter=-1, probability=False,
    random_state=None, shrinking=True, tol=0.001, verbose=False)
              precision    recall  f1-score   support

           1       0.90      0.81      0.85       151
           2       0.98      0.99      0.98       523
           3       0.96      0.89      0.93        56
           4       0.91      0.96      0.94       280

    accuracy                           0.95      1010
   macro avg       0.94      0.91      0.92      1010
weighted avg       0.95      0.95      0.95      1010



[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   14.6s finished


## Apply on different clusters

### One-vs-Rest

In [26]:
# Define a function 
def SVC_R(X,y):
# Train and test split
  X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.30, random_state = 666)

# Train the model on train set
  model = SVC(kernel='rbf', gamma=0.1, C=1000)
  model.fit(X_train, y_train)
 
# print prediction results
  predictions = model.predict(X_test)
  print(f'the accuracy report is')
  print(classification_report(y_test, predictions))
  print(f'the confusion matrix is')
  print(confusion_matrix(y_test, predictions))

In [27]:
# Apply on all y values
y_all = [y2,y3,y4,y5,y6,y7]

for i in y_all:
  a = SVC_R(X,i)
  print()
  print(a)
  print()

the accuracy report is
              precision    recall  f1-score   support

           1       0.96      0.89      0.93       223
           2       0.97      0.99      0.98       787

    accuracy                           0.97      1010
   macro avg       0.97      0.94      0.95      1010
weighted avg       0.97      0.97      0.97      1010

the confusion matrix is
[[199  24]
 [  8 779]]

None

the accuracy report is
              precision    recall  f1-score   support

           1       0.96      0.98      0.97       500
           2       0.93      0.96      0.94       314
           3       0.95      0.85      0.90       196

    accuracy                           0.95      1010
   macro avg       0.95      0.93      0.94      1010
weighted avg       0.95      0.95      0.95      1010

the confusion matrix is
[[492   5   3]
 [  8 301   5]
 [ 11  19 166]]

None

the accuracy report is
              precision    recall  f1-score   support

           1       0.86      0.91    

### One-vs-One

In [29]:
# Define a function 
def SVC_O(X,y):
# Train and test split
  X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.30, random_state = 666)

# Train the model on train set
  model = SVC(kernel='rbf', gamma=0.1, C=1000,decision_function_shape='ovo')
  model.fit(X_train, y_train)
 
# print prediction results
  predictions = model.predict(X_test)
  print(f'the accuracy report is')
  print(classification_report(y_test, predictions))
  print(f'the confusion matrix is')
  print(confusion_matrix(y_test, predictions))

In [30]:
# Apply on all y values
y_all = [y2,y3,y4,y5,y6,y7]

for i in y_all:
  a = SVC_O(X,i)
  print()
  print(a)
  print()

the accuracy report is
              precision    recall  f1-score   support

           1       0.96      0.89      0.93       223
           2       0.97      0.99      0.98       787

    accuracy                           0.97      1010
   macro avg       0.97      0.94      0.95      1010
weighted avg       0.97      0.97      0.97      1010

the confusion matrix is
[[199  24]
 [  8 779]]

None

the accuracy report is
              precision    recall  f1-score   support

           1       0.96      0.98      0.97       500
           2       0.93      0.96      0.94       314
           3       0.95      0.85      0.90       196

    accuracy                           0.95      1010
   macro avg       0.95      0.93      0.94      1010
weighted avg       0.95      0.95      0.95      1010

the confusion matrix is
[[492   5   3]
 [  8 301   5]
 [ 11  19 166]]

None

the accuracy report is
              precision    recall  f1-score   support

           1       0.86      0.91    

## Reference: 
1. https://www.geeksforgeeks.org/svm-hyperparameter-tuning-using-gridsearchcv-ml/

2. https://towardsdatascience.com/hyperparameter-tuning-with-python-keras-xgboost-guide-7cb3ef480f9c

3. https://towardsdatascience.com/gradient-boosting-classification-explained-through-python-60cc980eeb3d

4. https://medium.com/analytics-vidhya/hyperparameter-tuning-an-svm-a-demonstration-using-hyperparameter-tuning-cross-validation-on-96b05db54e5b

5. https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html

6. https://stackoverflow.com/questions/66617194/passing-random-variables-to-sklearn-random-search-randomizedsearchcv
