<a href="https://colab.research.google.com/github/sasansharee/Sasan_MMA_Projects/blob/main/Auto_Scout_project_SVM_Classification_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Disable pandas' display truncation: show every row and every column.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [None]:
# Load the dataset from a CSV file located in the current working directory.
# (The file name suggests it is already cleaned, outlier-handled and
# dummy-encoded — confirm against the notebook that produced it.)
df_org = pd.read_csv('df_33_cleaned_filled_outlier_dummies.csv')

In [None]:
# Work on a copy so the frame as loaded (df_org) stays unmodified.
df = df_org.copy()

**Train Test Split**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target: the 'fuel_Diesel' column; features: every other column of df.
# NOTE(review): if df still contains sibling one-hot columns derived from the
# same source feature (e.g. other 'fuel_*' dummies), they leak the target into
# X and inflate every score below — confirm the column list.
X = df.drop(['fuel_Diesel'], axis = 1)
y = df['fuel_Diesel']

In [None]:
# Stratified split (class ratio of y preserved in both parts) with a fixed seed;
# test_size is left at the scikit-learn default.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

### **Modelling and Model Performance**

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
def eval_metric(model, X_train, y_train, X_test, y_test):
    """Print confusion matrix and classification report for test and train data.

    The fitted ``model`` predicts on ``X_train`` and then on ``X_test``.
    The test-set report is printed first, followed by a blank line and the
    train-set report. Nothing is returned.

    Parameters
    ----------
    model : object exposing ``predict(X)``
    X_train, y_train : training features and true labels
    X_test, y_test : test features and true labels
    """
    # Keep the original call order: training split first, then test split.
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    report_sections = (
        ('Test_Set:', y_test, test_predictions),
        ('Train_Set:', y_train, train_predictions),
    )
    for position, (heading, y_true, y_hat) in enumerate(report_sections):
        if position > 0:
            # Blank separator line between the two sections.
            print()
        print(heading)
        print(confusion_matrix(y_true, y_hat))
        print(classification_report(y_true, y_hat))

In [None]:
# Sanity check: dimensions of the feature matrices and label vectors of each split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((11939, 138), (3980, 138), (11939,), (3980,))

In [None]:
# Number of positive (1) labels and their share in each split; the two shares
# should match because the split was stratified on y.
sum(y_train), sum(y_test), sum(y_train) / len(y_train), sum(y_test) / len(y_test)

(5474, 1825, 0.4584973615880727, 0.4585427135678392)

**Without Scaling**

In [None]:
# Baseline: SVC with default hyper-parameters, fitted on the unscaled features.
svc = SVC(random_state = 42)
svc.fit(X_train, y_train)

In [None]:
# Confusion matrix and classification report of the unscaled baseline on both splits.
eval_metric(svc, X_train, y_train, X_test, y_test)

Test_Set:
[[1670  485]
 [ 574 1251]]
              precision    recall  f1-score   support

           0       0.74      0.77      0.76      2155
           1       0.72      0.69      0.70      1825

    accuracy                           0.73      3980
   macro avg       0.73      0.73      0.73      3980
weighted avg       0.73      0.73      0.73      3980


Train_Set:
[[5042 1423]
 [1727 3747]]
              precision    recall  f1-score   support

           0       0.74      0.78      0.76      6465
           1       0.72      0.68      0.70      5474

    accuracy                           0.74     11939
   macro avg       0.73      0.73      0.73     11939
weighted avg       0.74      0.74      0.74     11939



In [None]:
from sklearn.model_selection import cross_validate

In [None]:
# 10-fold cross-validation of the unscaled baseline on the training split only,
# scored on precision, recall and F1. The result is a dict of per-fold arrays.
scores_svc = cross_validate(svc, X_train, y_train, scoring = ['precision', 'recall', 'f1'], cv = 10)
scores_svc

{'fit_time': array([34.12228632, 18.96719432, 11.92004681, 11.90899467, 10.94027042,
        13.95889878, 14.65532112, 12.9594326 , 16.65679836, 15.8993268 ]),
 'score_time': array([3.39295292, 0.98198748, 0.95843363, 0.98391175, 1.15312719,
        1.1050899 , 1.03825903, 1.23225236, 1.53352189, 1.28563714]),
 'test_precision': array([0.72900763, 0.72064777, 0.72275335, 0.73306773, 0.71968191,
        0.73120301, 0.74451098, 0.6969697 , 0.69074074, 0.72277228]),
 'test_recall': array([0.69708029, 0.64963504, 0.68978102, 0.67153285, 0.66179159,
        0.71115174, 0.68190128, 0.67276051, 0.68190128, 0.66727605]),
 'test_f1': array([0.71268657, 0.68330134, 0.70588235, 0.70095238, 0.68952381,
        0.721038  , 0.71183206, 0.68465116, 0.68629255, 0.69391635])}

In [None]:
# One row per CV fold, numbered from 1. The index is derived from the result
# itself, so this cell keeps working if `cv` is changed in cross_validate.
scores_svc_df = pd.DataFrame(scores_svc)
scores_svc_df.index = range(1, len(scores_svc_df) + 1)
# Average only the metric columns, selected by name rather than by position so
# that timing columns (or any extra keys cross_validate may return) are never
# mixed into the summary.
scores_svc_df.filter(like='test_').mean()

test_precision    0.721136
test_recall       0.678481
test_f1           0.699008
dtype: float64

**With Scaling**

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
# Standardise the features, then fit an SVC with default hyper-parameters.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC()),
])

In [None]:
# Fit scaler and classifier together; the scaler is fitted on X_train only.
pipe.fit(X_train, y_train)

In [None]:
# Confusion matrix and classification report of the scaled pipeline on both splits.
eval_metric(pipe, X_train, y_train, X_test, y_test)

Test_Set:
[[2118   37]
 [  22 1803]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      2155
           1       0.98      0.99      0.98      1825

    accuracy                           0.99      3980
   macro avg       0.98      0.99      0.99      3980
weighted avg       0.99      0.99      0.99      3980


Train_Set:
[[6426   39]
 [  50 5424]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      6465
           1       0.99      0.99      0.99      5474

    accuracy                           0.99     11939
   macro avg       0.99      0.99      0.99     11939
weighted avg       0.99      0.99      0.99     11939



In [None]:
# 10-fold cross-validation of the scaled pipeline on the training split; the
# whole pipeline (scaler included) is re-fitted on each fold's training part.
scores_svc_pipe = cross_validate(pipe, X_train, y_train, scoring = ['precision', 'recall', 'f1'], cv = 10)
scores_svc_pipe

{'fit_time': array([5.38705206, 5.51683736, 5.53672957, 5.62089252, 5.71801114,
        5.80050278, 5.65854859, 5.66220903, 5.47935939, 5.61988711]),
 'score_time': array([0.38601112, 0.36388302, 0.40809131, 0.4080143 , 0.38178134,
        0.38871837, 0.41097283, 0.37136316, 0.43110347, 0.24242973]),
 'test_precision': array([0.98188406, 0.98348624, 0.98025135, 0.97644928, 0.97449909,
        0.9801085 , 0.97996357, 0.98710866, 0.96953405, 0.97632058]),
 'test_recall': array([0.98905109, 0.97810219, 0.99635036, 0.98357664, 0.97806216,
        0.99085923, 0.98354662, 0.97989031, 0.98903108, 0.97989031]),
 'test_f1': array([0.98545455, 0.98078683, 0.98823529, 0.98      , 0.97627737,
        0.98545455, 0.98175182, 0.98348624, 0.97918552, 0.97810219])}

In [None]:
# One row per CV fold, numbered from 1. The index is derived from the result
# itself, so this cell keeps working if `cv` is changed in cross_validate.
scores_svc_pipe_df = pd.DataFrame(scores_svc_pipe)
scores_svc_pipe_df.index = range(1, len(scores_svc_pipe_df) + 1)
# Average only the metric columns, selected by name rather than by position so
# that timing columns (or any extra keys cross_validate may return) are never
# mixed into the summary.
scores_svc_pipe_df.filter(like='test_').mean()

test_precision    0.978961
test_recall       0.984836
test_f1           0.981873
dtype: float64

**With Best Parameters (GridSearchCV)**

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Pipeline tuned by the grid search: standardise the features, then fit an SVC
# with balanced class weights.
# NOTE: probability=True makes every fit slower (scikit-learn runs an internal
# cross-validation to calibrate probabilities); it is only required if
# predict_proba is called on the fitted model.
pipe2 = Pipeline([
    ('scaler', StandardScaler()),
    ('SVC', SVC(class_weight='balanced', probability=True, random_state=42)),
])

In [None]:
# Two sub-grids, one per kernel. `gamma` is not used by the linear kernel, so
# crossing it with 'linear' only refits the same model once per gamma value
# (four identical candidates per C). Keeping gamma out of the linear sub-grid
# reduces the search from 24 to 15 candidates without dropping any distinct model.
C_VALUES = [0.001, 0.01, 0.05]  # same three values as before, in ascending order

param_grid = [
    {'SVC__kernel': ['rbf'],
     'SVC__C': C_VALUES,
     'SVC__gamma': ['scale', 'auto', 0.2, 0.3]},
    {'SVC__kernel': ['linear'],
     'SVC__C': C_VALUES},
]

In [None]:
# Exhaustive search over param_grid with 10-fold CV, keeping the candidate with
# the highest mean recall. n_jobs=-1 runs the independent (candidate, fold)
# fits on all available cores; the selected model and its scores are unchanged.
# NOTE(review): recall on its own is maximised by a model that labels every
# sample positive, so also check precision/F1 of the winning candidate.
svm_grid = GridSearchCV(pipe2, param_grid, scoring='recall', cv=10, n_jobs=-1)
svm_grid.fit(X_train, y_train)


In [None]:
# Hyper-parameter combination with the highest mean cross-validated recall.
svm_grid.best_params_

{'SVC__C': 0.05, 'SVC__gamma': 'scale', 'SVC__kernel': 'linear'}

In [None]:
# Position of the best combination within cv_results_.
svm_grid.best_index_

np.int64(9)

In [None]:
# Mean cross-validated recall achieved by the best combination.
svm_grid.best_score_

np.float64(0.9819136230801051)

In [None]:
# Per-candidate cross-validation results (timings, parameters, per-fold scores) as a DataFrame.
svm_grid_df = pd.DataFrame(svm_grid.cv_results_)

In [None]:
# Display the results table; pandas will not truncate it because the display
# limits were set to None earlier in the notebook.
svm_grid_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_SVC__C,param_SVC__gamma,param_SVC__kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,128.530412,14.001681,2.261751,0.211929,0.001,scale,rbf,"{'SVC__C': 0.001, 'SVC__gamma': 'scale', 'SVC_...",1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.5,19
1,34.314155,0.532184,0.143551,0.011163,0.001,scale,linear,"{'SVC__C': 0.001, 'SVC__gamma': 'scale', 'SVC_...",0.976277,0.963504,0.981752,0.968978,0.972578,0.978062,0.972578,0.968921,0.967093,0.972578,0.972232,0.005152,9
2,140.451139,7.341059,2.347939,0.09486,0.001,auto,rbf,"{'SVC__C': 0.001, 'SVC__gamma': 'auto', 'SVC__...",1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.5,19
3,31.292923,0.815972,0.148365,0.0116,0.001,auto,linear,"{'SVC__C': 0.001, 'SVC__gamma': 'auto', 'SVC__...",0.976277,0.963504,0.981752,0.968978,0.972578,0.978062,0.972578,0.968921,0.967093,0.972578,0.972232,0.005152,9
4,144.761409,8.09494,2.364609,0.071534,0.001,0.2,rbf,"{'SVC__C': 0.001, 'SVC__gamma': 0.2, 'SVC__ker...",1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.5,19
5,32.906212,2.56036,0.140664,0.021028,0.001,0.2,linear,"{'SVC__C': 0.001, 'SVC__gamma': 0.2, 'SVC__ker...",0.976277,0.963504,0.981752,0.968978,0.972578,0.978062,0.972578,0.968921,0.967093,0.972578,0.972232,0.005152,9
6,160.975164,5.401573,2.384593,0.100455,0.001,0.3,rbf,"{'SVC__C': 0.001, 'SVC__gamma': 0.3, 'SVC__ker...",1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.5,19
7,35.113424,1.16895,0.138553,0.009088,0.001,0.3,linear,"{'SVC__C': 0.001, 'SVC__gamma': 0.3, 'SVC__ker...",0.976277,0.963504,0.981752,0.968978,0.972578,0.978062,0.972578,0.968921,0.967093,0.972578,0.972232,0.005152,9
8,82.795543,5.187467,0.952798,0.095275,0.05,scale,rbf,"{'SVC__C': 0.05, 'SVC__gamma': 'scale', 'SVC__...",0.963504,0.95438,0.967153,0.95073,0.963437,0.961609,0.95064,0.957952,0.967093,0.959781,0.959628,0.005798,13
9,10.370149,0.282218,0.038577,0.002919,0.05,scale,linear,"{'SVC__C': 0.05, 'SVC__gamma': 'scale', 'SVC__...",0.981752,0.976277,0.990876,0.983577,0.981718,0.985375,0.978062,0.978062,0.983547,0.97989,0.981914,0.004041,1


In [None]:
# Evaluate the tuned model on both splits; predictions come from the best
# estimator that GridSearchCV refits on the full training split.
eval_metric(svm_grid, X_train, y_train, X_test, y_test)

Test_Set:
[[2117   38]
 [  24 1801]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      2155
           1       0.98      0.99      0.98      1825

    accuracy                           0.98      3980
   macro avg       0.98      0.98      0.98      3980
weighted avg       0.98      0.98      0.98      3980


Train_Set:
[[6365  100]
 [  95 5379]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      6465
           1       0.98      0.98      0.98      5474

    accuracy                           0.98     11939
   macro avg       0.98      0.98      0.98     11939
weighted avg       0.98      0.98      0.98     11939



In [None]:
# Stand-alone pipeline with the hyper-parameters reported by the grid search,
# written out explicitly so it can be cross-validated without re-running the search.
pipe3 = Pipeline([
    ('scaler', StandardScaler()),
    ('SVC', SVC(
        kernel='linear',
        C=0.05,
        gamma='scale',
        class_weight='balanced',
        probability=True,
        random_state=42,
    )),
])

In [None]:
# 10-fold cross-validation of the tuned pipeline on the training split,
# scored on precision, recall and F1.
scores_grid = cross_validate(pipe3, X_train, y_train, scoring = ['precision', 'recall', 'f1'], cv = 10)

In [None]:
# One row per CV fold, numbered from 1. The index is derived from the result
# itself, so this cell keeps working if `cv` is changed in cross_validate.
scores_grid_df = pd.DataFrame(scores_grid)
scores_grid_df.index = range(1, len(scores_grid_df) + 1)
scores_grid_df

Unnamed: 0,fit_time,score_time,test_precision,test_recall,test_f1
1,10.509845,0.046298,0.976407,0.981752,0.979072
2,10.116909,0.035883,0.987085,0.976277,0.981651
3,11.389016,0.048367,0.983696,0.990876,0.987273
4,10.213261,0.037666,0.981785,0.983577,0.98268
5,10.689426,0.042743,0.971067,0.981718,0.976364
6,11.356061,0.043047,0.987179,0.985375,0.986276
7,10.581424,0.039154,0.969203,0.978062,0.973612
8,11.58908,0.040922,0.987085,0.978062,0.982553
9,11.789468,0.041926,0.971119,0.983547,0.977293
10,11.566966,0.044797,0.97989,0.97989,0.97989


In [None]:
# Average the metric columns across folds. They are selected by name rather
# than by position, so timing columns (or any extra keys cross_validate may
# return) are never mixed into the summary.
scores_grid_df.filter(like='test_').mean()

test_precision    0.979452
test_recall       0.981914
test_f1           0.980666
dtype: float64

In [None]:
# Signed decision scores of the tuned model for the test split; in the output
# shown, negative scores line up with predicted class 0 and positive with class 1.
svm_grid.decision_function(X_test)

array([-6.84622144,  2.78820043,  1.54324522, ..., -3.41076645,
       -3.16522009, -4.19779644])

In [None]:
# Hard class predictions (0/1) of the tuned model for the test split.
svm_grid.predict(X_test)

array([0, 1, 1, ..., 0, 0, 0])