In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv("C:/Users/Administrator.DAI-PC2/Desktop/ML/Day 5/Kyphosis.csv")
le = LabelEncoder()
y = le.fit_transform(df["Kyphosis"])
X = df.drop("Kyphosis", axis = 1)

svc = SVC(C= 0.1, kernel = 'linear', probability = True, random_state=24)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=24, stratify=y)

kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state=24)

svc.fit(X_train, y_train)

y_pred = svc.predict(X_test)
print(accuracy_score(y_test, y_pred))

y_pred_prob = svc.predict_proba(X_test)
print(log_loss(y_test, y_pred_prob))

0.72
0.4164581753857312


In [3]:
# SVC

params = {'C': [0.1, 0.5, 1, 1.5, 2, 3]}
#params = {'C': np.linspace(0.001, 5, 10)}
gcv = GridSearchCV(svc, param_grid = params, cv = kfold, scoring = 'neg_log_loss')
gcv.fit(X, y)
print(gcv.best_score_)
print(gcv.best_params_)
pd_cv = pd.DataFrame(gcv.cv_results_)

-0.4480718873347128
{'C': 0.1}


In [4]:
pd_cv

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.020312,0.00478,0.003783,0.000745,0.1,{'C': 0.1},-0.457968,-0.380817,-0.477345,-0.410142,-0.514088,-0.448072,0.047486,1
1,0.049185,0.017354,0.002988,2e-06,0.5,{'C': 0.5},-0.456996,-0.394854,-0.475039,-0.404974,-0.585428,-0.463458,0.068087,2
2,0.090601,0.030133,0.003386,0.000488,1.0,{'C': 1},-0.458763,-0.394908,-0.478519,-0.405092,-0.599656,-0.467388,0.073249,3
3,0.125897,0.050795,0.003187,0.000399,1.5,{'C': 1.5},-0.459049,-0.394926,-0.478536,-0.405091,-0.599658,-0.467452,0.073241,4
4,0.167667,0.074285,0.003185,0.000398,2.0,{'C': 2},-0.459047,-0.394935,-0.478519,-0.405144,-0.59966,-0.467461,0.07323,5
5,0.216652,0.104025,0.003186,0.000399,3.0,{'C': 3},-0.459046,-0.395388,-0.478513,-0.405123,-0.599637,-0.467541,0.073135,6


In [5]:
# SVC with scaling using pipeline

std_scaler = StandardScaler()
std_mm = MinMaxScaler()
pipe = Pipeline([('SCL', None), ('SVC',svc)])
params = {'SVC__C': np.linspace(0.001, 5, 20), 'SCL':[std_scaler, std_mm, None]}
gcv = GridSearchCV(pipe, param_grid = params, cv = kfold, scoring = 'neg_log_loss')
gcv.fit(X, y)
print(gcv.best_score_)
print(gcv.best_params_)

-0.45028007376389756
{'SCL': StandardScaler(), 'SVC__C': 0.5272105263157895}


In [6]:
#kernel = poly

svc1 = SVC(C= 0.1, kernel = 'poly', probability = True, random_state=24)

pipe = Pipeline([('SCL', None), ('SVC',svc1)])
params = {'SVC__C': np.linspace(0.001, 5, 20), 'SCL':[std_scaler, std_mm, None], 'SVC__degree' : [2,3], 'SVC__coef0': np.linspace(0, 3, 5)}
gcv = GridSearchCV(pipe, param_grid = params, cv = kfold, scoring = 'neg_log_loss', verbose = 2)
gcv.fit(X, y)
print(gcv.best_score_)
print(gcv.best_params_)

Fitting 5 folds for each of 600 candidates, totalling 3000 fits
[CV] END SCL=StandardScaler(), SVC__C=0.001, SVC__coef0=0.0, SVC__degree=2; total time=   0.0s
[CV] END SCL=StandardScaler(), SVC__C=0.001, SVC__coef0=0.0, SVC__degree=2; total time=   0.0s
[CV] END SCL=StandardScaler(), SVC__C=0.001, SVC__coef0=0.0, SVC__degree=2; total time=   0.0s
[CV] END SCL=StandardScaler(), SVC__C=0.001, SVC__coef0=0.0, SVC__degree=2; total time=   0.0s
[CV] END SCL=StandardScaler(), SVC__C=0.001, SVC__coef0=0.0, SVC__degree=2; total time=   0.0s
[CV] END SCL=StandardScaler(), SVC__C=0.001, SVC__coef0=0.0, SVC__degree=3; total time=   0.0s
[CV] END SCL=StandardScaler(), SVC__C=0.001, SVC__coef0=0.0, SVC__degree=3; total time=   0.0s
[CV] END SCL=StandardScaler(), SVC__C=0.001, SVC__coef0=0.0, SVC__degree=3; total time=   0.0s
[CV] END SCL=StandardScaler(), SVC__C=0.001, SVC__coef0=0.0, SVC__degree=3; total time=   0.0s
[CV] END SCL=StandardScaler(), SVC__C=0.001, SVC__coef0=0.0, SVC__degree=3; total

In [7]:
#kernel = rbf

svc2 = SVC(C= 0.1, kernel = 'rbf', probability = True, random_state=24)

pipe = Pipeline([('SCL', None), ('SVC',svc2)])
params = {'SVC__C': np.linspace(0.001, 5, 20), 'SCL':[std_scaler, std_mm, None], 'SVC__degree' : [2,3], 'SVC__gamma': np.linspace(0.001, 5, 5)}
gcv = GridSearchCV(pipe, param_grid = params, cv = kfold, scoring = 'neg_log_loss', verbose = 2)
gcv.fit(X, y)
print(gcv.best_score_)
print(gcv.best_params_)

Fitting 5 folds for each of 600 candidates, totalling 3000 fits
[CV] END SCL=StandardScaler(), SVC__C=0.001, SVC__degree=2, SVC__gamma=0.001; total time=   0.0s
[CV] END SCL=StandardScaler(), SVC__C=0.001, SVC__degree=2, SVC__gamma=0.001; total time=   0.0s
[CV] END SCL=StandardScaler(), SVC__C=0.001, SVC__degree=2, SVC__gamma=0.001; total time=   0.0s
[CV] END SCL=StandardScaler(), SVC__C=0.001, SVC__degree=2, SVC__gamma=0.001; total time=   0.0s
[CV] END SCL=StandardScaler(), SVC__C=0.001, SVC__degree=2, SVC__gamma=0.001; total time=   0.0s
[CV] END SCL=StandardScaler(), SVC__C=0.001, SVC__degree=2, SVC__gamma=1.2507499999999998; total time=   0.0s
[CV] END SCL=StandardScaler(), SVC__C=0.001, SVC__degree=2, SVC__gamma=1.2507499999999998; total time=   0.0s
[CV] END SCL=StandardScaler(), SVC__C=0.001, SVC__degree=2, SVC__gamma=1.2507499999999998; total time=   0.0s
[CV] END SCL=StandardScaler(), SVC__C=0.001, SVC__degree=2, SVC__gamma=1.2507499999999998; total time=   0.0s
[CV] END SC