In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [3]:
df = pd.read_csv('./Vehicle.csv')
df.head()

Unnamed: 0,Comp,Circ,D.Circ,Rad.Ra,Pr.Axis.Ra,Max.L.Ra,Scat.Ra,Elong,Pr.Axis.Rect,Max.L.Rect,Sc.Var.Maxis,Sc.Var.maxis,Ra.Gyr,Skew.Maxis,Skew.maxis,Kurt.maxis,Kurt.Maxis,Holl.Ra,Class
0,95,48,83,178,72,10,162,42,20,159,176,379,184,70,6,16,187,197,van
1,91,41,84,141,57,9,149,45,19,143,170,330,158,72,9,14,189,199,van
2,104,50,106,209,66,10,207,32,23,158,223,635,220,73,14,9,188,196,saab
3,93,41,82,159,63,9,144,46,19,143,160,309,127,63,6,10,199,207,van
4,85,44,70,205,103,52,149,45,19,144,241,325,188,127,9,11,180,183,bus


In [4]:
df.isnull().sum()

Comp            0
Circ            0
D.Circ          0
Rad.Ra          0
Pr.Axis.Ra      0
Max.L.Ra        0
Scat.Ra         0
Elong           0
Pr.Axis.Rect    0
Max.L.Rect      0
Sc.Var.Maxis    0
Sc.Var.maxis    0
Ra.Gyr          0
Skew.Maxis      0
Skew.maxis      0
Kurt.maxis      0
Kurt.Maxis      0
Holl.Ra         0
Class           0
dtype: int64

**No missing data**

In [5]:
df['Class'].unique()

array(['van', 'saab', 'bus', 'opel'], dtype=object)

In [6]:
X = df.drop('Class', axis=1)
y = df['Class']
le = LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.30, random_state=42,
                                                    stratify=y)

In [7]:
pipeline1 = Pipeline([
    ('clf', SVC(probability=True, random_state=42))
])
Kfold = KFold(n_splits=5, shuffle=True, random_state=42)

### SVM w/o scaling Linear

In [8]:
params_1 = {
    'clf__C': np.linspace(0.001, 5, 10),
    'clf__kernel': ['linear'],
    'clf__decision_function_shape': ['ovo', 'ovr']
}
gcv1 = GridSearchCV(pipeline1, params_1, cv=Kfold, scoring='neg_log_loss')
gcv1.fit(X, y)

print(f"Best parameters for SVM without scaling in linear on Y: {gcv1.best_params_}")
print(f"Best score for SVM without scaling in linear on Y: {gcv1.best_score_}")

Best parameters for SVM without scaling in linear on Y: {'clf__C': 0.5564444444444444, 'clf__decision_function_shape': 'ovo', 'clf__kernel': 'linear'}
Best score for SVM without scaling in linear on Y: -0.44452531468291845


In [9]:
bm1 = gcv1.best_estimator_
y_pred1 = bm1.predict(X_test)
accuracy_score(y_test, y_pred1)

0.8818897637795275

### SVM w/o scaling RBF

In [10]:
params2 = {
    'clf__C': np.linspace(0.001, 5, 10),
    'clf__gamma': list(np.linspace(0.001, 5, 10)) + ['scale', 'auto'],
    'clf__decision_function_shape': ['ovo', 'ovr']
}
gcv2 = GridSearchCV(pipeline1, param_grid=params2, cv=Kfold, scoring='neg_log_loss')
gcv2.fit(X, y)

In [11]:
print(f"Best parameters for SVM without scaling in RBF on Y: {gcv2.best_params_}")
print(f"Best score for SVM without scaling in RBF on Y: {gcv2.best_score_}")

Best parameters for SVM without scaling in RBF on Y: {'clf__C': 5.0, 'clf__decision_function_shape': 'ovo', 'clf__gamma': 0.001}
Best score for SVM without scaling in RBF on Y: -0.537176737376624


In [12]:
bm2 = gcv2.best_estimator_
y_pred2 = bm2.predict(X_test)
accuracy_score(y_test, y_pred2)

0.984251968503937

### SVM with scaling Linear

In [13]:
pipeline2 = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', SVC(probability=True, random_state=42))
])

In [14]:
params3 = {
    'scaler': [StandardScaler(), MinMaxScaler()],
    'clf__C': np.linspace(0.001, 5, 10),
    'clf__kernel': ['linear'],
    'clf__gamma': list(np.linspace(0.001, 5, 10)) + ['scale', 'auto'],
    'clf__decision_function_shape': ['ovo', 'ovr']
}
gcv3 = GridSearchCV(pipeline2, param_grid=params3, cv=Kfold, scoring='neg_log_loss')
gcv3.fit(X, y)

print(f"Best parameters for SVM without scaling in RBF on Y: {gcv3.best_params_}")
print(f"Best score for SVM without scaling in RBF on Y: {gcv3.best_score_}")

Best parameters for SVM without scaling in RBF on Y: {'clf__C': 3.8891111111111107, 'clf__decision_function_shape': 'ovo', 'clf__gamma': 0.001, 'clf__kernel': 'linear', 'scaler': StandardScaler()}
Best score for SVM without scaling in RBF on Y: -0.4193231742289689


In [15]:
bm3 = gcv3.best_estimator_
y_pred3 = bm3.predict(X_test)
accuracy_score(y_test, y_pred3)

0.8661417322834646

### SVM with scaling RBF

In [16]:
params4 = {
    'scaler': [StandardScaler(), MinMaxScaler()],
    'clf__C': np.linspace(0.001, 5, 10),
    'clf__gamma': list(np.linspace(0.001, 5, 10)) + ['scale', 'auto'],
    'clf__decision_function_shape': ['ovo', 'ovr']
}
gcv4 = GridSearchCV(pipeline2, param_grid=params4, cv=Kfold, scoring='neg_log_loss')
gcv4.fit(X, y)

print(f"Best parameters for SVM without scaling in RBF on Y: {gcv4.best_params_}")
print(f"Best score for SVM without scaling in RBF on Y: {gcv4.best_score_}")

Best parameters for SVM without scaling in RBF on Y: {'clf__C': 5.0, 'clf__decision_function_shape': 'ovo', 'clf__gamma': 'scale', 'scaler': StandardScaler()}
Best score for SVM without scaling in RBF on Y: -0.40090238571477804


In [17]:
bm4 = gcv4.best_estimator_
y_pred4 = bm4.predict(X_test)
accuracy_score(y_test, y_pred4)

0.9094488188976378