In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
%matplotlib inline

In [None]:
# Read vehicle.csv (path relative to the notebook's working directory) into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer='vehicle.csv')

In [None]:
# Preview the first five rows.
dataset.head(n=5)

In [None]:
# (number of rows, number of columns) of the loaded frame.
dataset.shape

In [None]:
# Summary statistics, transposed so that each row describes one column.
dataset.describe().T

In [None]:
# Storage dtype of every column.
dataset.dtypes

In [None]:
# Number of rows per value of the 'class' column.
class_counts = dataset['class'].value_counts()
class_counts

In [None]:
# Row count of each 'class' group.
dataset.groupby(by='class').size()

In [None]:
# Box plot of every numeric column on one wide figure, to eyeball outliers.
dataset.plot.box(figsize=(20, 10))
plt.show()

In [None]:
# One histogram per numeric column, all drawn on a single 15x15 figure.
hist_figsize = (15, 15)
dataset.hist(figsize=hist_figsize)
plt.show()

In [None]:
# Count of missing values in each column.
dataset.isna().sum()

In [None]:
# Column names, non-null counts and dtypes before any imputation is applied.
dataset.info()

In [None]:
# Fill the gaps in every column except the last one with that column's median.
# NOTE(review): assumes the target ('class') is the last column — confirm.
feature_columns = dataset.columns[:-1]
for feature in feature_columns:
    dataset[feature] = dataset[feature].fillna(dataset[feature].median())

In [None]:
# Re-check the non-null counts after the median imputation in the previous cell.
dataset.info()

In [None]:
# Replace values outside the 1.5 * IQR whiskers with the column median,
# for every column except the last one.
# NOTE(review): assumes the target ('class') is the last column — confirm.
for feature in dataset.columns[:-1]:
    lower_quartile = dataset[feature].quantile(0.25)
    upper_quartile = dataset[feature].quantile(0.75)
    whisker = 1.5 * (upper_quartile - lower_quartile)

    lower_bound = lower_quartile - whisker
    upper_bound = upper_quartile + whisker

    # The median on the right-hand side is evaluated before any value is overwritten.
    is_outlier = (dataset[feature] < lower_bound) | (dataset[feature] > upper_bound)
    dataset.loc[is_outlier, feature] = dataset[feature].median()
    

In [None]:
# Re-draw the per-column box plots after the outlier replacement in the previous cell.
dataset.plot(kind='box', figsize=(20,10))
# Render explicitly, as the first box-plot cell does, so the Axes repr is not echoed.
plt.show()

In [None]:
# Pairwise scatter plots of the columns, with a KDE on the diagonal.
sns.pairplot(data=dataset, diag_kind='kde')

In [None]:
# Pairwise correlation of the numeric columns.
# Since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns silently and
# raises on them instead, so the frame is restricted to numeric dtypes first. If
# 'class' holds string labels it is excluded; if every column is numeric the result
# is unchanged.
dataset.select_dtypes(include='number').corr()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every column except the 'class' target.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed in the next cell.
feature_frame = dataset.drop(columns='class')
scaler = StandardScaler().fit(feature_frame)
scaled_df = scaler.transform(feature_frame)

In [None]:
# Features are the scaled attributes; the target is the 'class' column.
X, y = scaled_df, dataset['class']

# Hold out a test set; random_state pins the shuffle so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, random_state=10)

# Shapes of the four partitions, in the order train-X, test-X, train-y, test-y.
tuple(part.shape for part in (X_train, X_test, Y_train, Y_test))

In [None]:
# Baseline: an SVC (default kernel, gamma='auto') trained on the scaled original attributes.
model = SVC(gamma='auto').fit(X_train, Y_train)

# Mean accuracy on the held-out test split.
score_using_actual_attributes = model.score(X_test, Y_test)
print(score_using_actual_attributes)

In [None]:
# Tune C, the kernel and (for the RBF kernel only) gamma with a cross-validated grid search.
model = SVC()

c_values = [0.01, 0.1, 0.5, 1]

# gamma has no effect on the linear kernel, so it is searched only for 'rbf'.
# A single flat grid would fit every linear candidate once per gamma value.
params = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['rbf'], 'C': c_values, 'gamma': ['auto', 'scale']},
]

model1 = GridSearchCV(model, param_grid=params, verbose=5)

model1.fit(X_train, Y_train)

print("Best Hyper Parameters:\n", model1.best_params_)

In [None]:
# 10-fold cross-validation of the SVC on the full scaled feature matrix.
# NOTE(review): C, kernel and gamma are hard-coded here — presumably copied from
# the grid-search output above; confirm they still match model1.best_params_.
model = SVC(kernel="rbf", C=1, gamma='auto')

scores = cross_val_score(estimator=model, X=X, y=y, cv=10)

# Report the average score over the ten folds.
CV_score = scores.mean()
print(CV_score)

Let's create principal components from the attributes.

In [None]:
from sklearn.decomposition import PCA

# Fit PCA with all components kept, to see how the explained variance accumulates.
pca = PCA().fit(scaled_df)

# Cumulative share of variance explained by the first k components.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
plt.plot(cumulative_variance)
print(cumulative_variance)

Let's pick 8 PCs, as the first 8 capture more than 95% of the variance in the data.

In [None]:
# Project the scaled attributes onto the first 8 principal components.
pca = PCA(n_components=8)
X, Y = pca.fit_transform(scaled_df), dataset['class']

# Same random_state as the earlier split on the scaled attributes.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=10)

# Shapes of the four partitions, in the order train-X, test-X, train-y, test-y.
tuple(part.shape for part in (X_train, X_test, Y_train, Y_test))

In [None]:
# Same SVC configuration as the baseline, now trained on the principal components
# instead of the original attributes.
model = SVC(gamma='auto').fit(X_train, Y_train)

# Mean accuracy on the held-out test split.
score_PCs = model.score(X_test, Y_test)
print(score_PCs)

In [None]:
# 10-fold cross-validation of the SVC on the 8 principal components.
model = SVC(C=1, kernel="rbf", gamma='auto')

# Use the target `Y` defined alongside the PCA features in this section, rather than
# the lowercase `y` left over from the earlier split cell. Both hold dataset['class'],
# but `Y` keeps this cell independent of that older state.
scores = cross_val_score(model, X, Y, cv=10)

# Report the average score over the ten folds.
CV_score_pca = scores.mean()
print(CV_score_pca)

# Result

In [None]:
# Side-by-side comparison of the two feature sets: test-split accuracy and mean
# 10-fold cross-validation score. The row label now reads "Principal" (it was
# misspelled "Principle" in the displayed table).
result = pd.DataFrame({'SVC' : ['All scaled attributes', '8 Principal components'],
                      'Accuracy' : [score_using_actual_attributes,score_PCs],
                      'Cross-validation score' : [CV_score,CV_score_pca]})

In [None]:
# Display the comparison table built in the previous cell.
result

What was achieved using 18 attributes has been achieved using just 8 principal components. Yay!