In [None]:
import warnings

import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import zscore

from sklearn import model_selection
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

sns.set(color_codes=True)
%matplotlib inline


In [None]:
# Read the raw vehicle data from the Kaggle-style input folder.
csv_path = "../input/vehicle/vehicle.csv"
vehicle_df = pd.read_csv(csv_path)

# Report the (rows, columns) shape, leave two blank lines, then show the schema.
print('Dataframe Shape : ', vehicle_df.shape, end='\n\n\n')
vehicle_df.info()

In [None]:
# Preview the first ten rows of the raw data.
vehicle_df.head(n=10)

In [None]:
# Null count per column, reduced to the columns that actually contain gaps.
data = vehicle_df.isnull().sum()
df = pd.DataFrame({'columns': vehicle_df.columns, 'missing_count': data.values})
df = df.loc[df['missing_count'] > 0]

# Worst-affected columns first.
print(df.sort_values(by='missing_count', ascending=False))
print()
# `df` already holds only columns with missing values, so it can be used directly.
print('Missing data in ', len(df), ' columns.')
print('Missing data columns : ', df['columns'].values)

## Initial data analysis summary
- There are 846 rows with 19 columns.
- The categorical column 'class' represents the category of vehicles.
- The null values are present in 14 columns listed above.


In [None]:
# d. Five-point summary (plus count/mean/std) of the numeric attributes,
#    one row per attribute, rounded to 2 decimals.
vehicle_df.describe().T.round(2)


In [None]:
# Plotting copy of the data: features only, nulls replaced by the column mean.
# The 'class' label column is dropped *before* the means are computed:
# DataFrame.mean() raises a TypeError on non-numeric columns in pandas >= 2.0
# (older versions silently skipped them), so computing it on the full frame
# breaks on a current install.
vehicle_df2 = vehicle_df.drop(columns="class")
vehicle_df2 = vehicle_df2.fillna(vehicle_df2.mean())

# One histogram + KDE per feature, laid out row by row on a 6x3 grid.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; kept here
# so the figures stay comparable with the ones the summary below refers to.
fig, axes = plt.subplots(nrows=6, ncols=3, figsize=(25, 25))
for i, column in enumerate(vehicle_df2.columns):
    sns.distplot(vehicle_df2[column], ax=axes[i // 3, i % 3])

In [None]:
# Number of rows per vehicle class (most frequent class first).
class_counts = vehicle_df['class'].value_counts()
class_counts

In [None]:
# One horizontal boxplot per feature on a 6x3 grid, whiskers at 1.5 * IQR.
# Uses `vehicle_df2` (mean-imputed features without 'class') from the
# distribution-plot cell above.
fig, axes = plt.subplots(nrows=6, ncols=3, figsize=(25, 25))
for ax, column in zip(axes.flat, vehicle_df2.columns):
    # Pass the series as `x=` explicitly. Older seaborn treated the first
    # positional argument as `x`, newer releases treat it as `data`, so the
    # positional form does not draw the same plot across versions.
    sns.boxplot(x=vehicle_df2[column], ax=ax, dodge=False, whis=1.5)
    

In [None]:
# Upper limits read off the boxplots for the 8 columns where outliers were identified.
# A value strictly above its column's limit is counted as an outlier.
outlier_limits = {
    'radius_ratio': 255,
    'pr.axis_aspect_ratio': 77,
    'max.length_aspect_ratio': 13,
    'scaled_variance': 288,
    'scaled_variance.1': 990,
    'scaled_radius_of_gyration.1': 87,
    'skewness_about': 19,
    'skewness_about.1': 40,
}
max_df = pd.DataFrame([outlier_limits])

# Sum of per-column outlier counts (a row with outliers in two columns counts twice).
total_outliers = sum(
    int((vehicle_df[column] > max_df.loc[0, column]).sum())
    for column in max_df.columns
)

print('Total Outliers ', total_outliers)
print('Total Outliers %', round(total_outliers / len(vehicle_df.index) * 100))


### Statistical Summary
- The data is not distributed equally across the 3 vehicle classes; about 50% of the rows belong to the car class.
- As seen in the distribution plots, most of the columns have a bimodal distribution and some are multimodal.
- The radius_ratio, pr.axis_aspect_ratio, max.length_aspect_ratio & scaled_radius_of_gyration.1 columns have a very long right tail.
- The scales of the columns are very different, so the data needs normalization.
- As per the boxplots, 8 columns have outliers and 5 of them have a large number of outliers.
- The outliers are approx. 6% of the data.

## 1. Data Pre-processing

In [None]:
# Build the cleaned modelling frame from the raw data; `vehicle_df` is left untouched.
vehicle_df_new = vehicle_df.copy()

# Fill nulls with the column mean. numeric_only=True keeps the non-numeric
# 'class' column out of the mean computation, which otherwise raises a
# TypeError on pandas >= 2.0.
vehicle_df_new = vehicle_df_new.fillna(vehicle_df_new.mean(numeric_only=True))

# Remove outliers using the per-column limits in `max_df` (read off the boxplots).
# A row is kept when every limited column is <= its limit: the outlier count
# above treats only values strictly greater than the limit as outliers, so a
# value equal to the limit must not be dropped here.
limits = max_df.iloc[0]
within_limits = (vehicle_df_new[limits.index] <= limits).all(axis=1)

# Reset the index after filtering so that later positional joins line up.
vehicle_df_new = vehicle_df_new.loc[within_limits].reset_index(drop=True)

# Encode the class label as integer category codes (categories are sorted,
# so the codes follow the alphabetical order of the labels).
vehicle_df_new['class'] = pd.Categorical(vehicle_df_new['class']).codes

vehicle_df_new.info()
vehicle_df_new.head()

## 2. Attribute Relationship Analysis

#### - Find groups with correlated columns for feature selection & PCA
#### - Find low correlated columns to ignore from PCA


In [None]:
# Independent variables: every column except the encoded target.
X = vehicle_df_new.drop(columns=['class'])
# Dependent variable, kept as a single-column DataFrame.
y = vehicle_df_new.loc[:, ['class']]

# Pairwise scatter plots of all features, with a density curve (not a histogram) on the diagonal.
sns.pairplot(X, diag_kind='kde')

In [None]:
# Pairwise correlation of all columns (including the encoded class), rounded to 2 decimals.
corr = vehicle_df_new.corr().round(2)
axis_labels = corr.columns

# Large square canvas so that every column label stays readable.
plt.figure(figsize=(20, 20))
sns.heatmap(corr, cmap='RdBu', xticklabels=axis_labels, yticklabels=axis_labels)

#### - Pair plot & Heatmap Summary
- 11 columns are highly correlated with each other and are treated as group 1. PCA will be performed on them for dimensionality reduction.
- 3 other columns are highly correlated with each other and are treated as group 2. PCA will be performed on them for dimensionality reduction.
- In group 2, scaled_radius_of_gyration.1 and hollows_ratio have a negative correlation, whereas skewness_about.2 and hollows_ratio have a positive correlation.
- 4 columns (pr.axis_aspect_ratio, max.length_aspect_ratio, skewness_about, skewness_about.1) show no correlation with any other column and will be passed to the model directly.


In [None]:
# Target column, appended to each feature group so the pair plots can be coloured by class.
c = ['class']

# High correlation columns (Group 1)
cols_hc1 = [
    'compactness',
    'circularity',
    'distance_circularity',
    'radius_ratio',
    'scatter_ratio',
    'elongatedness',
    'pr.axis_rectangularity',
    'max.length_rectangularity',
    'scaled_variance',
    'scaled_variance.1',
    'scaled_radius_of_gyration',
]
sns.pairplot(vehicle_df_new[cols_hc1 + c], hue='class', diag_kind='kde')

In [None]:
# Highly correlated columns (Group 2)
cols_hc2 = [
    'hollows_ratio',
    'scaled_radius_of_gyration.1',
    'skewness_about.2',
]

# Pair plot of the group, coloured by class (`c` holds the target column name).
sns.pairplot(vehicle_df_new[cols_hc2 + c], hue='class', diag_kind='kde')

In [None]:
# Low correlation columns
cols_lc = [
    'pr.axis_aspect_ratio',
    'max.length_aspect_ratio',
    'skewness_about',
    'skewness_about.1',
]

# Pair plot of the group, coloured by class (`c` holds the target column name).
sns.pairplot(vehicle_df_new[cols_lc + c], hue='class', diag_kind='kde')

### Attribute Relationship Summary
- Following columns are highly correlated hence PCA will be done
    compactness,circularity, distance_circularity, radius_ratio,scatter_ratio, elongatedness, pr.axis_rectangularity, max.length_rectangularity, scaled_variance, scaled_variance.1, scaled_radius_of_gyration

- The following 3 columns are highly correlated with each other but not with the other columns, so we can either keep 1 column and drop the other 2, or run PCA on them separately and merge the resulting PCA features into the final list.
    hollows_ratio, scaled_radius_of_gyration.1, skewness_about.2

- The following columns have very low correlation with the other columns, so they are excluded from PCA. They will be merged with the PCA columns for model building.
    pr.axis_aspect_ratio, max.length_aspect_ratio, skewness_about, skewness_about.1



## 3. PCA - Dimensionality Reduction

In [None]:
# Standardise every feature column to zero mean and unit variance.
# `zscore` is imported in the notebook's top import cell instead of mid-notebook,
# so all dependencies are visible in one place.
# NOTE(review): the scaling statistics are computed on the full dataset, before
# the train/test split performed later - the test rows influence the scaling.
XScaled = X.apply(zscore)
XScaled.head()

In [None]:
# Apply PCA to Group 1 (the highly correlated columns), keeping every component
# so the full eigenvalue spectrum can be inspected in the next cells.
X1 = XScaled.loc[:, cols_hc1]
n_group1 = len(cols_hc1)
pca1 = PCA(whiten=False, n_components=n_group1)
pca1.fit(X1)

In [None]:
# Report the eigenvalues of the Group 1 decomposition.
print('Original number of features:', len(cols_hc1))
print()
print('Eigen Values', pca1.explained_variance_)
print()

# Explained-variance ratios, first rounded to 3 decimals and then expressed as
# percentages (the final rounding only removes floating-point noise).
ratios_3dp = np.array([float(f'{ratio:.3f}') for ratio in pca1.explained_variance_ratio_])
percent_variance = np.round(ratios_3dp * 100, decimals=2)
print('Percentage  ', percent_variance)

In [None]:
# Left panel: variance explained by each Group 1 component.
# Right panel: cumulative variance explained.
component_positions = list(range(len(cols_hc1)))
variance_ratio = pca1.explained_variance_ratio_

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

ax1.bar(component_positions, variance_ratio, align='center')
ax1.set(xlabel='Eigen Value', ylabel='Variation explained')

ax2.step(component_positions, np.cumsum(variance_ratio), where='mid')
ax2.set(xlabel='Eigen Value', ylabel='Cumulative of variation explained')

plt.show()

In [None]:
# Author's observation from the spectrum above: 4 components explain over 95% of
# the variation in the Group 1 columns, and 5 components explain more than 98%.
# Keep just enough whitened components to reach 95% explained variance.
pca1_95 = PCA(whiten=True, n_components=0.95)
X_pca1_95 = pca1_95.fit_transform(X1)

n_reduced = X_pca1_95.shape[1]
print('Original number of features:', len(cols_hc1))
print('Reduced number of features:', n_reduced)
print(X_pca1_95.shape)

# Pair plot of the retained components.
sns.pairplot(pd.DataFrame(X_pca1_95))

In [None]:
# Apply PCA to Group 2 (the second set of highly correlated columns).
X2 = XScaled.loc[:, cols_hc2]

# Step 1: full, un-whitened decomposition, used only to inspect the spectrum.
pca2 = PCA(whiten=False, n_components=len(cols_hc2))
pca2.fit(X2)
print('Eigen Values', pca2.explained_variance_)
print('Percentage  ', np.round(100 * pca2.explained_variance_ratio_, 2))

# Step 2: refit (rebinding `pca2`) with whitening, keeping enough components for
# 95% explained variance. Author's observation: 2 components are sufficient.
pca2 = PCA(whiten=True, n_components=0.95)
X_pca2 = pca2.fit_transform(X2)

n_reduced_group2 = X_pca2.shape[1]
print('Original number of features:', len(cols_hc2))
print('Reduced number of features:', n_reduced_group2)
print(X_pca2.shape)

# Pair plot of the retained components.
sns.pairplot(pd.DataFrame(X_pca2))

In [None]:
# Group 1: 11 correlated columns reduced by PCA (95% explained variance).
# The components get explicit string names and the index of XScaled. Without
# names both PCA frames use integer labels 0, 1, ..., and merging them produces
# a mix of suffixed strings ('0_x', '0_y') and plain integers (2, 3) - recent
# scikit-learn versions reject frames whose column names are not all strings.
x_pca1_95_df = pd.DataFrame(
    X_pca1_95,
    index=XScaled.index,
    columns=['pca1_{}'.format(k) for k in range(X_pca1_95.shape[1])],
)

# Group 2: 3 correlated columns reduced by PCA (95% explained variance).
x_pca2_df = pd.DataFrame(
    X_pca2,
    index=XScaled.index,
    columns=['pca2_{}'.format(k) for k in range(X_pca2.shape[1])],
)

# Combine the three feature sets side by side (all share the index of XScaled):
#   a) Group 1 PCA components, b) Group 2 PCA components, c) low-correlation columns.
X_new = pd.concat([x_pca1_95_df, x_pca2_df, XScaled[cols_lc]], axis=1)

print('Final Shape', X_new.shape)
X_new.head(10)

## 4. SVM Classifier

#### SVM model with original features

In [None]:
## Split the train and test data into 70:30 ratio.
## np.ravel turns the single-column target frame into the 1-D label array that
## scikit-learn estimators expect (a column vector triggers a DataConversionWarning).
X_train, X_test, y_train, y_test = train_test_split(
    XScaled, np.ravel(y), test_size=0.3, random_state=1
)

## Build the SVM model (default hyper-parameters) on the training data.
svc_org = SVC()
svc_org.fit(X_train, y_train)
prediction = svc_org.predict(X_test)

print(XScaled.shape)
print("Train Data Score", round(svc_org.score(X_train, y_train), 3))
print("Test Data Score ", round(svc_org.score(X_test, y_test), 3))

## confusion_matrix takes (y_true, y_pred): rows are the actual classes and
## columns the predicted ones. The arguments were previously swapped, which
## transposed the matrix relative to the classification report below.
print("Confusion Matrix:\n   bus car van\n", metrics.confusion_matrix(y_test, prediction))
target_names = ['bus', 'car', 'van']
print(metrics.classification_report(y_test, prediction, target_names=target_names))

#### SVM model with features selected using PCA and visual analysis

In [None]:
## Split the train and test data into 70:30 ratio.
## np.ravel turns the single-column target frame into the 1-D label array that
## scikit-learn estimators expect (a column vector triggers a DataConversionWarning).
X_train, X_test, y_train, y_test = train_test_split(
    X_new, np.ravel(y), test_size=0.3, random_state=1
)

## Build the SVM model (default hyper-parameters) on the PCA-reduced training data.
svc_pca = SVC()
svc_pca.fit(X_train, y_train)
prediction = svc_pca.predict(X_test)

print(X_new.shape)
print(svc_pca)
print("Train Data Score", round(svc_pca.score(X_train, y_train), 3))
print("Test Data Score", round(svc_pca.score(X_test, y_test), 3))

## confusion_matrix takes (y_true, y_pred): rows are the actual classes and
## columns the predicted ones. The arguments were previously swapped, which
## transposed the matrix relative to the classification report below.
print("Confusion Matrix:\n   bus car van\n", metrics.confusion_matrix(y_test, prediction))
target_names = ['bus', 'car', 'van']
## The report is printed once; the former trailing bare expression only echoed
## the same text again as an unformatted string.
print(metrics.classification_report(y_test, prediction, target_names=target_names))

### SVM Summary

Common
- There is a slight increase in test data accuracy from 97.1% to 97.5% when PCA variables are used.
- The precision & recall performance of bus is decreased slightly. However not much change in case of car & bus.

As per confusion matrix comparison
- The accuracy of the model with PCA variables is slightly different from that with the original columns.
- The correct prediction of bus reduced from 66 to 64 and no change in incorrect prediction for both car & van.
- The correct prediction of car remains same but incorrect prediction increased for bus and reduced for van.
- The correct prediction of van increased from 46 to 49 and incorrect prediction is unchanged.

As per classification metrics 
- The precision for bus is reduced by 1% but for car & van remains same. 
- The recall for van is increased by 6% but for bus it is reduced by 3%, with no change for car.


## 5. Hyper Parameters Tuning

In [None]:
# Silence library warnings for this and all following cells.
# (`warnings` is imported in the notebook's top import cell.)
warnings.filterwarnings("ignore")

# Parameter grid: the same C candidates are tried with a linear and an RBF kernel.
c_values = [0.01, 0.05, 0.5, 1.0, 10, 25, 50]
param_grid = [{'kernel': ['linear'], 'C': c_values},
              {'kernel': ['rbf'], 'C': c_values}]

# Make grid search classifier
clf_grid = GridSearchCV(SVC(), param_grid, verbose=1)

# Train the classifier.
# Uses X_train / y_train from the most recent split (the PCA-feature split when
# the notebook is run top to bottom). np.ravel guarantees a 1-D label array
# whether y_train is a single-column frame or already flat.
clf_grid.fit(X_train, np.ravel(y_train))

print("Best Parameters:\n", clf_grid.best_params_)
print("Best Estimators:\n", clf_grid.best_estimator_)

#### Using Best SVM Parameters

In [None]:
## Split the train and test data into 70:30 ratio.
## np.ravel turns the single-column target frame into the 1-D label array that
## scikit-learn estimators expect (a column vector triggers a DataConversionWarning).
X_train, X_test, y_train, y_test = train_test_split(
    X_new, np.ravel(y), test_size=0.3, random_state=1
)

## Build the SVM model on the training data with the parameters selected by the
## grid search above. They are read from `clf_grid.best_params_` rather than
## hard-coded (previously C=10, kernel="rbf"), so this cell always reflects what
## the search actually found when the notebook is re-run.
svc_pca_tun = SVC(**clf_grid.best_params_)
svc_pca_tun.fit(X_train, y_train)
prediction = svc_pca_tun.predict(X_test)

print(X_new.shape)
print("Train Data Score", round(svc_pca_tun.score(X_train, y_train), 3))
print("Test Data Score", round(svc_pca_tun.score(X_test, y_test), 3))

## confusion_matrix takes (y_true, y_pred): rows are the actual classes and
## columns the predicted ones. The arguments were previously swapped, which
## transposed the matrix relative to the classification report below.
print("Confusion Matrix:\n   bus car van\n", metrics.confusion_matrix(y_test, prediction))
target_names = ['bus', 'car', 'van']
print(metrics.classification_report(y_test, prediction, target_names=target_names))


#### SVM Parameter Tuning Summary

- The test data accuracy increased further by 0.4%.
- Accuracy of car is improved by 1 in correct prediction and by 1 in incorrect prediction of bus.


#### K-FOLD Cross validation

In [None]:
# 10-fold cross-validated accuracy of the three SVM variants.
# NOTE(review): scaling and PCA were fitted on the full dataset in earlier cells,
# so each fold's held-out rows have already influenced the features.

# 1-D label array: a single-column target frame makes scikit-learn emit a
# DataConversionWarning on every fold.
y_flat = np.ravel(y)

kfold_runs = (
    ("Accuracy with SVM on original data: %0.2f (+/- %0.2f)", svc_org, XScaled),
    ("Accuracy with SVM on PCA data: %0.2f (+/- %0.2f)", svc_pca, X_new),
    ("Accuracy with SVM with tuned params on PCA data: %0.2f (+/- %0.2f)", svc_pca_tun, X_new),
)
for message, estimator, features in kfold_runs:
    # Per-fold accuracy scores; reported as mean +/- two standard deviations.
    pred_kfold = cross_val_score(estimator, features, y_flat, cv=10)
    print(message % (pred_kfold.mean(), pred_kfold.std() * 2))
