In [None]:
import pandas as pd
import numpy as pi
import seaborn as sn

import matplotlib.pyplot as plt
%matplotlib inline


from sklearn import metrics
from sklearn.model_selection import train_test_split
from scipy.stats import zscore

In [None]:
# Setting the display layout for the coding environment:
# widen the notebook container and set the default seaborn font scale.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.

from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

sn.set(font_scale=1.0)

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
print('Listing files in the folder')
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

## Data Set Analysis for the Vehicle Data
- Loading the data set and checking out the dimensions
- Analysing various elements of the data

In [None]:
# Load the vehicle data set and report its basic dimensions.
vehicle_data = pd.read_csv("/kaggle/input/vehicle2/vehicle-2.csv")

row_count, column_count = vehicle_data.shape

print("\nDimensions of the data")

print("Shape of the data :{0}".format(vehicle_data.shape))
print("Size of the data :{0}".format(vehicle_data.size))
print("nDim of the data :{0}".format(vehicle_data.ndim))
print("Shape x * y :{0}".format(row_count * column_count))

print("\nData types of the Data")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the report.
vehicle_data.info()

print("\nData elements from file")
# Transposed so that the many attributes are listed as rows.
vehicle_data.head(20).T

## Checking for the missing or the null values

- The numeric columns do not contain invalid characters; the only missing values are NaN

In [None]:
# Count the missing (NaN) entries in every column of the raw data

null_counts_per_column = vehicle_data.isna().sum()

print("\nList of Null values in each column")
print(null_counts_per_column)

In [None]:
# Impute missing values: every feature column gets its NaNs replaced by that
# column's median. The label column 'class' is set aside during imputation and
# re-attached afterwards under the name 'type'.

vehicle_modified_data = (
    vehicle_data
    .drop(columns=['class'])
    .apply(lambda feature: feature.fillna(feature.median()))
    .assign(type=vehicle_data['class'])
)

print("\nVerify all the null values are replaced")
print(vehicle_modified_data.isna().sum())

### Summary of the Dataset Analysis

- Large number of attributes in the dataset, total 19 attributes
- All the attributes are linear in nature
- Data types for all attributes look fine
- All the attributes are relevant, no need to drop any of them
- **'type'** (copied from the `class` column of the raw data) is the target variable; it is analysed in detail later

#### Complexities in the data set

- Values of the attributes are in different units, hence they need to be brought to a common scale by a preprocessor
- Some of the attributes seem to be related or dependent; identifying those columns might pose a challenge


In [None]:
# define functions to identify the number of outliers

def Identify_Outliers(data_column):
    """Count the IQR-rule outliers in each column of ``data_column``.

    The input is wrapped in a ``pandas.DataFrame``. A value counts as an
    outlier when it lies more than 1.5 * IQR below the first quartile or
    more than 1.5 * IQR above the third quartile of its column.

    Parameters
    ----------
    data_column : anything accepted by ``pandas.DataFrame`` (e.g. a Series)

    Returns
    -------
    pandas.Series
        Number of outliers, indexed by column name.
    """
    frame = pd.DataFrame(data_column)

    lower_quartile = frame.quantile(0.25)
    upper_quartile = frame.quantile(0.75)
    spread = upper_quartile - lower_quartile

    below_fence = frame < (lower_quartile - 1.5 * spread)
    above_fence = frame > (upper_quartile + 1.5 * spread)

    return (below_fence | above_fence).sum()
    

print("\n Number of outliers in each attribute \n")
# DataFrame.iteritems() was removed in pandas 2.0, so iterate over the column
# names directly. Only numeric columns are checked: quantiles are not defined
# for the string-valued 'type' column.
for columnName in vehicle_modified_data.select_dtypes(include='number').columns:
    print(Identify_Outliers(vehicle_modified_data[columnName]))

data_columns = vehicle_modified_data.columns

In [None]:
# Box plots for the four attributes singled out for their outliers

fig, axis = plt.subplots(2, 2, figsize=(25, 12), sharex=False)

# (column to plot, box colour, target subplot) for each panel
outlier_panels = [
    ("pr.axis_aspect_ratio", 'green', axis[0, 0]),
    ("max.length_aspect_ratio", 'green', axis[0, 1]),
    ("scaled_radius_of_gyration.1", 'orange', axis[1, 0]),
    ("skewness_about", 'orange', axis[1, 1]),
]

for column_name, box_color, panel in outlier_panels:
    panel.set_title(column_name)
    sn.boxplot(vehicle_modified_data[column_name], color=box_color, orient='h', ax=panel)

plt.show()

### Summary of the Outliers Analysis

- The outliers in this dataset are few compared to the number of records
- SVM is sensitive to outliers, but with this small number of outliers they should not have much impact on the analysis
- The outliers are not corrected: these attributes have few unique values, so modifying the outliers would impact the data quality

In [None]:
#col_median =vehicle_modified_data['max.length_aspect_ratio'].median()
#vehicle_modified_data['max.length_aspect_ratio']=pi.where(vehicle_modified_data['max.length_aspect_ratio']>13 ,13,vehicle_modified_data['max.length_aspect_ratio'])

In [None]:
# Descriptive statistics of the data, one row per attribute
print('\nFive point summary for the attributes')
vehicle_modified_data.describe().transpose()

In [None]:
print('Shape of Data: ', vehicle_modified_data.shape)

print('\nMedian for the data')
# numeric_only=True skips the string-valued 'type' column; without it
# pandas >= 2.0 raises a TypeError instead of silently dropping that column.
print(vehicle_modified_data.median(numeric_only=True))

print('\nMode for the data')
print(vehicle_modified_data.mode())

In [None]:
vehicle_columns = vehicle_modified_data.columns

print('\nSkewing of the data\n')

# mean > median is used as a quick indicator of right skew;
# the 'type' label column is not numeric and is skipped.
for col_name in vehicle_columns:
    if col_name != 'type':
        attribute = vehicle_modified_data[col_name]
        print('{0} Parameter is Right Skewed:   {1}'.format(col_name, attribute.mean() > attribute.median()))

# The two attributes reported separately as left skewed (mean < median).
# The first line is preceded by a blank line.
left_skew_template = '{0} Parameter is Left Skewed: {1}'
for line_prefix, left_col in (('\n', 'elongatedness'), ('', 'hollows_ratio')):
    attribute = vehicle_modified_data[left_col]
    print(line_prefix + left_skew_template.format(left_col, attribute.mean() < attribute.median()))

### Summary of the Five point analysis

- The data seemed to be skewed mostly on the right, the visualisation will give clue on the distribution
- Only the elongatedness and hollows_ratio is skewed to the left, it could be because of the outliers

In [None]:
def Display_BoxPlot(col,axis_rad,color):
    """Draw the distribution of one ``vehicle_modified_data`` column.

    Despite its name, this helper draws a seaborn ``distplot`` (not a box
    plot) of the column onto the given subplot.

    col      -- name of the column to plot; also used as the subplot title
    axis_rad -- matplotlib Axes to draw on
    color    -- colour passed through to seaborn
    """
    column_values = vehicle_modified_data[col]
    axis_rad.set_title(col)
    sn.distplot(column_values, color=color, ax=axis_rad)

sn.set(font_scale=1.5)

fig, axis = plt.subplots(4, 4, figsize=(30, 35), sharex=False)

# One colour per grid row; columns 0-15 fill the 4x4 grid row by row.
row_colors = ['green', 'orange', 'red', 'blue']

for position, panel in enumerate(axis.flat):
    Display_BoxPlot(vehicle_columns[position], panel, row_colors[position // 4])

plt.show()

### Summary of the Univariate analysis

- The distribution looks good for compactness, radius_ratio, skewness_about.1 and scaled_radius_of_gyration
- scatter_ratio and pr.axis_rectangularity show a double-peaked, non-normal distribution

In [None]:
fig, axis = plt.subplots(1, 2, figsize=(20, 7), sharex=False)

# Distributions of the columns at positions 16 and 17, side by side
for column_position, panel in zip((16, 17), axis):
    Display_BoxPlot(vehicle_columns[column_position], panel, 'green')

plt.show()

In [None]:
# Number of distinct values per attribute
distinct_value_counts = vehicle_modified_data.nunique()

print("\nNumber of Unique values in each attribute")
print(distinct_value_counts)

In [None]:
# The two attributes treated as categorical in this cell
categorical_columns = ('type', 'pr.axis_rectangularity')

print("\nTotal values in categorical variables" )
for category_column in categorical_columns:
    print(vehicle_modified_data[category_column].value_counts())

# One horizontal count plot per categorical attribute, each in its own figure
for category_column in categorical_columns:
    fig = plt.subplots(figsize=(8, 5), sharex=False)
    chart = sn.countplot(y=category_column, data=vehicle_modified_data)
    plt.show()



In [None]:
# Identify the Correlation between the variables

fig, axis = plt.subplots(2, 1, figsize=(30, 30), sharex=False)

sn.set(font_scale=1.2)

# Correlate the numeric columns only: the string-valued 'type' column makes
# DataFrame.corr() fail on pandas >= 2.0. The matrix and its upper-triangle
# mask are computed once and reused for both heat maps.
corr_matrix = vehicle_modified_data.select_dtypes(include='number').corr()
upper_triangle_mask = pi.triu(corr_matrix)

# Top panel: the correlation coefficients themselves
sn.heatmap(corr_matrix, mask=upper_triangle_mask,
           annot_kws={"size": 14}, annot=True,fmt='.3f',ax=axis[0],cmap='BrBG');

# Bottom panel: which pairs exceed the correlation threshold (annotated 1/0)
corr_thresold=0.4
vehicle_corr_threshold=corr_matrix>corr_thresold

sn.heatmap(vehicle_corr_threshold,mask=upper_triangle_mask, annot_kws={"size": 16},annot=True,fmt='d',ax=axis[1]);

plt.show()

In [None]:
sn.set(font_scale=1.5)


#not_null_columns = ['circularity','distance_circularity','radius_ratio','pr.axis_aspect_ratio','scatter_ratio',
#                   'elongatedness','pr.axis_rectangularity','scaled_variance','scaled_variance.1','scaled_radius_of_gyration',
#                    'scaled_radius_of_gyration.1','skewness_about','skewness_about.1','skewness_about.2']

# Attributes shown against each other in this pair plot
plot_vars = [
    'compactness',
    'circularity',
    'distance_circularity',
    'radius_ratio',
    'scatter_ratio',
    'pr.axis_rectangularity',
    'max.length_rectangularity',
    'scaled_variance',
    'scaled_variance.1',
    'scaled_radius_of_gyration',
]

# Scatter plots coloured by vehicle type, KDE curves on the diagonal
graph = sn.pairplot(
    vehicle_modified_data,
    x_vars=plot_vars,
    y_vars=plot_vars,
    hue='type',
    kind='scatter',
    diag_kind='kde',
)

graph.fig.set_size_inches(35, 35)

## Summary of analysis for the correlated  variables
- scaled_variance is strongly related to scatter_ratio, showing a clear positive dependency
- radius_ratio is tightly clustered together except for a few outliers; potentially not to be included in the feature selection

In [None]:
# Second group of attributes, shown against each other in a pair plot
plot_vars = [
    'pr.axis_aspect_ratio',
    'max.length_aspect_ratio',
    'elongatedness',
    'scaled_radius_of_gyration.1',
    'skewness_about',
    'skewness_about.1',
    'skewness_about.2',
    'hollows_ratio',
]

# Scatter plots coloured by vehicle type, KDE curves on the diagonal
graph = sn.pairplot(
    vehicle_modified_data,
    x_vars=plot_vars,
    y_vars=plot_vars,
    hue='type',
    kind='scatter',
    diag_kind='kde',
)

graph.fig.set_size_inches(35, 35)

## Summary of the less correlated variables

- elongatedness and the max_length_aspect_ratio seems to have the similarities of higher the corresponding values lower the values
- The scaled_radius_of_gyration.1 is showing negative correlation for the hollows_ratio

In [None]:
fig, axis = plt.subplots(2, 3, figsize=(20, 12), sharex=False)

# (seaborn plot function, attribute on the y axis, target subplot) per panel;
# every panel splits and colours the attribute by vehicle type.
class_panels = [
    (sn.swarmplot, 'scatter_ratio', axis[0, 0]),
    (sn.swarmplot, 'max.length_rectangularity', axis[0, 1]),
    (sn.swarmplot, 'scaled_radius_of_gyration', axis[0, 2]),
    (sn.violinplot, 'distance_circularity', axis[1, 0]),
    (sn.swarmplot, 'radius_ratio', axis[1, 1]),
    (sn.violinplot, 'circularity', axis[1, 2]),
]

for draw_plot, attribute, panel in class_panels:
    draw_plot(data=vehicle_modified_data, y=attribute, x='type', hue='type', ax=panel)

plt.show()

## Preparing the data for the analysis

- Making the values to the common units
- Not removing any of the attributes as of now for the model evaluation

In [None]:
# Separate the features from the 'type' label
axis_y = vehicle_modified_data['type']
axis_x = vehicle_modified_data.drop(columns=['type'])

# Apply scipy's zscore to every feature column independently
axis_x_scaled = axis_x.apply(zscore)

print(f"\nScales Axis Shape: {axis_x_scaled.shape}")
axis_x_scaled.head(10).transpose()

## Cluster Group for identifying categories
- KMeans clustering for identifying different categories in the data, just a sampling exercise

In [None]:
axis_x_copy=axis_x_scaled.copy()

from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans 

# Elbow method: for each candidate k, record the mean distance from every
# sample to its nearest cluster centre.
clusters=range(1,19)
meanDistortions=[]

for k in clusters:
    # random_state makes the curve reproducible across runs; n_init is set
    # explicitly because its default changed between scikit-learn versions.
    model=KMeans(n_clusters=k, n_init=10, random_state=42)
    model.fit(axis_x_copy)
    nearest_centre_distance = cdist(axis_x_copy, model.cluster_centers_, 'euclidean').min(axis=1)
    meanDistortions.append(nearest_centre_distance.mean())


plt.plot(clusters, meanDistortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Average distortion')
plt.title('Selecting k with the Elbow Method')

In [None]:
# Let us first start with K = 5
# Fit on the feature columns only. A 'Group' column left over from a previous
# run of this cell is dropped first, so re-executing the cell does not feed
# the old cluster labels back in as a feature.
cluster_features = axis_x_copy.drop(columns=['Group'], errors='ignore')

# random_state and n_init are set explicitly so the grouping is reproducible.
kmeans_model=KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans_model.fit(cluster_features)
axis_x_predicted=kmeans_model.predict(cluster_features)

# Attach the cluster label and show the per-cluster mean of every feature
axis_x_copy['Group'] = axis_x_predicted
axis_x_copy.groupby(['Group']).mean()

## Identifying the features for the model through the coeff
- the pairplot visualisation helped in analysing the features
- In order to identify the feature importance, using the Coeff function and filtering features based on the values

In [None]:
from sklearn import svm 

# Split the scaled features 70/30 into training and test sets
x_train_fearure, x_test_feature, y_train_feature, y_test_feature = train_test_split(
    axis_x_scaled, axis_y, test_size=0.3, random_state=80
)

# Linear-kernel SVM, fitted only to inspect its coefficients
svm_model = svm.SVC(kernel='linear')
svm_model.fit(x_train_fearure, y_train_feature)

# Pair the first row of coefficients (column 0) with the feature names (column 1).
# NOTE(review): only coef_[0] is inspected; with more than two classes this is
# presumably just one of several coefficient rows - confirm this is intended.
coefficient_values = pd.Series(svm_model.coef_[0])
feature_names = pd.Series(x_train_fearure.columns)
ceof_data = pd.concat([coefficient_values, feature_names], axis=1)

print("\n Feature coeff\n")
print(ceof_data.sort_values(by=[0]))

### Summary of the Feature selection

- Removing the features whose coefficient is less than -1.0; this method helps to remove some of the unrelated features

## Splitting the data to training and testing

- Normal approach to split it into 70:30, the same will be following

In [None]:
# Drop the attributes that were not identified as independent
dropped_features = ['radius_ratio', 'hollows_ratio', 'elongatedness']
axis_x_drop_scaled = axis_x_scaled.drop(columns=dropped_features)

# Split the reduced feature set 70/30 into training and test sets
x_train_scale, x_test_scale, y_train_scale, y_test_scale = train_test_split(
    axis_x_drop_scaled, axis_y, test_size=0.3, random_state=120
)

print(f"\nShape of the final Data: {axis_x_drop_scaled.shape}\n")
axis_x_drop_scaled.head()

In [None]:
print('\nSummary of the training and test data \n')

# Number of rows per vehicle type in the full data set
type_column = vehicle_modified_data['type']
veh_car = len(type_column[type_column == 'car'])
veh_van = len(type_column[type_column == 'van'])
veh_bus = len(type_column[type_column == 'bus'])

print (f"{len(x_train_scale)/len(axis_x_drop_scaled)*100} % data in the Training")
print (f"{len(x_test_scale)/len(axis_x_drop_scaled)*100} % data in the Testing")

print("\nPercent of the Vehicle types in Total")
for display_name, type_count in (('Car', veh_car), ('Van', veh_van), ('Bus', veh_bus)):
    print (f"{display_name}: {type_count} in total {len(axis_y)} {type_count/len(axis_y)*100}%")

# Same breakdown for the training labels and then for the test labels
label_sets = (
    ("\nPercent of the Vehicle types in Training data", y_train_scale),
    ("\nPercent of the Vehicle types in Test data", y_test_scale),
)
for heading, labels in label_sets:
    print(heading)
    for display_name, type_value in (('Car', 'car'), ('Van', 'van'), ('Bus', 'bus')):
        type_count = len(labels[labels == type_value])
        print (f"{display_name}: {type_count} in total {len(labels)} {type_count/len(labels)*100}%")

## Support vector machine

In [None]:
from sklearn import svm

# Train an SVC (gamma=0.2, C=0.8) on the training split of the reduced features
svm_model = svm.SVC(C=0.8, gamma=.2)
svm_model.fit(x_train_scale, y_train_scale)

# Predict both splits so that training and test accuracy can be compared
svm_predict_train = svm_model.predict(x_train_scale)
svm_predict_test = svm_model.predict(x_test_scale)

svm_train_accuracy_normal = metrics.accuracy_score(y_train_scale, svm_predict_train)
svm_test_accuracy_normal = metrics.accuracy_score(y_test_scale, svm_predict_test)

print("\nAccuracy of the SVM Model\n")
print(f"Train Accuracy: {svm_train_accuracy_normal}")
print(f"Test  Accuracy: {svm_test_accuracy_normal}")


In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# 12-fold shuffled cross-validation of the SVM configuration.
# The folds are drawn from axis_x_drop_scaled, the reduced feature set the
# train/test accuracies of this model were measured on, so the k-fold score is
# comparable with them. The full axis_x_scaled was used here before, which
# scored a different feature set from the one the model was trained on.
kfoldcross = KFold(n_splits=12, random_state=130,shuffle=True)
score = cross_val_score(svm_model, axis_x_drop_scaled, axis_y, cv=kfoldcross)

kfold_score_normal=score.mean()
print("Kfold Crossvalidation score")
print(f"Accuracy: {kfold_score_normal}")

## PCA - Analysis for the attributes
- Preparing the PCA with all the attributes
- Based on the variance in the data choosing the right principal components for the analysis
- Using the original data and scaling with the standard scaler for the PCA analysis

In [None]:
from sklearn import preprocessing

# Standardise the unscaled features with StandardScaler and wrap the resulting
# array in a DataFrame that carries the original column names.
column_names = axis_x.columns

scaler_model = preprocessing.StandardScaler()
scaled_data = scaler_model.fit_transform(axis_x)

axis_x_standardscaled = pd.DataFrame(data=scaled_data, columns=column_names)
axis_x_standardscaled.head()

## Building the PCA for analysis

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA that keeps every component, to see how the variance is distributed
components_selected=18
pca = PCA(n_components=components_selected)
pca.fit(axis_x_standardscaled)

print(f"\n The explained variance based on the {components_selected} components\n")
print(pca.explained_variance_ratio_)


# Cumulative share of variance explained by the first 1, 2, ... components
cumulative_ratio = pi.cumsum(pca.explained_variance_ratio_)
cumm_var_ratio = pd.Series(cumulative_ratio)
print(f"\nThe cummulative variance ratio base on the {components_selected} components\n")
print(cumm_var_ratio)


# x positions 1..components_selected are derived from the setting above rather
# than hard-coded, so the plots stay valid if the number of components changes.
component_numbers = list(range(1, components_selected + 1))

plt.bar(component_numbers,pca.explained_variance_ratio_,alpha=1, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
plt.show()

plt.step(component_numbers,cumulative_ratio, where='mid')
plt.ylabel('Cum of variation explained')
plt.xlabel('eigen Value')
plt.show()

### Summary of the PCA analysis (Dimensionality Reduction)

- Evaluation based on the 95% cum variance explained, the eigen value is 7.
- The 7 dimensions is very reasonable to explain the 95% over the original data

In [None]:
# Refit the PCA keeping only the first 7 components
components_selected = 7
pca = PCA(n_components=components_selected)
pca.fit(axis_x_standardscaled)

variance_ratio = pca.explained_variance_ratio_

print(f"\n The explained variance based on the {components_selected} components\n")
print(variance_ratio)


cumm_var_ratio = pd.Series(pi.cumsum(variance_ratio))
print(f"\nThe cummulative variance ratio base on the {components_selected} components\n")
print(cumm_var_ratio)


# Project the standardised features onto the retained components
projected_features = pca.transform(axis_x_standardscaled)
axis_x_transformed = pd.DataFrame(projected_features)
axis_x_transformed.head()

In [None]:
# Pairwise scatter plots of the PCA-transformed columns, KDE on the diagonal
graph = sn.pairplot(
    axis_x_transformed,
    kind='scatter',
    diag_kind='kde',
)

graph.fig.set_size_inches(35, 35)

### Analysis of the Pair plot
- The PCA helped to identify 7 components (columns) for the model training
- Columns 0, 1, 3, 4 and 5 have almost no correlation with the other columns, which makes them a good selection for the model training
- The column, although it seems to have less correlation, has an x value close to 0 for all values of y; the few values that deviate are possibly some kind of outliers
- Column 0 shows a double peak, possibly 2 clusters of the data overlapping; cluster analysis would reveal more. Not part of the scope now

## Model Tuning
- Based on the data derived by the PCA, train the models again and check the accuracy
- The scaled data we have used for the transform needs to be split

In [None]:
# Split the PCA-transformed data 70/30 into training and test sets

x_train_trans, x_test_trans, y_train_trans, y_test_trans = train_test_split(
    axis_x_transformed, axis_y, test_size=0.3, random_state=160
)

print('\nSummary of the training and test data \n')

# Number of rows per vehicle type in the full data set (not printed in this cell)
type_column = vehicle_modified_data['type']
veh_car = len(type_column[type_column == 'car'])
veh_van = len(type_column[type_column == 'van'])
veh_bus = len(type_column[type_column == 'bus'])

print (f"{len(x_train_trans)/len(axis_x_transformed)*100} % data in the Training")
print (f"{len(x_test_trans)/len(axis_x_transformed)*100} % data in the Testing")

# Count and share of each vehicle type, for the training and then the test labels
label_sets = (
    ("\nPercent of the Vehicle types in Training data", y_train_trans),
    ("\nPercent of the Vehicle types in Test data", y_test_trans),
)
for heading, labels in label_sets:
    print(heading)
    for display_name, type_value in (('Car', 'car'), ('Van', 'van'), ('Bus', 'bus')):
        type_count = len(labels[labels == type_value])
        print (f"{display_name}: {type_count} in total {len(labels)} {type_count/len(labels)*100}%")


In [None]:
from sklearn import svm

# Train an SVC (gamma=0.2, C=1) on the training split of the PCA-transformed data
svm_model = svm.SVC(C=1, gamma=.2)
svm_model.fit(x_train_trans, y_train_trans)

# Predict both splits so that training and test accuracy can be compared
svm_predict_train_pca = svm_model.predict(x_train_trans)
svm_predict_test_pca = svm_model.predict(x_test_trans)

svm_train_accuracy_pca = metrics.accuracy_score(y_train_trans, svm_predict_train_pca)
svm_test_accuracy_pca = metrics.accuracy_score(y_test_trans, svm_predict_test_pca)

print("\nAccuracy of the SVM Model\n")
print(f"Train Accuracy: {svm_train_accuracy_pca}")
print(f"Test  Accuracy: {svm_test_accuracy_pca}")

In [None]:

# 12-fold shuffled cross-validation of the SVM on the PCA-transformed data
kfoldcross = KFold(n_splits=12, shuffle=True, random_state=190)
score = cross_val_score(svm_model, axis_x_transformed, axis_y, cv=kfoldcross)

# The mean accuracy over the folds is the reported score
kfold_score_pca = score.mean()
print("Kfold Crossvalidation score")
print(f"Accuracy: {kfold_score_pca}")

In [None]:
def Display_Barplot (title,x_axis, y_axis, data,palette, axis):
    """Draw a seaborn bar plot and label every bar with its height.

    title   -- title set on the subplot
    x_axis  -- name of the ``data`` column plotted on the x axis
    y_axis  -- name of the ``data`` column plotted on the y axis
    data    -- data passed to ``sn.barplot``
    palette -- seaborn palette used for the bars
    axis    -- matplotlib Axes to draw on
    """
    chart = sn.barplot(y=y_axis, x=x_axis, data=data, palette=palette, ax=axis)

    # Write each bar's height (5 decimals) 9 points above its top, centred
    for bar in chart.patches:
        bar_height = bar.get_height()
        bar_centre_x = bar.get_x() + bar.get_width() / 2.
        chart.annotate(format(bar_height, '.5f'),
                       (bar_centre_x, bar_height),
                       ha='center', va='center',
                       xytext=(0, 9), textcoords='offset points')

    chart.set_title(title)

In [None]:

# One row per model: training accuracy, test accuracy and k-fold score
data_plot = pd.DataFrame((['SVM',svm_train_accuracy_normal,svm_test_accuracy_normal,kfold_score_normal],
                                 ['SVM After PCA',svm_train_accuracy_pca,svm_test_accuracy_pca,kfold_score_pca]),
                         columns=['Model','train_accuracy','test_accuracy','kfold_score'])


sn.set(font_scale=1.5)
fig, axis = plt.subplots(1, 3, figsize=(30, 10), sharex=False)


Display_Barplot('Train Accuracy Comparison','Model','train_accuracy',data_plot,'Spectral',axis[0])
Display_Barplot('Test Accuracy Comparison','Model','test_accuracy',data_plot,'Spectral',axis[1])
# The third panel plots kfold_score; its title previously repeated
# 'Train Accuracy Comparison' from the first panel.
Display_Barplot('KFold Score Comparison','Model','kfold_score',data_plot,'icefire',axis[2])

plt.show()

data_plot.head()


## Summary of the Analysis

- The above plot shows the variation between the training and the test data; the model fits the training data better than the test data
- The number of features selected before PCA was 15 and after PCA was 7, but the accuracy score differs very little. This shows that good accuracy can be achieved with less computation
- The feature selection after the PCA also shows the same difference in the test data.
- The KFold cross-validation score also decreased after the PCA
- The decrease in accuracy can be read positively: the model is not overfitting, and the number of features used has been greatly reduced, which means much less computation
