# Automobile Dataset

### loading the libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Loading the dataset

In [None]:
# Read the automobile CSV into a DataFrame and echo it as the cell output.
csv_path = "../input/automobiles/Automobile.csv"
auto = pd.read_csv(csv_path)
auto

### Checking the head and Tail of data

In [None]:
auto.head()

In [None]:
auto.tail()

### checking the columns

In [None]:
auto.columns

### checking the column dtypes and non-null counts

In [None]:
auto.info()

### Fetching The categorical columns

In [None]:
# Show only the object-dtype (categorical) columns.
auto.select_dtypes(include='object')

## Univariate Analysis

### Make

In [None]:
# Bar chart of the number of rows per manufacturer.
make_counts = auto['make'].value_counts()
plt.figure(figsize=(10, 10))
make_counts.plot(kind='bar', color='orange')
plt.show()

### Fuel_Type

In [None]:
# Bar chart of the number of rows per fuel type, most frequent first.
fuel_type_counts = auto['fuel_type'].value_counts(ascending=False)
plt.figure(figsize=(7, 7))
fuel_type_counts.plot(kind='bar', colormap='Paired')
plt.show()

### Aspiration

In [None]:
# Bar chart of the number of rows per aspiration type, with a legend.
aspiration_counts = auto['aspiration'].value_counts()
plt.figure(figsize=(7, 7))
aspiration_counts.plot(kind='bar', colormap='YlGn_r')
plt.legend()
plt.show()

### number_of_doors

In [None]:
# Bar chart of the number of rows per door count, one colour per bar.
door_counts = auto['number_of_doors'].value_counts()
plt.figure(figsize=(7, 7))
door_counts.plot(kind='bar', color=['orange', 'blue'])
plt.show()

### body_style

In [None]:
# Bar chart of the number of rows per body style, one colour per bar.
body_style_counts = auto['body_style'].value_counts()
bar_colors = ['blue', 'orange', 'yellow', 'red', 'red']
plt.figure(figsize=(7, 7))
body_style_counts.plot(kind='bar', color=bar_colors)
plt.show()

## Creating a Subplot

In [None]:
# 2x2 grid of bar charts, one per remaining categorical column:
# drive_wheels, engine_location, engine_type, number_of_cylinders.
fig, axes = plt.subplots(2, 2, figsize=[10, 8])
fig.suptitle('CategoryPlots', size=15, color='blue')

# (column, x-axis label, extra bar styling); the last panel keeps the
# default bar colour.
panels = [
    ('drive_wheels', 'Drive wheels', {'color': 'red'}),
    ('engine_location', 'Engine_location', {'color': 'orange'}),
    ('engine_type', 'Engine_type', {'color': 'yellow'}),
    ('number_of_cylinders', 'number_of_cylinders', {}),
]
for ax, (column, label, bar_style) in zip(axes.flat, panels):
    auto[column].value_counts().plot(kind='bar', ax=ax, **bar_style)
    ax.set_xlabel(label, size=10, color='red')

# Extra vertical spacing so the rotated tick labels do not overlap the row below.
plt.subplots_adjust(hspace=1)
plt.show()

## Numerical Columns

In [None]:
# Show only the non-object (numerical) columns.
auto.select_dtypes(exclude='object')

### Symboling

In [None]:
# Distribution of `symboling` (missing values dropped).
# `sns.distplot` is deprecated; a density-scaled histogram with a KDE overlay
# is the equivalent figure in the current seaborn API.
sns.histplot(auto['symboling'].dropna(), kde=True, stat='density',
             kde_kws=dict(cut=3), color="red")
plt.show()

### normalized_losses	

In [None]:
# Distribution of `normalized_losses` (missing values dropped).
# `sns.distplot` is deprecated; a density-scaled histogram with a KDE overlay
# is the equivalent figure in the current seaborn API.
sns.histplot(auto['normalized_losses'].dropna(), kde=True, stat='density',
             kde_kws=dict(cut=3), color="orange")
plt.show()

### engine_size


In [None]:
# Distribution of `engine_size` (missing values dropped).
# `sns.distplot` is deprecated; a density-scaled histogram with a KDE overlay
# is the equivalent figure in the current seaborn API.
sns.histplot(auto['engine_size'].dropna(), kde=True, stat='density',
             kde_kws=dict(cut=3), color='yellow')
plt.show()

### horsepower

In [None]:
# Distribution of `horsepower` (missing values dropped).
# `sns.distplot` is deprecated; a density-scaled histogram with a KDE overlay
# is the equivalent figure in the current seaborn API.
sns.histplot(auto['horsepower'].dropna(), kde=True, stat='density',
             kde_kws=dict(cut=3), color="green")
plt.show()

### peak_rpm

In [None]:
# Distribution of `peak_rpm` (missing values dropped), default colour.
# `sns.distplot` is deprecated; a density-scaled histogram with a KDE overlay
# is the equivalent figure in the current seaborn API.
sns.histplot(auto['peak_rpm'].dropna(), kde=True, stat='density',
             kde_kws=dict(cut=3))
plt.show()

### Price

In [None]:
# Distribution of `price` (missing values dropped).
# `sns.distplot` is deprecated; a density-scaled histogram with a KDE overlay
# is the equivalent figure in the current seaborn API.
sns.histplot(auto['price'].dropna(), kde=True, stat='density',
             kde_kws=dict(cut=3), color="aqua")
plt.show()

## Bivariate Analysis: Plotting with Respect to the Target Variable

In [None]:
auto

### Scatterplot for Symboling vs price

In [None]:
# Scatter of price against the symboling rating.
sns.scatterplot(data=auto, x='symboling', y='price')
plt.show()

### Box-plot for Categorical vs Numerical i.e Make VS Price 

In [None]:
# Price distribution per manufacturer; tick labels rotated so they stay legible.
plt.figure(figsize=(10, 8))
sns.boxplot(data=auto, x='make', y='price')
plt.xticks(rotation=90)
plt.show()

In [None]:
# aspiration vs price: price distribution per aspiration type.
plt.figure(figsize=(8, 8))
sns.boxplot(data=auto, x='aspiration', y='price')
plt.xticks(rotation=90)
plt.show()

### number_of_doors vs Price

In [None]:
# Price distribution per door count; tick labels are rotated by 90 degrees.
plt.figure(figsize=(8,8))
sns.boxplot(x='number_of_doors',y='price',data=auto)                 #number_of_doors vs price
plt.xticks(rotation=90)
plt.show()

### Plotting in 1 go using for loop

In [None]:
# Names of the object-dtype columns; drives the box-plot loop.
plotlist = auto.select_dtypes(include='object').columns
plotlist

In [None]:
# One price box plot per categorical column, each shown as its own figure.
for column in plotlist:
    sns.boxplot(data=auto, x=column, y='price')
    plt.xticks(rotation=90)
    plt.show()

## Heatmap

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
# `auto` still contains object (string) columns here; recent pandas versions
# raise when `corr()` meets non-numeric data, so restrict to numeric columns
# explicitly instead of relying on them being dropped silently.
numeric_corr = auto.select_dtypes(include='number').corr()
plt.figure(figsize=(10, 10))
sns.heatmap(numeric_corr, annot=True)
plt.show()

### Inferences:

1. Engine size and curb_weight show a strong correlation with the price of the car.

2. Horsepower also shows a strong positive correlation with price.

 ## Data Preprocessing

In [None]:
# Convert the spelled-out door counts to integers; other values are left untouched.
door_words_to_numbers = {'two': 2, 'four': 4}
auto['number_of_doors'] = auto['number_of_doors'].replace(door_words_to_numbers)

In [None]:
auto

## Dealing With Missing Value-Missing Value Treatment:

In [None]:
auto.isna().sum()

### Outlier Analysis

### Selecting only numerical columns

In [None]:
# Numeric (non-object) columns only.
# Take an explicit copy: columns of `plot` are overwritten during outlier
# treatment, and assigning into a slice of `auto` would be chained assignment
# (SettingWithCopyWarning, with no guarantee of which frame gets modified).
plot = auto.select_dtypes(exclude='object').copy()
plot

In [None]:
# One box plot per numeric column to eyeball outliers.
for i in plot.columns:
    # Pass the data as `x=`: the meaning of the first positional argument of
    # `sns.boxplot` differs between seaborn releases.
    sns.boxplot(x=plot[i])
    plt.show()

In [None]:
# Tukey fences for `normalized_losses`: 1.5 * IQR beyond the quartiles.
Q1, Q3 = auto['normalized_losses'].quantile([0.25, 0.75])
IQR = Q3 - Q1
LB = Q1 - 1.5 * IQR
UP = Q3 + 1.5 * IQR

In [None]:
UP,LB

In [None]:
auto['normalized_losses'].quantile(0.99)

In [None]:
# `normalized_losses` values that lie above the upper fence.
auto.loc[auto['normalized_losses'] > UP, 'normalized_losses']

In [None]:
# Cap values above the upper fence at the 99th percentile.
# A boolean mask states the intent directly. Passing a Series as `to_replace`
# to `Series.replace` is treated as a dict-like keyed by index labels rather
# than as a list of values to replace.
losses = auto['normalized_losses']
box = losses.mask(losses > UP, losses.quantile(0.99))

In [None]:
# Box plot of the capped `normalized_losses`.
# Pass the data as `x=` because the meaning of the first positional argument
# of `sns.boxplot` differs between seaborn releases; `plt.show()` matches the
# other plotting cells.
sns.boxplot(x=box)
plt.show()

### Now doing for all numerical columns

In [None]:
# Cap outliers in every numeric column of `plot`: values above the upper
# Tukey fence become the column's 99th percentile, values below the lower
# fence become its 1st percentile.
for i in plot.columns:
    Q3 = plot[i].quantile(0.75)
    Q1 = plot[i].quantile(0.25)
    IQR = Q3 - Q1
    UP = Q3 + 1.5 * IQR
    LB = Q1 - 1.5 * IQR
    # Boolean masks instead of `replace(<Series>, value)`: a Series passed as
    # `to_replace` is dict-like to pandas (keyed by index), not a list of
    # values. The two steps run in the original order, so the 1st percentile
    # is taken after the upper cap has been applied.
    capped = plot[i].mask(plot[i] > UP, plot[i].quantile(0.99))
    capped = capped.mask(capped < LB, capped.quantile(0.01))
    plot[i] = capped
    
    
    
    

In [None]:
# Re-draw one box plot per numeric column to inspect the effect of capping.
for i in plot.columns:
    # Pass the data as `x=`: the meaning of the first positional argument of
    # `sns.boxplot` differs between seaborn releases.
    sns.boxplot(x=plot[i])
    plt.show()

### As we can see Outliers have been reduced

# Scaling

In [None]:
plot

### StandardScaler

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise every numeric column of `plot`.
# NOTE(review): the scaler is fitted on all rows and all numeric columns; if
# the result feeds a model, consider fitting on the training split only.
ss = StandardScaler()
# Preserve `plot`'s row index so later index-aligned operations (e.g.
# `pd.concat(..., axis=1)` with other frames derived from `auto`) line up row
# for row even when the index is not the default 0..n-1 range.
plot_sc = pd.DataFrame(ss.fit_transform(plot), columns=plot.columns, index=plot.index)

In [None]:
plot_sc

In [None]:
plot_sc['symboling'].mean(),plot_sc['symboling'].std()

### As we can see, the mean has been shifted to 0 and the standard deviation rescaled to approximately 1 (pandas `.std()` uses the sample estimate, so it is not exactly 1)

## MinMaxScaler:


In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Rescale every numeric column of `plot` with MinMaxScaler (default range).
ms = MinMaxScaler()
# Preserve `plot`'s row index so the scaled frame stays aligned with the
# frames it was derived from.
plot_ms = pd.DataFrame(ms.fit_transform(plot), columns=plot.columns, index=plot.index)

In [None]:
plot_ms  # Values lie between 0 and 1 after min-max scaling

## Plotting the graphs

In [None]:
# Density estimate of the standardised `symboling` column.
plot_sc['symboling'].plot.density(color='aqua')
plt.show()

In [None]:
# Density estimate of the unscaled `symboling` column, for comparison.
plot['symboling'].plot.density(color='cadetblue')
plt.show()

### We can see that after applying StandardScaler the shape of the distribution remains the same; only the location and scale of the axis change

### Robust Scaler:

In [None]:
from sklearn.preprocessing import RobustScaler

In [None]:
# Rescale every numeric column of `plot` with RobustScaler (default settings).
rs = RobustScaler()
# Preserve `plot`'s row index so the scaled frame stays aligned with the
# frames it was derived from.
plot_rs = pd.DataFrame(rs.fit_transform(plot), columns=plot.columns, index=plot.index)

In [None]:
plot_rs

In [None]:
# Density estimate of the robust-scaled `symboling` column.
plot_rs['symboling'].plot.kde()
plt.show()

## Normalizer:

In [None]:
from sklearn.preprocessing import Normalizer

In [None]:
# Apply the L2 Normalizer to the numeric frame.
norm = Normalizer(norm='l2')
# Preserve `plot`'s row index so the normalised frame stays aligned with the
# frames it was derived from.
plot_norm = pd.DataFrame(norm.fit_transform(plot), columns=plot.columns, index=plot.index)

In [None]:
plot_norm

## Encoding

In [None]:
# Object-dtype (categorical) columns only.
# Take an explicit copy: several columns of `cat_col` are overwritten below,
# and assigning into a selection of `auto` can trigger SettingWithCopyWarning.
cat_col = auto.select_dtypes(include='object').copy()
cat_col

In [None]:
cat_col['drive_wheels'].value_counts()

In [None]:
cat_col['fuel_system'].value_counts()

In [None]:
# Collapse the fuel-system codes into three coarser groups: fi, bbl and di.
fuel_system_groups = {
    'mpfi': 'fi', 'spfi': 'fi', 'mfi': 'fi',
    '1bbl': 'bbl', '2bbl': 'bbl', '4bbl': 'bbl',
    'idi': 'di', 'spdi': 'di',
}
cat_col['fuel_system'] = cat_col['fuel_system'].replace(fuel_system_groups)

In [None]:
cat_col

In [None]:
cat_col['engine_type'].value_counts()

In [None]:
# Fold the 'dohc' engine type into 'ohc'.
cat_col['engine_type'] = cat_col['engine_type'].replace('dohc', 'ohc')

In [None]:
cat_col['engine_type'].value_counts()

In [None]:
cat_col['body_style'].value_counts()

In [None]:
# Relative frequency (share of rows) of each manufacturer.
make_column = cat_col['make']
frequencies = make_column.value_counts(normalize=True)

In [None]:
# Frequency-encode `make`: each manufacturer name becomes its share of rows.
encoded_make = cat_col['make'].map(frequencies)
cat_col['make'] = encoded_make

In [None]:
cat_col

In [None]:
cat_col['fuel_system'].value_counts()

In [None]:
# One-hot encode the listed categorical columns, dropping the first level of
# each; columns not listed here are passed through unchanged.
dummy_columns = [
    'fuel_type',
    'aspiration',
    'body_style',
    'drive_wheels',
    'engine_location',
    'engine_type',
    'number_of_cylinders',
    'fuel_system',
]
cat_dum = pd.get_dummies(cat_col, columns=dummy_columns, drop_first=True)
cat_dum

In [None]:
# Join the standardised numeric columns and the encoded categoricals side by
# side (rows are aligned on the index).
final_data = pd.concat([plot_sc, cat_dum], axis='columns')

In [None]:
final_data

### checking the shape of final data

In [None]:
final_data.shape

In [None]:
final_data.columns

### checking the shape of original data

In [None]:
auto.shape

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the target (`price`) from the feature columns.
out = final_data['price']
inp = final_data.drop(columns='price')

In [None]:
# Hold out 30% of the rows for testing; fixed seed for a reproducible split.
xtrain, xtest, ytrain, ytest = train_test_split(
    inp,
    out,
    test_size=0.3,
    random_state=0,
)

In [None]:
print(xtrain.shape)

In [None]:
print(xtest.shape)

In [None]:
print(ytest.shape)

In [None]:
print(ytrain.shape)