# Predict automobile price using Machine Learning




# Import all libraries

In [1]:
import warnings
# Suppress every warning for the rest of the session. Note that this also hides
# deprecation warnings raised by the libraries used in later cells.
warnings.filterwarnings('ignore')

In [2]:
#import all libraries

import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from sklearn.feature_selection import RFE
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Reading the data:

In [5]:
# Location of the raw CSV. Set the AUTODATA_CSV environment variable to point at
# your own copy of the file; the fallback is the path this notebook was written with.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like a
# saved row index that then stays in the frame — consider index_col=0 after
# confirming against the file.
data_path = os.environ.get(
    'AUTODATA_CSV',
    'C:\\Users\\NITESH\\Downloads\\Predict-automobile-price-using-Machine-Learning-main\\AutoData (1).csv',
)
cars_data = pd.read_csv(data_path)
cars_data.head()

Unnamed: 0,symboling,make,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,168.8,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,168.8,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,171.2,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,176.6,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,176.6,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [None]:
cars_data.tail()

In [None]:
cars_data.shape

In [None]:
cars_data.dtypes

In [None]:
cars_data.info()

In [None]:
cars_data.describe()

# Data Cleaning:

In [None]:
cars_data.duplicated().sum()

In [None]:
cars_data.isnull().sum()

In [None]:
# symboling column: the assigned insurance risk rating.
# A value of +3 indicates that the auto is risky, -3 that it is probably pretty safe.
cars_data['symboling'].value_counts()

In [None]:
sns.pairplot(y_vars = 'symboling', x_vars = 'price' ,data = cars_data)

In [None]:
# 'make' column: the car name, e.g. 'audi 100 ls' (manufacturer first, then model)
cars_data['make'].value_counts()

In [None]:
# Manufacturer = the text before the first space of the 'make' string
cars_data['car_company'] = cars_data['make'].map(lambda full_name: full_name.split(' ')[0])

In [None]:
#rechecking
cars_data.head()

In [None]:
# Remove the original 'make' column from the frame
cars_data = cars_data.drop(columns=['make'])

In [None]:
cars_data['car_company'].value_counts()

We see some spelling mistakes in the data.

In [None]:
# Normalise misspelled / inconsistently cased manufacturer names.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on cars_data['car_company']: the in-place form
# mutates an intermediate Series, which pandas does not guarantee to write
# back to cars_data (and which is a no-op under Copy-on-Write).
company_fixes = {
    'toyouta': 'toyota',
    'Nissan': 'nissan',
    'maxda': 'mazda',
    'vokswagen': 'volkswagen',
    'vw': 'volkswagen',
    'porcshce': 'porsche',
}
cars_data['car_company'] = cars_data['car_company'].replace(company_fixes)

In [None]:
#rechecking the data:
cars_data['car_company'].value_counts()

In [None]:
# fueltype - Car fuel type i.e gas or diesel
cars_data['fueltype'].value_counts()

In [None]:
#aspiration - Aspiration used in a car
cars_data['aspiration'].value_counts()

In [None]:
#doornumber - Number of doors in a car
cars_data['doornumber'].value_counts()

The door counts are spelled out as words ('two', 'four'), so let us convert them to numeric form.

In [None]:
def number_(x):
    """Translate the door-count words 'four' and 'two' in ``x`` into 4 and 2.

    ``x`` must offer a pandas-style ``.map`` (the notebook passes the
    'doornumber' column). The mapped object returned by ``.map`` is returned.
    """
    door_words = dict(four=4, two=2)
    return x.map(door_words)
    
# Replace the spelled-out door counts with their numeric values
cars_data['doornumber'] = number_(cars_data['doornumber'])

In [None]:
#rechecking
cars_data['doornumber'].value_counts()

In [None]:
#carbody- body of car
cars_data['carbody'].value_counts()

In [None]:
#drivewheel - type of drive wheel
cars_data['drivewheel'].value_counts()

In [None]:
#enginelocation - Location of car engine
cars_data['enginelocation'].value_counts()

In [None]:
# wheelbase - Wheelbase of car
cars_data['wheelbase'].value_counts().head()

In [None]:
sns.distplot(cars_data['wheelbase'])
plt.show()

In [None]:
#carlength - Length of car
cars_data['carlength'].value_counts().head()

In [None]:
sns.distplot(cars_data['carlength'])
plt.show()

In [None]:
#enginetype - Type of engine.
cars_data['enginetype'].value_counts()

In [None]:
#cylindernumber- cylinder placed in the car
cars_data['cylindernumber'].value_counts()

The cylinder counts are spelled out as words ('four', 'six', ...), so let us convert them to numeric form.

In [None]:
def number(x):
    """Translate spelled-out cylinder counts in ``x`` into integers.

    Recognised words: two, three, four, five, six, eight, twelve.
    ``x`` must offer a pandas-style ``.map`` (the notebook passes the
    'cylindernumber' column). The mapped object returned by ``.map`` is returned.
    """
    cylinder_words = dict(two=2, three=3, four=4, five=5, six=6, eight=8, twelve=12)
    return x.map(cylinder_words)
# Replace the spelled-out cylinder counts with their numeric values
cars_data['cylindernumber'] = number(cars_data['cylindernumber'])

In [None]:
cars_data['cylindernumber'].value_counts()

In [None]:
#fuelsystem - Fuel system of car
cars_data['fuelsystem'].value_counts()

**We have now looked at most of the columns individually. Next, let us use data visualization to look for patterns.**

# Data Visualization:

In [None]:
cars_numeric = cars_data.select_dtypes(include =['int64','float64'])
cars_numeric.head()

In [None]:
cars_numeric.info()

In [None]:
# Scatter-plot matrix of every numeric column.
# sns.pairplot is a figure-level function that builds its own figure, so the
# former plt.figure(figsize=(30, 30)) call only produced an extra empty figure
# and did not size the grid; it has been removed.
sns.pairplot(cars_numeric)
plt.show()

**As there are a lot of columns, we can't make out much from the graph above. Let us check the correlations instead.**

In [None]:
plt.figure(figsize=(20, 20))
# Correlate the numeric columns only. cars_data still holds text columns at
# this point, and DataFrame.corr() raises on them in pandas >= 2.0 (older
# releases dropped them silently), so the selection is made explicit.
numeric_corr = cars_data.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cmap='YlGnBu')
plt.show()

**-Price is highly (positively) correlated with wheelbase, carlength, carwidth, curbweight, enginesize, horsepower.**

**-Price is negatively correlated to symboling, citympg and highwaympg.**

**-This suggests that cars with high mileage may fall in the 'economy' category and are priced lower.**

**-There are many independent variables which are highly correlated: wheelbase, carlength, curbweight, enginesize etc.. all are positively correlated.**

In [None]:
categorical_cols = cars_data.select_dtypes(include = ['object'])
categorical_cols.head()

In [None]:
# Price distribution per level of each categorical predictor, one panel per
# column on a 3x3 grid (seven panels used). A loop replaces seven copy-pasted
# subplot/boxplot pairs; plt.show() matches the other plotting cells.
box_columns = ['fueltype', 'aspiration', 'carbody', 'drivewheel',
               'enginelocation', 'enginetype', 'fuelsystem']
plt.figure(figsize=(20, 12))
for position, column in enumerate(box_columns, start=1):
    plt.subplot(3, 3, position)
    sns.boxplot(x=column, y='price', data=cars_data)
plt.show()

In [None]:
plt.figure(figsize = (20,12))
sns.boxplot(x = 'car_company', y = 'price', data = cars_data)

**1. From the price boxplot it is clear that the brands with the most expensive vehicles in the dataset are BMW, Buick, Jaguar and Porsche.**

**2. Whereas the lower priced cars belong to chevrolet**

**3. The median price of gas vehicles is lower than that of Diesel Vehicles.**

**4. 75th percentile of standard aspirated vehicles have a price lower than the median price of turbo aspirated vehicles.** 

**5. Two and four Door vehicles are almost equally priced. There are however some outliers in the price of 
two-door vehicles.** 

**6. Hatchback vehicles have the lowest median price of vehicles in the data set whereas hardtop vehicles have 
the highest median price.**

**7. The price of vehicles with rear placed engines is significantly higher than the price of vehicles with front 
placed engines.** 

**8. Almost all vehicles in the dataset have engines placed in the front of the vehicle. However, the price of 
vehicles with rear placed engines is significantly higher than the price of vehicles with front placed engines.** 

**9. The median cost of eight cylinder vehicles is higher than other cylinder categories.**

**10. Vehicles with a Multi-port Fuel Injection [MPFI] fuel system have the highest median price. There are 
also some outliers on the higher price side among vehicles with MPFI systems.**

**11. Vehicles with OHCV engine type falls under higher price range.**

# Data preparation:

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
# dtype=int keeps the indicators numeric on every pandas version: pandas >= 2.0
# defaults to bool dummies, and a frame mixing bool and float columns converts
# to an object array, which the later statsmodels fits are likely to reject.
cars_dummies = pd.get_dummies(categorical_cols, drop_first=True, dtype=int)
cars_dummies.head()

In [None]:
car_df  = pd.concat([cars_data, cars_dummies], axis =1)

In [None]:
# Remove the original categorical columns from the combined frame
car_df = car_df.drop(
    columns=[
        'fueltype', 'aspiration', 'carbody', 'drivewheel',
        'enginelocation', 'enginetype', 'fuelsystem', 'car_company',
    ]
)

In [None]:
car_df.info()

# Splitting the data into test and train

In [None]:
df_train, df_test = train_test_split(car_df, train_size = 0.7, test_size = 0.3, random_state = 100)

In [None]:
df_train.shape

In [None]:
df_test.shape

# Rescaling the data:

In [None]:
cars_numeric.columns

In [None]:
col_list = ['symboling', 'doornumber', 'wheelbase', 'carlength', 'carwidth','carheight', 'curbweight', 'cylindernumber', 'enginesize', 'boreratio',
            'stroke', 'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg', 'price']

In [None]:
scaler = StandardScaler()

In [None]:
df_train[col_list] = scaler.fit_transform(df_train[col_list])

In [None]:
df_train.describe()

# Model building:

In [None]:
y_train = df_train.pop('price')
X_train = df_train

## Model building using RFE

In [None]:
# Base estimator handed to RFE
lr = LinearRegression()
lr.fit(X_train, y_train)

# Recursive feature elimination down to 15 columns.
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 rejects it as
# a positional argument, while the keyword form also works on older releases.
rfe = RFE(lr, n_features_to_select=15)
rfe.fit(X_train, y_train)

In [None]:
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

In [None]:
cols = X_train.columns[rfe.support_]
cols

## Model 1:

In [None]:
X1 = X_train[cols]
X1_sm = sm.add_constant(X1)

lr_1 = sm.OLS(y_train,X1_sm).fit()

In [None]:
print(lr_1.summary())

All the p- values are significant. Let us check VIF.

In [None]:
# VIF of every Model 1 predictor, rounded to 2 decimals, highest first.
# NOTE(review): X1 carries no constant column here — confirm that is intended
# for variance_inflation_factor.
vif = pd.DataFrame({
    'Features': X1.columns,
    'VIF': [variance_inflation_factor(X1.values, col_idx) for col_idx in range(X1.shape[1])],
})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

We see that a few variables have an infinite or very large VIF, so they add little to the model. Eliminating them manually one by one is time-consuming and makes the code unnecessarily long, so let's build a model with 10 features this time using RFE.

## Building the model with 10 variables:

In [None]:
# Fresh base estimator for the second, smaller selection
lr2 = LinearRegression()

# Recursive feature elimination down to 10 columns.
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 rejects it as
# a positional argument, while the keyword form also works on older releases.
rfe2 = RFE(lr2, n_features_to_select=10)
rfe2.fit(X_train, y_train)

In [None]:
list(zip(X_train.columns,rfe2.support_,rfe2.ranking_))

In [None]:
supported_cols = X_train.columns[rfe2.support_]
supported_cols

## Model 2:

In [None]:
X2 = X_train[supported_cols]
X2_sm = sm.add_constant(X2)

model_2 = sm.OLS(y_train,X2_sm).fit()

In [None]:
print(model_2.summary())

Model looks fine. Let us check the vif.

In [None]:
# VIF of every Model 2 predictor, rounded to 2 decimals, highest first
vif = pd.DataFrame({
    'Features': X2.columns,
    'VIF': [variance_inflation_factor(X2.values, col_idx) for col_idx in range(X2.shape[1])],
})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

## Model 3:

In [None]:
X3 = X2.drop(['car_company_subaru'], axis =1)
X3_sm = sm.add_constant(X3)

Model_3 = sm.OLS(y_train,X3_sm).fit()

In [None]:
print(Model_3.summary())

In [None]:
# VIF of every Model 3 predictor, rounded to 2 decimals, highest first
vif = pd.DataFrame({
    'Features': X3.columns,
    'VIF': [variance_inflation_factor(X3.values, col_idx) for col_idx in range(X3.shape[1])],
})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

Let us drop column -enginetype_ohcf.

## Model 4:

In [None]:
X4 = X3.drop(['enginetype_ohcf'], axis =1)
X4_sm = sm.add_constant(X4)

Model_4 = sm.OLS(y_train,X4_sm).fit()

In [None]:
print(Model_4.summary())

In [None]:
# VIF of every Model 4 predictor, rounded to 2 decimals, highest first
vif = pd.DataFrame({
    'Features': X4.columns,
    'VIF': [variance_inflation_factor(X4.values, col_idx) for col_idx in range(X4.shape[1])],
})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

VIF for car_company_peugeot is still high. Let us drop and rebuild the model.

## Model 5:

In [None]:
X5 = X4.drop(['car_company_peugeot'], axis =1)
X5_sm = sm.add_constant(X5)

Model_5 = sm.OLS(y_train,X5_sm).fit()

In [None]:
print(Model_5.summary())

In [None]:
# VIF of every Model 5 predictor, rounded to 2 decimals, highest first
vif = pd.DataFrame({
    'Features': X5.columns,
    'VIF': [variance_inflation_factor(X5.values, col_idx) for col_idx in range(X5.shape[1])],
})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

Let us drop variable enginetype_l.

## Model 6:

In [None]:
X6 = X5.drop(['enginetype_l'], axis =1)
X6_sm = sm.add_constant(X6)

Model_6 = sm.OLS(y_train,X6_sm).fit()

In [None]:
print(Model_6.summary())

In [None]:
# VIF of every Model 6 predictor, rounded to 2 decimals, highest first
vif = pd.DataFrame({
    'Features': X6.columns,
    'VIF': [variance_inflation_factor(X6.values, col_idx) for col_idx in range(X6.shape[1])],
})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

All the VIF values and p-values seem to be in a good range. Also the Adjusted R-squared is 89%. This model is explaining most of the variance without being too complex.

## Residual analysis:

In [None]:
y_train_pred = Model_6.predict(X6_sm)
y_train_pred.head()

In [None]:
Residual = y_train- y_train_pred

In [None]:
sns.distplot(Residual, bins =15)

The residuals look approximately normally distributed.

# Making Predictions:

In [None]:
df_test[col_list] = scaler.transform(df_test[col_list])

In [None]:
y_test = df_test.pop('price')
X_test = df_test

In [None]:
final_cols = X6.columns

In [None]:
X_test_model6= X_test[final_cols]
X_test_model6.head()

In [None]:
X_test_sm = sm.add_constant(X_test_model6)

In [None]:
y_pred = Model_6.predict(X_test_sm)

In [None]:
y_pred.head()

In [None]:
# x-axis positions 1..n, one per test observation. The length is derived from
# y_test rather than hard-coded as range(1, 63), so the plot keeps working if
# the size of the test split changes.
c = list(range(1, len(y_test) + 1))
plt.plot(c, y_test, color='Blue')   # actual (scaled) prices
plt.plot(c, y_pred, color='red')    # model predictions


In [None]:
plt.scatter(y_test, y_pred)
plt.xlabel('y_test')
plt.ylabel('y_pred')

Although the model does well on the lower values, there are still a few high values which it is not able to explain.

# Evaluation:

In [None]:
r_squ = r2_score(y_test,y_pred)
r_squ

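**So the linear equation for price can be given as (price and the numeric predictors carwidth and enginesize are in standardized units, because they were rescaled with StandardScaler before fitting):**

$$\text{price} = -0.0748 + 0.3978 \times \text{carwidth} + 0.5204 \times \text{enginesize} + 2.0419 \times \text{enginelocation\_rear} + 0.7640 \times \text{enginetype\_rotor} + 1.1294 \times \text{car\_company\_bmw} - 0.5879 \times \text{car\_company\_renault}$$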

**Which variables are significant in predicting the price of a car?**

enginesize

carwidth

enginetype_rotor

car_company_bmw

enginelocation_rear

car_company_renault (negatively)


**These are the variables that are significant in predicting the price of a car.**