# Predict automobile price using Machine Learning

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

In [None]:
# Load the raw automobile data set from the CSV file next to the notebook.
df = pd.read_csv("AutoData.csv")
# Bare last expression: the notebook renders the frame as the cell output.
df

In [None]:
# Summary statistics of the columns (rendered as the cell output).
df.describe()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

# EDA - Exploratory Data Analysis

- Let's begin with some exploratory data analysis. We'll start by checking for missing values.

- We can use seaborn to create a simple heatmap to see where data is missing.

In [None]:
# Boolean mask of missing cells; any null would show up as a contrasting stripe.
missing_mask = df.isnull()
sns.heatmap(missing_mask, cmap='viridis', cbar=False, yticklabels=False)

- Since there is no color change anywhere in the heatmap, there are no null values.

- Let's continue by visualizing some more of the data.

In [None]:
# Number of rows per fuel type, drawn on a white grid background.
sns.set_style('whitegrid')
sns.countplot(data=df, x='fueltype')

- From this we can say that gas cars are more common than diesel cars in this dataset (the plot counts rows, not units sold).

In [None]:
# Fuel-type counts split by body style (one bar colour per carbody value).
sns.set_style('whitegrid')
sns.countplot(
    data=df,
    x='fueltype',
    hue='carbody',
    palette='rainbow',
)

- From this we can say that the sedan is the most common body type in both the gas and the diesel variants.

In [None]:
# Fuel-type counts split by drive-wheel configuration.
sns.set_style('whitegrid')
sns.countplot(
    data=df,
    x='fueltype',
    hue='drivewheel',
    palette='rainbow',
)

- From this we can say that front-wheel drive is the most popular among gas cars, whereas rear-wheel drive is more popular among diesel cars.

In [None]:
# Fuel-type counts split by engine location.
sns.set_style('whitegrid')
sns.countplot(
    data=df,
    x='fueltype',
    hue='enginelocation',
    palette='rainbow',
)

- From this we can say that a front engine location is the most popular in both gas and diesel cars.

In [None]:
# Fuel-type counts split by number of cylinders.
sns.set_style('whitegrid')
sns.countplot(
    data=df,
    x='fueltype',
    hue='cylindernumber',
    palette='rainbow',
)

- From this we can say that four-cylinder cars are more popular than the others for both fuel types.

### Price : Target Variable

In [None]:
# Summary statistics of the target variable.
df['price'].describe()

In [None]:
# Plain 40-bin histogram of price (kernel density overlay switched off).
# NOTE(review): distplot is deprecated in recent seaborn releases in favour of
# histplot - confirm the installed seaborn version before switching.
sns.distplot(df.price, bins=40, kde=False, color='blue')

From this histogram we can say that most cars are priced in the 5000-20000 range.

In [None]:
# Mean price for each symboling value, shown as a bar chart.
mean_price_by_symboling = df.groupby("symboling")[['price']].mean()
barG = mean_price_by_symboling.plot(kind='bar', color='green', legend=False)
barG.set_xlabel("Symbol")
barG.set_ylabel("Price")
plt.show()

In [None]:
# Peek at the first ten raw `make` strings.
df['make'].values[:10]

- The `make` column consists of two parts, the car company and the car model, separated by a space.

In [None]:
# Company name = first space-separated token of `make`, upper-cased.
make_tokens = df['make'].str.split(' ')
df['company'] = make_tokens.str[0].str.upper()

In [None]:
# Distinct company names - used to spot misspellings.
df.company.unique()

In [None]:
# Map the misspelled / abbreviated company names onto their canonical spelling.
company_corrections = {
    'MAXDA': 'MAZDA',
    'PORCSHCE': 'PORSCHE',
    'TOYOUTA': 'TOYOTA',
    'VW': 'VOLKSWAGEN',
    'VOKSWAGEN': 'VOLKSWAGEN',
}
df['company'] = df['company'].replace(company_corrections)

In [None]:
# Mean price per company, kept as a regular column (as_index=False) so the
# frame can be merged back onto `df` later.
df_avg = (
    df[['company', 'price']]
    .groupby("company", as_index=False)
    .mean()
    .rename(columns={'price': 'Avgprice'})
)

# `sort_columns` was dropped from the call: it only orders the plotted y-columns
# (there is a single one here) and the keyword was removed in pandas 2.0, where
# passing it raises an error.
barG = df_avg.plot(x='company', kind='bar', legend=False, figsize=(15, 5), color='purple')
barG.set_xlabel("Company")
barG.set_ylabel("Avg Price")
plt.show()

- This plot shows the average car price for each company.

In [None]:
def price_segment(avg_price):
    """Return the price segment label for a company's average price.

    Below 12000 -> "Budget", from 12000 up to (not including) 24000 ->
    "Mid_Range", everything else -> "Luxury".
    """
    if avg_price < 12000:
        return "Budget"
    if avg_price < 24000:
        return "Mid_Range"
    return "Luxury"


# Attach each company's average price to every row. Dropping any previous
# 'Avgprice' / 'Car_cat' first keeps the cell idempotent: without it a re-run
# would merge a second 'Avgprice' (producing Avgprice_x / Avgprice_y) and the
# lookup below would raise a KeyError.
df = df.drop(columns=['Avgprice', 'Car_cat'], errors='ignore').merge(df_avg, on='company')
df['Car_cat'] = df['Avgprice'].apply(price_segment)


In [None]:
# Combined mileage: 60 % city mpg plus 40 % highway mpg.
city_part = df['citympg'].mul(0.6)
highway_part = df['highwaympg'].mul(0.4)
df['mileage'] = city_part.add(highway_part)

- On average, people drive more in the city than on the highway, so city mileage is given the larger weight.
- The combined mileage is a weighted average of city mileage (weight 0.6) and highway mileage (weight 0.4).

In [None]:
# Work on a copy so the column drops and encoding below leave `df` untouched.
auto = df.copy()

In [None]:
# Columns excluded from modelling (citympg/highwaympg are already folded into
# `mileage`, and company/Avgprice are represented by `Car_cat`).
unused_columns = [
    'make', 'symboling', 'doornumber', 'enginelocation', 'carheight',
    'fuelsystem', 'stroke', 'compressionratio', 'peakrpm',
    'citympg', 'highwaympg', 'company', 'Avgprice',
]
auto = auto.drop(columns=unused_columns)

In [None]:
# Display the reduced modelling frame.
auto

### Converting Categorical Variables


In [None]:
# One-hot encode every categorical column, dropping the first level of each to
# avoid perfectly collinear dummies.
# The dtype is pinned to uint8 (the historical pandas default). pandas >= 2.0
# emits bool dummies instead, and a design matrix mixing float and bool columns
# is converted to object dtype, which statsmodels OLS and the VIF computation
# further down reject.
auto = pd.get_dummies(auto, drop_first=True, dtype=np.uint8)
auto

- ML algorithms cannot work with categorical variables directly, so they are converted into numeric dummy variables.

### Correlation

In [None]:
# Annotated correlation matrix of every (now numeric) column in `auto`.
corr_matrix = auto.corr()
plt.figure(figsize=(30, 30))
ax = sns.heatmap(corr_matrix, annot=True, linewidth=3)
ax.tick_params(size=10, labelsize=10)
ax.set_title("Automobile industrie sale", fontsize=25)
plt.show()

- From this heatmap we can pick the variables that are most strongly correlated with price.

## Linear regression model

> The simple linear regression model 

In [None]:
from sklearn.linear_model import LinearRegression

# Single predictor (engine size) and target (price), both as (n, 1) column arrays.
X = auto[['enginesize']].values
Y = auto[['price']].values

# Fit on every row; `pre` holds the fitted values used for the regression-line plot.
linear_regressor = LinearRegression().fit(X, Y)
pre = linear_regressor.predict(X)

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=51
)

for caption, part in (
    ('Shape of X_train = ', X_train),
    ('Shape of y_train = ', y_train),
    ('Shape of X_test = ', X_test),
    ('Shape of y_test = ', y_test),
):
    print(caption, part.shape)

+ Y_predict -

In [None]:
# The regressor was originally fitted on *all* rows, including the ones that
# later became the test split, so scoring it on X_test would not be an
# out-of-sample evaluation. Refit on the training split only before predicting.
linear_regressor.fit(X_train, y_train)

Y_pred = linear_regressor.predict(X_test)
Y_pred.shape

+ r2 value -

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination of the simple model on the held-out rows.
r2_score(y_true=y_test, y_pred=Y_pred)

In [None]:
# Observations as points, fitted values as the green regression line.
scatter_fig, scatter_ax = plt.subplots()
scatter_ax.scatter(X, Y)
scatter_ax.plot(X, pre, color='green')
plt.show()

- This plot shows the data points and the predicted regression line.

> Multiple linear regression model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Split the full encoded frame 80/20 (same seed as the simple model) and start
# from a fresh, unfitted regressor.
df_train, df_test = train_test_split(auto, test_size=0.2, random_state=51)
linear_regressor = LinearRegression()
 

- Scale the values so that all numeric features are on a comparable range for the model.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaler with its default target range spelled out.
sc = MinMaxScaler(feature_range=(0, 1))

In [None]:
import warnings

# Silence all warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Continuous columns (target included) that get min-max scaled.
num_vars = [
    'wheelbase', 'carlength', 'carwidth', 'curbweight', 'enginesize',
    'boreratio', 'horsepower', 'price', 'mileage',
]

# Learn the per-column min/max from the training rows, then rescale them.
sc.fit(df_train[num_vars])
df_train[num_vars] = sc.transform(df_train[num_vars])

In [None]:
# Separate the scaled target from the predictors.
y_train = df_train['price']
X_train = df_train.drop(columns='price')

for caption, part in (('Shape of X = ', X_train), ('Shape of y = ', y_train)):
    print(caption, part.shape)

- RFE - Recursive Feature Elimination


In [None]:
from sklearn.feature_selection import RFE

In [None]:
from sklearn.linear_model import LinearRegression
# Fresh, unfitted estimator that RFE will use to rank the features.
linear_regressor = LinearRegression()

In [None]:
# Display the training predictors.
X_train

In [None]:
# Baseline fit on all predictors.
linear_regressor.fit(X_train, y_train)

# Recursive feature elimination down to the 10 highest-ranked predictors.
# `n_features_to_select` is passed by keyword: scikit-learn >= 1.0 rejects it
# as a positional argument, while older releases accept the keyword too.
rfe = RFE(linear_regressor, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

In [None]:
# (column name, selected-by-RFE flag, RFE rank) for every predictor.
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

In [None]:
# Names of the predictors that RFE kept.
selected_mask = rfe.get_support()
col = X_train.columns[selected_mask]
col

In [None]:
# Training design matrix restricted to the RFE-selected columns.
X_train_rfe = X_train[col]

In [None]:
import statsmodels.api as sm

# statsmodels OLS does not add an intercept by itself, so add the constant column
# (defaults written out: constant goes first, skipped if one is already present).
X_train_rfe = sm.add_constant(X_train_rfe, prepend=True, has_constant='skip')

In [None]:
# Ordinary least squares on the RFE-selected predictors plus intercept.
ols_spec = sm.OLS(endog=y_train, exog=X_train_rfe)
linear_regressor = ols_spec.fit()

- VIF (Variance Inflation Factor)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of every column of the current design matrix,
# rounded to two decimals and listed from most to least collinear.
X = X_train_rfe
vif = (
    pd.DataFrame({
        'Features': X.columns,
        'VIF': [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by="VIF", ascending=False)
)
vif

In [None]:
# Elimination step 1: remove cylindernumber_twelve
# (presumably chosen from the VIF table above - outputs are not visible here).
X_train_new1 = X_train_rfe.drop(columns=["cylindernumber_twelve"])

In [None]:
import statsmodels.api as sm

# Refit OLS on the reduced predictor set and print the regression summary.
X_train_lm = sm.add_constant(X_train_new1)
ols_spec = sm.OLS(endog=y_train, exog=X_train_lm)
linear_regressor = ols_spec.fit()
print(linear_regressor.summary())

In [None]:
# VIF table for the predictors left after elimination step 1.
X = X_train_new1
vif = (
    pd.DataFrame({
        'Features': X.columns,
        'VIF': [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by="VIF", ascending=False)
)
vif

In [None]:
# Elimination step 2: remove carbody_sedan.
X_train_new2 = X_train_new1.drop(columns=["carbody_sedan"])

In [None]:
import statsmodels.api as sm

# Refit OLS after step 2 (result kept in `lm`) and print the summary.
X_train_lm = sm.add_constant(X_train_new2)
ols_spec = sm.OLS(endog=y_train, exog=X_train_lm)
lm = ols_spec.fit()
print(lm.summary())

In [None]:
# Elimination step 3: remove carbody_hardtop.
X_train_new3 = X_train_new2.drop(columns=["carbody_hardtop"])

In [None]:
import statsmodels.api as sm

# Refit OLS after step 3 and print the summary.
X_train_lm = sm.add_constant(X_train_new3)
ols_spec = sm.OLS(endog=y_train, exog=X_train_lm)
linear_regressor = ols_spec.fit()
print(linear_regressor.summary())

In [None]:
# VIF table for the predictors left after elimination step 3.
X = X_train_new3
vif = (
    pd.DataFrame({
        'Features': X.columns,
        'VIF': [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by="VIF", ascending=False)
)
vif

In [None]:
# Elimination step 4: remove curbweight.
X_train_new4 = X_train_new3.drop(columns=["curbweight"])

In [None]:
import statsmodels.api as sm

# Refit OLS after step 4 and print the summary.
X_train_lm = sm.add_constant(X_train_new4)
ols_spec = sm.OLS(endog=y_train, exog=X_train_lm)
linear_regressor = ols_spec.fit()
print(linear_regressor.summary())

In [None]:
# Elimination step 5: remove carbody_wagon.
X_train_new5 = X_train_new4.drop(columns=["carbody_wagon"])

In [None]:
import statsmodels.api as sm

# Refit OLS after step 5 and print the summary.
X_train_lm = sm.add_constant(X_train_new5)
ols_spec = sm.OLS(endog=y_train, exog=X_train_lm)
linear_regressor = ols_spec.fit()
print(linear_regressor.summary())

In [None]:
# VIF table for the predictors left after elimination step 5.
X = X_train_new5
vif = (
    pd.DataFrame({
        'Features': X.columns,
        'VIF': [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])],
    })
    .round({'VIF': 2})
    .sort_values(by="VIF", ascending=False)
)
vif

In [None]:
# Elimination step 6: remove enginetype_dohcv.
X_train_new6 = X_train_new5.drop(columns=["enginetype_dohcv"])

In [None]:
import statsmodels.api as sm

# Final OLS fit; this `linear_regressor` is the model used for the test
# predictions further down.
X_train_lm = sm.add_constant(X_train_new6)
ols_spec = sm.OLS(endog=y_train, exog=X_train_lm)
linear_regressor = ols_spec.fit()
print(linear_regressor.summary())

In [None]:
# Same continuous columns that were scaled on the training split.
num_vars = ['wheelbase', 'carlength', 'carwidth', 'curbweight', 'enginesize','boreratio', 'horsepower', 'price','mileage']

# Apply the scaler that was fitted on the training data. Calling fit_transform
# here would re-learn min/max from the test rows, putting the test features on
# a different scale than the one the model was trained on and leaking test-set
# statistics into the evaluation.
df_test[num_vars] = sc.transform(df_test[num_vars])

In [None]:
# Separate the scaled target from the predictors on the test split.
y_test = df_test['price']
X_test = df_test.drop(columns='price')

for caption, part in (('Shape of X = ', X_test), ('Shape of y = ', y_test)):
    print(caption, part.shape)

In [None]:
import statsmodels.api as sm

# Restrict the test predictors to the columns listed below and add the
# intercept column expected by the statsmodels model.
final_features = ['carwidth', 'horsepower', 'Car_cat_Luxury', 'carbody_hatchback']
X_test_new = sm.add_constant(X_test[final_features])

In [None]:
# First rows of the test design matrix.
X_test_new.head()

In [None]:
# Full test design matrix.
X_test_new

In [None]:
# All test predictors, before the column selection.
X_test

- y_predict

In [None]:
# Predicted (scaled) price for each test row from the fitted OLS model.
y_pred = linear_regressor.predict(X_test_new)

In [None]:
# Display the predictions.
y_pred

- r2 value

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination of the multiple-regression model on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Actual vs. predicted (scaled) price on the test split.
fig = plt.figure()
fig.suptitle('Test Vs Prediction', fontsize=15)
plt.scatter(y_test, y_pred)
plt.xlabel('Test', fontsize=12)
plt.ylabel('Prediction', fontsize=12)

#### We can see that the equation of our best fitted line is:

 price = 0.3505 carwidth + 0.4010 horsepower + 0.2858 Car_cat_Luxury -0.0318  carbody_hatchback -0.0757     

- These are the factors that drive price:
  > carwidth, horsepower and Car_cat_Luxury have positive coefficients.
  
  > carbody_hatchback and the constant have negative coefficients.