# Car Price Prediction Multiple Linear Regression

"Problem Statement"

A Chinese automobile company Geely Auto aspires to enter the US market by setting up their manufacturing unit there and producing cars locally to give competition to their US and European counterparts.

They have contracted an automobile consulting company to understand the factors on which the pricing of cars depends. Specifically, they want to understand the factors affecting the pricing of cars in the American market, since those may be very different from the Chinese market. The company wants to know:

Which variables are significant in predicting the price of a car
How well those variables describe the price of a car
Based on various market surveys, the consulting firm has gathered a large data set of different types of cars across the American market.

"Business Goal"

We are required to model the price of cars with the available independent variables. It will be used by the management to understand how exactly the prices vary with the independent variables. They can accordingly manipulate the design of the cars, the business strategy etc. to meet certain price levels. Further, the model will be a good way for management to understand the pricing dynamics of a new market.

In [None]:
#import required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the car price dataset from the relative input path into a DataFrame.
df=pd.read_csv("../input/car-price-prediction/CarPrice_Assignment.csv")
# CSV file downloaded from https://www.kaggle.com/hellbuoy/car-price-prediction/download

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Preview the last rows of the loaded frame.
df.tail()

In [None]:
# (number of rows, number of columns).
df.shape

In [None]:
# Summary statistics (pandas describes the numeric columns by default).
df.describe()

In [None]:
# Column dtypes and non-null counts.
df.info()

# DATA CLEANING

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# List the distinct car names to see how the field is structured.
df.CarName.unique()

Let's consider only the company name, not the individual models and variants.

In [None]:
df.loc[:,'company'] = df.CarName.str.split(' ').str[0]

In [None]:
df.company = df.company.apply(lambda x: str(x).lower())

In [None]:
df.company.unique()

Some company names are still misspelled.

In [None]:
df['company'].replace('maxda','mazda',inplace=True)
df['company'].replace('porcshce','porsche',inplace=True)
df['company'].replace('toyouta','toyota',inplace=True)
df['company'].replace(['vokswagen','vw'],'volkswagen',inplace=True)
df.company.unique()

In [None]:
df.drop(columns = 'CarName', inplace=True)

In [None]:
df.fuelsystem.unique()

In [None]:
df['fuelsystem'].replace('mfi','mpfi',inplace=True)

In [None]:
df.drivewheel.unique()

In [None]:
df['drivewheel'].replace('4wd', 'fwd', inplace = True)

In [None]:
df.head()

In [None]:
# Left panel: spread of price as a box plot.
ax_spread = plt.subplot(1, 2, 1)
ax_spread.set_title("Car Price Spread")
sns.boxplot(y=df["price"], ax=ax_spread)

# Right panel: distribution of price as a histogram.
ax_dist = plt.subplot(1, 2, 2)
ax_dist.set_title("Car Price Distribution Plot")
sns.histplot(df["price"], ax=ax_dist)


In [None]:
# Outlier analysis: one box plot per numeric column on a 2x4 grid.
fig, axs = plt.subplots(2, 4, figsize=(16, 8))

# Each column is passed explicitly as x= so the plot does not depend on how
# a given seaborn release interprets a positional Series argument.
# axs.flat walks the grid row by row, so the panel order is
# price, wheelbase, enginesize, carheight / stroke, peakrpm, horsepower, boreratio.
for ax, column in zip(axs.flat, ["price", "wheelbase", "enginesize", "carheight",
                                 "stroke", "peakrpm", "horsepower", "boreratio"]):
    sns.boxplot(x=df[column], ax=ax)

plt.tight_layout()


In [None]:
# Outlier treatment for price: keep only rows whose price lies inside the
# inclusive range [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
Q1, Q3 = df["price"].quantile(0.25), df["price"].quantile(0.75)
IQR = Q3 - Q1
df = df[df["price"].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)]


In [None]:
plt.boxplot(df.price)

In [None]:
fig, axs=plt.subplots(2,4,figsize=(16,8))
plt1=sns.boxplot(df["price"],ax=axs[0,0])
plt2=sns.boxplot(df["wheelbase"],ax=axs[0,1])
plt3=sns.boxplot(df["enginesize"],ax=axs[0,2])
plt4=sns.boxplot(df["carheight"],ax=axs[0,3])
plt1=sns.boxplot(df["stroke"],ax=axs[1,0])
plt2=sns.boxplot(df["peakrpm"],ax=axs[1,1])
plt3=sns.boxplot(df["horsepower"],ax=axs[1,2])
plt4=sns.boxplot(df["boreratio"],ax=axs[1,3])

plt.tight_layout()


In [None]:

# Outlier treatment for stroke: keep only rows whose stroke lies inside the
# inclusive range [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
Q1, Q3 = df["stroke"].quantile(0.25), df["stroke"].quantile(0.75)
IQR = Q3 - Q1
df = df[df["stroke"].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)]

In [None]:
# Re-draw the 2x4 grid of box plots after the stroke filter.
fig, axs = plt.subplots(2, 4, figsize=(16, 8))

# Each column is passed explicitly as x= so the plot does not depend on how
# a given seaborn release interprets a positional Series argument.
# axs.flat walks the grid row by row, preserving the panel order.
for ax, column in zip(axs.flat, ["price", "wheelbase", "enginesize", "carheight",
                                 "stroke", "peakrpm", "horsepower", "boreratio"]):
    sns.boxplot(x=df[column], ax=ax)

plt.tight_layout()


In [None]:
# Price against each categorical feature, one box plot per panel on a 3x3 grid.
plt.figure(figsize=(20, 16))
categorical_columns = [
    'symboling', 'fueltype', 'aspiration',
    'doornumber', 'carbody', 'drivewheel',
    'enginelocation', 'cylindernumber', 'fuelsystem',
]
# subplot positions are 1-based, hence start=1.
for position, column in enumerate(categorical_columns, start=1):
    plt.subplot(3, 3, position)
    sns.boxplot(x=column, y='price', data=df)
plt.show()

In [None]:
# Stand-alone view of price by car body style.
sns.boxplot(x ='carbody',y='price',data=df)

In [None]:
# Stand-alone view of price by fuel system.
sns.boxplot(x='fuelsystem',y='price',data=df)

In [None]:
#convert categorical varables into numerical variables for calculations.
doors = {'two': 2, 'four': 4}
fuel = {'gas': 0, 'diesel': 1 }
aspirat = {'std': 0, 'turbo': 1}

df.doornumber = df.doornumber.map(doors)
df.fueltype = df.fueltype.map(fuel)
df.aspiration = df.aspiration.map(aspirat)

In [None]:
cylinders = {'two': 2,'three':3,'four': 4,'five': 5, 'six': 6,'eight': 8, 'twelve':12}
df.cylindernumber = df.cylindernumber.map(cylinders)

In [None]:
# Keep only the columns used for modelling (the target 'price' included);
# .copy() detaches the result from the filtered frame it was sliced from.
df = df[[
    "curbweight", "enginesize", "wheelbase", "cylindernumber",
    "horsepower", "doornumber", "fueltype", "aspiration",
    "price", "carheight", "carwidth", "carlength",
    "stroke", "compressionratio", "peakrpm", "boreratio",
]].copy()

Some columns were dropped after visualization, as they are less significant.

# Numerical data visualization

In [None]:
sns.pairplot(df[["curbweight","enginesize","wheelbase","horsepower","price","carheight","stroke","compressionratio","peakrpm","boreratio"]])
plt.show()

In [None]:
plt.figure(figsize = (16, 10))
sns.heatmap(df.corr(), annot = True, cmap="YlGnBu")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
# Seed NumPy's global RNG; the split below is additionally pinned by random_state.
np.random.seed(0)
# 80/20 train/test split of the modelling frame, reproducible via random_state=100.
df_train, df_test = train_test_split(df, train_size = 0.8, test_size = 0.2, random_state = 100)

# Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler()

In [None]:
num_vars = ['wheelbase','carheight','curbweight','carwidth','carlength','boreratio','stroke','fueltype','cylindernumber','doornumber','aspiration','compressionratio','horsepower','peakrpm','horsepower','price']
df_train[num_vars] = scaler.fit_transform(df_train[num_vars])

In [None]:
# Preview the scaled training frame.
df_train.head()

In [None]:
# Summary statistics of the training frame after scaling.
df_train.describe()

In [None]:
# Split off the target: pop() removes 'price' from df_train in place, so
# X_train (the same object as df_train) is left with the predictors only.
y_train=df_train.pop("price")
X_train=df_train

In [None]:
#importing RFE and LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

In [None]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [None]:
rfe = RFE(lr, 6)             # running RFE
rfe = rfe.fit(X_train, y_train)

In [None]:
# Per column: (name, selected by RFE?, RFE rank).
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

In [None]:
# Columns RFE kept.
col = X_train.columns[rfe.support_]
col

In [None]:
# Columns RFE eliminated.
X_train.columns[~rfe.support_]

In [None]:
# Training design matrix restricted to the RFE-selected columns.
X_train_rfe = X_train[col]

In [None]:
import statsmodels.api as sm  
# statsmodels' OLS does not add an intercept on its own, so add a constant column.
X_train_rfe = sm.add_constant(X_train_rfe)

In [None]:
lm = sm.OLS(y_train,X_train_rfe).fit()   # Running the linear model

In [None]:
#Let's see the summary of our linear model
print(lm.summary())

In [None]:
# Calculate the VIFs for the model
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Variance inflation factor for every column of the current design matrix
# (the added constant included), rounded to 2 decimals, highest first.
X = X_train_rfe
vif = pd.DataFrame({'Features': X.columns})
design = X.values
vif['VIF'] = [variance_inflation_factor(design, idx) for idx in range(design.shape[1])]
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
X_train_rfe= X_train_rfe.drop(['curbweight'], axis=1)
lm = sm.OLS(y_train,X_train_rfe).fit()   

In [None]:
print(lm.summary())

In [None]:
vif = pd.DataFrame()
X = X_train_rfe
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
X_train_rfe= X_train_rfe.drop(['carlength'], axis=1)
lm = sm.OLS(y_train,X_train_rfe).fit()   

In [None]:
print(lm.summary())

In [None]:
vif = pd.DataFrame()
X = X_train_rfe
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
y_train_price = lm.predict(X_train_rfe)

In [None]:
fig = plt.figure()
sns.distplot((y_train - y_train_price), bins = 20)
fig.suptitle('Error', fontsize = 16)
plt.xlabel('Errors', fontsize = 12)    

# Testing the Model

In [None]:
# Apply the already-fitted scaler to the test split (transform only, no re-fit).
df_test[num_vars] = scaler.transform(df_test[num_vars])

df_test.describe()

In [None]:
# Split off the target; pop() removes 'price' from df_test in place, so
# X_test (the same object as df_test) is left with the predictors only.
y_test = df_test.pop('price')
X_test = df_test

In [None]:
X_test = X_test[col]


In [None]:
X_test= X_test.drop(['curbweight','carlength'], axis=1)

In [None]:
X_test_sm = sm.add_constant(X_test)

In [None]:

y_pred=lm.predict(X_test_sm)

In [None]:

# Actual vs. predicted price on the test split.
fig = plt.figure()
plt.scatter(y_test,y_pred)
plt.xlabel('y_test_price', fontsize=12)
plt.ylabel('y_pred', fontsize=12)

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score

In [None]:
# Root-mean-squared error on the test split. 'price' is among the min-max
# scaled columns, so this value is in scaled units, not in currency.
rmse = sqrt(mean_squared_error(y_test, y_pred))
print('Model RMSE:',rmse)

# Coefficient of determination (R^2) on the test split.
r2=r2_score(y_test, y_pred)
print('Model r2_score:',r2)

# Conclusion

Car width, horsepower, stroke and compression ratio are the car attributes that most strongly affect car prices in the USA.
 Geely Auto, the Chinese company, can review and adjust these attributes accordingly when setting the prices of its cars.
 
 Price= -0.1320 + 0.5734(carwidth) +0.5390(horsepower) - 0.1406(stroke) + 0.1897(compressionratio)