In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
pd.pandas.set_option('display.max_columns', None)
import seaborn as sns
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm 
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Read the car-price CSV; the path is relative to the notebook's working directory.
df=pd.read_csv("../input/car-price-prediction/CarPrice_Assignment.csv")
# Preview the first rows to check that the file parsed as expected.
df.head()

## The dataset has 205 rows and 26 columns, several of which are categorical and must be converted to dummy variables. `car_ID` is not needed because the DataFrame index already identifies each row, so we drop it.

In [None]:
# car_ID is a row identifier only; remove it from the working frame.
df = df.drop(columns="car_ID")

In [None]:
# Dimensions of the working frame as (rows, columns).
df.shape

In [None]:
# List the column labels currently in the frame.
df.columns

In [None]:
# Distinct raw CarName strings, to see how the names are formatted.
df['CarName'].unique()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Column dtypes, non-null counts and memory usage of the frame.
df.info()

## `CarName` holds both the manufacturer and the model name. The model name is not used further, so we keep only the manufacturer.

In [None]:
# The manufacturer is the text before the first space in CarName.
CarCompany = df['CarName'].map(lambda full_name: full_name.split(' ', 1)[0])
# Place the new column at position 3 of the frame.
df.insert(loc=3, column="CarCompany", value=CarCompany)

In [None]:
# CarName has been replaced by CarCompany, so it can be removed.
df = df.drop(columns="CarName")
df.head()

In [None]:
# Print the distinct levels of several categorical columns, one array per line.
for column_name in ("drivewheel", "fuelsystem", "enginetype", "carbody"):
    print(df[column_name].unique())

In [None]:
# Distinct manufacturer strings as extracted from CarName, before any spelling fixes.
CarCompany.unique()

## Looking at the unique values, the manufacturer names contain some spelling mistakes that we need to correct.

In [None]:
def replace_name(a, b, frame=None):
    """Replace manufacturer name ``a`` with ``b`` in the ``CarCompany`` column.

    Parameters
    ----------
    a : str
        Value to look for (typically a misspelled name).
    b : str
        Value to write in its place.
    frame : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``df`` is used.

    The column is reassigned explicitly rather than calling
    ``replace(..., inplace=True)`` on ``df.CarCompany``: that form edits an
    intermediate Series and does not reach the parent frame when pandas
    Copy-on-Write is active.
    """
    target = df if frame is None else frame
    target["CarCompany"] = target["CarCompany"].replace(a, b)

# Spelling variants found in CarCompany, each mapped to the spelling to keep.
# Note that "nissan" is folded into the capitalised "Nissan".
name_corrections = {
    "maxda": "mazda",
    "nissan": "Nissan",
    "porcshce": "porsche",
    "vokswagen": "volkswagen",
    "vw": "volkswagen",
    "toyouta": "toyota",
}
for wrong_name, right_name in name_corrections.items():
    replace_name(wrong_name, right_name)

# Show the cleaned set of manufacturer names.
print(df["CarCompany"].unique())

## Cylinder number and door number are stored as words (categorical). We replace them with the equivalent numeric values, since the meaning is the same.

In [None]:
# Frequency of each cylinder-count label.
df["cylindernumber"].value_counts()

In [None]:
def convert_feature(x):
    """Map cylinder-count words in Series ``x`` to their integer values.

    Parameters
    ----------
    x : pandas.Series
        Values such as ``"four"`` or ``"six"``.

    Returns
    -------
    pandas.Series
        The same positions with numeric counts. Values outside the lookup
        become NaN.

    The lookup also maps each number to itself, so running the cell a second
    time (when the column is already numeric) leaves the data unchanged
    instead of turning every value into NaN.
    """
    word_to_count = {"two":2,"three":3,"four":4,"five":5,"six":6,"eight":8,"twelve":12}
    lookup = dict(word_to_count)
    lookup.update({count: count for count in word_to_count.values()})
    return x.map(lookup)
                  
# Overwrite the word labels with their numeric counts.
df["cylindernumber"] = convert_feature(df["cylindernumber"])

In [None]:
# Frequency of each door-count label.
df["doornumber"].value_counts()

In [None]:
def number(x):
    """Map door-count words in Series ``x`` (``"two"``/``"four"``) to 2 and 4.

    Values outside the lookup become NaN. The numbers 2 and 4 map to
    themselves, so re-running the cell on an already-converted column is a
    no-op rather than producing NaN.
    """
    return x.map({"two":2,"four":4,2:2,4:4})
# Overwrite the door-count words with their numeric values.
df["doornumber"] = number(df["doornumber"])

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

### Separating the numeric and categorical data

In [None]:
# Keep only the int64/float64 columns; used below for the pair plot.
df_numeric = df.select_dtypes(include =['int64','float64'])
df_numeric.head()

In [None]:
# (rows, columns) of the numeric subset.
df_numeric.shape

In [None]:
# Keep only the object-dtype columns; these are dummy-encoded later.
df_categorical=df.select_dtypes(include=["object"])
df_categorical.head()

In [None]:
# Pairwise scatter plots of all numeric columns.
# sns.pairplot builds its own figure, so no plt.figure() call precedes it;
# a preceding call would only leave an extra empty figure in the output.
sns.pairplot(df_numeric)
plt.show()


# Visualising the data for categorical values

In [None]:
# Frequency bar charts for three categorical columns, drawn side by side.
plt.figure(figsize=(30, 6))

# (column, panel title, x-axis label, y-axis label) for each panel.
bar_panels = [
    ('CarCompany', 'Manufacturer', 'Manufacturer', 'Frequency of company'),
    ('fueltype', 'Fuel type', 'Fuel Type', 'Frequency of fuel type'),
    ('carbody', 'Car Type ', 'Car Type', 'Frequency of Car type'),
]
for panel_index, (column_name, panel_title, x_text, y_text) in enumerate(bar_panels, start=1):
    plt.subplot(1, 3, panel_index)
    plt1 = df[column_name].value_counts().plot(kind='bar')
    plt.title(panel_title)
    plt1.set(xlabel=x_text, ylabel=y_text)

plt.show()

In [None]:
# Mean price per manufacturer, highest first.
df1 = pd.DataFrame(df.groupby(['CarCompany'])['price'].mean().sort_values(ascending = False))
# DataFrame.plot.bar opens its own figure, so the size is passed to it
# directly; a separate plt.figure(figsize=...) would stay empty.
df1.plot.bar(figsize=(25, 6))
plt.title('Car Company vs Price')
plt.show()

### 1. Toyota is the most frequent manufacturer in the dataset.
### 2. Gas-fuelled cars are more common than diesel cars.
### 3. Sedan is the most common body type.
### 4. Jaguar, Buick, Porsche and BMW are the high-priced manufacturers.


In [None]:
plt.figure(figsize=(25,6))

plt.subplot(1,2,1)
plt.title('Engine Type Histogram')
# Use the x=/data= form like the other count plots in this notebook: newer
# seaborn treats the first positional argument as `data`, not as the x variable.
sns.countplot(x="enginetype", data=df, palette="Blues_d")
plt.show()

In [None]:
# Mean price per engine type, sorted from highest to lowest.
df1 = pd.DataFrame(df.groupby(['enginetype'])['price'].mean().sort_values(ascending = False))
# Bar chart of those means; figsize is passed to the plot call itself.
df1.plot.bar(figsize=(8,6))
plt.title('Engine Type vs price')
plt.show()

### In first graph ohc engine is most preferred due to its rigidity
### in second graph dohcv engine is having  high cost 

In [None]:
# Count plots for four categorical/discrete columns in a single row.
plt.figure(figsize=(30,10))

# (panel title, column) for each of the four panels.
count_panels = [
    ('Aspiration', "aspiration"),
    ('Number of Cylinders', "cylindernumber"),
    ('Fuel System', "fuelsystem"),
    ('Drive Wheel', "drivewheel"),
]
for slot, (heading, column_name) in enumerate(count_panels, start=1):
    plt.subplot(1, 4, slot)
    plt.title(heading)
    sns.countplot(x=column_name, data=df)

plt.show()

### 1. Standard (std) aspiration is more common than turbo, possibly because turbo costs more.
### 2. Four-cylinder cars are more common than any other cylinder count.
### 3. mpfi and 2bbl are the most common fuel systems. mpfi and idi have the highest price range, but there is too little data for the other categories to derive any meaningful inference.
### 4. fwd is the most common drive wheel.


In [None]:
plt.figure(figsize=(30,10))

# Only the first cell of a 1x3 grid is used, so the plot occupies a third of the figure width.
plt.subplot(1,3,1)
plt.title('Symboling')
# Number of cars at each symboling value.
sns.countplot(x="symboling", data=df)

plt.show()

### 0 & 1 are mostly sold cars

# Box Plots

In [None]:
# Price distribution per level of three features, one box plot per panel.
plt.figure(figsize=(20,5))

box_panels = [
    ('Fuel Type', "fueltype"),
    ('Aspiration', "aspiration"),
    ('doornumber', "doornumber"),
]
for slot, (heading, column_name) in enumerate(box_panels, start=1):
    plt.subplot(1, 3, slot)
    plt.title(heading)
    sns.boxplot(x=column_name, y="price", data=df)

plt.show()

### 1. Diesel cars are priced higher than gas cars.
### 2. Turbo aspiration costs more than std, consistent with the count plot above.
### 3. Two-door and four-door cars have similar prices.

In [None]:
# Price distribution by body style, drive wheel and engine location.
plt.figure(figsize=(20,5))

box_panels = [
    ('Carbody', "carbody"),
    ('Drivewheel', "drivewheel"),
    ('Engine Location', "enginelocation"),
]
for slot, (heading, column_name) in enumerate(box_panels, start=1):
    plt.subplot(1, 3, slot)
    plt.title(heading)
    sns.boxplot(x=column_name, y="price", data=df)

plt.show()

### 1.Hardtop cars cost is higher as compare to others
### 2.rwd drive wheel price is high as compare to fwd,4whd.
### 3.Rear engine are like having constant high range.

In [None]:
# One price box per manufacturer; the wide figure leaves room for the many categories on the x-axis.
plt.figure(figsize = (20,12))
sns.boxplot(x = 'CarCompany', y = 'price', data = df)

### From the price boxplot it is clear that The brands with the most expensive vehicles in the dataset belong to Bmw,Buick,Jaguar and porsche.Whereas the lower priced cars belong to chevrolet

In [None]:
# Price distribution by engine type, cylinder count and fuel system.
plt.figure(figsize=(20,8))

box_panels = [
    ('Type of Engine', "enginetype"),
    ('Cylinder number', "cylindernumber"),
    ('Fuel System', "fuelsystem"),
]
for slot, (heading, column_name) in enumerate(box_panels, start=1):
    plt.subplot(1, 3, slot)
    plt.title(heading)
    sns.boxplot(x=column_name, y="price", data=df)

plt.show()

### It is clear that vehicles Multi-port Fuel Injection [MPFI] fuelsystem have the highest median price. There are 
### also some outliers on the higher price side having MPFI systems.

In [None]:
plt.figure(figsize=(20,12))

# Only the first cell of a 1x3 grid is drawn.
plt.subplot(1,3,1)
plt.title('Symboling')
# Price distribution at each symboling value.
sns.boxplot(x="symboling", y="price", data=df)

In [None]:
# Scatter plots of every numeric feature against price.

def scatter(x, fig):
    """Plot column ``x`` of ``df`` against ``price`` in cell ``fig`` of a 7x2 grid.

    ``x`` is a column name; ``fig`` is the 1-based subplot position.
    """
    panel = plt.subplot(7, 2, fig)
    panel.scatter(df[x], df["price"])
    panel.set_title(x + ' vs Price')
    panel.set_xlabel(x)
    panel.set_ylabel("Price")


plt.figure(figsize=(15,30))

# Features in the order they fill the grid (left to right, top to bottom).
scatter_features = [
    'symboling', 'wheelbase', 'carlength', 'carwidth', 'carheight',
    'curbweight', 'enginesize', 'boreratio', 'stroke', 'compressionratio',
    'horsepower', 'peakrpm', 'citympg', 'highwaympg',
]
for grid_slot, feature_name in enumerate(scatter_features, start=1):
    scatter(feature_name, grid_slot)

plt.tight_layout()




#### From the correlation and scatter plots, car height, compression ratio, peak RPM, city MPG, highway MPG and symboling are not strongly correlated with price, so we could proceed without these features. City MPG and highway MPG are also highly correlated with each other, so a new variable could be created by taking the mean of both.

In [None]:
plt.figure(figsize = (20,20))
# Correlate the numeric columns only: df still holds object-dtype categorical
# columns here, and DataFrame.corr() raises on those in pandas >= 2.0
# (older versions silently skipped them, so the matrix is the same).
sns.heatmap(df.select_dtypes(include="number").corr(), annot = True ,cmap = 'YlGnBu')
plt.show()

#### Price is highly (positively) correlated with wheelbase, carlength, carwidth, curbweight, enginesize, horsepower.

#### Price is negatively correlated to symboling, citympg and highwaympg.

#### This suggest that cars having high mileage may fall in the 'economy' cars category, and are priced lower.

#### There are many independent variables which are highly correlated: wheelbase, carlength, curbweight, enginesize etc.. all are positively correlated.

# Creating Dummies

In [None]:
# One-hot encode the categorical columns, dropping the first level of each to
# avoid a redundant (perfectly collinear) dummy.
# dtype is pinned to uint8: pandas >= 2.0 defaults to bool, and a frame mixing
# bool and float columns becomes an object array that statsmodels' OLS and
# variance_inflation_factor reject further down.
cars_dummies = pd.get_dummies(df_categorical, drop_first = True, dtype = "uint8")
cars_dummies.shape

In [None]:
# Append the dummy columns to the full frame column-wise; the original categorical columns are still present at this point.
df_car = pd.concat([df, cars_dummies], axis =1)

In [None]:
# Remove the original categorical columns now that their dummies are in place.
df_car = df_car.drop(
    columns=[
        "CarCompany",
        "fueltype",
        "aspiration",
        "carbody",
        "drivewheel",
        "enginelocation",
        "enginetype",
        "fuelsystem",
    ]
)


In [None]:
# Check the dtypes and non-null counts of the encoded modelling frame.
df_car.info()

#  Model building

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 70/30 train/test split; random_state is fixed so the split is reproducible.
df_train, df_test = train_test_split(df_car, train_size = 0.7, test_size = 0.3, random_state = 100)

Rescaling the data

In [None]:
# (rows, columns) of the training split.
df_train.shape

In [None]:
# (rows, columns) of the test split.
df_test.shape

In [None]:
# Standardising scaler; it is fitted on the training split and reused for the test split.
scaler = StandardScaler()


In [None]:
# Numeric columns to standardise. The target "price" is included, so the model is fitted on a scaled target.
col_list=["symboling","doornumber","wheelbase","carlength","carwidth","carheight","curbweight","cylindernumber","enginesize",
          "boreratio","stroke","compressionratio","horsepower","peakrpm","citympg","highwaympg","price"]

In [None]:
# Fit the scaler on the training rows only and overwrite those columns with the scaled values.
df_train[col_list] = scaler.fit_transform(df_train[col_list])

In [None]:
# Summary statistics of the training split after scaling.
df_train.describe()

In [None]:
# pop() removes 'price' from df_train in place and returns it as the target.
y_train = df_train.pop('price')
# X_train is another name for the same df_train object (no copy is made).
X_train = df_train

### Model building using RFE with 15 variables

In [None]:
lr = LinearRegression()
lr.fit(X_train,y_train)

# Recursive feature elimination down to 15 columns.
# n_features_to_select is passed by keyword: it is keyword-only in
# scikit-learn >= 1.0, where RFE(lr, 15) raises a TypeError.
rfe = RFE(lr, n_features_to_select=15)
rfe.fit(X_train, y_train)

In [None]:
# For every column: (name, whether RFE kept it, RFE rank).
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

In [None]:
# Names of the columns RFE selected (support_ is a boolean mask over X_train.columns).
cols = X_train.columns[rfe.support_]
cols

### Model=1

In [None]:

# Restrict the training features to the RFE-selected columns.
X1 = X_train[cols]
# statsmodels OLS does not add an intercept by itself, so a constant column is added explicitly.
X1_sm = sm.add_constant(X1)

# Fit ordinary least squares on the selected features.
lr_1 = sm.OLS(y_train,X1_sm).fit()

In [None]:
# Regression summary for model 1 (coefficients, p-values, R-squared).
print(lr_1.summary())

All the p- values are significant. Let us check VIF.

In [None]:
# Variance inflation factor of every feature in model 1, largest first.
# NOTE(review): the VIFs are computed on X1, i.e. without the constant column
# that the OLS fit uses — confirm this is intended.
design_1 = X1.values
vif = pd.DataFrame({
    'Features': X1.columns,
    'VIF': [variance_inflation_factor(design_1, position) for position in range(X1.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

The VIF values of some variables are high; by our standard practice, VIF should be less than 2.6. So, instead of removing those variables one by one, we will rebuild the model with 10 variables.

In [None]:
lr2 = LinearRegression()

# Repeat the feature elimination, keeping 10 columns this time.
# n_features_to_select is passed by keyword: it is keyword-only in
# scikit-learn >= 1.0, where RFE(lr2, 10) raises a TypeError.
rfe2 = RFE(lr2, n_features_to_select=10)
rfe2.fit(X_train,y_train)

In [None]:
# NOTE(review): this cell repeats the preceding cell exactly; it refits the
# same 10-feature RFE and could be removed.
lr2 = LinearRegression()

# n_features_to_select is passed by keyword: it is keyword-only in
# scikit-learn >= 1.0, where RFE(lr2, 10) raises a TypeError.
rfe2 = RFE(lr2, n_features_to_select=10)
rfe2.fit(X_train,y_train)

In [None]:
# Names of the 10 columns kept by the second RFE run.
supported_cols = X_train.columns[rfe2.support_]
supported_cols

### MODEL=2

In [None]:

# Restrict the training features to the columns chosen by rfe2.
X2 = X_train[supported_cols]
# Add the intercept column that statsmodels OLS needs.
X2_sm = sm.add_constant(X2)

# Fit model 2 on those features.
model_2 = sm.OLS(y_train,X2_sm).fit()

In [None]:
# Regression summary for model 2.
print(model_2.summary())

All the p- values are significant. Let us check VIF.

In [None]:

# Variance inflation factor of every feature in model 2, largest first.
design_2 = X2.values
vif = pd.DataFrame({
    'Features': X2.columns,
    'VIF': [variance_inflation_factor(design_2, position) for position in range(X2.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

### MODEL=3 by dropping "CarCompany_peugeot"

In [None]:
# Remove CarCompany_peugeot from the model-2 feature set.
X3 = X2.drop(['CarCompany_peugeot'], axis =1)
# Add the intercept column for statsmodels.
X3_sm = sm.add_constant(X3)

# Refit OLS on the reduced feature set.
Model_3 = sm.OLS(y_train,X3_sm).fit()

In [None]:
# Regression summary for model 3.
print(Model_3.summary())

All the p- values are significant. Let us check VIF.

In [None]:
# Variance inflation factor of every feature in model 3, largest first.
design_3 = X3.values
vif = pd.DataFrame({
    'Features': X3.columns,
    'VIF': [variance_inflation_factor(design_3, position) for position in range(X3.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

### MODEL=4 by dropping "enginelocation_rear"

In [None]:
# Remove enginelocation_rear from the model-3 feature set.
X4 = X3.drop(['enginelocation_rear'], axis =1)
# Add the intercept column for statsmodels.
X4_sm = sm.add_constant(X4)

# Refit OLS; this is the model used for the residual and test-set analysis below.
Model_4 = sm.OLS(y_train,X4_sm).fit()

In [None]:
# Regression summary for model 4.
print(Model_4.summary())

All the VIF values and p-values seem to be in a good range. Also the Adjusted R-squared is 87%. 
This model is explaining most of the variance without being too complex.

In [None]:
# Variance inflation factor of every feature in model 4, largest first.
design_4 = X4.values
vif = pd.DataFrame({
    'Features': X4.columns,
    'VIF': [variance_inflation_factor(design_4, position) for position in range(X4.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

In [None]:
# In-sample predictions of model 4 on the training design matrix (constant included).
y_train_pred = Model_4.predict(X4_sm)
y_train_pred.head()

### Residuals

In [None]:
# Training residuals: actual minus predicted (both on the scaled price).
Residual = y_train- y_train_pred

In [None]:
# Distribution of the training residuals, in 15 bins.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider histplot/displot once the seaborn version is confirmed.
sns.distplot(Residual, bins =15)

Error is normally distributed

### Prediction Analysis

In [None]:
# Scale the test split with the scaler fitted on the training split (transform only, no refit).
df_test[col_list] = scaler.transform(df_test[col_list])

In [None]:
# pop() removes 'price' from df_test in place and returns it as the test target.
y_test = df_test.pop('price')
# X_test is another name for the same df_test object (no copy is made).
X_test = df_test

In [None]:
# Feature names used by the final model (model 4).
final_cols = X4.columns

In [None]:
# Keep only the final model's features in the test set, in the same column order as X4.
X_test_model4= X_test[final_cols]
X_test_model4

In [None]:
# Add the intercept column so the test design matrix matches the one Model_4 was fitted on.
X_test_sm = sm.add_constant(X_test_model4)

In [None]:
# Display the test design matrix (constant plus selected features).
X_test_sm

In [None]:
# Out-of-sample predictions of model 4 on the test design matrix.
y_test_pred = Model_4.predict(X_test_sm)

In [None]:
# Preview the first test-set predictions.
y_test_pred.head()

In [None]:

# Actual (blue) and predicted (red) price for every test observation.
# The x positions follow the size of the test set instead of a hard-coded 62,
# so the cell still works if the split size or the dataset changes.
c = list(range(1, len(y_test) + 1))
plt.plot(c, y_test, color = 'Blue', label = 'Actual')
plt.plot(c, y_test_pred, color = 'red', label = 'Predicted')
# Price was standardised together with the other numeric columns, so the
# y-axis is in scaled units.
plt.xlabel("Test observation")
plt.ylabel("Price (standardized)")
plt.legend()
plt.show()


In [None]:

# Actual vs predicted price on the test set.
plt.scatter(y_test, y_test_pred)
plt.xlabel('y_test')
plt.ylabel('y_test_pred')

In [None]:
# R-squared of model 4 on the held-out test set.
r_squ = r2_score(y_test,y_test_pred)
r_squ

### Final variables used to predict car price: "enginesize", "CarCompany_bmw", "CarCompany_buick", "CarCompany_porsche", "carbody_hardtop", "carbody_hatchback", "enginetype_l", "enginetype_rotor"