In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm 
import sklearn
from sklearn.model_selection import train_test_split

In [None]:
# Read the car-price CSV from the relative ../input directory and preview the first rows
cars_data = pd.read_csv("../input/car-price-prediction/CarPrice_Assignment.csv")
cars_data.head()

## Data Understanding -

In [None]:
# Column labels of the raw frame
cars_data.columns

In [None]:
# (rows, columns) of the raw frame; the author records 205 data points and 26 variables
cars_data.shape

In [None]:
# Dtypes and non-null counts per column; the author notes that there are no null values
cars_data.info()

In [None]:
# Summary statistics of the columns
cars_data.describe()

In [None]:
# Let's verify the correlation between the numeric variables.
# The frame still contains string columns at this point (CarName, fueltype, ...);
# DataFrame.corr() raises on those in pandas >= 2.0, so select the numeric columns
# explicitly (this works on every pandas version).
numeric_cols = cars_data.select_dtypes(include='number')
plt.figure(figsize=(20,10))
sns.heatmap(numeric_cols.corr(), annot=True)
plt.show()

**Observation**
1. The 'car_ID' variable is irrelevant for predicting the price.
2. Some variables show a high correlation with each other, which will affect the final model.


In [None]:
# Drop the row identifier and the predictors removed because of high inter-correlation
redundant_cols = ['car_ID', 'carwidth', 'curbweight', 'wheelbase', 'highwaympg']
cars_data = cars_data.drop(columns=redundant_cols)

## Data Preparation -

**The CarName column is in the format 'Company_name Car_name' (some company names contain a hyphen).**
1. Let's keep only the company name for the further analysis.
2. Hyphenated company names are cut at the hyphen, e.g. 'Alpha-Romero' becomes 'Alpha'.

In [None]:
# Treat hyphens as spaces, then keep only the first space-delimited token (the company name)
dehyphenated = cars_data['CarName'].str.replace('-', ' ', regex=False)
cars_data['CarName'] = dehyphenated.str.split(' ', n=1).str[0]
cars_data['CarName'].unique()

In [None]:
# Frequency of each company name; used to spot misspelled variants
cars_data['CarName'].value_counts()

**Some company names are misspelled or written inconsistently, so we need to correct them:**
1. Toyota
2. Mazda
3. Nissan
4. Volkswagen
5. Porsche

In [None]:
# Map every misspelled or differently-cased company name to its canonical spelling
spelling_fixes = {
    "toyouta": "toyota",
    "maxda": "mazda",
    "Nissan": "nissan",
    "vw": "volkswagen",
    "vokswagen": "volkswagen",
    "porcshce": "porsche",
}
cars_data['CarName'] = cars_data['CarName'].replace(spelling_fixes)

In [None]:
# Re-check the company frequencies after the spelling corrections
cars_data['CarName'].value_counts()

In [None]:
# Preview the frame after the CarName clean-up
cars_data.head()

In [None]:
# Category frequencies of fueltype, inspected before encoding it as a binary variable
cars_data['fueltype'].value_counts()

In [None]:
# Encode fueltype as 1 for 'gas' and 0 for any other value, then show the new counts.
# Run once: on an already-encoded column nothing equals 'gas', so everything becomes 0.
is_gas = cars_data['fueltype'].eq('gas')
cars_data['fueltype'] = is_gas.astype('int64')
cars_data['fueltype'].value_counts()

In [None]:
# Category frequencies of aspiration, inspected before encoding it as a binary variable
cars_data['aspiration'].value_counts()

In [None]:
# Encode aspiration as 1 for 'std' and 0 for any other value, then show the new counts.
# Run once: on an already-encoded column nothing equals 'std', so everything becomes 0.
is_std = cars_data['aspiration'].eq('std')
cars_data['aspiration'] = is_std.astype('int64')
cars_data['aspiration'].value_counts()

In [None]:
# Category frequencies of doornumber, inspected before encoding it as a two-valued variable
cars_data['doornumber'].value_counts()

In [None]:
# Encode doornumber as 2 for 'four' and 1 for any other value, then show the new counts.
# Run once: on an already-encoded column nothing equals 'four', so everything becomes 1.
has_four_doors = cars_data['doornumber'].eq('four')
cars_data['doornumber'] = has_four_doors.map({True: 2, False: 1})
cars_data['doornumber'].value_counts()

In [None]:
# Category frequencies of enginelocation, inspected before encoding it as a binary variable
cars_data['enginelocation'].value_counts()

In [None]:
# Encode enginelocation as 1 for 'front' and 0 for any other value, then show the new counts.
# Run once: on an already-encoded column nothing equals 'front', so everything becomes 0.
is_front = cars_data['enginelocation'].eq('front')
cars_data['enginelocation'] = is_front.astype('int64')
cars_data['enginelocation'].value_counts()

In [None]:
# Category frequencies of cylindernumber (left as a categorical column for dummy encoding)
cars_data['cylindernumber'].value_counts()

In [None]:
# Creating dummy variables for the remaining categorical (string) columns.
# dtype=int forces 0/1 integer indicators: pandas >= 2.0 returns bool dummies by default,
# and a design matrix mixing float and bool columns is converted to an object array by
# statsmodels, which makes the sm.OLS fits further down fail.
cars_data = pd.get_dummies(cars_data, dtype=int)
cars_data.head()

In [None]:
# Check the column dtypes after dummy encoding
cars_data.info()

**Feature scaling needs to be done because the price column has much larger values than the other columns**

In [None]:
# Use min-max scaling for the numeric variables
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [None]:
# Scaling the numeric variables only (the dummy and binary columns are left untouched)
num_vars = ['symboling', 'carlength', 'carheight','enginesize', 'boreratio', 'stroke', 'compressionratio','horsepower', 'peakrpm', 'citympg', 'price']

# Fit the scaler on the training rows only, so that the min/max of the held-out rows do
# not leak into the preprocessing. For a fixed random_state and sample count
# train_test_split always produces the same partition, so these positions are exactly the
# rows the later split cell puts into df_train -- keep train_size / test_size /
# random_state identical in both places.
train_pos, _ = train_test_split(np.arange(len(cars_data)), train_size=0.7, test_size=0.3, random_state=100)
scaler.fit(cars_data.iloc[train_pos][num_vars])

# Apply the training-based scaling to every row; test rows may fall slightly outside [0, 1]
cars_data[num_vars] = scaler.transform(cars_data[num_vars])



In [None]:
# Summary statistics after scaling
cars_data.describe()

## Model Building -

In [None]:
# Splitting the data into train (70%) and test (30%); the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
# (train_test_split is already imported in the first cell; this import repeats it.)
from sklearn.model_selection import train_test_split
df_train,df_test = train_test_split(cars_data,train_size=0.7,test_size = 0.3,random_state=100)

In [None]:
# Separate the target from the predictors without mutating df_train, so the cell can be
# re-run (pop() removes the column and raises KeyError on the second run).
y_train = df_train['price']                 # Result variable
X_train = df_train.drop(columns='price')    # Predictor variables

**We will use a mixed approach to find the best predictor variables:**
***Recursive Feature Elimination followed by manual feature reduction***

In [None]:
from sklearn.feature_selection import RFE 
from sklearn.linear_model import LinearRegression

In [None]:
lm = LinearRegression()
lm.fit(X_train, y_train)
# Keep the 15 best-ranked predictors. n_features_to_select has to be passed by keyword:
# it is keyword-only in scikit-learn >= 1.0, where RFE(lm, 15) raises a TypeError.
rfe = RFE(lm, n_features_to_select=15)
rfe = rfe.fit(X_train, y_train)

In [None]:
# (column name, selected by RFE?, RFE ranking) for every predictor
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

**The variables with ranking '1' are the top 15 variables that we will be considering**

In [None]:
# Columns that RFE did not select (support_ is False); these will be dropped
rfe_drop = X_train.columns[~rfe.support_]
rfe_drop

In [None]:
# Keep only the RFE-selected predictors in the training frame
X_train = X_train.drop(columns=rfe_drop)
X_train.columns

**Let's use statsmodels to build the model so that we can use its summary function to examine the model in depth**

In [None]:
import statsmodels.api as sm
# sm.OLS does not add an intercept on its own, so add an explicit constant column
X_train_rfe_lm = sm.add_constant(X_train)

In [None]:
# First model: OLS on the 15 RFE-selected predictors plus the constant
lm_1 = sm.OLS(y_train,X_train_rfe_lm).fit()
lm_1.summary()

**Observations**
1. Good R-squared value.
2. The adjusted R-squared value is close to the R-squared value.
**Let's check the VIF values as well.**

In [None]:
# Frame for the VIF checks: all rows of cars_data, restricted to the RFE-selected
# predictors plus the price column
df_VIF = cars_data.drop(columns=rfe_drop)

In [None]:
#Function to find the VIF values of the variables
def vif_cal(input_data, dependent_col):
    """Compute the variance inflation factor (VIF) of every predictor column.

    Each predictor is regressed by OLS, with an intercept, on all the other
    predictors; its VIF is 1 / (1 - R^2) of that auxiliary regression. The
    intercept matters: without it statsmodels reports the uncentred R^2, which
    overstates the VIF of any predictor whose mean is far from zero.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame holding the predictors and the dependent column.
    dependent_col : str
        Name of the dependent column; it is left out of the calculation.

    Returns
    -------
    pandas.DataFrame
        Columns 'Var' (predictor name) and 'Vif' (rounded to 2 decimals), sorted
        by 'Vif' in descending order. A predictor that is an exact linear
        combination of the others gets an infinite VIF.
    """
    # Cast to float so bool/integer dummies do not produce an object-dtype design matrix
    x_vars = input_data.drop(columns=[dependent_col]).astype(float)
    records = []
    for name in x_vars.columns:
        target = x_vars[name]
        # has_constant='add' always appends the intercept, even if a column is constant
        others = sm.add_constant(x_vars.drop(columns=[name]), has_constant='add')
        rsq = sm.OLS(target, others).fit().rsquared
        # R^2 == 1 means perfect collinearity; avoid dividing by zero
        vif = float('inf') if rsq >= 1 else round(1 / (1 - rsq), 2)
        records.append((name, vif))
    # Build the result once instead of growing a DataFrame row by row
    vif_df = pd.DataFrame(records, columns=['Var', 'Vif'])
    return vif_df.sort_values(by='Vif', ascending=False)

In [None]:
# VIF table for the predictors of the first model
vif_cal(input_data=df_VIF, dependent_col="price")

**'enginetype_rotor' and 'cylindernumber_two' have large VIF values, which means they are multicollinear with other variables**

In [None]:
# Remove the 'enginetype_rotor' variable and rebuild the design matrix with a constant
X_train2 = X_train.drop(columns='enginetype_rotor')
X_train_rfe_lm2 = sm.add_constant(X_train2)

In [None]:
# Second model: refit without 'enginetype_rotor'.
# NOTE: this rebinds lm_1, so the first model's results object is no longer reachable.
lm_1 = sm.OLS(y_train,X_train_rfe_lm2).fit()
lm_1.summary()

**Observations**
1. R-squared and Adj. R-squared are not affected much.
2. Let's check the VIF table again.

In [None]:
# Recompute the VIF table for the second model, i.e. without 'enginetype_rotor'
df_VIF = df_VIF.drop(columns='enginetype_rotor')
vif_cal(df_VIF, "price")

In [None]:
# Columns still present in the VIF frame
df_VIF.columns

In [None]:
# Correlation heatmap of the variables that are still in the VIF frame
fig, ax = plt.subplots(figsize=(20, 10))
remaining_corr = df_VIF.corr()
sns.heatmap(remaining_corr, annot=True, ax=ax)
plt.show()

**There are high correlation values between some variables, hence removing them**

In [None]:
# Drop the three highly correlated predictors, then check the VIF table again
correlated_cols = ['enginesize', 'boreratio', 'stroke']
df_VIF = df_VIF.drop(columns=correlated_cols)
vif_cal(df_VIF, "price")

In [None]:
# Predictors used by the second model
X_train2.columns

In [None]:
# Third model: refit after also removing the three highly correlated predictors
X_train3 = X_train2.drop(columns=['enginesize', 'boreratio', 'stroke'])
X_train_rfe_lm3 = sm.add_constant(X_train3)
lm_2 = sm.OLS(y_train, X_train_rfe_lm3).fit()
lm_2.summary()


**Observations**
1. The R-squared and Adj. R-squared values have decreased but are still high.
2. As these values are high, the model explains the variance of the data well.
3. The VIF table has values below 5, so the multicollinear variables have been handled.

## Residual Check - 

In [None]:
# Making predictions on the training data with the third model
y_train_predict = lm_2.predict(X_train_rfe_lm3)

In [None]:
# Plotting the error terms (residuals) of the training fit
residuals = y_train - y_train_predict
fig = plt.figure()
# sns.distplot is deprecated since seaborn 0.11 and scheduled for removal; use histplot
# (density histogram + KDE, the same picture) when this seaborn version provides it.
if hasattr(sns, 'histplot'):
    sns.histplot(residuals, bins=20, kde=True, stat='density')
else:
    sns.distplot(residuals, bins=20)
fig.suptitle('Error Terms', fontsize = 20)                  # Plot heading


**Error terms follow a normal distribution**

## Predictions -

In [None]:
# Preview the held-out test rows
df_test.head()

In [None]:
# Separating result and predictor variables without mutating df_test, so the cell can be
# re-run (pop() removes the column and raises KeyError on the second run).
y_test = df_test['price']
X_test = df_test.drop(columns='price')

In [None]:
# Preview the test predictors
X_test.head()

In [None]:
# Keep the predictors of the final model and add the intercept column for statsmodels.
# has_constant='add' forces the column to be created: with the default ('skip')
# add_constant silently omits it when some predictor happens to be a non-zero constant
# within the test rows, and lm_2.predict() then fails on the column mismatch.
X_test_new = sm.add_constant(X_test[X_train3.columns], has_constant='add')
# Use exactly the columns, in the same order, of the design matrix the model was fitted on
X_test_new = X_test_new[X_train_rfe_lm3.columns]

In [None]:
# Preview the test design matrix (selected predictors plus the constant)
X_test_new.head()

In [None]:
# Predict the (scaled) price for the test rows with the third model
y_test_pred = lm_2.predict(X_test_new)
y_test_pred.head()

In [None]:
# Actual (scaled) prices of the first test rows, for comparison with the predictions
y_test.head()

In [None]:
# Scatter of the actual test prices against the predicted ones
fig, ax = plt.subplots()
ax.scatter(y_test, y_test_pred)
fig.suptitle('y_test vs y_pred', fontsize=20)
ax.set_xlabel('y_test', fontsize=18)
ax.set_ylabel('y_pred', fontsize=16)

In [None]:
# R-squared of the predictions on the held-out test rows
from sklearn.metrics import r2_score
r2_score(y_test, y_test_pred)

**The R-squared value of the predictions on the test set is about 0.886. Hence 88.6% of the variance of the data is explained by the model**