In [None]:
# Pandas
import pandas as pd
# Numpy
import numpy as np
# Libraries for plotting
import matplotlib.pyplot as plt
import seaborn as sns
# Train-test split
from sklearn.model_selection import train_test_split
# Min-max scaling
from sklearn.preprocessing import MinMaxScaler
# Statsmodel 
import statsmodels.api as sm
# VIF 
from statsmodels.stats.outliers_influence import variance_inflation_factor
#R-squared
from sklearn.metrics import r2_score
# Label encoding
from sklearn.preprocessing import LabelEncoder
# Importing RFE
from sklearn.feature_selection import RFE
# Importing LinearRegression
from sklearn.linear_model import LinearRegression
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Libraries for cross validation 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor

from sklearn import datasets
from sklearn.model_selection import cross_val_score, cross_val_predict

In [None]:
pd.set_option('display.max_columns',None)
%matplotlib inline

In [None]:
# Read the data
df_car = pd.read_csv('CarPrice_Assignment.csv')
df_car.head()

In [None]:
df_car.shape

In [None]:
df_car.info()

In [None]:
df_car.describe()

In [None]:
car_company = df_car["CarName"].str.split(" ", n = 1, expand = True)
df_car['CarCompany'] = car_company[0]

# Dropping 'CarName' column
df_car.drop('CarName',axis=1,inplace=True)
df_car.head()

In [None]:
# Dropping car_ID column as it will not be used in our analysis
df_car.drop('car_ID',axis=1,inplace=True)

In [None]:
# Count missing values column wise
df_car.isnull().sum()

In [None]:
#Replacing '4wd' with 'fwd' in 'drivewheel' column
df_car['drivewheel'] = df_car['drivewheel'].replace('4wd','fwd')

In [None]:
# Replacing 'maxda' with 'mazda' in 'CarCompany' column
df_car['CarCompany'] = df_car['CarCompany'].replace('maxda','mazda')

In [None]:
# Replacing 'porcshce' with 'porsche' in 'CarCompany' column
df_car['CarCompany'] = df_car['CarCompany'].replace('porcshce','porsche')

In [None]:
# Replacing 'toyouta' with 'toyota' in 'CarCompany' column
df_car['CarCompany'] = df_car['CarCompany'].replace('toyouta','toyota')

In [None]:
# Replacing 'vokswagen' with 'volkswagen' in 'CarCompany' column
df_car['CarCompany'] = df_car['CarCompany'].replace('vokswagen','volkswagen')

In [None]:
# Replacing 'Nisaan' with 'nissan' in 'CarCompany' column
df_car['CarCompany'] = df_car['CarCompany'].replace('Nissan','nissan')

In [None]:
# Replacing 'vw' with 'volkswagen' in 'CarCompany' column
df_car['CarCompany'] = df_car['CarCompany'].replace('vw','volkswagen')

In [None]:
# Remove outlier records with the 1.5 * IQR rule, one numerical column after the other.
# A row is kept only if its value lies strictly between Q1 - 1.5*IQR and Q3 + 1.5*IQR;
# the quartiles of each column are taken on the rows that are still left at that point.
col_numeric = [
    'wheelbase', 'carlength', 'carwidth', 'carheight', 'curbweight', 'enginesize', 'boreratio',
    'stroke', 'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg', 'price',
]

for feature in col_numeric:
    values = df_car[feature]
    lower_q, upper_q = values.quantile(0.25), values.quantile(0.75)
    spread = upper_q - lower_q
    inside = values.gt(lower_q - 1.5 * spread) & values.lt(upper_q + 1.5 * spread)
    df_car = df_car.loc[inside]

df_car.shape

In [None]:
# Categorical columns; each one is drawn as a bar chart of its value counts
# to check for data imbalance (4 x 3 grid of panels)
col_category = [
    'symboling', 'fueltype', 'aspiration', 'doornumber', 'carbody', 'drivewheel',
    'enginelocation', 'enginetype', 'cylindernumber', 'fuelsystem', 'CarCompany',
]

plt.figure(figsize=(20, 25))
for position, name in enumerate(col_category, start=1):
    plt.subplot(4, 3, position)
    df_car[name].value_counts().plot(kind='bar')
    plt.title(name)

In [None]:
# Visualising the pairwise relationships between the numerical variables.
# sns.pairplot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it (that would only add an empty figure to the output).
sns.pairplot(df_car[col_numeric])
plt.show()

In [None]:
# Boxplot for all categorical variables except CarCompany
# As X labels are not clearly visible for CarCompany. It is plotted in the next cell with bigger figure size.
k=0
plt.figure(figsize=(20,18))
for col in range (len(col_category)-1):    
    k=k+1
    plt.subplot(4, 3, k)   
    ax = sns.boxplot(x = col_category[col], y = 'price', data = df_car)

In [None]:
plt.figure(figsize=(15,8))
ax = sns.boxplot(x = 'CarCompany', y = 'price', data = df_car)
temp = ax.set_xticklabels(ax.get_xticklabels(), rotation = 45, horizontalalignment='right')

In [None]:
# fueltype
# Convert "gas" to 1 and "diesel" to 0
df_car['fueltype'] = df_car['fueltype'].map({'gas': 1, 'diesel': 0})
df_car.head()

In [None]:
# aspiration
# Convert "std" to 1 and "turbo" to 0
df_car['aspiration'] = df_car['aspiration'].map({'std':1, 'turbo':0})
df_car.head()

In [None]:
# doornumber
# Convert "four" to 1 and "two" to 0
df_car['doornumber'] = df_car['doornumber'].map({'four':1, 'two':0})
df_car.head()

In [None]:
# drivewheel
# Convert "fwd" to 1 and "rwd" to 0
df_car['drivewheel'] = df_car['drivewheel'].map({'fwd':1, 'rwd':0})
df_car.head()

In [None]:
# enginelocation
# Convert "front" to 1 and "rear" to 0
df_car['enginelocation'] = df_car['enginelocation'].map({'front':1, 'rear':0})
df_car.head()

In [None]:
# Creating dummy variables for 'symboling'
# Dropping the redundant dummy variable (-2)
symboling_status = pd.get_dummies(df_car['symboling'],drop_first=True)
symboling_status.head()

In [None]:
# Renaming column names for better readability
symboling_status = symboling_status.rename(columns={-1:'symboling(-1)', 0:'symboling(0)', 1:'symboling(1)',2:'symboling(2)', 3:'symboling(3)'})
symboling_status.head()

In [None]:
# Concating the dummy dataframe with original dataframe
df_car = pd.concat([df_car,symboling_status], axis=1)
df_car.head()

In [None]:
# Dropping the 'symboling' column as we don't need it anymore
df_car = df_car.drop('symboling',axis=1)
df_car.head()

In [None]:
# Creating dummy variables for 'carbody'
# Dropping the redundant dummy variable (convertible)
carbody_status = pd.get_dummies(df_car['carbody'],drop_first=True)
carbody_status.head()

In [None]:
# Renaming column names for better readability
carbody_status = carbody_status.rename(columns={'hardtop':'carbody(hardtop)', 'hatchback':'carbody(hatchback)', 'sedan':'carbody(sedan)','wagon':'carbody(wagon)'})
carbody_status.head()

In [None]:
# Concating the dummy dataframe with original dataframe
df_car = pd.concat([df_car,carbody_status], axis=1)
df_car.head()

In [None]:
# Dropping the 'symboling' column as we don't need it
df_car = df_car.drop('carbody',axis=1)
df_car.head()

In [None]:
# Creating dummy variables for 'enginetype'
# Dropping the redundant dummy variable (dohc)
enginetype_status = pd.get_dummies(df_car['enginetype'], drop_first=True)
enginetype_status.head()

In [None]:
# Renaming column name for better readability
enginetype_status = enginetype_status.rename(columns={'dohcv':'enginetype(dohcv)', 'l':'enginetype(l)', 'ohc':'enginetype(ohc)', 
                                                      'ohcf':'enginetype(ohcf)','ohcv':'enginetype(ohcv)' ,'rotor':'enginetype(rotor)'})
enginetype_status.head()

In [None]:
# Concating the dummy dataframe with original dataframe
df_car = pd.concat([df_car,enginetype_status], axis=1)
df_car.head()

In [None]:
# Dropping the 'enginetype' column as we don't need it
df_car = df_car.drop('enginetype',axis=1)
df_car.head()

In [None]:
# Creating dummy variables for 'cylindernumber'
# Dropping the redundant dummy variable (eight)
cylindernumber_status = pd.get_dummies(df_car['cylindernumber'], drop_first=True)
cylindernumber_status.head()

In [None]:
# Renaming column name for better readability
cylindernumber_status = cylindernumber_status.rename(columns={'five':'cylindernumber(five)', 'four':'cylindernumber(four)', 'six':'cylindernumber(six)', 
                                                      'three':'cylindernumber(three)','twelve':'cylindernumber(twelve)' ,'two':'cylindernumber(two)'})
cylindernumber_status.head()

In [None]:
# Concating the dummy dataframe with original dataframe
df_car = pd.concat([df_car,cylindernumber_status], axis=1)
df_car.head()

In [None]:
# Dropping the 'cylindernumber' column as we don't need it
df_car = df_car.drop('cylindernumber',axis=1)
df_car.head()

In [None]:
# Creating dummy variables for 'fuelsystem'
# Dropping the redundant dummy variable (1bbl)
fuelsystem_status = pd.get_dummies(df_car['fuelsystem'], drop_first=True)
fuelsystem_status.head()

In [None]:
# Renaming column name for better readability
fuelsystem_status = fuelsystem_status.rename(columns={'2bbl':'fuelsystem(2bbl)', '4bbl':'fuelsystem(4bbl)', 'idi':'fuelsystem(idi)', 
                                                      'mfi':'fuelsystem(mfi)','mpfi':'fuelsystem(mpfi)' ,'spdi':'fuelsystem(spdi)',
                                                             'spfi':'fuelsystem(spfi)'})
fuelsystem_status.head()

: 

In [None]:
# Concating the dummy dataframe with original dataframe
df_car = pd.concat([df_car,fuelsystem_status], axis=1)
df_car.head()

In [None]:
df_car.info()

In [None]:
# Splitting train and test dataset into 70:30 percent ratio.
df_train, df_test = train_test_split(df_car, train_size=0.7, random_state=100)
print(df_train.shape)
print(df_test.shape)

In [None]:
# Create a list of numeric variables. We don't need categorical variables because they are already scalled in 0 and 1.
num_vars = ['wheelbase','carlength','carwidth','carheight','curbweight','enginesize','boreratio','stroke',
            'compressionratio','horsepower','peakrpm','citympg','highwaympg','price']

# Instantiate an object
scaler = MinMaxScaler()

# Fit the data in the object
df_train[num_vars] = scaler.fit_transform(df_train[num_vars])
df_train.head()

In [None]:
df_train.describe()

In [None]:
#Let's check the correlation coefficients of all numerical variables except categorical variables to see which variables are highly correlated

plt.figure(figsize = (16, 8))
sns.heatmap(df_train[num_vars].corr(), annot = True, cmap="YlGnBu")
plt.show()

In [None]:
# Popping out the 'price' column for y_train
y_train = df_train.pop('price') 
# Creating X_train
X_train = df_train

In [None]:
y_train.head()

In [None]:
X_train.head()

In [None]:
# Creating the model using scikit learn 
lm = LinearRegression()
lm.fit(X_train, y_train)

In [None]:
# RFE with the number of variables 20. It will select top 20 features.
rfe = RFE(lm, 20)
ref = rfe.fit(X_train, y_train)

In [None]:
# Listing the feature variables with their RFE status and rank 
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

In [None]:
# Listing the feature variables with their RFE status and rank 
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

In [None]:
# Not selected features by RFE
X_train.columns[~rfe.support_]

In [None]:
# Creating X_train dataframe with RFE selected variables
X_train_1 = X_train[rfe_cols]
X_train_1.head()

In [None]:
# Adding constant because statsmodel library doesn't include the intercept by default. 
X_train_sm_1 = sm.add_constant(X_train_1)
# Creating model
lr_1 = sm.OLS(y_train, X_train_sm_1)
# Fit the model
lr_model_1 = lr_1.fit()
print(lr_model_1.summary())

In [None]:
# Create a dataframe that will contain the names of all the feature variables and their respective VIFs
vif = pd.DataFrame()
vif['Features'] = X_train_1.columns
vif['VIF'] = [variance_inflation_factor(X_train_1.values, i) for i in range(X_train_1.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Removing wheelbase
X_train_2 = X_train_1.drop('wheelbase',axis=1)
# Adding constant 
X_train_sm_2 = sm.add_constant(X_train_2)
# Creating and fitting the model
lr_model_2 = sm.OLS(y_train, X_train_sm_2).fit()
print(lr_model_2.summary())

In [None]:
# Create VIF for Model-2
vif = pd.DataFrame()
vif['Features'] = X_train_2.columns
vif['VIF'] = [variance_inflation_factor(X_train_2.values, i) for i in range(X_train_2.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Removing boreratio
X_train_3 = X_train_2.drop('boreratio',axis=1)

In [None]:
# Adding constant 
X_train_sm_3 = sm.add_constant(X_train_3)
# Creating and fitting the model
lr_model_3 = sm.OLS(y_train, X_train_sm_3).fit()
print(lr_model_3.summary())

In [None]:
# Create VIF for Model-3
vif = pd.DataFrame()
vif['Features'] = X_train_3.columns
vif['VIF'] = [variance_inflation_factor(X_train_3.values, i) for i in range(X_train_3.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Removing fuelsystem(4bbl)
X_train_4 = X_train_3.drop('fuelsystem(4bbl)',axis=1)

In [None]:
# Adding constant 
X_train_sm_4 = sm.add_constant(X_train_4)
# Creating model
lr_model_4 = sm.OLS(y_train, X_train_sm_4).fit()
print(lr_model_4.summary())

In [None]:
# Create VIF for Model-4
vif = pd.DataFrame()
vif['Features'] = X_train_4.columns
vif['VIF'] = [variance_inflation_factor(X_train_4.values, i) for i in range(X_train_4.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Removing enginetype(rotor)
X_train_5 = X_train_4.drop('enginetype(rotor)',axis=1)

In [None]:
# Adding constant 
X_train_sm_5 = sm.add_constant(X_train_5)
# Creating model
lr_model_5 = sm.OLS(y_train, X_train_sm_5).fit()
print(lr_model_5.summary())

In [None]:
# Create VIF for Model-5
vif = pd.DataFrame()
vif['Features'] = X_train_5.columns
vif['VIF'] = [variance_inflation_factor(X_train_5.values, i) for i in range(X_train_5.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Removing cylindernumber(two)
X_train_6 = X_train_5.drop('cylindernumber(two)',axis=1)


In [None]:
# Adding constant 
X_train_sm_6 = sm.add_constant(X_train_6)
# Creating model
lr_model_6 = sm.OLS(y_train, X_train_sm_6).fit()
print(lr_model_6.summary())

In [None]:
# Create VIF for Model-6
vif = pd.DataFrame()
vif['Features'] = X_train_6.columns
vif['VIF'] = [variance_inflation_factor(X_train_6.values, i) for i in range(X_train_6.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Removing enginetype(l)
X_train_7 = X_train_6.drop('enginetype(l)',axis=1)

In [None]:
# Adding constant 
X_train_sm_7 = sm.add_constant(X_train_7)
# Creating model
lr_model_7 = sm.OLS(y_train, X_train_sm_7).fit()
print(lr_model_7.summary())

In [None]:
# Create VIF for Model-7
vif = pd.DataFrame()
vif['Features'] = X_train_7.columns
vif['VIF'] = [variance_inflation_factor(X_train_7.values, i) for i in range(X_train_7.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Removing carwidth
X_train_8 = X_train_7.drop('carwidth',axis=1)

In [None]:
# Adding constant 
X_train_sm_8 = sm.add_constant(X_train_8)
# Creating model
lr_model_8 = sm.OLS(y_train, X_train_sm_8).fit()
print(lr_model_8.summary())


In [None]:
# Create VIF for Model-8
vif = pd.DataFrame()
vif['Features'] = X_train_8.columns
vif['VIF'] = [variance_inflation_factor(X_train_8.values, i) for i in range(X_train_8.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Removing CarCompany(isuzu)
X_train_9 = X_train_8.drop('CarCompany(isuzu)',axis=1)

In [None]:
# Adding constant 
X_train_sm_9 = sm.add_constant(X_train_9)
# Creating model
lr_model_9 = sm.OLS(y_train, X_train_sm_9).fit()
print(lr_model_9.summary())

In [None]:
# Create VIF for Model-9
vif = pd.DataFrame()
vif['Features'] = X_train_9.columns
vif['VIF'] = [variance_inflation_factor(X_train_9.values, i) for i in range(X_train_9.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Removing CarCompany(peugeot)
X_train_10 = X_train_9.drop('CarCompany(peugeot)',axis=1)

In [None]:
# Adding constant 
X_train_sm_10 = sm.add_constant(X_train_10)
# Creating model
lr_model_10 = sm.OLS(y_train, X_train_sm_10).fit()
print(lr_model_10.summary())

In [None]:
# Create VIF for Model-10
vif = pd.DataFrame()
vif['Features'] = X_train_10.columns
vif['VIF'] = [variance_inflation_factor(X_train_10.values, i) for i in range(X_train_10.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Removing aspiration
X_train_11 = X_train_10.drop('aspiration',axis=1)

In [None]:
# Adding constant 
X_train_sm_11 = sm.add_constant(X_train_11)
# Creating model
lr_model_11 = sm.OLS(y_train, X_train_sm_11).fit()
print(lr_model_11.summary())

In [None]:
# Create VIF for Model-11
vif = pd.DataFrame()
vif['Features'] = X_train_11.columns
vif['VIF'] = [variance_inflation_factor(X_train_11.values, i) for i in range(X_train_11.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Removing enginetype(ohc)
X_train_12 = X_train_11.drop('enginetype(ohc)',axis=1)

In [None]:
# Adding constant 
X_train_sm_12 = sm.add_constant(X_train_12)
# Creating model
lr_model_12 = sm.OLS(y_train, X_train_sm_12).fit()
print(lr_model_12.summary())

In [None]:
# Create VIF for Model-12
vif = pd.DataFrame()
vif['Features'] = X_train_12.columns
vif['VIF'] = [variance_inflation_factor(X_train_12.values, i) for i in range(X_train_12.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [None]:
# Find y_train prediction
y_train_pred = lr_model_12.predict(X_train_sm_12)

In [None]:
# Residual
residual = y_train - y_train_pred

In [None]:
# Distribution of residuals or error terms
sns.distplot(residual)

In [None]:
# Residuals plotted against nine features of X_train_12, one panel per feature,
# laid out on a 3 x 3 grid in the order listed here
residual_features = [
    'curbweight', 'carbody(hardtop)', 'carbody(hatchback)', 'carbody(sedan)', 'carbody(wagon)',
    'CarCompany(audi)', 'CarCompany(bmw)', 'CarCompany(porsche)', 'CarCompany(volvo)',
]

plt.figure(figsize=(15,10))
for position, feature in enumerate(residual_features, start=1):
    plt.subplot(3, 3, position)
    plt.scatter(X_train_12[feature], residual)
    plt.xlabel(feature)
    plt.ylabel('residual')
# Ending on plt.show() keeps the Text repr of the last label out of the cell output
plt.show()


In [None]:
# Scale the test set variables with min-max scaler

# Transform the data
df_test[num_vars] = scaler.transform(df_test[num_vars])
df_test.head()

In [None]:
df_test.describe()

In [None]:
# Popping out the 'price' column for y_test
y_test = df_test.pop('price')

# Creating X_test
X_test = df_test

In [None]:
# Taking only the columns from the final model.
X_test = X_test[X_train_12.columns]
X_test.head()

In [None]:
# Add constant
X_test_sm = sm.add_constant(X_test)
X_test_sm.head()

In [None]:
# Predict the model on the test set
y_test_pred = lr_model_12.predict(X_test_sm)

In [None]:
# Evaluate the model with r-squared on the test set
r2 = r2_score(y_test, y_test_pred)
r2

In [None]:
# Plotting y_test and y_test_pred to understand the spread.
fig = plt.figure()
plt.scatter(y_test,y_test_pred)
fig.suptitle('y_test vs y_test_pred', fontsize=20)               
plt.xlabel('y_test', fontsize=18)                         
plt.ylabel('y_test_pred', fontsize=16)