In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [None]:
df_car = pd.read_csv('Car_Price_Assignment/CarPrice_Assignment.csv')

In [None]:
df_car.head()

In [None]:
#Importing all other required libraries
#  test train split
from sklearn.model_selection import train_test_split
#  feature scaling
from sklearn.preprocessing import MinMaxScaler
# statmodel linear regression
import statsmodels.api as sm
# RFE and LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
# Check for the VIF values  
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.metrics import mean_squared_error
from math import sqrt


In [None]:
df_car.shape

In [None]:
df_car.describe()

In [None]:
df_car.info()

In [None]:
#Checking for Null vals
df_car.isnull().sum()

In [None]:
#Seperating numeric vars
df_car_num = list(df_car.columns[df_car.dtypes != 'object'])

In [None]:
df_car_num

In [None]:
#Non-numeric vars
df_car_char = list(df_car.columns[df_car.dtypes == 'object'])

In [None]:
df_car_char

In [None]:
len(df_car_num) + len(df_car_char)

In [None]:
# Column means of the numeric variables only. df_car still contains string
# columns (CarName, fueltype, ...); without numeric_only=True newer pandas
# raises a TypeError instead of silently skipping them.
df_car.mean(numeric_only=True)

In [None]:
#Dropping useless columns like car_ID
df_car.drop(columns='car_ID', inplace = True)

In [None]:
df_car.info()

# checking the relationship of all predictor vars with target var, price:

In [None]:
# Function for pair plots. Plot each numeric var with the target var price
def doPairPlots(df_car_numeric):
    """Scatter each listed column against ``price`` in a single row of panels.

    Parameters
    ----------
    df_car_numeric : list of str
        Column names of the notebook-level ``df_car`` frame to put on the
        x-axis, one subplot per name. All panels share the ``price`` y-axis.

    The figure is rendered with ``plt.show()``; nothing is returned.
    """
    n_panels = len(df_car_numeric)
    if n_panels == 0:
        # plt.subplots cannot build a grid with zero columns; nothing to draw.
        return

    # squeeze=False always yields a 2-D array of axes, so a single-column list
    # works as well (otherwise plt.subplots returns a bare Axes and indexing
    # it fails). The figure size is the 20x5 the original ended up with.
    fig, axs = plt.subplots(1, n_panels, figsize=(20, 5), sharey=True, squeeze=False)

    for ax, col_name in zip(axs[0], df_car_numeric):
        sns.scatterplot(data=df_car, x=col_name, y="price", ax=ax)
    plt.tight_layout()
    plt.show()
df_car_num1= ['wheelbase','carlength','carwidth','carheight']
df_car_num2= ['enginesize','curbweight','boreratio','stroke'] 
df_car_num3= ['compressionratio','horsepower','peakrpm','citympg','highwaympg']

In [None]:
doPairPlots(df_car_num1)

carlength and carwidth have a nearly linear relationship with price.

In [None]:
doPairPlots(df_car_num2)

enginesize and curbweight are almost linear with price.

In [None]:
doPairPlots(df_car_num3)

horsepower is roughly linear with price, while citympg and highwaympg have a negative linear relationship with price.

# Examining the heatmap

In [None]:
# Correlation heatmap of the numeric columns. The string columns are filtered
# out first because DataFrame.corr() raises on non-numeric data in newer pandas.
plt.figure(figsize = (13, 10))
sns.heatmap(df_car.select_dtypes(include='number').corr(), annot = True)

We see that there are many pairs that have high correlation. Also, some variables like symboling and peakrpm show very little linear correlation with price.

In [None]:
#Extracting company name from CarName
# Known misspellings / abbreviations of manufacturer names -> canonical name.
company_spelling_fixes = {
    'maxda': 'mazda',
    'porcshce': 'porsche',
    'toyouta': 'toyota',
    'vokswagen': 'volkswagen',
    'vw': 'volkswagen',
}

# The company is the first whitespace-separated token of CarName, lower-cased.
# The corrected values are assigned back to the column in one step: calling
# replace(..., inplace=True) on df_car['company'] operates on a column
# selection and is not guaranteed to update df_car (pandas copy-on-write).
df_car['company'] = (
    df_car['CarName']
    .str.split()
    .str[0]
    .str.lower()
    .replace(company_spelling_fixes)
)
df_car['company'].unique()

In [None]:
df_car['company'].nunique()

In [None]:
#Given so many unique values, we should cut this number down by bucketing company names wrt price ranges. Let us divide them into economy, mid, and luxury
df_car.groupby('company')['price'].mean().sort_values(ascending = False)

In [None]:
# Price segment assigned to each manufacturer name found in df_car['company'].
company_buckets = {
    'chevrolet' : 'economy',
    'cheverolet' : 'economy',  # misspelled key from the original mapping, kept as a harmless alias
    'dodge' : 'economy',
    'plymouth' : 'economy',
    'honda' : 'economy',
    'subaru' : 'economy',
    'isuzu' : 'economy',
    'mitsubishi' : 'economy',
    'renault' : 'economy',
    'toyota' : 'economy',
    'volkswagen' : 'midSeg',
    'nissan' : 'midSeg',
    'mazda' : 'midSeg',
    'saab' : 'midSeg',
    'peugeot' : 'midSeg',
    'alfa-romero' : 'midSeg',
    'mercury' : 'luxury',
    'audi' : 'luxury',
    'volvo' : 'luxury',
    'bmw' : 'luxury',
    'buick' : 'luxury',
    'porsche' : 'luxury',
    'jaguar' : 'luxury',
    }
df_car['company_segment'] = df_car['company'].map(company_buckets)

# Series.map() yields NaN for any company missing from the dict, and those rows
# would later get all-zero segment dummies. Fail loudly instead.
unmapped_companies = sorted(df_car.loc[df_car['company_segment'].isna(), 'company'].unique())
assert not unmapped_companies, "companies without a segment: %s" % unmapped_companies

In [None]:
df_car.shape

In [None]:
df_car_char = df_car_char[1:]
# df_car_char.append('company_segment')

In [None]:
# The original cell referenced an undefined name and raised NameError on a
# fresh kernel. Show the frame with the new company / company_segment columns.
df_car.head()

In [None]:
def plotCatVsPrice(catVars, data=None):
    """Draw one box plot of ``price`` per categorical column.

    Parameters
    ----------
    catVars : list of str
        Categorical column names; each one gets its own subplot, in order.
    data : pandas.DataFrame, optional
        Frame containing the ``catVars`` columns and ``price``. When omitted,
        the notebook-level ``df_car_2`` is used if it has been created,
        otherwise ``df_car``. This keeps calls made before ``df_car_2``
        exists from failing with a NameError.

    The figure is rendered with ``plt.show()``; nothing is returned.
    """
    if data is None:
        # Looked up at call time so the function works both before and after
        # the cell that builds df_car_2 has run.
        scope = globals()
        data = scope['df_car_2'] if 'df_car_2' in scope else scope['df_car']

    n_cols = 3
    # Keep the original 4x3 layout, adding rows only when more than 12
    # variables are passed (ceil division without importing math).
    n_rows = max(4, -(-len(catVars) // n_cols))

    plt.figure(figsize=(25, 25))
    # enumerate gives each variable its own slot even if a name is repeated
    # (list.index would return the first match every time).
    for position, var in enumerate(catVars, start=1):
        plt.subplot(n_rows, n_cols, position)
        sns.boxplot(x = var, y = 'price', data = data)
    plt.show()

In [None]:
plotCatVsPrice(df_car_char)

In [None]:
df_car_2 = df_car.drop(columns = ['company', 'CarName'])

In [None]:
df_car_2.shape

In [None]:
df_car_char = list(df_car_2.columns[df_car_2.dtypes == 'object'])

In [None]:
df_car_char #This is the categorical variables list

Now let us plot all categorical variables against price

In [None]:
plotCatVsPrice(df_car_char)

As you can see, fueltype, aspiration, and doornumber don't make a significant difference to price. 4wd and fwd perform nearly the same with respect to price, while sedan and wagon are also fairly similar. You will also notice some misspelled or near-duplicate category labels, so let us clean this data further.

In [None]:
# Merge / rename category labels. Every result is assigned back to its column:
# replace(..., inplace=True) on df_car_2[col] acts on a column selection and is
# not guaranteed to modify df_car_2 itself (pandas copy-on-write).
df_car_2['enginetype'] = (
    df_car_2['enginetype']
    .replace(['ohcf', 'ohc'], 'ohcv')
    .replace('dohcv', 'dohc')
)
df_car_2['fuelsystem'] = (
    df_car_2['fuelsystem']
    .replace('spfi', 'spdi')
    .replace('mfi', 'mpfi')
)
df_car_2['drivewheel'] = df_car_2['drivewheel'].replace(['4wd', 'fwd'], '4wd&fwd')
df_car_2['carbody'] = df_car_2['carbody'].replace(['sedan', 'wagon'], 'sedan&wagon')

In [None]:
plotCatVsPrice(df_car_char)

As is visible now, l and dohc engine types are nearly the same. 1bbl and 2bbl fuelsystems are also nearly the same.

In [None]:
# Merge the look-alike levels noted above. Results are assigned back because an
# in-place replace on a column selection may not update df_car_2 itself.
df_car_2['enginetype'] = df_car_2['enginetype'].replace(['l', 'dohc'], 'dohc&l')
df_car_2['fuelsystem'] = df_car_2['fuelsystem'].replace(['1bbl', '2bbl'], '1&2bbl')

In [None]:
plotCatVsPrice(df_car_char)

Let us drop fueltype, doornumber, and aspiration 

In [None]:
df_car_2.drop(['fueltype','doornumber', 'aspiration'], axis=1, inplace=True)

In [None]:
df_car_char = list(df_car_2.columns[df_car_2.dtypes == 'object'])

In [None]:
plotCatVsPrice(df_car_char)

As we can see, the most significant categorical variables to price are cylindernumber, company_segment, enginelocation, drivewheel, and carbody.

In [None]:
df_car_2.columns

In [None]:
#Now mapping some categorical variables to numeric values
df_car_2['enginelocation'] = df_car_2['enginelocation'].map({'front': 1, 'rear': 0})

In [None]:
df_car_2.shape

In [None]:
#Dummy variable assignment
df_car_dummies = pd.get_dummies(df_car_2) 

In [None]:
df_car_dummies.head()

In [None]:
# features to scale - Only numeric, non binary vars
features_to_normalize = ['symboling',  'enginesize', 'boreratio', 'stroke', 'compressionratio','horsepower'
                         , 'peakrpm',  'price','curbweight','wheelbase','highwaympg'
                         , 'carlength','carwidth','carheight','citympg' ]

In [None]:
np.random.seed(42)
df_train, df_test = train_test_split(df_car_dummies, train_size=0.7 ,test_size = 0.3, random_state=42)

In [None]:
df_train.shape

In [None]:
df_test.shape

Just checking shape to see if correct

In [None]:
#Instantiating scaling object for values
scaler = MinMaxScaler()

In [None]:
df_train[features_to_normalize] = scaler.fit_transform(df_train[features_to_normalize])

In [None]:
df_train.info()

In [None]:
df_train.describe()

Features have been scaled

In [None]:
# divide training data into X and y
y_train = df_train.pop('price')
X_train = df_train

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
y_train

In [None]:
#Creating VIF function
def calculateVIFFactors(df):
    """Return the variance inflation factor of every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Design matrix; each column is scored against all the others.

    Returns
    -------
    pandas.DataFrame
        Columns ``variables`` (the column names of ``df``) and
        ``calculated_VIF`` (rounded to 2 decimals), sorted from the highest
        VIF to the lowest.
    """
    design = df.values
    vif_scores = [
        variance_inflation_factor(design, col_idx)
        for col_idx in range(df.shape[1])
    ]
    vif_table = pd.DataFrame({
        'variables': df.columns,
        'calculated_VIF': vif_scores,
    })
    vif_table['calculated_VIF'] = vif_table['calculated_VIF'].round(2)
    return vif_table.sort_values(by="calculated_VIF", ascending=False)

In [None]:
# function to simplify fitting the model and print summary

def fit_LRM(X_train, y=None):
    """Fit an OLS model, print its summary and return the fitted result.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Predictor columns. The frame is passed through ``sm.add_constant``
        before fitting.
    y : array-like, optional
        Target values. Defaults to the notebook-level ``y_train`` so existing
        single-argument calls keep working; pass it explicitly to fit against
        a different target.

    Returns
    -------
    The fitted statsmodels results object returned by ``sm.OLS(...).fit()``.
    """
    # statsmodels.api is already imported as ``sm`` in the notebook's import
    # cell, so no function-level import is needed.
    if y is None:
        y = y_train

    design = sm.add_constant(X_train)
    model = sm.OLS(y, design).fit()
    print(model.summary())
    return model

In [None]:
# Cast every predictor to float so the dummy columns are numeric for statsmodels.
X_train = X_train.astype(float)
X_train_1 = sm.add_constant(X_train)
# Fit on X_train_1 (the design matrix WITH the constant column). The original
# passed X_train here, so this first model had no intercept even though every
# later model, built from X_train_1, does.
model = sm.OLS(y_train, X_train_1, missing='drop').fit()

In [None]:
print(model.summary())

In [None]:
# drop carlength as it has highest p-value               
X_train_2 = X_train_1.drop(['carlength'], axis=1)
model2 =fit_LRM(X_train_2) # second model

Given that we have too many variables, let us drop a few and try again

In [None]:
X_train_3 = X_train_2.drop(['boreratio','stroke', 'compressionratio', 'peakrpm', 'carheight',
            'cylindernumber_three', 'cylindernumber_twelve', 'highwaympg'], axis = 1)

In [None]:
model3 = fit_LRM(X_train_3)

Let us drop cylindernumber_six as it has pvalue of 1

In [None]:
X_train_4 = X_train_3.drop('cylindernumber_six', axis=1)

In [None]:
model4 = fit_LRM(X_train_4)

In [None]:
#Removing fuelsystem_4bbl for high p-value
X_train_5 = X_train_4.drop('fuelsystem_4bbl', axis=1)

In [None]:
model5 = fit_LRM(X_train_5)

Removing company segment economy and symboling for p-value:

In [None]:
X_train_6 = X_train_5.drop(['company_segment_economy', 'symboling'], axis=1)

In [None]:
model6 = fit_LRM(X_train_6)

In [None]:
X_train_7 = X_train_6.drop(['wheelbase','citympg', 'fuelsystem_1&2bbl'], axis=1)

In [None]:
model7 = fit_LRM(X_train_7)

In [None]:
X_train_8 = X_train_7.drop(['fuelsystem_mpfi', 'cylindernumber_five', 'carbody_sedan&wagon'], axis=1)

In [None]:
model8 = fit_LRM(X_train_8)

In [None]:
X_train_9 = X_train_8.drop(['carwidth', 'carbody_hatchback', 'enginetype_ohcv'], axis=1)

In [None]:
model9 = fit_LRM(X_train_9)

In [None]:
X_train_10 = X_train_9.drop(['fuelsystem_spdi', 'drivewheel_rwd', 'enginetype_dohc&l'], axis=1)

In [None]:
model10 = fit_LRM(X_train_10)

In [None]:
X_train_11 = X_train_10.drop(['drivewheel_4wd&fwd', 'fuelsystem_idi', 'cylindernumber_four'], axis=1)

In [None]:
model11 = fit_LRM(X_train_11)

In [None]:
X_train_12 = X_train_11.drop('carbody_hardtop', axis = 1)

In [None]:
model12 = fit_LRM(X_train_12)

We cannot drop horsepower because it is a very important predictor. Since all the other p-values are below 0.05, we move on to checking VIF.

In [None]:
calculateVIFFactors(X_train_12)

In [None]:
# We cannot drop enginesize, so we drop enginelocation instead;
# cylindernumber_two is removed in the same step.
X_train_13 = X_train_12.drop(['enginelocation', 'cylindernumber_two'], axis=1)

In [None]:
model13 = fit_LRM(X_train_13)

In [None]:
calculateVIFFactors(X_train_13)

In [None]:
X_train_final = sm.add_constant(X_train_13)

In [None]:
model_final = sm.OLS(y_train,X_train_final).fit() # final model  

In [None]:
X_train_final.shape

In [None]:
y_train.shape

In [None]:
y_pred = model_final.predict(X_train_final)

In [None]:
y_pred.shape

In [None]:
# Plot error terms for train data
# Residuals are actual minus predicted (min-max scaled) price on the training
# rows, drawn as a 20-bin distribution plot.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot(..., kde=True) -- confirm against the installed seaborn version.
fig = plt.figure(figsize = (10,8))
sns.distplot((y_train - y_pred), bins = 20)
fig.suptitle('Error Terms', fontsize = 20)                   
plt.xlabel('Errors', fontsize = 18)    

In [None]:
# predictions with test data
df_test[features_to_normalize] = scaler.transform(df_test[features_to_normalize])

In [None]:
df_test.describe()

In [None]:
# seperating X_test and y_test
y_test = df_test.pop('price')
X_test = df_test 

In [None]:
# Pick the final model's predictors out of the test set.
# errors='ignore' keeps this cell re-runnable once 'const' has already been
# removed (the original raised KeyError on a second run). The float cast
# mirrors what was done to X_train, so dummy columns are numeric for predict().
X_train_13 = X_train_13.drop(columns='const', errors='ignore')
X_test_final = X_test[X_train_13.columns].astype(float)

In [None]:
X_test_final = sm.add_constant(X_test_final)

In [None]:
y_pred_I = model_final.predict(X_test_final)

In [None]:
# Plot error terms for test data
# Residuals are actual minus predicted (min-max scaled) price on the test rows,
# drawn as a 20-bin distribution plot.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot(..., kde=True) -- confirm against the installed seaborn version.
fig = plt.figure(figsize = (10,8))
sns.distplot((y_test - y_pred_I), bins = 20)
fig.suptitle('Error Terms in Test Data', fontsize = 20)                   
plt.xlabel('Errors', fontsize = 18)  

In [None]:
rmse = sqrt(mean_squared_error(y_test, y_pred_I))
print('Model RMSE:',rmse)

from sklearn.metrics import r2_score
r2=r2_score(y_test, y_pred_I)
print('Model test r2_score:',r2)

# The test R2 (0.86) is about 0.06 below the train R2 (0.918) - slightly more than a 5% gap, but still an acceptable level of generalization

Features selected in final model:

1) curbweight
2) enginesize
3) horsepower
4) company_segment_luxury
5) cylindernumber_eight
6) company_segment_midSeg
7) enginetype_rotor
8) carbody_convertible	

R sq. Train: 0.918
Adj R sq. in Train: 0.913
R sq. in Test: 0.86