In [None]:
pip install plotly

In [None]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR


from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error

# Visualization lib
import shap
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import iplot
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
# import plotly.io as pio
# pio.renderers.default = "svg"

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [None]:
# Read the car price dataset and preview three randomly chosen rows.
DATA_PATH = "/kaggle/input/car-price-prediction/CarPrice_Assignment.csv"
data = pd.read_csv(DATA_PATH)
data.sample(3)

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
data.shape

In [None]:
# Summary statistics of the loaded frame.
data.describe()

In [None]:
# Per-column dtype and non-null count overview.
data.info()

### 1. Handling Missing & Duplicate Values

In [None]:
# Count the missing entries in each column.
data.isna().sum()

###### No missing values present

In [None]:
# Show every row that is an exact repeat of an earlier row.
duplicate_mask = data.duplicated()
data[duplicate_mask]

###### No duplicate values present

### 2. Exploratory Data Analysis

In [None]:
# 1. Car ID: unique id of each observation (integer) - dropped as it is not required for prediction
data = data.drop(columns = ['car_ID'])

In [None]:
# The manufacturer is the first space-separated token of CarName;
# once it is extracted the full CarName column is no longer needed.
data['CompanyName'] = [car_name.split(' ')[0] for car_name in data['CarName']]
data = data.drop(columns = ['CarName'])

In [None]:
# Fixing the spelling error
# The corrected Series is assigned back to the column instead of calling
# replace(..., inplace=True) on data['CompanyName']: that chained in-place form
# raises a warning on recent pandas and leaves `data` untouched under Copy-on-Write.
company_name_fixes = {
    'maxda': 'mazda',
    'porcshce': 'porsche',
    'toyouta': 'toyota',
    'vokswagen': 'volkswagen',
    'vw': 'volkswagen',
}
# Lower-casing first collapses case-only variants of the same make
# (e.g. 'Nissan' vs 'nissan') - NOTE(review): confirm with value_counts().
data['CompanyName'] = data['CompanyName'].str.lower().replace(company_name_fixes)

In [None]:
# Getting all Categorical Features
# Every column that is not float64/int64, plus 'symboling', which is stored as an
# integer but handled as a category in this notebook.
# np.float64 replaces the np.float alias, which was removed in NumPy 1.24.
categorical_features = list(data.select_dtypes(exclude = [np.float64, np.int64]).columns)
categorical_features.append('symboling')
categorical_features

In [None]:
# Getting all Numeric Features
# All float64/int64 columns except 'symboling', which is handled as categorical.
# np.float64 replaces the np.float alias, which was removed in NumPy 1.24.
numeric_features = data.select_dtypes(include = [np.float64, np.int64]).columns
numeric_features = numeric_features.drop(['symboling'])
numeric_features

#### 1. Univariate Analysis

In [None]:
# conda install -c plotly plotly-orca


In [None]:
# Target distribution: histogram on the left, box plot on the right.
price_values = data['price']
fig = make_subplots(rows = 1, cols = 2)
fig.add_trace(go.Histogram(x = price_values, name = 'Car Price'), row = 1, col = 1)
fig.add_trace(go.Box(y = price_values, name = 'Car Price'), row = 1, col = 2)
fig.update_layout(title = 'Price Distribution (Target)', width = 950)
iplot(fig)

In [None]:
# Summary statistics of the target, including the 85th and 90th percentiles.
price_percentiles = [0.25, 0.50, 0.75, 0.85, 0.90, 1]
data['price'].describe(percentiles = price_percentiles)

* The price distribution is right-skewed.
* The majority of cars are priced below 16k.
* There is a significant difference between the mean (13276.71) and the median (10295).


In [None]:
# One subplot row per categorical feature: category counts (left) and the
# price distribution per category (right).
# The rows come straight from the feature list so that no feature is skipped;
# the earlier index-based loop started at 1 and never plotted the first feature.
fig = make_subplots(rows = len(categorical_features), cols = 2)
for row, feature_name in enumerate(categorical_features, start = 1):
    counts = data[feature_name].value_counts()
    fig.add_trace(go.Bar(
        x = counts.index,
        y = counts.values,
        name = feature_name
    ), row = row, col = 1)
    fig.add_trace(go.Box(
        x = data[feature_name],
        y = data['price'],
        name = feature_name + ' vs Price'
    ), row = row, col = 2)

fig.update_layout(height = 3000, width = 950)
iplot(fig)

#### Inferences for categorical features:

* Toyota seems to be the most favoured make; Jaguar and Buick seem to have the highest average price.
* Gas-fuelled cars outnumber diesel ones, but diesel has a higher average price than gas.
* Sedan is the most preferred car body.
* The ohc engine type is the most preferred, and ohcv has the highest price range.
* Symboling 0 and 1 are the most preferred. Cars with -1 symboling seem to be high priced (which makes sense, since an insurance risk rating of -1 is quite good).
* hardtop and convertible have a higher average price.
* The doornumber variable does not affect the price much; there is no significant difference between its categories.
* Turbo aspiration seems to have a higher price range than std (though it has some high values outside the whiskers).
* There are very few data points in the enginelocation categories to make an inference.
* The most common numbers of cylinders are four, six and five, though eight cylinders have the highest price range.
* mpfi and 2bbl are the most common fuel systems; mpfi and idi have the highest price range. There are too few data points in the other categories to derive any meaningful inference.
* There is a very significant difference within the drivewheel category: most high-priced cars seem to prefer rwd.

In [None]:
# for feature in numeric_features:
#     fig = px.histogram(data, x = feature, marginal = 'box', color_discrete_sequence = ['lightblue'])
#     fig.show()

In [None]:
# One subplot row per numeric predictor: its histogram (left) and a scatter
# against price (right).
# The earlier loop started at index 1 and ran a fixed 10 iterations, so the first
# feature and every feature past index 10 were silently left out. Iterating the
# list itself covers all of them; the target is excluded because a
# price-vs-price scatter carries no information.
plot_features = [name for name in numeric_features if name != 'price']
fig = make_subplots(rows = len(plot_features), cols = 2)
for row, feature_name in enumerate(plot_features, start = 1):
    fig.add_trace(go.Histogram(
        x = data[feature_name],
        name = feature_name
    ), row = row, col = 1)
    fig.add_trace(go.Scatter(
        x = data[feature_name],
        y = data['price'],
        mode = 'markers',
        name = feature_name + ' vs Price'
    ), row = row, col = 2)

# 300 px per row keeps the same density as the original 10-row / 3000 px figure.
fig.update_layout(height = 300 * len(plot_features), width = 950)
iplot(fig)

* carwidth, carlength and curbweight seem to have a positive correlation with price.
* carheight doesn't show any significant trend with price.
* enginesize, boreratio, horsepower, wheelbase - seem to have a significant positive correlation with price.
* citympg, highwaympg - seem to have a significant negative correlation with price.


In [None]:
# Scatter each shortlisted feature against price with an OLS trendline, one figure per feature.
correlatated_features = [
    'carwidth', 'carlength', 'curbweight', 'enginesize', 'boreratio',
    'horsepower', 'wheelbase', 'citympg', 'highwaympg',
]
for feature in correlatated_features:
    fig = px.scatter(data, x = feature, y = 'price', trendline = "ols", title = feature + ' vs price')
    fig.show()

In [None]:
# Correlate the numeric columns only: the frame still contains string columns,
# and DataFrame.corr() raises on those in recent pandas versions.
data.select_dtypes(include = 'number').corr().style.background_gradient(cmap="Blues")

In [None]:
# Pairwise relationship grid produced by seaborn for the frame.
sns.pairplot(data)

## 3. Feature Engineering

In [None]:
# Work on an independent copy: plain assignment would only alias `data`, so the
# column re-encoding done during feature engineering would also rewrite the frame
# used by the exploratory cells.
cars_data = data.copy()
cars_data.head()

In [None]:
# Replace the spelled-out counts with integers so both columns become ordinal features.
cylinder_counts = {'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'eight': 8, 'twelve': 12}
door_counts = {'two': 2, 'four': 4}
cars_data['cylindernumber'] = cars_data['cylindernumber'].map(cylinder_counts)
cars_data['doornumber'] = cars_data['doornumber'].map(door_counts)

In [None]:
# Dummy variables
# One-hot encode each nominal column (first level dropped) and append the
# indicator columns to the right of the frame, in the order listed here.
nominal_columns = [
    'fueltype', 'aspiration', 'carbody', 'drivewheel',
    'enginetype', 'enginelocation', 'CompanyName', 'fuelsystem',
]
for column in nominal_columns:
    indicators = pd.get_dummies(cars_data[column], drop_first = True)
    cars_data = pd.concat([cars_data, indicators], axis = 'columns')

In [None]:
# Remove the nominal columns now that their indicator columns have been appended.
# 'cylindernumber' is deliberately NOT dropped: it was converted to an ordinal
# integer earlier, and dropping it here meant it never reached the model.
# Reassigning (instead of inplace=True) keeps the cell free of in-place mutation.
cars_data = cars_data.drop(columns = ['fueltype','aspiration','carbody','drivewheel','enginetype','enginelocation','fuelsystem','CompanyName'])

In [None]:
# Preview the frame after the encoded source columns have been dropped.
cars_data.head()

## 4. Model Training 

In [None]:
# Separate the predictors (every column but price) from the target (price).
target_column = 'price'
X = cars_data.drop(columns = [target_column])
y = cars_data[target_column]

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 0,
)

In [None]:
def show_metrics(y_test, y_pred):
    """Print R2, MSE and RMSE for a set of predictions, three decimals each.

    Parameters
    ----------
    y_test : true target values, forwarded to r2_score and mean_squared_error.
    y_pred : predicted target values, forwarded to the same functions.

    Returns
    -------
    None; the three metrics are written to standard output.
    """
    squared_error = mean_squared_error(y_test, y_pred)
    report = (
        ('R2', r2_score(y_test, y_pred)),
        ('MSE', squared_error),
        ('RMSE', np.sqrt(squared_error)),  # RMSE is the square root of the MSE
    )
    for label, value in report:
        print(f'{label}: {value:.3f}')

In [None]:
# 4. Hyper-parameter tuning for Random Forest
# - 'squared_error' / 'absolute_error' are the current names of the criteria
#   formerly spelled 'mse' / 'mae' (the old names were removed in scikit-learn 1.2).
# - The search is fitted on the training split only, so the held-out test rows do
#   not influence the chosen parameters.
# - A fixed random_state makes the search repeatable.
params = {
            'criterion' : ['squared_error', 'absolute_error'],
            'n_estimators': [100, 200, 500],
            'max_features': [2, 3],
            'min_samples_leaf': [1, 2, 4],
            'min_samples_split': [2, 5, 10],
            'max_depth' : [10,20,30]
}
gd = GridSearchCV(estimator = RandomForestRegressor(random_state = 0), param_grid = params, verbose = True, n_jobs = -1)
gd.fit(X_train, y_train)
print(gd.best_score_)
print(gd.best_params_)

In [None]:
# Random forest with hand-picked hyper-parameters, trained on the training split
# and scored on the held-out test split.
# 'absolute_error' is the current name of the criterion formerly spelled 'mae'
# (removed in scikit-learn 1.2); random_state makes the fit repeatable.
rf_model = RandomForestRegressor(criterion = 'absolute_error', max_depth = 10, max_features = 3, min_samples_leaf = 1, min_samples_split = 2,
                                 n_estimators = 100, random_state = 0)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
show_metrics(y_test, y_pred)

In [None]:
# Linear regression baseline.
# Fit on the training split only: fitting on the full X, y (as before) lets the
# model see the test rows it is then scored on, which inflates the metrics.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)
show_metrics(y_test, y_pred)

## 5. Model Interpretability

In [None]:
# Load SHAP's JavaScript so the interactive plots can render in the notebook,
# then build a tree explainer for the fitted random forest.
shap.initjs()
explainer = shap.TreeExplainer(rf_model)

In [None]:
# Explain one held-out prediction: the test row at position 1, kept as a one-row frame.
choosen_instance = X_test.iloc[[1]]
print(choosen_instance.horsepower)
shap_values = explainer.shap_values(choosen_instance)
# Force plot of this row's SHAP values against the explainer's expected value.
shap.force_plot(explainer.expected_value, shap_values, choosen_instance)

In [None]:
# Same explanation for the test row at position 10.
choosen_instance = X_test.iloc[[10]]
shap_values = explainer.shap_values(choosen_instance)
shap.force_plot(explainer.expected_value, shap_values, choosen_instance)

In [None]:
# calculate SHAP
# Rebuild the explainer and compute SHAP values for every training row;
# the exact (non-approximate) algorithm is requested and the additivity check is turned off.
explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(X_train, approximate=False, check_additivity=False)

# Summary plot of the SHAP values across the training set.
shap.summary_plot(shap_values, X_train)