In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly.express as px
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin


import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the CarDekho listings CSV from the Kaggle input directory into `df`
# and preview its first rows.
df = pd.read_csv('/kaggle/input/vehicle-dataset-from-cardekho/Car details v3.csv')
df.head()

In [None]:
# Percentage of missing values in each column.
df.isna().sum().div(len(df)).mul(100)

In [None]:
# Work on an explicit copy so the column assignments below never write into a
# view of the previously loaded frame.
df = df.dropna().drop_duplicates().copy()


def _leading_number(column):
    """Return the first number found in each entry (e.g. '23.4 kmpl' -> 23.4).

    Entries containing no digits at all (such as a bare unit like ' bhp')
    become NaN instead of raising, so one malformed row cannot abort the cell.
    """
    digits = column.astype(str).str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    return pd.to_numeric(digits, errors='coerce')


df['mileage_kmpl'] = _leading_number(df['mileage'])
df['engine_CC'] = _leading_number(df['engine'])
df['max_power_bhp'] = _leading_number(df['max_power'])

# Discard rows whose value could not be parsed, then restore the integer dtype
# of the engine displacement (NaN would otherwise force it to float).
df = (
    df.dropna(subset=['mileage_kmpl', 'engine_CC', 'max_power_bhp'])
      .assign(engine_CC=lambda frame: frame['engine_CC'].astype(int))
      .drop(columns=['mileage', 'max_power', 'engine'])
)

In [None]:
# The first word of the listing name is used as the brand.
first_words = [full_name.split()[0] for full_name in df['name']]
df['brand'] = first_words
df.drop(columns='name', inplace=True)

In [None]:
def bar_plot(x):
    """Build a bar chart counting the rows of ``df`` per distinct value of a column.

    Parameters
    ----------
    x : str
        Name of the column of the module-level ``df`` to count.

    Returns
    -------
    plotly.graph_objs.Figure
        One bar per distinct value, labelled with its count.
    """
    # Count once and reuse the result for bar positions, heights and labels.
    counts = df[x].value_counts()
    fig = go.Figure([go.Bar(x=counts.index, y=counts.values, text=counts.values)])
    fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
    fig.update_layout(title=f'Number of cars per {x}',
                      xaxis=dict(tickmode='linear', dtick=1))
    return fig

def box_plot(x,y):
    """Return a Plotly box plot of column ``y`` of ``df`` grouped by column ``x``.

    Every individual observation is drawn as well (``points='all'``); the
    title is the two column names joined by ' & '.
    """
    figure_options = dict(
        x=x,
        y=y,
        points='all',
        title=x + ' & ' + y,
        width=800,
        height=500,
    )
    return px.box(df, **figure_options)

# EDA

In [None]:
bar_plot('brand')

In [None]:
# Vehicle age is measured against a fixed reference year.
REFERENCE_YEAR = 2021
df['age'] = REFERENCE_YEAR - df['year']
df.drop(columns='year', inplace=True)

In [None]:
bar_plot('age')

In [None]:
fig = px.violin(df, y="selling_price",box=True,points='all')
fig.show()

Removing outliers: keep only cars with a selling price below 10,000,000.

In [None]:
# Keep only listings priced strictly below 10,000,000; the rest are treated as outliers.
df = df[df['selling_price']<10000000]

In [None]:
fig = px.violin(df, y="km_driven",box=True,points='all')
fig.show()

Removing outliers: keep only cars with fewer than 1,000,000 km driven.

In [None]:
# Keep only listings with strictly fewer than 1,000,000 km driven; the rest are treated as outliers.
df = df[df['km_driven']<1000000]

In [None]:
fig = px.violin(df, y="mileage_kmpl",box=True,points='all')
fig.show()

In [None]:
fig = px.scatter(df, x="km_driven", y="selling_price", color="owner")
fig.show()

In [None]:
fig = px.scatter(df, x="km_driven", y="mileage_kmpl", color="owner")
fig.show()

In [None]:
fig = px.scatter(df, x="engine_CC", y="mileage_kmpl", color="fuel")
fig.show()

In [None]:
fig = px.scatter(df, x="engine_CC", y="max_power_bhp", color="fuel")
fig.show()

In [None]:
box_plot('fuel','selling_price')

In [None]:
box_plot('seats','selling_price')

In [None]:
box_plot('transmission','selling_price')

In [None]:
box_plot('owner','selling_price')

In [None]:
fig = go.Figure()
brands = df.brand.unique()

# One violin per brand, each showing that brand's selling-price distribution.
for brand in brands:
    is_brand = df['brand'] == brand
    brand_violin = go.Violin(
        x=df.loc[is_brand, 'brand'],
        y=df.loc[is_brand, 'selling_price'],
        name=brand,
        meanline_visible=True,
    )
    fig.add_trace(brand_violin)

fig.show()

Some brands appear in the dataset only once. We'll group them under **'Other'**.

In [None]:
# Number of listings per brand, sorted ascending (rarest brands first).
brand_count = df.groupby('brand')['selling_price'].count().sort_values()
# Names of the brands that have fewer than two listings.
brands_with_one_occurance = brand_count[brand_count<2].index.tolist()

In [None]:
df['brand'] = df['brand'].replace(brands_with_one_occurance,'Other')

Car brands by average selling price

In [None]:
# Mean selling price per brand, cheapest brand first.
brands_avg_price = (
    df.groupby('brand')['selling_price']
      .mean()
      .sort_values(ascending=True)
      .reset_index()
)
# Ordinal label = rank of the brand by mean price. The length is taken from the
# frame itself: a hard-coded count raises a length-mismatch error as soon as
# the number of distinct brands differs.
labels = np.arange(len(brands_avg_price))
brands_avg_price['label'] = labels

In [None]:
# Funnel chart of the mean selling price of each brand.
data = {
    'avg_price': brands_avg_price['selling_price'],
    'brand': brands_avg_price['brand'],
}
fig = px.funnel(data, x='avg_price', y='brand')
fig.update_layout(title='Avg selling price per car brand', height=800)
fig.show()

Volvo, BMW and Jaguar are the most expensive brands in this dataset.

Let's encode the brand feature as an ordinal label, ranking brands by their mean selling_price.

In [None]:
def func(x):
    """Look up the ``label`` assigned to brand ``x`` in ``brands_avg_price``.

    Returns the label of the first row whose ``brand`` equals ``x``; raises
    ``IndexError`` when no row matches.
    """
    matching_rows = brands_avg_price.loc[brands_avg_price['brand'] == x]
    return matching_rows['label'].values[0]

In [None]:
# Translate every brand to its price-rank label with one vectorised lookup
# instead of filtering brands_avg_price once per row.
brand_to_label = brands_avg_price.set_index('brand')['label']
df['brand_encoded'] = df['brand'].map(brand_to_label)

The brand column can be removed now

In [None]:
df = df.drop('brand',axis=1)

Finally, correlation plot

In [None]:
# Correlate numeric columns only: the frame still contains text features
# (e.g. fuel, owner), and DataFrame.corr() raises on those in pandas >= 2.0.
corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(10,10))
sns.heatmap(corr,annot=True,cmap="RdYlGn")

In [None]:
corr.selling_price.sort_values()[:-1].plot(kind='barh')

**Age** and **max_power_bhp** correlate the most with selling_price

# Preparation and training

In [None]:
df['selling_price_M'] = df['selling_price']/1000000

In [None]:
df.describe().selling_price_M

Create a selling-price category column, used to stratify the train/test split.

In [None]:
# Three price bands on the price expressed in millions: up to 0.42, up to 0.65, and above.
price_band_edges = [0., 0.42, 0.65, np.inf]
price_band_labels = [1, 2, 3]
df['selling_price_cat'] = pd.cut(
    df['selling_price_M'],
    bins=price_band_edges,
    labels=price_band_labels,
)

In [None]:
df['selling_price_cat'].value_counts()

Now the 'selling_price_M' column can be dropped. The 'torque' feature won't be used for training.

In [None]:
# Renumber rows from 0 and discard the helper/unused columns in one chain.
df = df.reset_index(drop=True).drop(columns=['selling_price_M', 'torque'])
# Target vector and feature frame.
y = df['selling_price']
df_train = df.drop(columns='selling_price')

In [None]:
df_train.head()

In [None]:
from sklearn.model_selection import train_test_split

# Stratify jointly on ownership type and price band so both splits share the
# same mix of the two.
strata = df_train[["owner", "selling_price_cat"]]

X_train, X_test, y1, y2 = train_test_split(
    df_train,
    strata,
    stratify=strata,
    test_size=0.33,
    random_state=666,  # fixed seed: the split and all scores derived from it are reproducible
)

# Pick the targets belonging to each split by index label.
y_train = y.loc[X_train.index]
y_test = y.loc[X_test.index]

In [None]:
# The price band was only needed for stratification; remove it from both splits.
X_train, X_test = (
    split.drop(columns='selling_price_cat') for split in (X_train, X_test)
)

In [None]:
# Object-dtype columns are treated as categorical, all others as numerical.
cat_col = [name for name in X_train.columns if X_train[name].dtype == 'O']
num_col = [name for name in X_train.columns if not X_train[name].dtype == 'O']

print(f'Numerical cols for training: {num_col}','\n'
     f'Categorical cols for training: {cat_col}','\n')

In [None]:
class Encoder(BaseEstimator, TransformerMixin):
    """One-hot encode the categorical columns and pass the numeric ones through.

    The output column layout is learned in ``fit`` so that ``transform`` always
    returns the same columns in the same order, even when the data being
    transformed lacks (or adds) some category level.

    Parameters
    ----------
    cat_col : list of str
        Columns expanded with ``pd.get_dummies`` (prefixed by the column name).
    num_col : list of str
        Columns copied through unchanged and placed after the other columns.
    """

    def __init__(self, cat_col, num_col):
        self.cat_col = cat_col
        self.num_col = num_col

    def _encode(self, X):
        """Return ``X`` with every categorical column replaced by its dummies."""
        num_cols = X[self.num_col].copy()

        for column in self.cat_col:
            dummies = pd.get_dummies(X[column], prefix=column)
            X = pd.concat([X, dummies], axis=1).drop([column], axis=1)

        cat_cols = X.drop(self.num_col, axis=1)
        return pd.concat([cat_cols, num_cols], axis=1)

    def fit(self, X, y=None):
        """Record the encoded column layout of ``X``; ``y`` is ignored."""
        self.columns_ = self._encode(X).columns.tolist()
        return self

    def transform(self, X, y=None):
        """Encode ``X`` and align the result to the columns seen during ``fit``.

        Columns recorded at fit time but absent from ``X`` are added filled
        with 0; columns not recorded at fit time are dropped. When the encoder
        has not been fitted, the encoded frame is returned unaligned.
        """
        encoded = self._encode(X)
        fitted_columns = getattr(self, 'columns_', None)
        if fitted_columns is None:
            return encoded
        return encoded.reindex(columns=fitted_columns, fill_value=0)

In [None]:
pipeline = Pipeline([
        ("encoder", Encoder(cat_col,num_col))
        ])

In [None]:
# Fit the preprocessing on the training split only and reuse it unchanged on
# the test split, so the test data never influences the fitted transformer.
X_train_prep = pipeline.fit_transform(X_train)
X_test_prep  = pipeline.transform(X_test)
# Guarantee both matrices expose identical columns in identical order, even if
# a category level is missing from one of the splits.
X_test_prep  = X_test_prep.reindex(columns=X_train_prep.columns, fill_value=0)

# Random Forest Regressor

RF hyperparameter configuration

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

params = [
    # Bagged forests: bootstrap sampling is on by default, so out-of-bag
    # scoring is available.
    {'n_estimators': [30, 60, 100],
     'max_features': [0.3, 0.5, 0.7],   # fractions of the features tried per split
     'min_samples_leaf': [2, 3, 5],
     'oob_score': [True]},

    # Forests grown without bootstrapping: there are no out-of-bag samples, so
    # requesting an OOB score would make every fit in this grid fail.
    {'bootstrap': [False],
     'n_estimators': [30, 60, 100],
     'max_features': [0.3, 0.5, 0.7],
     'min_samples_leaf': [2, 3, 5],
     'oob_score': [False]},
  ]

In [None]:
%time
rf = RandomForestRegressor(random_state=666)

grid_search = GridSearchCV(rf, params, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(X_train_prep, y_train)

In [None]:
grid_search.best_estimator_

In [None]:
m = grid_search.best_estimator_

Score functions

In [None]:
def rmse(predictions, actuals):
    """Root-mean-square error between ``predictions`` and ``actuals``.

    The two arguments must support element-wise subtraction and squaring, and
    the result must expose ``.mean()`` (e.g. NumPy arrays or pandas Series).
    """
    residuals = predictions - actuals
    mean_squared_error = (residuals ** 2).mean()
    return math.sqrt(mean_squared_error)

def print_score(m):
    """Print the training RMSE and ``m.score`` of model ``m``, plus its OOB score if present.

    Evaluates on the module-level ``X_train_prep`` / ``y_train`` using the
    ``rmse`` helper.
    """
    train_predictions = m.predict(X_train_prep)
    print('RMSE for training:   ', rmse(train_predictions, y_train))
    print('R^2 for training:    ', m.score(X_train_prep, y_train))
    # Models without an ``oob_score_`` attribute simply skip the last line.
    if not hasattr(m, 'oob_score_'):
        return
    print('OoB score:           ', m.oob_score_)
        
def print_test_score(m):
    """Print the test RMSE and ``m.score`` of model ``m``.

    Evaluates on the module-level ``X_test_prep`` / ``y_test`` using the
    ``rmse`` helper.
    """
    test_predictions = m.predict(X_test_prep)
    test_rmse = rmse(test_predictions, y_test)
    print('RMSE for test:   ', test_rmse)
    test_r2 = m.score(X_test_prep, y_test)
    print('R^2 for test:    ', test_r2)

In [None]:
print_score(m)

In [None]:
rfr_score = rmse(m.predict(X_test_prep), y_test)
print_test_score(m)

In [None]:
# Feature importances of the current model `m`, most important first.
feature_importance = pd.DataFrame(
    {'Feature': X_train_prep.columns, 'Importance': m.feature_importances_}
).sort_values('Importance', ascending=False)

In [None]:
fig = go.Figure([go.Bar(
y=feature_importance.Feature, 
x=feature_importance.Importance, 
text=feature_importance.Importance,
orientation='h')])
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title = 'Feature importance RF Regressor')
fig.show()

In [None]:
rfr_predictions = m.predict(X_test_prep)

plt.scatter(y_test, rfr_predictions)
plt.xlabel('y')
plt.ylabel('Predicted')

In [None]:
rfr_err_rate = rfr_predictions - y_test
rfr_err_rate.hist(bins=50)

# XGB Regressor

In [None]:
import xgboost as xgb


# Wrap the prepared feature matrices in XGBoost DMatrix containers
# (only the training matrix carries labels).
dtrain = xgb.DMatrix(X_train_prep, label = y_train)
dtest = xgb.DMatrix(X_test_prep)

# Cross-validate with depth-2 trees and eta 0.1 for at most 500 boosting
# rounds, with early stopping set to 100 rounds.
# NOTE: this rebinds `params` (previously the random-forest grid), and `model`
# holds the xgb.cv result rather than a fitted estimator.
params = {"max_depth":2, "eta":0.1}
model = xgb.cv(params, dtrain,  num_boost_round=500, early_stopping_rounds=100)

In [None]:
%time
m = xgb.XGBRegressor(n_estimators=300, max_depth=2, learning_rate=0.1) #the params were tuned using xgb.cv
m.fit(X_train_prep, y_train)

In [None]:
model.loc[20:,["test-rmse-mean", "train-rmse-mean"]].plot()

In [None]:
# Feature importances of the current model `m`, most important first.
feature_importance = pd.DataFrame(
    {'Feature': X_train_prep.columns, 'Importance': m.feature_importances_}
).sort_values('Importance', ascending=False)

In [None]:
fig = go.Figure([go.Bar(
y=feature_importance.Feature, 
x=feature_importance.Importance, 
text=feature_importance.Importance,
orientation='h')])
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title = 'Feature importance XGB Regressor')
fig.show()

In [None]:
m = xgb.XGBRegressor(n_estimators=360, max_depth=2, learning_rate=0.1) #the params were tuned using xgb.cv
m.fit(X_train_prep, y_train)

In [None]:
print_score(m)

In [None]:
xgb_score = rmse(m.predict(X_test_prep), y_test)
print_test_score(m)

In [None]:
xgb_preds = m.predict(X_test_prep)
plt.scatter(y_test,xgb_preds)
plt.xlabel('y')
plt.ylabel('Predicted')

In [None]:
xgb_err_rate = xgb_preds - y_test
xgb_err_rate.hist(bins=50)

# Model performance

In [None]:
def model_performance(model):
    """Print a MAPE-based accuracy (``100 - MAPE``) for one of the fitted models.

    Parameters
    ----------
    model : str
        ``'XGB'`` or ``'RF'``; selects the module-level residuals
        (``xgb_err_rate`` / ``rfr_err_rate``) that are compared with ``y_test``.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    if model == 'XGB':
        err_rate = xgb_err_rate
    elif model == 'RF':
        err_rate = rfr_err_rate
    else:
        # Fail with a clear message instead of an UnboundLocalError on
        # ``err_rate`` a few lines further down.
        raise ValueError(f"Unknown model {model!r}; expected 'XGB' or 'RF'.")
    errors = abs(err_rate)
    # Mean absolute percentage error relative to the true prices.
    mape = 100 * np.mean(errors / y_test)
    accuracy = 100 - mape
    print(f'{model}')
    print('Accuracy = {:0.2f}%.'.format(accuracy))

In [None]:
model_performance('RF')

In [None]:
model_performance('XGB')