In [None]:
import pandas as pd
import numpy as np
import math
from pathlib import Path
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin

import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv('/kaggle/input/motorcycle-dataset/BIKE DETAILS.csv')
df.head(5)

# EDA

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df.groupby('name').count().sort_values('year',ascending=False)

Most bike names contain one of a handful of brand names. Let's derive a **brand** feature from the **name** column.

In [None]:
def bike_model(model_name, excl_honda_hero=False):
    """Collect the entries of ``df['name']`` that contain ``model_name``.

    Parameters
    ----------
    model_name : str
        Substring to look for in each bike name (the match is case-sensitive).
    excl_honda_hero : bool, default False
        When truthy, names that also contain ``'Hero'`` are left out.

    Returns
    -------
    list
        Matching names in dataframe order; repeated names are kept.
    """
    skip_hero = bool(excl_honda_hero)
    return [
        bike_name
        for bike_name in df['name']
        if model_name in bike_name and not (skip_hero and 'Hero' in bike_name)
    ]

In [None]:
# One list per brand, holding every entry of df['name'] that contains the
# brand string (see bike_model).
royal_enfield = bike_model('Royal Enfield')
# Names that also contain 'Hero' are excluded from the Honda list.
honda = bike_model('Honda',excl_honda_hero=True)
bajaj = bike_model('Bajaj')
yamaha = bike_model('Yamaha')
suzuki = bike_model('Suzuki')
hero = bike_model('Hero')
tvs = bike_model('TVS')
ktm = bike_model('KTM')

In [None]:
def brand(i):
    """Return the brand label for a bike name.

    The name is matched by substring, first hit wins, in this order:
    Royal Enfield, Honda, Bajaj, Yamaha, Hero, TVS, Suzuki, KTM.
    A name containing 'Hero' is never labelled 'Honda'.

    Matching on the name itself (instead of looking it up in per-brand lists
    built from the dataframe) keeps the function independent of module-level
    state and avoids a linear list scan for every row.

    Parameters
    ----------
    i : str
        Bike name.

    Returns
    -------
    str
        One of the brand labels above, or 'Other' when nothing matches or
        ``i`` is not a string (e.g. a missing value).
    """
    if not isinstance(i, str):
        return 'Other'
    if 'Royal Enfield' in i:
        return 'Royal Enfield'
    # Checked before 'Hero', so the 'Hero' exclusion must be explicit here.
    if 'Honda' in i and 'Hero' not in i:
        return 'Honda'
    for label in ('Bajaj', 'Yamaha', 'Hero', 'TVS', 'Suzuki', 'KTM'):
        if label in i:
            return label
    return 'Other'

In [None]:
df['brand'] = df['name'].apply(lambda x: brand(x))
df.head()

Now there is a **brand** column, so **name** can be dropped for training

In [None]:
def bar_plot(x):
    """Build a bar chart of how often each value of ``df[x]`` occurs.

    Parameters
    ----------
    x : str
        Name of the column in ``df`` to count.

    Returns
    -------
    plotly.graph_objs.Figure
        One bar per distinct value, with the count drawn above the bar.
    """
    counts = df[x].value_counts()
    bars = go.Bar(x=counts.index, y=counts.values, text=counts.values)
    fig = go.Figure([bars])
    fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
    fig.update_layout(title=f'Number of bikes per {x}')
    return fig

In [None]:
bar_plot('brand')

In [None]:
fig = px.scatter(df, x="ex_showroom_price", y="selling_price")
fig.show()

**ex_showroom_price** correlates quite well with **selling_price**, however, some values are missing

Just in case we don't drop **ex_showroom_price**, we should impute missing values. 

In [None]:
# Rows without any missing value. The explicit copy makes this an independent
# frame, so adding the 'difference' column to it later is a plain assignment
# and not a write to a possible slice of df (SettingWithCopyWarning).
no_nan_df = df.dropna().copy()

In [None]:
no_nan_df['difference'] = (no_nan_df['ex_showroom_price']-no_nan_df['selling_price'])/no_nan_df['selling_price']

In [None]:
no_nan_df.difference.hist(bins=50)

In [None]:
# Average relative gap between ex-showroom price and selling price, taken over
# the rows that have no missing values.
coef = no_nan_df['difference'].mean()
# Element-wise price + coef * price.
# NOTE: this estimate is computed from selling_price, which is the training
# target, so it must not be fed to the model as a feature.
df['calculated_ex_showroom_price'] = coef * df['selling_price'] + df['selling_price']

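If we want to keep **ex_showroom_price** for training, its NaN values can be imputed with **calculated_ex_showroom_price**. Note that this estimate is derived from **selling_price** (the target), so using it as a feature would leak the target into the model.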

In [None]:
fig = px.scatter(df, x="km_driven", y="selling_price", color="owner")
fig.show()

In [None]:
bar_plot('year')

In [None]:
df['age'] = 2021 - df['year']

The **year** column will be dropped for training as well, since **age** is derived from it.

In [None]:
bar_plot('age')

In [None]:
bar_plot('owner')

In [None]:
bar_plot('seller_type')

Price distribution per bike brand

In [None]:
fig = go.Figure()
brands = df['brand'].unique()

# The loop variable must not be called `brand`: that would rebind the
# module-level brand() function to a string and break any later re-run of the
# cell that applies it to df['name'].
for brand_name in brands:
    brand_rows = df['brand'] == brand_name
    fig.add_trace(go.Violin(x=df.loc[brand_rows, 'brand'],
                            y=df.loc[brand_rows, 'selling_price'],
                            name=brand_name,
                            meanline_visible=True))

fig.show()

In [None]:
 def box_plot(x,y):
    return px.box(df, 
                  x=x, 
                  y=y, 
                  points='all',
                  title= x + ' & ' + y,
                  width=800,
                  height=500)

In [None]:
box_plot('owner','selling_price')

In [None]:
box_plot('seller_type','selling_price')

Only a few bikes have seller type "Dealer", so the **seller_type** column will be dropped.

# Preparation and training

First, I'll try training without the **ex_showroom_price** column.

In [None]:
drop = ['name','seller_type','year','ex_showroom_price','calculated_ex_showroom_price']
df_train = df.drop(drop,axis=1)

In [None]:
# Candidate feature columns: everything in df_train except the target.
train_cols = df_train.drop('selling_price',axis=1)

# Object-dtype columns are treated as categorical, every other column as numerical.
cat_col = [name for name in train_cols.columns if train_cols[name].dtype == 'O']
num_col = [name for name in train_cols.columns if not train_cols[name].dtype == 'O']

print(f'Numerical cols for training: {num_col}','\n'
     f'Categorical cols for training: {cat_col}','\n')

Creating a test set

In [None]:
# StratifiedShuffleSplit is already imported in the first cell.
# A fixed random_state makes the train/test split (and every score computed
# from it) reproducible across kernel restarts.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)

# Stratify on brand so both splits contain the brands in similar proportions.
# split() yields positional row indices, hence .iloc rather than the
# label-based .loc.
train_idx, test_idx = next(split.split(df_train, df_train['brand']))
X_train = df_train.iloc[train_idx]
X_test = df_train.iloc[test_idx]

In [None]:
class Encoder(BaseEstimator, TransformerMixin):
    """One-hot encode the categorical columns and keep the numerical ones.

    The output frame is laid out as: columns listed in neither ``cat_col`` nor
    ``num_col`` (passed through unchanged), then the dummy columns of each
    categorical column in ``cat_col`` order (including a ``<col>_nan``
    indicator), then the numerical columns.

    ``fit`` records the output columns produced for the data it is given, and
    ``transform`` aligns its result to them. Without that step a category
    missing from one split (or present only in it) would give the train and
    test matrices different columns.

    Parameters
    ----------
    cat_col : list of str
        Columns to one-hot encode.
    num_col : list of str
        Columns to keep as they are, placed last in the output.

    Attributes
    ----------
    columns_ : list
        Output column labels seen during ``fit``.
    """

    def __init__(self, cat_col, num_col):
        self.cat_col = cat_col
        self.num_col = num_col

    def _encode(self, X):
        """One-hot encode ``X`` without aligning it to the fitted columns."""
        num_cols = X[self.num_col].copy()
        untouched = X.drop(self.num_col, axis=1).drop(self.cat_col, axis=1)
        dummies = [
            pd.get_dummies(X[column], prefix=column, dummy_na=True)
            for column in self.cat_col
        ]
        return pd.concat([untouched, *dummies, num_cols], axis=1)

    def fit(self, X, y=None):
        """Remember the output columns produced for ``X``; ``y`` is ignored."""
        self.columns_ = self._encode(X).columns.tolist()
        return self

    def transform(self, X, y=None):
        """Encode ``X`` and align it to the columns seen during ``fit``.

        Dummy columns seen in ``fit`` but absent from ``X`` are added and
        filled with 0; columns not seen in ``fit`` are dropped. If ``fit`` has
        not been called, the encoded frame is returned without alignment.
        """
        encoded = self._encode(X)
        fitted_columns = getattr(self, 'columns_', None)
        # Only reindex when needed: data that already matches is returned as encoded.
        if fitted_columns is not None and list(encoded.columns) != fitted_columns:
            encoded = encoded.reindex(columns=fitted_columns, fill_value=0)
        return encoded

In [None]:
pipeline = Pipeline([
        ("encoder", Encoder(cat_col,num_col))
    ])

In [None]:
target = 'selling_price'

# Take the target out of each split first, then remove it from the feature frames.
y_train, y_test = X_train[target], X_test[target]

X_train = X_train.drop(columns=target)
X_test = X_test.drop(columns=target)

In [None]:
# Fit the preprocessing on the training split only; the test split is merely
# transformed so that nothing is learned from it.
X_train = pipeline.fit_transform(X_train)
# Align the test matrix with the training columns: a dummy column missing from
# the test split is added as all zeros, and one unseen in training is dropped.
X_test = pipeline.transform(X_test).reindex(columns=X_train.columns, fill_value=0)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Run base RF regressor model
%time
m = RandomForestRegressor(n_estimators=60, min_samples_leaf = 3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)

# Scores

In [None]:
def rmse(predictions, actuals):
    """Root of the mean squared difference between ``predictions`` and ``actuals``.

    Both arguments must support element-wise subtraction and the result must
    offer ``.mean()`` (e.g. NumPy arrays or pandas Series). Returns a float.
    """
    residuals = predictions - actuals
    mean_squared_error = (residuals ** 2).mean()
    return math.sqrt(mean_squared_error)

def print_score(m):
    """Print RMSE and R^2 of model ``m`` on the training data.

    Uses the module-level ``X_train`` / ``y_train``. The out-of-bag score is
    printed too when the model exposes an ``oob_score_`` attribute.
    """
    train_rmse = rmse(m.predict(X_train), y_train)
    print('RMSE for training:   ', train_rmse)
    train_r2 = m.score(X_train, y_train)
    print('R^2 for training:    ', train_r2)
    if not hasattr(m, 'oob_score_'):
        return
    print('OoB score:           ', m.oob_score_)
        
def print_test_score(m):
    """Print RMSE and R^2 of model ``m`` on the module-level ``X_test`` / ``y_test``."""
    test_rmse = rmse(m.predict(X_test), y_test)
    print('RMSE for test:   ', test_rmse)
    test_r2 = m.score(X_test, y_test)
    print('R^2 for test:    ', test_r2)

In [None]:
print_score(m)

In [None]:
print_test_score(m)

Plotting predictions vs test values

In [None]:
import matplotlib.pyplot as plt
predictions = m.predict(X_test)

plt.scatter(y_test,predictions)
plt.xlabel('y')
plt.ylabel('Predicted')

Distribution of prediction errors (predicted − actual)

In [None]:
dif = predictions - y_test
dif.hist(bins=50)