In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the processed dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/processed/data.csv')

In [3]:
# Preview the first rows to check the columns and sample values.
df.head()

Unnamed: 0,model_name,model_year,kms_driven,owner,location,price
0,Yamaha Fazer Dlx Standard,2014,12600,1st Owner,Raigad,34999
1,Yamaha RX135 Standard,1996,14500,2nd Owner,Madurai,85000
2,Bajaj Pulsar 150 [2001-2011] Kick Start,2007,51000,2nd Owner,Bangalore,25000
3,Royal Enfield Electra Twinspark Standard,2013,57500,1st Owner,Bangalore,100000
4,Hero Honda Glamour Alloy Drum Self,2006,69569,2nd Owner,Azamgarh,21000


In [4]:
# Summarise column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1327 entries, 0 to 1326
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   model_name  1327 non-null   object
 1   model_year  1327 non-null   int64 
 2   kms_driven  1327 non-null   int64 
 3   owner       1327 non-null   object
 4   location    1327 non-null   object
 5   price       1327 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 62.3+ KB


## Feature Engineering

### Handle model_name

In [6]:
# Simplify the model_name variable: keep only the first three words of each
# value so that variant/trim suffixes collapse into a single category.

def build_model_name(val):
    """Return the first three whitespace-separated words of ``val``.

    Parameters
    ----------
    val : str
        Raw model name, possibly padded or containing repeated whitespace.

    Returns
    -------
    str
        The first three words joined by single spaces. Values with three or
        fewer words are returned with their whitespace normalised.

    Raises
    ------
    AttributeError
        If ``val`` is not a string (e.g. a missing value).
    """
    # split() with no argument drops empty tokens, so repeated or leading
    # spaces cannot turn into "words" and shift the three-word cut-off.
    words = val.split()
    return ' '.join(words[:3])

# Replace every raw model name with its shortened form.
df['model_name'] = df['model_name'].map(build_model_name)


In [7]:
# Check the shortened model names.
df.head()

Unnamed: 0,model_name,model_year,kms_driven,owner,location,price
0,Yamaha Fazer Dlx,2014,12600,1st Owner,Raigad,34999
1,Yamaha RX135 Standard,1996,14500,2nd Owner,Madurai,85000
2,Bajaj Pulsar 150,2007,51000,2nd Owner,Bangalore,25000
3,Royal Enfield Electra,2013,57500,1st Owner,Bangalore,100000
4,Hero Honda Glamour,2006,69569,2nd Owner,Azamgarh,21000


In [8]:
# Report how many distinct model names remain after shortening.
print("Model name has unique values:", df['model_name'].nunique(dropna=False))

Model name has unique values: 215


In [9]:
# Labels of the 40 most frequent model names (value_counts sorts by frequency).
top_models = df['model_name'].value_counts().iloc[:40].index

### Keep the 40 most frequent models as categories and group the rest as "others"

In [10]:
# Keep names that are in top_models and relabel everything else as "others".
# isin/where is vectorised, so the membership set is built once instead of
# converting top_models to a list for every row.
df['model_name'] = df['model_name'].where(df['model_name'].isin(top_models), "others")

In [11]:
# Inspect the last rows to confirm infrequent models were relabelled as "others".
df.tail()

Unnamed: 0,model_name,model_year,kms_driven,owner,location,price
1322,others,2020,6000,1st Owner,Hyderabad,65000
1323,others,2018,12000,1st Owner,Srinagar,65000
1324,Yamaha YZF R15,2016,33000,2nd Owner,Gondia,80000
1325,Honda CB Unicorn,2011,50000,1st Owner,Mumbai,40000
1326,Bajaj Avenger Street,2016,23600,1st Owner,Bangalore,60000


### Handle location categories

In [12]:
# Labels of the 25 most frequent locations, displayed for inspection.
top_locations = df['location'].value_counts().iloc[:25].index
top_locations

Index(['Bangalore ', 'Delhi ', 'Pune ', 'Mumbai ', 'Hyderabad ', 'Chennai ',
       'Ahmedabad ', 'Gurgaon ', 'Kolkata ', 'Thane ', 'Ghaziabad ',
       'Lucknow ', 'Jaipur ', 'Bhopal ', 'Noida ', 'Vijaywada ',
       'Navi Mumbai ', 'Surat ', 'Nagpur ', 'Indore ', 'Bhubaneswar ',
       'Patna ', 'Faridabad ', 'Coimbatore ', 'Rewari '],
      dtype='object')

### Keep the 25 most frequent locations as categories and group the rest as "others"

In [13]:
# Keep locations that are in top_locations; everything else becomes "others".
df['location'] = df['location'].where(df['location'].isin(top_locations), "others")

In [14]:
# Inspect the last rows to confirm infrequent locations were relabelled as "others".
df.tail()

Unnamed: 0,model_name,model_year,kms_driven,owner,location,price
1322,others,2020,6000,1st Owner,Hyderabad,65000
1323,others,2018,12000,1st Owner,others,65000
1324,Yamaha YZF R15,2016,33000,2nd Owner,others,80000
1325,Honda CB Unicorn,2011,50000,1st Owner,Mumbai,40000
1326,Bajaj Avenger Street,2016,23600,1st Owner,Bangalore,60000


In [15]:
# List the distinct owner labels, reversed relative to the order unique() returns them.
df.owner.unique()[::-1]

array(['5th Owner', '4th Owner', '3rd Owner', '2nd Owner', '1st Owner'],
      dtype=object)

## Split Data

In [16]:
# Target is the price column; the features are every other column.
y = df['price']
X = df.drop(columns='price')

In [17]:
from sklearn.model_selection import train_test_split
# Hold out a test set; 0.25 is the library default, spelled out here, and the
# fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1234,
)

In [18]:
# Preview the training features.
X_train.head()

Unnamed: 0,model_name,model_year,kms_driven,owner,location
643,Bajaj Pulsar NS200,2013,90000,1st Owner,Pune
919,Honda CB Unicorn,2009,95000,1st Owner,others
680,others,2003,49000,1st Owner,others
778,Honda Activa [2000-2015],2009,41000,1st Owner,Gurgaon
641,Royal Enfield Classic,2020,11000,1st Owner,Bangalore


## Create column transformer for encoding

In [19]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,MinMaxScaler


In [20]:
# Owner labels ordered from most to fewest previous owners ('5th Owner' first).
# Sorting makes the order independent of which label happens to appear first
# in the data; a reverse lexicographic sort is correct for the single-digit
# labels present here ('1st Owner' .. '5th Owner').
owner_categories = sorted(df.owner.unique(), reverse=True)
owner_categories

['5th Owner', '4th Owner', '3rd Owner', '2nd Owner', '1st Owner']

In [21]:
# Encode the feature columns by position in X:
#   0 = model_name (one-hot), 1 = model_year (passed through unchanged),
#   2 = kms_driven (min-max scaled), 3 = owner (ordinal), 4 = location (one-hot).
# NOTE: `sparse=False` is kept to match the scikit-learn version this notebook
# targets; newer releases rename the argument to `sparse_output`.
column_transformer = ColumnTransformer([
    ("model_name_ohe", OneHotEncoder(dtype=np.int16, sparse=False, handle_unknown='ignore'), [0]),
    ("kms_driven_min_max_scaler", MinMaxScaler(), [2]),
    # Use the explicit owner ordering computed above instead of 'auto', and a
    # valid unknown-handling strategy: OrdinalEncoder accepts 'error' or
    # 'use_encoded_value', not 'ignore'. Unseen owner labels are encoded as -1.
    ("owner_ordinal_enc", OrdinalEncoder(categories=[owner_categories],
                                         handle_unknown='use_encoded_value',
                                         unknown_value=-1,
                                         dtype=np.int16), [3]),
    ("location_ohe", OneHotEncoder(dtype=np.int16, sparse=False, handle_unknown='ignore'), [4]),
], remainder='passthrough')



## Model Building

In [22]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn import set_config

# Ask scikit-learn to render estimators as diagrams when they are displayed.
set_config(display='diagram')

In [23]:
def build_pipeline_with_estimator(estimator):
    """Build a preprocessing + estimator pipeline.

    Parameters
    ----------
    estimator : scikit-learn estimator
        The final regression step of the pipeline.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with a 'transformer' step (a copy of the module-level
        ``column_transformer``) followed by the 'estimator' step.
    """
    from sklearn.base import clone

    # Clone so each pipeline owns an independent, unfitted transformer;
    # otherwise every pipeline would share (and refit) the same global object.
    return Pipeline([
        ('transformer', clone(column_transformer)),
        ('estimator', estimator),
    ])


In [24]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn.model_selection import cross_val_score
# MAPE
def mape(targets, predictions):
    """Mean absolute percentage error, expressed in percent.

    Parameters
    ----------
    targets : array-like
        True values. A zero target makes the result inf or nan.
    predictions : array-like
        Predicted values, paired with ``targets`` by position.

    Returns
    -------
    numpy.float64
        ``mean(|targets - predictions| / |targets|) * 100``.
    """
    # Convert up front so plain lists work and pairing is always positional.
    targets = np.asarray(targets, dtype=float)
    predictions = np.asarray(predictions, dtype=float)
    # Divide by |targets| so negative targets cannot flip the sign of the error.
    return np.mean(np.abs(targets - predictions) / np.abs(targets)) * 100

# Adjusted R^2
def adj_r2(ind_vars, targets, predictions):
    """Adjusted R^2 of ``predictions`` against ``targets``.

    The row and column counts of ``ind_vars`` (``shape[0]`` and ``shape[1]``)
    serve as the number of observations and the number of predictors.
    """
    plain_r2 = r2_score(targets, predictions)
    n_obs, n_predictors = ind_vars.shape[0], ind_vars.shape[1]
    shrinkage = (1 - plain_r2) * (n_obs - 1) / (n_obs - n_predictors - 1)
    return 1 - shrinkage

# Model performance check
def model_perf(model, inp, out):
    """Score a fitted model on (inp, out) and return a one-row metrics frame.

    Columns: RMSE, MAE, MAPE, R^2, Adjusted R^2, and the mean of a 10-fold
    cross_val_score run on the same data (using the estimator's default scorer).
    """
    predicted = model.predict(inp)
    actual = out.values

    cv_scores = cross_val_score(model, inp, out, cv=10)

    metrics = {
        "RMSE": np.sqrt(mean_squared_error(actual, predicted)),
        "MAE": mean_absolute_error(actual, predicted),
        "MAPE": mape(actual, predicted),
        "R^2": r2_score(actual, predicted),
        "Adjusted R^2": adj_r2(inp, actual, predicted),
        "Cross Val Score (Mean)": cv_scores.mean(),
    }
    return pd.DataFrame(metrics, index=[0])

In [25]:
# Fit a linear-regression pipeline (fit returns the pipeline itself) and
# report its metrics on the training data.
liner_regressor = build_pipeline_with_estimator(LinearRegression()).fit(X_train, y_train)

print('Linear Regression Train Performance.\n')
model_perf(liner_regressor, X_train, y_train)

Linear Regression Train Performance.



Unnamed: 0,RMSE,MAE,MAPE,R^2,Adjusted R^2,Cross Val Score (Mean)
0,99646.388638,36220.59799,75.636546,0.246927,0.24312,0.285164


In [26]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so the reported metrics are reproducible across runs.
rf = build_pipeline_with_estimator(RandomForestRegressor(random_state=1234))

rf.fit(X_train,y_train)

print('RandomForest Train Performance.\n')
model_perf(rf,X_train,y_train)

RandomForest Train Performance.



Unnamed: 0,RMSE,MAE,MAPE,R^2,Adjusted R^2,Cross Val Score (Mean)
0,45858.979792,11742.020151,14.270398,0.840499,0.839693,0.114848


In [27]:
# Fit a second, seeded forest and score it on the held-out test set.
rf2 = build_pipeline_with_estimator(RandomForestRegressor(random_state=1234))

rf2.fit(X_train,y_train)

y_pred = rf2.predict(X_test)

print('R2 score',r2_score(y_test,y_pred))

R2 score 0.2850990545035412


In [28]:
from sklearn.linear_model import Lasso

# The default iteration budget produced repeated coordinate-descent convergence
# warnings on this data, so give the solver more iterations.
model = build_pipeline_with_estimator(Lasso(max_iter=10000))

model.fit(X_train,y_train)

print('Lasso Train Performance.\n')
model_perf(model,X_train,y_train)

Lasso Train Performance.



  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Unnamed: 0,RMSE,MAE,MAPE,R^2,Adjusted R^2,Cross Val Score (Mean)
0,99646.336849,36217.660794,75.601018,0.246928,0.24312,0.284009
