In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the CSV at ../data/processed/processed.csv (path is relative to the
# notebook's working directory) into `df`, the frame every later cell starts from.
df = pd.read_csv('../data/processed/processed.csv')

In [3]:
# (n_rows, n_columns) of the loaded frame.
df.shape

(5247, 9)

In [4]:
# Show three randomly drawn rows; no random_state is passed, so the rows shown
# change from run to run.
df.sample(3)

Unnamed: 0,kms_driven,owner,location,mileage,power,price,brand,engine,age
379,4700.0,first,bangalore,,20.7,190000.0,other,,1
4294,7700.0,first,other,,,127900.0,Royal Enfield,500.0,5
3249,23875.0,second,other,,,85000.0,Royal Enfield,350.0,7


In [5]:
# Separate the regression target (`price`) from the predictor columns.
y = df['price']
X = df.drop(columns=['price'])

In [6]:
# First five rows of the predictor frame (the `price` column has been dropped).
X.head()

Unnamed: 0,kms_driven,owner,location,mileage,power,brand,engine,age
0,15144.0,first,other,40.0,19.8,Royal Enfield,350.0,5
1,25000.0,first,chennai,35.0,19.8,Royal Enfield,350.0,3
2,5169.0,first,bangalore,,19.0,Yamaha,150.0,2
3,86728.0,first,other,42.0,16.0,Yamaha,150.0,11
4,4400.0,second,chennai,40.0,19.0,Bajaj,220.0,10


In [7]:
# Distinct values of the `brand` column.
df.brand.unique()

array(['Royal Enfield', 'Yamaha', 'Bajaj', 'other', 'Hero', 'TVS', 'KTM',
       'Honda', 'Harley-Davidson', 'Suzuki', 'UM'], dtype=object)

In [8]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows as the test split; random_state=11 fixes the shuffle so
# the same rows land in each split on every run.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.25,random_state=11)

In [9]:
# NOTE(review): this rebinds `y` only. When the cells run top to bottom, y_train and
# y_test were already produced by train_test_split from the untransformed prices, so
# this log1p transform never reaches them - confirm whether the models were meant to
# be trained on log1p(price) or whether this cell is a leftover.
y = y.apply(np.log1p)

## Category Encoding

In [10]:
# Look at the predictor frame again before preprocessing; `owner`, `location` and
# `brand` hold string categories.
X.head()

Unnamed: 0,kms_driven,owner,location,mileage,power,brand,engine,age
0,15144.0,first,other,40.0,19.8,Royal Enfield,350.0,5
1,25000.0,first,chennai,35.0,19.8,Royal Enfield,350.0,3
2,5169.0,first,bangalore,,19.0,Yamaha,150.0,2
3,86728.0,first,other,42.0,16.0,Yamaha,150.0,11
4,4400.0,second,chennai,40.0,19.0,Bajaj,220.0,10


## Impute missing values

In [11]:
from sklearn.impute import KNNImputer

# Columns handed to the KNN imputer. Select numeric dtypes explicitly: a
# "not object" test would also admit bool, datetime, categorical or string-dtype
# columns, none of which the imputer can process.
num_cols = X.select_dtypes(include='number').columns.tolist()
print(num_cols)

['kms_driven', 'mileage', 'power', 'engine', 'age']


In [12]:
# Fill missing numeric values from the 7 nearest rows, weighting closer rows more.
imputer = KNNImputer(n_neighbors=7,weights='distance')

# Work on explicit copies so the assignments below write into frames this cell
# owns rather than into a view or temporary derived from X.
X_train = X_train.copy()
X_test = X_test.copy()

# Remember the original dtypes: the imputer returns a float array, and casting back
# keeps integer columns that had no missing values as integers.
num_dtypes = X_train[num_cols].dtypes.to_dict()

# Fit on the training split only, then apply the same fitted imputer to the test
# split. Assign with a single indexing step: the former `frame.loc[:][cols] = ...`
# was chained assignment, which writes to a temporary object and leaves the frame
# untouched under pandas Copy-on-Write.
X_train[num_cols] = pd.DataFrame(
    imputer.fit_transform(X_train[num_cols]),
    index=X_train.index,
    columns=num_cols,
).astype(num_dtypes)
X_test[num_cols] = pd.DataFrame(
    imputer.transform(X_test[num_cols]),
    index=X_test.index,
    columns=num_cols,
).astype(num_dtypes)

In [13]:
# Per-column count of NaN cells in each split; every count should be 0 once the
# imputation cell has run.
print('Missing Values in Train set',X_train.isnull().sum())
print('Missing Values in Test set',X_test.isnull().sum())

Missing Values in Train set kms_driven    0
owner         0
location      0
mileage       0
power         0
brand         0
engine        0
age           0
dtype: int64
Missing Values in Test set kms_driven    0
owner         0
location      0
mileage       0
power         0
brand         0
engine        0
age           0
dtype: int64


In [14]:
# First five training rows after imputation.
X_train.head()

Unnamed: 0,kms_driven,owner,location,mileage,power,brand,engine,age
721,30000.0,first,mumbai,25.0,19.8,Royal Enfield,350.0,5
725,44000.0,first,other,5.0,8.2,Hero,100.0,2
4959,24000.0,first,other,54.585274,14.044563,other,110.0,6
4289,15000.0,first,bangalore,16.0,76.5,Hero,100.0,4
1021,6900.0,first,bangalore,35.0,19.8,Royal Enfield,350.0,4


In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,MinMaxScaler

# Column-wise preprocessing used as the first step of every model pipeline.
# Columns are addressed by name rather than by position, so the mapping stays
# correct if the frame's column order changes. Inputs must therefore be DataFrames.
#   * kms_driven / engine / mileage / power -> scaled to [0, 1]
#   * owner                                 -> ordinal code, 'fourth'=0 ... 'first'=3;
#                                              unseen labels are encoded as -1
#   * location / brand                      -> one-hot, first level dropped
#   * every other column (age)              -> passed through unchanged
category_transformer = ColumnTransformer(
    [
        (
            "kms_driven_engine_min_max_scaler",
            MinMaxScaler(),
            ['kms_driven', 'engine', 'mileage', 'power'],
        ),
        (
            "owner_ordinal_enc",
            # 'ignore' is not an accepted handle_unknown value for OrdinalEncoder;
            # 'use_encoded_value' is the supported way to tolerate unseen labels.
            OrdinalEncoder(
                categories=[['fourth', 'third', 'second', 'first']],
                handle_unknown='use_encoded_value',
                unknown_value=-1,
                dtype=np.int16,
            ),
            ['owner'],
        ),
        (
            "brand_location_ohe",
            OneHotEncoder(handle_unknown='error', drop='first'),
            ['location', 'brand'],
        ),
    ],
    remainder='passthrough',
    # Always return a dense array. This replaces OneHotEncoder(sparse=False), whose
    # keyword was renamed in newer scikit-learn releases, and works on old and new
    # versions alike.
    sparse_threshold=0,
)



## Model Building

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn import set_config

# Render estimators and pipelines as HTML diagrams when they are displayed.
set_config(display='diagram')

In [17]:

def build_pipeline_with_estimator(estimator):
    """Wrap `estimator` in a two-step pipeline: preprocessing, then the model.

    Parameters
    ----------
    estimator : scikit-learn compatible regressor
        Used as the final 'estimator' step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Steps are named 'category_transformer' and 'estimator'.
    """
    # Local import keeps this cell self-contained.
    from sklearn.base import clone

    # Give each pipeline its own unfitted copy of the shared transformer.
    # Otherwise all pipelines hold the same object, and fitting one of them
    # refits the preprocessing step inside the others.
    return Pipeline([
        ('category_transformer', clone(category_transformer)),
        ('estimator', estimator),
    ])


In [18]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn.model_selection import cross_val_score
# MAPE
def mape(targets, predictions):
    """Return the mean absolute percentage error, in percent.

    Parameters
    ----------
    targets : array-like
        Observed values. A zero target produces an infinite or NaN result.
    predictions : array-like
        Predicted values, same length as `targets`.

    Returns
    -------
    float
        mean(|target - prediction| / |target|) * 100
    """
    # Convert up front so plain lists work and no index alignment takes place.
    targets = np.asarray(targets, dtype=float)
    predictions = np.asarray(predictions, dtype=float)
    # Divide by |target|: with a signed denominator, negative targets would
    # contribute negative terms and cancel out real errors.
    return np.mean(np.abs(targets - predictions) / np.abs(targets)) * 100

# Adjusted R^2
def adj_r2(ind_vars, targets, predictions):
    """Return R^2 adjusted for the number of predictors.

    `ind_vars` supplies only its shape: rows are taken as the sample count and
    columns as the predictor count. The score is
    1 - (1 - R^2) * (n - 1) / (n - k - 1).
    """
    plain_r2 = r2_score(targets, predictions)
    n_samples = ind_vars.shape[0]
    n_predictors = ind_vars.shape[1]
    numerator = (1 - plain_r2) * (n_samples - 1)
    denominator = n_samples - n_predictors - 1
    return 1 - (numerator / denominator)

# Model performance check
def model_perf(model, inp, out, cross_val=True):
    """Summarise how a fitted `model` performs on the pair (`inp`, `out`).

    Parameters
    ----------
    model : fitted estimator exposing `predict`
    inp : feature frame passed to `model.predict`; its shape also feeds adj_r2
    out : target with a `.values` attribute (e.g. a pandas Series)
    cross_val : bool, default True
        When true, also report the mean of a 10-fold `cross_val_score` run on
        (`inp`, `out`); otherwise that column holds None.

    Returns
    -------
    pandas.DataFrame
        One row with RMSE, MAE, MAPE, R^2, adjusted R^2 and the mean CV score.
    """
    predicted = model.predict(inp)
    actual = out.values

    if cross_val:
        cv_mean = cross_val_score(model, inp, out, cv=10).mean()
    else:
        cv_mean = None

    metrics = {
        "RMSE": np.sqrt(mean_squared_error(actual, predicted)),
        "MAE": mean_absolute_error(actual, predicted),
        "MAPE": mape(actual, predicted),
        "R^2": r2_score(actual, predicted),
        "Adjusted R^2": adj_r2(inp, actual, predicted),
        "Cross Val Score (Mean)": cv_mean,
    }
    return pd.DataFrame(metrics, index=[0])

## LinearRegression

In [19]:
# Fit the linear-regression pipeline on the training split.
linear_regressor = build_pipeline_with_estimator(LinearRegression())
linear_regressor.fit(X_train, y_train)

# Report metrics for each split. With cross_val=True, model_perf also runs its
# 10-fold cross-validation on whichever split it is handed.
print('Linear Regression Train Performance.\n')
print(model_perf(linear_regressor, X_train, y_train, cross_val=True))

print('Linear Regression Test Performance.\n')
print(model_perf(linear_regressor, X_test, y_test, cross_val=True))

Linear Regression Train Performance.

           RMSE          MAE       MAPE       R^2  Adjusted R^2  \
0  74067.499924  32315.44057  46.937635  0.527917      0.526955   

   Cross Val Score (Mean)  
0                0.537539  
Linear Regression Test Performance.

           RMSE           MAE       MAPE       R^2  Adjusted R^2  \
0  62603.785879  29924.689347  41.993161  0.642889      0.640696   

   Cross Val Score (Mean)  
0                0.632615  


## RandomForest

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Random forest with default hyper-parameters. A fixed random_state makes the
# bootstrap sampling and feature sub-sampling - and so the metrics printed below -
# repeatable from run to run.
model = build_pipeline_with_estimator(RandomForestRegressor(random_state=11))

# Hyper-parameter search, currently disabled:
# params = {
#     'estimator__criterion': ['mse','mae'],
#     'estimator__n_estimators': [100,110,120,130],
#     'estimator__max_depth': [5,10,15,20,25,30],
#     'estimator__min_samples_split': range(2,20),
#     'estimator__max_features': ['auto','sqrt','log2'],
# }
# model = GridSearchCV(model,params,cv=5)
# model = RandomizedSearchCV(model,params,cv=5)

model.fit(X_train,y_train)

# cross_val=False: skip the 10-fold cross-validation for this model.
print('RandomForest Train Performance.\n')
print(model_perf(model,X_train,y_train,False))

print('RandomForest Test Performance.\n')
print(model_perf(model,X_test,y_test,False))

RandomForest Train Performance.

          RMSE          MAE       MAPE       R^2  Adjusted R^2  \
0  28328.24373  8427.722809  10.155582  0.930944      0.930803   

  Cross Val Score (Mean)  
0                   None  
RandomForest Test Performance.

           RMSE           MAE       MAPE       R^2  Adjusted R^2  \
0  52904.918067  19982.766959  25.200901  0.744968      0.743402   

  Cross Val Score (Mean)  
0                   None  


## XGBoost

In [21]:
from xgboost import XGBRegressor

# XGBoost regressor with default hyper-parameters, behind the shared preprocessing.
xgboost = build_pipeline_with_estimator(XGBRegressor())
xgboost.fit(X_train, y_train)

# model_perf's cross-validation is on by default; spell it out here.
print('xgboost Train Performance.\n')
print(model_perf(xgboost, X_train, y_train, cross_val=True))

print('xgboost Test Performance.\n')
print(model_perf(xgboost, X_test, y_test, cross_val=True))

xgboost Train Performance.

           RMSE          MAE       MAPE       R^2  Adjusted R^2  \
0  20912.394739  8331.223395  12.526976  0.962367       0.96229   

   Cross Val Score (Mean)  
0                0.624242  
xgboost Test Performance.

           RMSE           MAE      MAPE       R^2  Adjusted R^2  \
0  58745.333728  20437.017196  26.19055  0.685552      0.683621   

   Cross Val Score (Mean)  
0                0.641632  


## KNeighborsRegressor

In [22]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regressor with default hyper-parameters.
model = build_pipeline_with_estimator(KNeighborsRegressor())

# Grid search over the neighbour settings, currently disabled:
# params = {
#     'estimator__n_neighbors': [3,5,7,9],
#     'estimator__algorithm': ['ball_tree','kd_tree','brute'],
#     'estimator__leaf_size': [5,10,30,32,35]
# }
# model = GridSearchCV(model,params,cv=5)

model.fit(X_train, y_train)

# model_perf's cross-validation is on by default; spell it out here.
print('KNeighborsRegressor Train Performance.\n')
print(model_perf(model, X_train, y_train, cross_val=True))

print('KNeighborsRegressor Test Performance.\n')
print(model_perf(model, X_test, y_test, cross_val=True))

KNeighborsRegressor Train Performance.

           RMSE           MAE       MAPE       R^2  Adjusted R^2  \
0  61522.959688  20135.561329  22.646566  0.674285      0.673622   

   Cross Val Score (Mean)  
0                0.488992  
KNeighborsRegressor Test Performance.

           RMSE           MAE       MAPE      R^2  Adjusted R^2  \
0  58408.246023  23184.219927  26.674588  0.68915      0.687241   

   Cross Val Score (Mean)  
0                0.498899  


## GradientBoostingRegressor

In [23]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with default hyper-parameters. A fixed random_state pins the
# estimator's internal randomness so the metrics printed below are repeatable.
model = build_pipeline_with_estimator(GradientBoostingRegressor(random_state=11))

# Hyper-parameter search, currently disabled:
# params = {
#     'estimator__loss': ['ls','lad','huber','quantile'],
#     'estimator__learning_rate': [0.1,0.01,0.001],
#     'estimator__n_estimators': [100,110,120],
#     'estimator__criterion': ['friedman_mse','mse'],
# }

# model = GridSearchCV(model,params,cv=5)

model.fit(X_train,y_train)

print('GradientBoostingRegressor Train Performance.\n')
print(model_perf(model,X_train,y_train))

print('GradientBoostingRegressor Test Performance.\n')
print(model_perf(model,X_test,y_test))

GradientBoostingRegressor Train Performance.

           RMSE         MAE       MAPE       R^2  Adjusted R^2  \
0  42020.621993  19015.6786  26.848839  0.848054      0.847745   

   Cross Val Score (Mean)  
0                0.690897  
GradientBoostingRegressor Test Performance.

           RMSE           MAE       MAPE       R^2  Adjusted R^2  \
0  45619.603297  20724.899823  26.635422  0.810371      0.809206   

   Cross Val Score (Mean)  
0                0.757433  
