In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the dataset from the processed-data CSV; the path is relative to the
# directory the notebook kernel is started in.
df = pd.read_csv('../data/processed/processed.csv')

In [3]:
# Peek at three randomly chosen rows. No random_state is passed, so the rows
# shown differ from run to run.
df.sample(3)

Unnamed: 0,brand,model_year,kms_driven,owner,location,price
1268,TVS,2019,22000.0,1st,other,32000
2960,Hero,2013,1.0,1st,Delhi,36000
3953,Bajaj,2018,15000.0,1st,Bangalore,76775


In [4]:
# Summarise column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5504 entries, 0 to 5503
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   brand       5504 non-null   object 
 1   model_year  5504 non-null   int64  
 2   kms_driven  5504 non-null   float64
 3   owner       5504 non-null   object 
 4   location    5504 non-null   object 
 5   price       5504 non-null   int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 258.1+ KB


In [5]:
# scale price value
# df['price'] = df['price'].apply(np.log1p)

## Split Data

In [6]:
# The target is the `price` column; every other column is kept as a predictor.
y = df['price']
X = df.drop(columns='price')

## Category Encoding

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,MinMaxScaler


In [8]:
# Distinct `owner` labels, in reverse order of their first appearance in df.
# NOTE(review): this ordering follows the data, not the 1st..5th ranking, and
# the list is not referenced again in the cells visible here.
owner_categories = df['owner'].unique()[::-1].tolist()
owner_categories

['4th', '5th', '2nd', '3rd', '1st']

In [9]:
# Show a single randomly chosen row (unseeded, so it changes on every run).
df.sample()

Unnamed: 0,brand,model_year,kms_driven,owner,location,price
4540,Royal Enfield,2012,66000.0,1st,other,65000


In [10]:
# ['5th','4th','3rd','2nd','1st']

In [11]:
# Preprocessing applied to the feature frame X. Columns are addressed by
# position in X (df without `price`):
#   0 = brand      -> one-hot encoded
#   1 = model_year -> passed through unchanged (remainder='passthrough')
#   2 = kms_driven -> scaled to [0, 1]
#   3 = owner      -> ordinal encoded
#   4 = location   -> one-hot encoded
# NOTE: `sparse=False` is kept as written; scikit-learn >= 1.2 renames it to
# `sparse_output`, so adjust it if the environment is upgraded.
category_transformer = ColumnTransformer([
    ("brand_ohe",OneHotEncoder(dtype=np.int16,sparse=False,handle_unknown='ignore'),[0]),
    ("kms_driven_min_max_scaler",MinMaxScaler(),[2]),
    # OrdinalEncoder only supports handle_unknown='error' or 'use_encoded_value'
    # ('ignore' is a OneHotEncoder option). The category order is pinned so the
    # integer codes do not depend on which labels happen to be present in the
    # split being fitted (1st -> 0 ... 5th -> 4); unseen labels map to -1
    # instead of colliding with a real category.
    ("owner_ordinal_enc",OrdinalEncoder(
        categories=[['1st','2nd','3rd','4th','5th']],
        handle_unknown='use_encoded_value',
        unknown_value=-1,
        dtype=np.int16,
    ),[3]),
    ("location_ohe",OneHotEncoder(dtype=np.int16, sparse=False,handle_unknown='ignore'),[4]),
],remainder='passthrough')



In [12]:
# Scale the price data
# y = MinMaxScaler().fit_transform([y])[0]

In [13]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

## Model Building

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn import set_config

# Display estimators and pipelines as HTML diagrams when they are the cell output.
set_config(display='diagram')

In [15]:
def build_pipeline_with_estimator(estimator):
    """Return a two-step Pipeline: shared preprocessing followed by `estimator`.

    Parameters
    ----------
    estimator : object
        The final regressor placed in the 'estimator' step.

    Returns
    -------
    Pipeline
        Steps are 'category_transformer' (the module-level ColumnTransformer)
        and 'estimator'.

    Notes
    -----
    The module-level `category_transformer` object itself is placed in the
    pipeline (it is not copied), so every pipeline built here refers to the
    same transformer instance.
    """
    steps = [
        ('category_transformer', category_transformer),
        ('estimator', estimator),
    ]
    return Pipeline(steps)


In [16]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn.model_selection import cross_val_score
# MAPE
def mape(targets, predictions):
    """Mean absolute percentage error, expressed in percent.

    Parameters
    ----------
    targets : array-like
        Ground-truth values.
    predictions : array-like
        Predicted values, same length as `targets`.

    Returns
    -------
    float
        mean(|targets - predictions| / |targets|) * 100.

    Notes
    -----
    The denominator uses the absolute target so negative targets cannot yield
    a negative "absolute" error, and it is floored at machine epsilon so a zero
    target does not produce inf/NaN (a zero target predicted exactly
    contributes 0; otherwise it contributes a very large value).
    """
    # Coerce to float arrays so plain lists work and arithmetic is positional.
    actual = np.asarray(targets, dtype=float)
    predicted = np.asarray(predictions, dtype=float)

    denominator = np.maximum(np.abs(actual), np.finfo(float).eps)
    return np.mean(np.abs(actual - predicted) / denominator) * 100

# Adjusted R^2
def adj_r2(ind_vars, targets, predictions):
    """Adjusted R^2 of `predictions` against `targets`.

    Parameters
    ----------
    ind_vars : object with a 2-D `.shape`
        Only its shape is read: shape[0] is used as the sample count n and
        shape[1] as the predictor count k.
    targets, predictions : array-like
        Passed straight to `r2_score`.

    Returns
    -------
    float
        1 - (1 - R^2) * (n - 1) / (n - k - 1).
    """
    plain_r2 = r2_score(targets, predictions)
    n_rows = ind_vars.shape[0]
    n_cols = ind_vars.shape[1]
    penalty = (1 - plain_r2) * (n_rows - 1) / (n_rows - n_cols - 1)
    return 1 - penalty

# Model performance check
def model_perf(model, inp, out):
    """Build a one-row DataFrame of regression metrics for `model` on (inp, out).

    Parameters
    ----------
    model : fitted estimator
        Must expose `predict`; it is also handed to `cross_val_score`.
    inp : 2-D feature container with `.shape`
        Features to predict on; its shape also feeds the adjusted R^2.
    out : object exposing `.values`
        True target values for `inp`.

    Returns
    -------
    pandas.DataFrame
        Single row (index 0) with columns RMSE, MAE, MAPE, R^2, Adjusted R^2
        and "Cross Val Score (Mean)".

    Notes
    -----
    The cross-validation score is computed with 10 folds on the very data
    passed in (`inp`, `out`), independently of the predictions used for the
    other metrics.
    """
    predicted = model.predict(inp)
    actual = out.values

    fold_scores = cross_val_score(model, inp, out, cv=10)

    # Insertion order of the keys determines the column order of the result.
    metrics = {}
    metrics["RMSE"] = np.sqrt(mean_squared_error(actual, predicted))
    metrics["MAE"] = mean_absolute_error(actual, predicted)
    metrics["MAPE"] = mape(actual, predicted)
    metrics["R^2"] = r2_score(actual, predicted)
    metrics["Adjusted R^2"] = adj_r2(inp, actual, predicted)
    metrics["Cross Val Score (Mean)"] = fold_scores.mean()

    return pd.DataFrame(metrics, index=[0])

## LinearRegression

In [17]:
# Fit the linear-regression pipeline on the training split.
liner_regressor = build_pipeline_with_estimator(LinearRegression())
liner_regressor.fit(X_train, y_train)

# Print the metric table for the training split first, then the test split.
for heading, features, target in (
    ('Linear Regression Train Performance.\n', X_train, y_train),
    ('Linear Regression Test Performance.\n', X_test, y_test),
):
    print(heading)
    print(model_perf(liner_regressor, features, target))

Linear Regression Train Performance.

           RMSE           MAE       MAPE       R^2  Adjusted R^2  \
0  71828.754902  28676.181694  61.692046  0.469374      0.468771   

   Cross Val Score (Mean)  
0                 0.49132  
Linear Regression Test Performance.

           RMSE           MAE       MAPE       R^2  Adjusted R^2  \
0  56734.684908  27513.379655  56.300424  0.579489      0.577569   

   Cross Val Score (Mean)  
0                0.556035  


## RandomForest

In [18]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so the fitted trees -- and therefore every metric printed
# below, including the cross-validation refits -- are reproducible across runs.
# The seed matches the one used for the train/test split.
rf = build_pipeline_with_estimator(RandomForestRegressor(random_state=1234))

rf.fit(X_train,y_train)

print('RandomForest Train Performance.\n')
print(model_perf(rf,X_train,y_train))

print('RandomForest Test Performance.\n')
print(model_perf(rf,X_test,y_test))

RandomForest Train Performance.

           RMSE          MAE       MAPE       R^2  Adjusted R^2  \
0  28907.447557  9798.218158  18.050987  0.914057      0.913959   

   Cross Val Score (Mean)  
0                0.541148  
RandomForest Test Performance.

           RMSE           MAE       MAPE       R^2  Adjusted R^2  \
0  61552.069138  24468.038992  43.527128  0.505045      0.502785   

   Cross Val Score (Mean)  
0                 0.56777  


## XGBoost

In [20]:
from xgboost import XGBRegressor

# Fit the XGBoost pipeline on the training split.
xgboost = build_pipeline_with_estimator(XGBRegressor())
xgboost.fit(X_train, y_train)

# Print the metric table for the training split first, then the test split.
for heading, features, target in (
    ('xgboost Train Performance.\n', X_train, y_train),
    ('xgboost Test Performance.\n', X_test, y_test),
):
    print(heading)
    print(model_perf(xgboost, features, target))

xgboost Train Performance.

           RMSE           MAE       MAPE       R^2  Adjusted R^2  \
0  20173.559251  12604.935477  26.866738  0.958144      0.958097   

   Cross Val Score (Mean)  
0                0.451688  
xgboost Test Performance.

           RMSE           MAE       MAPE       R^2  Adjusted R^2  \
0  61331.750659  24542.414297  42.783103  0.508582      0.506338   

   Cross Val Score (Mean)  
0                0.470177  
