In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('../data/processed/processed.csv')

In [3]:
df.shape

(3439, 9)

In [4]:
df.sample(3)

Unnamed: 0,kms_driven,owner,location,mileage,power,price,brand,engine,age
2080,10000.0,first,mumbai,35.0,19.8,150000.0,Royal Enfield,350.0,4
377,8958.0,first,bangalore,35.0,24.8,109059.0,UM,,3
501,25765.0,second,delhi,40.0,19.8,90000.0,Royal Enfield,350.0,7


In [5]:
# df = df.drop('location',axis=1)

In [6]:
X = df.drop('price',axis=1)
y = df['price']

In [7]:
X.head()

Unnamed: 0,kms_driven,owner,location,mileage,power,brand,engine,age
0,5947.0,first,other,53.0,19.0,Bajaj,,4
1,11000.0,first,delhi,40.0,19.8,Royal Enfield,350.0,7
2,13568.0,first,delhi,63.0,14.0,Suzuki,150.0,5
3,20000.0,first,other,40.0,19.8,Royal Enfield,350.0,3
4,10143.0,first,delhi,55.0,8.0,Hero,,3


In [8]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.2,random_state=1234)

In [9]:
y = y.apply(np.log1p)

## Category Encoding

In [10]:
X.head()

Unnamed: 0,kms_driven,owner,location,mileage,power,brand,engine,age
0,5947.0,first,other,53.0,19.0,Bajaj,,4
1,11000.0,first,delhi,40.0,19.8,Royal Enfield,350.0,7
2,13568.0,first,delhi,63.0,14.0,Suzuki,150.0,5
3,20000.0,first,other,40.0,19.8,Royal Enfield,350.0,3
4,10143.0,first,delhi,55.0,8.0,Hero,,3


## Impute missing values

In [11]:
from sklearn.impute import KNNImputer

num_cols = [ col for col in X.columns if X[col].dtypes != 'object' ]
print(num_cols)

['kms_driven', 'mileage', 'power', 'engine', 'age']


In [12]:
imputer = KNNImputer(n_neighbors=7,weights='distance')

X_train[num_cols] = imputer.fit_transform(X_train[num_cols])
X_test[num_cols] = imputer.fit_transform(X_test[num_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[num_cols] = imputer.fit_transform(X_train[num_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[num_cols] = imputer.fit_transform(X_test[num_cols])
A value is trying to be se

In [13]:
print('Missing Values in Train set',X_train.isnull().sum())
print('Missing Values in Test set',X_test.isnull().sum())

Missing Values in Train set kms_driven    0
owner         0
location      0
mileage       0
power         0
brand         0
engine        0
age           0
dtype: int64
Missing Values in Test set kms_driven    0
owner         0
location      0
mileage       0
power         0
brand         0
engine        0
age           0
dtype: int64


In [14]:
X_train.head()

Unnamed: 0,kms_driven,owner,location,mileage,power,brand,engine,age
2881,51791.0,first,jaipur,27.0,26.21,other,192.039983,6.0
322,15000.0,first,other,32.0,24.5,Royal Enfield,410.0,2.0
2035,25000.0,second,other,45.0,13.0,Yamaha,150.0,5.0
3176,30000.0,first,other,65.0,7.7,Hero,100.0,10.0
949,25050.0,first,delhi,32.0,27.2,Royal Enfield,500.0,3.0


In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,MinMaxScaler

category_transformer = ColumnTransformer([
    ("kms_driven_engine_min_max_scaler",MinMaxScaler(),[0,6]),
    ("owner_ordinal_enc",OrdinalEncoder(categories=[['fourth','third','second','first']],handle_unknown='ignore',dtype=np.int16),[1]),
    ("brand_location_ohe",OneHotEncoder(sparse=False,handle_unknown='ignore'),[2,5]),
],remainder='passthrough')



## Model Building

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn import set_config

set_config(display='diagram')

In [17]:

def build_pipeline_with_estimator(estimator):
    return Pipeline([
    ('category_transformer',category_transformer),
    ('estimator',estimator),
])


In [18]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn.model_selection import cross_val_score
# MAPE
def mape(targets, predictions):
    return np.mean(np.abs((targets - predictions)) / targets) * 100

# Adjusted R^2
def adj_r2(ind_vars, targets, predictions):
    r2 = r2_score(targets, predictions)
    n = ind_vars.shape[0]
    k = ind_vars.shape[1]
    return 1-((1-r2)*(n-1)/(n-k-1))

# Model performance check
def model_perf(model, inp, out):

    y_pred = model.predict(inp)
    y_act = out.values

    cross_val = cross_val_score(model, inp,out,cv=10)
    

    return pd.DataFrame({
                "RMSE": np.sqrt(mean_squared_error(y_act, y_pred)),
                "MAE": mean_absolute_error(y_act, y_pred),
                "MAPE": mape(y_act, y_pred),
                "R^2": r2_score(y_act, y_pred),
                "Adjusted R^2": adj_r2(inp, y_act, y_pred),
                "Cross Val Score (Mean)": cross_val.mean()
           }, index=[0])

## LinearRegression

In [19]:
linear_regressor = build_pipeline_with_estimator(LinearRegression())

linear_regressor.fit(X_train,y_train)

print('Linear Regression Train Performance.\n')
print(model_perf(linear_regressor,X_train,y_train))

print('Linear Regression Test Performance.\n')
print(model_perf(linear_regressor,X_test,y_test))

Linear Regression Train Performance.

           RMSE           MAE       MAPE       R^2  Adjusted R^2  \
0  17377.088754  12705.771052  19.164959  0.806824      0.806261   

   Cross Val Score (Mean)  
0                0.802044  
Linear Regression Test Performance.

           RMSE           MAE       MAPE      R^2  Adjusted R^2  \
0  16788.643163  12496.506763  19.288227  0.83424      0.832287   

   Cross Val Score (Mean)  
0                0.828572  


## RandomForest

In [28]:
from sklearn.ensemble import RandomForestRegressor

model = build_pipeline_with_estimator(RandomForestRegressor())

params = {
    # 'estimator__criterion': ['gini','entropy'],
    'estimator__n_estimators': [100],
    'estimator__max_depth': [5,10,15,20,25],
    'estimator__min_samples_split': range(2,11),
    'estimator__max_features': ['auto','sqrt','log2'],
}

model = GridSearchCV(model,params,cv=5)

model.fit(X_train,y_train)

print('RandomForest Train Performance.\n')
print(model_perf(model,X_train,y_train))

print('RandomForest Test Performance.\n')
print(model_perf(model,X_test,y_test))

RandomForest Train Performance.

           RMSE         MAE       MAPE       R^2  Adjusted R^2  \
0  10875.910979  7885.16297  11.146544  0.924329      0.924108   

   Cross Val Score (Mean)  
0                 0.85837  
RandomForest Test Performance.



## XGBoost

In [21]:
from xgboost import XGBRegressor

xgboost = build_pipeline_with_estimator(XGBRegressor())

xgboost.fit(X_train,y_train)

print('xgboost Train Performance.\n')
print(model_perf(xgboost,X_train,y_train))

print('xgboost Test Performance.\n')
print(model_perf(xgboost,X_test,y_test))

xgboost Train Performance.

          RMSE          MAE      MAPE       R^2  Adjusted R^2  \
0  7318.489135  4941.942287  6.838691  0.965736      0.965636   

   Cross Val Score (Mean)  
0                0.847749  
xgboost Test Performance.

           RMSE           MAE       MAPE       R^2  Adjusted R^2  \
0  14311.626508  10535.770303  14.929617  0.879545      0.878125   

   Cross Val Score (Mean)  
0                0.846364  


In [22]:
from sklearn.neighbors import KNeighborsRegressor

model = build_pipeline_with_estimator(KNeighborsRegressor())

# params = {
#     'estimator__n_neighbors': [3,5,7,9],
#     'estimator__algorithm': ['ball_tree','kd_tree','brute'],
#     'estimator__leaf_size': [5,10,30,32,35]
# }

# model = GridSearchCV(model,params,cv=5)

model.fit(X_train,y_train)

print('KNeighborsRegressor Train Performance.\n')
print(model_perf(model,X_train,y_train))

print('KNeighborsRegressor Test Performance.\n')
print(model_perf(model,X_test,y_test))

KNeighborsRegressor Train Performance.

           RMSE          MAE       MAPE       R^2  Adjusted R^2  \
0  12715.013751  9078.506652  12.636847  0.896573      0.896272   

   Cross Val Score (Mean)  
0                0.840922  
KNeighborsRegressor Test Performance.

           RMSE           MAE      MAPE       R^2  Adjusted R^2  \
0  15292.729172  10790.054942  15.34334  0.862463      0.860843   

   Cross Val Score (Mean)  
0                0.839745  


## GradientBoostingRegressor

In [24]:
from sklearn.ensemble import GradientBoostingRegressor

model = build_pipeline_with_estimator(GradientBoostingRegressor())

params = {
    'estimator__loss': ['ls','lad','huber','quantile'],
    'estimator__learning_rate': [0.1],
    'estimator__n_estimators': [100],
    'estimator__criterion': ['friedman_mse','mse','mae'],
}

model = GridSearchCV(model,params,cv=5)

model.fit(X_train,y_train)

print('GradientBoostingRegressor Train Performance.\n')
print(model_perf(model,X_train,y_train))

print('GradientBoostingRegressor Test Performance.\n')
print(model_perf(model,X_test,y_test))



GradientBoostingRegressor Train Performance.





           RMSE          MAE       MAPE       R^2  Adjusted R^2  \
0  13524.342994  9770.939403  13.895251  0.882988      0.882646   

   Cross Val Score (Mean)  
0                0.856745  
GradientBoostingRegressor Test Performance.





          RMSE           MAE       MAPE      R^2  Adjusted R^2  \
0  14103.02177  10133.308708  14.913283  0.88303      0.881652   

   Cross Val Score (Mean)  
0                0.855709  
