In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [21]:
df = pd.read_csv('../data/processed/processed.csv')

In [22]:
df.shape

(1833, 9)

In [23]:
df.sample(3)

Unnamed: 0,kms_driven,owner,location,mileage,power,price,brand,engine,age
288,10000.0,first,other,40.0,18.3,140000.0,Yamaha,150.0,1
304,16586.0,first,jaipur,35.0,24.6,130000.0,KTM,200.0,3
511,58000.0,first,delhi,38.0,21.0,34600.0,Bajaj,220.0,8


In [24]:
X = df.drop('price',axis=1)
y = df['price']

In [25]:
X.head()

Unnamed: 0,kms_driven,owner,location,mileage,power,brand,engine,age
0,5947.0,first,other,53.0,19.0,Bajaj,,4
1,11000.0,first,delhi,40.0,19.8,Royal Enfield,350.0,7
2,13568.0,first,delhi,63.0,14.0,Suzuki,150.0,5
3,20000.0,first,other,40.0,19.8,Royal Enfield,350.0,3
4,6000.0,first,delhi,35.0,19.1,Royal Enfield,350.0,1


In [26]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.2,random_state=1234)

In [27]:
y = y.apply(np.log1p)

## Category Encoding

In [9]:
# ## Change the owner category to numerical encoding
# owners_ranks = {
#     '1st': 5,
#     '2nd': 4,
#     '3rd': 3,
#     '4th': 2,
#     '5th': 1,
    
# }

# df['owner'] = df['owner'].apply(lambda x: owners_ranks.get(x,1))

# df.info()

In [10]:
# ['5th','4th','3rd','2nd','1st']

In [28]:
X.sample()

Unnamed: 0,kms_driven,owner,location,mileage,power,brand,engine,age
1579,48000.0,fourth,jaipur,62.0,11.64,Bajaj,125.0,1


In [29]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,MinMaxScaler
from sklearn.impute import KNNImputer

imputer_transformer = ColumnTransformer([
    ('knn_imputer',KNNImputer(n_neighbors=5),[0,3,4,6])
])

category_transformer = ColumnTransformer([
    ("brand_ohe",OneHotEncoder(dtype=np.int16,sparse=False,handle_unknown='ignore'),[5]),
    ("kms_driven_engine_min_max_scaler",MinMaxScaler(),[0,6]),
    ("owner_ordinal_enc",OrdinalEncoder(categories='auto',handle_unknown='ignore',dtype=np.int16),[3]),
    ("location_ohe",OneHotEncoder(dtype=np.int16, sparse=False,handle_unknown='ignore'),[2]),
],remainder='passthrough')



In [14]:
# Scale the price data
# y = MinMaxScaler().fit_transform([y])[0]

## Model Building

In [30]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn import set_config

set_config(display='diagram')

In [31]:

def build_pipeline_with_estimator(estimator):
    return Pipeline([
    ('imputer',imputer_transformer),
    ('category_transformer',category_transformer),
    ('estimator',estimator),
])


In [17]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn.model_selection import cross_val_score
# MAPE
def mape(targets, predictions):
    return np.mean(np.abs((targets - predictions)) / targets) * 100

# Adjusted R^2
def adj_r2(ind_vars, targets, predictions):
    r2 = r2_score(targets, predictions)
    n = ind_vars.shape[0]
    k = ind_vars.shape[1]
    return 1-((1-r2)*(n-1)/(n-k-1))

# Model performance check
def model_perf(model, inp, out):

    y_pred = model.predict(inp)
    y_act = out.values

    cross_val = cross_val_score(model, inp,out,cv=10)
    

    return pd.DataFrame({
                "RMSE": np.sqrt(mean_squared_error(y_act, y_pred)),
                "MAE": mean_absolute_error(y_act, y_pred),
                "MAPE": mape(y_act, y_pred),
                "R^2": r2_score(y_act, y_pred),
                "Adjusted R^2": adj_r2(inp, y_act, y_pred),
                "Cross Val Score (Mean)": cross_val.mean()
           }, index=[0])

In [18]:
# linear_regressor = build_pipeline_with_estimator(LinearRegression())
# scores = []
# for i in range(1000):
#     X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.2,random_state=1)

#     linear_regressor.fit(X_train,y_train)

#     y_pred = linear_regressor.predict(X_test)

#     s = r2_score(y_test,y_pred)

#     scores.append(s)

# print(scores)
    
# liner_regressor.fit(X_train,y_train)

# print('Linear Regression Train Performance.\n')
# print(model_perf(liner_regressor,X_train,y_train))

# print('Linear Regression Test Performance.\n')
# print(model_perf(liner_regressor,X_test,y_test))

## LinearRegression

In [34]:
linear_regressor = build_pipeline_with_estimator(LinearRegression())

# params = {}

# liner_regressor = GridSearchCV(liner_regressor,params,cv=10)

linear_regressor.fit(X_train,y_train)

print('Linear Regression Train Performance.\n')
print(model_perf(linear_regressor,X_train,y_train))

print('Linear Regression Test Performance.\n')
print(model_perf(linear_regressor,X_test,y_test))

ValueError: all features must be in [0, 3] or [-4, 0]

## RandomForest

In [None]:
from sklearn.ensemble import RandomForestRegressor

model = build_pipeline_with_estimator(RandomForestRegressor())

# params = {
#     'estimator__criterion': ['gini','entropy'],
#     'estimator__n_estimators': [100],
#     'estimator__max_depth': [5,10,15,20,25],
#     'estimator__min_samples_split': range(2,11),
#     'estimator__max_features': ['auto','sqrt','log2'],
# }

# model = GridSearchCV(model,params,cv=10)

model.fit(X_train,y_train)

print('RandomForest Train Performance.\n')
print(model_perf(model,X_train,y_train))

print('RandomForest Test Performance.\n')
print(model_perf(model,X_test,y_test))

RandomForest Train Performance.

       RMSE       MAE      MAPE       R^2  Adjusted R^2  \
0  0.230062  0.152402  1.399345  0.893729      0.893567   

   Cross Val Score (Mean)  
0                  0.5461  
RandomForest Test Performance.

       RMSE       MAE      MAPE      R^2  Adjusted R^2  Cross Val Score (Mean)
0  0.459848  0.324745  2.967276  0.57458      0.571989                 0.47915


## XGBoost

In [None]:
from xgboost import XGBRegressor

xgboost = build_pipeline_with_estimator(XGBRegressor())

xgboost.fit(X_train,y_train)

print('xgboost Train Performance.\n')
print(model_perf(xgboost,X_train,y_train))

print('xgboost Test Performance.\n')
print(model_perf(xgboost,X_test,y_test))

xgboost Train Performance.

       RMSE       MAE      MAPE       R^2  Adjusted R^2  \
0  0.279987  0.199967  1.840092  0.842602      0.842363   

   Cross Val Score (Mean)  
0                0.578851  
xgboost Test Performance.

       RMSE       MAE      MAPE       R^2  Adjusted R^2  \
0  0.445467  0.316339  2.889716  0.600771       0.59834   

   Cross Val Score (Mean)  
0                  0.4764  


In [None]:
from sklearn.neighbors import KNeighborsRegressor

model = build_pipeline_with_estimator(KNeighborsRegressor())

# params = {
#     'estimator__n_neighbors': [3,5,7,9],
#     'estimator__algorithm': ['ball_tree','kd_tree','brute'],
#     'estimator__leaf_size': [5,10,30,32,35]
# }

# model = GridSearchCV(model,params,cv=5)

model.fit(X_train,y_train)

print('KNeighborsRegressor Train Performance.\n')
print(model_perf(model,X_train,y_train))

print('KNeighborsRegressor Test Performance.\n')
print(model_perf(model,X_test,y_test))

KNeighborsRegressor Train Performance.

       RMSE       MAE      MAPE       R^2  Adjusted R^2  \
0  0.388821  0.274376  2.511198  0.696453      0.695993   

   Cross Val Score (Mean)  
0                0.540933  
KNeighborsRegressor Test Performance.

       RMSE       MAE      MAPE       R^2  Adjusted R^2  \
0  0.479743  0.341066  3.110378  0.536971      0.534151   

   Cross Val Score (Mean)  
0                0.455039  


## GradientBoostingRegressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

model = build_pipeline_with_estimator(GradientBoostingRegressor())

# params = {
#     'estimator__loss': ['ls','lad','huber','quantile'],
#     'estimator__learning_rate': [0.1],
#     'estimator__n_estimators': [100],
#     'estimator__criterion': ['friedman_mse','mse','mae'],
# }

# model = GridSearchCV(model,params,cv=5)

model.fit(X_train,y_train)

print('GradientBoostingRegressor Train Performance.\n')
print(model_perf(model,X_train,y_train))

print('GradientBoostingRegressor Test Performance.\n')
print(model_perf(model,X_test,y_test))

GradientBoostingRegressor Train Performance.

       RMSE       MAE      MAPE       R^2  Adjusted R^2  \
0  0.419139  0.314055  2.896082  0.647271      0.646737   

   Cross Val Score (Mean)  
0                0.598646  
GradientBoostingRegressor Test Performance.

       RMSE       MAE      MAPE       R^2  Adjusted R^2  \
0  0.443787  0.329871  3.025058  0.603778      0.601365   

   Cross Val Score (Mean)  
0                0.558512  
