# Load Libraries

In [1385]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from category_encoders import TargetEncoder
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_absolute_percentage_error

from statsmodels.stats.outliers_influence import variance_inflation_factor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


import joblib

# Load data

In [1707]:
file_path = (r"C:\Users\aravit01\OneDrive - Kearney\1. RAVI TEJA\12. My Learnings\Portfolio Projects to Showcase\Car Price Prediction - Regression\CAR DETAILS FROM CAR DEKHO.csv")
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [55]:
df.shape

(4340, 8)

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4340 non-null   object
 1   year           4340 non-null   int64 
 2   selling_price  4340 non-null   int64 
 3   km_driven      4340 non-null   int64 
 4   fuel           4340 non-null   object
 5   seller_type    4340 non-null   object
 6   transmission   4340 non-null   object
 7   owner          4340 non-null   object
dtypes: int64(3), object(5)
memory usage: 271.4+ KB


In [57]:
df.isnull().sum()

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64

In [58]:
df.describe()

Unnamed: 0,year,selling_price,km_driven
count,4340.0,4340.0,4340.0
mean,2013.090783,504127.3,66215.777419
std,4.215344,578548.7,46644.102194
min,1992.0,20000.0,1.0
25%,2011.0,208749.8,35000.0
50%,2014.0,350000.0,60000.0
75%,2016.0,600000.0,90000.0
max,2020.0,8900000.0,806599.0


In [59]:
df.duplicated().sum()

763

In [67]:
df.drop_duplicates(keep='first', inplace = True)

In [71]:
df.duplicated().sum()

0

In [72]:
df.columns

Index(['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner'],
      dtype='object')

In [73]:
cat_cols = df.select_dtypes(include='object').columns
num_cols = df.select_dtypes(exclude='object').columns

In [74]:
num_cols

Index(['year', 'selling_price', 'km_driven'], dtype='object')

observations:
- we dont have null values
- check for duplicates.
- duplicate values are dropped by keeping first row
- check any transformations needed for categorical features.
- check any imputations needed for numerical features
- target feature = selling_price
- currently we are skipping EDA, visualization step for quick processing of model


In [433]:
def get_iqr(df, col):
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3-q1
    upper_bound = q3 + 1.5*iqr
    lower_bound = q1 - 1.5*iqr
    return (lower_bound, upper_bound)


In [435]:
lower_bound, upper_bound

(2019.0, 2007.0)

In [440]:
get_iqr(df, 'year')

(2001.0, 2025.0)

In [438]:
df[df['year'] < get_iqr(df, 'year')[0]].shape, df[df['year'] > get_iqr(df, 'year')[1]].shape

((37, 8), (0, 8))

In [441]:
get_iqr(df, 'km_driven')

(-45000.0, 171000.0)

In [1180]:
get_iqr(df, 'year')

(2002.0, 2026.0)

In [439]:
df[df['km_driven'] < get_iqr(df, 'km_driven')[0]].shape, df[df['km_driven'] > get_iqr(df, 'km_driven')[1]].shape

((0, 8), (106, 8))

In [464]:
df[df['km_driven'] > 200000].shape

(53, 8)

observation:
- based on bussiness discussion, we considered outliers with kn_driver > 200000 & km_driven < 1000

In [459]:
df[(df['km_driven'] < 5000) & (df['owner'] == 'Second Owner') ]

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
144,Hyundai EON Era Plus,2013,280000,3240,Petrol,Individual,Manual,Second Owner
552,Renault Duster 85PS Diesel RxL,2013,450000,1000,Diesel,Dealer,Manual,Second Owner
1312,Mahindra Quanto C6,2014,250000,1,Diesel,Individual,Manual,Second Owner
1579,Mahindra Jeep Classic,1999,170000,2020,Diesel,Individual,Manual,Second Owner
2485,Maruti Alto K10 VXI AGS,2015,281000,4432,Petrol,Dealer,Automatic,Second Owner
2667,Ford Ikon 1.3 Flair,2005,61000,4637,Petrol,Individual,Manual,Second Owner
3700,Datsun GO Plus T BSIV,2018,400000,4400,Petrol,Individual,Manual,Second Owner


In [1183]:
df = df[~(df['km_driven'] > 170000)]

In [1184]:
df = df[~(df['km_driven'] < 5000)]

In [1185]:
df = df[~(df['year'] < 2000)]

In [1571]:
df['owner'].unique()

array(['First Owner', 'Second Owner', 'Fourth & Above Owner',
       'Third Owner', 'Test Drive Car'], dtype=object)

In [1572]:
owner_mapping = {
    'First Owner' : 1,
    'Second Owner' : 2,
    'Third Owner' : 3,
    'Fourth & Above Owner' : 4,
    'Test Drive Car' : 5
}

In [1579]:
df['owner'] = df['owner'].map(owner_mapping)

In [1580]:
df.shape

(4067, 8)

# data split

In [1581]:
X = df.drop('selling_price', axis = 1)
y = df['selling_price']

In [1582]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [1583]:
X_train.shape, y_test.shape, X_test.shape, y_train.shape

((3253, 7), (814,), (814, 7), (3253,))

In [1191]:
# X_train = X_train[~(X_train['km_driven'] > 200000) | (X_train['km_driven'] < 1000) ]
# X_test = X_test[~(X_test['km_driven'] > 200000) | (X_test['km_driven'] < 1000) ]

In [1192]:
# train_com_index = X_train.index.intersection(y_train.index)
# test_com_index = X_test.index.intersection(y_test.index)

In [1193]:
# y_train = y_train.loc[train_com_index]
# y_test = y_test.loc[test_com_index]

In [1194]:
# X_train.shape, y_test.shape, X_test.shape, y_train.shape

In [1584]:
X_train

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner
3691,Ford Ecosport 1.5 DV5 MT Titanium,2014,63356,Diesel,Dealer,Manual,2
1717,Ford Figo 1.5D Titanium Opt MT,2015,52328,Diesel,Dealer,Manual,1
2006,Maruti Alto LXi,2008,70000,Petrol,Individual,Manual,1
3594,Mahindra Scorpio VLX 2WD AIRBAG BSIV,2014,100000,Diesel,Individual,Manual,2
470,Hyundai i10 Era 1.1,2009,120000,Petrol,Individual,Manual,2
...,...,...,...,...,...,...,...
1200,Maruti Ignis 1.2 AMT Delta BSIV,2017,15000,Petrol,Individual,Automatic,1
1373,Renault KWID RXL,2016,38000,Petrol,Individual,Manual,1
915,Maruti Swift Dzire VDI,2017,46507,Diesel,Trustmark Dealer,Manual,1
3750,Ford Figo Diesel ZXI,2014,90000,Diesel,Individual,Manual,1


In [1585]:
X_train.select_dtypes(include='object').columns

Index(['name', 'fuel', 'seller_type', 'transmission'], dtype='object')

In [1586]:
X_train['car_name'] = X_train['name'].str.split(" ", expand=True)[0]
X_test['car_name'] = X_test['name'].str.split(" ", expand=True)[0]

In [1587]:
X_train['model'] = X_train['name'].str.split(" ", expand=True).iloc[:,1:].fillna("").apply(lambda x: " ".join(x).strip(), axis = 1)
X_test['model'] = X_test['name'].str.split(" ", expand=True).iloc[:,1:].fillna("").apply(lambda x: " ".join(x).strip(), axis = 1)

In [1588]:
X_train.drop(columns='name', axis=1, inplace=True )
X_test.drop(columns='name', axis=1, inplace=True )

In [1589]:
for i in X_train.select_dtypes(include='object'):
    print(f"{i}: {X_train[i].unique()}")

fuel: ['Diesel' 'Petrol' 'LPG' 'CNG' 'Electric']
seller_type: ['Dealer' 'Individual' 'Trustmark Dealer']
transmission: ['Manual' 'Automatic']
car_name: ['Ford' 'Maruti' 'Mahindra' 'Hyundai' 'Renault' 'Volkswagen' 'Chevrolet'
 'Skoda' 'Tata' 'Nissan' 'Honda' 'Land' 'Mercedes-Benz' 'Fiat' 'Toyota'
 'Datsun' 'Audi' 'BMW' 'Jaguar' 'Ambassador' 'Mitsubishi' 'Force' 'Volvo'
 'MG' 'Jeep' 'Isuzu' 'Daewoo' 'OpelCorsa' 'Kia']
model: ['Ecosport 1.5 DV5 MT Titanium' 'Figo 1.5D Titanium Opt MT' 'Alto LXi' ...
 'Swift VXI with ABS' 'Tavera Neo 2 LT L 9 Str' 'Ignis 1.2 AMT Delta BSIV']


In [1590]:
for i in X_train.select_dtypes(include='object'):
    print(f"{i}: {X_train[i].nunique()}")

fuel: 5
seller_type: 3
transmission: 2
car_name: 29
model: 1262


In [1591]:
for i in X_train.select_dtypes(exclude='object'):
    print(f"{i}: {X_train[i].isnull().sum()}")

year: 0
km_driven: 0
owner: 0


In [1592]:
X_train.shape, y_test.shape, y_train.shape, X_test.shape

((3253, 8), (814,), (3253,), (814, 8))

In [1593]:
X_train.head(3)

Unnamed: 0,year,km_driven,fuel,seller_type,transmission,owner,car_name,model
3691,2014,63356,Diesel,Dealer,Manual,2,Ford,Ecosport 1.5 DV5 MT Titanium
1717,2015,52328,Diesel,Dealer,Manual,1,Ford,Figo 1.5D Titanium Opt MT
2006,2008,70000,Petrol,Individual,Manual,1,Maruti,Alto LXi


## One Hot Encoding for low cardinality features

In [1594]:
cols_to_ohe = ['fuel','seller_type','transmission']

X_train_ohe = pd.get_dummies(data = X_train, columns=cols_to_ohe, drop_first= True, dtype='int8')

X_test_ohe = pd.get_dummies(data=X_test, columns=cols_to_ohe, drop_first=True, dtype='int8')
X_test_ohe = X_test_ohe.reindex(columns=X_train_ohe.columns, fill_value=0)


In [1595]:
X_train_ohe.head(3)

Unnamed: 0,year,km_driven,owner,car_name,model,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual
3691,2014,63356,2,Ford,Ecosport 1.5 DV5 MT Titanium,1,0,0,0,0,0,1
1717,2015,52328,1,Ford,Figo 1.5D Titanium Opt MT,1,0,0,0,0,0,1
2006,2008,70000,1,Maruti,Alto LXi,0,0,0,1,1,0,1


In [1596]:
X_test_ohe.head(3)

Unnamed: 0,year,km_driven,owner,car_name,model,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual
3286,2015,110000,2,Mahindra,XUV500 W8 2WD,1,0,0,0,1,0,1
1611,2005,118400,1,Maruti,Swift 1.3 VXi,0,0,0,1,1,0,1
3730,2018,35000,1,Hyundai,Grand i10 1.2 Kappa Sportz AT,0,0,0,1,1,0,0


## Target encoding for high cardinality features

In [1597]:
cols_target_encode = ['car_name','model']

encoder = TargetEncoder(smoothing=0.3)
encoder.fit(X_train_ohe[cols_target_encode], y_train)

X_train_ohe[cols_target_encode] = encoder.transform(X_train_ohe[cols_target_encode])
X_test_ohe[cols_target_encode] = encoder.transform(X_test_ohe[cols_target_encode])

In [1598]:
X_train_ohe.shape, X_test_ohe.shape

((3253, 12), (814, 12))

In [1600]:
X_train_ohe.head(3)

Unnamed: 0,year,km_driven,owner,car_name,model,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual
3691,2014,63356,2,564714.255952,504373.215186,1,0,0,0,0,0,1
1717,2015,52328,1,564714.255952,504373.215186,1,0,0,0,0,0,1
2006,2008,70000,1,350250.363911,126894.710526,0,0,0,1,1,0,1


## Data Scaling

In [1601]:
cols_to_scale = ['km_driven','car_name','model']

scaler = StandardScaler()
scaler.fit(X_train_ohe[cols_to_scale])

X_train_ohe[cols_to_scale] = scaler.transform(X_train_ohe[cols_to_scale])
X_test_ohe[cols_to_scale] = scaler.transform(X_test_ohe[cols_to_scale])

In [1602]:
X_train_ohe.head(3)

Unnamed: 0,year,km_driven,owner,car_name,model,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual
3691,2014,0.060939,2,0.181195,0.243234,1,0,0,0,0,0,1
1717,2015,-0.262392,1,0.181195,0.243234,1,0,0,0,0,0,1
2006,2008,0.255734,1,-0.380758,-5.153237,0,0,0,1,1,0,1


In [1603]:
# X_train_ohe['km_driven'].mean(),X_train_ohe['km_driven'].std() 
X_train_ohe.describe()

Unnamed: 0,year,km_driven,owner,car_name,model,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual
count,3253.0,3253.0,3253.0,3253.0,3253.0,3253.0,3253.0,3253.0,3253.0,3253.0,3253.0,3253.0
mean,2013.271749,1.1467410000000001e-17,1.442053,-6.115953e-17,4.619729e-16,0.482324,0.000307,0.004304,0.503535,0.746388,0.025515,0.897018
std,4.009206,1.000154,0.707178,1.000154,1.000154,0.499764,0.017533,0.065471,0.500064,0.435145,0.157707,0.303982
min,2000.0,-1.650003,1.0,-0.6826792,-5.391809,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2011.0,-0.7704319,1.0,-0.3807584,0.243234,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,2014.0,-0.08143452,1.0,-0.1721796,0.243234,0.0,0.0,0.0,1.0,1.0,0.0,1.0
75%,2017.0,0.6955199,2.0,0.1160874,0.243234,1.0,0.0,0.0,1.0,1.0,0.0,1.0
max,2020.0,2.601257,5.0,6.487522,0.4086569,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [1604]:
X_train_ohe.shape, X_test_ohe.shape

((3253, 12), (814, 12))

In [1605]:
X_train_ohe

Unnamed: 0,year,km_driven,owner,car_name,model,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual
3691,2014,0.060939,2,0.181195,0.243234,1,0,0,0,0,0,1
1717,2015,-0.262392,1,0.181195,0.243234,1,0,0,0,0,0,1
2006,2008,0.255734,1,-0.380758,-5.153237,0,0,0,1,1,0,1
3594,2014,1.135305,2,0.364692,0.243234,1,0,0,0,1,0,1
470,2009,1.721686,2,-0.172180,0.243234,0,0,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1200,2017,-1.356813,1,-0.380758,0.243234,0,0,0,1,1,0,0
1373,2016,-0.682475,1,-0.244234,0.243234,0,0,0,1,1,0,1
915,2017,-0.433058,1,-0.380758,0.408657,1,0,0,0,0,1,1
3750,2014,0.842115,1,0.181195,0.243234,1,0,0,0,1,0,1


# feature selection using RFE

In [1660]:
estimator = LinearRegression()
selected_features = 10

rfe = RFE(estimator= estimator, n_features_to_select= selected_features)

rfe.fit(X_train_ohe, y_train)

In [1661]:
rfe.ranking_

array([1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 1, 1])

In [1662]:
rfe.support_

array([ True,  True,  True,  True,  True,  True,  True,  True, False,
       False,  True,  True])

In [1664]:
X_train_ohe.columns[rfe.support_].tolist()

['year',
 'km_driven',
 'owner',
 'car_name',
 'model',
 'fuel_Diesel',
 'fuel_Electric',
 'fuel_LPG',
 'seller_type_Trustmark Dealer',
 'transmission_Manual']

In [1665]:
selected_features = X_train_ohe.columns[rfe.support_].tolist()


In [1666]:
X_train_selected = X_train_ohe[selected_features]
X_test_selected = X_test_ohe[selected_features]

In [1667]:
X_train_selected.shape, X_test_selected.shape

((3253, 10), (814, 10))

# VIF to check for multicollinearity for selected features

In [1668]:
vif_df = pd.DataFrame()

vif_df['features'] = X_train_selected.columns
vif_df['vif'] = [variance_inflation_factor(X_train_selected.values, i) for i in range(X_train_selected.shape[1])]

In [1669]:
vif_df

Unnamed: 0,features,vif
0,year,18.872701
1,km_driven,1.317128
2,owner,6.039173
3,car_name,1.395477
4,model,1.062209
5,fuel_Diesel,2.40333
6,fuel_Electric,1.003812
7,fuel_LPG,1.015375
8,seller_type_Trustmark Dealer,1.046014
9,transmission_Manual,13.10965


observations:
- implemented train, test
- check for any transformation required for each feature, same to be applied to test as well
- splitted name feature into car_name & model, applied same transformation to test as well.
- OHE --> low cardinality features < 5
- Te --> high cardinality features > 5
- scaling --> numerical features
- rfe --> top 10 influence feature to target variable
- VIF --> to check multicollinearlity
- selected features to run the model

# model training & evaluation

In [1703]:
model = LinearRegression()

model.fit(X_train_selected, y_train)
y_train_pred = model.predict(X_train_selected)
y_test_pred = model.predict(X_test_selected)

train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)


print(f"train_rmse: {train_rmse}")
print(f"test_rmse: {test_rmse}")
print("--------------------------------------")

print(f"train_r2: {train_r2}")
print(f"test_r2: {test_r2}")


train_rmse: 349523.65782535495
test_rmse: 346893.83526875766
--------------------------------------
train_r2: 0.6309954550217575
test_r2: 0.7008205150112956


In [1704]:
model = Lasso(alpha=0.9, max_iter=10000)

model.fit(X_train_selected, y_train)
y_train_pred = model.predict(X_train_selected)
y_test_pred = model.predict(X_test_selected)

train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)


print(f"train_rmse: {train_rmse}")
print(f"test_rmse: {test_rmse}")
print("--------------------------------------")

print(f"train_r2: {train_r2}")
print(f"test_r2: {test_r2}")

train_rmse: 349523.66198633256
test_rmse: 346894.2087527308
--------------------------------------
train_r2: 0.6309954462359735
test_r2: 0.7008198707865125


### Lasso CV for feature selection

In [1672]:
model = LassoCV(cv = 5, random_state=42)

model.fit(X_train_selected, y_train)

y_test_pred = model.predict(X_test_selected)

test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)

print(f"test_score_lr: {test_rmse}")
print("--------------------------------------")
print(f"test_r2: {test_r2}")
print("--------------------------------------")
print(f"'optimal alpha: {model.alpha_}")

model.coef_

test_score_lr: 347252.62497733894
--------------------------------------
test_r2: 0.70020131673737
--------------------------------------
'optimal alpha: 961.7962065095576


array([  40145.17139176,  -42997.26219775,   -7488.4406352 ,
        288952.16350979,    8499.43998967,  159152.85230843,
            -0.        ,       0.        ,  227402.21888705,
       -393757.59068594])

In [1705]:
model_rf = RandomForestRegressor(n_estimators = 500, max_depth = 100, min_samples_split =5)

model_rf.fit(X_train_selected, y_train)
y_train_pred_best1 = model_rf.predict(X_train_selected)
y_test_pred_best1 = model_rf.predict(X_test_selected)

train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred_best1))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred_best1))

train_r2 = r2_score(y_train, y_train_pred_best1)
test_r2 = r2_score(y_test, y_test_pred_best1)


print(f"train_rmse: {train_rmse}")
print(f"test_rmse: {test_rmse}")
print("--------------------------------------")

print(f"train_r2: {train_r2}")
print(f"test_r2: {test_r2}")

train_rmse: 140695.58826800424
test_rmse: 225564.30703024968
--------------------------------------
train_r2: 0.9402084900194057
test_r2: 0.8735032395294295


In [1706]:
model = DecisionTreeRegressor(criterion = 'friedman_mse',max_depth = 100, min_samples_split = 5, min_samples_leaf = 5)

model.fit(X_train_selected, y_train)
y_train_pred_best = model.predict(X_train_selected)
y_test_pred_best = model.predict(X_test_selected)

train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred_best))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred_best))

train_r2 = r2_score(y_train, y_train_pred_best)
test_r2 = r2_score(y_test, y_test_pred_best)


print(f"train_rmse: {train_rmse}")
print(f"test_rmse: {test_rmse}")
print("--------------------------------------")

print(f"train_r2: {train_r2}")
print(f"test_r2: {test_r2}")

train_rmse: 219359.11397603844
test_rmse: 253998.2056003027
--------------------------------------
train_r2: 0.8546584027921751
test_r2: 0.839601632677982


In [1675]:
def model_train_and_evaluate(model, X_train, y_train, X_test, y_test):

    models = {
    'Linear Regression': LinearRegression(),
    'Lasso Regression': Lasso(alpha=100, max_iter= 10000),
    'Ridge Regression': Ridge(),
    'XGBoost': XGBRegressor(n_estimators = 10, max_depth = 100, learning_rate = 0.3, random_state = 42),
    'Random forest': RandomForestRegressor(n_estimators = 200, max_depth = 100, min_samples_split =5)
    }

    results = {}

    for name, model in models.items():
        model.fit(X_train, y_train)
    
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
    
        train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
        test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    
        train_r2 = r2_score(y_train, y_train_pred)
        test_r2 = r2_score(y_test, y_test_pred)

        results[name] = {
            'train_rmse': train_rmse,
            'test_rmse': test_rmse,
            'train_r2': train_r2,
            'test_r2': test_r2,
            '': '------------------------'
        }
    
        # print(f"model name: {model}")
        # print(f"train rmse: {train_rmse}")
        # print(f"test rmse: {test_rmse}")
        # print(f"test r2: {train_r2}")
        # print(f"test r2: {test_r2}")
        # print("----------------------------")
    
    return results 


In [1676]:
model_train_and_evaluate(models, X_train_selected, y_train, X_test_selected, y_test)

{'Linear Regression': {'train_rmse': 349523.65782535495,
  'test_rmse': 346893.83526875766,
  'train_r2': 0.6309954550217575,
  'test_r2': 0.7008205150112956,
  '': '------------------------'},
 'Lasso Regression': {'train_rmse': 349573.32853632147,
  'test_rmse': 346938.45235392376,
  'train_r2': 0.6308905693006732,
  'test_r2': 0.7007435498211432,
  '': '------------------------'},
 'Ridge Regression': {'train_rmse': 349556.11107733456,
  'test_rmse': 346927.04580896057,
  'train_r2': 0.6309269277380147,
  'test_r2': 0.7007632272366477,
  '': '------------------------'},
 'XGBoost': {'train_rmse': 84194.68202199152,
  'test_rmse': 226892.886498548,
  'train_r2': 0.9785884610072403,
  'test_r2': 0.8720087128221555,
  '': '------------------------'},
 'Random forest': {'train_rmse': 139386.93518332244,
  'test_rmse': 226760.0328232905,
  'train_r2': 0.9413155957678759,
  'test_r2': 0.8721585556393116,
  '': '------------------------'}}

# Error Analysis

In [1677]:
ea_df = pd.DataFrame()
ea_df['y_actual'] = y_test
ea_df['y_pred'] = y_test_pred_best1
ea_df['diff'] = (ea_df['y_actual'] - ea_df['y_pred']) 
ea_df['diff_perc'] = abs(((ea_df['y_actual'] - ea_df['y_pred'])/ ea_df['y_actual']) * 100)
# ea_df['mape'] = mean_absolute_percentage_error(y_test, y_test_pred_best)

In [1695]:
error_threshold = 20

In [1696]:
ea_df1 = ea_df[ea_df['diff_perc'] >= error_threshold]
ea_df1

Unnamed: 0,y_actual,y_pred,diff,diff_perc
3286,900000,5.928452e+05,307154.834194,34.128315
3730,550000,9.184219e+05,-368421.915943,66.985803
3642,600000,4.774825e+05,122517.465409,20.419578
3952,300000,4.795321e+05,-179532.116345,59.844039
3989,475000,3.290563e+05,145943.725108,30.724995
...,...,...,...,...
2157,950000,6.590935e+05,290906.487052,30.621735
3525,850000,6.793795e+05,170620.488256,20.072999
2206,300000,1.956775e+05,104322.530905,34.774177
3368,85000,1.310789e+05,-46078.859558,54.210423


In [1697]:
ea_df[ea_df['diff_perc'] > error_threshold].shape[0] / ea_df['y_actual'].shape[0]

0.47911547911547914

In [1698]:
ea_df1.index

Index([3286, 3730, 3642, 3952, 3989, 4294, 2030, 2829,  865, 4328,
       ...
       2921, 2544, 1874, 2780,  238, 2157, 3525, 2206, 3368,   94],
      dtype='int64', length=390)

In [1699]:
ea_common_index = ea_df1.index.intersection(X_test.index)
ea_common_index

Index([3286, 3730, 3642, 3952, 3989, 4294, 2030, 2829,  865, 4328,
       ...
       2921, 2544, 1874, 2780,  238, 2157, 3525, 2206, 3368,   94],
      dtype='int64', length=390)

observations:
- assuming error percent not exceeding 20%
- 48% of test values are exceeding the error thershold.
- decision tree & random forest regressor have best results
- we considered decision tree as our best model

# Export Model

In [1700]:
joblib.dump(model_rf, r"C:\Users\aravit01\OneDrive - Kearney\1. RAVI TEJA\12. My Learnings\Portfolio Projects to Showcase\Car Price Prediction - Regression\model.joblib")

['C:\\Users\\aravit01\\OneDrive - Kearney\\1. RAVI TEJA\\12. My Learnings\\Portfolio Projects to Showcase\\Car Price Prediction - Regression\\model.joblib']

In [1701]:
scaling_data = {
    'scaler': scaler,
    'cols_to_scale': cols_to_scale
}
joblib.dump(scaling_data, r"C:\Users\aravit01\OneDrive - Kearney\1. RAVI TEJA\12. My Learnings\Portfolio Projects to Showcase\Car Price Prediction - Regression\scaler.joblib")

['C:\\Users\\aravit01\\OneDrive - Kearney\\1. RAVI TEJA\\12. My Learnings\\Portfolio Projects to Showcase\\Car Price Prediction - Regression\\scaler.joblib']

In [1702]:
target_encoding = {
    'encoder': encoder,
    'cols_to_te': cols_target_encode
}
joblib.dump(target_encoding, r"C:\Users\aravit01\OneDrive - Kearney\1. RAVI TEJA\12. My Learnings\Portfolio Projects to Showcase\Car Price Prediction - Regression\te.joblib")

['C:\\Users\\aravit01\\OneDrive - Kearney\\1. RAVI TEJA\\12. My Learnings\\Portfolio Projects to Showcase\\Car Price Prediction - Regression\\te.joblib']