In [8]:
# import libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Silence every warning from this point on. The filter is installed before the
# third-party imports below, so warnings emitted while importing them are hidden too.
import warnings 
warnings.filterwarnings("ignore")

import shap
import matplotlib.pyplot as plt
from catboost import Pool, CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from feature_engine.encoding import RareLabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
import ast

# Let pandas display up to 1000 rows before truncating printed/rendered frames.
pd.set_option('display.max_rows', 1000)

In [17]:
# Load the listings, discard the 'Unnamed: 0' column (presumably a saved row
# index -- confirm against the CSV), and remove exact duplicate rows.
df = pd.read_csv("cars_24_combined.csv")
df = df.drop(columns=['Unnamed: 0'])
df = df.drop_duplicates()
# Quick look: frame dimensions, then five random rows transposed so that each
# column reads as one line.
print(df.shape)
df.sample(5).T

(8015, 9)


Unnamed: 0,6057,2307,2933,6244,2715
Car Name,Hyundai i20,Maruti IGNIS,Renault Kwid,Tata Tiago,Maruti Alto 800
Year,2011.0,2017.0,2017.0,2018.0,2017.0
Distance,57085,73927,27168,103737,17035
Owner,2,1,1,1,2
Fuel,PETROL,PETROL,PETROL,PETROL,PETROL
Location,WB-06,KA-03,KA-53,,TS-09
Drive,Manual,Automatic,Manual,Manual,Manual
Type,HatchBack,HatchBack,HatchBack,HatchBack,HatchBack
Price,202000,554000,376000,426000,354000


In [18]:
# Per-column dtypes and non-null counts, to see which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8015 entries, 0 to 8014
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Car Name  8014 non-null   object 
 1   Year      8014 non-null   float64
 2   Distance  8015 non-null   int64  
 3   Owner     8015 non-null   int64  
 4   Fuel      8015 non-null   object 
 5   Location  7802 non-null   object 
 6   Drive     8015 non-null   object 
 7   Type      8015 non-null   object 
 8   Price     8015 non-null   int64  
dtypes: float64(1), int64(3), object(5)
memory usage: 626.2+ KB


In [19]:
# Summary statistics of the numeric columns, one row per column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,8014.0,2016.995009,2.861454,2010.0,2015.0,2017.0,2019.0,2023.0
Distance,8015.0,52621.411728,29182.922728,0.0,30730.0,50359.0,71762.0,971212.0
Owner,8015.0,1.300187,0.510893,1.0,1.0,1.0,2.0,4.0
Price,8015.0,574882.857143,265104.875929,119000.0,393000.0,535000.0,698000.0,3300000.0


In [20]:
# Feature engineering for the price model.
# NOTE: this cell mutates `df` in place and drops 'Price' and 'Distance' at the
# end, so it cannot be re-run without first re-running the cell that loads `df`.

# target column: price rescaled by 1e-5 (i.e. expressed in lakh)
main_label = 'Price [Lakh INR]'
df[main_label] = df['Price']*1e-5
# log10-transform distance and snap it to bins of width 0.2;
# the +1 keeps rows with Distance == 0 finite
df['log10_Distance'] = np.round(5 * np.log10(1 + df['Distance'])) / 5
# convert years to string; 'Year' is a float column, so the labels look like
# '2019.0', and a missing year becomes the string 'None'
df['Year'] = df['Year'].fillna('None').astype(str)
# fill NaN values
df['Location'] = df['Location'].fillna('None')
# fill missing car names, then lowercase them (the placeholder becomes 'none')
df['Car Name'] = df['Car Name'].fillna('None').str.lower()
# set up the rare label encoder limiting number of categories to max_n_categories;
# labels whose relative frequency is below `rare_tol` (20 rows' worth) are
# replaced with 'Other'. 'Type' is included here: the original list repeated
# 'Drive' and left 'Type' unencoded.
rare_tol = 20/df.shape[0]
for col in ['Car Name', 'Fuel', 'Location', 'Drive', 'Type']:
    encoder = RareLabelEncoder(n_categories=1, max_n_categories=60, replace_with='Other', tol=rare_tol)
    # fit_transform returns a one-column DataFrame; select the column explicitly
    df[col] = encoder.fit_transform(df[[col]])[col]
# drop unused columns (both have been replaced by the derived columns above)
cols2drop = ['Price', 'Distance']
df = df.drop(cols2drop, axis=1)
print(df.shape)
df.sample(5).T

(8015, 9)


Unnamed: 0,454,3896,1249,7297,710
Car Name,maruti swift,hyundai new i20,hyundai grand i10,hyundai i20,hyundai elite i20
Year,2019.0,2021.0,2018.0,2012.0,2020.0
Owner,1,1,1,2,1
Fuel,PETROL,PETROL,PETROL,DIESEL,PETROL
Location,MH-14,RJ-45,Other,KA-51,Other
Drive,Automatic,Manual,Manual,Manual,Manual
Type,HatchBack,HatchBack,HatchBack,HatchBack,HatchBack
Price [Lakh INR],6.15,6.95,5.12,3.89,7.62
log10_Distance,5.0,4.0,4.6,5.0,4.6
