In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.svm import SVR
import matplotlib.pyplot as plt
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split,RandomizedSearchCV,GridSearchCV

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
quikr_df = pd.read_csv('quikr_car.csv')
quikr_df.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


In [4]:
quikr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        892 non-null    object
 1   company     892 non-null    object
 2   year        892 non-null    object
 3   Price       892 non-null    object
 4   kms_driven  840 non-null    object
 5   fuel_type   837 non-null    object
dtypes: object(6)
memory usage: 41.9+ KB


In [5]:
quikr_df.describe()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
count,892,892,892,892,840,837
unique,525,48,61,274,258,3
top,Honda City,Maruti,2015,Ask For Price,"45,000 kms",Petrol
freq,13,235,117,35,30,440


### EDA

#### name

In [6]:
quikr_df['name'].unique()

array(['Hyundai Santro Xing XO eRLX Euro III', 'Mahindra Jeep CL550 MDI',
       'Maruti Suzuki Alto 800 Vxi',
       'Hyundai Grand i10 Magna 1.2 Kappa VTVT',
       'Ford EcoSport Titanium 1.5L TDCi', 'Ford Figo', 'Hyundai Eon',
       'Ford EcoSport Ambiente 1.5L TDCi',
       'Maruti Suzuki Alto K10 VXi AMT', 'Skoda Fabia Classic 1.2 MPI',
       'Maruti Suzuki Stingray VXi', 'Hyundai Elite i20 Magna 1.2',
       'Mahindra Scorpio SLE BS IV', 'Audi A8', 'Audi Q7',
       'Mahindra Scorpio S10', 'Maruti Suzuki Alto 800',
       'Hyundai i20 Sportz 1.2', 'Maruti Suzuki Alto 800 Lx',
       'Maruti Suzuki Vitara Brezza ZDi', 'Maruti Suzuki Alto LX',
       'Mahindra Bolero DI', 'Maruti Suzuki Swift Dzire ZDi',
       'Mahindra Scorpio S10 4WD', 'Maruti Suzuki Swift Vdi BSIII',
       'Maruti Suzuki Wagon R VXi BS III',
       'Maruti Suzuki Wagon R VXi Minor',
       'Toyota Innova 2.0 G 8 STR BS IV', 'Renault Lodgy 85 PS RXL',
       'Skoda Yeti Ambition 2.0 TDI CR 4x2',
       'Maru

In [7]:
# Taking first string only as a car name
quikr_df['name'] = quikr_df['name'].apply(lambda carname: " ".join(carname.split()[0:3]))

In [8]:
quikr_df['name']

0         Hyundai Santro Xing
1         Mahindra Jeep CL550
2          Maruti Suzuki Alto
3           Hyundai Grand i10
4      Ford EcoSport Titanium
                ...          
887                        Ta
888              Tata Zest XM
889        Mahindra Quanto C8
890           Honda Amaze 1.2
891        Chevrolet Sail 1.2
Name: name, Length: 892, dtype: object

#### year     

In [9]:
quikr_df['year'].unique()

array(['2007', '2006', '2018', '2014', '2015', '2012', '2013', '2016',
       '2010', '2017', '2008', '2011', '2019', '2009', '2005', '2000',
       '...', '150k', 'TOUR', '2003', 'r 15', '2004', 'Zest', '/-Rs',
       'sale', '1995', 'ara)', '2002', 'SELL', '2001', 'tion', 'odel',
       '2 bs', 'arry', 'Eon', 'o...', 'ture', 'emi', 'car', 'able', 'no.',
       'd...', 'SALE', 'digo', 'sell', 'd Ex', 'n...', 'e...', 'D...',
       ', Ac', 'go .', 'k...', 'o c4', 'zire', 'cent', 'Sumo', 'cab',
       't xe', 'EV2', 'r...', 'zest'], dtype=object)

In [10]:
# Take only which contains numbers 
quikr_df = quikr_df[quikr_df['year'].str.isnumeric()]

In [11]:
quikr_df['year'] = quikr_df['year'].astype(int)

### Price

In [12]:
quikr_df['Price'].unique()

array(['80,000', '4,25,000', 'Ask For Price', '3,25,000', '5,75,000',
       '1,75,000', '1,90,000', '8,30,000', '2,50,000', '1,82,000',
       '3,15,000', '4,15,000', '3,20,000', '10,00,000', '5,00,000',
       '3,50,000', '1,60,000', '3,10,000', '75,000', '1,00,000',
       '2,90,000', '95,000', '1,80,000', '3,85,000', '1,05,000',
       '6,50,000', '6,89,999', '4,48,000', '5,49,000', '5,01,000',
       '4,89,999', '2,80,000', '3,49,999', '2,84,999', '3,45,000',
       '4,99,999', '2,35,000', '2,49,999', '14,75,000', '3,95,000',
       '2,20,000', '1,70,000', '85,000', '2,00,000', '5,70,000',
       '1,10,000', '4,48,999', '18,91,111', '1,59,500', '3,44,999',
       '4,49,999', '8,65,000', '6,99,000', '3,75,000', '2,24,999',
       '12,00,000', '1,95,000', '3,51,000', '2,40,000', '90,000',
       '1,55,000', '6,00,000', '1,89,500', '2,10,000', '3,90,000',
       '1,35,000', '16,00,000', '7,01,000', '2,65,000', '5,25,000',
       '3,72,000', '6,35,000', '5,50,000', '4,85,000', '3,29,5

In [13]:
# Need to remove Ask For Price
# Need to remove ,
# Convert into int
quikr_df = quikr_df[quikr_df['Price'] !="Ask For Price"]

In [14]:
quikr_df['Price'] = quikr_df['Price'].str.replace(",","")

In [17]:
quikr_df['Price'] = quikr_df['Price'].astype(int)

In [18]:
quikr_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 819 entries, 0 to 891
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        819 non-null    object
 1   company     819 non-null    object
 2   year        819 non-null    int32 
 3   Price       819 non-null    int32 
 4   kms_driven  819 non-null    object
 5   fuel_type   816 non-null    object
dtypes: int32(2), object(4)
memory usage: 38.4+ KB


#### kms_driven

In [19]:
quikr_df['kms_driven']

0        45,000 kms
1            40 kms
3        28,000 kms
4        36,000 kms
6        41,000 kms
           ...     
886    1,32,000 kms
888      27,000 kms
889      40,000 kms
890          Petrol
891          Petrol
Name: kms_driven, Length: 819, dtype: object

In [20]:
# Need to remove kms
# Remove ,
# Convert into int datatype
# Remove petrol
quikr_df['kms_driven'] = quikr_df['kms_driven'].str.replace(",","")

In [21]:
quikr_df['kms_driven'] = quikr_df['kms_driven'].str.replace("kms","")

In [22]:
quikr_df= quikr_df[quikr_df['kms_driven'] != 'Petrol']

In [23]:
quikr_df['kms_driven'] = quikr_df['kms_driven'].astype(int)

In [24]:
quikr_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 817 entries, 0 to 889
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        817 non-null    object
 1   company     817 non-null    object
 2   year        817 non-null    int32 
 3   Price       817 non-null    int32 
 4   kms_driven  817 non-null    int32 
 5   fuel_type   816 non-null    object
dtypes: int32(3), object(3)
memory usage: 35.1+ KB


#### fuel_type

In [25]:
quikr_df['fuel_type']

0      Petrol
1      Diesel
3      Petrol
4      Diesel
6      Diesel
        ...  
883    Petrol
885    Diesel
886    Petrol
888    Diesel
889    Diesel
Name: fuel_type, Length: 817, dtype: object

In [26]:
quikr_df['fuel_type'].unique()

array(['Petrol', 'Diesel', nan, 'LPG'], dtype=object)

In [27]:
quikr_df.dropna(subset=['fuel_type'],inplace=True)

In [28]:
quikr_df['fuel_type'].unique()

array(['Petrol', 'Diesel', 'LPG'], dtype=object)

In [29]:
quikr_df['fuel_type'].value_counts()

fuel_type
Petrol    428
Diesel    386
LPG         2
Name: count, dtype: int64

#### year

In [30]:
quikr_df['year']

0      2007
1      2006
3      2014
4      2014
6      2012
       ... 
883    2011
885    2009
886    2009
888    2018
889    2013
Name: year, Length: 816, dtype: int32

In [31]:
quikr_df['year'].unique()

array([2007, 2006, 2014, 2012, 2013, 2016, 2015, 2010, 2017, 2008, 2018,
       2011, 2019, 2009, 2005, 2000, 2003, 2004, 1995, 2002, 2001])

In [32]:
quikr_df['year'].value_counts()

year
2015    111
2013     94
2014     92
2012     75
2016     74
2011     59
2009     54
2017     53
2010     43
2018     30
2006     22
2007     19
2019     18
2008     16
2005     13
2003     13
2004     12
2000      7
2001      5
2002      4
1995      2
Name: count, dtype: int64

In [33]:
# quikr_df.to_csv('cleaned_quikr.csv')

In [34]:
quikr_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 816 entries, 0 to 889
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        816 non-null    object
 1   company     816 non-null    object
 2   year        816 non-null    int32 
 3   Price       816 non-null    int32 
 4   kms_driven  816 non-null    int32 
 5   fuel_type   816 non-null    object
dtypes: int32(3), object(3)
memory usage: 35.1+ KB


In [35]:
encoder = LabelEncoder()

In [36]:
quikr_df.describe()

Unnamed: 0,year,Price,kms_driven
count,816.0,816.0,816.0
mean,2012.444853,411717.6,46275.531863
std,4.002992,475184.4,34297.428044
min,1995.0,30000.0,0.0
25%,2010.0,175000.0,27000.0
50%,2013.0,299999.0,41000.0
75%,2015.0,491250.0,56818.5
max,2019.0,8500003.0,400000.0


In [37]:
quikr_df['name'] = encoder.fit_transform(quikr_df['name'])
quikr_df['company'] = encoder.fit_transform(quikr_df['company'])
quikr_df['fuel_type'] = encoder.fit_transform(quikr_df['fuel_type'])

In [38]:
X = quikr_df.drop(columns='Price',axis=1)
y = quikr_df.Price

In [39]:
X

Unnamed: 0,name,company,year,kms_driven,fuel_type
0,91,9,2007,45000,2
1,118,13,2006,40,0
3,88,9,2014,28000,2
4,40,6,2014,36000,0
6,45,6,2012,41000,0
...,...,...,...,...,...
883,159,14,2011,50000,2
885,203,21,2009,30000,0
886,231,22,2009,132000,2
888,229,21,2018,27000,0


In [40]:
y

0       80000
1      425000
3      325000
4      575000
6      175000
        ...  
883    270000
885    110000
886    300000
888    260000
889    390000
Name: Price, Length: 816, dtype: int32

In [41]:
scaler = StandardScaler()

In [42]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=2,test_size=0.2)

In [44]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [45]:
X_train

array([[ 1.44082286,  1.51714565, -0.16585893,  0.79189489, -1.03671715],
       [ 1.24605449,  1.51714565,  0.85916496,  1.60416289, -1.03671715],
       [-0.63670634, -0.6584679 ,  0.34665302,  0.28772854,  0.9660668 ],
       ...,
       [ 0.58059592,  0.24803774,  1.11542093, -0.66458567,  0.9660668 ],
       [-1.90270069, -2.10887694, -0.4221149 ,  0.70786717, -1.03671715],
       [-0.01993986,  0.06673661,  0.60290899,  1.41370005, -1.03671715]])

In [46]:
models = [LinearRegression, Lasso, Ridge, SVR, DecisionTreeRegressor, RandomForestRegressor,GradientBoostingRegressor,CatBoostRegressor]
mae_scores = []
mse_scores = []
rmse_scores = []
r2_scores = []

for model in models:
    regressor = model().fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    
    mae_scores.append(mean_absolute_error(y_test, y_pred))
    mse_scores.append(mean_squared_error(y_test, y_pred))
    rmse_scores.append(mean_squared_error(y_test, y_pred, squared=False))
    r2_scores.append(r2_score(y_test, y_pred))

Learning rate set to 0.038268
0:	learn: 500049.6102298	total: 180ms	remaining: 2m 59s
1:	learn: 497530.6175401	total: 187ms	remaining: 1m 33s
2:	learn: 495005.6998152	total: 193ms	remaining: 1m 4s
3:	learn: 491301.5086818	total: 196ms	remaining: 48.8s
4:	learn: 489548.2834001	total: 198ms	remaining: 39.4s
5:	learn: 486858.1428891	total: 201ms	remaining: 33.3s
6:	learn: 484735.6587288	total: 204ms	remaining: 28.9s
7:	learn: 482341.0794536	total: 206ms	remaining: 25.6s
8:	learn: 479284.0497876	total: 209ms	remaining: 23s
9:	learn: 475623.9920913	total: 211ms	remaining: 20.9s
10:	learn: 473692.3310169	total: 214ms	remaining: 19.2s
11:	learn: 471658.1113981	total: 217ms	remaining: 17.8s
12:	learn: 469951.7049308	total: 219ms	remaining: 16.6s
13:	learn: 466765.8061048	total: 223ms	remaining: 15.7s
14:	learn: 464704.5609734	total: 227ms	remaining: 14.9s
15:	learn: 463189.9135997	total: 229ms	remaining: 14.1s
16:	learn: 461127.9554841	total: 231ms	remaining: 13.4s
17:	learn: 458377.0378736	to

In [47]:
regression_metrics_df = pd.DataFrame({
    "Model": ["Linear Regression", "Lasso", "Ridge", "SVR", "Decision Tree Regressor", "Random Forest Regressor","GradientBoostingRegressor",'CatBoostRegressor'],
    "Mean Absolute Error": mae_scores,
    "Mean Squared Error": mse_scores,
    "Root Mean Squared Error": rmse_scores,
    "R-squared (R2)": r2_scores
})

regression_metrics_df.set_index('Model', inplace=True)
regression_metrics_df

Unnamed: 0_level_0,Mean Absolute Error,Mean Squared Error,Root Mean Squared Error,R-squared (R2)
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Linear Regression,200763.394345,98883790000.0,314457.928955,0.151027
Lasso,200760.512925,98882890000.0,314456.50088,0.151034
Ridge,199995.355098,98643810000.0,314076.12694,0.153087
SVR,228507.915567,120421500000.0,347018.024835,-0.033887
Decision Tree Regressor,208341.183943,817029600000.0,903896.909379,-6.014663
Random Forest Regressor,134893.170836,85446720000.0,292312.715227,0.266391
GradientBoostingRegressor,130750.981198,55922890000.0,236480.200732,0.51987
CatBoostRegressor,113178.198952,42942920000.0,207226.731573,0.631311
