In [94]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.2.1-py3-none-win_amd64.whl (86.5 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.2.1


In [136]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

In [17]:
os.chdir('C:\\Users\\shoun\\Coding\\Datasets')
df5 = pd.read_csv('CAR DETAILS FROM CAR DEKHO.csv')

In [18]:
df5

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner
...,...,...,...,...,...,...,...,...
4335,Hyundai i20 Magna 1.4 CRDi (Diesel),2014,409999,80000,Diesel,Individual,Manual,Second Owner
4336,Hyundai i20 Magna 1.4 CRDi,2014,409999,80000,Diesel,Individual,Manual,Second Owner
4337,Maruti 800 AC BSIII,2009,110000,83000,Petrol,Individual,Manual,Second Owner
4338,Hyundai Creta 1.6 CRDi SX Option,2016,865000,90000,Diesel,Individual,Manual,First Owner


In [24]:
'''columns name, seller_type, transmission, owner are categorical, converting them to numeric labels'''
df5[['name','seller_type', 'fuel','transmission','owner']] = df5[['name','seller_type', 'fuel','transmission','owner']].astype('category')

In [31]:
for i in list(['name','seller_type', 'fuel','transmission','owner']):
    df5['{} label'.format(i)] = df5[i].cat.codes

In [21]:
df5.dtypes

name             category
year                int64
selling_price       int64
km_driven           int64
fuel             category
seller_type      category
transmission     category
owner            category
dtype: object

In [33]:
# Get the number of years the car has been used
df5['years of usage'] = 2020 - df5['year']

In [37]:
# check if there are any null values
df5.isna().sum()

name                  0
year                  0
selling_price         0
km_driven             0
fuel                  0
seller_type           0
transmission          0
owner                 0
name label            0
seller_type label     0
fuel label            0
transmission label    0
owner label           0
years of usage        0
dtype: int64

In [38]:
df5.head(5)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,name label,seller_type label,fuel label,transmission label,owner label,years of usage
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner,775,1,4,1,0,13
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner,1041,1,4,1,0,13
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner,505,1,1,1,0,8
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner,118,1,4,1,0,3
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner,279,1,1,1,2,6


In [39]:
# forming dataset without the unnecessary data
df4 = df5.drop(['name','seller_type', 'fuel','transmission','owner','year'], axis = 1)

In [40]:
df4.head(5)

Unnamed: 0,selling_price,km_driven,name label,seller_type label,fuel label,transmission label,owner label,years of usage
0,60000,70000,775,1,4,1,0,13
1,135000,50000,1041,1,4,1,0,13
2,600000,100000,505,1,1,1,0,8
3,250000,46000,118,1,4,1,0,3
4,450000,141000,279,1,1,1,2,6


In [117]:
features2 = df4.drop('selling_price', axis = 1)

In [118]:
features2.head()

Unnamed: 0,km_driven,name label,seller_type label,fuel label,transmission label,owner label,years of usage
0,70000,775,1,4,1,0,13
1,50000,1041,1,4,1,0,13
2,100000,505,1,1,1,0,8
3,46000,118,1,4,1,0,3
4,141000,279,1,1,1,2,6


In [140]:
scalar = StandardScaler()
features2['km_driven'] = scalar.fit_transform(pd.DataFrame(features2['km_driven']))
features2['years of usage'] = scalar.fit_transform(pd.DataFrame(features2['years of usage']))

In [141]:
# features
features2.head()

Unnamed: 0,km_driven,name label,seller_type label,fuel label,transmission label,owner label,years of usage
0,0.081139,775,1,4,1,0,1.445074
1,-0.347689,1041,1,4,1,0,1.445074
2,0.724381,505,1,1,1,0,0.258795
3,-0.433455,118,1,4,1,0,-0.927485
4,1.603479,279,1,1,1,2,-0.215717


In [142]:
# targets
target = df4['selling_price']

In [143]:
# forming train test data
xtrain, xtest, ytrain, ytest = train_test_split(features2, target, test_size = 0.2)

In [144]:
# creating ML objects
lr = LinearRegression()
knr = KNeighborsRegressor(n_neighbors=5)
rfr = RandomForestRegressor(n_estimators=200, criterion='mse')
xgbr = XGBRegressor(objective ='reg:linear', n_estimators = 200)
sv = SVR(kernel = 'rbf')
algorithms = [lr,knr,rfr, xgbr,sv]

In [145]:
# Normal, without Bagging or Boosting
for i in algorithms:
    i.fit(xtrain, ytrain)
    ypred = i.predict(xtest)
    accuracy = r2_score(ytest, ypred) * 100
    print('\n'+str(i) + str(accuracy))


LinearRegression()46.60796531220441

KNeighborsRegressor()83.66204026037572

RandomForestRegressor(n_estimators=200)79.37843605186387

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=200, n_jobs=0, num_parallel_tree=1,
             objective='reg:linear', random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)88.04137457011798

SVR()-8.254444118379034


In [146]:
# Boosting
for i in algorithms:
    i.fit(xtrain, ytrain)
    boost = AdaBoostRegressor(base_estimator = i,n_estimators= 200, learning_rate= 0.01).fit(xtrain, ytrain)
    ypred = boost.predict(xtest)
    accuracy = r2_score(ytest, ypred) * 100
    print('\n'+str(i) + str(accuracy))


LinearRegression()41.033949646552124

KNeighborsRegressor()87.97722598128277

RandomForestRegressor(n_estimators=200)81.63462255079725

















XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=200, n_jobs=0, num_parallel_tree=1,
             objective='reg:linear', random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)89.67432876067531

SVR()-4.900930000987613


In [147]:
# Bagging
for i in algorithms:
    i.fit(xtrain, ytrain)
    bag = BaggingRegressor(base_estimator = i,n_estimators= 200).fit(xtrain, ytrain)
    ypred = bag.predict(xtest)
    accuracy = r2_score(ytest, ypred) * 100
    print('\n'+str(i) + str(accuracy))


LinearRegression()46.59723245730297

KNeighborsRegressor()84.38544995658457

RandomForestRegressor(n_estimators=200)77.69100075814967

















XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=200, n_jobs=0, num_parallel_tree=1,
             objective='reg:linear', random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)87.04613979390155

SVR()-7.734692371580376
