### Data import

#### Import dependencies

In [25]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error,mean_absolute_error,r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

#### Importing CSV data

✅ index_col tells pandas which column(s) to use as the row index of the DataFrame.

✅ [0] means: use the first column (column at position 0) as the index.

In [26]:
data = pd.read_csv('cardekho_imputated.csv',index_col=[0])
data.head(2)

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000


### Data Cleaning

#### Drop redundant columns car name and brand

In [27]:
data.drop(columns=['car_name','brand'],inplace=True)
data.head(1)

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000


### Data Exploration

#### Segregating numerical,categorical features,discrete and continuous features

In [28]:
numeric_features = [feature for feature in data.columns if data[feature].dtype != 'O']
print('Number of numeric features ',len(numeric_features))
categorical_features = [feature for feature in data.columns if data[feature].dtype == 'O']
print('Number of categorical features ', len(categorical_features))
discrete_features = [feature for feature in numeric_features if len(data[feature].unique())<=25]
print('Number of discrete features',len(discrete_features))
continuous_features = [feature for feature in numeric_features if feature not in discrete_features]
print('Number of continuous features',len(continuous_features))

Number of numeric features  7
Number of categorical features  4
Number of discrete features 2
Number of continuous features 5


### Data Encoding

#### Splitting dependent and independent variables

In [29]:
data.columns

Index(['model', 'vehicle_age', 'km_driven', 'seller_type', 'fuel_type',
       'transmission_type', 'mileage', 'engine', 'max_power', 'seats',
       'selling_price'],
      dtype='object')

In [30]:
X = data.iloc[:,:-1]
y = data.iloc[:,-1]

#### Instantiate and process categorical data

In [31]:
le = LabelEncoder()
ohe = OneHotEncoder()
sc = StandardScaler()

Label Encoding car model

In [32]:
X['model'] = le.fit_transform(X['model'])

One hot encoding and scaling

In [33]:
categorical_features = X.select_dtypes(exclude='number').columns
numerical_features = X.select_dtypes(exclude='object').columns
numerical_features

Index(['model', 'vehicle_age', 'km_driven', 'mileage', 'engine', 'max_power',
       'seats'],
      dtype='object')

Creating transformer for encoding numeric and categorical features

For Column transformer: Values must be in tuples with Str,Encoder instance and feature must be given as input.

In [None]:
preprocess = ColumnTransformer([("category",ohe,categorical_features),("number",sc,numerical_features)],remainder='passthrough')
X = preprocess.fit_transform(X)


#### Train test split

In [37]:
X_train,X_test,Y_train,Y_test = train_test_split(X,y,test_size=0.2,random_state=42)

### Model Training

#### Create a function for fit and train model

In [55]:
models = [DecisionTreeRegressor(),RandomForestRegressor(),AdaBoostRegressor(),GradientBoostingRegressor()]

In [88]:
results = {}

def train_eval(models,X_train,X_test,Y_train,Y_test):
    for i in range(0,len(models)):
        name = type(models[i]).__name__
        model = models[i].fit(X_train,Y_train)
        train_pred = model.predict(X_train)
        test_pred = model.predict(X_test)
        rmse_train = root_mean_squared_error(Y_train,train_pred)
        mae_train = mean_absolute_error(Y_train,train_pred)
        r2_train = r2_score(Y_train,train_pred)
        rmse_test = root_mean_squared_error(Y_test,test_pred)
        mae_test = mean_absolute_error(Y_test,test_pred)
        r2_test = r2_score(Y_test,test_pred)
        
        print(model," Prediction Results: \n")
        print("Training data accuracy:")
        print("Root Mean Squared error: ",rmse_train)
        print("Mean Absolute error: ",mae_train)
        print("r2_score: ",r2_train,"\n")
        print("Test data accuracy:")
        print("Root Mean Squared error: ",rmse_test)
        print("Mean Absolute error: ",mae_test)
        print("r2_score: ",r2_test)
        print("--"*35)
        
        results[name] ={
                "rmse_train":rmse_train,
                "rmse_test":rmse_test,
                "mae_train":mae_train,
                "mae_test":mae_test,
                "r2_train":r2_train,
                "r2_test":r2_test
                }        
        
        
    return results


final_result = train_eval(models,X_train,X_test,Y_train,Y_test)


DecisionTreeRegressor()  Prediction Results: 

Training data accuracy:
Root Mean Squared error:  20797.23516567643
Mean Absolute error:  5164.819922128488
r2_score:  0.9994666998284044 

Test data accuracy:
Root Mean Squared error:  303391.324657916
Mean Absolute error:  124639.33127905721
r2_score:  0.877725129590873
----------------------------------------------------------------------
RandomForestRegressor()  Prediction Results: 

Training data accuracy:
Root Mean Squared error:  148215.18409607722
Mean Absolute error:  40230.44819091653
r2_score:  0.9729139351650368 

Test data accuracy:
Root Mean Squared error:  227446.85359455968
Mean Absolute error:  101890.11310734894
r2_score:  0.931278795709367
----------------------------------------------------------------------
AdaBoostRegressor()  Prediction Results: 

Training data accuracy:
Root Mean Squared error:  440855.67056374665
Mean Absolute error:  336705.438099031
r2_score:  0.7603632721722664 

Test data accuracy:
Root Mean Sq

In [89]:
final_result

{'DecisionTreeRegressor': {'rmse_train': 20797.23516567643,
  'rmse_test': 303391.324657916,
  'mae_train': 5164.819922128488,
  'mae_test': 124639.33127905721,
  'r2_train': 0.9994666998284044,
  'r2_test': 0.877725129590873},
 'RandomForestRegressor': {'rmse_train': 148215.18409607722,
  'rmse_test': 227446.85359455968,
  'mae_train': 40230.44819091653,
  'mae_test': 101890.11310734894,
  'r2_train': 0.9729139351650368,
  'r2_test': 0.931278795709367},
 'AdaBoostRegressor': {'rmse_train': 440855.67056374665,
  'rmse_test': 469833.09531099856,
  'mae_train': 336705.438099031,
  'mae_test': 352596.3667654439,
  'r2_train': 0.7603632721722664,
  'r2_test': 0.7067634469374992},
 'GradientBoostingRegressor': {'rmse_train': 204046.85187296217,
  'rmse_test': 253899.69188770128,
  'mae_train': 111666.19181998471,
  'mae_test': 126217.08958606141,
  'r2_train': 0.9486642022282229,
  'r2_test': 0.9143642229984207}}

In [91]:
df = pd.DataFrame(final_result).T
df.head()

Unnamed: 0,rmse_train,rmse_test,mae_train,mae_test,r2_train,r2_test
DecisionTreeRegressor,20797.235166,303391.324658,5164.819922,124639.331279,0.999467,0.877725
RandomForestRegressor,148215.184096,227446.853595,40230.448191,101890.113107,0.972914,0.931279
AdaBoostRegressor,440855.670564,469833.095311,336705.438099,352596.366765,0.760363,0.706763
GradientBoostingRegressor,204046.851873,253899.691888,111666.19182,126217.089586,0.948664,0.914364
