In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

data = pd.read_csv('cardekho_imputated.csv')
data.head(5)

Unnamed: 0.1,Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [2]:
data.isnull().sum()

Unnamed: 0           0
car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [3]:
data.drop(columns=['car_name','brand','Unnamed: 0'],inplace=True)

In [4]:
data.head(5)

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [5]:
data.sample(15)

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
14550,i20,11,56660,Individual,Petrol,Manual,18.5,1197,80.0,5,340000
1396,Santro,3,5000,Individual,Petrol,Manual,20.3,1086,68.0,5,500000
5296,i10,9,61685,Dealer,Petrol,Manual,19.81,1086,68.05,5,300000
3335,Swift Dzire,3,26750,Dealer,Diesel,Manual,28.4,1248,74.02,5,911000
12996,City,12,65000,Dealer,Petrol,Manual,17.0,1497,118.0,5,275000
10645,WR-V,4,46000,Dealer,Diesel,Manual,25.5,1498,98.6,5,725000
12529,Grand,3,18000,Individual,Petrol,Manual,18.9,1197,81.86,5,490000
7968,i10,9,25000,Individual,Petrol,Manual,20.36,1197,78.9,5,315000
7684,RediGO,4,22558,Dealer,Petrol,Manual,22.7,799,53.64,5,235000
2993,Swift,7,96000,Dealer,Diesel,Manual,28.4,1248,74.0,5,490000


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15411 entries, 0 to 15410
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   model              15411 non-null  object 
 1   vehicle_age        15411 non-null  int64  
 2   km_driven          15411 non-null  int64  
 3   seller_type        15411 non-null  object 
 4   fuel_type          15411 non-null  object 
 5   transmission_type  15411 non-null  object 
 6   mileage            15411 non-null  float64
 7   engine             15411 non-null  int64  
 8   max_power          15411 non-null  float64
 9   seats              15411 non-null  int64  
 10  selling_price      15411 non-null  int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 1.3+ MB


In [7]:
numerical_features = [features for features in data.columns if data[features].dtype != 'O'] 
print(numerical_features)

['vehicle_age', 'km_driven', 'mileage', 'engine', 'max_power', 'seats', 'selling_price']


In [8]:
categorical_features = [features for features in data.columns if features not in numerical_features] 
print(categorical_features)

['model', 'seller_type', 'fuel_type', 'transmission_type']


In [9]:
X = data.drop(columns='selling_price')
y = data['selling_price']

In [10]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=42)

In [11]:
X_train

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
11335,Santro,9,55324,Dealer,Petrol,Manual,17.92,1086,62.10,5
4885,Octavia,6,99000,Dealer,Diesel,Automatic,19.30,1968,141.00,5
14692,Polo,5,49000,Dealer,Petrol,Manual,16.20,1199,74.00,5
12368,i20,4,40000,Individual,Petrol,Manual,18.60,1197,81.83,5
7093,Grand,3,7245,Dealer,Petrol,Manual,18.90,1197,81.86,5
...,...,...,...,...,...,...,...,...,...,...
5191,Ertiga,7,127731,Dealer,Diesel,Manual,20.77,1248,88.80,7
13418,Vento,11,59000,Dealer,Petrol,Manual,16.09,1598,103.20,5
5390,Wagon R,7,20000,Individual,Petrol,Manual,20.51,998,67.04,5
860,i20,2,15000,Dealer,Petrol,Manual,18.60,1197,81.86,5


In [12]:
categorical_features

['model', 'seller_type', 'fuel_type', 'transmission_type']

In [13]:
cat_features = X.select_dtypes(include='object').columns
num_features = X.select_dtypes(exclude='object').columns

In [14]:
cat_features

Index(['model', 'seller_type', 'fuel_type', 'transmission_type'], dtype='object')

In [15]:
X_test

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
3334,i10,12,73000,Dealer,Petrol,Manual,20.36,1197,78.90,5
10928,Baleno,4,58000,Individual,Diesel,Manual,27.39,1248,74.00,5
2518,Ertiga,7,96000,Dealer,Diesel,Manual,20.77,1248,88.76,7
11322,City,1,4500,Dealer,Petrol,Automatic,18.40,1498,119.35,5
9394,Alto,11,62000,Dealer,Petrol,Manual,19.70,796,46.30,5
...,...,...,...,...,...,...,...,...,...,...
12793,Tucson,3,12000,Dealer,Petrol,Automatic,12.95,1999,152.88,5
1091,City,11,80000,Individual,Petrol,Manual,17.00,1497,118.00,5
4767,i20,6,18745,Dealer,Petrol,Manual,18.60,1197,81.83,5
73,Tiago,4,32100,Individual,Petrol,Manual,23.84,1199,84.00,5


In [16]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

cat_features = ['model', 'seller_type', 'fuel_type', 'transmission_type']
num_features = ['vehicle_age', 'km_driven', 'mileage', 'engine', 'max_power', 'seats']

scalar = StandardScaler()
ohe = OneHotEncoder(drop="first", handle_unknown="ignore",sparse=False)

transformer = ColumnTransformer(
    [
        ("OneHotEncoder", ohe, cat_features),
        ("StandardScaler", scalar, num_features)
    ]
)

X_train = transformer.fit_transform(X_train)
X_test = transformer.transform(X_test)




In [17]:
X_train_df = pd.DataFrame(X_train)

In [18]:
X_train_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,120,121,122,123,124,125,126,127,128,129
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.988218,-0.000504,-0.439588,-0.762989,-0.895572,-0.402521
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.010659,1.050645,-0.108487,0.932816,0.957290,-0.402521
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,-0.343618,-0.152704,-0.852265,-0.545726,-0.616117,-0.402521
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,-0.676576,-0.369307,-0.276437,-0.549571,-0.432239,-0.402521
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,-1.009535,-1.157621,-0.204458,-0.549571,-0.431535,-0.402521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.322300,1.742114,0.244208,-0.451514,-0.268558,2.066875
11554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.654135,0.087966,-0.878657,0.221424,0.069607,-0.402521
11555,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.322300,-0.850646,0.181826,-0.932185,-0.779563,-0.402521
11556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,-1.342494,-0.970981,-0.276437,-0.549571,-0.431535,-0.402521


## Random Forest Regression Model Training

In [19]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [20]:
models = {
    'Linear Regression' : LinearRegression(),
    'Decision Tree Regressor' : DecisionTreeRegressor(),
    'Random Forest Regressor' : RandomForestRegressor(),
    'Lasso Regressor' : Lasso(),
    'Ridge Regressor' : Ridge(),
    'K-Neighbour Regressor' : KNeighborsRegressor()
}
# Dictionary to store trained models
trained_models = {}

for name, model in models.items():
    model.fit(X_train, y_train)  # Train the model

    # Predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Store trained model
    trained_models[name] = model

    ## checking accuracy 
    print(name)
    print('Model performance for Training set')
    print(f"Accuracy :",r2_score(y_train,y_pred_train))
    print(f"Mean Square Error :",mean_squared_error(y_train,y_pred_train))
    print(f"Mean Absolute Error :",mean_absolute_error(y_train,y_pred_train))
    
 
    
    print("___________________________________")
    print('Model performance for Test set')
    print(f"Accuracy :",r2_score(y_test,y_pred_test))
    print(f"Mean Square Error :",mean_squared_error(y_test,y_pred_test))
    print(f"Mean Absolute Error :",mean_absolute_error(y_test,y_pred_test))



    print("\n")
    





Linear Regression
Model performance for Training set
Accuracy : 0.8612206377350282
Mean Square Error : 111927776457.12737
Mean Absolute Error : 164483.32893597463
___________________________________
Model performance for Test set
Accuracy : 0.770982000760672
Mean Square Error : 178115436513.68564
Mean Absolute Error : 183206.96510054724


Decision Tree Regressor
Model performance for Training set
Accuracy : 0.999471541567979
Mean Square Error : 426210181.9807348
Mean Absolute Error : 4991.751744823211
___________________________________
Model performance for Test set
Accuracy : 0.8962173293121672
Mean Square Error : 80715471070.03561
Mean Absolute Error : 124962.38645211523


Random Forest Regressor
Model performance for Training set
Accuracy : 0.9713804599318844
Mean Square Error : 23082117043.69804
Mean Absolute Error : 39619.38006488906
___________________________________
Model performance for Test set
Accuracy : 0.9269220523256575
Mean Square Error : 56835316843.09892
Mean Absolute

### Exporting Random Forest Model as it has the highest Accurcay on Test Data

In [21]:
trained_models

{'Linear Regression': LinearRegression(),
 'Decision Tree Regressor': DecisionTreeRegressor(),
 'Random Forest Regressor': RandomForestRegressor(),
 'Lasso Regressor': Lasso(),
 'Ridge Regressor': Ridge(),
 'K-Neighbour Regressor': KNeighborsRegressor()}

In [22]:
best_model = trained_models['Random Forest Regressor']

In [23]:
best_model

In [24]:
import pickle

pickle.dump(scalar,open('scalar.pkl','wb'))
pickle.dump(ohe,open('encoder.pkl','wb'))
pickle.dump(best_model,open('regression.pkl','wb'))


In [32]:
pickle.dump(transformer, open('preprocessor.pkl', 'wb'))


In [31]:
X.seats.unique()

array([5, 8, 7, 6, 4, 2, 9, 0], dtype=int64)

In [None]:
best_model.predict()