In [1]:
# import required libraries
import warnings
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
warnings.filterwarnings("ignore")

In [2]:
df=pd.read_csv('car.csv')
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,Innova,2022,18.25,19.99,17000,Petrol,Dealer,Manual,0
1,Fortuner,2020,31.5,34.17,23000,Diesel,Dealer,Manual,0
2,Swift,2019,5.4,8.98,36900,Petrol,Dealer,Manual,0
3,Polo,2019,7.85,9.15,5200,Petrol,Dealer,Manual,0
4,Corolla,2020,19.5,20.25,42450,Diesel,Dealer,Manual,0


In [3]:
df.shape

(301, 9)

In [4]:
print(df['Seller_Type'].unique())
print(df['Fuel_Type'].unique())
print(df['Transmission'].unique())
print(df['Owner'].unique())

['Dealer' 'Individual']
['Petrol' 'Diesel' 'CNG']
['Manual' 'Automatic']
[0 1 3]


In [5]:
##check missing values
df.isnull().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

In [6]:
#selecting only required Columns Using Square Brackets
final_dataset=df[['Year','Selling_Price','Present_Price','Kms_Driven','Fuel_Type','Seller_Type','Transmission','Owner']]

In [7]:
# setting current year as 2023
final_dataset['Current Year']=2023

In [8]:
final_dataset.head()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Current Year
0,2022,18.25,19.99,17000,Petrol,Dealer,Manual,0,2023
1,2020,31.5,34.17,23000,Diesel,Dealer,Manual,0,2023
2,2019,5.4,8.98,36900,Petrol,Dealer,Manual,0,2023
3,2019,7.85,9.15,5200,Petrol,Dealer,Manual,0,2023
4,2020,19.5,20.25,42450,Diesel,Dealer,Manual,0,2023


In [9]:
#calculating no of year by subtracting current year-year
final_dataset['no_year']=final_dataset['Current Year']- final_dataset['Year']

In [10]:
final_dataset.head()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Current Year,no_year
0,2022,18.25,19.99,17000,Petrol,Dealer,Manual,0,2023,1
1,2020,31.5,34.17,23000,Diesel,Dealer,Manual,0,2023,3
2,2019,5.4,8.98,36900,Petrol,Dealer,Manual,0,2023,4
3,2019,7.85,9.15,5200,Petrol,Dealer,Manual,0,2023,4
4,2020,19.5,20.25,42450,Diesel,Dealer,Manual,0,2023,3


In [11]:
#Dropping year column
final_dataset.drop(['Year'],axis=1,inplace=True)

In [12]:
final_dataset.head()

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Current Year,no_year
0,18.25,19.99,17000,Petrol,Dealer,Manual,0,2023,1
1,31.5,34.17,23000,Diesel,Dealer,Manual,0,2023,3
2,5.4,8.98,36900,Petrol,Dealer,Manual,0,2023,4
3,7.85,9.15,5200,Petrol,Dealer,Manual,0,2023,4
4,19.5,20.25,42450,Diesel,Dealer,Manual,0,2023,3


In [13]:
final_dataset=pd.get_dummies(final_dataset,drop_first=True)

In [14]:
final_dataset=final_dataset.drop(['Current Year'],axis=1)

In [15]:
final_dataset.head()

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Owner,no_year,Fuel_Type_Diesel,Fuel_Type_Petrol,Seller_Type_Individual,Transmission_Manual
0,18.25,19.99,17000,0,1,0,1,0,1
1,31.5,34.17,23000,0,3,1,0,0,1
2,5.4,8.98,36900,0,4,0,1,0,1
3,7.85,9.15,5200,0,4,0,1,0,1
4,19.5,20.25,42450,0,3,1,0,0,1


In [16]:
# Select independent and dependent variable
X=final_dataset.iloc[:,1:]
y=final_dataset.iloc[:,0]

In [17]:
X['Owner'].unique()

array([0, 1, 3])

In [18]:
X.head()

Unnamed: 0,Present_Price,Kms_Driven,Owner,no_year,Fuel_Type_Diesel,Fuel_Type_Petrol,Seller_Type_Individual,Transmission_Manual
0,19.99,17000,0,1,0,1,0,1
1,34.17,23000,0,3,1,0,0,1
2,8.98,36900,0,4,0,1,0,1
3,9.15,5200,0,4,0,1,0,1
4,20.25,42450,0,3,1,0,0,1


In [19]:
y.head()

0    18.25
1    31.50
2     5.40
3     7.85
4    19.50
Name: Selling_Price, dtype: float64

In [20]:
### Feature Importance

from sklearn.ensemble import ExtraTreesRegressor
import matplotlib.pyplot as plt
model = ExtraTreesRegressor()
model.fit(X,y)

ExtraTreesRegressor()

In [21]:
print(model.feature_importances_)

[4.23859504e-01 5.08589793e-02 1.78950194e-04 6.44051372e-02
 2.30502129e-01 1.18572557e-02 1.22114536e-01 9.62235078e-02]


In [22]:
# Split the dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [23]:
from sklearn.ensemble import RandomForestRegressor

In [24]:
regressor=RandomForestRegressor()

In [25]:
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
print(n_estimators)

[100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200]


In [26]:
from sklearn.model_selection import RandomizedSearchCV

In [27]:
 #Randomized Search CV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]


In [28]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200], 'max_features': ['auto', 'sqrt'], 'max_depth': [5, 10, 15, 20, 25, 30], 'min_samples_split': [2, 5, 10, 15, 100], 'min_samples_leaf': [1, 2, 5, 10]}


In [29]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()

In [30]:
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,scoring='neg_mean_squared_error', n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = 1)

In [31]:
rf_random.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   0.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   0.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   0.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   0.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   0.3s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time=   0.4s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1100; total time=   0.4s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimator

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=1,
                   param_distributions={'max_depth': [5, 10, 15, 20, 25, 30],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 5, 10],
                                        'min_samples_split': [2, 5, 10, 15,
                                                              100],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900, 1000, 1100,
                                                         1200]},
                   random_state=42, scoring='neg_mean_squared_error',
                   verbose=2)

In [32]:
rf_random.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 25}

In [33]:
rf_random.best_score_

-4.1846142158234105

In [34]:
predictions=rf_random.predict(X_test)

In [35]:
from sklearn import metrics

In [36]:
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))

MAE: 0.9186857142857138
MSE: 3.7169575536395625
RMSE: 1.9279412733897168


In [37]:

# open a file, where you ant to store the data
file = open('random_forest_regression_model.pkl', 'wb')

# dump information to that file
pickle.dump(rf_random, file)