In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Importing Dataset and EDA

In [None]:
# Path of the CarDekho vehicle CSV, relative to the notebook's working directory.
DATA_PATH = '../input/vehicle-dataset-from-cardekho/Car details v3.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the freshly loaded data to see which columns are available.
df.head()

In [None]:
# (rows, columns) of the data before any cleaning.
df.shape

In [None]:
# Column dtypes and non-null counts; useful for spotting columns stored as text.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Drop every row that has at least one missing value; there is enough data
# that losing these rows is acceptable.
df.dropna(axis = 'index', how = 'any', inplace = True)

In [None]:
# Verify the cleanup: shape after dropping rows, then the per-column missing counts.
remaining_nulls = df.isna().sum()
print(df.shape, '\n', remaining_nulls)

In [None]:
# Derived variable: age of the car relative to a fixed reference year.
# It may be a more useful predictor than the raw `year` column.
REFERENCE_YEAR = 2021
df['Car_age'] = REFERENCE_YEAR - df['year']

In [None]:
# Confirm that the Car_age column has been added.
df.head()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

## Data Cleaning

In [None]:
# Columns that are not required for modelling are left out of the feature table.
dropped_columns = ['name', 'year', 'torque']
Features = df.drop(dropped_columns, axis = 'columns')

In [None]:
# Check the remaining columns after dropping name, year and torque.
Features.head()

In [None]:
# Correct the data types of features that are stored as text with a unit suffix.
# Each entry maps a column to (regex matching the unit text, numeric dtype to cast to).
unit_specs = {
    'mileage': ('kmpl|km/kg', 'float'),
    'engine': ('CC', 'int'),
    'max_power': ('bhp', 'float'),
}

for column, (unit_pattern, numeric_dtype) in unit_specs.items():
    # Remove the unit text, then cast what is left to a number.
    stripped = Features[column].replace(to_replace = unit_pattern, value = '', regex = True)
    Features[column] = stripped.astype(numeric_dtype)

Features.dtypes

## Data Visualization

In [None]:
# Histogram of every numeric feature, drawn on one large grid of subplots.
histograms = Features.hist(figsize = (20, 15), bins = 50)

In [None]:
# Share of each category for the four categorical features, one pie per panel.
figure, ((ax1, ax2),(ax3, ax4)) = plt.subplots(2,2, figsize = (14,14))

# Pair every panel with the column it shows. Drawing in a loop removes the
# copy-pasted stanzas and lets each pie carry a title; without titles the
# four charts cannot be told apart.
pie_panels = [(ax1, 'fuel'), (ax2, 'seller_type'), (ax3, 'transmission'), (ax4, 'owner')]

for axis, column in pie_panels:
    counts = Features[column].value_counts()  # computed once per column
    labels = counts.index.tolist()
    values = counts.tolist()
    axis.pie(values, labels = labels, autopct = '%1.1f%%')
    axis.set_title(column)

figure.tight_layout()

## Handling Categorical Variables

In [None]:
# One-hot encode the categorical columns. drop_first removes the first level
# of each, so k categories become k-1 indicator columns.
Features = pd.get_dummies(data = Features, drop_first = True)

In [None]:
# Pairwise correlations between all columns of the encoded feature table.
corr = Features.corr()
corr_features = corr.index
plt.figure(figsize = (20,20))

# `corr` already covers exactly the columns in `corr_features`, so it is plotted
# directly instead of computing the same matrix a second time.
# Undefined correlations (NaN) are drawn as 0.
gg = sns.heatmap(corr.fillna(0.0), annot = True, cmap = 'RdYlGn')
gg.set_title('Feature correlation matrix');

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target (selling_price) from the predictors.
X = Features.drop('selling_price', axis = 1)
y = Features['selling_price']

# Hold out 30% of the rows for testing. A fixed random_state makes the split
# identical on every run, so scores stay comparable between runs.
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [None]:
# Sanity-check the sizes of the train and test partitions.
xtrain.shape, xtest.shape, ytrain.shape, ytest.shape

## Random Forest Regressor Model and Hyperparameter Tuning Using Randomized Search CV

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Base estimator handed to the hyperparameter search. It is seeded so that
# the fitted forests, and therefore the reported scores, are reproducible.
rf = RandomForestRegressor(random_state = 42)

In [None]:
# Candidate values from which the randomized hyperparameter search samples.

# Number of trees in the forest: 100, 200, ..., 1200.
n_estimators = [int(x) for x in np.linspace(100,1200,12)]

# Number of features considered at each split. 1.0 means "all features", which
# is what 'auto' meant for RandomForestRegressor. The string 'auto' itself was
# removed in scikit-learn 1.3 and now raises an error, so it is not used here.
max_features = [1.0, 'sqrt']

# Maximum depth of each tree: 5, 10, ..., 30.
max_depth = [int(x) for x in np.linspace(5,30,6)]

# Minimum number of samples required to split an internal node.
min_samples_split = [2,5,10,15,100]

# Minimum number of samples required at a leaf node.
min_samples_leaf = [1,2,5,10]

In [None]:
# Search space for RandomizedSearchCV: parameter name -> list of candidate values.
randomgrid = dict(
    n_estimators = n_estimators,
    max_features = max_features,
    max_depth = max_depth,
    min_samples_split = min_samples_split,
    min_samples_leaf = min_samples_leaf,
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Evaluate 10 randomly drawn hyperparameter combinations, each scored by
# 5-fold cross-validated R^2. random_state fixes which combinations are drawn,
# so the search can be repeated.
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = randomgrid, scoring = 'r2',
                               n_iter = 10, cv = 5, verbose = 2, n_jobs = 1, random_state = 42)

In [None]:
# Run the randomized hyperparameter search on the training partition.
rf_random.fit(xtrain, ytrain)

In [None]:
# Predict the target for the held-out test features with the fitted search object.
yhat = rf_random.predict(xtest)

In [None]:
from sklearn.metrics import r2_score

# R^2 of the predictions on the held-out test set.
r2_score(y_true = ytest, y_pred = yhat)

In [None]:
# Distribution of the residuals (actual - predicted) on the test set.
# sns.displot is a figure-level function that creates its own figure, so a
# preceding plt.figure() only leaves an empty figure behind. The size is set
# through height/aspect instead.
residuals = ytest - yhat
residual_grid = sns.displot(residuals, height = 10, aspect = 1)
residual_grid.set_axis_labels('Residual (actual - predicted selling price)', 'Count');

In [None]:
# Predicted against actual values on the test set; labelled so that the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(ytest, yhat)
ax.set(xlabel = 'Actual selling price', ylabel = 'Predicted selling price',
       title = 'Predicted vs. actual selling price (test set)');