In [None]:
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

#core imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

### Read the data and explore it

In [None]:
# Load the CSV from the input folder into a DataFrame and preview its first rows.
vehicle=pd.read_csv('../input/vehicle-dataset-from-cardekho/car data.csv')
vehicle.head()

In [None]:
## Check how many records (rows) and features (columns) are in the dataset.
vehicle.shape

In [None]:
# Summarise the columns: names, dtypes and non-null counts.
vehicle.info()

In [None]:
## Count the missing values in each column.
vehicle.isnull().sum()

### As there are no missing values, it's good to progress to next steps

In [None]:
# Show the distinct values taken by each categorical feature.
for column in ('Fuel_Type', 'Seller_Type', 'Transmission', 'Owner'):
    print(vehicle[column].unique())

#### Here, we will drop 'Car_Name' column from our dataset as it is not required to predict the selling price of a car.

In [None]:
# Work on a copy of the raw frame without the 'Car_Name' column.
car_dataset = vehicle.drop(columns=['Car_Name'])

In [None]:
# Preview the frame after removing 'Car_Name'.
car_dataset.head()

In [None]:
# List the remaining column labels.
car_dataset.columns

#### We create a new feature 'Current_Year' set to 2021 and use it to compute 'Total_Years', so that we can see how old each vehicle is.

#### As a vehicle gets older, its selling price generally decreases.

In [None]:
# Add a constant reference-year column (2021) used to compute vehicle age.
car_dataset = car_dataset.assign(Current_Year=2021)

In [None]:
# Preview the frame with the new 'Current_Year' column.
car_dataset.head()

In [None]:
# Vehicle age in years: the reference year minus the value stored in 'Year'.
car_dataset = car_dataset.assign(
    Total_Years=car_dataset['Current_Year'].sub(car_dataset['Year'])
)

In [None]:
# Preview the frame with the derived 'Total_Years' column.
car_dataset.head()

#### We now drop 'Year' and 'Current_Year'. 'Year' is the purchase year; the selling-price prediction does not need 'Year' or 'Current_Year' themselves, only 'Total_Years', which we have already calculated.

In [None]:
# Remove the two helper columns now that 'Total_Years' has been derived from them.
car_dataset = car_dataset.drop(columns=['Year', 'Current_Year'])

In [None]:
# Preview the frame after dropping 'Year' and 'Current_Year'.
car_dataset.head()

#### We convert the categorical features to one-hot encoded values, which is practical because we only have a small number of features, and drop the first category of each to avoid the dummy variable trap.

In [None]:
# One-hot encode the frame; drop_first=True omits the first level of each
# encoded feature so the dummy columns are not perfectly collinear.
car_dataset=pd.get_dummies(car_dataset,drop_first=True)

In [None]:
# Preview the encoded frame.
car_dataset.head()

In [None]:
## Pairwise scatter plots of every column; this view did not reveal much information.
sns.pairplot(car_dataset)

#### Let's look at the correlations between features with the help of a heatmap

In [None]:
## Despite its name, 'cor_mat' does not hold the correlation matrix itself: it
## stores the row labels (an Index of column names) of car_dataset.corr(),
## which the heatmap cell uses to select columns.
cor_mat=car_dataset.corr().index

In [None]:
# Annotated heatmap of the pairwise correlations among the columns in cor_mat.
correlations = car_dataset[cor_mat].corr()
sns.heatmap(correlations, annot=True, cmap='RdYlGn')

#### Here, the Petrol and Diesel dummy columns of Fuel_Type are negatively correlated (red box)

In [None]:
# Re-check the column order before splitting into features and target.
car_dataset.head()

In [None]:
## Independent features (X) and dependent feature (y)
# NOTE(review): the split is positional — it assumes the target is the first
# column of car_dataset and that every remaining column is a feature. Confirm
# against the car_dataset.head() output before relying on it.
X=car_dataset.iloc[:,1:]
y=car_dataset.iloc[:,0]

In [None]:
# Preview the feature matrix.
X.head()

In [None]:
# Preview the target values.
y.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The seed is fixed (matching the
# random_state used by the hyperparameter search) so that the split, and every
# metric and plot derived from it, is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Rows and columns in the training split.
X_train.shape

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Base estimator for the hyperparameter search. A fixed random_state makes the
# bootstrap sampling and feature sub-sampling repeatable, so the search results
# do not change between runs.
rf = RandomForestRegressor(random_state=42)

In [None]:
## Hyperparameters: candidate tree counts, 12 evenly spaced values from 100 to 1200.
n_estimators = np.linspace(start=100, stop=1200, num=12).astype(int).tolist()
print(n_estimators)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Candidate values for the randomized hyperparameter search.

# Number of trees in the random forest.
n_estimators = [int(x) for x in np.linspace(start=100, stop=1200, num=12)]
# Number of features to consider at every split.
# 1.0 means "all features", which is what 'auto' meant for a random forest
# regressor. 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where
# it makes every candidate using it fail to fit; 1.0 behaves the same way on
# old and new versions.
max_features = [1.0, 'sqrt']
# Maximum number of levels in a tree.
max_depth = [int(x) for x in np.linspace(5, 30, num=6)]
# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 5, 10]

In [None]:
# Collect the candidate lists into the parameter grid sampled by the search.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

print(random_grid)

In [None]:
# Randomized search over random_grid: 10 sampled parameter combinations, each
# evaluated with 5-fold cross validation and scored by negated mean squared error.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    scoring='neg_mean_squared_error',
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=1,
)

In [None]:
# Run the search on the training split.
rf_random.fit(X_train,y_train)

In [None]:
# Parameter combination that scored best during the search.
rf_random.best_params_

In [None]:
# Best mean cross-validated score. The search is scored with
# 'neg_mean_squared_error', so this is a negated MSE: closer to 0 is better.
rf_random.best_score_

In [None]:
# Predict on the held-out test features using the fitted search object.
predictions=rf_random.predict(X_test)

In [None]:
# Distribution of the residuals (actual minus predicted) on the test split.
# Without an explicit label seaborn is expected to label the x-axis with the
# name of the target Series, which would mislabel residuals as target values,
# so the axes and title are set explicitly.
residuals = y_test - predictions
sns.histplot(residuals, kde=True)
plt.xlabel('Residual (y_test - predictions)')
plt.ylabel('Count')
plt.title('Residual distribution on the test split')
plt.show()

In [None]:
# Predicted versus actual values on the test split. The axes are labelled so the
# figure can be read on its own: points near the diagonal are good predictions.
plt.scatter(y_test, predictions)
plt.xlabel('Actual value (y_test)')
plt.ylabel('Predicted value')
plt.title('Predicted vs. actual on the test split')
plt.show()