In [None]:
#import statements
import numpy as np
import pandas as pd
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load dataset: read the CarDekho vehicle CSV into a DataFrame and preview it
data_path = "../input/vehicle-dataset-from-cardekho/car data.csv"
car_df = pd.read_csv(data_path)
car_df.head()

In [None]:
# Count missing entries per column (isna is the same check as isnull)
car_df.isna().sum()

In [None]:
# Show the distinct values of each categorical-looking column
categorical_columns = ["Fuel_Type", "Seller_Type", "Transmission", "Owner"]
for column_name in categorical_columns:
    print(f"Categories of {column_name} column : {car_df[column_name].unique()}")

In [None]:
# List the column labels of the raw frame; the next cell selects a subset of these by name
car_df.columns

In [None]:
# Keep only the columns used for modelling.
# .copy() makes final_df an independent frame instead of a slice of car_df,
# so the column assignments in later cells do not raise SettingWithCopyWarning
# and can never write through to (or silently miss) the original data.
final_df = car_df[['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
       'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']].copy()
final_df.head()

In [None]:
# Add a constant 'Current Year' column used to derive the vehicle age.
# assign() returns a new frame rather than writing into what may be a slice of
# car_df, which avoids SettingWithCopyWarning.
# NOTE: the value comes from the system clock, so it depends on when the notebook is run.
final_df = final_df.assign(**{'Current Year': datetime.now().year})
final_df.head()

In [None]:
# Derive the vehicle age (NoOfYears = Current Year - Year), then drop the two
# helper columns. Built as a chain that returns a new frame: no item assignment
# on a possible slice and no inplace mutation, so no SettingWithCopyWarning.
final_df = (
    final_df
    .assign(NoOfYears=lambda frame: frame['Current Year'] - frame['Year'])
    .drop(columns=['Year', 'Current Year'])
)

In [None]:
# Preview the frame after the age step: 'NoOfYears' added, 'Year' and 'Current Year' dropped
final_df.head()

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each encoded column so the dummies are not redundant
final_df = pd.get_dummies(data=final_df, drop_first=True)
final_df.head()

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap
correlations = final_df.corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=ax)

In [None]:
# Select the target by name instead of by position, so a change in column
# order cannot silently swap the target for a feature.
X = final_df.drop(columns='Selling_Price')  # independent features
y = final_df['Selling_Price']               # dependent feature (target)

In [None]:
#Checking feature importance
from sklearn.ensemble import ExtraTreesRegressor
# Fit an extra-trees ensemble on the full data only to rank the features.
# random_state is fixed (same seed as the randomized search below) so the
# importances are reproducible across runs.
model = ExtraTreesRegressor(random_state=52)
model.fit(X,y)
# Label each importance with its feature name and draw them as a bar chart
feat_imp = pd.Series(model.feature_importances_, index=X.columns)
feat_imp.plot(kind='bar')

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state is fixed so the split
# (and every metric/plot derived from it) is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=52)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Base estimator for the hyper-parameter search; seeded so that each candidate
# forest is built deterministically and the search result is reproducible.
rf = RandomForestRegressor(random_state=52)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Candidate values sampled by the randomized search.

# No. of trees in Random Forest: 100, 200, ..., 1200
n_estimators = [int(x) for x in np.linspace(start=100, stop=1200, num=12)]
# No. of features to consider at every split.
# 'auto' was removed for forest regressors in scikit-learn 1.3 and now makes the
# fit raise; 1.0 (use all features) is what 'auto' meant for regressors.
max_features = [1.0, 'sqrt']
# Max no. of levels in a tree: 5, 10, ..., 30
max_depth = [int(x) for x in np.linspace(5, 30, num=6)]
# Min no. of samples required to split the node
min_samples_split = [2, 5, 10, 15, 100]
# Min no. of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

In [None]:
# Search space for RandomizedSearchCV: estimator parameter name -> candidate values
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

In [None]:
# Randomized hyper-parameter search over random_grid:
# 10 sampled configurations, each scored by 5-fold CV on negative MSE.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    scoring='neg_mean_squared_error',
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=52,
    n_jobs=1,
)

In [None]:
# Run the randomized search on the training split only (n_iter=10 candidates x cv=5 folds,
# as configured when rf_random was built); the test split is untouched here.
rf_random.fit(X_train, y_train)

In [None]:
# Predict on the held-out rows and plot the distribution of the residuals
# (actual minus predicted) with a KDE overlay
predictions = rf_random.predict(X_test)
residuals = y_test - predictions
sns.histplot(residuals, kde=True)

In [None]:
# Actual (x) versus predicted (y) values on the held-out rows
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(y_test, predictions)