In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for folder, _, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, name) for name in names]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import seaborn as sns

In [None]:
# Load the vehicle dataset from the Kaggle input directory into a DataFrame.
car_data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/vehicle-dataset-from-cardekho/car data.csv",
)
# Preview the first five rows.
car_data.head(n=5)

In [None]:
# Dimensions of the raw dataset as a (rows, columns) tuple.
car_data.shape

Looking at the data, we can see that the selling price is our dependent variable and that fuel_type, seller_type and transmission are categorical variables.

In [None]:
# Print one frequency table per column, with a blank line between tables.
count_columns = ["Fuel_Type", "Seller_Type", "Transmission", "Owner"]
print(*(car_data[name].value_counts() for name in count_columns), sep='\n\n')

Now check for missing values, if any.

In [None]:
# Count the missing values in each column (isna is the canonical name of isnull).
car_data.isna().sum()

In [None]:
# Summary statistics for the dataset; include="all" asks pandas to summarise
# every column rather than only the numeric ones.
car_data.describe(include="all")

In [None]:
# Keep every column except Car_Name.
col = car_data.columns.to_list()
col.remove('Car_Name')

# Take an explicit copy: car_dup is modified in later cells (a column is added
# and another dropped), and writing to a frame obtained by plain column
# selection makes pandas emit SettingWithCopyWarning.
car_dup = car_data[col].copy()
car_dup.head()

We can see that the year is of no use on its own, except for deriving a new column holding the number of years since manufacturing.

In [None]:
# Fixed reference year; Num_Years is measured relative to it.
current_year = 2021
# .assign() returns a new DataFrame with Num_Years appended as the last column,
# instead of writing a column into a frame that was derived from car_data
# (which triggers pandas' SettingWithCopyWarning).
car_dup = car_dup.assign(Num_Years=current_year - car_dup["Year"])
car_dup.head()

In [None]:
# Year has been replaced by Num_Years, so remove it. Rebinding the result
# (rather than inplace=True) avoids mutating a frame derived from car_data
# and keeps the operation explicit about which object it produces.
car_dup = car_dup.drop(columns='Year')
car_dup.head()

## One Hot Encoding to convert categorical data to numeric data

In [None]:
# One-hot encode the categorical columns. get_dummies returns a new DataFrame
# and leaves car_dup untouched; drop_first=True omits the first level of each
# categorical column.
final_data = pd.get_dummies(car_dup, drop_first=True)
final_data.head()

In [None]:
# Pairwise correlation between the columns of the encoded dataset.
final_data.corr()

In [None]:
# Plot the pairwise relationships of the actual observations. Passing
# final_data.corr() here would scatter-plot the correlation coefficients
# themselves (one point per column) instead of the data.
# Cast to float so the one-hot (possibly boolean) columns are plotted as numbers.
sns.pairplot(final_data.astype(float))

The pairplot gives a grid view of how pairs of variables relate to each other. Next, we'll use a heatmap to make the strength of the correlations easier to read.

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Compute the correlation matrix once and plot it directly; selecting the
# columns again and calling .corr() a second time yields the same matrix.
corrmat = final_data.corr()
plt.figure(figsize=(20, 20))
# Annotated heatmap: each cell shows the correlation coefficient of a column pair.
graph = sns.heatmap(corrmat, annot=True, cmap='RdYlGn')

In [None]:
# Preview the encoded dataset again before splitting it into features and target.
final_data.head()

Let's separate the dependent and independent variables now.

In [None]:
# Target y is the first column of final_data; features X are all remaining columns.
y, X = final_data.iloc[:, 0], final_data.iloc[:, 1:]

In [None]:
# Show the first rows of the features and of the target, separated by a blank line.
print(f"{X.head()}\n\n{y.head()}")

In [None]:
## Feature importance
from sklearn.ensemble import ExtraTreesRegressor
# Seed the estimator: extra-trees are randomized, so without a fixed
# random_state the feature importances change on every run.
model = ExtraTreesRegressor(random_state=42)
model.fit(X, y)

In [None]:
# Importance score of each feature, in the same order as the columns of X.
print(model.feature_importances_)

We can see that the first and fifth features (present price and fuel_type_diesel) have the highest importance.

Let's see this with a visualization.

In [None]:
# Label each importance score with its feature name, then draw the five
# largest as a horizontal bar chart.
feat_importance = pd.Series(data=model.feature_importances_, index=X.columns)
feat_importance.nlargest(n=5).plot.barh()
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and therefore every downstream metric and plot) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Shapes of the training and test feature sets, one per line.
print(X_train.shape, X_test.shape, sep='\n')

240 of our 301 records are used for training, and the remaining 61 records are used for testing.

In [None]:
# RandomForestRegressor is instantiated later, when the randomized search is
# set up. The estimator previously created here under the name rf_random was
# never used: that name is rebound to the RandomizedSearchCV object before
# anything reads it.
from sklearn.ensemble import RandomForestRegressor

Let's work with the hyperparameters and some other parameters of RandomForestRegressor.

In [None]:
import numpy as np
# Twelve evenly spaced candidate tree counts from 100 to 1200, as plain Python ints.
n_estimators = np.linspace(start=100, stop=1200, num=12).astype(int).tolist()
n_estimators

This list holds the candidate numbers of decision trees (n_estimators) that will be tried for the RandomForestRegressor.

In [None]:
# Randomized Search CV: candidate values for each hyperparameter

# Number of features considered at each split. 1.0 means "all features", which
# is what 'auto' meant for RandomForestRegressor; the 'auto' string was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it raises an error.
max_features = [1.0, 'sqrt']
# Max number of levels in a tree
max_depth = [int(x) for x in np.linspace(5, 30, num=6)]
# Min number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Min number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

In [None]:
from sklearn.model_selection import RandomizedSearchCV

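RandomizedSearchCV helps us find the best parameters for our data. It is also faster than GridSearchCV, because it evaluates only a fixed number of sampled parameter combinations instead of every combination.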

In [None]:
# Search space for the randomized search: hyperparameter name -> candidate values.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)
print(random_grid)

In [None]:
# Seed the base estimator as well as the search: random_state on
# RandomizedSearchCV only fixes which parameter combinations are sampled,
# while the forests themselves stay random unless the estimator is seeded.
rf = RandomForestRegressor(random_state=42)
# Try 10 sampled parameter combinations with 5-fold cross-validation, scored by
# negative mean squared error. verbose=2 prints progress for each fit.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    scoring='neg_mean_squared_error',
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=1,
)

RandomizedSearchCV takes the parameter grid we specified, samples combinations from it, and automatically selects the best-performing combination for us.

In [None]:
# Run the randomized hyperparameter search on the training data.
rf_random.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test features using the fitted search object.
y_pred = rf_random.predict(X_test)
y_pred

To compare the predictions with the actual values, we plot the distribution of the residuals (y_test - y_pred).

In [None]:
# Distribution of the residuals (actual minus predicted).
# sns.distplot is deprecated; histplot with kde=True and stat="density" draws
# the equivalent density-normalised histogram with a KDE curve.
ax = sns.histplot(y_test - y_pred, kde=True, stat="density")
ax.set(xlabel="Residual (y_test - y_pred)", title="Distribution of residuals");

Since the residuals look approximately normally distributed around zero, the model's errors appear to be small and unbiased, which indicates that the model is giving good results.

In [None]:
# Actual vs. predicted values on the test set.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
# Dashed y = x reference line: points on it are perfect predictions.
lims = [min(y_test.min(), y_pred.min()), max(y_test.max(), y_pred.max())]
ax.plot(lims, lims, linestyle='--', color='grey')
ax.set(xlabel="Actual (y_test)", ylabel="Predicted (y_pred)", title="Actual vs. predicted")
plt.show()

The points lie close to a straight line, which shows that the predictions agree well with the actual values.