In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found anywhere beneath the Kaggle input directory.
for path in (os.path.join(folder, name)
             for folder, _, files in os.walk('/kaggle/input')
             for name in files):
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Dependencies

In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

from sklearn import metrics

In [None]:
# Load the car data CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/vehicle-dataset-from-cardekho/car data.csv')

# EDA

In [None]:
df.head()

In [None]:
# Report the (rows, columns) shape of the loaded frame.
print(f'Shape of the data: {df.shape}')

## Unique Values

As we can see, `Fuel_Type`, `Seller_Type`, `Transmission` and `Owner` are categorical features, so let's print their unique values.

In [None]:
# Show the distinct values of each categorical column, one line per column.
for label, column in (
    ('Seller', 'Seller_Type'),
    ('Fuel', 'Fuel_Type'),
    ('Transmission', 'Transmission'),
    ('Owner', 'Owner'),
):
    print(f'Unique {label} Type:', df[column].unique())

## Missing Values

In [None]:
df.isnull().sum()

Great, there are no missing values.

In [None]:
df.describe()

## Seller Type

In [None]:
# Mean selling price per seller type. x and y are passed by keyword because
# seaborn >= 0.12 accepts only `data` as a positional argument.
sns.barplot(x='Seller_Type', y='Selling_Price', data=df, palette='twilight')

## Transmission

In [None]:
# Mean selling price per transmission type. x and y are passed by keyword because
# seaborn >= 0.12 accepts only `data` as a positional argument.
sns.barplot(x='Transmission', y='Selling_Price', data=df, palette='spring')

## Fuel Type

In [None]:
# Mean selling price per fuel type. x and y are passed by keyword because
# seaborn >= 0.12 accepts only `data` as a positional argument.
sns.barplot(x='Fuel_Type', y='Selling_Price', data=df, palette='summer')

## Present Price

In [None]:
# Scatter plus linear fit of Present_Price (y) against Selling_Price (x). x and y are
# passed by keyword because seaborn >= 0.12 accepts only `data` positionally.
sns.regplot(x='Selling_Price', y='Present_Price', data=df)

## Kms Driven

In [None]:
# Scatter plus linear fit of Kms_Driven (y) against Selling_Price (x). x and y are
# passed by keyword because seaborn >= 0.12 accepts only `data` positionally.
sns.regplot(x='Selling_Price', y='Kms_Driven', data=df)

## Owner

In [None]:
# Mean selling price per Owner value. x and y are passed by keyword because
# seaborn >= 0.12 accepts only `data` as a positional argument.
sns.barplot(x='Owner', y='Selling_Price', data=df, palette='ocean')

## Car Age

In [None]:
# Reference year against which each car's age is measured.
Current_Year = 2020

# Age in years: the reference year minus the `Year` column.
df['Total_Years'] = df['Year'].rsub(Current_Year)

In [None]:
# Mean selling price per car age, on a wider canvas so every age value fits.
plt.figure(figsize=(10,5))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.barplot(x='Total_Years', y='Selling_Price', data=df)

Let's drop the Car Name feature, as it is not going to be helpful for our model.

In [None]:
# Keep only the columns used for modelling; every column not listed here is left out.
# The explicit .copy() makes final_dataset an independent frame, so the column
# assignments and drops performed on it later do not raise SettingWithCopyWarning.
final_dataset = df[['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
                    'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']].copy()

We need to engineer a feature from `Year`. Let's compute the age of each vehicle by subtracting `Year` from a new `Current_Year` feature that we will create, and name the result `Total_Years`.

In [None]:
# Build the vehicle-age feature. .assign returns a new frame instead of writing
# into what may be a slice of df, which avoids SettingWithCopyWarning. The lambda
# is evaluated after Current_Year has been added, so it can reference that column.
final_dataset = final_dataset.assign(
    Current_Year=2020,
    Total_Years=lambda frame: frame['Current_Year'] - frame['Year'],
)

final_dataset.head()


Now let's drop the `Year` and `Current_Year` features, as our new feature `Total_Years` already carries the same information.

In [None]:
# Remove the two helper columns. Re-binding the result (rather than inplace=True)
# avoids mutating what may be a slice of df, and errors='ignore' keeps the cell
# safe to re-run after the columns have already been dropped.
final_dataset = final_dataset.drop(columns=['Year', 'Current_Year'], errors='ignore')

final_dataset.head()

## Categorical Features

Moving on, we will now one-hot encode the categorical features.

In [None]:
# One-hot encode the categorical columns. drop_first=True drops one level per
# feature to avoid the dummy variable trap. dtype=int pins the dummy columns to
# 0/1 integers; pandas >= 2.0 would otherwise return booleans.
final_dataset = pd.get_dummies(final_dataset, drop_first=True, dtype=int)

In [None]:
final_dataset.head()

In [None]:
final_dataset.corr()

In [None]:
sns.pairplot(final_dataset)

In [None]:
# Correlations between the numeric columns of df. Selecting numeric dtypes first
# is needed on pandas >= 2.0, where DataFrame.corr() raises on string columns
# instead of silently skipping them.
corrmat = df.select_dtypes(include='number').corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20,20))

# plot heat map, reusing the matrix computed above instead of recomputing it
g = sns.heatmap(corrmat, annot=True, cmap='RdYlGn')

In [None]:
# Separate features and target by column name rather than by position, so the
# split does not depend on Selling_Price happening to be the first column.
X = final_dataset.drop(columns='Selling_Price') # independent features
y = final_dataset['Selling_Price'] # dependent feature

In [None]:
X.head()

In [None]:
y.head()

## Feature Importance

In [None]:
# Fit an extra-trees ensemble on the full data to obtain feature importances.
# A fixed random_state makes the importances reproducible between runs.
model = ExtraTreesRegressor(random_state=42)

model.fit(X,y)

In [None]:
print(model.feature_importances_)

In [None]:
# Label each importance with its feature name, then chart the five largest.
feature_imp = pd.Series(data=model.feature_importances_, index=X.columns)

top_five = feature_imp.nlargest(5)
top_five.plot.barh()
plt.show()

# Model Training

In [None]:
# Hold out 20% of the rows for testing. The seed fixes which rows land in each
# split, so the reported metrics are reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Untuned forest with default hyperparameters (seeded for reproducibility).
rf_reg = RandomForestRegressor(random_state=42)

## Hyperparameter Tuning

In [None]:
# Number of trees in random forest: 100, 200, ..., 1200
n_estimators = list(range(100, 1201, 100))

# Number of features to consider at every split.
# 1.0 (all features) replaces 'auto', which meant the same thing for regressors
# but was deprecated in scikit-learn 1.1 and removed in 1.3.
max_features = [1.0, 'sqrt']

# Maximum number of levels in tree: 5, 10, ..., 30
max_depth = list(range(5, 31, 5))
# max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

### Create Random Grid

In [None]:
# Bundle the candidate values into a single search-space mapping keyed by
# hyperparameter name.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

print(random_grid)

In [None]:
# Base estimator for the hyperparameter search; seeded so each candidate
# configuration is fitted reproducibly.
rf = RandomForestRegressor(random_state=42)

In [None]:
# Random search of parameters, using 5-fold cross-validation (cv=5).
# Samples 10 different combinations from random_grid (n_iter=10) and ranks them
# by negative mean squared error, so a score closer to zero is better.

rf_random = RandomizedSearchCV(estimator=rf, 
                               param_distributions=random_grid, 
                               scoring='neg_mean_squared_error', 
                               n_iter=10, cv=5, verbose=2, 
                               random_state=42, n_jobs=1)

In [None]:
rf_random.fit(X_train, y_train)

In [None]:
rf_random.best_params_

In [None]:
rf_random.best_score_

In [None]:
predictions = rf_random.predict(X_test)

In [None]:
# Distribution of the residuals (actual minus predicted). histplot with a KDE
# overlay on a density scale replaces distplot, which seaborn has deprecated.
sns.histplot(y_test - predictions, kde=True, stat='density')

In [None]:
# Actual (x) versus predicted (y) selling price. x and y are passed by keyword
# because seaborn >= 0.12 accepts only `data` as a positional argument.
sns.scatterplot(x=y_test, y=predictions)

In [None]:
print('R2 Score: ', metrics.r2_score(y_test,predictions))

# Model Evaluation

In [None]:
# Compute the error metrics on the test split once, then report them; the RMSE
# is the square root of the already-computed MSE.
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)

print('MAE: ', mae)
print('MSE: ', mse)
print('RMSE: ', np.sqrt(mse))