# Car Resale Value Prediction

## Kaggle Vehicle Dataset
<b> [Link to Dataset](https://www.kaggle.com/nehalbirla/vehicle-dataset-from-cardekho?select=CAR+DETAILS+FROM+CAR+DEKHO.csv) </b>

In [None]:
# Importing required libraries
import pickle
import datetime

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV

from sklearn.ensemble import RandomForestRegressor

In [None]:
# Importing the dataset from the CSV file (path is relative to the working directory)
df = pd.read_csv('car_data.csv')
# Previewing the first five rows
df.head()

In [None]:
# Checking the dataframe shape as (number of rows, number of columns)
df.shape

In [None]:
# Exploring summary statistics (count, mean, std, min, quartiles, max) of the dataframe
df.describe()

In [None]:
# Checking for any null values by counting the missing entries in each column
df.isnull().sum()

In [None]:
# Listing the distinct values of every categorical feature
categorical_labels = {
    'Fuel_Type': 'Types of fuel:',
    'Seller_Type': 'Types of seller:',
    'Transmission': 'Types of transmission:',
    'Owner': 'Types of owner:',
}
for column_name, label in categorical_labels.items():
    print(label, df[column_name].unique())

In [None]:
# Removing the 'Car_Name' column, which is not kept as a feature
df = df.drop(columns = ['Car_Name'])
df.head()

In [None]:
# Creating new feature - 'Num_Years' (current year - manufacture year) to represent the age of the car
# NOTE(review): the age is measured from the year in which the notebook is run, so the values shift from one year to the next
current_year = datetime.datetime.now().year
df['Num_Years'] = current_year - df['Year']
df.head()

In [None]:
# Removing 'Year' (car manufacture year) from the dataframe
df = df.drop(columns = ['Year'])
df.head()

In [None]:
# Converting categorical features into dummy (one-hot) variables;
# drop_first = True omits the first level of each feature so the dummy columns are not redundant
df = pd.get_dummies(df, drop_first = True)
df.head()

In [None]:
# Getting the pairwise correlation matrix for the columns of the dataset
df.corr()

In [None]:
# Adding a temporary 'Resale_Percentage' column (selling price as a percentage
# of present price, rounded to two decimals) to help understand the data
resale_ratio = df['Selling_Price'] / df['Present_Price'] * 100
df['Resale_Percentage'] = resale_ratio.round(2)

In [None]:
# Drawing pairwise scatter plots for the selected numeric columns
pairplot_columns = ['Selling_Price', 'Present_Price', 'Kms_Driven', 'Num_Years', 'Resale_Percentage']
sns.pairplot(df[pairplot_columns])

In [None]:
# Drawing an annotated heatmap of the correlation matrix on a large canvas
correlation_matrix = df.corr()
plt.figure(figsize = (20, 20))
sns.heatmap(correlation_matrix, annot = True, cmap = 'RdBu')

In [None]:
# Removing the exploratory 'Resale_Percentage' column again
df = df.drop(columns = ['Resale_Percentage'])

In [None]:
# Separating the dependent feature (first column) from the independent features (all remaining columns)
# NOTE(review): the selection is positional - presumably column 0 is 'Selling_Price'; confirm against the CSV layout
y = df.iloc[:, 0]
X = df.iloc[:, 1:]

In [None]:
# Previewing the first rows of the independent features
X.head()

In [None]:
# Previewing the first values of the dependent feature
y.head()

In [None]:
# Fitting an extra-trees regressor on the full data to obtain feature importances
model = ExtraTreesRegressor().fit(X, y)
print(model.feature_importances_)

In [None]:
# Horizontal bar chart of the feature importances, labelled by feature name
feature_imp = pd.Series(model.feature_importances_, index = X.columns)
feature_imp.plot.barh()
plt.show()

In [None]:
# Removing the 'Owner' feature because its importance is very low
df = df.drop(columns = ['Owner'])

In [None]:
# Re-deriving the dependent and independent features from the updated dataframe
y = df.iloc[:, 0]
X = df.iloc[:, 1:]

# Refitting the extra-trees regressor and printing the new feature importances
model = ExtraTreesRegressor().fit(X, y)
print(model.feature_importances_)

# Horizontal bar chart of the new feature importances, labelled by feature name
feature_imp = pd.Series(model.feature_importances_, index = X.columns)
feature_imp.plot.barh()
plt.show()

In [None]:
# Splitting dataset into training set (80%) and test set (20%).
# test_size (rather than train_size) is set to 0.2 so the model is trained on the larger share of the data;
# random_state pins the shuffle so the same rows land in each set on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
# Fitting a StandardScaler on the training features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
# NOTE(review): the scaler is only fitted here; no transform is applied in this cell, so the preview below shows the unscaled X_train
X_train.head()

In [None]:
# Creating a baseline Random Forest Regressor model.
# random_state is fixed so that the forest grown from a given training set,
# and therefore the metrics printed below, are the same on every run
random_forest_model = RandomForestRegressor(random_state = 42)
random_forest_model.fit(X_train, y_train)
y_pred = random_forest_model.predict(X_test)

# Performance metrics on the test set
print('R2 score: ', r2_score(y_test, y_pred))
print('Mean Squared Error: ', mean_squared_error(y_test, y_pred))

In [None]:
# Building the hyperparameter search space used by RandomizedSearchCV

# Number of trees in random forest: 100, 200, ..., 1200
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Number of features to consider at every split.
# 1.0 (use every feature) replaces the former 'auto' option: for regressors it meant
# the same thing, but it was deprecated in scikit-learn 1.1 and removed in 1.3
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree: 5, 10, ..., 30
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

# Mapping of RandomForestRegressor parameter names to the candidate values to sample from
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

In [None]:
# Running a randomized hyperparameter search: 10 sampled parameter combinations,
# each evaluated with 5-fold cross-validation and scored by negative mean squared error.
# random_state is fixed so the same combinations are sampled on every run
random_forest_model_random = RandomizedSearchCV(estimator = random_forest_model,
                                                param_distributions = random_grid,
                                                scoring = 'neg_mean_squared_error',
                                                n_iter = 10, cv = 5, verbose = 1, n_jobs = 1,
                                                random_state = 42)
random_forest_model_random.fit(X_train, y_train)
# Predicting on the test set with the fitted search object
y_pred = random_forest_model_random.predict(X_test)

In [None]:
# Printing the best parameters and best score found by the search
# (the search is scored with 'neg_mean_squared_error', so the score is a negated MSE)
best_params = random_forest_model_random.best_params_
best_score = random_forest_model_random.best_score_
print('Best Parameter:', best_params)
print('Best Score:', best_score)

In [None]:
# Plotting the kernel density estimate of the difference between y_test and y_pred
residuals = y_test - y_pred
sns.displot(residuals, kind = 'kde')
plt.xlabel('Selling Price (y_test - y_pred)')
plt.show()

In [None]:
# Plotting scatterplot of true values (x-axis) against predicted values (y-axis)
plt.scatter(y_test, y_pred)
plt.title('True Value vs Predicted Value')
# Axis labels follow the argument order of plt.scatter: the first argument is x, the second is y
plt.xlabel('y_test')
plt.ylabel('y_pred')
plt.show()

In [None]:
# Exporting the model (the fitted RandomizedSearchCV object) to a pickle file

# Open file in desired location/directory; the with-block closes the handle
# even if pickling fails, so the file is not left open
with open('model.pkl', 'wb') as file:
    # Dump information to that file
    pickle.dump(random_forest_model_random, file)