# 1. Importing Libraries and Dataset

In [None]:
# Importing Libraries for ML

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

import warnings
# NOTE(review): with no category argument this suppresses every warning for the
# rest of the session, including pandas and scikit-learn deprecation or
# fit-failure warnings raised by later cells. Consider narrowing the filter.
warnings.filterwarnings("ignore")

# Load the raw data set into `df` and preview its first rows.

DATA_PATH = 'car_data.csv'

df = pd.read_csv(DATA_PATH)
df.head(n=3)


# 2. Data Preprocessing

In [None]:
# Dimensions of the loaded data as (rows, columns).
df.shape

In [None]:
# Number of missing values in each column.
df.isnull().sum()

### The model cannot make use of the car name the way it can use numerical data, hence we drop that column.

In [None]:
# List the available column names before choosing which ones to keep.
df.columns

In [None]:
# Keep only the columns used for modelling; every column not listed here is
# discarded. `.copy()` makes the result an independent frame, so the column
# assignments and drops in later cells do not operate on a view of the raw
# data (which would trigger SettingWithCopyWarning).
df = df[['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
         'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']].copy()

# Preview a few rows instead of rendering the whole frame.
df.head(3)

### The manufacturing year is converted into an "Age" column (the age of the car in years), a feature that affects the car price inversely.

In [None]:
from datetime import date

# Age is measured relative to the year in which the notebook is executed, so
# the values shift by one each calendar year.
# NOTE(review): pin this to a constant if results must be reproducible across years.
current_year = date.today().year

# Derive Age and remove Year in one non-mutating step: no temporary
# "Current_year" column and no in-place edits of the frame.
df = df.assign(Age=current_year - df["Year"]).drop(columns=["Year"])

df.head(3)

###  OneHotEncoding for categorical features 
We use `pd.get_dummies()`, which automatically converts all the categorical features to encoded (dummy) columns.

Use `drop_first=True` to avoid the dummy variable trap.

In [None]:
# Show the distinct values of each candidate categorical column, one line per column.
for column_name in ("Seller_Type", "Transmission", "Owner", "Fuel_Type"):
    print(df[column_name].unique())

In [None]:
# One-hot encode the categorical columns; dropping the first level of each
# avoids perfectly collinear dummy columns.
df = pd.get_dummies(data=df, drop_first=True)
df.head(n=3)

### Finding the Correlation between the features

- Here green indicates high correlation and red indicates low correlation.
- Features which are highly correlated with other features (except the target) play a similar role, and one of them can be dropped.

In [None]:
# Pairwise relationship plots across the columns of df.
sns.pairplot(df)

In [None]:
# Annotated correlation matrix drawn on a large, explicitly created axes.
fig, ax = plt.subplots(figsize=(20, 20))
correlations = df.corr()
sns.heatmap(correlations, annot=True, cmap="RdYlGn", ax=ax)

#### No features are highly correlated with each other, hence we can keep them as they are.

### Feature Selection using ExtraTreesRegressor() 
We find out which features are highly important and which are not so important; here the Owner feature shows the least importance, hence we drop it.

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Select the target by name rather than by position, so a change in column
# order cannot silently swap the target for a feature.
x = df.drop(columns=["Selling_Price"])
y = df["Selling_Price"]

# A fixed seed keeps the importance ranking (and the decision based on it)
# identical from run to run.
model = ExtraTreesRegressor(random_state=0)
model.fit(x, y)

# Importance per feature, labelled with the column names and sorted so the
# most important feature comes first.
feature_scores_df = pd.Series(model.feature_importances_, index=x.columns)
feature_scores_df = feature_scores_df.sort_values(ascending=False)
feature_scores_df.plot(kind="barh")

In [None]:
# Display the per-feature importance scores as a table.
feature_scores_df

#### Here we can drop the Owner feature as it is not playing much of a role

In [None]:
# Remove the low-importance Owner column. Reassigning (instead of inplace=True)
# together with errors="ignore" makes the cell safe to re-run: a second
# execution is a no-op rather than a KeyError.
df = df.drop(columns=["Owner"], errors="ignore")

----------------------------------------------------

---------------------------------------------------
# 2. Splitting Data into (Features, Target) and (Train, Test) 

In [None]:
# Features are every column except the target; the target is chosen by name so
# the split does not depend on Selling_Price being the first column.
x = df.drop(columns=["Selling_Price"])
y = df["Selling_Price"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

------------------------------------------------------------
------------------------------------------------------------
# 3. Random Forest Regressor model for Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Base estimator with default hyperparameters. A fixed seed makes any fit of
# this estimator reproducible.
rf = RandomForestRegressor(random_state=42)

### Hyperparameter Tuning

First we perform RandomizedSearchCV, and then, based on its output, we perform GridSearchCV.

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

#### Parameter grid for RandomizedSearchCV

In [None]:
# Search space for RandomizedSearchCV.
# - 'auto' is intentionally absent from max_features: current scikit-learn
#   releases reject it for RandomForestRegressor, and for regressors it meant
#   "use all features", which None already covers.
# - range() yields exact Python ints, avoiding float-to-int truncation of
#   linspace values.
para_grid = {'max_depth': list(range(5, 31, 5)),            # 5, 10, ..., 30
             'max_features': ['sqrt', 'log2', None],
             'min_samples_leaf': [1, 2, 5, 10],
             'min_samples_split': [2, 5, 10, 15, 100],
             'n_estimators': list(range(100, 1300, 100))    # 100, 200, ..., 1200
            }
para_grid

#### RandomizedSearchCV

In [None]:
# Seed the forest as well as the search: otherwise each candidate's CV score
# varies between runs and the "best" parameters are not reproducible.
rf = RandomForestRegressor(random_state=42)

# Evaluate 10 randomly sampled parameter combinations with 5-fold CV, scored
# by negative mean squared error.
rf_cv = RandomizedSearchCV(estimator=rf,
                           param_distributions=para_grid,
                           scoring='neg_mean_squared_error',
                           n_iter=10,
                           cv=5,
                           verbose=2,
                           random_state=42)
rf_cv.fit(x_train, y_train)

In [None]:
# Best parameter combination found by the search fitted above; the grid for
# GridSearchCV is built around these values.
para = rf_cv.best_params_
para

#### Parameter grid for GridSearchCV

In [None]:
def _candidates_around(center, offsets, lower_bound):
    """Return the sorted, de-duplicated values ``center + offset`` that are >= ``lower_bound``.

    Offsets that would push a hyperparameter below its smallest legal value
    (e.g. ``min_samples_split`` below 2 or ``n_estimators`` below 1) are
    dropped, so the grid never contains candidates the estimator rejects.
    """
    return sorted({center + offset for offset in offsets if center + offset >= lower_bound})


# Narrow grid centred on the best values from the randomized search (`para`).
param_grid = {

    'max_depth': [para['max_depth']],

    # NOTE(review): fixed to None (all features) regardless of the value the
    # randomized search selected; confirm this is intended.
    'max_features': [None],

    'min_samples_leaf': _candidates_around(para['min_samples_leaf'], (0, 2, 4), 1),

    'min_samples_split': _candidates_around(para['min_samples_split'], (-2, -1, 0, 1, 2), 2),

    'n_estimators': _candidates_around(para['n_estimators'], (-200, -100, 0, 100, 200), 1),
}
print(param_grid)

#### GridSearchCV

In [None]:
# Exhaustive search over the narrowed grid: 5-fold CV, scored by negative MSE.
rf_cv = GridSearchCV(
    rf,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
)
rf_cv.fit(x_train, y_train)

### Fitting Model on best hyperparameters

In [None]:
# Store the grid-search result under its own name. Reusing `para` would
# overwrite the randomized-search result that the grid was built from, so
# re-running the grid-building cell would shift the grid.
final_params = rf_cv.best_params_

# Final model on the tuned hyperparameters; seeded so the exported model and
# its score are reproducible.
rf_final = RandomForestRegressor(n_estimators=final_params["n_estimators"],
                                 min_samples_split=final_params["min_samples_split"],
                                 min_samples_leaf=final_params["min_samples_leaf"],
                                 max_features=None,
                                 max_depth=final_params["max_depth"],
                                 random_state=42)
rf_final.fit(x_train, y_train)

# Predictions on the held-out split, used by the plot below.
y_pred = rf_final.predict(x_test)

# R^2 on the held-out split (shown as the cell output).
rf_final.score(x_test, y_test)

In [None]:
# Predicted vs. actual target values on the test split. Labels and a title let
# the figure be read on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set(xlabel="Actual selling price",
       ylabel="Predicted selling price",
       title="Random forest: predicted vs. actual (test set)")
plt.show()

# Exporting Pickle File

In [None]:
import pickle

# The context manager flushes and closes the file even if pickling fails; the
# original left the handle open, which can leave a truncated file on disk.
with open("random_forest_regression_model.pkl", "wb") as file:
    pickle.dump(rf_final, file)