<b>This dataset contains information about used cars listed on www.cardekho.com.
This data can be used for many purposes, such as price prediction to exemplify the use of linear regression in Machine Learning.</b> 

The columns in the given dataset are as follows:
<ol>
    <li> Car_Name </li>
    <li> Year </li>
    <li> Selling_Price </li>
    <li> Present_Price </li>
    <li> Kms_Driven </li>
    <li> Fuel_Type </li>
    <li> Seller_Type </li>
    <li> Transmission </li>
    <li> Owner </li>
</ol>

## Importing Dependencies 

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file found under the Kaggle input directory, one full path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Importing Data

In [None]:
# Load the CarDekho used-car listings into a DataFrame.
csv_path = '/kaggle/input/vehicle-dataset-from-cardekho/car data.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the loaded listings to sanity-check the read.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Column labels present in the loaded frame.
df.columns

In [None]:
# Count of missing values in each column.
df.isna().sum()

In [None]:
# Correlation heatmap of the numeric columns only. The frame still contains
# string columns (Car_Name, Fuel_Type, ...) at this point, and recent pandas
# releases raise on them in DataFrame.corr() instead of silently skipping them.
numeric_corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(12,12))
sns.heatmap(numeric_corr, cmap='Blues', annot=True)

## Exploring Categorical Features 

In [None]:
# Columns stored with object dtype are treated as the categorical features.
categorical_features = [name for name, dtype in df.dtypes.items() if dtype == 'O']

In [None]:
# Display the column names detected as categorical (object dtype).
categorical_features

In [None]:
# Report how many distinct values each categorical column holds.
for col, n_unique in df[categorical_features].nunique().items():
    print(col, n_unique)

In [None]:
# Distinct-value count per categorical column, in the same order as categorical_features.
unique_values = [df[col].nunique() for col in categorical_features]

In [None]:
# Display the distinct-value counts collected for the categorical columns.
unique_values

In [None]:
# Horizontal bar chart: one bar per categorical column, length = number of
# distinct values. x/y are passed by keyword because recent seaborn releases
# no longer accept them positionally.
sns.set_style("white")
sns.barplot(x=unique_values, y=categorical_features, orient='h')
plt.title('Unique values of each Categorical values')

<font size=3.7 color='#1b6ca8'>Here, the Car_Name feature has **98** unique values, so one-hot encoding it is not a good idea. Car_Name is also of little use for prediction, so we drop that column.</font>

In [None]:
# Drop the high-cardinality Car_Name column. Rebinding `df` with
# errors='ignore' keeps the cell idempotent: re-running it after the column
# is already gone no longer raises a KeyError.
df = df.drop(columns=['Car_Name'], errors='ignore')

In [None]:
# Re-derive the categorical (object-dtype) columns from the current frame and display them.
categorical_features = [name for name, dtype in df.dtypes.items() if dtype == 'O']
categorical_features

<font size=4>Splitting the data into independent and dependent features.
</font>

Here,

**Dependent Feature** - <font color='#ff9234'>'Selling_Price'</font>

**Independent Features** - <font color='#ff9234'>'Year', 'Present_Price', 'Kms_Driven', 'Fuel_Type','Seller_Type', 'Transmission', 'Owner'</font> 

In [None]:
# Target: the selling price. Features: every remaining column.
y = df['Selling_Price']
X = df.drop(columns=['Selling_Price'])

In [None]:
# Shapes of the feature matrix and the target, one per line.
print(X.shape, y.shape, sep='\n')

In [None]:
# Preview the feature matrix (Selling_Price removed).
X.head()

<font size=3.7 color='#1b6ca8'>Exploring the unique values of each categorical feature</font>

In [None]:
# Show the distinct levels of each categorical feature.
for col, column in X[categorical_features].items():
    print(col, column.unique())

In [None]:
# Preview the features once more before the categorical columns are encoded.
X.head()

## OneHotEncoding 

In [None]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each feature so the indicator columns are not redundant.
X = pd.get_dummies(X, drop_first=True)

In [None]:
# Preview the features after one-hot encoding.
X.head()

<font size=3.7 color='#1b6ca8'>Converting the Year column into the car's age (Number_of_years) by subtracting Year from the reference year (2020)</font> 

In [None]:
# Replace the model year with the car's age, measured against a fixed
# reference year so the feature does not drift between runs.
REFERENCE_YEAR = 2020
X = X.assign(Number_of_years=REFERENCE_YEAR - X['Year']).drop(columns=['Year'])

In [None]:
# Preview the features with Number_of_years in place of Year.
X.head()

## Creating Models 

###  1. RandomForestRegressor

In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
split = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split

In [None]:
# Base estimator for the randomized search. Seeded so that a fresh
# "Restart & Run All" reproduces the same forest and the same metrics.
model = RandomForestRegressor(random_state=42)

<font size=3.7 color='#1b6ca8'>Performing RandomizedSearchCV</font>

In [None]:
# Candidate values for the randomized hyperparameter search.

# Number of trees: 100, 200, ..., 1200.
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]

# Features considered at each split. 1.0 (all features) replaces the former
# 'auto' option, which meant the same thing for regressors but is no longer
# accepted by current scikit-learn releases.
max_features = [1.0, 'sqrt']

# Maximum tree depth: 5, 10, ..., 30.
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]

# Minimum number of samples required to split an internal node.
min_samples_split = [2, 5, 10, 15, 100]

# Minimum number of samples required at a leaf node.
min_samples_leaf = [1, 2, 5, 10]

In [None]:
# Search space handed to RandomizedSearchCV: parameter name -> candidate values.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

In [None]:
# Randomized search over random_grid: 10 sampled settings, 5-fold
# cross-validation, scored by negative mean squared error, single process.
search_settings = dict(
    scoring='neg_mean_squared_error',
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=1,
)
rf = RandomizedSearchCV(estimator=model, param_distributions=random_grid, **search_settings)


## Fitting the Training data 

In [None]:
# Run the randomized hyperparameter search on the training split.
rf.fit(X_train,y_train)

In [None]:
# Predict test-set selling prices with the fitted search object.
predictions = rf.predict(X_test)

In [None]:
from sklearn import metrics

# Score `predictions` against the test targets once, keep the rounded metrics
# under Random_Forest_Regressor for the final comparison, then print them.
mse = metrics.mean_squared_error(y_test, predictions)
Random_Forest_Regressor = {
    'MAE': round(metrics.mean_absolute_error(y_test, predictions), 2),
    'MSE': round(mse, 2),
    'RMSE': round(np.sqrt(mse), 2),
    'R2_score': round(metrics.r2_score(y_test, predictions), 2),
}
print('MAE:', Random_Forest_Regressor['MAE'])
print('MSE:', Random_Forest_Regressor['MSE'])
print('RMSE:', Random_Forest_Regressor['RMSE'])
print('R2_score', Random_Forest_Regressor['R2_score'])

In [None]:
# Scatter of actual test prices (x) against the current `predictions` (y).
# x/y are passed by keyword because recent seaborn releases no longer accept
# them positionally.
plt.figure(figsize=(8,6))
sns.scatterplot(x=y_test, y=predictions)
plt.xlabel('y_test')
plt.ylabel('Predictions')
plt.title('y_test vs Predictions (RandomForestRegressor)')

### 2. DecisionTreeRegressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Decision tree with default hyperparameters. Seeded so repeated runs build
# the same tree and report the same metrics.
tree = DecisionTreeRegressor(random_state=42)

In [None]:
# Fit the decision tree on the training split.
tree.fit(X_train, y_train)

In [None]:
# Predict test-set selling prices with the decision tree; this overwrites
# the `predictions` produced by the previous model.
predictions = tree.predict(X_test)

In [None]:
# Score `predictions` against the test targets once, keep the rounded metrics
# under Decision_Tree_Regressor for the final comparison, then print them.
mse = metrics.mean_squared_error(y_test, predictions)
Decision_Tree_Regressor = {
    'MAE': round(metrics.mean_absolute_error(y_test, predictions), 2),
    'MSE': round(mse, 2),
    'RMSE': round(np.sqrt(mse), 2),
    'R2_score': round(metrics.r2_score(y_test, predictions), 2),
}
print('MAE:', Decision_Tree_Regressor['MAE'])
print('MSE:', Decision_Tree_Regressor['MSE'])
print('RMSE:', Decision_Tree_Regressor['RMSE'])
print('R2_score', Decision_Tree_Regressor['R2_score'])

In [None]:
# Scatter of actual test prices (x) against the current `predictions` (y).
# x/y are passed by keyword because recent seaborn releases no longer accept
# them positionally.
plt.figure(figsize=(8,6))
sns.scatterplot(x=y_test, y=predictions)
plt.xlabel('y_test')
plt.ylabel('Predictions')
plt.title('y_test vs Predictions (DecisionTreeRegressor)')

### 3. LinearRegression 

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression model with default settings.
LR = LinearRegression()

In [None]:
# Fit the linear model on the training split.
LR.fit(X_train, y_train)

In [None]:
# Predict test-set selling prices with the fitted linear model. This cell
# previously called tree.predict, so the LinearRegression metrics and plot
# silently repeated the decision tree's results.
predictions = LR.predict(X_test)

In [None]:
# Score `predictions` against the test targets once, keep the rounded metrics
# under Linear_Regression for the final comparison, then print them.
mse = metrics.mean_squared_error(y_test, predictions)
Linear_Regression = {
    'MAE': round(metrics.mean_absolute_error(y_test, predictions), 2),
    'MSE': round(mse, 2),
    'RMSE': round(np.sqrt(mse), 2),
    'R2_score': round(metrics.r2_score(y_test, predictions), 2),
}
print('MAE:', Linear_Regression['MAE'])
print('MSE:', Linear_Regression['MSE'])
print('RMSE:', Linear_Regression['RMSE'])
print('R2_score', Linear_Regression['R2_score'])

In [None]:
# Scatter of actual test prices (x) against the current `predictions` (y).
# x/y are passed by keyword because recent seaborn releases no longer accept
# them positionally.
plt.figure(figsize=(8,6))
sns.scatterplot(x=y_test, y=predictions)

plt.xlabel('y_test')
plt.ylabel('Predictions')
plt.title('y_test vs Predictions (LinearRegression)')

In [None]:
# Gather the per-model metric dicts into one labelled comparison table.
# The markdown text is assembled directly, so the cell needs no package beyond
# pandas (already imported), every row carries its model name, and the result
# is actually shown.
data = [Random_Forest_Regressor, Decision_Tree_Regressor, Linear_Regression]
model_names = ['RandomForestRegressor', 'DecisionTreeRegressor', 'LinearRegression']

# Metric names come from the first dict; all three dicts share the same keys.
metric_names = list(data[0])

table_lines = [
    '|Model| ' + ' | '.join(metric_names) + ' |',
    '|-----' * (len(metric_names) + 1) + '|',
]
for model_name, scores in zip(model_names, data):
    cells = ' | '.join(str(scores[metric]) for metric in metric_names)
    table_lines.append('|' + model_name + '| ' + cells + ' |')

# Markdown source of the table, ready to paste into a markdown cell.
markdown = '\n'.join(table_lines)
print(markdown)

# Rich display of the same numbers.
pd.DataFrame(data, index=pd.Index(model_names, name='Model'))

## Final Result 

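|Model| MAE | MSE | RMSE | R2_score |
|-----|-----|-----|-----|-----|
|RandomForestRegressor| 0.83 | 2.92 | 1.71 | 0.89 |
|DecisionTreeRegressor| 0.74 | 1.24 | 1.11 | 0.95 |
|LinearRegression| 0.74 | 1.24 | 1.11 | 0.95 |

**Note:** the LinearRegression row is identical to the DecisionTreeRegressor row because, when these numbers were recorded, the LinearRegression section computed its predictions with the decision tree (`tree.predict`). Re-run the notebook with the linear model's own predictions to obtain its actual metrics.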

