In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np

# Loading Data

In [None]:
# Read the CarDekho vehicle dataset and preview the first rows.
DATA_PATH = '../input/vehicle-dataset-from-cardekho/car data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

# Checking for Null Values

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Draw the null mask as a heatmap, then count the nulls in each column.
null_mask = df.isna()
sns.heatmap(null_mask, yticklabels=False, cbar=False)
null_mask.sum()

**No Null Values in our data!**

# Feature Analysis

### **1. Car Name**                                       
    Unique car names are too numerous to help our model, and using them directly is not feasible.
    However, a car company's reputation plays a big role in deciding the resale value of a car,
    so we extract the company's name from 'Car_Name'.  

In [None]:
# Distinct full car names, then the distinct first word of each name
# (in order of first appearance).
Car_detail = df['Car_Name'].unique()
first_words = [name.split()[0] for name in Car_detail]
Cars = pd.Series(first_words).unique()
Cars

In [None]:
# Group the first-word tokens in `Cars` by manufacturer.
# NOTE(review): the slices are positional, so they presumably rely on the row
# order of the CSV putting each manufacturer's models at these positions in
# `Cars` - confirm against the output of the previous cell if the data changes.
MS_cars, Toyota_cars, Hyundai_cars = Cars[:14], Cars[14:19], Cars[32:]
Honda = ['Activa']

In [None]:
# Derive the manufacturer from the first word of the car name, then collapse
# model tokens into their manufacturer name.
#
# The result is assigned back to the column rather than calling
# `df.Car_Company.replace(..., inplace=True)`: an in-place method on a column
# selected from the frame is chained assignment, which recent pandas versions
# warn about and which leaves `df` unchanged under Copy-on-Write.
company_replacements = [
    (list(MS_cars), 'Maruti Suzuki'),
    (list(Toyota_cars), 'Toyota'),
    (list(Honda), 'Honda'),
    (list(Hyundai_cars), 'Hyundai'),
    (['land', 'Royal'], ['Land Rover', 'Royal Enfield']),
]

car_company = df['Car_Name'].str.split().str[0]
# Apply the replacements one after another, in the same order as before.
for tokens, company in company_replacements:
    car_company = car_company.replace(tokens, company)

df['Car_Company'] = car_company
df['Car_Company'].unique()

**We can Judge importance of our Features using Percent resale for each car
Where Percent resale is percent of original price at which it is being sold.**

In [None]:
# Selling price expressed as a percentage of `Present_Price`.
df['Percent_resale'] = df['Selling_Price'].div(df['Present_Price']).mul(100)

In [None]:
sns.set_theme('paper')

# Mean resale percentage per company.
mean_resale = df.groupby('Car_Company')['Percent_resale'].mean()
# The second column is the complement to 100, so each stacked bar totals 100.
Company_analysis = pd.concat([mean_resale, 100 - mean_resale], axis=1)

Company_analysis.plot.bar(stacked=True, legend=False, color=['Blue', 'Slategrey'])
plt.xlabel('Company')
plt.ylabel('Percent Resale')
Company_analysis.iloc[:, 0]

**Clearly we can see there is a difference in average percent resale for each company, 
For eg: we can see evident difference between percent resale of cars of Land Rover and Maruti Suzuki!**

### 2. Years
           It is logical that the older a car is, the cheaper it will sell.
           So we will use a 'Years_Old' feature rather than the 'Year' feature, which tells in which year the car was
           bought. (We will use 2021 as our present year.)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Age of each car relative to the reference year 2021, then the mean resale
# percentage for each age as a line plot.
df['Years_Old'] = 2021 - df['Year']
df.groupby('Years_Old')['Percent_resale'].mean().plot(kind='line')
plt.ylabel('Percent resale');

**We can see that the data corresponds to our assumption**

### 3.Kms Driven
        Kms_Driven is the value of how much the car has been driven since when it was purchased

In [None]:
# Resale percentage against distance driven.
# `x` and `y` are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally, so the previous positional call raises a TypeError there.
sns.scatterplot(data=df, x='Kms_Driven', y='Percent_resale');

**It is clearly evident that the selling price drops as the total Km driven by the car increases**

### 4.Fuel type
     Diesel cars should sell better over petrol cars as they have better mileage but the contrary fact is 
     that they are hard to maintain hence that should decrease their value. Let's see what our data tells us about that.

In [None]:
# Mean resale percentage per fuel type: horizontal bar chart, then the values.
fuel_resale = df.groupby('Fuel_Type')['Percent_resale'].mean()
fuel_resale.plot(kind='barh')
plt.xlabel('Percent resale')
plt.xlim([0, 100])
fuel_resale

**Diesel performs slightly better than petrol but the difference is not very significant, while CNG lags behind significantly**

### 5.Seller Type
     It gives us information whether it's being sold by a dealer or an individual

In [None]:
# Mean resale percentage per seller type, on a fixed 0-100 axis.
seller_resale = df.groupby('Seller_Type')['Percent_resale'].mean()
seller_resale.plot(kind='barh')
plt.xlim([0, 100]);

**We can't see any significant difference between both the categories**

### 6.Transmission
     Whether the car had manual transmission or automatic transmission

In [None]:
# Mean resale percentage per transmission type, on a fixed 0-100 axis.
transmission_resale = df.groupby('Transmission')['Percent_resale'].mean()
transmission_resale.plot(kind='barh')
plt.xlim([0, 100]);

**Here as well no significant difference between both categories**

### 7. Owner
     Whether the car is being sold by its first owner or by a second/third owner. Logically, if the car is not being sold by its first owner,
     its value should decrease.

In [None]:
# Mean resale percentage per value of `Owner`, on a fixed 0-100 axis.
owner_resale = df.groupby('Owner')['Percent_resale'].mean()
owner_resale.plot(kind='bar')
plt.ylim([0, 100]);

**As we can see car value decreases as no. of owner increases**

# Preparing our Model

Dropping required columns from our data

In [None]:
# Remove the columns that were replaced by derived features ('Car_Name' ->
# 'Car_Company', 'Year' -> 'Years_Old') and the analysis-only 'Percent_resale'.
df.drop(columns=['Car_Name', 'Year', 'Percent_resale'], inplace=True)

**Getting dummy values for Categorical Data**

In [None]:
# One-hot encode every non-numeric column, dropping the first level of each.
#
# Two changes from the previous version:
# * `cat_cols` is built in the frame's own column order. It used to come from a
#   set difference, whose order depends on string hashing and can change
#   between kernel sessions, so the dummy columns could be appended in a
#   different order on each run.
# * the public `select_dtypes` replaces the private `_get_numeric_data`
#   (bool is included because `_get_numeric_data` also treats it as numeric).
num_cols = df.select_dtypes(include=['number', 'bool']).columns
cat_cols = [col for col in df.columns if col not in num_cols]

for col in cat_cols:
    # Dummy columns are named after the category values (no prefix), because
    # later cells refer to them by those names.
    dummies = pd.get_dummies(df[col], drop_first=True)
    df = pd.concat([df.drop(columns=col), dummies], axis=1)

print(df.shape)
df.head()

In [None]:
# Open a wide figure, then draw the annotated correlation matrix of all columns.
# Note: despite its name, `ax` holds the Figure returned by plt.figure.
ax = plt.figure(figsize=(20, 10))
correlations = df.corr()
sns.heatmap(correlations, annot=True, cbar=False)

The Diesel and Petrol features are highly negatively correlated. This makes sense, as we do not have many CNG cars in our data, so if a car is not a diesel car, there is a high probability that it is a petrol car. This may cause multicollinearity and decrease a model's performance, but as we are using a Random Forest Regressor, multicollinearity won't be an issue for our model.

**Splitting data into train data and validation data**

In [None]:
from sklearn.model_selection import train_test_split
# Target is the selling price; every remaining column is a feature.
y = df['Selling_Price']
X = df.drop(columns=['Selling_Price'])

# Hold out 20% of the rows for validation (fixed seed for a repeatable split).
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=22
)

## Checking Importance of Each Feature

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
# Fit an extra-trees model to rank the features and report its test-set MSE.
# The model is seeded (same seed as the train/test split) so that the printed
# MSE and the importances below are the same on every run; unseeded, they
# change from run to run.
test_model = ExtraTreesRegressor(random_state=22)
test_model.fit(x_train, y_train)

# Map each feature name to its importance.
feat_imp = dict(zip(x_train.columns, test_model.feature_importances_))
print(mean_squared_error(y_test, test_model.predict(x_test)))
feat_imp

'UM' and 'Suzuki' have very low feature importance. This might be because we don't have sufficient data for these specific categories, so it is better for our model to drop these features.

In [None]:
# Rebuild the feature matrix without the two low-importance dummy columns,
# redo the same split, and refit the model to compare MSE and importances.
low_importance = ['UM', 'Suzuki']
y = df['Selling_Price']
X = df.drop(columns=['Selling_Price'] + low_importance)

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=22
)

test_model.fit(x_train, y_train)
feat_imp = dict(zip(x_train.columns, test_model.feature_importances_))
print(mean_squared_error(y_test, test_model.predict(x_test)))
feat_imp

### Hyper Parameter Tuning
We are using a Random Forest regressor for our model, and it is important to give it the right hyperparameters, hence we will use randomized search with cross-validation (`RandomizedSearchCV`) to find them.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
# Randomized search over forest size, tree depth and split threshold, scored
# by (negated) RMSE with 5-fold cross-validation on the full feature matrix.
#
# Both the forest and the search are seeded: without `random_state` the five
# sampled parameter combinations and the fitted forests differ on every run,
# so the reported best score and parameters are not reproducible.
reg = RandomForestRegressor(bootstrap=True, oob_score=True, random_state=22)
prm_grid = dict(
    n_estimators=[100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    max_depth=[None, 4, 7, 10, 12, 15, 20],
    min_samples_split=[2, 5, 8, 10, 12, 15, 20],
)
grid = RandomizedSearchCV(
    reg,
    prm_grid,
    n_iter=5,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=5,
    random_state=22,
)
grid.fit(X, y)

In [None]:
# Show the best cross-validated score and estimator, and keep the winning
# parameter combination for the final model.
param = grid.best_params_
print(grid.best_score_)
print(grid.best_estimator_)
param

In [None]:
# Train the final forest on all data with the tuned parameters and report its
# out-of-bag score. The forest is seeded so that the score (and later
# predictions) are the same on every run; `param` only carries the tuned
# hyperparameters, so passing `random_state` here does not clash with it.
reg = RandomForestRegressor(**param, bootstrap=True, oob_score=True, random_state=22)
reg.fit(X, y)
reg.oob_score_

In [None]:
# Predict the selling price for one hand-written car.
# The values are listed in the same order as the columns of `X`. Wrapping them
# in a DataFrame with those column names makes the mapping explicit and avoids
# scikit-learn's "X does not have valid feature names" warning, which a bare
# array triggers when the model was fitted on a DataFrame. A length mismatch
# with `X.columns` fails here with a clear error.
sample_values = [6.5, 33000, 0, 8, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
sample = pd.DataFrame([sample_values], columns=X.columns)
reg.predict(sample)

## Please Upvote if you find my notebook worth it.