In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input tree and echo the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
warnings.filterwarnings('ignore')


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
car = pd.read_csv("/kaggle/input/vehicle-dataset-from-cardekho/CAR DETAILS FROM CAR DEKHO.csv")

In [None]:
car.head()

### Adding new Variable for reference 

In [None]:
# Reference year (hard-coded to 2020) that the age column is measured against.
car['current']= 2020

### Adding new variable for Age column

In [None]:
# Age = reference year ('current') minus the 'year' column.
car['age']=car['current']-car['year']

### Drop all non-required or repetitive data

In [None]:
# 'current' was only a helper for computing 'age', 'year' is now represented by 'age',
# and 'name' is not used in the rest of the analysis.
car = car.drop(columns=['current', 'year', 'name'])
car.head()

In [None]:
car.info()

In [None]:
car.shape

In [None]:
car.describe()

# Points to note
- Dataset has 4340 rows and 7 columns.
- Looking at the data, there seems to be some fields that are categorical in nature, but in integer/float type.

- We will analyse and finalize whether to convert them to categorical or treat as integer.

# DATA QUALITY CHECK

## Check for NULL/MISSING values

In [None]:
# Share of null entries in each column, expressed as a percentage, largest first
null_share_by_column = car.isnull().sum() / len(car)
round(100 * null_share_by_column, 2).sort_values(ascending=False)

In [None]:
# percentage of missing values in each row
# A row's missing share is (null cells in the row) / (number of columns),
# so the denominator is car.shape[1], not the number of rows.
round(100*(car.isnull().sum(axis=1)/car.shape[1]),2).sort_values(ascending = False)

## Finding
- There are no missing / Null values either in columns or rows


## Duplicate Check

In [None]:
# Build a de-duplicated copy: rows that repeat an earlier row in every column are removed.
# The trailing copy() keeps car_dub an independent frame, leaving `car` untouched.
car_dub = car.drop_duplicates(subset=None).copy()

In [None]:
car_dub.shape

In [None]:
car.shape

# Insights
- The shape after running the drop duplicate command is not same as the original dataframe.



## Assign the de-duplicated records back to the original dataframe

In [None]:
# From here on `car` refers to the de-duplicated frame (same object as car_dub, not a copy).
car=car_dub
car.head()

In [None]:
car.info()

In [None]:
car.shape

# Data Cleaning

Checking value_counts() for entire dataframe.

This will help to identify any Unknown/Junk values present in the dataset.

In [None]:
# Print the frequency table of every column so odd or placeholder values stand out.
for column_name in car.columns:
    frequencies = car[column_name].value_counts(ascending=False)
    print(frequencies, '\n\n\n')

### Insights
- There seems to be no Junk/Unknown values in the entire dataset.

# Creating Dummy Variables
- We will create DUMMY variables for the 4 categorical variables 'fuel', 'seller_type', 'transmission' & 'owner'.

- Before creating dummy variables, we will have to convert them into 'category' data types.

In [None]:
#To hold original data & column after duplicates are removed
car_o=car.copy()

In [None]:
car.info()

In [None]:
# Cast the four label columns to pandas' 'category' dtype, one column at a time
for label_column in ('fuel', 'seller_type', 'transmission', 'owner'):
    car[label_column] = car[label_column].astype('category')

In [None]:
car.info()

In [None]:
# This code does 3 things:
# 1) Create Dummy variable
# 2) Drop original variable for which the dummy was created
# 3) Drop first dummy variable for each set of dummies created.
#
# dtype is pinned to an integer type on purpose: recent pandas versions emit
# boolean dummies by default, and a frame mixing float and bool columns turns
# into an object array under `.values`, which the statsmodels VIF / OLS cells
# further down cannot consume. 'uint8' matches the historical pandas default.

car = pd.get_dummies(car, drop_first=True, dtype='uint8')
car.info()

# SPLITTING THE DATA
- Splitting the data to Train and Test: - We will now split the data into TRAIN and TEST (70:30 ratio)
- We will use train_test_split method from sklearn package for this

In [None]:
# Check the shape before spliting

car.shape


In [None]:
# Check the info before spliting

car.info()

In [None]:
from sklearn.model_selection import train_test_split

# NumPy's global RNG is seeded and random_state is pinned as well, so the
# 70:30 split puts the same rows in train and test on every run.
np.random.seed(0)
df_train, df_test = train_test_split(
    car,
    train_size=0.70,
    test_size=0.30,
    random_state=100,
)

- Verify the info and shape of the dataframes after split:

In [None]:
df_train.info()

In [None]:
df_train.shape

In [None]:
df_test.info()

In [None]:
df_test.shape

# EXPLORATORY DATA ANALYSIS
- We need to perform the EDA on TRAINING (df_train) Dataset.


## Visualising Numeric Variables
- Let's make a pairplot of all the numeric variables.

In [None]:
df_train.info()


In [None]:
df_train.columns

In [None]:
# Restrict the training frame to its numeric columns and plot them pairwise,
# with kernel density estimates on the diagonal.
numeric_columns = ['selling_price', 'km_driven', 'age']
car_n = df_train[numeric_columns]

sns.pairplot(car_n, diag_kind='kde')
plt.show()

### Insights
- The above Pair-Plot tells us that there is a LINEAR RELATION between 'selling_price','km_driven' and 'age'

## Visualising Categorical Variables

In [None]:
df_train.info()

In [None]:
# Box plots of the target 'selling_price' against each categorical predictor.
# car_o (the copy taken before dummy encoding) is used so the original
# category labels are still available on the x-axis.
plt.figure(figsize=(25, 10))
for panel, feature in enumerate(['fuel', 'seller_type', 'transmission', 'owner'], start=1):
    plt.subplot(2, 2, panel)
    sns.boxplot(x=feature, y='selling_price', data=car_o)

plt.show()

- There were 4 categorical variables in the dataset.

We used Box plot (refer the fig above) to study their effect on the dependent variable (‘selling_price’) .

The inference that We could derive were:

- **fuel :** Diesel & Petrol together make up 99% of the values in the fuel column.
- **transmission :** Manual makes up 91% of the values in the transmission column.
- **seller_type :** Individual makes up 79% of the values in the seller_type column, i.e. the highest share.
- **owner :** First Owner makes up 61% of the values in the owner column, i.e. the highest share.




## Correlation Matrix

In [None]:
# Pairwise correlation coefficients of every column in `car` (the dummy-encoded
# frame), drawn as an annotated heatmap to spot highly correlated variables.
plt.figure(figsize=(25, 20))
correlations = car.corr()
sns.heatmap(correlations, annot=True, cmap="RdBu")
plt.show()

### Insights:
- The heatmap clearly shows which all variable are multicollinear in nature, and which variable have high collinearity with the target variable.
- We will refer this map back-and-forth while building the linear model so as to validate different correlated values along with VIF & p-value, for identifying the correct variable to select/eliminate from the model.

## RESCALING THE FEATURES

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler()

In [None]:
# Checking the values before scaling
df_train.head()

In [None]:
df_train.columns

In [None]:
 #Apply scaler() to all the numeric variables

num_vars = ['selling_price', 'km_driven', 'age']

# Learn each numeric column's min/max from the training rows, then rescale those rows
scaler.fit(df_train[num_vars])
df_train[num_vars] = scaler.transform(df_train[num_vars])

In [None]:
df_train.head()

In [None]:
df_train.describe()

# BUILDING A LINEAR MODEL



## Dividing into X and Y sets for the model building

In [None]:
# Pull the target out of df_train; the column is removed from df_train itself,
# and X_train is the same object as df_train (not a copy).
y_train = df_train['selling_price']
del df_train['selling_price']
X_train = df_train

### RFE
Recursive feature elimination: We will be using the **LinearRegression function from SciKit Learn** for its compatibility with RFE (which is a utility from sklearn)

In [None]:
# Importing RFE and LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

In [None]:
# Running RFE with the output number of the variable equal to 7
lm = LinearRegression()
lm.fit(X_train, y_train)

# n_features_to_select is passed by keyword: current scikit-learn releases
# accept it only as a keyword argument, and the keyword form also works on
# older releases.
rfe = RFE(lm, n_features_to_select=7)             # running RFE
rfe = rfe.fit(X_train, y_train)


In [None]:
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

In [None]:
# Column labels that RFE kept (rfe.support_ is used as a boolean mask over the columns)
col = X_train.columns[rfe.support_]
col

In [None]:
X_train.columns[~rfe.support_]

In [None]:
# Creating the X_train dataframe restricted to the RFE-selected variables
X_train_rfe = X_train[col]

# Building Linear Model using 'STATS MODEL'

## Model 1

### VIF Check

In [None]:
# Variance inflation factor of every RFE-selected predictor
from statsmodels.stats.outliers_influence import variance_inflation_factor

# One row per feature: its name and its VIF computed against the other columns
vif = pd.DataFrame({
    'Features': X_train_rfe.columns,
    'VIF': [variance_inflation_factor(X_train_rfe.values, i) for i in range(X_train_rfe.shape[1])],
})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)   # largest VIF first
vif

In [None]:
import statsmodels.api as sm

# Model 1: intercept column + all RFE-selected predictors
X_train_lm1 = sm.add_constant(X_train_rfe)

# Specify the OLS regression, then fit it
ols_spec = sm.OLS(y_train, X_train_lm1)
lr1 = ols_spec.fit()

In [None]:
# Check the parameters obtained

lr1.params

In [None]:
# Print a summary of the linear regression model obtained
print(lr1.summary())

## Model 2
- Removing the variable 'fuel_Electric' based on its High p-value 

In [None]:
# Model 2 predictors: the RFE selection without 'fuel_Electric'
X_train_new = X_train_rfe.drop(columns=["fuel_Electric"])

### VIF

In [None]:
# Variance inflation factor of every predictor remaining in X_train_new
from statsmodels.stats.outliers_influence import variance_inflation_factor

# One row per feature: its name and its VIF computed against the other columns
vif = pd.DataFrame({
    'Features': X_train_new.columns,
    'VIF': [variance_inflation_factor(X_train_new.values, i) for i in range(X_train_new.shape[1])],
})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)   # largest VIF first
vif

In [None]:
# Model 2: intercept column + the predictors currently in X_train_new
X_train_lm2 = sm.add_constant(X_train_new)

# Specify the OLS regression, then fit the second model
ols_spec = sm.OLS(y_train, X_train_lm2)
lr2 = ols_spec.fit()

In [None]:
# Check the parameters obtained

lr2.params

In [None]:
# Print a summary of the linear regression model obtained
print(lr2.summary())

## Model 3  
- Removing the variable 'owner_Test Drive Car' based on its High p-value

In [None]:
# Model 3 predictors: remove 'owner_Test Drive Car' from the current selection
X_train_new = X_train_new.drop(columns=["owner_Test Drive Car"])

### VIF Check

In [None]:
# Variance inflation factor of every predictor remaining in X_train_new
from statsmodels.stats.outliers_influence import variance_inflation_factor

# One row per feature: its name and its VIF computed against the other columns
vif = pd.DataFrame({
    'Features': X_train_new.columns,
    'VIF': [variance_inflation_factor(X_train_new.values, i) for i in range(X_train_new.shape[1])],
})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)   # largest VIF first
vif

In [None]:
# Model 3: intercept column + the predictors currently in X_train_new
X_train_lm3 = sm.add_constant(X_train_new)

# Specify the OLS regression, then fit the third model
ols_spec = sm.OLS(y_train, X_train_lm3)
lr3 = ols_spec.fit()

In [None]:
# Check the parameters obtained

lr3.params

In [None]:
# Print a summary of the linear regression model obtained
print(lr3.summary())

## Model 4
- Removing the variable 'seller_type_Trustmark Dealer' based on its high p-value

In [None]:
# Model 4 predictors: remove 'seller_type_Trustmark Dealer' from the current selection
X_train_new = X_train_new.drop(columns=["seller_type_Trustmark Dealer"])

### VIF Check

In [None]:
# Variance inflation factor of every predictor remaining in X_train_new
from statsmodels.stats.outliers_influence import variance_inflation_factor

# One row per feature: its name and its VIF computed against the other columns
vif = pd.DataFrame({
    'Features': X_train_new.columns,
    'VIF': [variance_inflation_factor(X_train_new.values, i) for i in range(X_train_new.shape[1])],
})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)   # largest VIF first
vif

In [None]:
# Model 4: intercept column + the predictors currently in X_train_new
X_train_lm4 = sm.add_constant(X_train_new)

# Specify the OLS regression, then fit the fourth model
ols_spec = sm.OLS(y_train, X_train_lm4)
lr4 = ols_spec.fit()

In [None]:
# Check the parameters obtained

lr4.params

In [None]:
# Print a summary of the linear regression model obtained
print(lr4.summary())

### Insights
- This model looks good, as there seems to be VERY LOW Multicollinearity between the predictors and the p-values for all the predictors seems to be significant. For now, we will consider this as our final model (unless the Test data metrics are not significantly close to this number).

# Final Model Interpretation

## Hypothesis Testing:

### Hypothesis testing states that:

- H0:B1=B2=...=Bn=0
- H1: at least one Bi!=0

lr4 model coefficient values
- const                  0.159292
- km_driven             -0.081104
- age                   -0.132559
- fuel_Diesel            0.032289
- transmission_Manual   -0.087353

### Insights
- From the lr4 model summary, it is evident that none of our coefficients is equal to zero, which means we REJECT the NULL HYPOTHESIS.

### F Statistics

#### F-Statistics is used for testing the overall significance of the Model: Higher the F-Statistics, more significant the Model is.

- F-statistic:                     466.7
- Prob (F-statistic):          3.70e-299

The F-statistic of 466.7 (which is much greater than 1) together with a p-value of '~0.0000' indicates that the overall model is significant.

# The equation of best fitted surface based on model lr4:

**selling_price** = 0.159292 - (**km_driven** * **0.081104**) - (**age** * 0.132559) + ( **fuel_Diesel** * 0.032289) - ( **transmission_Manual** * 0.087353)

### Interpretation of Coefficients:

- **km_driven**: A coefficient value of ‘-0.081104’ indicates that a unit increase in the km_driven variable decreases the selling_price by 0.081104 units.

- **age**: A coefficient value of ‘-0.132559’ indicated that, a unit increase in age  variable, decreases the selling_price numbers by 0.132559 units.

- **fuel_Diesel**: A coefficient value of ‘0.032289’ indicated that w.r.t Petrol, a unit increase in fuel_Diesel variable increases the selling_price numbers by 0.032289 units.

- **transmission_Manual**: A coefficient value of ‘-0.087353’ indicated that w.r.t Automatic, a unit increase in transmission_Manual variable decreases the selling_price numbers by 0.087353 units.

#  ASSUMPTIONS

## Error terms are normally distributed with mean zero (not X, Y)

- Residual Analysis Of Training Data

In [None]:
y_train_pred = lr4.predict(X_train_lm4)

In [None]:
# Residuals of the final model on the training data: actual minus predicted
res = y_train-y_train_pred
# Plot the histogram of the error terms
# The figure is created first so that the plot below is drawn on it.
fig = plt.figure()
# NOTE(review): distplot is flagged as deprecated in recent seaborn releases —
# confirm against the installed version before relying on it.
sns.distplot((res), bins = 20)
fig.suptitle('Error Terms', fontsize = 20)                  # Plot heading 
plt.xlabel('Errors', fontsize = 18)  

### Insights
- From the above histogram, we could see that the Residuals are normally distributed. Hence our assumption for Linear Regression is valid.

## There is a linear relationship between X and Y

In [None]:
# Pairwise plot of the numeric columns of the full (dummy-encoded) `car` frame
numeric_columns = ['selling_price', 'km_driven', 'age']
car_n = car[numeric_columns]

sns.pairplot(car_n, diag_kind='kde')
plt.show()

### Insight
- Using the pair plot, we could see there is a linear relation between the km_driven and age variables and the target variable ‘selling_price’.

## There is No Multicollinearity between the predictor variables

In [None]:
# Variance inflation factor of every predictor in the final model
from statsmodels.stats.outliers_influence import variance_inflation_factor

# One row per feature: its name and its VIF computed against the other columns
vif = pd.DataFrame({
    'Features': X_train_new.columns,
    'VIF': [variance_inflation_factor(X_train_new.values, i) for i in range(X_train_new.shape[1])],
})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)   # largest VIF first
vif


### Insight
- From the VIF calculation we could find that there is no multicollinearity existing between the predictor variables, as all the values are within permissible range of below 5

# MAKING PREDICTION USING FINAL MODEL


Now that we have fitted the model and checked the assumptions, it's time to go ahead and make predictions using the final model (lr4)

### Applying the scaling on the test sets

In [None]:
# Apply the scaler that was fitted on the training data to the test set.
# Only transform() is used here: calling fit_transform() would re-learn the
# min/max from the test rows, putting the test features and target on a
# different scale from the one the model was trained on (and leaking test-set
# statistics into preprocessing).

num_vars = ['selling_price', 'km_driven', 'age']

df_test[num_vars] = scaler.transform(df_test[num_vars])

In [None]:
df_test.head()

In [None]:
df_test.describe()

### Dividing into X_test and y_test

In [None]:
# Pull the target out of df_test; the column is removed from df_test itself,
# and X_test is the same object as df_test (not a copy).
y_test = df_test['selling_price']
del df_test['selling_price']
X_test = df_test
X_test.info()


In [None]:
# Keep only the predictors that survived into the final model (columns of X_train_new)
col1 = X_train_new.columns

# Design matrix for prediction: the selected predictors plus the constant column
X_test_lm4 = sm.add_constant(X_test[col1])
X_test = X_test[col1]
X_test_lm4.info()

In [None]:
# Making predictions on the test design matrix using the final model (lr4)

y_pred = lr4.predict(X_test_lm4)

# MODEL EVALUATION

In [None]:
# Scatter of actual (x-axis) against predicted (y-axis) test-set values to inspect the spread
fig = plt.figure()
plt.scatter(y_test, y_pred, alpha=.5)
plt.xlabel('y_test', fontsize=18)
plt.ylabel('y_pred', fontsize=16)
fig.suptitle('y_test vs y_pred', fontsize=20)
plt.show()

## R^2 Value for TEST

In [None]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

### Adjusted R^2 Value for TEST

In [None]:
# Compute R^2 from the current predictions instead of pasting in a number from
# an earlier run, so the adjusted R^2 derived from `r2` always matches the
# model and data actually in memory.
from sklearn.metrics import r2_score

r2 = r2_score(y_test, y_pred)

In [None]:
# Get the shape of X_test
X_test.shape

In [None]:
# n = number of test rows, p = number of predictors (columns of X_test)
n, p = X_test.shape

# Adjusted R-squared: penalises R^2 for the number of predictors used
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
adjusted_r2

### Final Result Comparison
- Train R^2 :0.433
- Train Adjusted R^2 :0.432
- Test R^2 :0.362 
- Test Adjusted R^2 :0.360

This seems to be a really good model that can moderately 'generalize' to various datasets.

# FINAL REPORT

As per our final model, the top predictor variables that influence the selling_price are:
- **km_driven**: A coefficient value of ‘-0.081104’ indicates that a unit increase in the km_driven variable decreases the selling_price by 0.081104 units.

- **age**: A coefficient value of ‘-0.132559’ indicated that, a unit increase in age  variable, decreases the selling_price numbers by 0.132559 units.

- **fuel_Diesel**: A coefficient value of ‘0.032289’ indicated that w.r.t Petrol, a unit increase in fuel_Diesel variable increases the selling_price numbers by 0.032289 units.

- **transmission_Manual**: A coefficient value of ‘-0.087353’ indicated that w.r.t Automatic, a unit increase in transmission_Manual variable decreases the selling_price numbers by 0.087353 units.