In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the CarDekho vehicle listing CSV into a DataFrame.
DATA_PATH = "/kaggle/input/vehicle-dataset-from-cardekho/car data.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows to confirm the CSV was parsed as expected.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column dtypes and non-null counts; used to check for missing values.
df.info()

There are no null values in the dataset.

# **Exploratory Data Analysis**

In [None]:
# Average selling price per owner category.
owner_labels = ('1 st owner', '2nd owner', '3rd owner')
sns.barplot(data=df, x='Owner', y='Selling_Price', palette='spring')
# Relabel the three category positions with human-readable owner names.
plt.xticks([0, 1, 2], owner_labels)

First-owner vehicles sell for a higher price.

# Transmission type vs selling price

In [None]:
# Average selling price per transmission type.
sns.barplot(data=df, x='Transmission', y='Selling_Price')

Automatic cars sell for a higher price than manual cars.

# Fuel type vs selling price

In [None]:
# Average selling price per fuel type.
sns.barplot(data=df, x='Fuel_Type', y='Selling_Price')

Diesel cars sell for a higher price than petrol and CNG cars.

 # Seller type vs selling price

In [None]:
# Average selling price per seller type.
sns.barplot(data=df, x='Seller_Type', y='Selling_Price')

Dealers are able to sell for a higher price than individuals.

# Kms driven vs selling price

In [None]:
# Scatter plot with a fitted regression line: kilometres driven vs selling price.
sns.lmplot(data=df, x='Kms_Driven', y='Selling_Price')

# Vehicle age vs selling price

In [None]:
# Derive the vehicle age (in years) relative to a fixed reference year.
REFERENCE_YEAR = 2020
df = df.assign(age=REFERENCE_YEAR - df['Year'])
df.head()

In [None]:
# Vehicle age vs selling price, with a fitted regression line, on a 10x10 figure.
fig, ax = plt.subplots(figsize=(10, 10))
sns.regplot(data=df, x='age', y='Selling_Price', ax=ax)

# Feature Engineering

In [None]:
# One-hot encode the categorical features; drop_first=True drops the first
# level of each so the dummy columns are not perfectly redundant.
categorical_cols = ['Fuel_Type', 'Seller_Type', 'Transmission']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [None]:
# 'Year' is now redundant because 'age' was derived from it.
df = df.drop(columns=['Year'])
df.head()

In [None]:
# Pairwise scatter plots to eyeball the relationships between the features.
sns.pairplot(data=df)

# Correlation heatmap

In [None]:
# Correlation heatmap of the numeric (and boolean dummy) columns.
# df still contains the string column 'Car_Name' at this point; recent pandas
# versions raise on DataFrame.corr() when non-numeric columns are present, so
# restrict the frame explicitly before correlating.
numeric_df = df.select_dtypes(include=['number', 'bool'])
correlations = numeric_df.corr()
indx = correlations.index
plt.figure(figsize=(26, 22))
# Reuse the matrix computed above instead of recomputing it.
sns.heatmap(correlations, annot=True)

# Train the model

In [None]:
# Standardize the continuous columns with StandardScaler.
# NOTE(review): the scaler is fitted on the whole frame, before the train/test
# split, and 'Selling_Price' (the target) is rescaled too, so the coefficients
# and error metrics further down are in standard-deviation units. Confirm this
# is intended.
from sklearn.preprocessing import StandardScaler

var = ['Selling_Price', 'Present_Price', 'Kms_Driven', 'age']

scaler = StandardScaler()
scaler.fit(df[var])
df[var] = scaler.transform(df[var])

In [None]:
# Drop the car name column; it is not used as a predictor of selling price.
df = df.drop(columns=['Car_Name'])

In [None]:
# Split the data into train and test sets (70% / 30%).
from sklearn.model_selection import train_test_split

y = df['Selling_Price']
x = df.drop(columns=['Selling_Price'])

# Fix the seed so the split, and every metric computed from it, is
# reproducible across Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

In [None]:
# Create a linear regression model and fit it on the training split.
from sklearn.linear_model import LinearRegression

lm = LinearRegression().fit(x_train, y_train)
lm

# Model Evaluation

In [None]:
# Print the intercept of the fitted regression line.
print(lm.intercept_)

In [None]:
# temp holds every column except the target, i.e. the feature columns, whose
# names label the fitted coefficients below.
temp = df.drop(columns=['Selling_Price'])

# One row per feature, one 'Coefficient' column.
coeff_df = pd.DataFrame({'Coefficient': lm.coef_}, index=temp.columns)
coeff_df

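Interpreting the coefficients:
For numerical features (these columns, and Selling_Price itself, were standardized above, so "1 unit" means one standard deviation; the values quoted are from one particular run):

* Holding all other features fixed, a 1 unit increase in Present_Price is associated with an increase of 0.742 in Selling_Price.
* Holding all other features fixed, a 1 unit increase in Kms_Driven is associated with a decrease of 0.0411 in Selling_Price.
* Holding all other features fixed, a 1 unit increase in age (vehicle age) is associated with a decrease of 0.22764 in Selling_Price.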

# Predictions from our Model

In [None]:
# Predict on the held-out test set and plot predictions against true values.
predictions = lm.predict(x_test)

fig = plt.figure()
# Plot title
fig.suptitle('y_test vs predictions')

# Axis labels (the y-label previously had a typo: 'predcitions')
plt.xlabel('y_test')
plt.ylabel('predictions')

plt.scatter(y_test, predictions)

# Residual Analysis

Residual = Observed value - Predicted value

A residual plot is a graph that shows the residuals on the vertical axis and the independent variable on the horizontal axis. If the points in a residual plot are randomly dispersed around the horizontal axis, a linear regression model is appropriate for the data; otherwise, a nonlinear model is more appropriate.

In [None]:
# Distribution of the test-set residuals (observed - predicted).
fig = plt.figure(figsize=(8, 8))

residuals = y_test - predictions
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is the equivalent replacement.
sns.histplot(residuals, bins=20, kde=True, stat='density')

# Plot title
fig.suptitle('Residual Analysis', fontsize=20)

In [None]:
# Residuals against predicted values, with a LOWESS smoother to reveal structure.
# x and y are passed by keyword: current seaborn releases no longer accept
# them positionally.
sns.residplot(x=predictions, y=(y_test - predictions), lowess=True, color='g')

plt.xlabel('predicted value')
plt.ylabel('residual')

# Regression Evaluation Metrics

Here are three common evaluation metrics for regression problems:

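Mean Absolute Error (MAE) is the mean of the absolute value of the errors:

$$\mathrm{MAE} = \frac{1}{n}\sum_{i=1}^{n}\left|y_i-\hat{y}_i\right|$$

Mean Squared Error (MSE) is the mean of the squared errors:

$$\mathrm{MSE} = \frac{1}{n}\sum_{i=1}^{n}\left(y_i-\hat{y}_i\right)^2$$

Root Mean Squared Error (RMSE) is the square root of the mean of the squared errors:

$$\mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(y_i-\hat{y}_i\right)^2}$$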

MAE is the easiest to understand, because it's the average error.
MSE is more popular than MAE, because MSE "punishes" larger errors, which tends to be useful in the real world.
RMSE is even more popular than MSE, because RMSE is interpretable in the "y" units.
All of these are loss functions, because we want to minimize them.

In [None]:
from sklearn import metrics

# Error metrics of the first model on the test split.
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)

print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))


In [None]:
# Coefficient of determination of the first model on the test split.
R2 = metrics.r2_score(y_true=y_test, y_pred=predictions)
R2

The low R2 value may be partly due to multicollinearity between Fuel_Type_Diesel and Fuel_Type_Petrol. We can diagnose this with the variance inflation factor (VIF) and address it by dropping the offending feature.

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Compute the variance inflation factor of every feature column in x.
# Cast to float first: if the dummy columns are boolean, x.values becomes an
# object-dtype array, which the regression inside variance_inflation_factor
# cannot handle.
x_values = x.astype(float).values

vif = pd.DataFrame()
vif["VIF Factor"] = [
    variance_inflation_factor(x_values, i) for i in range(x_values.shape[1])
]
vif["features"] = x.columns

In [None]:
# Display the VIF table (one row per feature).
vif

We can drop Fuel_Type_Petrol because it has a high VIF.

In [None]:
# Target, plus a reduced feature matrix without the target and without the
# high-VIF column Fuel_Type_Petrol.
y = df['Selling_Price']
x1 = df.drop(columns=['Selling_Price', 'Fuel_Type_Petrol'])

In [None]:
# Split the reduced feature matrix x1 (without Fuel_Type_Petrol) into train
# and test sets. This must split x1, not x: splitting x would train the second
# model on the full feature set and make the comparison meaningless.
# The seed is fixed so the result is reproducible.
x1_train, x1_test, y_train, y_test = train_test_split(
    x1, y, test_size=0.3, random_state=42
)

In [None]:
# Fit a second linear regression on the x1 training split.
lm2 = LinearRegression().fit(x1_train, y_train)
lm2

In [None]:
# Predictions of the second model on its held-out test split.
pred2=lm2.predict(x1_test)

In [None]:
# Error metrics of the second model on the test split.
mae2 = metrics.mean_absolute_error(y_test, pred2)
mse2 = metrics.mean_squared_error(y_test, pred2)

print('MAE:', mae2)
print('MSE:', mse2)
print('RMSE:', np.sqrt(mse2))

In [None]:
# Coefficient of determination of the second model on the test split.
R2 = metrics.r2_score(y_true=y_test, y_pred=pred2)
R2

Better R2 value compared to original model

In [None]:
# Residual plot for the second model: residuals against predicted values,
# with a LOWESS smoother. x and y are passed by keyword because current
# seaborn releases no longer accept them positionally.
sns.residplot(x=pred2, y=(y_test - pred2), lowess=True, color='g')

plt.xlabel('predicted value')
plt.ylabel('residual')