In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the /kaggle/input tree and print the full path of every file found in it
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # join directory and file name so the printed path is complete
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# importing warnings
import warnings
# NOTE(review): 'ignore' with no category silences every warning for the whole
# session, including deprecation warnings raised by libraries in later cells
warnings.filterwarnings('ignore')

In [None]:
# Importing required libraries
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import statsmodels.api as sm


In [None]:
# Read the car listings CSV from the Kaggle input directory into a DataFrame
cars = pd.read_csv("/kaggle/input/vehicle-dataset-from-cardekho/Car details v3.csv")

In [None]:
# Show the first five rows
cars.head()

In [None]:
# Check the dimensions of the data
print(cars.shape) # (number of rows, number of columns)
print(cars.size) # rows * columns, i.e. the total number of cells

In [None]:
# Data type of each column
cars.dtypes

In [None]:
# Summary statistics (by default only the numeric columns are described)
cars.describe()

In [None]:
# Number of null values in each column
cars.isnull().sum()

As we can see, there are null values in the dataset; we need to deal with them in order to proceed further.

Lets check the highest null value column in the data which is torque column

In [None]:
# Show every row whose torque value is missing
cars.loc[cars.torque.isna()]

As we can see for all the torque null values, mileage, engine, max_power and seats also have null values in the data. We can drop these rows from the dataset 

In [None]:
# Keep only the rows that have a torque value.
# The explicit .copy() makes `cars` an independent frame, so the column
# assignments and in-place drops in the following cells do not operate on a
# slice of the original DataFrame (the source of SettingWithCopyWarning).
cars = cars[cars.torque.notna()].copy()
cars.shape

In [None]:
# Count the remaining missing values per column
cars.isna().sum()

Finally we have 7906 rows in the dataset after removing these null value rows

Lets convert the mileage, engine, maxpower, torque columns data types

In [None]:
# Converting mileage column dtype to numerical
cars.mileage = cars.mileage.apply(lambda x: float(x.split(' ')[0]))
cars.mileage.dtype

In [None]:
# converting engine column dtype to int
cars.engine = cars.engine.apply(lambda x: int(x.split(' ')[0]))
cars.engine.dtype

In [None]:
# converting max_power datatype
cars.max_power = cars.max_power.apply(lambda x: float(x.split(' ')[0]))
cars.max_power.dtype

Dropping torque column since the data have different type of units and different representation

In [None]:
# dropping torque column from the dataset
cars.drop('torque', axis = 1, inplace = True)

In [None]:
# displaying the dataset head again
cars.head()

In [None]:
cars.name.value_counts().head(10)

As we can see, **Maruti Swift Dzire VDI** is the most-sold car.

In [None]:
# count plot for top 10 most sold cars
plt.figure(figsize = [12,8])
cars.name.value_counts().head(10).plot(kind = 'bar')
plt.show()

In [None]:
plt.figure(figsize = [12,8])
cars.groupby('year').count()['name'].plot(kind = 'bar')
plt.show()

Cars bought in the years 2015-2018 are the ones being sold the most.

In [None]:
# distribution of selling price
# sns.distplot is deprecated since seaborn 0.11; histplot with a KDE overlay on
# a density-normalised histogram draws the equivalent figure.
plt.figure(figsize=[12,8])
sns.histplot(x=cars.selling_price, kde=True, stat='density')
plt.show()

As we can see that distribution of selling price is slightly right skewed

In [None]:
# distribution of km_driven
# sns.distplot is deprecated since seaborn 0.11; histplot with a KDE overlay on
# a density-normalised histogram draws the equivalent figure.
plt.figure(figsize=[12,8])
sns.histplot(x=cars.km_driven, kde=True, stat='density')
plt.show()

As we can see that the most of the km_driven values are below approximately 4 lakhs 

In [None]:
# Count plots for fuel, seller_type, transmission and owner on a 2x2 grid.
# Each column is passed as x= because seaborn 0.12+ treats the only positional
# argument as `data`, not as the x variable.
plt.figure(figsize=[20,10])
plt.subplot(2,2,1)
sns.countplot(x=cars.fuel)
plt.subplot(2,2,2)
sns.countplot(x=cars.seller_type)
plt.subplot(2,2,3)
sns.countplot(x=cars.transmission)
plt.subplot(2,2,4)
sns.countplot(x=cars.owner)
plt.show()

1. Most of the cars getting sold are from diesel and manual transmission
2. Most cars are getting sold by individual seller type
3. First owners cars are getting sold mostly

In [None]:
# Bar plots of each categorical variable against the target (selling_price),
# one per panel of a 2x2 grid.
# x= and y= are given by keyword: seaborn 0.12+ raises TypeError when the two
# vectors are passed positionally.
plt.figure(figsize=[20,10])
plt.subplot(2,2,1)
sns.barplot(x=cars.owner, y=cars.selling_price)
plt.subplot(2,2,2)
sns.barplot(x=cars.fuel, y=cars.selling_price)
plt.subplot(2,2,3)
sns.barplot(x=cars.seller_type, y=cars.selling_price)
plt.subplot(2,2,4)
sns.barplot(x=cars.transmission, y=cars.selling_price)
plt.show()

1. Test-drive cars have the highest selling price
2. Diesel cars have a high selling price
3. The selling price for the dealer seller type is higher than for the individual seller type
4. Automatic cars have a high selling price

In [None]:
# km_driven vs selling_price
# x= and y= are keywords because seaborn 0.12+ rejects positional vectors
plt.figure(figsize=[12,8])
sns.scatterplot(x=cars.km_driven, y=cars.selling_price)
plt.show()

1. As we can see km_driven has outliers above 10 lakhs kms. Removing these records from the dataset
2. As we can see that if the km_driven increases the selling price is getting decreased

In [None]:
cars = cars[~(cars.km_driven > 1000000)]
cars

In [None]:
cars.columns

In [None]:
# engine vs selling_price
# x= and y= are keywords because seaborn 0.12+ rejects positional vectors
plt.figure(figsize=[12,8])
sns.scatterplot(x=cars.engine, y=cars.selling_price)
plt.show()

1. As we can see engine has outliers above 3400 cc. Removing these records from the dataset

In [None]:
# Keep the rows whose engine value is not greater than 3400
cars = cars.loc[~cars.engine.gt(3400)]
cars

In [None]:
# mileage vs selling_price
plt.figure(figsize=[12,8])
sns.scatterplot(cars.mileage, cars.selling_price)
plt.show()

In [None]:
# max_power vs selling_price
plt.figure(figsize=[12,8])
sns.scatterplot(cars.max_power, cars.selling_price)
plt.show()

In [None]:
# seats vs selling_price
plt.figure(figsize=[12,8])
sns.scatterplot(cars.seats, cars.selling_price)
plt.show()

In [None]:
# boxplot for selling_price
# Passed as x= so the column is the plotted variable (horizontal box) on every
# seaborn version; positionally, seaborn 0.12+ reads it as `data` instead.
plt.figure(figsize=[15,8])
sns.boxplot(x=cars.selling_price)
plt.show()

As we can see mostly outliers are above 50 lakhs so removing these records from the dataset

In [None]:
# Keep the rows whose selling_price is not greater than 5,000,000
cars = cars.loc[~cars.selling_price.gt(5000000)]
cars

In [None]:
# boxplot for mileage
# Passed as x= so the column is the plotted variable on every seaborn version;
# positionally, seaborn 0.12+ reads it as `data` instead.
plt.figure(figsize=[12,8])
sns.boxplot(x=cars.mileage)
plt.show()

Removing the outliers from mileage column, >32.5 and <5

In [None]:
# Drop the rows whose mileage is above 32.5 or below 5
cars = cars.loc[~(cars.mileage.gt(32.5) | cars.mileage.lt(5))]
cars

In [None]:
# boxplot for engine
# Passed as x= so the column is the plotted variable on every seaborn version;
# positionally, seaborn 0.12+ reads it as `data` instead.
plt.figure(figsize=[12,8])
sns.boxplot(x=cars.engine)
plt.show()

for engine column, we have outliers above 2500 cc and below 650 removing these records from the dataset

In [None]:
# Drop the rows whose engine value is above 2500 or below 650
cars = cars.loc[~(cars.engine.gt(2500) | cars.engine.lt(650))]
cars

In [None]:
# boxplot for max_power
# Passed as x= so the column is the plotted variable on every seaborn version;
# positionally, seaborn 0.12+ reads it as `data` instead.
plt.figure(figsize=[12,8])
sns.boxplot(x=cars.max_power)
plt.show()

As we can see we have outliers, lets remove few outliers above 175 bhp

In [None]:
# Keep the rows whose max_power is not greater than 175
cars = cars.loc[~cars.max_power.gt(175)]
cars

Let's drop the name column from the data and proceed to build the linear model.


In [None]:
cars.drop('name', axis = 1, inplace = True)

In [None]:
cars.head()

In [None]:
# looking at pair plot for numerical data
sns.pairplot(cars)
plt.show()

In [None]:
# correlation between numerical variables in the data
# `cars` still holds text columns (fuel, seller_type, transmission, owner).
# pandas 2.x raises on them instead of silently skipping them, so the numeric
# columns are selected explicitly before correlating.
cars.select_dtypes(include='number').corr()

In [None]:
# heatmap for cars numerical data
# Only the numeric columns are correlated: pandas 2.x no longer drops the text
# columns automatically inside DataFrame.corr().
plt.figure(figsize = [15,8])
sns.heatmap(cars.select_dtypes(include='number').corr(), annot = True, cmap = 'RdYlGn')
plt.show()

lets create dummies for the categorical columns in cars dataset

In [None]:
# Preview the frame to see the categorical columns that will be encoded
cars.head()

creating dummies for fuel, seller_type, transmission, owner columns

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
catcols = ['fuel', 'seller_type', 'transmission', 'owner']
# dtype=int keeps the indicator columns as 0/1 integers. pandas 2.x defaults to
# bool, which turns the mixed design matrix into an object array when it is
# converted to NumPy for statsmodels OLS and the VIF computation.
dummies = pd.get_dummies(cars[catcols], drop_first= True, dtype=int)
dummies.head()

Combining these dummies with cars dataset and dropping original columns

In [None]:
# concatinating datasets cars and dummies
cars_final = pd.concat([cars, dummies],axis = 1)
cars_final.head()

In [None]:
# dropping original columns 'fuel', 'seller_type', 'transmission', 'owner'
cars_final.drop(['fuel', 'seller_type', 'transmission', 'owner'], axis = 1, inplace = True)

Let's look at the heatmap again to check the collinearity between the variables in the data.

In [None]:
cars_final.corr()

In [None]:
plt.figure(figsize = [20,15])
sns.heatmap(cars_final.corr(), annot = True, cmap = 'RdYlGn')
plt.show()

In [None]:
cars_final.shape

Finally we have 17 columns in the dataset with all columns in numerical.


## Feature selection

In [None]:
# importing required libraries
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression

In [None]:
y = cars_final[['selling_price']]
X = cars_final.drop('selling_price',axis= 1)

In [None]:
X.head()

In [None]:
y.head()

Lets split the data into train and test


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.30, random_state = 100)

In [None]:
print('X_train shape', X_train.shape)
print('X_test shape', X_test.shape)
print('y_train shape',y_train.shape)
print('y_test shape',y_test.shape)

Checking r2 score using sklearn linear model

In [None]:
# Baseline sklearn linear regression.
# The model is fitted on the training split only and scored on the held-out
# test split. Fitting and scoring on the full X/y would ignore the split made
# above and report an in-sample R^2 that has already seen the test rows.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions and R^2 on data the model has not seen
y_pred = lr.predict(X_test)
r2score = r2_score(y_test, y_pred)
r2score

As we can see r2score is very low for this model, lets move on to build model using statsmodel

In [None]:
#importing rfe
from sklearn.feature_selection import RFE

Taking the top 15 features using an automatic selection technique (RFE)

In [None]:
# Recursive feature elimination around the linear model, keeping 15 features.
# n_features_to_select is passed by keyword: scikit-learn 1.0+ makes it
# keyword-only, so RFE(lr, 15) raises TypeError there.
rfe = RFE(estimator=lr, n_features_to_select=15)

In [None]:
rfe.fit(X_train,y_train)

In [None]:
# list of columns recommended by rfe
list(zip(X_train.columns,rfe.support_,rfe.ranking_))

In [None]:
# list of columns recommended by RFE
finalcol = X_train.columns[rfe.support_]
finalcol

In [None]:
# List of columns removed by RFE
X_train.columns[~rfe.support_]

## Building model 1 using statsmodel

In [None]:
X_train = X_train[finalcol]


In [None]:
#building statsmodel
X_train_sm = sm.add_constant(X_train)
model1 = sm.OLS(y_train,X_train_sm).fit()
model1.summary()

Let's drop the fuel_Petrol column since it has a high p-value

In [None]:
# Remove the fuel_Petrol predictor from the training features
X_train = X_train.drop(columns='fuel_Petrol')

## Building model 2

In [None]:
# Refit OLS on the reduced predictors plus an intercept column
X_train_sm = sm.add_constant(X_train)
model1 = sm.OLS(endog=y_train, exog=X_train_sm).fit()
model1.summary()

All p-values are good; however, let's look at the variance inflation factor (VIF) of all the variables to check for multicollinearity

In [None]:
X_train = X_train.drop('owner_Fourth & Above Owner', axis = 1)

In [None]:
## Buliding model 3

In [None]:
#building statsmodel
X_train_sm = sm.add_constant(X_train)
model1 = sm.OLS(y_train,X_train_sm).fit()
model1.summary()

In [None]:
X_train = X_train.drop('mileage', axis = 1)

In [None]:
#building statsmodel
X_train_sm = sm.add_constant(X_train)
model1 = sm.OLS(y_train,X_train_sm).fit()
model1.summary()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of every current training predictor, highest first.
# Note: X is rebound to X_train here, replacing the full predictor frame.
# NOTE(review): X_train has no intercept column - confirm whether the VIF
# should be computed on the design matrix that includes the constant.
X = X_train
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [variance_inflation_factor(X.values, col_idx) for col_idx in range(X.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

In [None]:
X_train = X_train.drop(['seats'], axis = 1)

In [None]:
#building statsmodel
X_train_sm = sm.add_constant(X_train)
model1 = sm.OLS(y_train,X_train_sm).fit()
model1.summary()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of every current training predictor, highest first.
# Note: X is rebound to X_train here, replacing the full predictor frame.
# NOTE(review): X_train has no intercept column - confirm whether the VIF
# should be computed on the design matrix that includes the constant.
X = X_train
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [variance_inflation_factor(X.values, col_idx) for col_idx in range(X.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

In [None]:
X_train = X_train.drop(['year'], axis = 1)

In [None]:
#building statsmodel
X_train_sm = sm.add_constant(X_train)
model1 = sm.OLS(y_train,X_train_sm).fit()
model1.summary()

In [None]:
X_train = X_train.drop(['fuel_LPG'], axis = 1)

In [None]:
#building statsmodel
X_train_sm = sm.add_constant(X_train)
model1 = sm.OLS(y_train,X_train_sm).fit()
model1.summary()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of every current training predictor, highest first.
# Note: X is rebound to X_train here, replacing the full predictor frame.
# NOTE(review): X_train has no intercept column - confirm whether the VIF
# should be computed on the design matrix that includes the constant.
X = X_train
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [variance_inflation_factor(X.values, col_idx) for col_idx in range(X.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

In [None]:
X_train = X_train.drop(['max_power'], axis = 1)

In [None]:
#building statsmodel
X_train_sm = sm.add_constant(X_train)
model1 = sm.OLS(y_train,X_train_sm).fit()
model1.summary()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of every current training predictor, highest first.
# Note: X is rebound to X_train here, replacing the full predictor frame.
# NOTE(review): X_train has no intercept column - confirm whether the VIF
# should be computed on the design matrix that includes the constant.
X = X_train
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [variance_inflation_factor(X.values, col_idx) for col_idx in range(X.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

In [None]:
X_train = X_train.drop(['seller_type_Trustmark Dealer'], axis = 1)

In [None]:
#building statsmodel
X_train_sm = sm.add_constant(X_train)
model1 = sm.OLS(y_train,X_train_sm).fit()
model1.summary()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of every current training predictor, highest first.
# Note: X is rebound to X_train here, replacing the full predictor frame.
# NOTE(review): X_train has no intercept column - confirm whether the VIF
# should be computed on the design matrix that includes the constant.
X = X_train
vif = pd.DataFrame({
    'Features': X.columns,
    'VIF': [variance_inflation_factor(X.values, col_idx) for col_idx in range(X.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif