In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#importing warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# importing required libraries
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

In [None]:
# Load the CarDekho listings CSV into the `cars` DataFrame.
cars = pd.read_csv(
    filepath_or_buffer="/kaggle/input/vehicle-dataset-from-cardekho/Car details v3.csv"
)

In [None]:
# Preview the first five rows of the raw data.
cars.head(n=5)

In [None]:
# Report the (rows, columns) shape and the total number of cells.
print("shape of the dataset is : ", cars.shape)
print("size of the dataset is : ", cars.shape[0] * cars.shape[1])

In [None]:
# Show the dtype pandas inferred for each column when reading the CSV.
cars.dtypes


In [None]:
# Count the missing values in every column.
cars.isna().sum()

As there are null values in the data above, we have to deal with them before proceeding further.

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
cars.describe()

In [None]:
# Rows whose mileage is missing.
cars.loc[cars['mileage'].isna()]

In [None]:
# Rows whose engine value is missing.
cars.loc[cars['engine'].isna()]

In [None]:
# Rows whose max_power is missing.
cars.loc[cars['max_power'].isna()]

In [None]:
# Rows whose torque is missing.
cars.loc[cars['torque'].isna()]

In [None]:
# Rows whose seat count is missing.
cars.loc[cars['seats'].isna()]

As we can see, for all the rows where torque is null, mileage, max_power and seats are also null, so we can drop those rows from the dataset.

In [None]:
# Discard every row that contains at least one missing value.
cars = cars.dropna(axis='index')

In [None]:
# Confirm how many rows and columns remain once incomplete rows are gone.
print(
    "The shape of the data after dropping the null values is: ",
    cars.shape,
    sep=" ",
)

In [None]:
# Re-count the missing values per column after the drop.
cars.isna().sum()

lets convert the mileage,engine,maxpower,torque columns data types

In [None]:
# Keep only the leading number of each mileage string (the text before the
# first space) and store it as a float column.
cars['mileage'] = cars['mileage'].str.split(' ').str[0].astype(float)
cars['mileage'].dtype

In [None]:
# Peek at the converted mileage values.
cars.mileage.head(5)

In [None]:
# Keep only the leading number of each engine string (the text before the
# first space) and store it as a 64-bit integer column.
cars['engine'] = cars['engine'].str.split(' ').str[0].astype('int64')

In [None]:
# Confirm the engine column is now numeric.
print(cars['engine'].dtype)

In [None]:
# Keep only the leading number of each max_power string (the text before the
# first space) and store it as a float column.
cars['max_power'] = cars['max_power'].str.split(' ').str[0].astype(float)
print(cars['max_power'].dtype)

In [None]:
# Inspect the distinct raw torque strings.
cars['torque'].unique()

Dropping the torque column, since its values use different units and different representations.

In [None]:
# Remove the torque column from the working frame.
cars = cars.drop(columns='torque')

In [None]:
# The ten car names that occur most often in the data.
cars['name'].value_counts().iloc[:10]

**as we can see that *Maruti Swift Dzire VDI* is the most selling car**

In [None]:
# Bar chart of the ten most frequent car names.
plt.figure(figsize=(12, 8))
cars['name'].value_counts().iloc[:10].plot.bar()
plt.show()

In [None]:
# Bar chart of the number of listings per model year.
plt.figure(figsize=(12, 8))
cars.groupby('year')['name'].count().plot.bar()
plt.show()

**inference** :
customers bought the cars in between years 2015-2018
cars which are getting sold mostly are also from the 2015-2018

In [None]:
# Distribution of the target variable, selling_price.
# NOTE(review): seaborn has deprecated `distplot`; consider `histplot`/`displot`
# once the seaborn version in use is confirmed to provide them.
plt.figure(figsize=(12, 8))
sns.distplot(cars['selling_price'])
plt.show()

We can see from the above graph that the variable selling price is slightly right (positively) skewed.

In [None]:
# Distribution of km_driven.
# NOTE(review): seaborn has deprecated `distplot`; consider `histplot`/`displot`
# once the seaborn version in use is confirmed to provide them.
plt.figure(figsize=(12, 8))
sns.distplot(cars['km_driven'])
plt.show()

we can see from the above graph that the most km_driven values are below or approximately 4lakhs

In [None]:
# List the column labels currently present in the working frame.
cars.columns

In [None]:
# Count plots for fuel, seller_type, transmission and owner on a 2x2 grid.
# Each column is passed through the `x` keyword: seaborn releases from 0.12 on
# no longer accept the plotted variable as the first positional argument.

plt.figure(figsize=(20,10))
plt.subplot(2,2,1)
sns.countplot(x=cars.fuel)
plt.subplot(2,2,2)
sns.countplot(x=cars.seller_type)
plt.subplot(2,2,3)
sns.countplot(x=cars.transmission)
plt.subplot(2,2,4)
sns.countplot(x=cars.owner)
plt.show()

inferences:
- most of the customers use "diesel/petrol" fuel type cars
- most of the cars are getting sold by individual seller type
- most of the car's transmission is manual 
- first owner cars are getting sold mostly than second,third and fourth and above 


In [None]:
# Mean selling price per level of each categorical variable, on a 2x2 grid.
# The variables are passed as `x=`/`y=` keywords: seaborn releases from 0.12 on
# raise a TypeError when they are given positionally.

plt.figure(figsize=(20,10))
plt.subplot(2,2,1)
sns.barplot(x=cars.owner, y=cars.selling_price)
plt.subplot(2,2,2)
sns.barplot(x=cars.fuel, y=cars.selling_price)
plt.subplot(2,2,3)
sns.barplot(x=cars.seller_type, y=cars.selling_price)
plt.subplot(2,2,4)
sns.barplot(x=cars.transmission, y=cars.selling_price)
plt.show()

inferences :
- testdrive cars have high selling price
- selling price is high for diesel fuel type cars followed by petrol type
- automatic transmission cars are highly priced than manual transmission
- dealer type selling price is more compared to individual selling price


In [None]:
# km_driven vs selling_price.
# `x=`/`y=` keywords are required by seaborn 0.12+ (positional use is rejected).
plt.figure(figsize=(20,10))
sns.scatterplot(x=cars.km_driven, y=cars.selling_price)
plt.show()

inference:
- as we can see that if the km_driven increases the selling_price decreases
- as we can see from the graph that km_driven has outliers above 10lakhs so we are removing those outliers of cars which have been driven more than 10lakh kms

In [None]:
# Keep only the rows that are not above 1,000,000 km driven.
cars = cars.loc[~cars['km_driven'].gt(1000000)]
cars.head(n=5)

In [None]:
# (rows, columns) remaining after the km_driven filter.
cars.shape

In [None]:
# engine vs selling_price.
# `x=`/`y=` keywords are required by seaborn 0.12+ (positional use is rejected).
plt.figure(figsize=(12,8))
sns.scatterplot(x=cars.engine, y=cars.selling_price)
plt.show()

-  we can see from the above graph that higher the engine cc higher the selling_price and we can say that there is slight linear relationship between the two variables
- removing the outliers having engine capacity more than 3400

In [None]:
# Keep only the rows whose engine value is not above 3400.
cars = cars.loc[~cars['engine'].gt(3400)]
cars.head(n=5)

In [None]:
# (rows, columns) remaining after the engine > 3400 filter.
cars.shape

In [None]:
# mileage vs selling_price.
# `x=`/`y=` keywords are required by seaborn 0.12+ (positional use is rejected).
plt.figure(figsize=(15,5))
sns.scatterplot(x=cars.mileage, y=cars.selling_price)
plt.show()

In [None]:
# max_power vs selling_price.
# `x=`/`y=` keywords are required by seaborn 0.12+ (positional use is rejected).
plt.figure(figsize=(15,5))
sns.scatterplot(x=cars.max_power, y=cars.selling_price)
plt.show()

In [None]:
# seats vs selling_price.
# `x=`/`y=` keywords are required by seaborn 0.12+ (positional use is rejected).
plt.figure(figsize=(15,5))
sns.scatterplot(x=cars.seats, y=cars.selling_price)
plt.show()

In [None]:
# Horizontal box plot of selling_price.
# The column is passed as `x=`: seaborn 0.12+ treats a first positional
# argument as `data`, not as the variable to plot.
plt.figure(figsize=(15,5))
sns.boxplot(x=cars.selling_price)
plt.show()

- as we can see mostly outliers are above 50lakhs so we are removing these records from the dataset

In [None]:
# Keep only the rows whose selling price is not above 5,000,000.
cars = cars.loc[~cars['selling_price'].gt(5000000)]
cars.head(n=5)

In [None]:
# Report the frame's shape after the selling-price filter.
print(
    "The shape of the dataset after removing the outliers above 50lakh are :  ",
    cars.shape,
    sep=" ",
)

In [None]:
# Horizontal box plot of engine.
# The column is passed as `x=`: seaborn 0.12+ treats a first positional
# argument as `data`, not as the variable to plot.
plt.figure(figsize=(15,5))
sns.boxplot(x=cars.engine)
plt.show()

for engine column we have outliers above 2500 cc and below 650, so we are removing these records from the dataset

In [None]:
# Keep rows whose engine value is neither above 2500 nor below 650.
cars = cars.loc[~cars['engine'].gt(2500) & ~cars['engine'].lt(650)]
cars.shape

In [None]:
# Horizontal box plot of mileage.
# The column is passed as `x=`: seaborn 0.12+ treats a first positional
# argument as `data`, not as the variable to plot.
plt.figure(figsize=(15,5))
sns.boxplot(x=cars.mileage)
plt.show()

removing the outliers from mileage variable >32.5 and <5

In [None]:
# Remove mileage outliers: rows above 32.5 and rows below 5 are both discarded.
# Both comparisons are OR-ed inside one parenthesised group before negating;
# negating only the first comparison would let the mileage < 5 rows through.
cars = cars[~((cars.mileage > 32.5) | (cars.mileage < 5))]

In [None]:
# (rows, columns) remaining after the mileage filter.
cars.shape

In [None]:
# Horizontal box plot of max_power.
# The column is passed as `x=`: seaborn 0.12+ treats a first positional
# argument as `data`, not as the variable to plot.
plt.figure(figsize=(15,5))
sns.boxplot(x=cars.max_power)
plt.show()

As we can see we have outliers, lets remove few outliers above 175 BHP

In [None]:
# Keep only the rows whose max_power is not above 175.
cars = cars.loc[~cars['max_power'].gt(175)]
cars.shape

In [None]:
# Verify that no missing values remain after the outlier filters.
cars.isna().sum()

lets drop the name column from the data and proceed further to build the linear model

In [None]:
# Remove the name column before modelling.
cars = cars.drop(columns='name')

In [None]:
# Preview the frame after dropping the name column.
cars.head(n=5)

In [None]:
# Pairwise scatter plots / histograms of the working frame.
sns.pairplot(data=cars)
plt.show()

In [None]:
# Pairwise correlation of the numeric columns.
# `cars` still contains the string columns fuel, seller_type, transmission and
# owner at this point; pandas 2.0+ no longer drops non-numeric columns silently
# in `corr()` and raises instead, so select the numeric columns explicitly.
cars.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the correlations between numeric columns.
# The numeric columns are selected explicitly because `cars` still holds string
# columns here and pandas 2.0+ raises when `corr()` meets non-numeric data.
plt.figure(figsize=(18,8))
sns.heatmap(cars.select_dtypes(include='number').corr(),annot=True,cmap= 'Spectral')
plt.show()

lets create dummies for categorical columns in cars dataset

We pass `drop_first=True` so that the first level of each categorical column is dropped and acts as the baseline; this avoids perfectly collinear dummy columns in the linear model.

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
cat_cols = ['fuel', 'seller_type', 'transmission', 'owner']
dummies = pd.get_dummies(data=cars.loc[:, cat_cols], drop_first=True)
dummies.head(n=5)

In [None]:
# Append the dummy columns to the working frame, side by side.
cars_final = pd.concat((cars, dummies), axis='columns')
cars_final.head(n=5)

In [None]:
# Remove the original categorical columns now that their dummies are present.
cars_final = cars_final.drop(
    columns=['fuel', 'seller_type', 'transmission', 'owner']
)

let's check the heatmap again to check the correlation 

In [None]:
# Pearson correlation matrix of the encoded frame.
cars_final.corr(method='pearson')

In [None]:
# Annotated correlation heatmap of the encoded frame.
plt.figure(figsize=(20, 15))
sns.heatmap(data=cars_final.corr(), annot=True, cmap='RdYlGn')
plt.show()

Finally we have 21 columns in the dataset with all the columns in numerical

## step 1 : Feature selection

In [None]:
#importing the required libraries
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.linear_model import LinearRegression

In [None]:
# Target as a one-column DataFrame; every other column is a feature.
y = cars_final.loc[:, ['selling_price']]
x = cars_final.drop(columns='selling_price')


In [None]:
# Preview the feature matrix.
x.head(n=5)

In [None]:
# Preview the target.
y.head(n=5)

checking r2score using linear regression model

In [None]:
# Ordinary least-squares estimator (intercept fitted, which is the default).
lr = LinearRegression(fit_intercept=True)

In [None]:
# Fit the baseline model on the full (unsplit) data.
lr.fit(X=x, y=y)

In [None]:
# In-sample predictions from the baseline model.
y_pred = lr.predict(X=x)
y_pred

In [None]:
# In-sample R^2 of the baseline model.
r2score = r2_score(y_true=y, y_pred=y_pred)
r2score

As we can see, the r2 score is very low, so let's build a model using statsmodels.

## lets split the data into train and test

In [None]:
# 70/30 train/test split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=100,
)

In [None]:
# Report the shape of each piece of the split.
for label, frame in (
    ("x_train shape ", x_train),
    ("x_test shape ", x_test),
    ("y_train shape ", y_train),
    ("y_test shape ", y_test),
):
    print(label, frame.shape)

In [None]:
# Fresh, unfitted linear estimator to be used by the feature selector.
lr = LinearRegression(fit_intercept=True)

In [None]:
#importing rfe
from sklearn.feature_selection import RFE

Taking top 15 features by automatic selection technique

In [None]:
# Recursive feature elimination keeping the 15 top-ranked features.
# `n_features_to_select` is keyword-only in scikit-learn 1.0+; passing it
# positionally raises a TypeError there, so it is named explicitly.
rfe = RFE(estimator=lr, n_features_to_select=15)

In [None]:
# Run the elimination on the training split.
rfe.fit(X=x_train, y=y_train)

In [None]:
# (column, selected?, rank) for every training feature.
[
    (col, selected, rank)
    for col, selected, rank in zip(x_train.columns, rfe.support_, rfe.ranking_)
]

In [None]:
# Columns that RFE kept.
final_col = x_train.columns[rfe.get_support()]
final_col

In [None]:
# Columns that RFE eliminated (the complement of the support mask).
x_train.columns[np.logical_not(rfe.support_)]

## Building model 1 using statsmodel

In [None]:
# Restrict the training features to the RFE-selected columns.
x_train = x_train.loc[:, final_col]

In [None]:
# statsmodels' OLS does not add an intercept itself; prepend a constant column.
x_train_sm = sm.add_constant(x_train, prepend=True)

In [None]:
# Fit OLS on the training data and show the regression summary.
model = sm.OLS(endog=y_train, exog=x_train_sm).fit()
model.summary()

Let's drop `fuel_Petrol`, as it has a high p-value.

In [None]:
# Remove the fuel_Petrol dummy from the training features.
x_train = x_train.drop(columns='fuel_Petrol')

In [None]:
# Rebuild the design matrix with an intercept and refit OLS on the reduced feature set.
x_train_sm = sm.add_constant(x_train, prepend=True)
model = sm.OLS(endog=y_train, exog=x_train_sm).fit()
model.summary()

all p values are good, however,lets see the variance inflation factor for all the variables to check multicollinearity


In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Variance inflation factor of each remaining training feature, rounded to two
# decimals and sorted from highest to lowest.
# NOTE(review): the matrix passed to variance_inflation_factor is x_train, which
# has no intercept column (the constant lives only in x_train_sm) - confirm
# that this is intended.
x = x_train
vif_values = [
    variance_inflation_factor(x.values, col_idx) for col_idx in range(x.shape[1])
]
vif = (
    pd.DataFrame({'Features': x.columns, 'VIF': vif_values})
    .round({'VIF': 2})
    .sort_values(by="VIF", ascending=False)
)
vif

In [None]:
# Remove the 'owner_Fourth & Above Owner' dummy from the training features.
x_train = x_train.drop(columns='owner_Fourth & Above Owner')

In [None]:
# Rebuild the design matrix with an intercept and refit OLS on the reduced feature set.
x_train_sm = sm.add_constant(x_train, prepend=True)
model = sm.OLS(endog=y_train, exog=x_train_sm).fit()
model.summary()

In [None]:
# Remove the 'seller_type_Trustmark Dealer' dummy from the training features.
x_train = x_train.drop(columns='seller_type_Trustmark Dealer')

In [None]:
# Rebuild the design matrix with an intercept and refit OLS on the reduced feature set.
x_train_sm = sm.add_constant(x_train, prepend=True)
model = sm.OLS(endog=y_train, exog=x_train_sm).fit()
model.summary()

In [None]:
# Recompute the variance inflation factor of each remaining training feature,
# rounded to two decimals and sorted from highest to lowest.
# NOTE(review): the matrix passed to variance_inflation_factor is x_train, which
# has no intercept column (the constant lives only in x_train_sm) - confirm
# that this is intended.
x = x_train
vif_values = [
    variance_inflation_factor(x.values, col_idx) for col_idx in range(x.shape[1])
]
vif = (
    pd.DataFrame({'Features': x.columns, 'VIF': vif_values})
    .round({'VIF': 2})
    .sort_values(by="VIF", ascending=False)
)
vif

In [None]:
# Remove the seats column from the training features.
x_train = x_train.drop(columns='seats')

In [None]:
# Rebuild the design matrix with an intercept and refit OLS on the reduced feature set.
x_train_sm = sm.add_constant(x_train, prepend=True)
model = sm.OLS(endog=y_train, exog=x_train_sm).fit()
model.summary()