In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

In [None]:
# Load the CarDekho used-car listings into a DataFrame.
CSV_PATH = '../input/vehicle-dataset-from-cardekho/Car details v3.csv'
df = pd.read_csv(CSV_PATH)

In [None]:
# number of (rows, columns) in the raw dataset
print(df.shape)

In [None]:
# how many distinct car names appear in the listings
df['name'].nunique()

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
# missing values per column
df.isna().sum()

### EDA and Feature Engineering

In [None]:
# kernel density estimate of the target variable (selling_price)
sns.displot(df['selling_price'],kind='kde')

In [None]:
# summary statistics of the target variable
df.selling_price.describe()

The majority of selling prices are under 10 lakh (10,00,000), as seen from the plot above.

Next, look at the relationship of selling price with other variables: mileage, km driven, seats, owner, year, transmission and fuel.

In [None]:
# selling price vs km driven
# Restrict to price < 20,00,000 and km_driven < 5,00,000 before plotting.
price_ok = df['selling_price'] < 20_00_000
km_ok = df['km_driven'] < 5_00_000
dat = df.loc[price_ok & km_ok]

sns.regplot(x='km_driven', y='selling_price', data=dat)

In [None]:
# selling price vs mileage
# Only cars priced below 10,00,000; mileage is stored as "<number> <unit>",
# so keep the leading number and convert it to float.
under_10l = df.loc[df['selling_price'] < 10_00_000]
data = pd.DataFrame({
    'mileage': under_10l.mileage.str.split(expand=True)[0].astype('float64'),
    'sp': under_10l.selling_price,
})
sns.regplot(x='mileage', y='sp', data=data)

In [None]:
# selling price vs owner
# catplot is figure-level: it creates its own figure, so the size has to be
# passed via `height` (a plt.figure call afterwards only opens an empty figure).
sns.catplot(data=df,x='owner',y='selling_price',height=10)
plt.show()

In [None]:
# There are only 5 test drive cars; they act as outliers which can lead to
# inaccurate predictions, so they are dropped.
df = df.loc[df['owner'] != 'Test Drive Car']

In [None]:
# selling price distribution per number of seats
fig, ax = plt.subplots(figsize=(10, 10))
sns.boxplot(x='seats', y='selling_price', data=df, ax=ax)

In [None]:
# selling price distribution per year
fig, ax = plt.subplots(figsize=(15, 7))
sns.boxplot(x='year', y='selling_price', data=df, ax=ax)

In [None]:
# selling price distribution per transmission type
fig, ax = plt.subplots(figsize=(7, 7))
sns.boxplot(x='transmission', y='selling_price', data=df, ax=ax)

In [None]:
#fuel (the x-axis below is the fuel type, not seller_type)
plt.figure(figsize=(7,7))
sns.boxplot(data=df,x='fuel',y='selling_price')

### Conclusions from the above plots


1. It seems that as the km_driven increases, the selling_price decreases, which makes sense
2. Selling price and mileage are directly proportional, as expected
3. As the number of previous owners decreases, the price increases
4. Automatic transmission vehicles sell at a higher price
5. Newer cars also sell at a higher price, except for the year 2020. (This effect is mostly due to the pandemic.)
Most patterns are as one would expect. 

In [None]:
# The brand is the first whitespace-separated token of the car name.
name_tokens = df['name'].str.split(expand=True)
df['Brand'] = name_tokens[0]
# power data is available hence we don't need Torque&RPM data;
# the full name is dropped too once the brand has been extracted
df = df.drop(columns=['name', 'torque'])

In [None]:
# number of listings per brand, most frequent first
# (the count is taken on the `year` column, so the result column is named 'year')
Brand = (
    df.groupby('Brand')['year']
    .count()
    .to_frame()
    .sort_values(by='year', ascending=False)
)
Brand


In [None]:
# the 10 brands with the fewest listings (Brand is sorted by count, descending)
remove = Brand.index[-10:].tolist()
remove

In [None]:
# keep only the rows whose brand is not among the 10 rarest brands
is_rare_brand = df['Brand'].isin(remove)
df = df[~is_rare_brand]

In [None]:
# listings per remaining brand
fig, ax = plt.subplots(figsize=(20, 10))
sns.countplot(x='Brand', data=df, ax=ax)

In [None]:
# Keep only the first whitespace-separated token of each value, i.e. strip the
# trailing unit text from engine / mileage / max_power.
for unit_col in ('engine', 'mileage', 'max_power'):
    df.loc[:, unit_col] = df[unit_col].astype('str').str.split(expand=True)[0]


In [None]:
# converting to correct data type
# Rows whose max_power is just the unit string 'bhp' (no number) cannot be
# converted to float, so they are removed first.
df = df.loc[df['max_power'] != 'bhp']
df = df.astype({'engine': 'float64', 'mileage': 'float64', 'max_power': 'float64'})

In [None]:
# Correlate numeric columns only: the frame still holds string columns
# (fuel, owner, Brand, ...) and DataFrame.corr() raises on those in pandas >= 2.0.
corrmatrix=df.select_dtypes(include='number').corr()
sns.heatmap(corrmatrix,square=True,annot=True)

In [None]:
# column dtypes and non-null counts after the numeric conversions
df.info()

From the above EDA and correlation matrix, the factors to be considered for the model are: year, transmission, engine, power, owner and fuel.
Let's plot these in pairs to see if we find anything interesting.

In [None]:
sns.set()
col=['selling_price','year','transmission','engine','max_power','owner','fuel']
# `size` was renamed to `height` in seaborn 0.9 and is no longer accepted by
# current releases.
sns.pairplot(df[col], height=3)
plt.show()

### Cleaning the data for modelling

In [None]:
# re-check the non-null counts per column before dropping incomplete rows
df.info()

In [None]:
# as mileage, engine and max_power are unique and important parameters for prediction, rows which have these values empty should be dropped

In [None]:
# drop every row that still has a missing value in any column
df = df.dropna()

In [None]:
# dealing with categorical variables
# owner and fuel are one-hot encoded; transmission has two levels and is
# mapped to a single 0/1 column (Manual -> 1, Automatic -> 0).
transmission_codes = {'Manual': 1, 'Automatic': 0}
ownerohc = pd.get_dummies(df['owner'])
fuelohc = pd.get_dummies(df['fuel'])
df['transmission'] = df['transmission'].map(transmission_codes)

In [None]:
# replace the raw fuel/owner columns with their one-hot encoded versions
df = pd.concat(
    [df.drop(columns=['fuel', 'owner']), ownerohc, fuelohc],
    axis=1,
)

In [None]:
# Preview only the first rows of the encoded frame instead of rendering it whole.
df.head()

In [None]:
# Columns kept for modelling: the target (selling_price) plus the chosen
# features - year, km_driven, mileage, transmission, engine, max_power, Brand,
# and the one-hot encoded owner and fuel columns.
cols = [
    'year', 'selling_price', 'km_driven', 'mileage', 'transmission',
    'engine', 'max_power', 'Brand',
    # owner categories (one-hot)
    'First Owner', 'Fourth & Above Owner', 'Second Owner', 'Third Owner',
    # fuel types (one-hot)
    'CNG', 'Diesel', 'LPG', 'Petrol',
]

In [None]:
# Take an explicit copy: `dat` is modified afterwards, and modifying a plain
# column selection of `df` triggers pandas' SettingWithCopyWarning.
dat=df[cols].copy()

In [None]:
# One-hot encode the brand and swap the raw column for the encoded ones.
# The drop is not done in place: `dat` is a column selection of `df`, and an
# in-place drop on it raises SettingWithCopyWarning.
brandohc=pd.get_dummies(dat['Brand'])
dat=pd.concat([dat.drop(columns=['Brand']),brandohc],axis=1)

### Modelling

In [None]:
# split the modelling frame into features (X) and target (y)
TARGET = 'selling_price'
y = dat[TARGET]
X = dat.drop(columns=[TARGET])

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# explicit names instead of `import *`, so it is clear where each metric comes from
from sklearn.metrics import mean_absolute_error, r2_score

In [None]:
# hold out a test set; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [None]:
# baseline: ordinary linear regression, scored by MAE on the test set
model1 = LinearRegression().fit(X_train, y_train)
preds = model1.predict(X_test)
mean_absolute_error(y_test, preds)

In [None]:
# mean selling price - presumably shown to put the MAE above into perspective
y.mean()

In [None]:
#use randomtreeregressor/xgboost
#get coef from the linearregressor, feature engineer a bit

In [None]:
# decision tree regressor, scored by MAE on the test set
from sklearn.tree import DecisionTreeRegressor

model2 = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
preds2 = model2.predict(X_test)
mean_absolute_error(y_test, preds2)

In [None]:
# display the fitted decision tree estimator
model2

In [None]:
# the appropriate amount of leaf nodes
def mae(max_leaf, X_train, X_test, y_train, y_test):
    """Return the MAE on (X_test, y_test) of a tree capped at `max_leaf` leaf nodes.

    A DecisionTreeRegressor with max_leaf_nodes=max_leaf and random_state=0 is
    fitted on (X_train, y_train); its predictions for X_test are scored
    against y_test with mean_absolute_error.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf, random_state=0).fit(X_train, y_train)
    return mean_absolute_error(y_test, tree.predict(X_test))

# try a range of leaf-node limits and report the MAE for each
leaf_grid = [50, 100, 250, 500, 750, 800, 825, 850, 875, 900, 1000, 2000, 5000, 7500, 10000]
for m in leaf_grid:
    error = mae(m, X_train, X_test, y_train, y_test)
    print(f'Max leaf nodes = {m}, MAE = {error}')

Max_leaf_nodes=800

In [None]:
# gradient boosting with XGBoost's default hyperparameters
from xgboost import XGBRegressor

model3 = XGBRegressor()
model3.fit(X_train, y_train)
preds3 = model3.predict(X_test)
mean_absolute_error(y_test, preds3)

In [None]:
#Using early stopping rounds
from xgboost import XGBRegressor
# Early stopping needs a validation set to monitor; without an eval_set the
# setting either raises (recent xgboost) or is ignored. Hold out part of the
# training data for it so the test set stays untouched for the final score.
X_fit,X_val,y_fit,y_val=train_test_split(X_train,y_train,random_state=42)
model3=XGBRegressor(early_stopping_rounds=5,n_estimators=500)
model3.fit(X_fit,y_fit,eval_set=[(X_val,y_val)],verbose=False)
preds3=model3.predict(X_test)
mean_absolute_error(y_test,preds3)

In [None]:
# residuals (actual - predicted) against the predicted price
residuals = y_test - preds3
sns.scatterplot(x=preds3, y=residuals)

In [None]:
# coefficient of determination of the XGBoost predictions on the test set
from sklearn.metrics import r2_score

r2_score(y_true=y_test, y_pred=preds3)

Above is a model which predicts the selling price of a vehicle, given inputs such as manufacturing year, km driven, mileage, transmission type, engine volume, max power, owner, fuel and brand.
The R-square value is 0.96

Possible improvements-
1. Removing outliers in various parameters.
2. Hyperparameter tuning using grid search and random search
3. Using the model name of the car along with the brand.
