In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the car-listings CSV into a DataFrame (the path is relative to the notebook's working directory)
car=pd.read_csv('../input/vehicle-dataset-from-cardekho/Car details v3.csv')

## Feature engineering and visualization

In [None]:
# Preview the first rows of the loaded data
car.head()

In [None]:
# (number of rows, number of columns) of the dataset
car.shape

In [None]:
# Data type of each column
car.dtypes

In [None]:
# Count the missing values in each column
car.isna().sum()

In [None]:
# Rows where mileage is missing
car[car['mileage'].isna()]

In [None]:
# Rows where engine is missing
car[car['engine'].isna()]

In [None]:
# Rows where max_power is missing
car[car['max_power'].isna()]

In [None]:
# Rows where torque is missing
car[car['torque'].isna()]

The torque, mileage, max_power, engine and seats columns have null values at the same row indices,
so we can drop every row that contains a null value.

In [None]:
# Remove every row that contains at least one missing value
car.dropna(axis='index', how='any', inplace=True)

In [None]:
# Inspect the distinct raw values of the torque column
car['torque'].unique()

Dropping the torque column, since its values use different units and inconsistent representations.

In [None]:
# Drop the torque column.
# `columns=` is used because passing the axis positionally (`drop(labels, 1)`)
# is deprecated in pandas 1.x and raises a TypeError in pandas 2.x.
car.drop(columns=['torque'], inplace=True)

In [None]:
# Ten most frequent car names together with their row counts
car['name'].value_counts().iloc[:10]

In [None]:
# Bar chart of the ten most frequent car names
fig, ax = plt.subplots(figsize=(12, 8))
car['name'].value_counts().head(10).plot.bar(ax=ax)
plt.show()

Maruti Swift Dzire VDI is the most frequently listed car in the dataset.

In [None]:
# Show the distinct car names, then the total number of name entries
names = car['name']
print(names.unique())
print(names.value_counts().sum())

Dropping car name column as it is difficult to predict the selling price based on car name

In [None]:
# Drop the car name column.
# `columns=` is used because passing the axis positionally (`drop(labels, 1)`)
# is deprecated in pandas 1.x and raises a TypeError in pandas 2.x.
car.drop(columns=['name'], inplace=True)

In [None]:
# Keep only the leading number of each "<value> <unit>" string and cast it:
# mileage and max_power become floats, engine becomes an int.
for column, caster in (('mileage', float), ('engine', int), ('max_power', float)):
    # `caster=caster` binds the current caster to the lambda at definition time
    car[column] = car[column].apply(lambda raw, caster=caster: caster(raw.split(' ')[0]))

In [None]:
# New column "years_driven": 2021 minus the value in the year column
car['years_driven'] = 2021 - car['year']


In [None]:
# Drop the year column (years_driven was derived from it).
# `columns=` is used because passing the axis positionally (`drop(labels, 1)`)
# is deprecated in pandas 1.x and raises a TypeError in pandas 2.x.
car.drop(columns=['year'], inplace=True)

In [None]:
# Summary statistics of the dataset's columns
car.describe()

In [None]:
# Summarise each column: its name, number of distinct values and dtype
columns = pd.DataFrame({
    "column": car.columns,
    "unique values": [car[col].nunique(dropna=False) for col in car.columns],
    "types": car.dtypes.tolist(),
})
print(columns)

In [None]:
# Line plot of selling_price against years_driven
sns.lineplot(data=car, x="years_driven", y="selling_price")

As the no. of years driven increases, selling price decreases

In [None]:
# Line plot of selling_price against km_driven on a 15x8 figure
plt.figure(figsize=(15,8))
sns.lineplot(data=car, x="km_driven", y="selling_price")

Most of the cars have a selling price below 4 lakhs, and we can also see that the selling price decreases as km_driven increases.

In [None]:
# Count plots for fuel, seller_type, transmission and owner on a 2x2 grid.
# Each column is passed as `x=`: seaborn 0.12+ treats the first positional
# argument as `data`, so the original positional call no longer means "x".
plt.figure(figsize=(20,10))
for position, column in enumerate(['fuel', 'seller_type', 'transmission', 'owner'], start=1):
    plt.subplot(2, 2, position)
    sns.countplot(x=car[column])
plt.show()

inferences:

most of the customers use "diesel/petrol" fuel type cars

most of the cars are getting sold by individual seller type

most of the car's transmission is manual

first-owner cars are sold more often than second-, third-, and fourth-and-above-owner cars

In [None]:
# Count plot of the seats column.
# `x=` is required: seaborn 0.12+ treats the first positional argument as `data`.
plt.figure(figsize=(15,8))
sns.countplot(x=car.seats)

most of the customers buy 5 seater cars

In [None]:
# Bar plots of selling_price (the target) for each categorical variable, on a 2x2 grid.
# `x=`/`y=` keywords are required: seaborn 0.12+ accepts at most one positional
# argument (`data`), so `barplot(a, b)` raises a TypeError there.
plt.figure(figsize=(20,10))
for position, column in enumerate(['owner', 'fuel', 'seller_type', 'transmission'], start=1):
    plt.subplot(2, 2, position)
    sns.barplot(x=car[column], y=car.selling_price)
plt.show()

inferences :

testdrive cars have high selling price

selling price is high for diesel fuel type cars followed by petrol type

automatic transmission cars are priced higher than manual transmission cars

dealer type selling price is more compared to individual selling price

In [None]:
# Bar plot of selling_price per seats value.
# `x=`/`y=` keywords are required: seaborn 0.12+ accepts at most one positional
# argument (`data`), so `barplot(a, b)` raises a TypeError there.
plt.figure(figsize=(15,8))
sns.barplot(x=car.seats, y=car.selling_price)
plt.show()

7-seater cars have the highest selling price, followed by 2-seater cars.

In [None]:
# Scatter plots of engine, mileage and max_power against selling_price.
# `x=`/`y=` keywords are required: seaborn 0.12+ accepts at most one positional
# argument (`data`), so `scatterplot(a, b)` raises a TypeError there.
plt.figure(figsize=(20,10))
for position, column in enumerate(['engine', 'mileage', 'max_power'], start=1):
    plt.subplot(2, 2, position)
    sns.scatterplot(x=car[column], y=car.selling_price)
plt.show()

In [None]:
# Box plot for max_power.
# `x=` is required: seaborn 0.12+ treats the first positional argument as `data`.
plt.figure(figsize=(15,5))
sns.boxplot(x=car.max_power)
plt.show()

In [None]:
# Remove the rows whose max_power is greater than 260
car = car.loc[~car['max_power'].gt(260)]


In [None]:
# Box plot for engine.
# `x=` is required: seaborn 0.12+ treats the first positional argument as `data`.
plt.figure(figsize=(15,5))
sns.boxplot(x=car.engine)
plt.show()

In [None]:
# Remove the rows whose engine value is greater than 3000
car = car.loc[~car['engine'].gt(3000)]

In [None]:
# Box plot for mileage (the original comment said "engine").
# `x=` is required: seaborn 0.12+ treats the first positional argument as `data`.
plt.figure(figsize=(15,5))
sns.boxplot(x=car.mileage)
plt.show()

In [None]:
# Remove mileage outliers: rows with mileage above 32 or below 5.
# The original `~(a) | (b)` negated only the first comparison (`~` binds
# tighter than `|`), so rows with mileage < 5 were kept instead of removed.
car = car[~((car.mileage > 32) | (car.mileage < 5))]

In [None]:
# Pairwise plots of the columns via seaborn's pairplot
sns.pairplot(car)
plt.show()

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
# The frame still holds string columns (fuel, seller_type, transmission,
# owner) here; pandas 2.x no longer drops them silently in `corr()` and raises
# instead, so the numeric columns are selected explicitly.
plt.figure(figsize=(16,10))
sns.heatmap(car.select_dtypes(include='number').corr(), annot=True)
plt.show()

From the heatmap we can see that the selling price is almost independent of the number of seats:
the correlation between selling price and seats is very low, so we can drop the seats column.

In [None]:
# Drop the seats column.
# `columns=` is used because passing the axis positionally (`drop(labels, 1)`)
# is deprecated in pandas 1.x and raises a TypeError in pandas 2.x.
car.drop(columns=['seats'], inplace=True)

In [None]:
# Replace the category labels with underscore-separated versions.
# Values that are missing from a mapping become NaN (Series.map semantics).
seller_type_labels = {
    'Trustmark Dealer': 'Trustmark_Dealer',
    'Individual': 'Individual',
    'Dealer': 'Dealer',
}
owner_labels = {
    'First Owner': 'First_Owner',
    'Second Owner': 'Second_Owner',
    'Third Owner': 'Third_Owner',
    'Fourth & Above Owner': 'Fourth_and_Above_Owner',
    'Test Drive Car': 'Test_Drive_Car',
}
car['seller_type'] = car['seller_type'].map(seller_type_labels)
car['owner'] = car['owner'].map(owner_labels)

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each encoded column
car=pd.get_dummies(car,drop_first=True)

In [None]:
# Features (every column except the target) and the target itself.
# `columns=` is used because passing the axis positionally (`drop(labels, 1)`)
# is deprecated in pandas 1.x and raises a TypeError in pandas 2.x.
x = car.drop(columns=['selling_price'])
y = car['selling_price']

### Feature importance

In [None]:
# Fit a random forest on all of x/y to obtain feature importances.
# random_state is fixed so the importances are identical on every run; the
# original unseeded forest gave different values each time.
from sklearn.ensemble import RandomForestRegressor
model=RandomForestRegressor(random_state=0)
model.fit(x,y)

In [None]:
# Print the importance score of every feature column
model_1 = model.feature_importances_
columns = x.columns
for feature, importance in zip(columns, model_1):
    print("{} has importance value : {}".format(feature, importance))
    

In [None]:
# Horizontal bar chart of the five largest feature importances
feat_imp = pd.Series(model.feature_importances_, index=x.columns)
feat_imp.nlargest(5).plot.barh()
plt.show()

## Train test split

In [None]:
# Hold out 20% of the rows as a test set; random_state=0 fixes the shuffle so
# the split is the same on every run
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 0)


## Model training  

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
def regression(model):
    """Print the mean 5-fold cross-validated R^2 of a default-configured estimator.

    Parameters
    ----------
    model : callable
        Estimator class (or any zero-argument factory); it is called with no
        arguments to build the estimator that is scored.

    Notes
    -----
    Reads the notebook-level ``x_train`` and ``y_train``. Returns ``None``;
    the mean score is only printed.
    """
    estimator = model()
    fold_scores = cross_val_score(estimator, x_train, y_train, scoring='r2', cv=5)
    print(np.mean(fold_scores))

In [None]:
# Print the mean 5-fold cross-validated R^2 of a default RandomForestRegressor on the training split
from sklearn.ensemble import RandomForestRegressor
regression(RandomForestRegressor)

In [None]:
# Print the mean 5-fold cross-validated R^2 of a default ExtraTreesRegressor on the training split
from sklearn.ensemble import ExtraTreesRegressor
regression(ExtraTreesRegressor)

In [None]:
# Print the mean 5-fold cross-validated R^2 of a default XGBRegressor on the training split
from xgboost import XGBRegressor
regression(XGBRegressor)

Since the ExtraTreesRegressor model gives the highest R² value, we will run hyperparameter optimisation on it.

## Hyperparameter optimization

In [None]:
# Base estimator for the hyperparameter search; random_state is fixed for reproducibility
from sklearn.ensemble import ExtraTreesRegressor
regressor=ExtraTreesRegressor(random_state=49)


In [None]:
# Randomized hyperparameter search over the ExtraTreesRegressor, scored by
# 5-fold cross-validated R^2 on the training split.
from sklearn.model_selection import RandomizedSearchCV
n_estimators= [100, 300, 800, 1000]
max_depth = [10, 20, 30]
# 1.0 means "use all features", which is what 'auto' meant for regressors; the
# 'auto' string is rejected by scikit-learn >= 1.3.
max_features=[1.0, 'sqrt']
min_samples_split=[2,10,20,50,100]
min_samples_leaf=[1,2,5,10]
hyperparameter={'n_estimators':n_estimators,'max_depth':max_depth,'max_features':max_features,'min_samples_split':min_samples_split,'min_samples_leaf':min_samples_leaf}
# random_state fixes which parameter combinations are sampled
RS_cv=RandomizedSearchCV(estimator=regressor,param_distributions=hyperparameter,scoring='r2',cv=5,n_jobs=4,return_train_score=True,verbose = 4,random_state=49)
RS_cv.fit(x_train,y_train)

### Finding the Best parameters for our model  

In [None]:
# Best parameter combination found by the search and its mean cross-validated R^2
print(RS_cv.best_params_)
print(RS_cv.best_score_)

In [None]:
# Predict selling prices for the held-out test features using the fitted search object
prediction=RS_cv.predict(x_test)

In [None]:
# Distribution of the residuals (y_test - prediction).
# histplot(..., kde=True, stat="density") is the documented replacement for
# sns.distplot, which is deprecated and scheduled for removal.
sns.histplot(y_test-prediction, kde=True, stat="density")

The approximately normal distribution of the residuals above suggests that our model predicts accurately, with some errors.

In [None]:
# Scatter plot of the actual test targets (x-axis) against the predictions (y-axis)
plt.scatter(y_test,prediction)

There is very little scatter between the actual outcome and the predicted outcome.

In [None]:
# Serialize the fitted search object to a pickle file.
# The context manager closes the file even if pickling fails; the original
# never closed the handle, so the file was not guaranteed to be flushed.
import pickle
with open('price_regression_model.pkl', 'wb') as file:
    pickle.dump(RS_cv, file)