# **Workflow**

1. Data Collection
2. Data Cleaning & Checking
3. Visualize Data & Checking
4. Splitting Data
5. Training Model Using XGBRegressor
6. Evaluate Model
7. Prediction

## **Import Libraries**

Import the modules used throughout the notebook.

In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics
from xgboost import XGBRegressor
import pandas.util.testing as tm

## **Load Data**

In [None]:
# Read the vehicle listing CSV into a DataFrame (the path is relative to the working directory)
carData = pd.read_csv('../input/vehicle-dataset-from-cardekho/Car details v3.csv')

## **Data Cleaning & Checking**

In [None]:
# Show the first 5 rows of the data
carData.head()

In [None]:
# Show the last 5 rows of the data
carData.tail()

In [None]:
# Show the descriptive statistics report for the data
carData.describe()

In [None]:
# Show general information about the data frame
carData.info()

In [None]:
# Count the missing values in each column
carData.isnull().sum()

In [None]:
# Check the shape (rows, columns) of the data
carData.shape

In [None]:
# Spread of the selling price (max - min) divided by the number of rows.
# The spread is stored as `priceRange` so the builtin `range` is not shadowed
# for every later cell of the notebook.
# NOTE(review): a class width is normally range / number of classes; dividing by the
# row count is kept from the original computation — confirm that it is intended.
priceRange = carData.selling_price.max() - carData.selling_price.min()
classWidth = priceRange / len(carData)
print(f'Class Width : {classWidth}')

In [None]:
# Looking and Checking the actual Mean value of data
def checkStatisticsMean(data):
    """Print the mean of the selling_price, seats, km_driven and year columns.

    Every mean is evaluated before the first line is printed, so nothing is
    written out if one of the columns cannot be read. Returns None.
    """
    price_avg, seats_avg, km_avg, year_avg = (
        getattr(data, column).mean()
        for column in ('selling_price', 'seats', 'km_driven', 'year')
    )

    print(f'Price Mean : {price_avg}')
    print(f'Seats Mean : {seats_avg}')
    print(f'Km Driven Mean : {km_avg}')
    print(f'Year Mean : {year_avg}')

# Print the column means of the loaded data
checkStatisticsMean(carData)

In [None]:
# Looking and Checking the actual Max value of data
def checkStatisticsMax(data):
    """Print the maximum of the selling_price, seats, km_driven and year columns.

    All maxima are collected first and printed afterwards. Returns None.
    """
    peaks = {
        column: getattr(data, column).max()
        for column in ('selling_price', 'seats', 'km_driven', 'year')
    }

    print(f'Price Max : {peaks["selling_price"]}')
    print(f'Seats Max : {peaks["seats"]}')
    print(f'Km Driven Max : {peaks["km_driven"]}')
    print(f'Year Max : {peaks["year"]}')

# Print the column maxima of the loaded data
checkStatisticsMax(carData)

In [None]:
# Looking and Checking the actual Min value of data
def checkStatisticsMin(data):
    """Print the minimum of the selling_price, seats, km_driven and year columns.

    All minima are computed before anything is printed. Returns None.
    """
    price_low, seats_low, km_low, year_low = [
        getattr(data, column).min()
        for column in ('selling_price', 'seats', 'km_driven', 'year')
    ]

    report = (
        f'Price Min : {price_low}',
        f'Seats Min : {seats_low}',
        f'Km Driven Min : {km_low}',
        f'Year Min : {year_low}',
    )
    for line in report:
        print(line)

# Print the column minima of the loaded data
checkStatisticsMin(carData)

In [None]:
# Looking and Checking the distribution categorical data
def checkDistributionCategorical(data):
    """Print the value counts of the fuel, transmission, seller_type and owner columns.

    Each section is a heading line followed by the counts and a blank line.
    All counts are computed before anything is printed. Returns None.
    """
    fuel_counts, gear_counts, seller_counts, owner_counts = (
        getattr(data, column).value_counts()
        for column in ('fuel', 'transmission', 'seller_type', 'owner')
    )

    sections = [
        f'Fuel Data :\n{fuel_counts}\n',
        f'Transmission Data :\n{gear_counts}\n',
        f'Seller Type :\n{seller_counts}\n',
        f'Owner :\n{owner_counts}\n',
    ]
    for section in sections:
        print(section)

# Print the category frequencies of the loaded data
checkDistributionCategorical(carData)

In [None]:
# Integer code for every category, keyed by the column it belongs to.
categoryCodes = {
    # Encode Fuel Data
    'fuel': {'Diesel': 0, 'Petrol': 1, 'CNG': 2, 'LPG': 3},
    # Encode Transmission Data
    'transmission': {'Manual': 0, 'Automatic': 1},
    # Encode Seller Type Data
    'seller_type': {'Individual': 0, 'Dealer': 1, 'Trustmark Dealer': 2},
    # Encode Owner Data
    'owner': {
        'First Owner': 0,
        'Second Owner': 1,
        'Third Owner': 2,
        'Fourth & Above Owner': 3,
        'Test Drive Car': 4,
    },
}

# Encode one column at a time instead of scanning the whole frame four times.
# Values without a code (including codes written by an earlier run of this cell)
# pass through unchanged, as they did with replace(). The result dtype is inferred
# from the mapped values rather than relying on DataFrame.replace silently
# downcasting object columns, which newer pandas releases deprecate.
for categoryColumn, codes in categoryCodes.items():
    carData[categoryColumn] = carData[categoryColumn].map(
        lambda value, codes=codes: codes.get(value, value))

In [None]:
# Check that the categorical columns were successfully encoded
carData.loc[:, ['fuel', 'transmission', 'seller_type', 'owner']]

In [None]:
# Handle missing values by dropping the affected rows (carData is modified in place)
carData.dropna(inplace=True)

In [None]:
# Check whether the data still has any null / NaN values
carData.isnull().sum()

In [None]:
# Inspect the object-typed columns
carData.loc[:, ['mileage', 'engine', 'max_power', 'torque']]

In [None]:
# Strip the unit suffixes and commas so the columns can be parsed as numbers.
# regex=False forces a literal match, so the result does not depend on the
# default of `regex`, which differs between pandas versions.

# Cleaning mileage data
carData['mileage'] = (
    carData['mileage']
    .str.replace('kmpl', '', regex=False)
    .str.replace('km/kg', '', regex=False)
    .str.replace(',', '', regex=False)
)

# Cleaning engine data
carData['engine'] = (
    carData['engine']
    .str.replace('CC', '', regex=False)
    .str.replace(',', '', regex=False)
)

# Cleaning max_power data
carData['max_power'] = (
    carData['max_power']
    .str.replace('bhp', '', regex=False)
    .str.replace(',', '', regex=False)
)

In [None]:
# Convert the cleaned mileage, engine and max_power text columns to numeric, one column at a time
for numericColumn in ('mileage', 'engine', 'max_power'):
    carData[numericColumn] = pd.to_numeric(carData[numericColumn])

In [None]:
# Show general information about the data frame after cleaning
carData.info()

In [None]:
# Build the correlation matrix to see the strength and direction of the linear relationships.
# Only numeric columns are passed in: the frame still holds text columns ('name', 'torque'),
# and corr() raises on those in pandas versions that no longer drop them silently.
correlation = carData.select_dtypes(include='number').corr()

In [None]:
# Display the correlation matrix
correlation

## **Visualize Data & Checking**

In [None]:
# Set the seaborn theme used by the plots below
sns.set_theme(color_codes=True, style='darkgrid', palette='deep', font='sans-serif')

In [None]:
# Draw an annotated heatmap of the correlation matrix (values shown with one decimal)
plt.figure(figsize=(10, 10))
sns.heatmap(correlation, cbar=True, square=True, fmt='.1f', annot=True, annot_kws={'size': 8}, cmap='Blues')

In [None]:
# Pairwise relationships between the variables of the data set.
# The plot is fed the observations themselves; passing the correlation matrix would
# scatter correlation coefficients against each other instead of the actual data.
sns.pairplot(carData)

In [None]:
# Joint plot of year vs. selling price, drawn with kind="reg"
sns.jointplot(x="year", y="selling_price", data=carData, kind="reg")

In [None]:
# Year vs. selling price, one panel per owner code, wrapped at two panels per row
sns.lmplot(x="year", y="selling_price", col="owner", data=carData, col_wrap=2, height=3)

In [None]:
# Year vs. selling price with the lowess option enabled
sns.lmplot(x="year", y="selling_price", data=carData, lowess=True)

In [None]:
# Year vs. selling price, coloured by seller type code
sns.lmplot(x="year", y="selling_price", hue="seller_type", data=carData)

In [None]:
# Year vs. selling price, one panel per seller type code
sns.lmplot(x="year", y="selling_price", col="seller_type", data=carData, aspect=.5)

In [None]:
# Check the first rows of the data before splitting
carData.head()

In [None]:
# Check the data type of each column
carData.dtypes

## **Splitting Data**

In [None]:
# Features: every column except the car name, the target itself and the torque column
X = carData.drop(['name', 'selling_price', 'torque'], axis=1)
# Target: the selling price
y = carData['selling_price']

In [None]:
# Hold out 10% of the rows for testing.
# Shuffling is left at its default (enabled): with shuffle=False the seed below was
# ignored and the test set was simply the last rows of the file. With shuffling on,
# random_state makes the random split reproducible.
trainX, testX, trainY, testY = train_test_split(
    X, y,
    test_size=0.1,
    random_state=1)

## **Training Model Using XGBRegressor**

In [None]:
# Gradient-boosted tree regressor trained with the squared-error objective.
# scale_pos_weight was dropped from the arguments: it is a class-balancing option
# meant for classification, and a value of 0 has no meaningful role when regressing prices.
# NOTE(review): confirm that no earlier run relied on that setting.
regressor = XGBRegressor(
    gamma=0,
    learning_rate=0.1,
    max_depth=5,
    n_estimators=1000,
    n_jobs=16,
    objective='reg:squarederror',
    subsample=0.8,
    reg_alpha=0,
    reg_lambda=1
)

# Fit on the training split; the result of fit() is kept as `model` for the cells below
model = regressor.fit(trainX, trainY)

## **Evaluate Model**

In [None]:
# Predict on the training features
trainPredict = model.predict(trainX)

# Predict on the test features
testPredict = model.predict(testX)

In [None]:
# Evaluation metrics on the training split

# R Squared
trainRsquared = metrics.r2_score(trainY, trainPredict)
print(f'R-Squared : {trainRsquared}')

# Mean Absolute Error
trainMAE = metrics.mean_absolute_error(trainY, trainPredict)
print(f'MAE : {trainMAE}')

# Mean Squared Error
trainMSE = metrics.mean_squared_error(trainY, trainPredict)
print(f'MSE : {trainMSE}')

# Root Mean Squared Error: the square root of the MSE computed above (no second pass over the data)
trainRMSE = math.sqrt(trainMSE)
print(f'RMSE : {trainRMSE}')

# Median Absolute Error (labelled as such: it is the median of the absolute errors, not a median price)
trainM = metrics.median_absolute_error(trainY, trainPredict)
print(f'Median Absolute Error : {trainM}')

In [None]:
# Evaluation metrics on the test split

# R Squared
testRsquared = metrics.r2_score(testY, testPredict)
print(f'R-Squared : {testRsquared}')

# Mean Absolute Error
testMAE = metrics.mean_absolute_error(testY, testPredict)
print(f'MAE : {testMAE}')

# Mean Squared Error
testMSE = metrics.mean_squared_error(testY, testPredict)
print(f'MSE : {testMSE}')

# Root Mean Squared Error: the square root of the MSE computed above (no second pass over the data)
testRMSE = math.sqrt(testMSE)
print(f'RMSE : {testRMSE}')

# Median Absolute Error (labelled as such: it is the median of the absolute errors, not a median price)
testM = metrics.median_absolute_error(testY, testPredict)
print(f'Median Absolute Error : {testM}')

In [None]:
# Train Predicted Value & Actual Value (first 50 samples)
test = pd.DataFrame({'Predicted value': trainPredict, 'Actual value': trainY})
fig = plt.figure(figsize=(16, 8))
# Discard the original row labels so the x axis is simply the sample position
test = test.reset_index(drop=True)
plt.plot(test[:50])
# One line is drawn per column, in column order, so the legend is taken from the columns.
# The previous hard-coded list was in the opposite order and swapped the two labels.
plt.legend(list(test.columns))

In [None]:
# Test Predicted Value & Actual Value (first 50 samples)
test = pd.DataFrame({'Predicted value': testPredict, 'Actual value': testY})
fig = plt.figure(figsize=(16, 8))
# Discard the original row labels so the x axis is simply the sample position
test = test.reset_index(drop=True)
plt.plot(test[:50])
# One line is drawn per column, in column order, so the legend is taken from the columns.
# The previous hard-coded list was in the opposite order and swapped the two labels.
plt.legend(list(test.columns))

## **Prediction**

In [None]:
# Put the actual and predicted training prices side by side
# NOTE(review): the 'Train Predicted Price ' column name ends with a space — confirm that it is intended
trainOutput = pd.DataFrame({
    'Train Actual Price': trainY,
    'Train Predicted Price ': trainPredict})

# Save the table as CSV without the row index
trainOutput.to_csv('Train Prediction.csv', index=False)

In [None]:
# Put the actual and predicted test prices side by side.
# The columns are labelled "Test ..." because they hold the test split; they were
# previously labelled "Train ...", which mislabelled the contents of the test CSV.
# NOTE(review): the trailing space in the predicted-price column name is kept from
# the original naming — confirm whether it is intended.
testOutput = pd.DataFrame({
    'Test Actual Price': testY,
    'Test Predicted Price ': testPredict})

# Save the table as CSV without the row index
testOutput.to_csv('Test Prediction.csv', index=False)

In [None]:
# Read the saved training predictions back from disk and display them
trainDataPredict = pd.read_csv('./Train Prediction.csv')
trainDataPredict

In [None]:
# Read the saved test predictions back from disk and display them
testDataPredict = pd.read_csv('./Test Prediction.csv')
testDataPredict