# Used car price prediction Using XGBoost 

---

## 1. PROBLEM STATEMENT AND BUSINESS CASE

This project trains three machine learning algorithms (Multiple Linear Regression, Random Forest Regression and XGBoost) to predict the price of used cars.

INPUTS: Make, Model, Type, Origin, DriveTrain, Invoice, EngineSize, Cylinders, Horsepower, MPG_City, MPG_Highway, Weight, Wheelbase, Length (MSRP is the prediction target, and Invoice is dropped before training)

OUTPUT: MSRP (Price)

This project can be used by car dealerships to predict used car prices and to understand the key factors that contribute to those prices.

---

## 2. IMPORT LIBRARIES/DATASETS AND PERFORM EXPLORATORY DATA ANALYSIS

In [None]:
!pip install xgboost

In [None]:
!pip install wordcloud

In [None]:
!pip install plotly

In [None]:
import numpy as np # Multi-dimensional array object
import pandas as pd # Data Manipulation
import seaborn as sns # Data Visualization
import matplotlib.pyplot as plt # Data Visualization
import plotly.express as px # Interactive Data Visualization
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot # Offline version of the Plotly modules.

In [None]:
# Load the raw cars dataset from the Kaggle input directory into a DataFrame
car_df = pd.read_csv(filepath_or_buffer='/kaggle/input/cars1/CARS.csv')

In [None]:
# Preview the first five rows of the dataset
car_df.head(n=5)

In [None]:
# Display the feature columns (the column labels of car_df)
car_df.columns

In [None]:
# Check the shape of the dataframe, i.e. its (rows, columns) dimensions
car_df.shape

In [None]:
# Count the missing values in each column
car_df.isna().sum()

In [None]:
# Discard every row that contains at least one missing value
car_df = car_df.dropna(axis=0, how="any")

In [None]:
# Obtain the summary of the dataframe (prints column dtypes and non-null counts)
car_df.info()

In [None]:
# Convert MSRP from text to integer by stripping the "$" sign and the "," separators.
# regex=False makes the replacement literal: "$" is a regex end-of-string anchor
# and the default of `regex` differs between pandas versions.
# astype(str) first lets this cell be re-run after the column is already numeric.
car_df["MSRP"] = (
    car_df["MSRP"]
    .astype(str)
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(int)
)

In [None]:
# Convert Invoice from text to integer by stripping the "$" sign and the "," separators.
# regex=False makes the replacement literal: "$" is a regex end-of-string anchor
# and the default of `regex` differs between pandas versions.
# astype(str) first lets this cell be re-run after the column is already numeric.
car_df["Invoice"] = (
    car_df["Invoice"]
    .astype(str)
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(int)
)

In [None]:
# Preview the first five rows to confirm the MSRP and Invoice columns were updated
car_df.head(n=5)

---

## 3. DATA VISUALIZATION

In [None]:
# Grid of pairwise scatterplots, with each column's own distribution on the diagonal
sns.pairplot(car_df)

In [None]:
# List the distinct car manufacturers present in the data
car_df["Make"].unique()


In [None]:
# Histogram of the number of cars per manufacturer
fig = px.histogram(
    data_frame=car_df,
    x="Make",
    title="MAKE OF THE CAR",
    labels={"Make": "Manufacturer"},
    color_discrete_sequence=["maroon"],
)

# Render with the explicit "notebook" renderer
fig.show(renderer="notebook")

In [None]:
# List the distinct car body types present in the data
car_df["Type"].unique()

In [None]:
# Histogram of the number of cars per body type
fig = px.histogram(
    data_frame=car_df,
    x="Type",
    title="TYPE OF THE CAR",
    labels={"Type": "Type"},
    color_discrete_sequence=["blue"],
)

fig.show()

In [None]:
# List the distinct origins (regions) present in the data
car_df["Origin"].unique()

In [None]:
# Histogram of the number of cars per origin
fig = px.histogram(
    data_frame=car_df,
    x="Origin",
    title="LOCATION OF THE CAR SALES",
    labels={"Origin": "Origin"},
    color_discrete_sequence=["brown"],
)

fig.show()

In [None]:
# List the distinct drivetrain configurations present in the data
car_df["DriveTrain"].unique()

In [None]:
# Histogram of the number of cars per drivetrain configuration
fig = px.histogram(
    data_frame=car_df,
    x="DriveTrain",
    title="DRIVETRAIN OF THE CAR",
    labels={"DriveTrain": "Drivetrain"},
    color_discrete_sequence=["BLACK"],
)

fig.show()

In [None]:
# Histogram of cars per manufacturer, with bars colored by the car's origin
fig = px.histogram(
    data_frame=car_df,
    x="Make",
    title="MAKE OF THE CAR Vs LOCATION",
    labels={"Make": "Manufacturer"},
    color="Origin",
)

fig.show()

In [None]:
# view the model of all used cars using WordCloud generator
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Display the current contents of car_df
car_df

In [None]:
# Collect the values of the Model column as an array; this is the word cloud input
text = car_df["Model"].values

In [None]:
# Take a private set copy of wordcloud's built-in stop words
stopwords = {*STOPWORDS}

In [None]:
# Build the word cloud from the car model names.
# The names are joined into one space-separated string; calling str() on the
# NumPy array would instead feed its printed form (brackets, quotes, and "..."
# elision for long arrays) into the word cloud tokenizer.
wc = WordCloud(
    background_color="black",
    max_words=2000,
    max_font_size=100,
    random_state=3,
    stopwords=stopwords,
    contour_width=3,
).generate(" ".join(map(str, text)))

In [None]:
# Draw the word cloud image on a 25x15 inch canvas with the axes hidden
fig = plt.figure(figsize=(25, 15))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
# Only numeric columns are passed to corr(): car_df still holds the text
# columns (Make, Model, Type, Origin, DriveTrain), and recent pandas versions
# raise an error when asked to correlate non-numeric data.
sns.heatmap(car_df.select_dtypes(include="number").corr(), annot=True)

#### Positive correlation between engine size and number of cylinders
#### Positive correlation between horsepower and number of cylinders
#### The highest positive correlation with MSRP is horsepower

---

## 4. PREPARE THE DATA BEFORE MODEL TRAINING

In [None]:
# Preview the first five rows before encoding
car_df.head(n=5)

In [None]:
# One-hot encode the categorical columns; every other column passes through unchanged
df_dum = pd.get_dummies(
    data=car_df,
    columns=['Make', 'Model', 'Type', 'Origin', 'DriveTrain'],
)

In [None]:
# Preview the first five rows of the one-hot encoded frame
df_dum.head(n=5)

In [None]:
# Remove the Invoice column so that it is not used as a model input
df_data = df_dum.drop(columns=['Invoice'])

In [None]:
# Preview the first five rows of the modelling frame
df_data.head(n=5)

In [None]:
# Separate the target (MSRP) from the predictor columns
y = df_data["MSRP"]
X = df_data.drop(columns="MSRP")

In [None]:
# Convert the feature table to a float NumPy matrix.
# The explicit dtype keeps the matrix numeric even when the one-hot columns are
# boolean (as newer pandas versions produce); without it a mix of bool and
# numeric columns is converted to an object-dtype array.
X = np.array(X, dtype=float)

In [None]:
# Convert the target values (MSRP) to a NumPy array
y = np.array(y)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing.
# A fixed random_state makes the split, and therefore every score reported
# afterwards, reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Dimensions of the training feature matrix
X_train.shape

In [None]:
# Dimensions of the test feature matrix
X_test.shape

---

## 5. TRAIN AND EVALUATE A MULTIPLE LINEAR REGRESSION

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score
from math import sqrt

In [None]:
# Create the linear regression estimator and fit it on the training split
LinearRegression_model = LinearRegression()
LinearRegression_model.fit(X=X_train, y=y_train)

In [None]:
# For a regressor, score() is the coefficient of determination (R^2) on the test split
accuracy_LinearRegression = LinearRegression_model.score(X=X_test, y=y_test)
accuracy_LinearRegression

---

## 6. TRAIN AND EVALUATE DECISION TREE AND RANDOM FOREST MODELS

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Create a decision tree regressor with default settings and fit it on the training split
DecisionTree_model = DecisionTreeRegressor()
DecisionTree_model.fit(X=X_train, y=y_train)

In [None]:
# For a regressor, score() is the coefficient of determination (R^2) on the test split
accuracy_DecisionTree = DecisionTree_model.score(X=X_test, y=y_test)
accuracy_DecisionTree

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Create a small random forest (5 trees, each at most 5 levels deep) and fit it
RandomForest_model = RandomForestRegressor(max_depth=5, n_estimators=5)
RandomForest_model.fit(X=X_train, y=y_train)

In [None]:
# For a regressor, score() is the coefficient of determination (R^2) on the test split
accuracy_RandomForest = RandomForest_model.score(X=X_test, y=y_test)
accuracy_RandomForest

---

## 7. TRAIN AN XG-BOOST REGRESSOR MODEL

In [None]:
from xgboost import XGBRegressor

In [None]:
# Create an XGBoost regressor with default hyperparameters and fit it on the training split
model = XGBRegressor()
model.fit(X=X_train, y=y_train)

In [None]:
# For a regressor, score() is the coefficient of determination (R^2) on the test split
accuracy_XGBoost = model.score(X=X_test, y=y_test)
accuracy_XGBoost

---

## 8. COMPARE MODELS AND CALCULATE REGRESSION KPIs

In [None]:
# Suppress every FutureWarning from this point on
import warnings
warnings.simplefilter("ignore", FutureWarning)

In [None]:
# Predict prices for the test split with the linear regression model and plot
# predicted vs. actual prices together with a fitted regression line.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them as
# positional arguments.
y_predict_linear = LinearRegression_model.predict(X_test)
fig = sns.regplot(x = y_predict_linear, y = y_test, color = 'red', marker = "^")
fig.set(title = "Linear Regression Model", xlabel = "Predicted Price of the used cars ($)", ylabel = "Actual Price of the used cars ($)")

In [None]:
# Error metrics for the linear regression predictions on the test split
MSE = mean_squared_error(y_test, y_predict_linear)
MAE = mean_absolute_error(y_test, y_predict_linear)
r2 = r2_score(y_test, y_predict_linear)
# RMSE is the square root of the MSE, kept to three decimal places
RMSE = float(format(np.sqrt(MSE), ".3f"))

print('RMSE =',RMSE, '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2) 

---

In [None]:
# Predict prices for the test split with the random forest model and plot
# predicted vs. actual prices together with a fitted regression line.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them as
# positional arguments.
y_predict_RandomForest = RandomForest_model.predict(X_test)

fig = sns.regplot(x = y_predict_RandomForest, y = y_test, color = 'blue', marker = "s")
fig.set(title = "Random Forest Regression Model", xlabel = "Predicted Price of the used cars ($)", ylabel= "Actual Price of the used cars ($)")

In [None]:
# Error metrics for the random forest predictions on the test split
MSE = mean_squared_error(y_test, y_predict_RandomForest)
MAE = mean_absolute_error(y_test, y_predict_RandomForest)
r2 = r2_score(y_test, y_predict_RandomForest)
# RMSE is the square root of the MSE, kept to three decimal places
RMSE = float(format(np.sqrt(MSE), ".3f"))

print('RMSE =',RMSE, '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2) 

---

In [None]:
# Predict prices for the test split with the XGBoost model and plot
# predicted vs. actual prices together with a fitted regression line.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them as
# positional arguments.
y_predict_XGBoost = model.predict(X_test)

fig = sns.regplot(x = y_predict_XGBoost, y = y_test, color = 'green', marker = "D")
fig.set(title = "XGBoost Model", xlabel = "Predicted Price of the used cars ($)", ylabel = "Actual Price of the used cars ($)")

In [None]:
# Error metrics for the XGBoost predictions on the test split
MSE = mean_squared_error(y_test, y_predict_XGBoost)
MAE = mean_absolute_error(y_test, y_predict_XGBoost)
r2 = r2_score(y_test, y_predict_XGBoost)
# RMSE is the square root of the MSE, kept to three decimal places
RMSE = float(format(np.sqrt(MSE), ".3f"))

print('RMSE =',RMSE, '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2) 

---

## From the above results, the XGBoost model reaches an R² score of about 0.91 on the test set, outperforming the Linear Regression and Random Forest Regression models