In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install xgboost

In [None]:
!pip install wordcloud

In [None]:
!pip install plotly

In [None]:
import numpy as np # Multi-dimensional array object
import pandas as pd # Data Manipulation
import seaborn as sns # Data Visualization
import matplotlib.pyplot as plt # Data Visualization
import plotly.express as px # Interactive Data Visualization 
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot # Offline version of the Plotly modules.


In [None]:
# Load the cars dataset from the Kaggle input directory into a DataFrame
car_df = pd.read_csv(filepath_or_buffer = '/kaggle/input/cars1/CARS.csv')

In [None]:
# Preview the first 10 rows of the dataframe
car_df.head(n = 10)

In [None]:
# Preview the last 10 rows of the dataframe
car_df.tail(n = 10)

In [None]:
# Display the column labels of the dataframe
car_df.columns

In [None]:
# Check the shape of the dataframe as (number of rows, number of columns)
car_df.shape

In [None]:
# Count the missing values in each column
car_df.isna().sum()

In [None]:
# Discard every row that has at least one missing value
car_df = car_df.dropna(axis = 0, how = "any")

In [None]:
# Obtain the summary of the dataframe (per-column dtype and non-null count)
car_df.info()

In [None]:
# Convert MSRP to integer by stripping the "$" sign and the "," thousands separators.
# regex=False makes "$" a literal character on every pandas version (as a regex it is
# the end-of-string anchor). astype(str) lets the cell be re-run after the column has
# already been converted to int, where the .str accessor would otherwise fail.
car_df["MSRP"] = (
    car_df["MSRP"]
    .astype(str)
    .str.replace("$", "", regex = False)
    .str.replace(",", "", regex = False)
    .astype(int)
)

In [None]:
# Display the MSRP column after the integer conversion
car_df["MSRP"]

In [None]:
# Convert Invoice to integer by stripping the "$" sign and the "," thousands separators.
# regex=False makes "$" a literal character on every pandas version (as a regex it is
# the end-of-string anchor). astype(str) lets the cell be re-run after the column has
# already been converted to int, where the .str accessor would otherwise fail.
car_df["Invoice"] = (
    car_df["Invoice"]
    .astype(str)
    .str.replace("$", "", regex = False)
    .str.replace(",", "", regex = False)
    .astype(int)
)

In [None]:
# Let's view the updated MSRP and Invoice columns (first rows of the dataframe)
car_df.head()

In [None]:
# Display the updated summary of the dataframe (MSRP and Invoice were cast to int above)
car_df.info()

In [None]:
# Descriptive statistics for the dataframe's columns
car_df.describe()

In [None]:
# Highest MSRP in the dataset
print(car_df["MSRP"].max())

In [None]:
# Lowest MSRP in the dataset
print(car_df["MSRP"].min())

In [None]:
# Pairwise grid: scatterplots for joint relationships, histograms for univariate distributions
sns.pairplot(car_df)

In [None]:
# Distinct manufacturers present in the Make column
car_df["Make"].unique()

In [None]:
# Histogram of the number of cars per manufacturer
fig = px.histogram(
    data_frame = car_df,
    x = "Make",
    title = "MAKE OF THE CAR",
    labels = {"Make": "Manufacturer"},
    color_discrete_sequence = ["maroon"],
)
fig.show()

In [None]:
# Distinct values present in the Type column
car_df["Type"].unique()

In [None]:
# Histogram of the number of cars per Type value
fig = px.histogram(
    data_frame = car_df,
    x = "Type",
    title = "TYPE OF THE CAR",
    labels = {"Type": "Type"},
    color_discrete_sequence = ["blue"],
)
fig.show()

In [None]:
# Distinct values present in the Origin column
car_df["Origin"].unique()

In [None]:
# Histogram of the number of cars per Origin value
fig = px.histogram(
    data_frame = car_df,
    x = "Origin",
    title = "LOCATION OF THE CAR SALES",
    labels = {"Origin": "Origin"},
    color_discrete_sequence = ["brown"],
)
fig.show()

In [None]:
# Distinct values present in the DriveTrain column
car_df["DriveTrain"].unique()

In [None]:
# Histogram of the number of cars per DriveTrain value
fig = px.histogram(
    data_frame = car_df,
    x = "DriveTrain",
    title = "DRIVETRAIN OF THE CAR",
    labels = {"DriveTrain": "Drivetrain"},
    color_discrete_sequence = ["BLACK"],
)
fig.show()

In [None]:
# Histogram of cars per manufacturer, with the bars colored by the Origin column
fig = px.histogram(
    data_frame = car_df,
    x = "Make",
    color = "Origin",
    title = "MAKE OF THE CAR Vs LOCATION",
    labels = {"Make": "Manufacturer"},
)
fig.show()

In [None]:
# Histogram of cars per manufacturer, with the bars colored by the Type column
fig = px.histogram(
    data_frame = car_df,
    x = "Make",
    color = "Type",
    title = "MAKE AND TYPE OF THE CAR",
    labels = {"Make": "Manufacturer"},
    opacity = 1,
)
fig.show()

In [None]:
# Let's view the model of all used cars using WordCloud generator
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Display the cleaned dataframe
car_df

In [None]:
# Pull the Model column out as an array; it is the text source for the word cloud
text = car_df["Model"].values

In [None]:
# Inspect the array of model names
text

In [None]:
# Copy wordcloud's built-in stop-word list into a fresh set
stopwords = {*STOPWORDS}

In [None]:
# Inspect the stop-word set that is passed to WordCloud
stopwords

In [None]:
# Build the word cloud from the model names joined into one space-separated string.
# str(text) would instead pass the NumPy array's printed repr to the tokenizer:
# brackets, quote characters around every name, and "..." elision once the array
# exceeds NumPy's print threshold.
wc = WordCloud(background_color = "black", max_words = 2000, max_font_size = 100, random_state = 3,
              stopwords = stopwords, contour_width = 3).generate(" ".join(map(str, text)))

In [None]:
# Render the word cloud as an image on a large canvas with the axes hidden
canvas_size = (25, 15)
fig = plt.figure(figsize = canvas_size)
plt.imshow(wc, interpolation = "bilinear")
plt.axis("off")
plt.show()

In [None]:
# Obtain the correlation matrix of the numeric columns.
# car_df still holds string columns (Make, Model, Type, Origin, DriveTrain), and
# pandas >= 2.0 raises when corr() meets non-numeric data, so select the numeric
# columns explicitly; this form works on older pandas versions as well.
car_df.select_dtypes(include = "number").corr()

In [None]:
# Annotated heatmap of the correlation matrix of the numeric columns.
# The string columns are excluded explicitly because pandas >= 2.0 raises when
# corr() meets non-numeric data.
sns.heatmap(car_df.select_dtypes(include = "number").corr(), annot= True)

In [None]:
# Preview the first rows of the dataframe before encoding
car_df.head()

In [None]:
# One-hot encode the categorical columns; every other column passes through unchanged
categorical_columns = ['Make','Model','Type','Origin','DriveTrain']
df_dum = pd.get_dummies(data = car_df, columns = categorical_columns)

In [None]:
# Display the one-hot encoded dataframe
df_dum

In [None]:
# Remove the Invoice column before modelling; the notebook treats it as not useful
# for predicting the car price
df_data = df_dum.drop(columns = ['Invoice'])

In [None]:
# Display the modelling dataframe (encoded columns, Invoice removed)
df_data

In [None]:
# (rows, columns) of the modelling dataframe
df_data.shape

In [None]:
# Target y is the MSRP column; the feature matrix X is every remaining column
y = df_data["MSRP"]
X = df_data.drop(columns = "MSRP")

In [None]:
# Convert the feature frame to a float NumPy array.
# The explicit dtype matters: on pandas versions where get_dummies emits bool columns,
# mixing them with the numeric columns would otherwise give an object-dtype array,
# which numeric libraries handle poorly or reject.
X = np.array(X, dtype = float)

In [None]:
# Convert the target (MSRP) to a NumPy array
y = np.array(y)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing.
# random_state is fixed so the split, and every metric computed from it below,
# is identical on each run of the notebook.
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
# (samples, features) of the training split
X_train.shape

In [None]:
# (samples, features) of the test split
X_test.shape

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score
from math import sqrt

In [None]:
# Ordinary least-squares linear regression fitted on the training split
LinearRegression_model = LinearRegression()
LinearRegression_model.fit(X = X_train, y = y_train)

In [None]:
# Score on the test split; for scikit-learn regressors score() is R^2, not a
# classification accuracy, despite the variable name
accuracy_LinearRegression = LinearRegression_model.score(X = X_test, y = y_test)
accuracy_LinearRegression

In [None]:
# Photo Credits:
# https://creazilla.com/nodes/22202-giraffe-clipart 
# https://pixy.org/4569488/ 
# https://pixabay.com/illustrations/monkey-animal-gorilla-zoo-nature-4187960/ 
# https://creazilla.com/nodes/15581-running-tiger-clipart 

In [None]:
from sklearn.tree import DecisionTreeRegressor


In [None]:
# Decision-tree regressor fitted on the training split.
# random_state is fixed so repeated runs build the same tree and report the same score.
DecisionTree_model = DecisionTreeRegressor(random_state = 42)
DecisionTree_model.fit(X_train,y_train)

In [None]:
# Score on the test split; for scikit-learn regressors score() is R^2
accuracy_DecisionTree = DecisionTree_model.score(X = X_test, y = y_test)
accuracy_DecisionTree

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Small random forest (5 trees, depth limited to 5) fitted on the training split.
# random_state is fixed so the bootstrap samples and feature draws, and therefore
# the reported score, are reproducible across runs.
RandomForest_model = RandomForestRegressor(n_estimators = 5, max_depth = 5, random_state = 42)
RandomForest_model.fit(X_train,y_train)

In [None]:
# Score on the test split; for scikit-learn regressors score() is R^2
accuracy_RandomForest = RandomForest_model.score(X = X_test, y = y_test)
accuracy_RandomForest

In [None]:
from xgboost import XGBRegressor

In [None]:
# Gradient-boosted tree regressor with default hyperparameters, fitted on the training split
model = XGBRegressor()
model.fit(X = X_train, y = y_train)

In [None]:
# Score of the XGBoost regressor on the test split
accuracy_XGBoost = model.score(X = X_test, y = y_test)
accuracy_XGBoost

In [None]:
# Predict on the test split and plot predicted (x) against actual (y) prices with a fitted line.
y_predict_linear = LinearRegression_model.predict(X_test)

# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
# Note that regplot returns a matplotlib Axes, which is what `fig` holds here.
fig = sns.regplot(x = y_predict_linear, y = y_test, color = 'red', marker = "^")
fig.set(title = "Linear Regression Model", xlabel = "Predicted Price of the used cars ($)", ylabel = "Actual Price of the used cars ($)")

In [None]:
# Error metrics for the linear-regression predictions on the test split
MSE = mean_squared_error(y_test, y_predict_linear)
MAE = mean_absolute_error(y_test, y_predict_linear)
r2 = r2_score(y_test, y_predict_linear)
# RMSE is the square root of the MSE, kept to three decimal places
RMSE = float(f"{np.sqrt(MSE):.3f}")

print('RMSE =',RMSE, '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2)

In [None]:
# Predict on the test split and plot predicted (x) against actual (y) prices with a fitted line.
y_predict_RandomForest = RandomForest_model.predict(X_test)

# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
# Note that regplot returns a matplotlib Axes, which is what `fig` holds here.
fig = sns.regplot(x = y_predict_RandomForest, y = y_test, color = 'blue', marker = "s")
fig.set(title = "Random Forest Regression Model", xlabel = "Predicted Price of the used cars ($)", ylabel= "Actual Price of the used cars ($)")

In [None]:
# Error metrics for the random-forest predictions on the test split
MSE = mean_squared_error(y_test, y_predict_RandomForest)
MAE = mean_absolute_error(y_test, y_predict_RandomForest)
r2 = r2_score(y_test, y_predict_RandomForest)
# RMSE is the square root of the MSE, kept to three decimal places
RMSE = float(f"{np.sqrt(MSE):.3f}")

print('RMSE =',RMSE, '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2)

In [None]:
# Predict on the test split and plot predicted (x) against actual (y) prices with a fitted line.
y_predict_XGBoost = model.predict(X_test)

# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
# Note that regplot returns a matplotlib Axes, which is what `fig` holds here.
fig = sns.regplot(x = y_predict_XGBoost, y = y_test, color = 'green', marker = "D")
fig.set(title = "XGBoost Model", xlabel = "Predicted Price of the used cars ($)", ylabel = "Actual Price of the used cars ($)")

In [None]:
# Error metrics for the XGBoost predictions on the test split
MSE = mean_squared_error(y_test, y_predict_XGBoost)
MAE = mean_absolute_error(y_test, y_predict_XGBoost)
r2 = r2_score(y_test, y_predict_XGBoost)
# RMSE is the square root of the MSE, kept to three decimal places
RMSE = float(f"{np.sqrt(MSE):.3f}")

print('RMSE =',RMSE, '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2)