In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the three brand-specific CSV exports into separate frames.
audi_df = pd.read_csv("../input/used-car-dataset-ford-and-mercedes/audi.csv")
bmw_df = pd.read_csv("../input/used-car-dataset-ford-and-mercedes/bmw.csv")
toyota_df = pd.read_csv("../input/used-car-dataset-ford-and-mercedes/toyota.csv")

# Preview the first five rows of every frame; a dashed line separates the
# previews. A single print with a newline separator emits the same text as
# printing each item on its own.
separator = "----------------------------------------"
print(
    audi_df.head(5),
    separator,
    bmw_df.head(5),
    separator,
    toyota_df.head(5),
    sep="\n",
)

In [None]:
# Stack the three brand frames into a single dataset.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each file's own 0-based row labels are kept, so one label would point at up
# to three different cars and any label-based lookup or alignment on `df`
# becomes ambiguous.
df = pd.concat([audi_df, bmw_df, toyota_df], axis=0, ignore_index=True)

# Summary statistics of the numeric columns (rendered as the cell output).
df.describe()

In [None]:
# Print a concise summary of the combined frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Spearman rank correlation between the numeric columns.
# The frame also carries text columns (the ones one-hot encoded further down:
# "model", "transmission", "fuelType"). pandas 2.0 and later no longer drops
# such columns silently in DataFrame.corr and raises instead, so the input is
# restricted to numeric dtypes explicitly. select_dtypes works on old and new
# pandas alike.
corr = df.select_dtypes(include="number").corr(method="spearman")

# Create the figure at its final size up front and draw the heatmap into it,
# rather than resizing the figure after it has been drawn.
fig, ax = plt.subplots(figsize=(14, 10))
# `linewidths` is the documented seaborn parameter for the cell divider width.
sns.heatmap(
    corr,
    square=True,
    linewidths=1,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    ax=ax,
)
plt.show()

In [None]:
# Frequency of each category in the two low-cardinality categorical columns.
print(df["transmission"].value_counts())
print(df["fuelType"].value_counts())

# The ten most frequent models. value_counts() sorts by descending frequency,
# so the first ten entries are the ten most common models.
model_counts = df["model"].value_counts().head(10)

# Build the two-column summary explicitly instead of relying on
# value_counts().reset_index(): the column names produced by that call changed
# in pandas 2.0 (there is no "index" column any more), which made the lookup
# of the "index" column fail with a KeyError. The layout below matches the
# one older pandas produced: "index" holds the model name, "model" the count.
top_10_models = pd.DataFrame(
    {"index": model_counts.index, "model": model_counts.to_numpy()}
)

# Plain list of the top-10 model names, used to filter the dataset later on.
car_models = top_10_models["index"].tolist()

print(car_models)

In [None]:
# Keep only the rows whose model is one of the ten most frequent models.
is_top_model = df["model"].isin(car_models)
model_df = df[is_top_model]

In [None]:
# Look at how the price distribution varies across each of the three
# categorical columns: one violin plot per column, side by side.
fig, (ax_transmission, ax_fuel, ax_model) = plt.subplots(1, 3, figsize=(30, 10))

# Left panel: price by transmission type.
sns.violinplot(
    x="transmission", y="price", data=df,
    scale="area", inner="quartile", ax=ax_transmission,
)
ax_transmission.set_title("Price vs Transmission")

# Middle panel: price by fuel type.
sns.violinplot(
    x="fuelType", y="price", data=df,
    scale="area", inner="quartile", ax=ax_fuel,
)
ax_fuel.set_title("Price vs Fuel")

# Right panel: price by model, limited to the ten most frequent models.
sns.violinplot(x="model", y="price", data=model_df, ax=ax_model)
ax_model.set_title("Price vs Top 10 Sold Model")

plt.show()

# 

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Categorical columns to expand into one indicator column per category.
cat_cols = ["model", "transmission", "fuelType"]

# Ask the encoder for a dense array. scikit-learn renamed the `sparse`
# argument to `sparse_output` and later removed the old name, so try the new
# spelling first and fall back to the old one on releases that predate it
# (an unknown keyword raises TypeError in the constructor).
try:
    OHE = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    OHE = OneHotEncoder(handle_unknown="ignore", sparse=False)

encoded = OHE.fit_transform(df[cat_cols])

# Give the indicator columns descriptive string names. A bare DataFrame would
# label them 0, 1, 2, ...; mixing those integer names with the string names of
# the numeric columns is rejected by recent scikit-learn estimators at fit
# time, and makes the combined table hard to read.
if hasattr(OHE, "get_feature_names_out"):
    ohe_columns = OHE.get_feature_names_out(cat_cols)
else:  # older scikit-learn releases only provide get_feature_names
    ohe_columns = OHE.get_feature_names(cat_cols)

# Reuse df's index so the encoded rows line up with the numeric rows.
df_OHE = pd.DataFrame(encoded, columns=ohe_columns, index=df.index)

# Numeric part of the data (categorical columns removed) plus the indicators.
df_num = df.drop(cat_cols, axis=1)
df_concat = pd.concat([df_num, df_OHE], axis=1)

In [None]:
# Preview the encoded feature table; head() returns the first five rows by default.
df_concat.head()

In [None]:
# Target: the price column. Features: every other column of the encoded table.
drop_cols = ["price"]
y = df["price"]
X = df_concat.drop(columns=drop_cols)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. random_state pins the shuffle so
# the split, and every metric computed on it, is the same on each run of the
# notebook.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [None]:
from sklearn.metrics import mean_absolute_error as mae
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# One estimator per model family. The stochastic ones are seeded so repeated
# runs of this cell on the same split report the same numbers.
lr_model = LinearRegression()
dt_model = DecisionTreeRegressor(max_leaf_nodes=900, random_state=1)
rf_model = RandomForestRegressor(random_state=1)
xgb_model = xgb.XGBRegressor(random_state=1)

# Fit every model on the training split and report its mean absolute error on
# the validation split. A single loop replaces four copies of the same
# fit / predict / report sequence.
for name, estimator in [
    ("Linear Regression", lr_model),
    ("Decision Tree", dt_model),
    ("Random Forest", rf_model),
    ("XGBoost", xgb_model),
]:
    estimator.fit(X_train, y_train)
    pred = estimator.predict(X_val)
    # The metric is called as (y_true, y_pred), the documented argument order.
    error = mae(y_val, pred)
    print('MAE of {} is: {:,.0f}'.format(name, error))

In [None]:
# score() on a regressor returns the coefficient of determination (R^2), not
# a classification accuracy, so the printed label says R^2. The value is
# scaled by 100 and shown with two decimals.
for name, estimator in [
    ("Linear Regression", lr_model),
    ("Decision Tree", dt_model),
    ("Random Forest", rf_model),
    ("XGBoost", xgb_model),
]:
    print(f"{name} Model R^2 score (%): {(estimator.score(X_val, y_val) * 100):.2f}")

In [None]:
# Second experiment: additionally leave out the "tax" and "mpg" features.
drop_cols = ["price", "tax", "mpg"]
X = df_concat.drop(drop_cols, axis=1)
y = df['price']

# Hold out 20% of the rows for validation. random_state pins the shuffle so
# the split, and every metric computed on it, is the same on each run of the
# notebook.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [None]:
# Re-create the four estimators for the reduced feature set. The stochastic
# ones are seeded so repeated runs of this cell on the same split report the
# same numbers.
# NOTE(review): the decision tree is capped at 100 leaves here, whereas the
# run on the full feature set used 900. Confirm this difference is intended,
# since it changes the model as well as the features.
lr_model = LinearRegression()
dt_model = DecisionTreeRegressor(max_leaf_nodes=100, random_state=1)
rf_model = RandomForestRegressor(random_state=1)
xgb_model = xgb.XGBRegressor(random_state=1)

# Fit every model on the training split and report its mean absolute error on
# the validation split. A single loop replaces four copies of the same
# fit / predict / report sequence.
for name, estimator in [
    ("Linear Regression", lr_model),
    ("Decision Tree", dt_model),
    ("Random Forest", rf_model),
    ("XGBoost", xgb_model),
]:
    estimator.fit(X_train, y_train)
    pred = estimator.predict(X_val)
    # The metric is called as (y_true, y_pred), the documented argument order.
    error = mae(y_val, pred)
    print('MAE of {} is: {:,.0f}'.format(name, error))

In [None]:
# score() on a regressor returns the coefficient of determination (R^2), not
# a classification accuracy, so the printed label says R^2. The value is
# scaled by 100 and shown with two decimals.
for name, estimator in [
    ("Linear Regression", lr_model),
    ("Decision Tree", dt_model),
    ("Random Forest", rf_model),
    ("XGBoost", xgb_model),
]:
    print(f"{name} Model R^2 score (%): {(estimator.score(X_val, y_val) * 100):.2f}")