In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the dataset. The "hp" feature is dropped right away because it is
# constant (85 for every row) and therefore carries no information.
data = pd.read_csv("../input/car-price-prediction/car_price.csv").drop(columns=["hp"])

# Regression target as a NumPy array; the "price" column itself stays in `data`.
target = data["price"].to_numpy()

# Data preprocessing
The dataset is almost exclusively categorical (the only exception is the ```km``` attribute). Since the number of unique values of each categorical feature is not too high, we can One-Hot encode each of them. ```Extras``` contains, for each row, a string listing the extra accessories. Since they are separated by commas, they can easily be split into indicator columns with ```Series.str.get_dummies(sep=",")```. 

In [None]:
# Display the frame as the cell output (pandas truncates long frames when rendering).
data

In [None]:
# Per-category price statistics, collected on the raw labels *before* they are
# one-hot encoded (afterwards the original categorical columns no longer exist).
features = ["Body Color", "Gearing Type", "body_type", "make_model"]


def _price_stats(frame, feature):
    """Summarise "price" for every distinct value of `feature`.

    Returns a dict with three aligned sequences:
      "uniques" - distinct values of `frame[feature]`, in order of appearance
      "mean"    - mean price of the rows holding each value
      "std"     - standard deviation of the price of those rows
    """
    uniques = frame[feature].unique()
    # Select each category's prices once and reuse them for both statistics.
    prices = [frame.loc[frame[feature] == val, "price"] for val in uniques]
    return {"uniques": uniques,
            "mean"   : [group.mean() for group in prices],
            "std"    : [group.std() for group in prices]}


features_info = {feature: _price_stats(data, feature) for feature in features}

# Data preprocessing: one-hot encode the categorical columns.
categorical = ["make_model", "body_type", "Body Color", "Gearing Type"]
data = pd.get_dummies(data, columns=categorical)
columns = data.columns.values

# Rename every dummy column to its bare category value. Only the exact
# "<feature>_" prefix added by get_dummies is stripped, so columns that were not
# encoded and category values that themselves contain "_" are left intact
# (splitting on "_" and keeping the last piece would truncate both).
dummy_prefixes = tuple(f"{name}_" for name in categorical)
renames = {}
for column in columns:
    for prefix in dummy_prefixes:
        if column.startswith(prefix):
            renames[column] = column[len(prefix):]
            break
data = data.rename(columns=renames)

# "Extras" is a comma-separated list of accessories: expand it into one
# indicator column per accessory and swap it for the original string column.
extras = data["Extras"].str.get_dummies(",")
data = pd.concat([data.drop(columns=["Extras"]), extras], axis=1)

This is what we get after the preprocessing steps. Note that some columns have been renamed, just for convenience.

In [None]:
# Display the frame again to inspect its current columns.
data

# Some EDA
The ```km``` column suggests that the dataset contains used cars. The ```price```-```km``` scatter plot shows a negative correlation between the two attributes. We can also plot the average price of a car for each body type, color, gearing type and model. Looks like orange cars are cheaper! 

In [None]:
from math import floor

# Kilometers - Price
fig, km_to_price = plt.subplots(figsize=(15, 5))
kms = data["km"].values
km_to_price.scatter(kms, target, s=3)
km_to_price.set_xlabel("Kilometers")
# Price is plotted on the vertical axis, so it needs the y label; calling
# set_xlabel a second time would overwrite "Kilometers" and leave y unlabelled.
km_to_price.set_ylabel("Price")
km_to_price.set_title("Kilometers -> Price (Used cars?)")
plt.show()

# Features - Price: one horizontal bar chart per categorical feature, showing
# the mean price of each category with its standard deviation as the error bar.
n_cols = max(1, (len(features_info) + 1) // 2)
# squeeze=False always returns a 2-D array of axes, whatever the grid shape.
fig, axes = plt.subplots(2, n_cols, figsize=(15, 15), squeeze=False)
panels = axes.ravel()  # row-major order: left to right, then top to bottom
for ax, (feature, stats) in zip(panels, features_info.items()):
    ax.set_title(feature)
    ax.set_xlabel("Price")
    ax.barh(y=stats["uniques"], width=stats["mean"], xerr=stats["std"])
# Hide any grid cell left without a feature (odd number of features).
for ax in panels[len(features_info):]:
    ax.set_visible(False)
plt.show()
    

# Building a model
Trying to predict the car price.

In [None]:
# Summary statistics of the regression target.
target_mean, target_std = target.mean(), target.std()
print(f"Target mean: {target_mean} target std: {target_std}")

The target standard deviation gives us the error (RMSE) of the trivial predictor that always outputs the target mean. 

In [None]:
# Train / Test splitting
from sklearn.model_selection import train_test_split

# Feature matrix: every column except the target, as a plain NumPy array.
numpy_data = data.drop(columns="price").to_numpy()

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    numpy_data,
    target,
    test_size=0.3,
    random_state=24,
)

The dataset is quite small, so we can directly use sklearn with grid search for hyperparameter optimization. I tried a Histogram-based Gradient Boosting Regression Tree. There is no particular reason for this choice; it is possible to try out different models and see how they perform. 

In [None]:
# Models testing 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics         import r2_score, mean_squared_error, make_scorer

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble     import HistGradientBoostingRegressor                           

# Candidate models: each entry pairs an estimator instance ("func") with the
# hyperparameter grid ("params") explored by the grid search.
estimators = {'HistGradientBoostingRegressor': {'func'  : HistGradientBoostingRegressor(random_state=42),
                                                'params': {'learning_rate'    : [0.05, 0.1, 0.15],
                                                           'min_samples_leaf' : [10, 20, 30],
                                                           'max_leaf_nodes'   : [31, 41, 51]}},
             }

models_to_test = estimators.keys()
for name in models_to_test:
    # Grid search on the training set, selecting the combination with the best R2.
    model = GridSearchCV(estimator=estimators[name]["func"],
                         param_grid=estimators[name]["params"],
                         scoring=make_scorer(r2_score),
                         n_jobs=-1)
    model.fit(x_train, y_train)

    # Evaluate the selected model on the held-out test set.
    preds = model.predict(x_test)
    r2 = r2_score(y_test, preds)
    # RMSE is the square root of the mean squared error (not of the R2 score),
    # which makes it directly comparable with the target standard deviation.
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    print("{}: \n R2: {:.3f} \n RMSE {} \n BP: {} \n".format(name,
                                                          r2,
                                                          rmse,
                                                          model.best_params_))