# Import Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide configuration: plot styling, the random seed reused by every
# split/model below, and the location of the Ford listings CSV.
sns.set_style('darkgrid')
plt.rc('figure', figsize=(10.0, 8.0))

SEED = 69420
FILE_PATH = '../input/used-car-dataset-ford-and-mercedes/ford.csv'


# Simple EDA

In [None]:
# Load the raw listings from the CSV at FILE_PATH into the working frame.
df = pd.read_csv(FILE_PATH)

In [None]:
# Display the freshly loaded frame.
df

In [None]:
# Order rows chronologically so the earliest and latest years sit at the ends.
df = df.sort_values(by='year')

In [None]:
# Display the frame after sorting by year (head and tail show the extremes).
df

Obviously 2060 is a misentry, so we remove it.

In [None]:
# Drop the misentered year (2060) by value rather than by position: the filter
# does not depend on the sort order and is safe to re-run (a positional
# `iloc[:-1]` would silently remove one more valid row on every re-execution).
df = df[df['year'] != 2060]

In [None]:
# Display the frame again to confirm the misentry is gone.
df

In [None]:
# Percentage of missing values per column (at most the first 60 columns after
# sorting by missing count, descending), rendered with a colour gradient.
nullvaluecheck = (
    (df.isna().sum().sort_values(ascending=False) * 100 / df.shape[0])
    .to_frame(name='missing %')
    .head(60)
)
nullvaluecheck.style.background_gradient(cmap='PuBu')

**Visualise Quantiles Over Time**

This helps us understand how the distribution of price has changed over time

In [None]:
# One box per year showing how the price distribution moves over time.
plt.figure(figsize=(10, 8))
ax = sns.boxplot(x='year', y='price', data=df)
# plt.yscale('log')
# Rotate the year labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=90);

Lots of outliers! But price has generally been trending upwards (aside from 2008, of course, when the financial crisis hit, which makes the whiskers massive).

**How Many Values for each year?**

In [None]:
# Number of non-null prices recorded for each year, in order of first
# appearance. A single grouped count replaces a full `describe()` per year and
# looks the column up by name instead of by position in the describe() table.
price_counts = df.groupby('year', sort=False)['price'].count()
for year, n_prices in price_counts.items():
    print(f"{year}")
    print(f"Number of Prices for {year} is {n_prices}")

Yikes! Imagine trying to model an entire year's worth of prices based on only 115 sales!

We are only going to use the years 2013 onwards from here on.

In [None]:
# Keep 2013 onwards by filtering on the year itself; a hard-coded row offset
# only works for one exact sort order and row count, and keeps trimming rows
# if the cell is re-run.
df = df[df['year'] >= 2013]

**Sanity Check**

In [None]:
# Years remaining after the filter.
df['year'].unique()

In [None]:
# Price density per year; the palette is cut to eight colours, one per hue level.
sns.displot(
    data=df,
    x='price',
    hue='year',
    kind='kde',
    palette=sns.color_palette()[:8],
    aspect=2,
)
plt.title('KDEPlot of all Models Prices')
plt.show()

This plot shows us that, apart from 2019 (when the Ford scandal occurred), the data generally belongs to the same distribution, just with a greater magnitude.

In [None]:
# Listings per model. The Series is passed as `x=` explicitly: the first
# positional parameter of `countplot` is `data` in current seaborn, so the
# positional form is deprecated/incorrect there.
ax = sns.countplot(x=df.model)
plt.title('Countplot of all Models')
for item in ax.get_xticklabels():
    item.set_rotation(90)

print(df.model.value_counts())

**Yikes again! We are only going to model the top 4 models**

In [None]:
# Names of the four most frequently listed models.
models = df['model'].value_counts().index[:4].tolist()
models

In [None]:
# Restrict the frame to the selected models only.
df = df.loc[df['model'].isin(models)]

**Sanity Check!**

In [None]:
# Listing counts per model after the filter.
df['model'].value_counts()

# EDA on Cleaned DF

In [None]:
# Price density per year for the selected models only.
sns.displot(
    data=df,
    x='price',
    hue='year',
    kind='kde',
    palette=sns.color_palette()[:8],
    aspect=2,
)
plt.title('KDEPlot of Selected Model Prices')
plt.show()

In [None]:
# Listings per model after filtering. `x=` is explicit because the first
# positional parameter of `countplot` is `data` in current seaborn, and the
# title now reflects that only the selected models are shown.
ax = sns.countplot(x=df.model)
plt.title('Countplot of Selected Models')
for item in ax.get_xticklabels():
    item.set_rotation(90)

print(df.model.value_counts())

In [None]:
# Yearly price boxes for the selected models (before the price filter).
plt.figure(figsize=(10, 8))
ax = sns.boxplot(x='year', y='price', data=df)
plt.title('BoxPlot of Selected Model Prices Yearly')
# Rotate the year labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=90);

We can still clearly see outliers! I will keep only cars priced at 35k or less to remove them.

In [None]:
# Keep listings priced at 35,000 or below.
df = df.loc[df['price'].le(35000)]

In [None]:
# Yearly price boxes again, now that the high-price rows have been removed.
plt.figure(figsize=(10, 8))
ax = sns.boxplot(x='year', y='price', data=df)
plt.title('BoxPlot of Selected Model Prices Yearly')
# Rotate the year labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=90);

In [None]:
# Pairwise relationships between the columns of the cleaned frame.
sns.pairplot(df)

In [None]:
# Correlate numeric columns only: `DataFrame.corr()` raises on string columns
# in pandas 2.x, and selecting by dtype works on every pandas version.
corr = df.select_dtypes(include='number').corr()
# Hide the upper triangle (and diagonal) so each pair appears once.
mask = np.triu(np.ones_like(corr, dtype=bool))

sns.heatmap(corr,
            mask=mask,
            cmap='PuBu',
            square=True,
            linewidths=.5,
            annot=True)
plt.show()

# Model Prep

In [None]:
# Separate the target from the features without mutating `df`: an in-place
# drop on a filtered frame triggers SettingWithCopyWarning and makes the cell
# fail with KeyError('price') when re-run.
y = df['price'].values
X = df.drop(columns='price')

In [None]:
# Features and target should have the same number of rows.
X.shape, y.shape

In [None]:
# Display the working DataFrame.
df

In [None]:
# Columns to label-encode: every object-dtype column, plus 'year'.
object_mask = X.dtypes == 'object'
cols_to_encode = [*X.columns[object_mask], 'year']
cols_to_encode

# **Special Encoder Class**

In [None]:
# https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in one call.

    The encoder is stateless: `fit` learns nothing and every `transform` call
    fits a fresh `LabelEncoder` per column on the data it is given. Integer
    codes are therefore only comparable within a single call, so encode the
    full frame once before splitting rather than encoding splits separately.
    """

    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        """No-op kept for scikit-learn style chaining; returns `self`."""
        return self # not relevant here

    def transform(self,X):
        '''
        Return a copy of X with the columns named in self.columns replaced by
        their LabelEncoder() integer codes. If no columns were specified,
        every column of X is encoded. X itself is left untouched.
        '''
        output = X.copy()
        # Iterate over column labels rather than `DataFrame.iteritems()`,
        # which was removed in pandas 2.0.
        targets = list(output.columns) if self.columns is None else self.columns
        for col in targets:
            output[col] = LabelEncoder().fit_transform(output[col])
        return output

    def fit_transform(self,X,y=None):
        """Equivalent to `fit(X, y).transform(X)`."""
        return self.fit(X,y).transform(X)

In [None]:
# Encode the selected columns of the full feature frame in a single call,
# before any train/val/test split is made.
MCLE = MultiColumnLabelEncoder(columns = cols_to_encode)

X = MCLE.fit_transform(X)

# Model Building

Finally! I will publish the cleaned X, y sets for those who want to use them for an easy comparison between models

**THE TEST SETS ARE HOLDOUT - NO PART OF MODELLING, ONLY EVALUATION - USE VAL FOR TUNING**

In [None]:
from sklearn.model_selection import train_test_split

# 60 / 20 / 20 train / val / test split:
# first hold out 20% as the test set, then take 25% of the remaining 80%
# (i.e. 20% of the total) as the validation set. Both splits are seeded.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=SEED)

# Shapes of every split, as a quick sanity check.
X_train.shape, X_test.shape, X_val.shape, y_train.shape, y_test.shape, y_val.shape

In [None]:
# Publish the splits. The targets are plain arrays, so each one is given the
# index of its matching feature frame before saving; otherwise the y files
# carry a fresh 0..n-1 index that cannot be joined to the X files' row labels.
X_train.to_csv('X_train.csv')
X_test.to_csv('X_test.csv')
X_val.to_csv('X_val.csv')

pd.Series(y_train, index=X_train.index).to_csv('y_train.csv')
pd.Series(y_test, index=X_test.index).to_csv('y_test.csv')
pd.Series(y_val, index=X_val.index).to_csv('y_val.csv')

# Optuna Tuning XGBoost

In [None]:
import optuna 
from optuna.visualization import plot_optimization_history, plot_param_importances
from optuna import Trial, visualization
from optuna.samplers import TPESampler

from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from statistics import mean

In [None]:
def objective(trial: Trial, X, y) -> float:
    """Optuna objective: train one XGBoost candidate and return its validation RMSE.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature matrix accepted by `train_test_split` and `XGBRegressor`.
        y: Target values aligned with `X`.

    Returns:
        RMSE of the candidate on the validation split (lower is better).

    The holdout test split is created only so that the train/val partitions
    come from the same seeded `train_test_split` calls as before; it is never
    predicted on or scored here, so it stays untouched for the final
    evaluation.
    """
    # Seeded 60/20/20 split; the test portion is discarded on purpose.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=SEED)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=SEED)
    evals = [(X_val, y_val)]

    # Search space. `n_estimators` starts at 1 because a zero-tree model is
    # not a meaningful candidate. `suggest_float` replaces the deprecated
    # `suggest_loguniform` / `suggest_discrete_uniform` with the same ranges.
    param = {
                "n_estimators":trial.suggest_int('n_estimators', 1, 1000),
                'max_depth':trial.suggest_int('max_depth', 2, 25),
                'reg_alpha':trial.suggest_int('reg_alpha', 0, 5),
                'reg_lambda':trial.suggest_int('reg_lambda', 0, 5),
                'min_child_weight':trial.suggest_int('min_child_weight', 0, 5),
                'gamma':trial.suggest_int('gamma', 0, 5),
                'learning_rate':trial.suggest_float('learning_rate', 0.005, 0.5, log=True),
                'colsample_bytree':trial.suggest_float('colsample_bytree', 0.1, 1, step=0.01)
            }

    # Build Model (seeded so column subsampling is repeatable across trials)
    model = XGBRegressor(**param,
                         predictor = 'gpu_predictor',
                         tree_method = 'gpu_hist',
                         eval_metric = 'rmse',
                         random_state = SEED,
                         verbosity=1)

    # Fit Model, stopping early once validation RMSE stalls for 10 rounds.
    # NOTE(review): newer XGBoost releases expect `eval_metric` and
    # `early_stopping_rounds` on the constructor instead of `fit` - confirm
    # against the installed version.
    model.fit(X_train, y_train, eval_set = evals, eval_metric = 'rmse', early_stopping_rounds = 10)

    # Score on validation only. The square root is taken explicitly because
    # the `squared=False` flag is not available in every scikit-learn release.
    y_valpred = model.predict(X_val)
    val_rmse = float(np.sqrt(mean_squared_error(y_val, y_valpred)))

    return val_rmse

In [None]:
# Create the study that minimises the objective. The sampler is seeded so its
# suggestions are repeatable (exact trial order can still vary when trials
# run in parallel).
study = optuna.create_study(study_name='CarPriceRegression',
                            direction='minimize',
                            sampler=TPESampler(seed=SEED))

In [None]:
%%time
# To conserve computing time I have limited the trials here to 10
study.optimize(lambda trial : objective(trial, X, y),
               n_trials= 100,
               n_jobs=-1,
               gc_after_trial=True,
               show_progress_bar=True)

print('Best trial: RMSE {},\nparams {}'.format(study.best_trial.value,study.best_trial.params))

In [None]:
# Trial history as a DataFrame; show the first rows.
hist = study.trials_dataframe()
hist.head()

In [None]:
# Objective value across trials.
# Deselect Objective Value to see the curve more clearly!
plot_optimization_history(study)

In [None]:
# Hyperparameter importances as estimated by Optuna for this study.
plot_param_importances(study)

# Train Full Model

In [None]:
# Recap the best trial's objective value and parameters before the final fit.
print('Best trial: RMSE {},\nparams {}'.format(study.best_trial.value,study.best_trial.params))

In [None]:
# Final model built from the best trial's hyperparameters. `tree_method` is
# set to the same value the tuning objective used, so the final model is
# trained with the tree-construction algorithm the parameters were tuned for.
clf = XGBRegressor(**study.best_trial.params,
                   predictor = 'gpu_predictor',
                   tree_method = 'gpu_hist',
                   random_state = SEED,
                   verbosity=0)

# Validation split monitored for early stopping during the final fit.
evals = [(X_val, y_val)]

In [None]:
%%time
# Fit on the training split, monitoring RMSE on the validation split and
# stopping early after 10 rounds without improvement.
clf.fit(X_train, y_train,
        eval_set = evals,
        eval_metric = 'rmse',
        early_stopping_rounds = 10)

# Final Results...

In [None]:
# Predict on the holdout test split
y_pred = clf.predict(X_test)

# Compute Metrics: RMSE as the explicit square root of the MSE, because the
# `squared=False` flag is not available in every scikit-learn release.
test_rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))

print(f"Final Model RMSE: {test_rmse}")

# Next Steps?

* Use Optuna to tune OVR CrossValidated Classifier

* Use different types of Boosting

* Use different ML Algorithms

* KNN Feature Generations

* Deep Learning Neural Nets

* AutoML Libraries

**If you enjoyed this please upvote to reach others!**