# Analysis of a single day of the week
## HistGradientBoostingRegressor with Optuna and Visualizations of Model Errors

## Thanks to [AmbrosM](https://www.kaggle.com/ambrosm) for these incredible kernels:

* https://www.kaggle.com/code/ambrosm/tpsmar22-eda-which-makes-sense
* https://www.kaggle.com/code/ambrosm/tpsmar22-without-machine-learning

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, PercentFormatter
from cycler import cycler
from IPython import display
import datetime

# Plot theme: blue axes background, and yellow as the first colour of the
# series cycle; the remaining colours of the current cycle are kept as-is.
plt.rcParams.update({
    'axes.facecolor': '#0057b8',  # blue
    'axes.prop_cycle': cycler(
        color=['#ffd700'] + plt.rcParams['axes.prop_cycle'].by_key()['color'][1:]
    ),
})

# 2.7 Get Data

In [None]:
# Load both competition files the same way: `row_id` becomes the index and
# `time` is parsed into datetimes.
train, test = (
    pd.read_csv(csv_path, index_col='row_id', parse_dates=['time'])
    for csv_path in (
        '../input/tabular-playground-series-mar-2022/train.csv',
        '../input/tabular-playground-series-mar-2022/test.csv',
    )
)

# Quick sanity check of the loaded shapes.
print(train.shape, test.shape)

In [None]:
# Work on a copy so the frame loaded from disk is left untouched.
df_train = train.copy()

# Preview the first rows. This is kept as the final expression of the cell
# because a notebook only renders the value of the last expression.
train.head()


# 4.3 Feature engineering 

In [None]:
# Feature Engineering
# `df` is the working alias for the training frame; later cells read it.
df = df_train

# Calendar features derived from the `time` column.
df['weekday'] = df['time'].dt.weekday
df['hour'] = df['time'].dt.hour
df['minute'] = df['time'].dt.minute
df['day_of_year'] = df['time'].dt.day_of_year

In [None]:
# Name the target column explicitly instead of picking it by position
# (`df.columns[4]`), which would silently point at another column if the
# column order ever changed.
TARGET = 'congestion'

# Every remaining column of the working frame is treated as a feature.
FEATURES = df.columns.drop([TARGET])

### Only one day

In [None]:
# Weekday to model; compared against `time.dt.weekday`, where pandas numbers
# Monday as 0 (so 2 is Wednesday).
day_to_train = 2

In [None]:
# Keep only the rows of the selected weekday and discard the raw timestamp
# (its information is already carried by the derived calendar columns).
X_train = df.loc[df['weekday'] == day_to_train].drop(columns=['time'])

In [None]:
# Separate the target from the feature frame.
y_train = X_train['congestion']
X_train = X_train.drop(columns='congestion')

# Cap the target at its 1st and 99th percentiles. Per the original note this
# is meant to avoid the StratifiedKFold complaint "The least populated class
# in y has only 2 members, which is less than n_splits=...": values at or
# beyond a cap are replaced by the cap itself.
floor_quanty = y_train.quantile(0.01)
high_quanty = y_train.quantile(0.99)

y_train = (
    y_train
    .where(lambda s: s > floor_quanty, floor_quanty)
    .where(lambda s: s < high_quanty, high_quanty)
)

#y_train.value_counts()

### One-Hot Encoding

In [None]:
# Name of the single categorical column to one-hot encode (a plain string,
# not a list of columns).
object_cols = 'direction'

In [None]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the categorical column named by `object_cols`.
# The encoder is left at its default sparse output and densified with
# `.toarray()`: the `sparse=` keyword was renamed to `sparse_output=` in
# scikit-learn 1.2 and later removed, so this spelling works on both sides of
# the rename.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
OH_cols_train = pd.DataFrame(
    OH_encoder.fit_transform(X_train[object_cols].values.reshape(-1, 1)).toarray(),
    # Column names for the encoded features; `get_feature_names()` was removed
    # in scikit-learn 1.2 in favour of `get_feature_names_out()`.
    # https://stackoverflow.com/questions/56338847/how-to-give-column-names-after-one-hot-encoding-with-sklearn
    columns=OH_encoder.get_feature_names_out(),
    # The encoder returns a bare array, so the row index has to be put back
    # for the concat below to align rows correctly.
    index=X_train.index,
)

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)

# 5.1 Quick models

In [None]:
#from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

### Optuna with N-fold cross-validation

In [None]:
# From here on the design matrix is the one-hot encoded frame; an independent
# copy is taken so later cells can rebind or modify it freely.
X_train = OH_X_train.copy(deep=True)

# The feature list now has to include the one-hot columns as well.
FEATURES = X_train.columns

In [None]:
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import HistGradientBoostingRegressor

# 5-fold stratified splitter shared by every Optuna trial. Shuffling is left
# off (the library default), so the folds are the same on every call.
skfolds = StratifiedKFold(n_splits=5, shuffle=False)

def objective(trial):
    """Optuna objective: cross-validated MAE of a HistGradientBoostingRegressor.

    Samples a hyperparameter set from ``trial``, fits the regressor on each
    training split produced by the module-level ``skfolds`` splitter and
    scores it on the matching held-out split.

    Reads the module-level ``X_train``, ``y_train`` and ``FEATURES``.

    Args:
        trial: The ``optuna.trial.Trial`` used to sample hyperparameters.
            https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html#optuna.trial.Trial

    Returns:
        The mean absolute error averaged over all folds (lower is better).
    """
    # Search space. The sampling order is kept fixed on purpose: it determines
    # the order in which the sampler is asked for values.
    params = {
        # Will regularize the model and thus reduce the risk of overfitting.
        'max_depth': trial.suggest_int('max_depth', 18, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 26, 30),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 45, 50),
        'l2_regularization': trial.suggest_float('l2_regularization', 1e-2, 100),
    }

    est = HistGradientBoostingRegressor(random_state=42, **params)

    fold_scores = []
    for train_index, test_index in skfolds.split(X_train, y_train):
        X_train_folds = X_train[FEATURES].iloc[train_index]
        y_train_folds = y_train.iloc[train_index]

        X_test_fold = X_train[FEATURES].iloc[test_index]
        y_test_fold = y_train.iloc[test_index]

        # `fit` retrains from scratch on every call, so reusing the same
        # estimator object across folds does not leak state between them.
        est.fit(X_train_folds, y_train_folds)
        y_pred = est.predict(X_test_fold)

        score = mean_absolute_error(y_test_fold, y_pred)
        print('MAE:', score)
        fold_scores.append(score)

    # Return only after every fold has been evaluated. Returning from inside
    # the loop would score the trial on the first fold alone.
    return sum(fold_scores) / len(fold_scores)

In [None]:
# Minimise the objective (an MAE) over 20 trials, with a median pruner
# configured on the study.
study = optuna.create_study(
    direction='minimize',
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
)
study.optimize(objective, n_trials=20)

# 5.3.2. Optuna. Optimization history

In [None]:
# Plot the optimization history of all trials in the study. Kept as the only
# expression in the cell so the notebook renders the returned figure.
optuna.visualization.plot_optimization_history(study)


In [None]:
# Show the hyperparameters of the best trial in the study.
study.best_params

In [None]:
# Export the trials to a DataFrame and plot the sampled `l2_regularization`
# value of each trial, in trial order.
trials_df = study.trials_dataframe()
trials_df['params_l2_regularization'].plot.line()

# 5.3.4. Optuna. Contour

In [None]:
# Contour plot of the parameter relationships in the study. Kept as the only
# expression in the cell so the notebook renders the returned figure.
optuna.visualization.plot_contour(study)


In [None]:
# Show the best hyperparameters again (same expression as the earlier cell).
study.best_params

## Start Debug Optuna

In [None]:

# Recompute the calendar features on the training frame so the debug section
# can be run on its own; `df` is the working alias read by the next cell.
df = df_train
df['weekday'] = df['time'].dt.weekday
df['hour'] = df['time'].dt.hour
df['minute'] = df['time'].dt.minute
df['day_of_year'] = df['time'].dt.day_of_year

In [None]:
# Debug variant of the training frame: rows of the selected weekday, without
# the raw timestamp and without the categorical `direction` column (so no
# encoding step is required). `congestion` is still included here.
X_train = df.loc[df['weekday'] == day_to_train].drop(columns=['time', 'direction'])

In [None]:
from sklearn.model_selection import train_test_split
# Unshuffled 50/50 split: the first half of the rows is used for fitting and
# the second half is held out.
train_set, test_set = train_test_split(X_train, test_size=0.5, random_state=42, shuffle=False)

y_true = test_set.congestion
test_set = test_set.drop(columns=['congestion'])

y = train_set.congestion
X = train_set.drop(columns=['congestion'])

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error

# `scoring` only selects the metric used for early stopping during fitting;
# it does not change what `est.score()` returns (that is always R^2).
est = HistGradientBoostingRegressor(scoring = 'neg_mean_absolute_error').fit(X, y)

# Report the MAE on the held-out half. Calling `est.score(X, y)` here would
# report R^2 on the training rows and leave the held-out split unused.
mean_absolute_error(y_true, est.predict(test_set))

## End Debug Optuna

# 5.4 Errors the models make
Get threshold

In [None]:
# Rebind X_train to a fresh copy of the one-hot encoded frame; the debug
# section above rebound X_train to a frame without the one-hot columns.
X_train = OH_X_train.copy()

In [None]:
# Put features and target side by side (aligned on the row index) so that a
# single split call keeps them together.
X_train_y = pd.concat([X_train, y_train],axis=1)


In [None]:
from sklearn.model_selection import train_test_split
# Unshuffled 80/20 split: the last 20% of the rows become the hold-out set.
train_set, test_set = train_test_split(
    X_train_y, test_size=0.2, random_state=42, shuffle=False
)

# Training part: features X and target y.
X = train_set.drop(columns='congestion')
y = train_set['congestion']

# Hold-out part: true target values, then the features without the target.
y_true = test_set['congestion']
test_set = test_set.drop(columns='congestion')

In [None]:
# Refit with the best hyperparameters found by the study, then score on the
# hold-out rows. Predictions are rounded to whole numbers before scoring.
est = HistGradientBoostingRegressor(random_state=42, **study.best_params).fit(X, y)
y_pred = est.predict(test_set).round()

score = mean_absolute_error(y_true, y_pred)
print('MAE:', score)

In [None]:
# Absolute error of every hold-out row, largest first. The result keeps the
# index of `y_true`, so each error can be traced back to its row.
MAE_per_out = (y_true - y_pred).abs().sort_values(ascending=False)

In [None]:
# Histogram of the per-row absolute errors, using as many bins as there are
# distinct error values.
MAE_per_out.plot.hist(bins=MAE_per_out.nunique(), figsize=(16, 10))
plt.ylabel('Count')
# The plotted quantity is an absolute error, not a signed difference, so the
# axis label says so.
plt.xlabel('|y_true - y_pred|')

In [None]:
# Rows whose absolute error exceeds 20.
worse_MAE = MAE_per_out[MAE_per_out > 20]

# Fetch the matching original rows. The error series is indexed by the same
# `row_id` labels as `df_train`, so the lookup must be label-based (`.loc`);
# positional `.iloc` is only right if labels happen to equal positions.
train_worse = df_train.loc[worse_MAE.index]

In [None]:
# Distribution of the true congestion values among the worst-predicted rows.
# The bin edges sit on the half-integers from -0.5 to 100.5, which gives one
# bin per integer value 0..100.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(train_worse['congestion'], bins=np.linspace(-0.5, 100.5, 102))
ax.set_ylabel('Count')
ax.set_xlabel('Congestion')
plt.show()

In [None]:
# Congestion value counts of the worst-predicted rows, one panel per roadway
# position (x = 2, y = 0..3), on a shared 2x2 grid.
fig, axes = plt.subplots(2, 2, sharex=True, sharey=True, figsize=(16, 12))

# The loop variable is deliberately NOT called `y`: that name holds the
# training target in an earlier cell and would be overwritten here.
# Drawing on the axes returned by `plt.subplots` keeps the shared x/y scales.
for y_coord, ax in enumerate(axes.flat):
    vc = train_worse[(train_worse.x == 2) & (train_worse.y == y_coord)].congestion.value_counts().sort_index()
    ax.bar(vc.index, vc, width=1)
    ax.set_ylabel('Count')
    ax.set_xlabel('Congestion')
    ax.set_title(f"(x = {2}) & (y = {y_coord})")
plt.show()