# Final model training

Finally... We can train a "final" model (if I had more time, it will definitely will be more like the *final_v0_first_debugging_version*, but it kinda works).

To summarize the decision on this model, we train it:
- With 'good' hyperparameters (found with the hyperparameter tuning of a model with different features... not good... but still)
- Without BERT embeddings

## Import standard libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
import time
from sklearn.preprocessing import StandardScaler

from sklearn.impute import SimpleImputer

from catboost import CatBoostRegressor

## Import custom scripts

In [2]:
import sys, os
sys.path.append(os.getcwd()+ "/../")
from src.data_preprocessing import DataPreprocessing

from src.compute_metrics import Metrics

## Load all the features
The datapreprocessing pipeline is doing quite some stuff, and in a non-efficient manner (I don't have much time for optimizing that :( )
But it should be less than 2 min

In [3]:
dp = DataPreprocessing(df_path = "../data/real_estate_ads_2022_10.csv",
                        train_indices_path="../data/train_indices.npy", 
                        test_indices_path="../data/test_indices.npy",
                        get_params_from_params=True,
                        get_tfidf_embeddings_flag=True,
                        get_bert_embeddings_flag=False, # IMPORTANT: WITHOUT BERT EMBEDDINGS
                        get_textual_features_flag=True,
                        transform_time_features_flag=True,
                        transform_cyclic_features_flag=True)

## Split train / test data
We can use the datapreprocessing method for that.

This is done for better reproducibility, but can be done with the sklearn train / test split, and setting a seed should suffice.

In [4]:
X_train, X_test = dp.get_train_test_split(dp.X)
y_train, y_test = dp.get_train_test_split(dp.Y)

## Define a function for training
We will use k-fold cross-validation

In [5]:
def train_catboost_and_get_metrics(X, y, 
                                    catboost_params = {},
                                    backward_transform_label=True, 
                                    backward_standardize_flag=False, verbose=False,
                                    standard_scale_flag=False,
                                    impute_flag=False):

    if impute_flag:
        imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
        X = pd.DataFrame(imp_mean.fit_transform(X), columns=X.columns)

    bst = CatBoostRegressor(**catboost_params)

    metrics = Metrics(dp=dp, backward_transform_flag=backward_transform_label, backward_standardize_flag=backward_standardize_flag)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    for train_index, test_index in kf.split(X, y):
        X_train, X_val = X.iloc[train_index], X.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]

        if standard_scale_flag:
            scaler = StandardScaler()
            X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
            X_val = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)

        bst.fit(X_train, y_train)
        y_pred = bst.predict(X_val)

        if verbose: 
            print(f"y_pred: {y_pred[:5]}")
            print(f"y_val: {y_val[:5]}")
            
        computed_metrics = metrics.get_single_train_val_metrics(bst, X_train, y_train, X_val, y_val)
        metrics.append(computed_metrics)

    average_metrics = metrics.get_average()
    std_metrics = metrics.get_std()
    # Add _std to the keys to differentiate them from the average metrics:
    std_metrics = {f"{key}_std" : value for key, value in std_metrics.items()} 

    return {**average_metrics, **std_metrics}

## Define convenience functions for prettier display

In [6]:
def filter_metrics(metrics_dict, only_validation=True, format_mean_std_together=True):

    if only_validation:
        metrics_dict = {key: value for key, value in metrics_dict.items() if "test_" in key}

    if format_mean_std_together:
        metrics_dict = {key: f"{value:.2f} ± {metrics_dict[key+'_std']:.2f}" for key, value in metrics_dict.items() if "std" not in key}

    return metrics_dict

def highlight_max(s):
    is_max = s == s.replace("nan ± nan", "0").apply(lambda x: x.split("+-")[0]).max()
    return ['font-weight: bold' if v else '' for v in is_max]

def highlight_min(s):
    is_min = s == s.replace("nan ± nan", "0").apply(lambda x: x.split("+-")[0]).min()
    return ['font-weight: bold' if v else '' for v in is_min]

def format_results_df(results, column_names=None):
    results_df = pd.DataFrame(results).T

    if column_names is not None:
        results_df.columns = column_names
    
    def apply_highlight(column):
        if column.name in ["test_explained_variance", "test_r2", "test_custom_1", "test_custom_5", "test_custom_10", "test_custom_20"]:
            return highlight_max(column)
        else:
            return highlight_min(column)

    
    return results_df.style.apply(apply_highlight, axis=1)

## Try out different hyperparameters

In [8]:
parameter_dicts_list = [
    {"iterations" : 2000, "learning_rate" : 0.08, "colsample_bylevel" : 1, "depth" : 10, "subsample" : 0.96},
    {"iterations" : 2000, "learning_rate" : 0.09, "colsample_bylevel" : 1, "depth" : 10, "subsample" : 0.96},
]


In [8]:
results_first_params = train_catboost_and_get_metrics(X_train, y_train, catboost_params=parameter_dicts_list[0], impute_flag=True)
print("First model trained and evaluated!")
results_second_params = train_catboost_and_get_metrics(X_train, y_train, catboost_params=parameter_dicts_list[1], impute_flag=True)
print("Second model trained and evaluated!")

0:	learn: 0.9400686	total: 275ms	remaining: 9m 8s
1:	learn: 0.9112916	total: 417ms	remaining: 6m 56s
2:	learn: 0.8858703	total: 559ms	remaining: 6m 12s
3:	learn: 0.8618670	total: 713ms	remaining: 5m 55s
4:	learn: 0.8411135	total: 869ms	remaining: 5m 46s
5:	learn: 0.8210054	total: 932ms	remaining: 5m 9s
6:	learn: 0.8036198	total: 998ms	remaining: 4m 44s
7:	learn: 0.7862850	total: 1.06s	remaining: 4m 24s
8:	learn: 0.7710251	total: 1.12s	remaining: 4m 8s
9:	learn: 0.7580271	total: 1.19s	remaining: 3m 56s
10:	learn: 0.7460632	total: 1.25s	remaining: 3m 45s
11:	learn: 0.7329523	total: 1.31s	remaining: 3m 36s
12:	learn: 0.7207194	total: 1.37s	remaining: 3m 29s
13:	learn: 0.7131157	total: 1.43s	remaining: 3m 22s
14:	learn: 0.7044249	total: 1.49s	remaining: 3m 17s
15:	learn: 0.6974106	total: 1.54s	remaining: 3m 11s
16:	learn: 0.6893182	total: 1.6s	remaining: 3m 7s
17:	learn: 0.6835303	total: 1.66s	remaining: 3m 3s
18:	learn: 0.6759409	total: 1.73s	remaining: 3m
19:	learn: 0.6682247	total: 1.79

In [9]:
df_results = pd.concat([pd.DataFrame(filter_metrics(results_first_params, only_validation=True, format_mean_std_together=True), index=[0]), 
                        pd.DataFrame(filter_metrics(results_second_params, only_validation=True, format_mean_std_together=True), index=[1])])

format_results_df(df_results, column_names=["First params", "Second params"])

Unnamed: 0,First params,Second params
test_explained_variance,0.69 ± 0.10,0.70 ± 0.10
test_r2,0.69 ± 0.10,0.70 ± 0.10
test_mape,255.99 ± 149.95,256.27 ± 153.49
test_median_absolute_error,330.37 ± 2.67,325.33 ± 1.55
test_mean_absolute_error,559.12 ± 4.40,555.17 ± 3.67
test_mean_squared_log_error,0.11 ± 0.03,0.11 ± 0.02
test_custom_1,15.97 ± 0.39,16.54 ± 0.27
test_custom_5,55.91 ± 0.42,56.48 ± 0.17
test_custom_10,79.22 ± 0.22,79.13 ± 0.22
test_custom_20,94.13 ± 0.19,94.11 ± 0.21


# Train with all data

In [9]:
final_params = parameter_dicts_list[0] 

bst = CatBoostRegressor(verbose=False, **final_params)

In [10]:
metrics = Metrics(dp=dp)

bst.fit(X_train, y_train)
y_pred = bst.predict(X_test)

computed_metrics = metrics.get_single_train_val_metrics(bst, X_train, y_train, X_test, y_test)
metrics.append(computed_metrics)

average_metrics = metrics.get_average()

In [11]:
train_data = {key.replace('train_', ''): value for key, value in average_metrics.items() if 'train_' in key}
val_data = {key.replace('test_', ''): value for key, value in average_metrics.items() if 'test_' in key}

df = pd.DataFrame({'Training': train_data, 'Validation': val_data})
df

Unnamed: 0,Training,Validation
explained_variance,0.969216,0.02943
r2,0.969136,0.029133
mape,24.019171,118.569857
median_absolute_error,149.744596,320.926277
mean_absolute_error,230.026072,673.169114
mean_squared_log_error,0.027007,0.131914
custom_1,29.139792,15.764818
custom_5,85.053296,56.690882
custom_10,97.058524,80.025799
custom_20,99.399145,94.588906


In [12]:
import pickle as pkl
with open("../data/models/final_model_catboost.pkl", "wb") as f:
    pkl.dump(bst, f)