# Catboost training

## Import standard libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
import time
from sklearn.preprocessing import StandardScaler

from sklearn.impute import SimpleImputer

from catboost import CatBoostRegressor

## Import custom scripts

In [2]:
import sys, os
# Put the parent of the current working directory on the import path so that the
# project-local `src` package can be imported (assumes the notebook is started
# from a direct sub-folder of the repository root — TODO confirm).
sys.path.append(os.getcwd()+ "/../")
from src.data_preprocessing import DataPreprocessing

## Load all the features
The data-preprocessing pipeline does a lot of work and is not optimized (there was no time to tune it :( ).
It should still finish in under 2 minutes.

In [3]:
# Build the preprocessing object from the ads CSV and the stored train/test
# index files, with every optional flag switched on.
dp = DataPreprocessing(
    df_path="../data/real_estate_ads_2022_10.csv",
    train_indices_path="../data/train_indices.npy",
    test_indices_path="../data/test_indices.npy",
    get_params_from_params=True,
    get_tfidf_embeddings_flag=True,
    get_bert_embeddings_flag=True,
    get_textual_features_flag=True,
    transform_time_features_flag=True,
    transform_cyclic_features_flag=True,
)

KeyboardInterrupt: 

## Load the metrics class
The `Metrics` class makes it convenient to compute several metrics at once:
- **explained_variance_score**: Measures the proportion of the variance in the dependent variable that is predictable from the independent variable(s). A score of 1 indicates perfect prediction, while a score of 0 indicates that the model does not explain any of the variance.

- **r2_score**: Also known as the coefficient of determination, it indicates the proportion of the variance in the dependent variable that is predictable from the independent variable(s). A value of 1 indicates a perfect fit, while a value of 0 indicates that the model does not explain any of the variance.

- **mean_absolute_percentage_error (MAPE)**: Measures the average of the absolute percentage errors of predictions. It provides a percentage error which is easy to interpret but can be sensitive to very small actual values.

- **median_absolute_error**: Computes the median of all absolute differences between the target and predicted values. This metric is robust to outliers and gives a better sense of the typical error when outliers are present.

- **mean_squared_error (MSE)**: Measures the average of the squares of the errors—that is, the average squared difference between the estimated values and the actual value. It penalizes larger errors more than smaller ones due to squaring.

- **mean_squared_log_error (MSLE)**: Similar to MSE but takes the logarithm of the predictions and actual values. It is useful when you want to penalize underestimation more than overestimation and is less sensitive to large errors than MSE.

- **custom metrics**: Compute the percentage of predictions whose error falls below a given threshold. This may correlate with customer satisfaction: if customers are happy whenever the error is below 5%, for example, the metric counts the percentage of happy customers. Of course, this needs further study (for example, segmenting the score).

In [None]:
import importlib
import src.compute_metrics
# Reload the module before importing from it, so that edits made to
# src/compute_metrics.py while the kernel is running are picked up.
importlib.reload(src.compute_metrics) # We do this for debugging purposes

# Must come after the reload: this binds `Metrics` from the freshly reloaded module.
from src.compute_metrics import Metrics

## Split train / test data
We can use the datapreprocessing method for that.

Using the stored indices gives better reproducibility, although sklearn's `train_test_split` with a fixed seed should also suffice.

In [None]:
# Split features and targets with the same helper; presumably it applies the
# stored train/test indices so both splits line up row-for-row — verify in
# DataPreprocessing.get_train_test_split.
X_train, X_test = dp.get_train_test_split(dp.X)
y_train, y_test = dp.get_train_test_split(dp.Y)

## Define convenience functions for prettier display

In [None]:
def filter_metrics(metrics_dict, only_validation=True, format_mean_std_together=True):
    """Select and pretty-print entries of a metrics dictionary.

    Parameters
    ----------
    metrics_dict : dict
        Maps metric names to numbers. When ``format_mean_std_together`` is set,
        every kept name without "std" in it must have a ``<name>_std`` companion.
    only_validation : bool
        Keep only the entries whose name contains ``"test_"``.
    format_mean_std_together : bool
        Collapse each mean and its ``_std`` companion into one
        ``"mean ± std"`` string with two decimals.

    Returns
    -------
    dict
        The filtered (and optionally formatted) metrics.

    Raises
    ------
    KeyError
        If formatting is requested and a ``<name>_std`` entry is missing.
    """
    selected = metrics_dict
    if only_validation:
        selected = {}
        for name, number in metrics_dict.items():
            if "test_" in name:
                selected[name] = number

    if not format_mean_std_together:
        return selected

    formatted = {}
    for name, mean in selected.items():
        # The spread entries are folded into their mean, never listed on their own.
        if "std" in name:
            continue
        formatted[name] = f"{mean:.2f} ± {selected[name+'_std']:.2f}"
    return formatted

def highlight_max(s):
    """Return CSS that bolds the entry (or tied entries) with the largest mean.

    Parameters
    ----------
    s : pandas.Series
        Cells formatted as ``"<mean> ± <std>"`` (plain numbers work too).
        Cells whose mean does not parse as a number, such as ``"nan ± nan"``,
        are never highlighted and do not take part in the comparison.

    Returns
    -------
    list of str
        One CSS string per cell: ``'font-weight: bold'`` for the maximum,
        ``''`` otherwise.
    """
    # Compare parsed numbers, not the raw strings: lexicographic order would
    # rank "9.00" above "10.00".
    means = pd.to_numeric(s.astype(str).str.split("±").str[0].str.strip(), errors="coerce")
    # max() skips NaN; if every cell is NaN the comparison below is all False.
    is_max = means == means.max()
    return ['font-weight: bold' if v else '' for v in is_max]

def highlight_min(s):
    """Return CSS that bolds the entry (or tied entries) with the smallest mean.

    Parameters
    ----------
    s : pandas.Series
        Cells formatted as ``"<mean> ± <std>"`` (plain numbers work too).
        Cells whose mean does not parse as a number, such as ``"nan ± nan"``,
        are never highlighted and do not take part in the comparison.

    Returns
    -------
    list of str
        One CSS string per cell: ``'font-weight: bold'`` for the minimum,
        ``''`` otherwise.
    """
    # Compare parsed numbers, not the raw strings: lexicographic order would
    # rank "10.00" below "9.00".
    means = pd.to_numeric(s.astype(str).str.split("±").str[0].str.strip(), errors="coerce")
    # min() skips NaN, so a missing score can never win as the "smallest" value.
    is_min = means == means.min()
    return ['font-weight: bold' if v else '' for v in is_min]

def format_results_df(results, column_names=None):
    """Turn `results` into a transposed, styled table with the best cells in bold.

    Parameters
    ----------
    results
        Anything accepted by ``pd.DataFrame``; the frame is transposed before
        styling.
    column_names : sequence, optional
        Replacement labels for the columns of the transposed frame.

    Returns
    -------
    pandas.io.formats.style.Styler
        Styling is applied row by row (``axis=1``). A row whose label is one of
        the names listed below gets its maximum bolded; any other row gets its
        minimum bolded.
    """
    # Row labels for which the largest value is highlighted.
    maximised_rows = (
        "test_explained_variance",
        "test_r2",
        "test_custom_1",
        "test_custom_5",
        "test_custom_10",
        "test_custom_20",
    )

    def apply_highlight(row):
        chooser = highlight_max if row.name in maximised_rows else highlight_min
        return chooser(row)

    table = pd.DataFrame(results).T
    if column_names is not None:
        table.columns = column_names

    return table.style.apply(apply_highlight, axis=1)

## Some hyperparameter tuning

We can use Weights&Biases.

In [None]:
import wandb
# Log this session in to Weights & Biases before any sweep or run is created.
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msergi-andreu[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# Cross-validation below works on copies of the training split.
X = X_train.copy()
y = y_train.copy()

def wandb_train(config=None):
    """Run one W&B trial: 5-fold cross-validate a CatBoostRegressor and log the scores.

    Parameters
    ----------
    config : dict, optional
        Forwarded to ``wandb.init``. The hyperparameters actually used are read
        back from ``wandb.config`` and passed unchanged to ``CatBoostRegressor``.

    Returns
    -------
    dict
        The metrics averaged over the folds, plus their standard deviations
        under the same keys with an ``_std`` suffix. The same dictionary is
        logged to W&B.
    """
    with wandb.init(config=config):
        config = wandb.config

        metrics = Metrics(dp=dp, backward_transform_flag=True, backward_standardize_flag=False)

        kf = KFold(n_splits=5, shuffle=True, random_state=42)

        for train_index, val_index in kf.split(X, y):
            # Fold-specific names, so the notebook-level X_train / y_train are
            # not confused with a single fold.
            X_fold_train, X_fold_val = X.iloc[train_index], X.iloc[val_index]
            y_fold_train, y_fold_val = y.iloc[train_index], y.iloc[val_index]

            # Fresh estimator per fold: nothing is carried over between folds.
            bst = CatBoostRegressor(**config)
            bst.fit(X_fold_train, y_fold_train)

            computed_metrics = metrics.get_single_train_val_metrics(
                bst, X_fold_train, y_fold_train, X_fold_val, y_fold_val
            )
            metrics.append(computed_metrics)

        average_metrics = metrics.get_average()
        # Add _std to the keys to differentiate them from the average metrics:
        std_metrics = {f"{key}_std" : value for key, value in metrics.get_std().items()}

        summary = {**average_metrics, **std_metrics}
        wandb.log(summary)

    return summary

## Define a sweep

In [None]:
# Search strategy for the sweep. W&B offers grid, random and Bayesian search;
# this sweep uses Bayesian search ('bayes').
sweep_config = {
    'method' : 'bayes'
}

# Bayesian search is driven by this metric, so the name must match a key that
# `wandb_train` logs.
metric = {
    'name' : 'test_custom_5',
    'goal' : 'maximize'
}

sweep_config['metric'] = metric

# Search space for the CatBoost regressor.
# For reference, an earlier discrete exploration used:
#   iterations [1000, 5000], learning_rate [0.005, 0.01, 0.05, 0.1, 0.02276],
#   depth [4, 6, 8], subsample [0.8, 1], colsample_bylevel [1],
#   min_data_in_leaf [1, 20, 50]
parameters_dict = {
    'iterations': { # Number of boosting iterations
        'values' : [1000, 2000]
    },
    'learning_rate' : { # Learning rate, sampled on a log scale
        'distribution' : 'log_uniform_values',
        'min' : 0.005,
        'max' : 0.1
    },
    'depth' : { # Depth of the trees
        'values' :  [2, 5, 6, 10]
    },
    'subsample' : { # Subsample ratio
        'distribution' : 'uniform',
        'min' : 0.8,
        'max' : 1
    },
    'colsample_bylevel' : { # Column subsample ratio
        'values' : [0.5, 1]
    },
    'min_data_in_leaf' : { # Minimum number of data in leaf
        'values' : [1, 20]
    },
}

sweep_config["parameters"] = parameters_dict

# Registers the sweep with W&B; the returned id is what the agent cell consumes.
sweep_id = wandb.sweep(sweep_config, project="olx")

sweep_config

Create sweep with ID: znegrnqe
Sweep URL: https://wandb.ai/sergi-andreu/olx/sweeps/znegrnqe


{'method': 'bayes',
 'metric': {'name': 'test_custom_5', 'goal': 'maximize'},
 'parameters': {'iterations': {'values': [1000]},
  'learning_rate': {'distribution': 'log_uniform_values',
   'min': 0.005,
   'max': 0.1},
  'depth': {'values': [2, 5, 6, 10]},
  'subsample': {'distribution': 'uniform', 'min': 0.8, 'max': 1},
  'colsample_bylevel': {'values': [0.5, 1]},
  'min_data_in_leaf': {'values': [1, 20]}}}

In [None]:
%%capture
# Start an agent that pulls configurations from the sweep and runs `wandb_train`
# once per configuration.
# NOTE(review): no `count` is passed, so the agent keeps requesting trials; a
# Bayesian sweep presumably never finishes on its own and has to be interrupted.
# Consider `count=<n>` so Restart & Run All terminates — confirm against the W&B docs.
wandb.agent(sweep_id, function=wandb_train);

[34m[1mwandb[0m: Agent Starting Run: gg6s5n6n with config:
[34m[1mwandb[0m: 	colsample_bylevel: 0.5
[34m[1mwandb[0m: 	depth: 2
[34m[1mwandb[0m: 	iterations: 1000
[34m[1mwandb[0m: 	learning_rate: 0.08270812901812892
[34m[1mwandb[0m: 	min_data_in_leaf: 1
[34m[1mwandb[0m: 	subsample: 0.9803219298593951
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0:	learn: 0.9617518	total: 138ms	remaining: 2m 17s
1:	learn: 0.9514275	total: 143ms	remaining: 1m 11s
2:	learn: 0.9423258	total: 148ms	remaining: 49.3s
3:	learn: 0.9343842	total: 154ms	remaining: 38.3s
4:	learn: 0.9265702	total: 161ms	remaining: 32.1s
5:	learn: 0.9193966	total: 166ms	remaining: 27.5s
6:	learn: 0.9135523	total: 173ms	remaining: 24.5s
7:	learn: 0.9084819	total: 179ms	remaining: 22.2s
8:	learn: 0.9029106	total: 183ms	remaining: 20.2s
9:	learn: 0.8971149	total: 193ms	remaining: 19.1s
10:	learn: 0.8912272	total: 198ms	remaining: 17.8s
11:	learn: 0.8855924	total: 203ms	remaining: 16.7s
12:	learn: 0.8806040	total: 210ms	remaining: 16s
13:	learn: 0.8776029	total: 215ms	remaining: 15.1s
14:	learn: 0.8741673	total: 221ms	remaining: 14.5s
15:	learn: 0.8704188	total: 226ms	remaining: 13.9s
16:	learn: 0.8661135	total: 230ms	remaining: 13.3s
17:	learn: 0.8630395	total: 236ms	remaining: 12.9s
18:	learn: 0.8590543	total: 241ms	remaining: 12.5s
19:	learn: 0.8555603	total: 246ms	remaini

0,1
test_custom_1,▁
test_custom_10,▁
test_custom_10_std,▁
test_custom_1_std,▁
test_custom_20,▁
test_custom_20_std,▁
test_custom_5,▁
test_custom_5_std,▁
test_explained_variance,▁
test_explained_variance_std,▁

0,1
test_custom_1,7.8892
test_custom_10,64.1201
test_custom_10_std,0.1741
test_custom_1_std,0.22094
test_custom_20,88.54811
test_custom_20_std,0.24788
test_custom_5,37.03918
test_custom_5_std,0.32485
test_explained_variance,0.56119
test_explained_variance_std,0.08273


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 0cob2l5u with config:
[34m[1mwandb[0m: 	colsample_bylevel: 0.5
[34m[1mwandb[0m: 	depth: 6
[34m[1mwandb[0m: 	iterations: 1000
[34m[1mwandb[0m: 	learning_rate: 0.08094956841512399
[34m[1mwandb[0m: 	min_data_in_leaf: 20
[34m[1mwandb[0m: 	subsample: 0.9199567435977296
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0:	learn: 0.9496780	total: 10.7ms	remaining: 10.6s
1:	learn: 0.9275672	total: 21.5ms	remaining: 10.7s
2:	learn: 0.9129208	total: 42.9ms	remaining: 14.2s
3:	learn: 0.8919514	total: 54.5ms	remaining: 13.6s
4:	learn: 0.8740562	total: 64.6ms	remaining: 12.9s
5:	learn: 0.8590366	total: 75.5ms	remaining: 12.5s
6:	learn: 0.8486419	total: 86.1ms	remaining: 12.2s
7:	learn: 0.8367338	total: 98ms	remaining: 12.2s
8:	learn: 0.8272830	total: 110ms	remaining: 12.1s
9:	learn: 0.8142587	total: 121ms	remaining: 11.9s
10:	learn: 0.8068802	total: 132ms	remaining: 11.9s
11:	learn: 0.7975265	total: 143ms	remaining: 11.7s
12:	learn: 0.7875744	total: 156ms	remaining: 11.8s
13:	learn: 0.7791931	total: 166ms	remaining: 11.7s
14:	learn: 0.7711387	total: 179ms	remaining: 11.7s
15:	learn: 0.7643463	total: 190ms	remaining: 11.7s
16:	learn: 0.7589176	total: 201ms	remaining: 11.7s
17:	learn: 0.7540049	total: 214ms	remaining: 11.7s
18:	learn: 0.7470088	total: 224ms	remaining: 11.6s
19:	learn: 0.7404028	total: 238ms	r