# Catboost feature elimination

Two many features is bad. For some reasons:

- We just did an "*all in*" approach: looks like a good feature? just feed it
- Performance can be hurt by having so many features
- Some different features may be giving duplicated info
- Portability and costs can be hurt, specially when using BERT embedding features

This notebook is about **finding a reduced set of features to use for our final model**-

The knowledge would be incorporated into the **11_Final_model_training.ipynb** notebook.

## Import standard libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
import time
from sklearn.preprocessing import StandardScaler

from sklearn.impute import SimpleImputer

from catboost import CatBoostRegressor

## Import custom scripts

In [2]:
import sys, os
sys.path.append(os.getcwd()+ "/../")
from src.data_preprocessing import DataPreprocessing

## Load all the features
The datapreprocessing pipeline is doing quite some stuff, and in a non-efficient manner (I don't have much time for optimizing that :( )
But it should be less than 2 min

In [3]:
dp = DataPreprocessing(df_path = "../data/real_estate_ads_2022_10.csv",
                        train_indices_path="../data/train_indices.npy", 
                        test_indices_path="../data/test_indices.npy",
                        get_params_from_params=True,
                        get_tfidf_embeddings_flag=True,
                        get_bert_embeddings_flag=True,
                        get_textual_features_flag=True,
                        transform_time_features_flag=True,
                        transform_cyclic_features_flag=True)

In [4]:
import importlib
import src.compute_metrics
importlib.reload(src.compute_metrics) # We do this for debugging purposes

from src.compute_metrics import Metrics

## Split train / test data
We can use the datapreprocessing method for that.

This is done for better reproducibility, but can be done with the sklearn train / test split, and setting a seed should suffice.

In [5]:
X_train, X_test = dp.get_train_test_split(dp.X)
y_train, y_test = dp.get_train_test_split(dp.Y)

## Define a function for training
We will use k-fold cross-validation

In [6]:
def train_catboost_and_get_metrics(X, y, 
                                    catboost_params = {},
                                    backward_transform_label=True, 
                                    backward_standardize_flag=False, verbose=False,
                                    standard_scale_flag=False,
                                    impute_flag=False):

    if impute_flag:
        imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
        X = pd.DataFrame(imp_mean.fit_transform(X), columns=X.columns)

    bst = CatBoostRegressor(**catboost_params)

    metrics = Metrics(dp=dp, backward_transform_flag=backward_transform_label, backward_standardize_flag=backward_standardize_flag)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    for train_index, test_index in kf.split(X, y):
        X_train, X_val = X.iloc[train_index], X.iloc[test_index]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]

        if standard_scale_flag:
            scaler = StandardScaler()
            X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
            X_val = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)

        bst.fit(X_train, y_train)
        y_pred = bst.predict(X_val)

        if verbose: 
            print(f"y_pred: {y_pred[:5]}")
            print(f"y_val: {y_val[:5]}")
            
        computed_metrics = metrics.get_single_train_val_metrics(bst, X_train, y_train, X_val, y_val)
        metrics.append(computed_metrics)

    average_metrics = metrics.get_average()
    std_metrics = metrics.get_std()
    # Add _std to the keys to differentiate them from the average metrics:
    std_metrics = {f"{key}_std" : value for key, value in std_metrics.items()} 

    return {**average_metrics, **std_metrics}

## Define convenience functions for prettier display

In [7]:
def filter_metrics(metrics_dict, only_validation=True, format_mean_std_together=True):

    if only_validation:
        metrics_dict = {key: value for key, value in metrics_dict.items() if "test_" in key}

    if format_mean_std_together:
        metrics_dict = {key: f"{value:.2f} ± {metrics_dict[key+'_std']:.2f}" for key, value in metrics_dict.items() if "std" not in key}

    return metrics_dict

def highlight_max(s):
    is_max = s == s.replace("nan ± nan", "0").apply(lambda x: x.split("+-")[0]).max()
    return ['font-weight: bold' if v else '' for v in is_max]

def highlight_min(s):
    is_min = s == s.replace("nan ± nan", "0").apply(lambda x: x.split("+-")[0]).min()
    return ['font-weight: bold' if v else '' for v in is_min]

def format_results_df(results, column_names=None):
    results_df = pd.DataFrame(results).T

    if column_names is not None:
        results_df.columns = column_names
    
    def apply_highlight(column):
        if column.name in ["test_explained_variance", "test_r2", "test_custom_1", "test_custom_5", "test_custom_10", "test_custom_20"]:
            return highlight_max(column)
        else:
            return highlight_min(column)

    
    return results_df.style.apply(apply_highlight, axis=1)

## Try out different feature elimination techniques

In [10]:
cols = X_train.columns.tolist()

tfidf_features = [col for col in cols if "tfidf" in col]
bert_features = [col for col in cols if "bert" in col]

params_features = [col for col in cols if "params" in col]


X_train_temp = X_train.copy().drop(columns=tfidf_features + bert_features)
results_catboost_without_embeddings = train_catboost_and_get_metrics(X_train_temp, y_train, 
                                                                catboost_params = {"verbose" : False})

X_train_temp = X_train.copy().drop(columns=params_features)
results_catboost_without_params = train_catboost_and_get_metrics(X_train_temp, y_train, 
                                                                catboost_params = {"verbose" : False})

X_train_temp = X_train.copy().drop(columns=tfidf_features + bert_features + params_features)
results_catboost_without_embeddings_and_params = train_catboost_and_get_metrics(X_train_temp, y_train, 
                                                                catboost_params = {"verbose" : False})

X_train_temp = X_train.copy().drop(columns=bert_features)
results_catboost_without_bert = train_catboost_and_get_metrics(X_train_temp, y_train, 
                                                                catboost_params = {"verbose" : False})



In [13]:
X_train_temp = X_train.copy()
results_catboost_all = train_catboost_and_get_metrics(X_train_temp, y_train, 
                                                                catboost_params = {"verbose" : False})



In [15]:
df_results = pd.concat([pd.DataFrame(filter_metrics(results_catboost_without_embeddings, only_validation=True, format_mean_std_together=True), index=["Without embeddings"]),
                        pd.DataFrame(filter_metrics(results_catboost_without_params, only_validation=True, format_mean_std_together=True), index=["Without params"]),
                        pd.DataFrame(filter_metrics(results_catboost_without_embeddings_and_params, only_validation=True, format_mean_std_together=True), index=["Without embeddings and params"]),
                        pd.DataFrame(filter_metrics(results_catboost_without_bert, only_validation=True, format_mean_std_together=True), index=["Without bert"]),
                        pd.DataFrame(filter_metrics(results_catboost_all, only_validation=True, format_mean_std_together=True), index=["All features"])])

format_results_df(df_results, column_names=["Without embeddings", "Without params", "Without embeddings and params", "Without bert", "All features"])

Unnamed: 0,Without embeddings,Without params,Without embeddings and params,Without bert,All features
test_explained_variance,0.63 ± 0.09,0.64 ± 0.10,0.63 ± 0.09,0.65 ± 0.10,0.64 ± 0.10
test_r2,0.63 ± 0.09,0.64 ± 0.10,0.63 ± 0.09,0.65 ± 0.10,0.64 ± 0.10
test_mape,217.73 ± 73.98,201.55 ± 77.81,217.73 ± 73.98,203.94 ± 98.59,201.55 ± 77.81
test_median_absolute_error,451.92 ± 3.56,444.31 ± 4.52,451.92 ± 3.56,440.94 ± 6.40,444.31 ± 4.52
test_mean_absolute_error,694.41 ± 5.52,682.71 ± 6.87,694.41 ± 5.52,673.32 ± 8.47,682.71 ± 6.87
test_mean_squared_log_error,0.13 ± 0.03,0.12 ± 0.02,0.13 ± 0.03,0.12 ± 0.03,0.12 ± 0.02
test_custom_1,10.54 ± 0.24,10.30 ± 0.38,10.54 ± 0.24,10.46 ± 0.27,10.30 ± 0.38
test_custom_5,45.27 ± 0.11,45.68 ± 0.52,45.27 ± 0.11,46.22 ± 0.76,45.68 ± 0.52
test_custom_10,71.54 ± 0.23,71.89 ± 0.18,71.54 ± 0.23,72.38 ± 0.15,71.89 ± 0.18
test_custom_20,91.61 ± 0.18,92.03 ± 0.24,91.61 ± 0.18,92.32 ± 0.24,92.03 ± 0.24


### Use catboost feature elimination

In [16]:
new_bst = CatBoostRegressor(verbose=False)

new_bst.select_features(X_train, y_train,
                eval_set=None,
                features_for_select=X_train.columns.tolist(),
                num_features_to_select=10,
                algorithm=None,
                steps=None,
                shap_calc_type=None,
                train_final_model=True,
                verbose=None,
                logging_level=None,
                plot=False,
                log_cout=sys.stdout,
                log_cerr=sys.stderr)

Learning rate set to 0.077961
Step #1 out of 1
Feature #10 eliminated
Feature #24 eliminated
Feature #53 eliminated
Feature #54 eliminated
Feature #57 eliminated
Feature #49 eliminated
Feature #21 eliminated
Feature #56 eliminated
Feature #105 eliminated
Feature #39 eliminated
Feature #44 eliminated
Feature #59 eliminated
Feature #52 eliminated
Feature #58 eliminated
Feature #50 eliminated
Feature #20 eliminated
Feature #15 eliminated
Feature #12 eliminated
Feature #13 eliminated
Feature #40 eliminated
Feature #104 eliminated
Feature #108 eliminated
Feature #34 eliminated
Feature #16 eliminated
Feature #29 eliminated
Feature #14 eliminated
Feature #18 eliminated
Feature #17 eliminated
Feature #102 eliminated
Feature #33 eliminated
Feature #103 eliminated
Feature #100 eliminated
Feature #38 eliminated
Feature #107 eliminated
Feature #35 eliminated
Feature #19 eliminated
Feature #85 eliminated
Feature #25 eliminated
Feature #22 eliminated
Feature #51 eliminated
Feature #55 eliminated
Fea

{'selected_features': [3, 5, 7, 30, 67, 74, 75, 94, 95, 98],
 'eliminated_features_names': ['param_equipment_types_fridge',
  'param_security_types_roller_shutters',
  'param_building_material_cellular_concrete',
  'param_building_material_concrete',
  'param_building_material_reinforced_concrete',
  'param_heating_other',
  'param_security_types_closed_area',
  'param_building_material_other',
  'created_at_first_dayofweek_cos',
  'param_building_type_infill',
  'param_building_ownership_co_operative_ownership_with_a_land_and_mortgage_registe',
  'param_building_material_wood',
  'param_building_material_brick',
  'param_building_material_silikat',
  'param_heating_tiled_stove',
  'param_security_types_anti_burglary_door',
  'param_equipment_types_washing_machine',
  'param_equipment_types_oven',
  'param_equipment_types_stove',
  'param_building_type_ribbon',
  'created_at_first_dayofweek_sin',
  'updated_at_dayofweek_sin',
  'param_extras_types_usable_room',
  'param_media_types_cab

## Train with only selected features

In [18]:
X_train_temp = X_train.copy()[['m',
  'map_lat',
  'param_building_floors_num',
  'param_extras_types_lift',
  'pls2_tfidf_titles_0',
  'pls2_tfidf_descriptions_0',
  'pls2_tfidf_descriptions_1',
  'total_number_presence_description',
  'created_at_first_year',
  'updated_at_year']]
results_catboost_selected = train_catboost_and_get_metrics(X_train_temp, y_train, 
                                                                catboost_params = {"verbose" : False})


In [19]:
df_results = pd.concat([pd.DataFrame(filter_metrics(results_catboost_selected, only_validation=True, format_mean_std_together=True), index=["Selected features"])])

format_results_df(df_results, column_names=["Selected features"])

Unnamed: 0,Selected features
test_explained_variance,0.57 ± 0.08
test_r2,0.57 ± 0.08
test_mape,229.02 ± 76.24
test_median_absolute_error,524.84 ± 6.41
test_mean_absolute_error,795.26 ± 5.20
test_mean_squared_log_error,0.13 ± 0.03
test_custom_1,9.00 ± 0.36
test_custom_5,40.16 ± 0.55
test_custom_10,65.65 ± 0.72
test_custom_20,88.72 ± 0.13
