In [None]:
import sys
import re

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
import xgboost as xgb

from tqdm import tqdm

In [None]:
# Destination labels found in the "country_destination" column. One binary
# (one-vs-rest) classifier is trained per entry, in this order, and the same
# order determines the order of the Pred<label> columns written at the end.
country_levels = [
    "NDF", "US", "other", "FR",
    "IT", "GB", "ES", "CA",
    "DE", "NL", "AU", "PT",
]


class LogStream:
    """Write-only, file-like sink that records text and parses log-loss lines.

    Every string passed to ``write`` is kept verbatim in ``logs``. When a
    string starts with an evaluation line of the form
    ``[<iteration>] train-logloss:<x> val-logloss:<y>`` (whitespace
    separated), the two losses are additionally stored in the
    ``train_logloss`` / ``val_logloss`` Series under the iteration number.
    Writing the same iteration again overwrites the earlier values.
    """

    # Anchored at the start of the message (used with ``match``), e.g.
    # "[300]\ttrain-logloss:0.41\tval-logloss:0.43".
    _EVAL_LINE = re.compile(
        r"\[(\d+)\]\s+train-logloss:(\S+)\s+val-logloss:(\S+)"
    )

    def __init__(self):
        # Raw chunks exactly as they were written, in arrival order.
        self.logs = []
        # Loss curves indexed by boosting iteration.
        self.train_logloss = pd.Series(dtype=float)
        self.val_logloss = pd.Series(dtype=float)

    def write(self, message):
        """Store ``message`` and, if it is an evaluation line, record its losses."""
        self.logs.append(message)

        parsed = self._EVAL_LINE.match(message)
        if parsed is None:
            return

        # Convert everything before touching the Series so a malformed number
        # leaves both curves unchanged.
        round_no = int(parsed.group(1))
        train_value, val_value = (float(text) for text in parsed.group(2, 3))

        self.train_logloss.at[round_no] = train_value
        self.val_logloss.at[round_no] = val_value

    def flush(self):
        """No-op; present so the object can stand in for a text stream."""
        pass

In [None]:
# Load the preprocessed training set (Feather file) and preview it as the cell output.
train_data = pd.read_feather(path="data/preprocessed/train_data")
train_data

In [None]:
# Load the preprocessed test set (Feather file) and preview it as the cell output.
test_data = pd.read_feather(path="data/preprocessed/test_data")
test_data

In [None]:
def get_binary_outcome(train, test, country_name, num_folds, pbar):
    """Train a one-vs-rest XGBoost classifier for one destination using K-fold CV.

    The target is 1 where ``train["country_destination"] == country_name`` and
    0 otherwise. Every column except ``"id"`` and ``"country_destination"`` is
    used as a feature (both columns must be present in ``train`` and ``test``).

    Parameters
    ----------
    train, test : pandas.DataFrame
        Training and test frames; neither is modified.
    country_name : str
        Destination label treated as the positive class.
    num_folds : int
        Number of shuffled K-fold splits (fixed ``random_state=42``).
    pbar : progress-bar-like object
        ``pbar.update(1)`` is called once per completed fold.

    Returns
    -------
    dict
        ``"train_pred"``: out-of-fold predictions, one per row of ``train``.
        ``"test_pred"``: test predictions averaged over the folds.
        ``"train_logloss_log"`` / ``"val_logloss_log"``: Series of the losses
        parsed by ``LogStream``, indexed by boosting iteration. Folds share
        one stream, so a later fold overwrites an earlier fold's value at the
        same iteration.

    Notes
    -----
    ``sys.stdout`` is redirected to a ``LogStream`` while training so that the
    periodic evaluation lines can be parsed. It is always restored, even if
    training raises. The parameters request ``"device": "cuda"``.
    """
    non_feature_columns = ["id", "country_destination"]

    label_values = (train["country_destination"] == country_name).astype(int).values

    # Feature matrices are the same for every fold, so extract them once and
    # slice by position inside the loop.
    train_features = train.drop(columns=non_feature_columns).values
    test_dm = xgb.DMatrix(data=test.drop(columns=non_feature_columns).values)

    train_pred = np.zeros(len(train))
    test_pred = np.zeros(len(test))

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Hyper parameters (identical for all folds)
    params = {
        "tree_method": "hist",
        "device": "cuda",
        "max_depth": 6,
        "eta": 0.03,
        "booster": "gbtree",
        "subsample": 0.7,
        "colsample_bytree": 0.7,
        "objective": "binary:logistic",
    }

    log_stream = LogStream()
    original_stdout = sys.stdout
    sys.stdout = log_stream
    try:
        for fold_idx, (train_idx, val_idx) in enumerate(kf.split(train)):
            print(f"Fold {fold_idx}")

            # Data preparation
            train_dm = xgb.DMatrix(
                data=train_features[train_idx],
                label=label_values[train_idx],
            )
            val_dm = xgb.DMatrix(
                data=train_features[val_idx],
                label=label_values[val_idx],
            )

            watchlist = [(train_dm, "train"), (val_dm, "val")]

            # Training
            model = xgb.train(
                params=params,
                dtrain=train_dm,
                num_boost_round=2000,
                early_stopping_rounds=50,
                evals=watchlist,
                verbose_eval=300,
            )

            # iteration_range is half-open [begin, end): the end must be
            # best_iteration + 1 to include the best round. Without the +1 the
            # best tree is dropped, and best_iteration == 0 would yield (0, 0),
            # which XGBoost interprets as "use every tree".
            best_range = (0, model.best_iteration + 1)

            # Prediction. Each training row belongs to exactly one validation
            # fold, so the out-of-fold slot is written once.
            train_pred[val_idx] = model.predict(val_dm, iteration_range=best_range)
            test_pred += model.predict(test_dm, iteration_range=best_range)

            pbar.update(1)
    finally:
        # Never leave the kernel's stdout pointing at the capture object.
        sys.stdout = original_stdout

    # Normalization: average the per-fold test predictions
    test_pred /= num_folds

    return {
        "train_pred": train_pred,
        "test_pred": test_pred,
        "train_logloss_log": log_stream.train_logloss,
        "val_logloss_log": log_stream.val_logloss,
    }

In [None]:
num_folds = 2

train_results = {}
test_results = {}
# Loss curves are gathered here and turned into a DataFrame in one step.
# Assigning them column-by-column into an initially empty frame would fix the
# frame's index to the first country's logged iterations and silently drop
# rows that only later countries logged (each model may stop at a different
# iteration).
log_columns = {}

with tqdm(
    total=len(country_levels) * num_folds, desc="Training Binary Classifier"
) as pbar:
    for country in country_levels:
        results = get_binary_outcome(train_data, test_data, country, num_folds, pbar)
        train_results[country] = results["train_pred"]
        test_results[country] = results["test_pred"]
        log_columns[f"{country}_train"] = results["train_logloss_log"]
        log_columns[f"{country}_val"] = results["val_logloss_log"]

# Building from a dict of Series aligns on the union of all logged iterations
# (missing entries become NaN). Rows are ordered by iteration before the
# iteration index is replaced by a plain 0..n-1 index.
log_df = pd.DataFrame(log_columns).sort_index().reset_index(drop=True)
log_df

In [None]:
# Attach each binary classifier's output as a "Pred<country>" column on both
# frames (in place), then persist the augmented frames as Feather files.
for country in country_levels:
    pred_column = f"Pred{country}"
    train_data[pred_column] = train_results[country]
    test_data[pred_column] = test_results[country]

for frame, destination in (
    (train_data, "data/preprocessed/train_data_with_binary_classification"),
    (test_data, "data/preprocessed/test_data_with_binary_classification"),
):
    frame.to_feather(destination)