In [1]:
import sys
import re

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
import xgboost as xgb

from tqdm import tqdm

In [2]:
country_levels = [
    "NDF",
    "US",
    "other",
    "FR",
    "IT",
    "GB",
    "ES",
    "CA",
    "DE",
    "NL",
    "AU",
    "PT",
]


class LogStream:
    def __init__(self):
        self.logs = []
        self.train_logloss = pd.Series(dtype=float)
        self.val_logloss = pd.Series(dtype=float)

    def write(self, message):
        self.logs.append(message)

        match = re.match(
            r"\[(\d+)\]\s+train-logloss:(\S+)\s+val-logloss:(\S+)", message
        )
        if match:
            iteration = int(match.group(1))
            train_loss = float(match.group(2))
            val_loss = float(match.group(3))
            self.train_logloss.at[iteration] = train_loss
            self.val_logloss.at[iteration] = val_loss

    def flush(self):
        pass

In [3]:
train_data = pd.read_feather("data/preprocessed/train_data")
train_data

Unnamed: 0,id,date_account_created_year,date_account_created_month,date_account_created_day,timestamp_first_active_year,timestamp_first_active_month,timestamp_first_active_day,age_group,destination_distance_km,destination_area,...,device_type_Linux Desktop,device_type_Mac Desktop,device_type_Tablet,device_type_Windows Desktop,device_type_Windows Phone,device_type_iPad Tablet,device_type_iPhone,device_type_iPodtouch,booking,not_so_english
0,gxn3p5htnn,2010,6,28,2009,3,19,-42424242.0,-42424242.0,-42424242.0,...,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0
1,820tgsjxq7,2011,5,25,2009,5,23,7.0,-42424242.0,-42424242.0,...,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0
2,4ft3gnwmtx,2010,9,28,2009,6,9,11.0,0.0,9826675.0,...,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0
3,bjjt8pjhuk,2011,12,5,2009,10,31,8.0,-42424242.0,-42424242.0,...,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0
4,87mebub9p4,2010,9,14,2009,12,8,8.0,0.0,9826675.0,...,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0,-42424242.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213446,zxodksqpep,2014,6,30,2014,6,30,6.0,-42424242.0,-42424242.0,...,0.0,108.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0
213447,mhewnxesx9,2014,6,30,2014,6,30,-42424242.0,-42424242.0,-42424242.0,...,0.0,2.0,0.0,232.0,0.0,0.0,4.0,0.0,0.0,0.0
213448,6o3arsjbb4,2014,6,30,2014,6,30,6.0,-42424242.0,-42424242.0,...,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
213449,jh95kwisub,2014,6,30,2014,6,30,-42424242.0,-42424242.0,-42424242.0,...,0.0,0.0,0.0,0.0,0.0,0.0,75.0,0.0,0.0,0.0


In [4]:
test_data = pd.read_feather("data/preprocessed/test_data")
test_data

Unnamed: 0,id,date_account_created_year,date_account_created_month,date_account_created_day,timestamp_first_active_year,timestamp_first_active_month,timestamp_first_active_day,age_group,destination_distance_km,destination_area,...,device_type_Linux Desktop,device_type_Mac Desktop,device_type_Tablet,device_type_Windows Desktop,device_type_Windows Phone,device_type_iPad Tablet,device_type_iPhone,device_type_iPodtouch,booking,not_so_english
213451,5uwns89zht,2014,7,1,2014,7,1,7.0,-42424242.0,-42424242.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
213452,jtl0dijy2j,2014,7,1,2014,7,1,-42424242.0,-42424242.0,-42424242.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0
213453,xx0ulgorjt,2014,7,1,2014,7,1,-42424242.0,-42424242.0,-42424242.0,...,0.0,0.0,0.0,58.0,0.0,0.0,0.0,0.0,0.0,0.0
213454,6c6puo6ix0,2014,7,1,2014,7,1,-42424242.0,-42424242.0,-42424242.0,...,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0
213455,czqhjk3yfe,2014,7,1,2014,7,1,-42424242.0,-42424242.0,-42424242.0,...,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275542,cv0na2lf5a,2014,9,30,2014,9,30,6.0,-42424242.0,-42424242.0,...,0.0,0.0,0.0,89.0,0.0,0.0,4.0,0.0,0.0,1.0
275543,zp8xfonng8,2014,9,30,2014,9,30,-42424242.0,-42424242.0,-42424242.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
275544,fa6260ziny,2014,9,30,2014,9,30,-42424242.0,-42424242.0,-42424242.0,...,0.0,0.0,0.0,78.0,0.0,0.0,0.0,0.0,0.0,0.0
275545,87k0fy4ugm,2014,9,30,2014,9,30,-42424242.0,-42424242.0,-42424242.0,...,0.0,11.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0


In [5]:
def get_binary_outcome(train, test, country_name, num_folds, pbar):
    """
    Train a binary classification model to predict a binary target for a specific country.
    :param train: Training data
    :param test: Testing data
    :param country_name: The target country for the current prediction
    :return: A dictionary containing prediction results for both the training and testing datasets
    """
    label_country = (train["country_destination"] == country_name).astype(int)

    train_pred = np.zeros(len(train))
    test_pred = np.zeros(len(test))

    test_dm = xgb.DMatrix(data=test.drop(columns=["id", "country_destination"]).values)

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    log_stream = LogStream()
    original_stdout = sys.stdout
    sys.stdout = log_stream

    for fold_idx, (train_idx, val_idx) in enumerate(kf.split(train)):
        print(f"Fold {fold_idx}")

        # Data preparation
        train_data = train.iloc[train_idx]
        val_data = train.iloc[val_idx]
        train_dm = xgb.DMatrix(
            data=train_data.drop(columns=["id", "country_destination"]).values,
            label=label_country.iloc[train_idx].values,
        )
        val_dm = xgb.DMatrix(
            data=val_data.drop(columns=["id", "country_destination"]).values,
            label=label_country.iloc[val_idx].values,
        )

        # Hyper parameters
        params = {
            "tree_method": "hist",
            "device": "cuda",
            "max_depth": 6,
            "eta": 0.03,
            "booster": "gbtree",
            "subsample": 0.7,
            "colsample_bytree": 0.7,
            "objective": "binary:logistic",
        }

        watchlist = [(train_dm, "train"), (val_dm, "val")]

        # Training
        model = xgb.train(
            params=params,
            dtrain=train_dm,
            num_boost_round=2000,
            early_stopping_rounds=50,
            evals=watchlist,
            verbose_eval=300,
        )

        best_iteration = model.best_iteration

        # Prediction
        train_pred[val_idx] += model.predict(
            val_dm, iteration_range=(0, best_iteration)
        )
        test_pred += model.predict(test_dm, iteration_range=(0, best_iteration))

        pbar.update(1)

    # Normalization
    test_pred /= num_folds

    sys.stdout = original_stdout

    return {
        "train_pred": train_pred,
        "test_pred": test_pred,
        "train_logloss_log": log_stream.train_logloss,
        "val_logloss_log": log_stream.val_logloss,
    }

In [6]:
num_folds = 2

train_results = {}
test_results = {}
log_df = pd.DataFrame()

with tqdm(
    total=len(country_levels) * num_folds, desc="Training Binary Classifier"
) as pbar:
    for country in country_levels:
        results = get_binary_outcome(train_data, test_data, country, num_folds, pbar)
        train_results[country] = results["train_pred"]
        test_results[country] = results["test_pred"]
        log_df[f"{country}_train"] = results["train_logloss_log"]
        log_df[f"{country}_val"] = results["val_logloss_log"]

log_df.reset_index(drop=True, inplace=True)
log_df

Training Binary Classifier: 100%|██████████| 24/24 [06:19<00:00, 15.80s/it]


Unnamed: 0,NDF_train,NDF_val,US_train,US_val,other_train,other_val,FR_train,FR_val,IT_train,IT_val,...,CA_train,CA_val,DE_train,DE_val,NL_train,NL_val,AU_train,AU_val,PT_train,PT_val
0,0.65541,0.65401,0.57727,0.57411,0.23242,0.23529,0.17485,0.17435,0.15276,0.15216,...,0.13774,0.13797,0.13468,0.1337,0.13103,0.1309,0.12887,0.12846,0.12548,0.12509
1,0.13521,0.14683,0.00011,0.00011,0.13424,0.14679,6e-05,6e-05,5e-05,5e-05,...,4e-05,4e-05,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,5e-05,4e-05
2,0.13158,0.14253,,,,,,,,,...,,,,,,,,,,
3,0.12866,0.14651,,,,,,,,,...,,,,,,,,,,


In [7]:
for country in country_levels:
    train_data[f"Pred{country}"] = train_results[country]
    test_data[f"Pred{country}"] = test_results[country]

train_data.to_feather("data/preprocessed/train_data_with_binary_classification")
test_data.to_feather("data/preprocessed/test_data_with_binary_classification")