In [7]:
from __future__ import annotations
from pathlib import Path
from typing import Union
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier


In [8]:


def load_training_dataframe(parquet_root: Union[str, Path]) -> pd.DataFrame:
    """Load every parquet file under a directory and coerce it for training.

    Parameters
    ----------
    parquet_root:
        Directory containing parquet files. All ``*.parquet`` files found
        recursively under this directory are read (in sorted path order) and
        concatenated into a single DataFrame.

    Returns
    -------
    pd.DataFrame
        The concatenated data with these transformations applied:

        * rows containing any missing value are dropped;
        * object columns other than ``match_id`` whose values are all
          "true"/"false" (case-insensitive, or Python bools) become ``bool``;
          every other object column is replaced by its category codes;
        * float columns whose values are all exactly whole numbers become
          integers;
        * numeric columns containing only 0/1 become ``bool``; other numeric
          columns are integer-downcast where that is lossless.

        ``match_id`` is left as-is when it is an object column.

    Raises
    ------
    FileNotFoundError
        If no parquet file exists under ``parquet_root``.
    """

    parquet_root = Path(parquet_root)
    files = sorted(parquet_root.rglob("*.parquet"))
    if not files:
        raise FileNotFoundError(f"No parquet files found under {parquet_root}")

    # Read all parquet files and concatenate
    frames = [pd.read_parquet(f, engine="pyarrow") for f in files]
    df = pd.concat(frames, ignore_index=True)

    # Drop any rows containing NA values
    df = df.dropna().reset_index(drop=True)

    # Convert object columns to boolean or categorical codes.
    # errors="ignore": match_id may be absent from the object columns (e.g. a
    # numeric id), which must not abort the load.
    obj_cols = df.select_dtypes(include="object").columns
    obj_cols = obj_cols.drop("match_id", errors="ignore")
    for col in obj_cols:
        # astype(str) first: an object column may hold Python bools (a nullable
        # boolean parquet column after dropna), on which the .str accessor
        # would raise.
        lower = df[col].astype(str).str.lower()
        if set(lower.unique()) <= {"true", "false"}:
            df[col] = lower == "true"
        else:
            df[col] = df[col].astype("category").cat.codes

    # Convert numeric columns to int or bool where possible
    num_cols = df.select_dtypes(include="number").columns
    for col in num_cols:
        series = df[col]
        if pd.api.types.is_float_dtype(series) and _is_whole_valued(series):
            # Explicit int64 so the result does not depend on the platform's
            # default C integer width.
            series = series.astype("int64")
        if set(series.unique()) <= {0, 1}:
            series = series.astype(bool)
        else:
            series = pd.to_numeric(series, downcast="integer")
        df[col] = series

    return df


def _is_whole_valued(series: pd.Series) -> bool:
    """Return True when every value is a finite whole number that fits int64."""
    values = series.to_numpy(dtype="float64")
    # Exact comparison on purpose: a tolerance-based check (np.allclose) scales
    # with magnitude and would accept e.g. 100000.5 as "integral".
    return bool(
        np.isfinite(values).all()
        and (np.abs(values) < 2.0 ** 63).all()
        and (values == np.trunc(values)).all()
    )

In [9]:
# Build the training frame from every parquet file found under data/matches.
data = load_training_dataframe(parquet_root="data/matches")

In [10]:
# Split by match rather than by row, so every row of a given match ends up on
# the same side of the train/test boundary.
SPLIT_SEED = 42        # fixed seed -> identical split after Restart & Run All
TRAIN_FRACTION = 0.8   # share of distinct matches assigned to the training set

matches = data['match_id'].unique()
# Everything except the grouping key and the label is used as a feature.
feature_cols = [c for c in data.columns if c not in ('match_id', 'y_match')]

# A dedicated, seeded generator keeps the split reproducible and independent
# of whatever else has touched NumPy's global random state.
split_rng = np.random.default_rng(SPLIT_SEED)
train_ids = set(
    split_rng.choice(matches, size=int(TRAIN_FRACTION * len(matches)), replace=False)
)

# Compute the membership mask once and reuse it for both partitions.
in_train = data['match_id'].isin(train_ids)
train_data = data[in_train]
test_data  = data[~in_train]
X_train, y_train = train_data[feature_cols], train_data['y_match']
X_test,  y_test  = test_data[feature_cols],  test_data['y_match']


In [13]:
# Gradient-boosted binary classifier; fitting stops early once the evaluation
# metric has not improved for `early_stopping_rounds` consecutive rounds.
model = XGBClassifier(
    # --- learning task ---
    objective="binary:logistic",   # predict P(class == 1)
    eval_metric="logloss",         # metric computed on eval_set during fit
    # --- boosting schedule ---
    n_estimators=1000,             # upper bound on the number of trees
    learning_rate=0.1,             # shrinkage applied to each tree's contribution
    early_stopping_rounds=25,      # patience, in boosting rounds
    # --- tree complexity ---
    max_depth=50,                  # maximum depth of each tree
)


In [None]:
# Early stopping picks the number of trees from eval_set, so eval_set must not
# be the test set: that would leak test information into model selection and
# make the test metrics optimistic. Hold out a match-level validation subset of
# the training data instead and keep (X_test, y_test) for final evaluation only.
VAL_FRACTION = 0.1   # share of training matches reserved for early stopping
VAL_SEED = 0         # fixed seed -> reproducible validation subset

fit_matches = train_data['match_id'].unique()
val_rng = np.random.default_rng(VAL_SEED)
n_val_matches = max(1, int(VAL_FRACTION * len(fit_matches)))
val_ids = set(val_rng.choice(fit_matches, size=n_val_matches, replace=False))

is_val = train_data['match_id'].isin(val_ids)
X_fit, y_fit = train_data.loc[~is_val, feature_cols], train_data.loc[~is_val, 'y_match']
X_val, y_val = train_data.loc[is_val, feature_cols],  train_data.loc[is_val, 'y_match']

model.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    verbose=False
)
