In [1]:
import pandas as pd, numpy as np, ast
from time import time
from datetime import timedelta
from sklearn.preprocessing    import StandardScaler
from sklearn.model_selection  import GroupShuffleSplit
from itertools                import product
from tqdm                     import tqdm
import xgboost as xgb

import warnings
from sklearn.exceptions import FitFailedWarning

warnings.filterwarnings("ignore", category=FitFailedWarning)
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
TRAIN_FEATS_PATH   = '../../data/normalized_expanded_train.csv'
TRAIN_META_PATH    = '../../data/train.csv'
TEST_FEATS_PATH    = '../../data/normalized_expanded_test.csv'
TEST_META_PATH     = '../../data/test.csv'
LABELED_TEST_PATH  = '../../data/labeled_test.csv'

In [3]:
FEATURES = [
    'n','degree','closeness','harmonic','betweeness','load','pagerank',
    'eigenvector','katz','information','current_flow_betweeness',
    'percolation','second_order','laplacian'
]

In [4]:
# Hyperparameter grid
param_grid = {
  'max_depth':        [4,6,8],
  'eta':               [0.01,0.05,0.1],
  'subsample':        [0.7,1.0],
  'colsample_bytree': [0.7,1.0],
  'gamma':            [0,1],
  # new ones:
  'min_child_weight': [1,5,10],
  'reg_alpha':        [0,0.01,0.1],
  'reg_lambda':       [1,10,100],
}

In [5]:
# Custom root‐accuracy scoring (1 root per sentence)
def root_score(sent_ids, y_true, probs):
    dfp = pd.DataFrame({'sent': sent_ids, 'y': y_true, 'p': probs})
    picks = dfp.loc[dfp.groupby('sent')['p'].idxmax()]
    return (picks.y == 1).mean()

In [6]:
print("Loading training data…")
exp  = pd.read_csv(TRAIN_FEATS_PATH)
meta = pd.read_csv(TRAIN_META_PATH)
meta['edgelist'] = meta['edgelist'].apply(ast.literal_eval)
df   = exp.merge(
    meta[['language','sentence','edgelist','root']],
    on=['language','sentence']
)

Loading training data…


In [7]:
models = {}
t0 = time()
print("→ Tuning per-language XGB on GPU with early stopping\n")

for lang in tqdm(sorted(df.language.unique())):
    sub = df[df.language == lang].reset_index(drop=True)

    # 1) 80/20 sentence-wise split
    gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    tr_idx, val_idx = next(gss.split(sub, sub.root, groups=sub.sentence))
    train, val = sub.iloc[tr_idx], sub.iloc[val_idx]

    # 2) scale features
    scaler = StandardScaler().fit(train[FEATURES])
    X_tr = scaler.transform(train[FEATURES])
    y_tr = (train.root == train.vertex).astype(int)
    X_val = scaler.transform(val[FEATURES])
    y_val = (val.root   == val.vertex  ).astype(int)

    # 3) build DMatrix
    dtrain = xgb.DMatrix(X_tr, label=y_tr)
    dval   = xgb.DMatrix(X_val, label=y_val)

    # class‐imbalance weight
    pos = y_tr.sum();  neg = len(y_tr) - pos
    spw = (neg/pos) if pos>0 else 1.0

    best_sc, best_cfg, best_bst = -1, None, None

    # 4) exhaustive grid‐search over all your hyperparams
    for md, eta, subs, colsm, gm, mcw, alpha, lmbda in product(
        param_grid['max_depth'],
        param_grid['eta'],
        param_grid['subsample'],
        param_grid['colsample_bytree'],
        param_grid['gamma'],
        param_grid['min_child_weight'],
        param_grid['reg_alpha'],
        param_grid['reg_lambda'],
    ):
        cfg = {
            'objective':         'binary:logistic',
            'eval_metric':       'error',       # basic validation‐error
            'tree_method':       'hist',
            'device':            'cuda',
            'max_depth':         md,
            'eta':               eta,
            'subsample':         subs,
            'colsample_bytree':  colsm,
            'gamma':             gm,
            'min_child_weight':  mcw,
            'alpha':             alpha,
            'lambda':            lmbda,
            'scale_pos_weight':  spw,
            'seed':              42,
            'verbosity':         0,
        }

        # 5) train up to 200 rounds, early-stop on validation‐error
        bst = xgb.train(
            cfg,
            dtrain,
            num_boost_round=200,
            evals=[(dval, 'validation')],
            early_stopping_rounds=20,
            verbose_eval=False
        )

        # 6) now re‐tune the exact #trees on YOUR root_score
        best_iter      = bst.best_iteration
        best_local_sc  = -1
        best_local_it  = best_iter

        for it in (best_iter-20, best_iter, best_iter+20):
            it = int(np.clip(it, 1, best_iter))
            p_it = bst.predict(dval, iteration_range=(0, it))
            sc_it = root_score(val.sentence.values, y_val, p_it)
            if sc_it > best_local_sc:
                best_local_sc, best_local_it = sc_it, it

        # 7) if this combo is the best so far, keep it
        if best_local_sc > best_sc:
            best_sc    = best_local_sc
            best_cfg   = dict(cfg, ntree_limit=best_local_it)
            best_bst   = bst

    print(f"{lang:12s} → val-root-acc={best_sc:.3f}  "
          f"best_round={best_cfg['ntree_limit']}  cfg={best_cfg}")

    # store scaler, final booster and tuned tree‐count
    models[lang] = (scaler, best_bst, best_cfg['ntree_limit'])

print(f"\nTuning completed in {timedelta(seconds=time()-t0)}\n")

→ Tuning per-language XGB on GPU with early stopping



  5%|█████▌                                                                                                                | 1/21 [10:53<3:37:43, 653.19s/it]

Arabic       → val-root-acc=0.570  best_round=1  cfg={'objective': 'binary:logistic', 'eval_metric': 'error', 'tree_method': 'hist', 'device': 'cuda', 'max_depth': 4, 'eta': 0.01, 'subsample': 1.0, 'colsample_bytree': 1.0, 'gamma': 0, 'min_child_weight': 1, 'alpha': 0, 'lambda': 100, 'scale_pos_weight': 17.4775, 'seed': 42, 'verbosity': 0, 'ntree_limit': 1}


 10%|███████████▏                                                                                                          | 2/21 [26:14<4:16:51, 811.13s/it]

Chinese      → val-root-acc=0.340  best_round=6  cfg={'objective': 'binary:logistic', 'eval_metric': 'error', 'tree_method': 'hist', 'device': 'cuda', 'max_depth': 4, 'eta': 0.01, 'subsample': 0.7, 'colsample_bytree': 1.0, 'gamma': 0, 'min_child_weight': 10, 'alpha': 0.1, 'lambda': 1, 'scale_pos_weight': 17.5475, 'seed': 42, 'verbosity': 0, 'ntree_limit': 6}


 14%|████████████████▊                                                                                                     | 3/21 [44:00<4:38:15, 927.51s/it]

Czech        → val-root-acc=0.610  best_round=1  cfg={'objective': 'binary:logistic', 'eval_metric': 'error', 'tree_method': 'hist', 'device': 'cuda', 'max_depth': 4, 'eta': 0.01, 'subsample': 0.7, 'colsample_bytree': 0.7, 'gamma': 0, 'min_child_weight': 1, 'alpha': 0, 'lambda': 100, 'scale_pos_weight': 15.0875, 'seed': 42, 'verbosity': 0, 'ntree_limit': 1}


 19%|██████████████████████▍                                                                                               | 4/21 [59:22<4:22:06, 925.09s/it]

English      → val-root-acc=0.680  best_round=1  cfg={'objective': 'binary:logistic', 'eval_metric': 'error', 'tree_method': 'hist', 'device': 'cuda', 'max_depth': 4, 'eta': 0.01, 'subsample': 0.7, 'colsample_bytree': 1.0, 'gamma': 0, 'min_child_weight': 1, 'alpha': 0, 'lambda': 100, 'scale_pos_weight': 17.715, 'seed': 42, 'verbosity': 0, 'ntree_limit': 1}


 24%|███████████████████████████▌                                                                                        | 5/21 [1:11:08<3:45:39, 846.20s/it]

Finnish      → val-root-acc=0.570  best_round=7  cfg={'objective': 'binary:logistic', 'eval_metric': 'error', 'tree_method': 'hist', 'device': 'cuda', 'max_depth': 4, 'eta': 0.1, 'subsample': 0.7, 'colsample_bytree': 1.0, 'gamma': 0, 'min_child_weight': 1, 'alpha': 0, 'lambda': 100, 'scale_pos_weight': 12.5775, 'seed': 42, 'verbosity': 0, 'ntree_limit': 7}


 29%|█████████████████████████████████▏                                                                                  | 6/21 [1:26:21<3:37:14, 868.99s/it]

French       → val-root-acc=0.480  best_round=1  cfg={'objective': 'binary:logistic', 'eval_metric': 'error', 'tree_method': 'hist', 'device': 'cuda', 'max_depth': 4, 'eta': 0.01, 'subsample': 1.0, 'colsample_bytree': 0.7, 'gamma': 0, 'min_child_weight': 1, 'alpha': 0, 'lambda': 100, 'scale_pos_weight': 21.4075, 'seed': 42, 'verbosity': 0, 'ntree_limit': 1}


 33%|██████████████████████████████████████▋                                                                             | 7/21 [1:42:51<3:32:00, 908.60s/it]

Galician     → val-root-acc=0.550  best_round=1  cfg={'objective': 'binary:logistic', 'eval_metric': 'error', 'tree_method': 'hist', 'device': 'cuda', 'max_depth': 4, 'eta': 0.01, 'subsample': 0.7, 'colsample_bytree': 1.0, 'gamma': 0, 'min_child_weight': 1, 'alpha': 0, 'lambda': 100, 'scale_pos_weight': 20.0975, 'seed': 42, 'verbosity': 0, 'ntree_limit': 1}


 38%|████████████████████████████████████████████▏                                                                       | 8/21 [1:56:39<3:11:17, 882.90s/it]

German       → val-root-acc=0.650  best_round=1  cfg={'objective': 'binary:logistic', 'eval_metric': 'error', 'tree_method': 'hist', 'device': 'cuda', 'max_depth': 4, 'eta': 0.01, 'subsample': 0.7, 'colsample_bytree': 1.0, 'gamma': 0, 'min_child_weight': 1, 'alpha': 0, 'lambda': 100, 'scale_pos_weight': 17.695, 'seed': 42, 'verbosity': 0, 'ntree_limit': 1}


 43%|█████████████████████████████████████████████████▋                                                                  | 9/21 [2:13:49<3:05:45, 928.79s/it]

Hindi        → val-root-acc=0.320  best_round=17  cfg={'objective': 'binary:logistic', 'eval_metric': 'error', 'tree_method': 'hist', 'device': 'cuda', 'max_depth': 4, 'eta': 0.1, 'subsample': 0.7, 'colsample_bytree': 1.0, 'gamma': 0, 'min_child_weight': 1, 'alpha': 0, 'lambda': 100, 'scale_pos_weight': 20.6575, 'seed': 42, 'verbosity': 0, 'ntree_limit': 17}


 48%|██████████████████████████████████████████████████████▊                                                            | 10/21 [2:26:39<2:41:18, 879.91s/it]

Icelandic    → val-root-acc=0.540  best_round=0  cfg={'objective': 'binary:logistic', 'eval_metric': 'error', 'tree_method': 'hist', 'device': 'cuda', 'max_depth': 4, 'eta': 0.01, 'subsample': 1.0, 'colsample_bytree': 1.0, 'gamma': 0, 'min_child_weight': 1, 'alpha': 0, 'lambda': 100, 'scale_pos_weight': 15.73, 'seed': 42, 'verbosity': 0, 'ntree_limit': 0}


 52%|████████████████████████████████████████████████████████████▏                                                      | 11/21 [2:41:31<2:27:15, 883.53s/it]

Indonesian   → val-root-acc=0.560  best_round=1  cfg={'objective': 'binary:logistic', 'eval_metric': 'error', 'tree_method': 'hist', 'device': 'cuda', 'max_depth': 4, 'eta': 0.01, 'subsample': 0.7, 'colsample_bytree': 0.7, 'gamma': 0, 'min_child_weight': 5, 'alpha': 0, 'lambda': 1, 'scale_pos_weight': 16.1, 'seed': 42, 'verbosity': 0, 'ntree_limit': 1}


 57%|█████████████████████████████████████████████████████████████████▋                                                 | 12/21 [2:58:44<2:19:20, 928.95s/it]

Italian      → val-root-acc=0.550  best_round=1  cfg={'objective': 'binary:logistic', 'eval_metric': 'error', 'tree_method': 'hist', 'device': 'cuda', 'max_depth': 4, 'eta': 0.01, 'subsample': 1.0, 'colsample_bytree': 0.7, 'gamma': 0, 'min_child_weight': 1, 'alpha': 0, 'lambda': 100, 'scale_pos_weight': 20.595, 'seed': 42, 'verbosity': 0, 'ntree_limit': 1}


 62%|██████████████████████████████████████████████████████████████████████▌                                           | 13/21 [3:20:15<2:18:27, 1038.48s/it]

Japanese     → val-root-acc=0.150  best_round=1  cfg={'objective': 'binary:logistic', 'eval_metric': 'error', 'tree_method': 'hist', 'device': 'cuda', 'max_depth': 4, 'eta': 0.05, 'subsample': 0.7, 'colsample_bytree': 0.7, 'gamma': 0, 'min_child_weight': 5, 'alpha': 0, 'lambda': 1, 'scale_pos_weight': 24.7675, 'seed': 42, 'verbosity': 0, 'ntree_limit': 1}


 67%|████████████████████████████████████████████████████████████████████████████▋                                      | 14/21 [3:33:50<1:53:17, 971.07s/it]

Korean       → val-root-acc=0.370  best_round=1  cfg={'objective': 'binary:logistic', 'eval_metric': 'error', 'tree_method': 'hist', 'device': 'cuda', 'max_depth': 4, 'eta': 0.1, 'subsample': 0.7, 'colsample_bytree': 1.0, 'gamma': 0, 'min_child_weight': 10, 'alpha': 0, 'lambda': 100, 'scale_pos_weight': 14.02, 'seed': 42, 'verbosity': 0, 'ntree_limit': 1}


 71%|██████████████████████████████████████████████████████████████████████████████████▏                                | 15/21 [3:40:30<1:19:54, 799.06s/it]

Polish       → val-root-acc=0.650  best_round=1  cfg={'objective': 'binary:logistic', 'eval_metric': 'error', 'tree_method': 'hist', 'device': 'cuda', 'max_depth': 4, 'eta': 0.01, 'subsample': 1.0, 'colsample_bytree': 0.7, 'gamma': 0, 'min_child_weight': 1, 'alpha': 0, 'lambda': 100, 'scale_pos_weight': 14.8325, 'seed': 42, 'verbosity': 0, 'ntree_limit': 1}


 76%|███████████████████████████████████████████████████████████████████████████████████████▌                           | 16/21 [3:55:06<1:08:30, 822.10s/it]

Portuguese   → val-root-acc=0.510  best_round=1  cfg={'objective': 'binary:logistic', 'eval_metric': 'error', 'tree_method': 'hist', 'device': 'cuda', 'max_depth': 4, 'eta': 0.01, 'subsample': 0.7, 'colsample_bytree': 0.7, 'gamma': 0, 'min_child_weight': 1, 'alpha': 0, 'lambda': 100, 'scale_pos_weight': 19.825, 'seed': 42, 'verbosity': 0, 'ntree_limit': 1}


 81%|██████████████████████████████████████████████████████████████████████████████████████████████▋                      | 17/21 [4:08:02<53:52, 808.19s/it]

Russian      → val-root-acc=0.670  best_round=1  cfg={'objective': 'binary:logistic', 'eval_metric': 'error', 'tree_method': 'hist', 'device': 'cuda', 'max_depth': 4, 'eta': 0.01, 'subsample': 0.7, 'colsample_bytree': 1.0, 'gamma': 0, 'min_child_weight': 1, 'alpha': 0, 'lambda': 100, 'scale_pos_weight': 15.4125, 'seed': 42, 'verbosity': 0, 'ntree_limit': 1}


 86%|████████████████████████████████████████████████████████████████████████████████████████████████████▎                | 18/21 [4:21:58<40:49, 816.59s/it]

Spanish      → val-root-acc=0.500  best_round=1  cfg={'objective': 'binary:logistic', 'eval_metric': 'error', 'tree_method': 'hist', 'device': 'cuda', 'max_depth': 4, 'eta': 0.01, 'subsample': 0.7, 'colsample_bytree': 0.7, 'gamma': 0, 'min_child_weight': 5, 'alpha': 0, 'lambda': 100, 'scale_pos_weight': 20.0925, 'seed': 42, 'verbosity': 0, 'ntree_limit': 1}


 90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▊           | 19/21 [4:35:12<26:59, 809.95s/it]

Swedish      → val-root-acc=0.650  best_round=8  cfg={'objective': 'binary:logistic', 'eval_metric': 'error', 'tree_method': 'hist', 'device': 'cuda', 'max_depth': 6, 'eta': 0.05, 'subsample': 0.7, 'colsample_bytree': 0.7, 'gamma': 0, 'min_child_weight': 10, 'alpha': 0, 'lambda': 100, 'scale_pos_weight': 16.1525, 'seed': 42, 'verbosity': 0, 'ntree_limit': 8}


 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▍     | 20/21 [4:51:49<14:26, 866.15s/it]

Thai         → val-root-acc=0.620  best_round=1  cfg={'objective': 'binary:logistic', 'eval_metric': 'error', 'tree_method': 'hist', 'device': 'cuda', 'max_depth': 6, 'eta': 0.01, 'subsample': 1.0, 'colsample_bytree': 0.7, 'gamma': 0, 'min_child_weight': 5, 'alpha': 0, 'lambda': 100, 'scale_pos_weight': 21.0775, 'seed': 42, 'verbosity': 0, 'ntree_limit': 1}


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21/21 [5:02:11<00:00, 863.43s/it]

Turkish      → val-root-acc=0.480  best_round=22  cfg={'objective': 'binary:logistic', 'eval_metric': 'error', 'tree_method': 'hist', 'device': 'cuda', 'max_depth': 8, 'eta': 0.01, 'subsample': 1.0, 'colsample_bytree': 1.0, 'gamma': 0, 'min_child_weight': 1, 'alpha': 0, 'lambda': 100, 'scale_pos_weight': 13.8225, 'seed': 42, 'verbosity': 0, 'ntree_limit': 22}

Tuning completed in 5:02:12.004304






In [8]:
print("Loading test data…")
test_feats = pd.read_csv(TEST_FEATS_PATH)
raw_test   = pd.read_csv(TEST_META_PATH)
raw_test['edgelist'] = raw_test['edgelist'].apply(ast.literal_eval)

test_df = test_feats.merge(
    raw_test[['id','language','sentence']],
    on=['id','language','sentence']
)

Loading test data…


In [9]:
print("Predicting on test set…")
results = []
for tid, grp in test_df.groupby('id', sort=False):
    # ← unpack scaler, model and tuned tree-count
    scaler, bst, ntree_limit = models[grp.language.iloc[0]]

    # feature matrix for this sentence
    Xs = scaler.transform(grp[FEATURES].values)
    dm = xgb.DMatrix(Xs)

    # ← use dm and per-language ntree_limit
    probs = bst.predict(dm, iteration_range=(0, ntree_limit))

    pick = int(grp.vertex.values[probs.argmax()])
    results.append({'id': tid, 'root_pred': pick})

submission = pd.DataFrame(results)

Predicting on test set…


In [11]:
truth = pd.read_csv('../../data/labeled_test.csv')
cmp   = submission.merge(truth, on='id')
cmp['correct'] = cmp['root_pred'] == cmp['root']
acc = cmp['correct'].mean()

print(f"XGB Test accuracy: {acc:.3%} ({cmp['correct'].sum()}/{len(cmp)})\n")
print("Some mis‐predictions:")
print(cmp.loc[~cmp['correct'], ['id','root_pred','root']].head())

XGB Test accuracy: 51.544% (5358/10395)

Some mis‐predictions:
   id  root_pred  root
0   1         38     4
2   3         15     5
3   4         14    15
4   5          3     9
5   6          3    17


In [12]:
submission.to_csv('../../data/submission_XGB_advanced.csv', index=False)