In [181]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier, callback

In [182]:
df = pd.read_csv('../../data/expanded_train_with_leaf.csv')
df = df[df.language == 'Finnish'].reset_index(drop=True)

In [183]:
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(df, df.is_root, groups=df.sentence))
train_df, val_df = df.iloc[train_idx], df.iloc[val_idx]

In [184]:
CENT = [
    'degree','closeness','harmonic','betweeness','load','pagerank',
    'eigenvector','katz','information','current_flow_betweeness',
    'percolation','second_order','laplacian','is_leaf'
]

In [185]:
X_train = train_df[CENT];  y_train = train_df['is_root'];  s_train = train_df['sentence']
X_val   = val_df[CENT];    y_val   = val_df['is_root'];  s_val   = val_df['sentence']

In [186]:
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_val_s   = scaler.transform(X_val)

In [187]:
def root_score(sent_ids, y_true, probs):
    tmp = pd.DataFrame({'sentence': sent_ids, 'is_root': y_true, 'p': probs})
    pick = tmp.loc[tmp.groupby('sentence')['p'].idxmax()]
    return (pick.is_root == 1).mean()

In [188]:
rf_grid = {
    'n_estimators':    [100, 300, 500],
    'max_depth':       [5, 10, 20, None],
    'min_samples_leaf':[5, 10, 20],
    'max_features':    ['sqrt', 'log2', 0.5],
}

In [189]:
best_rf_score = -1
best_rf = None
for ne in rf_grid['n_estimators']:
    for md in rf_grid['max_depth']:
        for msl in rf_grid['min_samples_leaf']:
            for mf in rf_grid['max_features']:
                rf = RandomForestClassifier(
                    n_estimators=ne,
                    max_depth=md,
                    min_samples_leaf=msl,
                    max_features=mf,
                    class_weight='balanced',
                    random_state=42,
                    n_jobs=-1
                )
                rf.fit(X_train_s, y_train)
                p = rf.predict_proba(X_val_s)[:,1]
                score = root_score(s_val, y_val, p)
                if score > best_rf_score:
                    best_rf_score = score
                    best_rf = rf

print(f"Best RF hold-out root-score: {best_rf_score:.3f}")

Best RF hold-out root-score: 0.370


In [190]:
lgb_grid = {
    'learning_rate':     [0.01, 0.05, 0.1],
    'num_leaves':        [16, 31, 63],
    'min_child_samples': [10, 20, 40],
}

best_lgb_score = -1
best_lgb = None
for lr in lgb_grid['learning_rate']:
    for nl in lgb_grid['num_leaves']:
        for mcs in lgb_grid['min_child_samples']:
            lgb = LGBMClassifier(
                objective='binary',
                n_estimators=500,
                learning_rate=lr,
                num_leaves=nl,
                min_child_samples=mcs,
                feature_fraction=0.8,
                bagging_fraction=0.8,
                bagging_freq=5,
                class_weight='balanced',
                random_state=42,
                n_jobs=-1
            )
            lgb.fit(
                X_train_s, y_train,
                eval_set=[(X_val_s, y_val)],
                eval_metric='binary_logloss',
                callbacks=[
                    callback.early_stopping(stopping_rounds=20),
                    callback.log_evaluation(period=0)
                ]
            )
            p = lgb.predict_proba(X_val_s)[:,1]
            score = root_score(s_val, y_val, p)
            if score > best_lgb_score:
                best_lgb_score = score
                best_lgb = lgb

print(f"Best LGB hold-out root-score: {best_lgb_score:.3f}")

[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000693 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3059
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's binary_logloss: 0.39106




[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000227 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3059
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's binary_logloss: 0.393572




[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000235 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3059
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's binary_logloss: 0.400272
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000236 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3059
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 14
[Lig



Did not meet early stopping. Best iteration is:
[500]	valid_0's binary_logloss: 0.341033
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000247 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3059
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds




Did not meet early stopping. Best iteration is:
[500]	valid_0's binary_logloss: 0.347637
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000260 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3059
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds




Did not meet early stopping. Best iteration is:
[500]	valid_0's binary_logloss: 0.357205
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000240 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3059
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds




Did not meet early stopping. Best iteration is:
[500]	valid_0's binary_logloss: 0.295838
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000313 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3059
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds




Did not meet early stopping. Best iteration is:
[500]	valid_0's binary_logloss: 0.302984
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000280 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3059
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds




Did not meet early stopping. Best iteration is:
[500]	valid_0's binary_logloss: 0.325457
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000248 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3059
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[231]	valid_0's binary_logloss: 0.340956
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000225 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3059
[LightGBM] [Info] Nu



Early stopping, best iteration is:
[230]	valid_0's binary_logloss: 0.344863
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000234 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3059
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[335]	valid_0's binary_logloss: 0.33086
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000271 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3059
[LightGBM] [Info] Number of data p



Early stopping, best iteration is:
[230]	valid_0's binary_logloss: 0.301729
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000224 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3059
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds




Early stopping, best iteration is:
[230]	valid_0's binary_logloss: 0.306848
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000229 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3059
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[335]	valid_0's binary_logloss: 0.309207




[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000241 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3059
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[142]	valid_0's binary_logloss: 0.292338
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000235 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3059
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 14
[LightGBM] [Info]



Early stopping, best iteration is:
[270]	valid_0's binary_logloss: 0.312642
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000251 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3059
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[230]	valid_0's binary_logloss: 0.32125
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000229 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3059
[LightGBM] [Info] Number of data p



Early stopping, best iteration is:
[269]	valid_0's binary_logloss: 0.317339
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000221 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3059
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[138]	valid_0's binary_logloss: 0.315891
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000223 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3059
[LightGBM] [Info] Number of data 



Early stopping, best iteration is:
[181]	valid_0's binary_logloss: 0.304282
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000223 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3059
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[110]	valid_0's binary_logloss: 0.323037
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000228 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3059
[LightGBM] [Info] Number of data 



Early stopping, best iteration is:
[98]	valid_0's binary_logloss: 0.308667
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000217 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3059
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[97]	valid_0's binary_logloss: 0.312555
Best LGB hold-out root-score: 0.370




In [191]:
p_rf  = best_rf.predict_proba(X_val_s)[:,1]
p_lgb = best_lgb.predict_proba(X_val_s)[:,1]
p_ens = 0.5 * p_rf + 0.5 * p_lgb
ens_score = root_score(s_val, y_val, p_ens)

print(f"Ensemble root-score: {ens_score:.3f}")

Ensemble root-score: 0.350


