In [13]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier, callback

In [14]:
df = pd.read_csv('../../data/expanded_with_features_non-linear.csv')
df = df[df.language == 'Finnish'].reset_index(drop=True)

In [15]:
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(df, df.is_root, groups=df.sentence))
train_df, val_df = df.iloc[train_idx], df.iloc[val_idx]

In [16]:
CENT = [
    'degree','closeness','harmonic','betweeness','load','pagerank',
    'eigenvector','katz','information','current_flow_betweeness',
    'percolation','second_order','laplacian',
    'is_leaf','pos_norm','max_branch_size','subtree_entropy'
]

In [17]:
X_train = train_df[CENT];  y_train = train_df['is_root'];  s_train = train_df['sentence']
X_val   = val_df[CENT];    y_val   = val_df['is_root'];  s_val   = val_df['sentence']

In [18]:
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_val_s   = scaler.transform(X_val)

In [19]:
def root_score(sent_ids, y_true, probs):
    tmp = pd.DataFrame({'sentence': sent_ids, 'is_root': y_true, 'p': probs})
    pick = tmp.loc[tmp.groupby('sentence')['p'].idxmax()]
    return (pick.is_root == 1).mean()

In [20]:
rf_grid = {
    'n_estimators':    [100, 300, 500],
    'max_depth':       [5, 10, 20, None],
    'min_samples_leaf':[5, 10, 20],
    'max_features':    ['sqrt', 'log2', 0.5],
}

In [21]:
best_rf_score = -1
best_rf = None
for ne in rf_grid['n_estimators']:
    for md in rf_grid['max_depth']:
        for msl in rf_grid['min_samples_leaf']:
            for mf in rf_grid['max_features']:
                rf = RandomForestClassifier(
                    n_estimators=ne,
                    max_depth=md,
                    min_samples_leaf=msl,
                    max_features=mf,
                    class_weight='balanced',
                    random_state=42,
                    n_jobs=-1
                )
                rf.fit(X_train_s, y_train)
                p = rf.predict_proba(X_val_s)[:,1]
                score = root_score(s_val, y_val, p)
                if score > best_rf_score:
                    best_rf_score = score
                    best_rf = rf

print(f"Best RF hold-out root-score: {best_rf_score:.3f}")

Best RF hold-out root-score: 0.380


In [22]:
lgb_grid = {
    'learning_rate':     [0.01, 0.05, 0.1],
    'num_leaves':        [16, 31, 63],
    'min_child_samples': [10, 20, 40],
}

best_lgb_score = -1
best_lgb = None
for lr in lgb_grid['learning_rate']:
    for nl in lgb_grid['num_leaves']:
        for mcs in lgb_grid['min_child_samples']:
            lgb = LGBMClassifier(
                objective='binary',
                n_estimators=500,
                learning_rate=lr,
                num_leaves=nl,
                min_child_samples=mcs,
                feature_fraction=0.8,
                bagging_fraction=0.8,
                bagging_freq=5,
                class_weight='balanced',
                random_state=42,
                n_jobs=-1
            )
            lgb.fit(
                X_train_s, y_train,
                eval_set=[(X_val_s, y_val)],
                eval_metric='binary_logloss',
                callbacks=[
                    callback.early_stopping(stopping_rounds=20),
                    callback.log_evaluation(period=0)
                ]
            )
            p = lgb.predict_proba(X_val_s)[:,1]
            score = root_score(s_val, y_val, p)
            if score > best_lgb_score:
                best_lgb_score = score
                best_lgb = lgb

print(f"Best LGB hold-out root-score: {best_lgb_score:.3f}")

[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000861 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's binary_logloss: 0.378031
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000261 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 17
[Lig



Did not meet early stopping. Best iteration is:
[500]	valid_0's binary_logloss: 0.381308
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000263 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's binary_logloss: 0.387527




[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000317 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's binary_logloss: 0.318534
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000278 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 17
[Lig



Did not meet early stopping. Best iteration is:
[500]	valid_0's binary_logloss: 0.325432
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000282 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds




Did not meet early stopping. Best iteration is:
[500]	valid_0's binary_logloss: 0.337101
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000260 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds




Did not meet early stopping. Best iteration is:
[500]	valid_0's binary_logloss: 0.265939
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000442 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds




Did not meet early stopping. Best iteration is:
[500]	valid_0's binary_logloss: 0.271645
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000707 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds




Did not meet early stopping. Best iteration is:
[500]	valid_0's binary_logloss: 0.298404
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000333 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[337]	valid_0's binary_logloss: 0.293711
[LightGBM] [Info] Number of positive: 400, number of negative: 5031




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000509 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[340]	valid_0's binary_logloss: 0.292066
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000779 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[Ligh



Early stopping, best iteration is:
[452]	valid_0's binary_logloss: 0.287377
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000620 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds




Early stopping, best iteration is:
[318]	valid_0's binary_logloss: 0.264279
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000276 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds




Early stopping, best iteration is:
[284]	valid_0's binary_logloss: 0.271699
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000458 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds




Early stopping, best iteration is:
[335]	valid_0's binary_logloss: 0.27299
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000264 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds




Early stopping, best iteration is:
[176]	valid_0's binary_logloss: 0.25892
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000207 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds




Early stopping, best iteration is:
[162]	valid_0's binary_logloss: 0.257288
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000262 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds




Early stopping, best iteration is:
[214]	valid_0's binary_logloss: 0.272851
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000437 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[270]	valid_0's binary_logloss: 0.281029
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000263 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data 



Early stopping, best iteration is:
[279]	valid_0's binary_logloss: 0.267519
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000289 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[270]	valid_0's binary_logloss: 0.278576
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000265 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data 



Early stopping, best iteration is:
[148]	valid_0's binary_logloss: 0.266478
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000270 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[179]	valid_0's binary_logloss: 0.2649
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000471 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data po



Early stopping, best iteration is:
[139]	valid_0's binary_logloss: 0.279529
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000259 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[85]	valid_0's binary_logloss: 0.260959
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000437 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data p



Early stopping, best iteration is:
[82]	valid_0's binary_logloss: 0.273309
[LightGBM] [Info] Number of positive: 400, number of negative: 5031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000286 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3577
[LightGBM] [Info] Number of data points in the train set: 5431, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[84]	valid_0's binary_logloss: 0.279611
Best LGB hold-out root-score: 0.370




In [23]:
p_rf  = best_rf.predict_proba(X_val_s)[:,1]
p_lgb = best_lgb.predict_proba(X_val_s)[:,1]
p_ens = 0.5 * p_rf + 0.5 * p_lgb
ens_score = root_score(s_val, y_val, p_ens)

print(f"Ensemble root-score: {ens_score:.3f}")

Ensemble root-score: 0.380


