# Non-Linear Approach
This attempt uses normalized centrality measures as features and trains a separate random forest for each language.

Starting by importing the right libraries:

In [33]:
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit, StratifiedGroupKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import make_scorer
from sklearn.ensemble import ExtraTreesClassifier
import statistics
import random
import warnings
from sklearn.exceptions import FitFailedWarning

# Suppress FitFailedWarning and every UserWarning for the rest of the notebook.
# NOTE(review): ignoring all UserWarning messages is broad and may hide useful
# diagnostics from the hyper-parameter search — confirm this is intended.
for _ignored_category in (FitFailedWarning, UserWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

Next we read the training data:

In [34]:
# Load the node-level training table (one row per sentence vertex) and preview it.
data = pd.read_csv(filepath_or_buffer='../data/expanded_train_with_leaf.csv')
data.head(n=5)

Unnamed: 0,language,sentence,vertex,n,degree,closeness,harmonic,betweeness,load,pagerank,eigenvector,katz,information,current_flow_betweeness,percolation,second_order,laplacian,is_root,is_leaf
0,Japanese,2,6,23,0.090909,0.15942,5.823846,0.090909,0.090909,0.048565,0.149505,0.209086,0.007246,0.090909,0.090909,98.762341,0.101449,0,0
1,Japanese,2,4,23,0.045455,0.138365,4.561122,0.0,0.0,0.027162,0.068517,0.188298,0.006289,0.0,0.0,112.48111,0.043478,0,1
2,Japanese,2,2,23,0.136364,0.184874,6.991703,0.255411,0.255411,0.066901,0.257706,0.22866,0.008403,0.255411,0.255411,84.451169,0.15942,0,0
3,Japanese,2,23,23,0.045455,0.157143,5.157179,0.0,0.0,0.025477,0.118104,0.190256,0.007143,0.0,0.0,100.149888,0.057971,0,1
4,Japanese,2,20,23,0.090909,0.211538,7.146825,0.311688,0.311688,0.042552,0.29471,0.213357,0.009615,0.311688,0.311688,71.147734,0.130435,0,0


Now we will focus on a single language, opting for `German`:

In [35]:
# Keep every German row (leaf and non-leaf nodes alike), then shuffle the rows
# reproducibly; .copy() detaches the result from `data`.
german_data = data.loc[data['language'] == 'German'].sample(frac=1, random_state=1).copy()
german_data.head(5)

Unnamed: 0,language,sentence,vertex,n,degree,closeness,harmonic,betweeness,load,pagerank,eigenvector,katz,information,current_flow_betweeness,percolation,second_order,laplacian,is_root,is_leaf
116304,German,797,13,16,0.2,0.306122,6.902381,0.447619,0.447619,0.093527,0.441785,0.279043,0.020408,0.447619,0.447619,44.631827,0.265306,0,0
115825,German,733,9,24,0.043478,0.264368,7.416667,0.0,0.0,0.022941,0.191661,0.187486,0.011494,0.0,0.0,91.858587,0.071429,0,1
112469,German,397,5,19,0.055556,0.162162,4.145238,0.0,0.0,0.033421,0.022759,0.205355,0.009009,2.67034e-16,0.0,103.932671,0.048387,0,1
114550,German,618,15,21,0.1,0.307692,7.65,0.1,0.1,0.050601,0.196587,0.218293,0.015385,0.1,0.1,75.299402,0.109589,0,0
114158,German,567,11,37,0.027778,0.208092,9.046429,0.0,0.0,0.014849,0.066178,0.147068,0.00578,0.0,0.0,184.412581,0.030075,0,1


Now we split the data into training and validation sets, keeping all nodes of a sentence together: each sentence and its nodes appear either in the training set or in the validation set, never in both.

In [36]:
# X keeps every column at this point; the model columns are selected later.
X = german_data.copy()
y = german_data['is_root'].copy()

# The sentence id is the grouping key: all nodes of a sentence go to the same side.
groups = german_data['sentence']

# Single 80/20 group-aware split.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(X, y, groups=groups))

X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

Now we also define the cross-validation strategy. The training data is split into 5 folds, ensuring that nodes of the same sentence are never found in different folds:

In [37]:
# Grouping key for cross-validation: the sentence id of each training row.
groups = X_train['sentence']
# 5 folds, stratified on the label and split by group (sentence).
cv = StratifiedGroupKFold(n_splits=5)

Next we define the balanced random forest model and the hyperparameter values that we want to try:

In [38]:
# Random forest variant from imbalanced-learn; 'auto' balances the classes by
# undersampling the majority class.
model = BalancedRandomForestClassifier(
    sampling_strategy='auto',
    random_state=2,
    n_jobs=8,
)

# Candidate hyper-parameter values explored by the search.
param_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5],
    max_features=['sqrt', 'log2'],
)

We use a custom scoring method that picks one root per sentence. For each sentence, it chooses the node with the highest predicted probability of being the root (class 1). The final score is the fraction of sentences for which the chosen node is the correct root:

In [39]:
def root_prediction_score(estimator, X, y_true):
    """
    Sentence-level root accuracy, usable directly as a scikit-learn ``scoring`` callable.

    For every sentence, the node with the highest predicted probability in the
    second ``predict_proba`` column is selected as the predicted root. The score
    is the fraction of sentences whose selected node has ``is_root == 1``.

    Parameters
    ----------
    estimator : object
        Fitted classifier exposing ``predict_proba``. Column 1 of its output is
        read as the root probability (assumes the classes are ordered [0, 1]).
    X : pandas.DataFrame
        Rows to score. Passed unchanged to ``predict_proba`` and must contain a
        'sentence' column identifying which sentence each row belongs to.
    y_true : array-like
        Root labels (1 = root) in the same row order as ``X``.

    Returns
    -------
    float
        Share of sentences for which the highest-probability node is a true root.
    """
    probs = estimator.predict_proba(X)[:, 1]

    # Build the frame from plain arrays so rows are matched by position and the
    # frame gets a fresh unique index; the idxmax -> .loc lookup below therefore
    # does not depend on whatever index y_true carries.
    df_pred = pd.DataFrame({
        'sentence': X['sentence'].to_numpy(),
        'is_root': pd.Series(y_true).to_numpy(),
        'root_prob': probs,
    })

    # One row per sentence: the node with the highest root probability
    # (idxmax keeps the first one on ties).
    predicted_roots = df_pred.loc[df_pred.groupby('sentence')['root_prob'].idxmax()]
    return float((predicted_roots['is_root'] == 1).mean())


# The function already has the scorer signature (estimator, X, y), so it can be
# passed to `scoring=` as-is. make_scorer is for metrics with signature
# (y_true, y_pred) and would call this function with the wrong arguments.
root_scorer = root_prediction_score


Now we can run the randomized hyperparameter search, combining the defined parameters, the custom scoring function and the cross-validation strategy (which ensures no data leakage between folds):

In [41]:

# Columns handed to the search. 'sentence' is included because
# root_prediction_score reads it from X to group nodes by sentence.
# NOTE(review): the estimator therefore also sees the sentence id as a feature —
# confirm this is intended.
feature_cols = [
    'sentence',
    'degree', 'closeness', 'harmonic', 'betweeness', 'load', 'pagerank',
    'eigenvector', 'katz', 'information', 'current_flow_betweeness',
    'percolation', 'second_order', 'laplacian',
]

# NOTE(review): param_grid spans 3*3*3*3*2 = 162 combinations, fewer than
# n_iter=200 — presumably the search then covers the whole grid; confirm
# against the scikit-learn docs.
grid_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=200,
    scoring=root_prediction_score,
    cv=cv,
    n_jobs=8,
    random_state=2,
)

# `groups` keeps all nodes of a sentence inside the same fold.
grid_search.fit(X_train[feature_cols], y_train, groups=groups)

print("Best params:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best params: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 10}
Best score: 0.29


We use the best estimator found to measure performance on the validation set:

In [42]:
# Score the refitted best model on the held-out validation sentences.
best_estimator = grid_search.best_estimator_
root_prediction_score(best_estimator, X_val.loc[:, feature_cols], y_val)


0.29

### Extending all languages
Next we create a script that follows the previous steps for each of the 21 languages. As a result we will have one tuned random forest model for each language. To predict the test data, we will use the model that corresponds to the language of each row:

In [10]:
# Per-language validation scores and the final fitted model for each language.
score = []
best_models = {}
random.seed(42)

# The search space and the feature list are the same for every language,
# so they are defined once outside the loop.
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2']
}
# 'sentence' is included because root_prediction_score reads it from X.
# NOTE(review): the estimator therefore also sees the sentence id as a feature —
# confirm this is intended.
feature_cols = [
    'sentence', 'degree', 'closeness', 'harmonic', 'betweeness', 'load', 'pagerank',
    'eigenvector', 'katz', 'information', 'current_flow_betweeness',
    'percolation', 'second_order', 'laplacian'
]

for language in data.language.unique():
    # Rows of this language only, shuffled reproducibly.
    X = data[data.language == language].sample(frac=1, random_state=1).copy()
    y = X['is_root'].copy()

    # Hold out 20% of the sentences; all nodes of a sentence stay on one side.
    groups = X['sentence']
    gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_idx, val_idx = next(gss.split(X, y, groups=groups))

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Group-aware, stratified folds over the training sentences.
    cv = StratifiedGroupKFold(n_splits=5)
    groups = X_train['sentence']

    model = BalancedRandomForestClassifier(
        n_jobs=7,
        random_state=2,
        sampling_strategy='auto'   # balances classes by undersampling the majority
    )

    grid_search = RandomizedSearchCV(
        n_iter=300,
        estimator=model,
        param_distributions=param_grid,
        cv=cv,
        scoring=root_prediction_score,
        n_jobs=7,
        # We want to get random combination of params for each different language
        random_state=random.randint(1, 1000)
    )

    grid_search.fit(X_train[feature_cols], y_train, groups=groups)

    best_params = grid_search.best_params_

    current_score = root_prediction_score(grid_search.best_estimator_, X_val[feature_cols], y_val)
    score.append(current_score)
    print(language)
    print('Grid search score: ' + str(grid_search.best_score_))
    print('Validation score: ' + str(current_score))
    print(best_params)
    print('---------------------------------------')

    # Refit on all rows of this language (training + held-out validation) with
    # the tuned hyper-parameters. The final model uses the same estimator class
    # and fixed settings that were tuned and validated above, so the reported
    # scores describe the model that is stored.
    final_model = BalancedRandomForestClassifier(
        **best_params,
        n_jobs=7,
        random_state=2,
        sampling_strategy='auto'
    )
    final_model.fit(X[feature_cols], y)
    best_models[language] = final_model

Japanese
Grid search score: 0.12000000000000002
Validation score: 0.11
{'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 10, 'criterion': 'gini', 'bootstrap': True}
---------------------------------------
Finnish
Grid search score: 0.34750000000000003
Validation score: 0.36
{'warm_start': True, 'solver': 'lbfgs', 'penalty': 'l2', 'max_iter': 5000, 'l1_ratio': 0.5, 'fit_intercept': True, 'C': 100}
---------------------------------------
Galician
Grid search score: 0.28500000000000003
Validation score: 0.3
{'warm_start': True, 'solver': 'lbfgs', 'penalty': None, 'max_iter': 5000, 'l1_ratio': 0.5, 'fit_intercept': False, 'C': 100}
---------------------------------------
English
Grid search score: 0.29250000000000004
Validation score: 0.28
{'warm_start': True, 'solver': 'lbfgs', 'penalty': None, 'max_iter': 5000, 'l1_ratio': 1, 'fit_intercept': True, 'C': 100}
---------------------------------------
Hindi
Grid search score: 0.2125
Val

Printing the average of the per-language validation scores:

In [11]:
# Unweighted mean of the per-language validation scores collected in `score`.
statistics.mean(score)

0.2957142857142857