# Linear Approach
Starting by importing the right libraries:

In [None]:
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit, GridSearchCV, StratifiedGroupKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer

Next we read the train data:

In [41]:
data = pd.read_csv('../data/normalized_expanded_train.csv')
data.head(5)

Unnamed: 0,language,sentence,vertex,n,degree,closeness,harmonic,betweeness,load,pagerank,eigenvector,katz,information,current_flow_betweeness,percolation,second_order,laplacian,is_root
0,Japanese,2,6,23,0.133038,-0.55816,-0.308023,-0.676962,-0.676962,0.409538,-0.327923,0.074694,-0.55816,-0.676962,-0.676962,0.489782,-0.031273,0
1,Japanese,2,4,23,-1.396899,-1.111953,-1.442852,-1.126101,-1.126101,-1.313638,-1.131636,-1.460142,-1.111953,-1.126101,-1.126101,1.149235,-1.469827,0
2,Japanese,2,2,23,1.662975,0.111311,0.74155,0.135764,0.135764,1.885752,0.745846,1.519954,0.111311,0.135764,0.135764,-0.198147,1.407281,0
3,Japanese,2,23,23,-1.396899,-0.61806,-0.907167,-1.126101,-1.126101,-1.449223,-0.639537,-1.315616,-0.61806,-1.126101,-1.126101,0.556481,-1.110188,0
4,Japanese,2,20,23,0.133038,0.81263,0.880961,0.413803,0.413803,-0.074579,1.11307,0.390094,0.81263,0.413803,0.413803,-0.837635,0.688004,0


Now we will focus only in one language, opting to go for `Polish`:

In [42]:
polish_data = data[data.language == 'Polish'].sample(frac=1).copy()
polish_data.head(5)

Unnamed: 0,language,sentence,vertex,n,degree,closeness,harmonic,betweeness,load,pagerank,eigenvector,katz,information,current_flow_betweeness,percolation,second_order,laplacian,is_root
130343,Polish,571,5,16,-1.021055,-1.296689,-1.416117,-0.953935,-0.953935,-0.957508,-1.299322,-1.126988,-1.296689,-0.953935,-0.953935,1.397441,-1.154014,0
130950,Polish,663,9,11,1.262672,0.593176,0.93614,0.687714,0.687714,1.342575,0.789891,1.143079,0.593176,0.687714,0.687714,-0.647289,0.950139,0
132643,Polish,871,4,19,-1.13586,0.470146,0.02132,-0.970478,-0.970478,-1.364014,0.360487,-0.881947,0.470146,-0.970478,-0.970478,-0.533836,-0.556987,0
133482,Polish,977,2,20,2.110579,2.278207,2.167471,2.558252,2.558252,1.952752,2.542,2.238517,2.278207,2.558252,2.558252,-2.121269,2.483897,0
132470,Polish,843,12,22,-0.957826,-0.115115,-0.245679,-0.87296,-0.87296,-1.028892,0.145024,-0.796896,-0.115115,-0.87296,-0.87296,0.014297,-0.577092,0


Now we can split the data in train and test in a way to keep the same sentences (sentence level and node level of sentence) in the same group. Each sentence and its nodes will only be found in train or validation set

In [52]:
X = polish_data.copy()#.drop(['is_root','language','n'], axis=1).copy()
y = polish_data['is_root'].copy()

groups = polish_data['sentence']

gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(X, y, groups=groups))

X_train, X_val= X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

Now we also define the cross validation strategy. We also split the training data in 5 different folds ensuring that data of of the same sentence can not be found in different folds:

In [53]:
cv = StratifiedGroupKFold(n_splits=5)
groups = X_train['sentence']

Next we can define the logistic regression model parameters that we want to try:

In [97]:
model = LogisticRegression(random_state=2)
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['saga'],
    'l1_ratio': [0, 0.5, 1],
    'max_iter': [1000, 2500, 5000]
}


We use a custom scoring method that picks one root per sentence. For each sentence, it chooses the node with the highest chance of being the root (class 1). The final score is just the percentage of sentences where we picked the correct root.:

In [113]:
def root_prediction_score(estimator, X, y_true):
    """
    Scoring function that extracts sentence IDs from X and computes
    root prediction accuracy per sentence.
    """
    
    sentence_ids = X['sentence'].values
    X_features = X.copy()
    # Predict probabilities
    probs = estimator.predict_proba(X_features)[:, 1]
    # Build DataFrame for groupby
    df_pred = pd.DataFrame({
        'sentence': sentence_ids,
        'is_root': y_true,
        'root_prob': probs
    })

    predicted_roots = df_pred.loc[df_pred.groupby('sentence')['root_prob'].idxmax()]
    accuracy = float((predicted_roots['is_root'] == 1).mean())
    return accuracy

root_scorer = make_scorer(root_prediction_score, greater_is_better=True)


Now we can do the gridsearch, combining defined parameters, the custom scoring function, the cross validation strategy (that ensures no data leakage):

In [100]:

feature_cols = [
    'sentence', 'degree', 'closeness', 'harmonic', 'betweeness', 'load', 'pagerank',
    'eigenvector', 'katz', 'information', 'current_flow_betweeness',
    'percolation', 'second_order', 'laplacian'
]


grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    scoring=root_prediction_score,
    n_jobs=7,
    verbose=1
)

grid_search.fit(X_train[feature_cols], y_train, groups=groups)

print("Best params:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Fitting 5 folds for each of 180 candidates, totalling 900 fits


225 fits failed out of a total of 900.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
62 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\stef4\Documents\ml

Best params: {'C': 0.1, 'l1_ratio': 0, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'saga'}
Best score: 0.26749999999999996




We will use the best estimator found to see the performance:

In [115]:
best_estimator = grid_search.best_estimator_
root_prediction_score(best_estimator, X_val[feature_cols], y_val)


0.33