# Linear Approach
Starting by importing the right libraries:

In [1]:
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit, StratifiedGroupKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer
import statistics

Next we read the train data:

In [2]:
data = pd.read_csv('../data/normalized_expanded_train.csv')
data.head(5)

Unnamed: 0,language,sentence,vertex,n,degree,closeness,harmonic,betweeness,load,pagerank,eigenvector,katz,information,current_flow_betweeness,percolation,second_order,laplacian,is_root
0,Japanese,2,6,23,0.133038,-0.55816,-0.308023,-0.676962,-0.676962,0.409538,-0.327923,0.074694,-0.55816,-0.676962,-0.676962,0.489782,-0.031273,0
1,Japanese,2,4,23,-1.396899,-1.111953,-1.442852,-1.126101,-1.126101,-1.313638,-1.131636,-1.460142,-1.111953,-1.126101,-1.126101,1.149235,-1.469827,0
2,Japanese,2,2,23,1.662975,0.111311,0.74155,0.135764,0.135764,1.885752,0.745846,1.519954,0.111311,0.135764,0.135764,-0.198147,1.407281,0
3,Japanese,2,23,23,-1.396899,-0.61806,-0.907167,-1.126101,-1.126101,-1.449223,-0.639537,-1.315616,-0.61806,-1.126101,-1.126101,0.556481,-1.110188,0
4,Japanese,2,20,23,0.133038,0.81263,0.880961,0.413803,0.413803,-0.074579,1.11307,0.390094,0.81263,0.413803,0.413803,-0.837635,0.688004,0


Now we will focus only in one language, opting to go for `Polish`:

In [None]:
polish_data = data[data.language == 'Polish'].sample(frac=1, random_state=1).copy()
polish_data.head(5)

Unnamed: 0,language,sentence,vertex,n,degree,closeness,harmonic,betweeness,load,pagerank,eigenvector,katz,information,current_flow_betweeness,percolation,second_order,laplacian,is_root
128080,Polish,290,4,16,-0.881917,-1.368804,-1.295781,-0.882725,-0.882725,-0.821045,-1.331697,-0.932232,-1.368804,-0.882725,-0.882725,1.512314,-0.926524,0
126400,Polish,83,14,30,0.074744,1.160345,1.13123,0.407602,0.407602,-0.104629,1.42274,0.432176,1.160345,0.407602,0.407602,-1.145291,0.718597,0
130200,Polish,554,4,29,-0.949857,-0.890688,-0.958092,-0.750504,-0.750504,-0.940995,-0.861625,-0.931453,-0.890688,-0.750504,-0.750504,0.911619,-0.83594,0
132523,Polish,848,14,24,0.080322,0.806343,0.729296,0.11756,0.11756,-0.001283,0.764421,0.262557,0.806343,0.11756,0.11756,-0.834676,0.331184,0
132995,Polish,931,5,10,0.229416,1.26266,0.884976,1.068251,1.068251,0.108305,0.866356,0.42179,1.26266,1.068251,1.068251,-1.248548,0.614203,0


Now we can split the data in train and test in a way to keep the same sentences (sentence level and node level of sentence) in the same group. Each sentence and its nodes will only be found in train or validation set

In [4]:
X = polish_data.copy()#.drop(['is_root','language','n'], axis=1).copy()
y = polish_data['is_root'].copy()

groups = polish_data['sentence']

gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(X, y, groups=groups))

X_train, X_val= X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

Now we also define the cross validation strategy. We also split the training data in 5 different folds ensuring that data of of the same sentence can not be found in different folds:

In [5]:
cv = StratifiedGroupKFold(n_splits=5)
groups = X_train['sentence']

Next we can define the logistic regression model parameters that we want to try:

In [6]:
model = LogisticRegression(random_state=2, n_jobs=7)
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', None],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    'l1_ratio': [0, 0.5, 1],
    'fit_intercept':[True,False],
    'warm_start': [True,False],
    'max_iter': [1000, 2500, 5000]
}


We use a custom scoring method that picks one root per sentence. For each sentence, it chooses the node with the highest chance of being the root (class 1). The final score is just the percentage of sentences where we picked the correct root.:

In [7]:
def root_prediction_score(estimator, X, y_true):
    """
    Scoring function that extracts sentence IDs from X and computes
    root prediction accuracy per sentence.
    """
    
    sentence_ids = X['sentence'].values
    X_features = X.copy()
    # Predict probabilities
    probs = estimator.predict_proba(X_features)[:, 1]
    # Build DataFrame for groupby
    df_pred = pd.DataFrame({
        'sentence': sentence_ids,
        'is_root': y_true,
        'root_prob': probs
    })

    predicted_roots = df_pred.loc[df_pred.groupby('sentence')['root_prob'].idxmax()]
    accuracy = float((predicted_roots['is_root'] == 1).mean())
    return accuracy

root_scorer = make_scorer(root_prediction_score, greater_is_better=True)


Now we can do the gridsearch, combining defined parameters, the custom scoring function, the cross validation strategy (that ensures no data leakage):

In [8]:

feature_cols = [
    'sentence', 'degree', 'closeness', 'harmonic', 'betweeness', 'load', 'pagerank',
    'eigenvector', 'katz', 'information', 'current_flow_betweeness',
    'percolation', 'second_order', 'laplacian'
]

grid_search = RandomizedSearchCV(n_iter=200,
    estimator=model,
    param_distributions=param_grid,
    cv=cv,
    scoring=root_prediction_score,
    n_jobs=7,
    random_state=2
)

grid_search.fit(X_train[feature_cols], y_train, groups=groups)

print("Best params:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best params: {'warm_start': False, 'solver': 'newton-cholesky', 'penalty': 'l2', 'max_iter': 5000, 'l1_ratio': 1, 'fit_intercept': True, 'C': 0.1}
Best score: 0.2725000000000001


415 fits failed out of a total of 1000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in f

We will use the best estimator found to see the performance in the validation dataset:

In [9]:
best_estimator = grid_search.best_estimator_
root_prediction_score(best_estimator, X_val[feature_cols], y_val)


0.35

### Extending all languages
Next we create a script that follows the previous steps for each of the 21 languages. As a result we will have one logistic regression model for each language. To predict the test data, we will use the particular logistic regression depending on the language of the row:

In [12]:
score = []
best_models = {}
for language in data.language.unique():
    X = data[data.language == language].sample(frac=1, random_state=1).copy()
    y = X['is_root'].copy()

    groups = X['sentence']

    gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_idx, val_idx = next(gss.split(X, y, groups=groups))

    X_train, X_val= X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    cv = StratifiedGroupKFold(n_splits=5)
    groups = X_train['sentence']
    
    model = LogisticRegression(random_state=2, n_jobs=7)
    param_grid = {
        'penalty': ['l1', 'l2', 'elasticnet', None],
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs', 'sag', 'saga'],
        'l1_ratio': [0, 0.5, 1],
        'fit_intercept':[True,False],
        'warm_start': [True,False],
        'max_iter': [1000, 2500, 5000]
    }
    feature_cols = [
    'sentence', 'degree', 'closeness', 'harmonic', 'betweeness', 'load', 'pagerank',
    'eigenvector', 'katz', 'information', 'current_flow_betweeness',
    'percolation', 'second_order', 'laplacian'
    ]

    grid_search = RandomizedSearchCV(n_iter=150,
    estimator=model,
    param_distributions=param_grid,
    cv=cv,
    scoring=root_prediction_score,
    n_jobs=7,
    random_state=2
    )

    grid_search.fit(X_train[feature_cols], y_train, groups=groups)
    best_estimator = grid_search.best_estimator_
    best_models[language] = best_estimator

    current_score = root_prediction_score(best_estimator, X_val[feature_cols], y_val)
    score.append(current_score)
    print(language)
    print('Validation score: ' + str(current_score))
    print(grid_search.best_params_)
    print('---------------------------------------')

215 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fi

Japanese
Validation score: 0.08
{'warm_start': True, 'solver': 'saga', 'penalty': None, 'max_iter': 5000, 'l1_ratio': 1, 'fit_intercept': False, 'C': 0.1}
---------------------------------------


215 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fi

Finnish
Validation score: 0.38
{'warm_start': False, 'solver': 'lbfgs', 'penalty': 'l2', 'max_iter': 2500, 'l1_ratio': 0.5, 'fit_intercept': True, 'C': 10}
---------------------------------------


215 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fi

Galician
Validation score: 0.3
{'warm_start': False, 'solver': 'lbfgs', 'penalty': None, 'max_iter': 2500, 'l1_ratio': 0, 'fit_intercept': False, 'C': 1}
---------------------------------------


215 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fi

English
Validation score: 0.28
{'warm_start': True, 'solver': 'lbfgs', 'penalty': None, 'max_iter': 2500, 'l1_ratio': 1, 'fit_intercept': True, 'C': 10}
---------------------------------------


215 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fi

Hindi
Validation score: 0.27
{'warm_start': True, 'solver': 'lbfgs', 'penalty': 'l2', 'max_iter': 1000, 'l1_ratio': 0.5, 'fit_intercept': True, 'C': 1}
---------------------------------------


215 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fi

French
Validation score: 0.29
{'warm_start': True, 'solver': 'lbfgs', 'penalty': 'l2', 'max_iter': 1000, 'l1_ratio': 0.5, 'fit_intercept': True, 'C': 1}
---------------------------------------


215 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fi

Italian
Validation score: 0.2
{'warm_start': False, 'solver': 'lbfgs', 'penalty': None, 'max_iter': 2500, 'l1_ratio': 0, 'fit_intercept': False, 'C': 1}
---------------------------------------


215 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fi

Indonesian
Validation score: 0.28
{'warm_start': False, 'solver': 'lbfgs', 'penalty': None, 'max_iter': 2500, 'l1_ratio': 0, 'fit_intercept': False, 'C': 1}
---------------------------------------


215 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fi

Swedish
Validation score: 0.34
{'warm_start': False, 'solver': 'sag', 'penalty': None, 'max_iter': 2500, 'l1_ratio': 1, 'fit_intercept': False, 'C': 0.1}
---------------------------------------


215 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fi

Spanish
Validation score: 0.32
{'warm_start': False, 'solver': 'sag', 'penalty': None, 'max_iter': 5000, 'l1_ratio': 0.5, 'fit_intercept': False, 'C': 100}
---------------------------------------


215 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fi

Icelandic
Validation score: 0.33
{'warm_start': False, 'solver': 'sag', 'penalty': 'l2', 'max_iter': 5000, 'l1_ratio': 1, 'fit_intercept': False, 'C': 1}
---------------------------------------


215 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fi

German
Validation score: 0.31
{'warm_start': False, 'solver': 'lbfgs', 'penalty': None, 'max_iter': 2500, 'l1_ratio': 0, 'fit_intercept': False, 'C': 1}
---------------------------------------


215 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fi

Korean
Validation score: 0.34
{'warm_start': False, 'solver': 'sag', 'penalty': None, 'max_iter': 5000, 'l1_ratio': 0.5, 'fit_intercept': False, 'C': 100}
---------------------------------------


215 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fi

Polish
Validation score: 0.32
{'warm_start': False, 'solver': 'sag', 'penalty': None, 'max_iter': 1000, 'l1_ratio': 0, 'fit_intercept': True, 'C': 100}
---------------------------------------


215 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fi

Thai
Validation score: 0.29
{'warm_start': True, 'solver': 'saga', 'penalty': 'l1', 'max_iter': 2500, 'l1_ratio': 0.5, 'fit_intercept': True, 'C': 0.1}
---------------------------------------


215 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fi

Turkish
Validation score: 0.36
{'warm_start': True, 'solver': 'saga', 'penalty': 'l1', 'max_iter': 1000, 'l1_ratio': 0.5, 'fit_intercept': True, 'C': 100}
---------------------------------------


215 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fi

Czech
Validation score: 0.32
{'warm_start': False, 'solver': 'saga', 'penalty': 'elasticnet', 'max_iter': 2500, 'l1_ratio': 1, 'fit_intercept': False, 'C': 0.01}
---------------------------------------


215 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fi

Chinese
Validation score: 0.26
{'warm_start': True, 'solver': 'lbfgs', 'penalty': None, 'max_iter': 2500, 'l1_ratio': 1, 'fit_intercept': True, 'C': 10}
---------------------------------------


215 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fi

Portuguese
Validation score: 0.28
{'warm_start': False, 'solver': 'lbfgs', 'penalty': None, 'max_iter': 2500, 'l1_ratio': 0, 'fit_intercept': False, 'C': 1}
---------------------------------------


215 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fi

Arabic
Validation score: 0.33
{'warm_start': True, 'solver': 'saga', 'penalty': 'l2', 'max_iter': 1000, 'l1_ratio': 0, 'fit_intercept': True, 'C': 1}
---------------------------------------


215 fits failed out of a total of 750.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\stef4\Documents\ml-project-root-node-in-free-tree\myenv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fi

Russian
Validation score: 0.32
{'warm_start': True, 'solver': 'sag', 'penalty': 'l2', 'max_iter': 5000, 'l1_ratio': 0, 'fit_intercept': True, 'C': 1}
---------------------------------------




Printing the average score for the whole validation dataset:

In [13]:
statistics.mean(score)

0.29523809523809524

## Test data
Now let's use the best estimators found to predict the test data:

In [14]:
test = pd.read_csv('../data/normalized_expanded_test.csv')
test.head()

Unnamed: 0,id,language,sentence,vertex,n,degree,closeness,harmonic,betweeness,load,pagerank,eigenvector,katz,information,current_flow_betweeness,percolation,second_order,laplacian
0,1,Japanese,1,38,43,0.043173,0.114668,0.270083,-0.710642,-0.710642,0.120017,-0.482073,0.031637,0.114668,-0.710642,-0.710642,-0.204072,-0.079853
1,1,Japanese,1,33,43,-0.885052,-0.428544,-0.934764,-0.951023,-0.951023,-0.801517,-0.482239,-1.040168,-0.428544,-0.951023,-0.951023,0.347011,-0.983454
2,1,Japanese,1,10,43,1.899625,0.766913,1.821878,0.743366,0.743366,1.764984,-0.481724,1.960983,0.766913,0.743366,0.743366,-0.799779,1.727348
3,1,Japanese,1,24,43,0.043173,-1.078191,-1.090006,-0.710642,-0.710642,0.299057,-0.482315,-0.174878,-1.078191,-0.710642,-0.710642,1.103667,-0.441293
4,1,Japanese,1,16,43,-0.885052,-1.40519,-1.871333,-0.951023,-0.951023,-0.725446,-0.48233,-1.06082,-1.40519,-0.951023,-0.951023,1.540413,-0.983454


Next we create a script that for each sample of the test data  (unique id), uses the corresponding language model to make the predictions (binary classification node level predictions). Then the final prediction of the root node of the sentence instance id is the node with the highest probability to belong to the class 1: `root` class:

In [15]:
results = []
for test_sample_id in test.id.unique():
    current_sample = test[test.id == test_sample_id].copy()
    language = current_sample.language.iloc[0]
    # Create a new column storing the probabilities
    current_sample['probabilities'] = best_models[language].predict_proba(current_sample[feature_cols])[:, 1]
    # Returning the vertex number of the row witht the highest probabilities
    predicted_root = current_sample.loc[current_sample['probabilities'].idxmax(), 'vertex']
    results.append({'id':test_sample_id, 'root':predicted_root})


Now we can create a dataframe from the results and save it as csv file

In [16]:
final_results_df = pd.DataFrame(results)
final_results_df.to_csv('../data/submission.csv', index=False)