# Linear Approach

This approach focuses on using non-normalized centrality measures, a PCA and logistic regression for each of the languages.

Starting by importing the right libraries:

In [1]:
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit, StratifiedGroupKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer
import statistics
import random
import warnings
from sklearn.exceptions import FitFailedWarning

warnings.filterwarnings("ignore", category=FitFailedWarning)
warnings.filterwarnings("ignore", category=UserWarning)

Next we read the train data:

In [2]:
data = pd.read_csv('../data/expanded_train.csv')
data.head(5)

Unnamed: 0,language,sentence,vertex,n,degree,closeness,harmonic,betweeness,load,pagerank,eigenvector,katz,information,current_flow_betweeness,percolation,second_order,laplacian,is_root
0,Japanese,2,6,23,0.090909,0.15942,5.823846,0.090909,0.090909,0.048565,0.149505,0.209086,0.007246,0.090909,0.090909,98.762341,0.101449,0
1,Japanese,2,4,23,0.045455,0.138365,4.561122,0.0,0.0,0.027162,0.068517,0.188298,0.006289,0.0,0.0,112.48111,0.043478,0
2,Japanese,2,2,23,0.136364,0.184874,6.991703,0.255411,0.255411,0.066901,0.257706,0.22866,0.008403,0.255411,0.255411,84.451169,0.15942,0
3,Japanese,2,23,23,0.045455,0.157143,5.157179,0.0,0.0,0.025477,0.118104,0.190256,0.007143,0.0,0.0,100.149888,0.057971,0
4,Japanese,2,20,23,0.090909,0.211538,7.146825,0.311688,0.311688,0.042552,0.29471,0.213357,0.009615,0.311688,0.311688,71.147734,0.130435,0


Now we will focus only in one language, opting to go for `Polish`:

In [3]:
polish_data = data[data.language == 'Polish'].sample(frac=1, random_state=1).copy()
polish_data.head(5)

Unnamed: 0,language,sentence,vertex,n,degree,closeness,harmonic,betweeness,load,pagerank,eigenvector,katz,information,current_flow_betweeness,percolation,second_order,laplacian,is_root
126122,Polish,46,2,23,0.045455,0.164179,5.207179,0.0,0.0,0.026311,0.075918,0.18866,0.007463,0.0,0.0,119.289564,0.054054,0
130297,Polish,567,25,35,0.029412,0.116838,6.134683,0.0,0.0,0.01717,0.006415,0.152032,0.003436,0.0,0.0,251.872984,0.033898,0
128519,Polish,357,9,10,0.111111,0.346154,3.866667,0.0,0.0,0.059713,0.251846,0.293119,0.038462,1.2335810000000002e-17,0.0,31.843367,0.166667,0
132310,Polish,825,10,15,0.285714,0.297872,6.742857,0.494505,0.494505,0.136593,0.291091,0.306236,0.021277,0.4945055,0.494505,47.222876,0.333333,0
129666,Polish,490,26,27,0.076923,0.282609,9.45119,0.212308,0.212308,0.036232,0.209253,0.196472,0.01087,0.2123077,0.212308,93.797655,0.108696,0


Now we can use pca to reduce the centrality dimensions:

In [4]:
pca = PCA(n_components=7)
feature_cols = ['degree', 'closeness','harmonic', 'betweeness', 'load', 'pagerank', 'eigenvector', 'katz',
       'information', 'current_flow_betweeness', 'percolation', 'second_order',
       'laplacian']
X_pca = pca.fit_transform(polish_data[feature_cols])

The explained variance reaches almost 100% by just using 7 principal components

In [5]:
sum(pca.explained_variance_ratio_)

np.float64(0.9999999131391364)

Now we can split the data in train and test in a way to keep the same sentences (sentence level and node level of sentence) in the same group. Each sentence and its nodes will only be found in train or validation set

In [6]:
X = polish_data.copy()#.drop(['is_root','language','n'], axis=1).copy()
y = polish_data['is_root'].copy()

groups = polish_data['sentence']

gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(X, y, groups=groups))

X_train, X_val= X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

Now we also define the cross validation strategy. We also split the training data in 5 different folds ensuring that data of of the same sentence can not be found in different folds:

In [7]:
cv = StratifiedGroupKFold(n_splits=5)
groups = X_train['sentence']

Next we can define the pipeline of pca and logistic regression model parameters that we want to try:

In [8]:
# Define columns
feature_cols = ['sentence', 'degree', 'closeness','harmonic', 'betweeness', 'load', 'pagerank', 'eigenvector', 'katz',
       'information', 'current_flow_betweeness', 'percolation', 'second_order',
       'laplacian']

pca_columns = X_train[feature_cols].columns.difference(['sentence'])
passthrough_columns = ['sentence']

# We want the sentence column to passthrough and not be transformed since it is used for the scoring
preprocessor = ColumnTransformer([
     ('pca_pipeline', Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=7))
    ]), pca_columns),
    ('passthrough', 'passthrough', passthrough_columns)
])

# Full pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=2, n_jobs=7))
])

param_grid = {
    'classifier__penalty': ['l1', 'l2', 'elasticnet', None],
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    'classifier__l1_ratio': [0, 0.5, 1],
    'classifier__fit_intercept':[True,False],
    'classifier__warm_start': [True,False],
    'classifier__max_iter': [100, 250, 500, 1000, 2500, 5000]
}


We use a custom scoring method that picks one root per sentence. For each sentence, it chooses the node with the highest chance of being the root (class 1). The final score is just the percentage of sentences where we picked the correct root.:

In [9]:
def root_prediction_score(estimator, X, y_true):
    """
    Scoring function that extracts sentence IDs from X and computes
    root prediction accuracy per sentence.
    """
    
    sentence_ids = X['sentence'].values
    X_features = X.copy()
    # Predict probabilities
    probs = estimator.predict_proba(X_features)[:, 1]
    # Build DataFrame for groupby
    df_pred = pd.DataFrame({
        'sentence': sentence_ids,
        'is_root': y_true,
        'root_prob': probs
    })

    predicted_roots = df_pred.loc[df_pred.groupby('sentence')['root_prob'].idxmax()]
    accuracy = float((predicted_roots['is_root'] == 1).mean())
    return accuracy

root_scorer = make_scorer(root_prediction_score, greater_is_better=True)


Now we can do the gridsearch, combining defined parameters, the custom scoring function, the cross validation strategy (that ensures no data leakage):

In [10]:

grid_search = RandomizedSearchCV(n_iter=500,
    estimator=pipeline,
    param_distributions=param_grid,
    cv=cv,
    scoring=root_prediction_score,
    n_jobs=7,
    random_state=2
)

grid_search.fit(X_train[feature_cols], y_train, groups=groups)

print("Best params:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best params: {'classifier__warm_start': True, 'classifier__solver': 'saga', 'classifier__penalty': 'elasticnet', 'classifier__max_iter': 1000, 'classifier__l1_ratio': 0, 'classifier__fit_intercept': False, 'classifier__C': 10}
Best score: 0.2725


We will use the best estimator found to see the performance in the validation dataset:

In [11]:
best_estimator = grid_search.best_estimator_
root_prediction_score(best_estimator, X_val[feature_cols], y_val)

0.34

### Extending all languages
Next we create a script that follows the previous steps for each of the 21 languages. As a result we will have one logistic regression model for each language. To predict the test data, we will use the particular logistic regression depending on the language of the row:

In [12]:
score = []
best_models = {}
random.seed(42)
for language in data.language.unique():
    X = data[data.language == language].sample(frac=1, random_state=1).copy()
    y = X['is_root'].copy()

    groups = X['sentence']

    gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_idx, val_idx = next(gss.split(X, y, groups=groups))

    X_train, X_val= X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    cv = StratifiedGroupKFold(n_splits=5)
    groups = X_train['sentence']
    
    feature_cols = [
    'sentence', 'degree', 'closeness', 'harmonic', 'betweeness', 'load', 'pagerank',
    'eigenvector', 'katz', 'information', 'current_flow_betweeness',
    'percolation', 'second_order', 'laplacian'
    ]
    pca_columns = X_train[feature_cols].columns.difference(['sentence'])
    passthrough_columns = ['sentence']

    preprocessor = ColumnTransformer([
        ('pca_pipeline', Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA())
        ]), pca_columns),
        ('passthrough', 'passthrough', passthrough_columns)
    ])

    # Final pipeline of pca and classifier
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(random_state=2, n_jobs=7))
    ])

    param_grid = {
        'preprocessor__pca_pipeline__pca__n_components': [5,6,7,8,9],
        'classifier__penalty': ['l1', 'l2', 'elasticnet', None],
        'classifier__C': [0.01, 0.1, 1, 10, 100],
        'classifier__solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
        'classifier__l1_ratio': [0, 0.5, 1],
        'classifier__fit_intercept':[True,False],
        'classifier__warm_start': [True,False],
        'classifier__max_iter': [50, 100, 250, 500, 750, 1000, 2000]
    }

    grid_search = RandomizedSearchCV(n_iter=200,
    estimator=pipeline,
    param_distributions=param_grid,
    cv=cv,
    scoring=root_prediction_score,
    n_jobs=7,
    # We want to get random combination of params for each different language
    random_state=random.randint(1, 1000)
    )

    grid_search.fit(X_train[feature_cols], y_train, groups=groups)

    best_params = grid_search.best_params_
    
    current_score = root_prediction_score(grid_search.best_estimator_, X_val[feature_cols], y_val)
    score.append(current_score)
    print(language)
    print('Grid search score: ' + str(grid_search.best_score_))
    print('Validation score: ' + str(current_score))
    print(best_params)
    print('---------------------------------------')
    
    # Rebuild the whole pipeline using the best params
    final_preprocessor = ColumnTransformer([
        ('pca_pipeline', Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA(n_components=best_params['preprocessor__pca_pipeline__pca__n_components']))
        ]), pca_columns),
        ('passthrough', 'passthrough', passthrough_columns)
    ])

    # Handle optional params
    classifier_params = {
        key.split("classifier__")[1]: value
        for key, value in best_params.items()
        if key.startswith("classifier__")
    }

    final_model = Pipeline([
        ('preprocessor', final_preprocessor),
        ('classifier', LogisticRegression(
            **classifier_params,
            n_jobs=7,
            random_state=2
        ))
    ])


    final_model.fit(X[feature_cols], y)
    best_models[language] = final_model

Japanese
Grid search score: 0.0675
Validation score: 0.03
{'preprocessor__pca_pipeline__pca__n_components': 8, 'classifier__warm_start': True, 'classifier__solver': 'newton-cholesky', 'classifier__penalty': 'l2', 'classifier__max_iter': 2000, 'classifier__l1_ratio': 0.5, 'classifier__fit_intercept': True, 'classifier__C': 10}
---------------------------------------
Finnish
Grid search score: 0.34500000000000003
Validation score: 0.36
{'preprocessor__pca_pipeline__pca__n_components': 6, 'classifier__warm_start': True, 'classifier__solver': 'liblinear', 'classifier__penalty': 'l2', 'classifier__max_iter': 750, 'classifier__l1_ratio': 0, 'classifier__fit_intercept': False, 'classifier__C': 100}
---------------------------------------
Galician
Grid search score: 0.28
Validation score: 0.37
{'preprocessor__pca_pipeline__pca__n_components': 6, 'classifier__warm_start': False, 'classifier__solver': 'newton-cholesky', 'classifier__penalty': 'l2', 'classifier__max_iter': 100, 'classifier__l1_ra

Printing the average score for the whole validation dataset:

In [17]:
statistics.mean(score)

0.2957142857142857