# Linear Approach

This approach focuses on using normalized centrality measures, a PCA and logistic regression for each of the languages.
A pipeline is trained that uses PCA and a logistic regression for each of the 21 unique languages.

Starting by importing the right libraries:

In [18]:
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit, StratifiedGroupKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer
import statistics
import random
import warnings
from sklearn.exceptions import FitFailedWarning

warnings.filterwarnings("ignore", category=FitFailedWarning)
warnings.filterwarnings("ignore", category=UserWarning)

Next we read the train data:

In [19]:
data = pd.read_csv('../data/normalized_expanded_train.csv')
data.head(5)

Unnamed: 0,language,sentence,vertex,n,degree,closeness,harmonic,betweeness,load,pagerank,eigenvector,katz,information,current_flow_betweeness,percolation,second_order,laplacian,is_root
0,Japanese,2,6,23,0.133038,-0.55816,-0.308023,-0.676962,-0.676962,0.409538,-0.327923,0.074694,-0.55816,-0.676962,-0.676962,0.489782,-0.031273,0
1,Japanese,2,4,23,-1.396899,-1.111953,-1.442852,-1.126101,-1.126101,-1.313638,-1.131636,-1.460142,-1.111953,-1.126101,-1.126101,1.149235,-1.469827,0
2,Japanese,2,2,23,1.662975,0.111311,0.74155,0.135764,0.135764,1.885752,0.745846,1.519954,0.111311,0.135764,0.135764,-0.198147,1.407281,0
3,Japanese,2,23,23,-1.396899,-0.61806,-0.907167,-1.126101,-1.126101,-1.449223,-0.639537,-1.315616,-0.61806,-1.126101,-1.126101,0.556481,-1.110188,0
4,Japanese,2,20,23,0.133038,0.81263,0.880961,0.413803,0.413803,-0.074579,1.11307,0.390094,0.81263,0.413803,0.413803,-0.837635,0.688004,0


Now we will focus only in one language, opting to go for `Polish`:

In [20]:
polish_data = data[data.language == 'Polish'].sample(frac=1, random_state=1).copy()
polish_data.head(5)

Unnamed: 0,language,sentence,vertex,n,degree,closeness,harmonic,betweeness,load,pagerank,eigenvector,katz,information,current_flow_betweeness,percolation,second_order,laplacian,is_root
126122,Polish,46,2,23,-0.983415,-1.112357,-1.140267,-0.8622,-0.8622,-0.946343,-0.721122,-0.987128,-1.112357,-0.8622,-0.8622,1.154875,-0.898371,0
130297,Polish,567,25,35,-0.931149,-1.425134,-1.327955,-0.80835,-0.80835,-0.898689,-0.937838,-0.938361,-1.425134,-0.80835,-0.80835,1.544666,-0.839108,0
128519,Polish,357,9,10,-0.816497,-0.465846,-0.528847,-0.836117,-0.836117,-0.841449,-0.189595,-0.724639,-0.465846,-0.836117,-0.836117,0.404574,-0.525001,0
132310,Polish,825,10,15,2.083023,0.453251,1.35047,1.156678,1.156678,2.162341,0.52067,1.968188,0.453251,1.156678,1.156678,-0.514653,1.923926,0
129666,Polish,490,26,27,0.068199,0.728343,0.712585,0.420129,0.420129,-0.047339,0.516672,0.236797,0.728343,0.420129,0.420129,-0.75339,0.259074,0


Now we can use pca to reduce the centrality dimensions:

In [21]:
pca = PCA(n_components=7)
feature_cols = ['degree', 'closeness','harmonic', 'betweeness', 'load', 'pagerank', 'eigenvector', 'katz',
       'information', 'current_flow_betweeness', 'percolation', 'second_order',
       'laplacian']
X_pca = pca.fit_transform(polish_data[feature_cols])

The explained variance reaches almost 100% by just using 7 principal components

In [22]:
sum(pca.explained_variance_ratio_)

np.float64(0.9999398498243797)

Now we can split the data in train and test in a way to keep the same sentences (sentence level and node level of sentence) in the same group. Each sentence and its nodes will only be found in train or validation set

In [23]:
X = polish_data.copy()#.drop(['is_root','language','n'], axis=1).copy()
y = polish_data['is_root'].copy()

groups = polish_data['sentence']

gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(X, y, groups=groups))

X_train, X_val= X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

Now we also define the cross validation strategy. We also split the training data in 5 different folds ensuring that data of the same sentence can not be found in different folds:

In [24]:
cv = StratifiedGroupKFold(n_splits=5)
groups = X_train['sentence']

Next we can define the pipeline of pca and logistic regression model parameters that we want to try:

In [25]:
# Define columns
feature_cols = ['sentence', 'degree', 'closeness','harmonic', 'betweeness', 'load', 'pagerank', 'eigenvector', 'katz',
       'information', 'current_flow_betweeness', 'percolation', 'second_order',
       'laplacian']

pca_columns = X_train[feature_cols].columns.difference(['sentence'])
passthrough_columns = ['sentence']

# We want the sentence column to passthrough and not be transformed since it is used for the scoring
preprocessor = ColumnTransformer([
     ('pca_pipeline', Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=7))
    ]), pca_columns),
    ('passthrough', 'passthrough', passthrough_columns)
])

# Full pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=2, n_jobs=7))
])

param_grid = {
    'classifier__penalty': ['l1', 'l2', 'elasticnet', None],
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    'classifier__l1_ratio': [0, 0.5, 1],
    'classifier__fit_intercept':[True,False],
    'classifier__warm_start': [True,False],
    'classifier__max_iter': [100, 250, 500, 1000, 2500, 5000]
}


We use a custom scoring method that picks one root per sentence. For each sentence, it chooses the node with the highest chance of being the root (class 1). The final score is just the percentage of sentences where we picked the correct root.:

In [26]:
def root_prediction_score(estimator, X, y_true):
    """
    Scoring function that extracts sentence IDs from X and computes
    root prediction accuracy per sentence.
    """
    
    sentence_ids = X['sentence'].values
    X_features = X.copy()
    # Predict probabilities
    probs = estimator.predict_proba(X_features)[:, 1]
    # Build DataFrame for groupby
    df_pred = pd.DataFrame({
        'sentence': sentence_ids,
        'is_root': y_true,
        'root_prob': probs
    })

    predicted_roots = df_pred.loc[df_pred.groupby('sentence')['root_prob'].idxmax()]
    accuracy = float((predicted_roots['is_root'] == 1).mean())
    return accuracy

root_scorer = make_scorer(root_prediction_score, greater_is_better=True)


Now we can do the gridsearch, combining defined parameters, the custom scoring function, the cross validation strategy (that ensures no data leakage):

In [27]:

grid_search = RandomizedSearchCV(n_iter=500,
    estimator=pipeline,
    param_distributions=param_grid,
    cv=cv,
    scoring=root_prediction_score,
    n_jobs=7,
    random_state=2
)

grid_search.fit(X_train[feature_cols], y_train, groups=groups)

print("Best params:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best params: {'classifier__warm_start': False, 'classifier__solver': 'saga', 'classifier__penalty': 'elasticnet', 'classifier__max_iter': 250, 'classifier__l1_ratio': 1, 'classifier__fit_intercept': True, 'classifier__C': 0.01}
Best score: 0.2725


We will use the best estimator found to see the performance in the validation dataset:

In [28]:
best_estimator = grid_search.best_estimator_
root_prediction_score(best_estimator, X_val[feature_cols], y_val)

0.34

### Extending all languages
Next we create a script that follows the previous steps for each of the 21 languages. As a result we will have one logistic regression model for each language. To predict the test data, we will use the particular logistic regression depending on the language of the row:

In [29]:
score = []
best_models = {}
random.seed(42)
for language in data.language.unique():
    X = data[data.language == language].sample(frac=1, random_state=1).copy()
    y = X['is_root'].copy()

    groups = X['sentence']

    gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_idx, val_idx = next(gss.split(X, y, groups=groups))

    X_train, X_val= X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    cv = StratifiedGroupKFold(n_splits=5)
    groups = X_train['sentence']
    
    feature_cols = [
    'sentence', 'degree', 'closeness', 'harmonic', 'betweeness', 'load', 'pagerank',
    'eigenvector', 'katz', 'information', 'current_flow_betweeness',
    'percolation', 'second_order', 'laplacian'
    ]
    pca_columns = X_train[feature_cols].columns.difference(['sentence'])
    passthrough_columns = ['sentence']

    preprocessor = ColumnTransformer([
        ('pca_pipeline', Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA())
        ]), pca_columns),
        ('passthrough', 'passthrough', passthrough_columns)
    ])

    # Final pipeline of pca and classifier
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(random_state=2, n_jobs=7))
    ])

    param_grid = {
        'preprocessor__pca_pipeline__pca__n_components': [5,6,7,8,9],
        'classifier__penalty': ['l1', 'l2', 'elasticnet', None],
        'classifier__C': [0.01, 0.1, 1, 10, 100],
        'classifier__solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
        'classifier__l1_ratio': [0, 0.5, 1],
        'classifier__fit_intercept':[True,False],
        'classifier__warm_start': [True,False],
        'classifier__max_iter': [50, 100, 250, 500, 750, 1000, 2000]
    }

    grid_search = RandomizedSearchCV(n_iter=400,
    estimator=pipeline,
    param_distributions=param_grid,
    cv=cv,
    scoring=root_prediction_score,
    n_jobs=7,
    # We want to get random combination of params for each different language
    random_state=random.randint(1, 1000)
    )

    grid_search.fit(X_train[feature_cols], y_train, groups=groups)

    best_params = grid_search.best_params_
    
    current_score = root_prediction_score(grid_search.best_estimator_, X_val[feature_cols], y_val)
    score.append(current_score)
    print(language)
    print('Grid search score: ' + str(grid_search.best_score_))
    print('Validation score: ' + str(current_score))
    print(best_params)
    print('---------------------------------------')
    
    # Rebuild the whole pipeline using the best params
    final_preprocessor = ColumnTransformer([
        ('pca_pipeline', Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA(n_components=best_params['preprocessor__pca_pipeline__pca__n_components']))
        ]), pca_columns),
        ('passthrough', 'passthrough', passthrough_columns)
    ])

    # Handle optional params
    classifier_params = {
        key.split("classifier__")[1]: value
        for key, value in best_params.items()
        if key.startswith("classifier__")
    }

    final_model = Pipeline([
        ('preprocessor', final_preprocessor),
        ('classifier', LogisticRegression(
            **classifier_params,
            n_jobs=7,
            random_state=2
        ))
    ])


    final_model.fit(X[feature_cols], y)
    best_models[language] = final_model

Japanese
Grid search score: 0.095
Validation score: 0.09
{'preprocessor__pca_pipeline__pca__n_components': 5, 'classifier__warm_start': True, 'classifier__solver': 'newton-cg', 'classifier__penalty': None, 'classifier__max_iter': 500, 'classifier__l1_ratio': 1, 'classifier__fit_intercept': True, 'classifier__C': 0.1}
---------------------------------------
Finnish
Grid search score: 0.35750000000000004
Validation score: 0.38
{'preprocessor__pca_pipeline__pca__n_components': 8, 'classifier__warm_start': True, 'classifier__solver': 'newton-cg', 'classifier__penalty': 'l2', 'classifier__max_iter': 2000, 'classifier__l1_ratio': 1, 'classifier__fit_intercept': True, 'classifier__C': 10}
---------------------------------------
Galician
Grid search score: 0.295
Validation score: 0.32
{'preprocessor__pca_pipeline__pca__n_components': 9, 'classifier__warm_start': True, 'classifier__solver': 'newton-cholesky', 'classifier__penalty': None, 'classifier__max_iter': 500, 'classifier__l1_ratio': 1, '

Printing the average score for the whole validation dataset:

In [30]:
statistics.mean(score)

0.2976190476190476

## Test data
Now let's use the best estimators found to predict the test data:

In [14]:
test = pd.read_csv('../data/normalized_expanded_test.csv')
test.head()

Unnamed: 0,id,language,sentence,vertex,n,degree,closeness,harmonic,betweeness,load,pagerank,eigenvector,katz,information,current_flow_betweeness,percolation,second_order,laplacian
0,1,Japanese,1,38,43,0.043173,0.114668,0.270083,-0.710642,-0.710642,0.120017,-0.482073,0.031637,0.114668,-0.710642,-0.710642,-0.204072,-0.079853
1,1,Japanese,1,33,43,-0.885052,-0.428544,-0.934764,-0.951023,-0.951023,-0.801517,-0.482239,-1.040168,-0.428544,-0.951023,-0.951023,0.347011,-0.983454
2,1,Japanese,1,10,43,1.899625,0.766913,1.821878,0.743366,0.743366,1.764984,-0.481724,1.960983,0.766913,0.743366,0.743366,-0.799779,1.727348
3,1,Japanese,1,24,43,0.043173,-1.078191,-1.090006,-0.710642,-0.710642,0.299057,-0.482315,-0.174878,-1.078191,-0.710642,-0.710642,1.103667,-0.441293
4,1,Japanese,1,16,43,-0.885052,-1.40519,-1.871333,-0.951023,-0.951023,-0.725446,-0.48233,-1.06082,-1.40519,-0.951023,-0.951023,1.540413,-0.983454


Next we create a script that for each sample of the test data  (unique id), uses the corresponding language model to make the predictions (binary classification node level predictions). Then the final prediction of the root node of the sentence instance id is the node with the highest probability to belong to the class 1: `root` class:

In [15]:
results = []
for test_sample_id in test.id.unique():
    current_sample = test[test.id == test_sample_id].copy()
    language = current_sample.language.iloc[0]
    # Create a new column storing the probabilities
    current_sample['probabilities'] = best_models[language].predict_proba(current_sample[feature_cols])[:, 1]
    # Returning the vertex number of the row witht the highest probabilities
    predicted_root = current_sample.loc[current_sample['probabilities'].idxmax(), 'vertex']
    results.append({'id':test_sample_id, 'root':predicted_root})


Now we can create a dataframe from the results and save it as csv file

In [16]:
final_results_df = pd.DataFrame(results)
final_results_df.to_csv('../data/submission.csv', index=False)