# Graph Convolutional Network for Root Prediction
This notebook demonstrates a non-linear approach using a GCN to predict the root node in parsed dependency trees for one language.

In [233]:
# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [234]:
# !pip install torch-geometric

In [290]:
import pandas as pd
import ast
import torch
import torch.nn.functional as F
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv
import random
import itertools

In [291]:
# CONFIG
EXP_PATH = '../../data/expanded_with_features_non-linear.csv'
TRAIN_PATH = '../../data/train.csv'
CENT_COLS = [
    'degree','closeness','harmonic','betweeness','load','pagerank',
    'eigenvector','katz','information','current_flow_betweeness',
    'percolation','second_order','laplacian',
]
VALID_SPLIT = 0.2
BATCH_SIZE = 32
PATIENCE = 5
MAX_EPOCHS = 50
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [292]:
# BUILD GRAPHS
def build_graphs(df_sub):
    graphs = []
    for (_, sent), grp in df_sub.groupby(['language','sentence'], sort=False):
        x = torch.tensor(grp[CENT_COLS].values, dtype=torch.float)
        edges = grp.edgelist.iloc[0]
        edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous() - 1
        y = torch.zeros(len(grp), dtype=torch.long)
        y[grp.root.iloc[0] - 1] = 1
        graphs.append(Data(x=x, edge_index=edge_index, y=y))
    return graphs

In [293]:
# SINGLE TRAIN/EVALUATE
def eval_config(graphs, hidden, dropout, lr, wd):
    # split
    random.shuffle(graphs)
    cut = int(len(graphs)*(1-VALID_SPLIT))
    train_loader = DataLoader(graphs[:cut], batch_size=BATCH_SIZE, shuffle=True)
    val_loader   = DataLoader(graphs[cut:], batch_size=BATCH_SIZE, shuffle=False)

    # model def
    class GCN(torch.nn.Module):
        def __init__(self, in_feats):
            super().__init__()
            self.conv1 = GCNConv(in_feats, hidden)
            self.conv2 = GCNConv(hidden, 2)
            self.drop  = torch.nn.Dropout(dropout)
        def forward(self, data):
            x, edge = data.x.to(DEVICE), data.edge_index.to(DEVICE)
            x = F.relu(self.conv1(x, edge))
            x = self.drop(x)
            x = self.conv2(x, edge)
            return F.log_softmax(x, dim=1)
    def root_acc(pred, data):
        probs = pred.exp()[:,1]
        correct, total = 0, 0
        batch = data.batch if hasattr(data, 'batch') else torch.zeros(len(data.y), dtype=torch.long)
        for i in range(data.num_graphs):
            mask = (batch == i)
            idx = probs[mask].argmax()
            correct += int(data.y.to(DEVICE)[mask][idx] == 1)
            total += 1
        return correct / total

    model = GCN(len(CENT_COLS)).to(DEVICE)
    opt   = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, 'max', patience=3, factor=0.5)

    best_val, cnt = 0, 0
    for ep in range(1, MAX_EPOCHS+1):
        model.train()
        for batch in train_loader:
            batch = batch.to(DEVICE)
            opt.zero_grad()
            loss = F.nll_loss(model(batch), batch.y.to(DEVICE))
            loss.backward(); opt.step()
        model.eval()
        val_acc = sum(root_acc(model(b), b) for b in val_loader)/len(val_loader)
        sched.step(val_acc)
        if val_acc > best_val + 1e-4:
            best_val, cnt = val_acc, 0
        else:
            cnt += 1
        if cnt >= PATIENCE:
            break
    return best_val, model

In [294]:
param_grid = {
    'hidden':    [32, 64],
    'dropout':   [0.3, 0.5],
    'lr':        [1e-2, 5e-3],
    'wd':        [1e-3, 1e-4],
}

In [295]:
# LOAD DATA
exp   = pd.read_csv(EXP_PATH)
train = pd.read_csv(TRAIN_PATH)
train['edgelist'] = train['edgelist'].apply(ast.literal_eval)
df = exp.merge(train[['language','sentence','edgelist','root']], on=['language','sentence'])

In [296]:
def tune_language(df_lang, lang_name):
    print(f"\n>> Tuning for {lang_name}")
    graphs = build_graphs(df_lang)
    best_acc, best_cfg, best_model = 0, None, None
    for cfg in itertools.product(*param_grid.values()):
        cfg_dict = dict(zip(param_grid.keys(), cfg))
        acc, _ = eval_config(graphs, **cfg_dict)
        if acc > best_acc:
            best_acc, best_cfg = acc, cfg_dict
    print(f"{lang_name} best val-acc={best_acc:.4f} with {best_cfg}")
    # retrain best
    _, best_model = eval_config(graphs, **best_cfg)
    return best_model, best_acc

In [298]:
# TUNE NON-JAPANESE
df_nonjp      = df[df.language != 'Japanese']
model_nonjp, a_nonjp = tune_language(df_nonjp, "NON_JAPANESE")


>> Tuning for NON_JAPANESE




NON_JAPANESE best val-acc=0.4563 with {'hidden': 32, 'dropout': 0.5, 'lr': 0.005, 'wd': 0.0001}


In [299]:
# TUNE JAPANESE
df_jp         = df[df.language == 'Japanese']
model_jp, a_jp = tune_language(df_jp, "JAPANESE_ONLY")


>> Tuning for JAPANESE_ONLY




JAPANESE_ONLY best val-acc=0.4062 with {'hidden': 64, 'dropout': 0.3, 'lr': 0.005, 'wd': 0.0001}


In [300]:
# AVERAGE
print(f"\nNon-JP acc: {a_nonjp:.4f}  JP acc: {a_jp:.4f}  AVG: {(a_nonjp+a_jp)/2:.4f}")


Non-JP acc: 0.4563  JP acc: 0.4062  AVG: 0.4313


## Test data
Now let's use the best estimators found to predict the test data:

In [308]:
import pandas as pd, ast, torch
from torch_geometric.data import Data

In [309]:
# Load normalized features + raw edgelists
test_feats = pd.read_csv('../../data/normalized_expanded_test.csv')
raw_test  = pd.read_csv('../../data/test.csv')
raw_test['edgelist'] = raw_test['edgelist'].apply(ast.literal_eval)

In [310]:
# join them on (language, sentence, id)
df_test = (
    test_feats
    .merge(raw_test[['id','language','sentence','edgelist']], on=['id','language','sentence'])
)

In [311]:
# Build a list of Data() graphs with a vertex‐ID field
test_graphs = []
for tid, grp in df_test.groupby('id', sort=False):
    x = torch.tensor(grp[CENT_COLS].values, dtype=torch.float)
    edge_index = torch.tensor(grp.edgelist.iloc[0], dtype=torch.long).t().contiguous() - 1
    verts = torch.tensor(grp['vertex'].values, dtype=torch.long)
    lang  = grp.language.iloc[0]
    data = Data(x=x, edge_index=edge_index, vertex=verts, id=torch.tensor(tid), language=lang)
    test_graphs.append(data)

In [312]:
# Predict with the right model, pick argmax of root‐probs
results = []
for data in test_graphs:
    data = data.to(DEVICE)
    model = model_jp if data.language == 'Japanese' else model_all
    model.eval()
    with torch.no_grad():
        out = model(data)               # log‐probs [num_nodes, 2]
        probs = out.exp()[:,1]          # P(root)
    best_idx    = probs.argmax().item()
    best_vertex = data.vertex[best_idx].item()
    results.append({'id': int(data.id.item()), 'root': best_vertex})

In [316]:
submission = pd.DataFrame(results)
submission.to_csv('../../data/submission_NL.csv', index=False)
print(f"Wrote {len(submission)} rows to ../../data/submission_NL.csv")

Wrote 10395 rows to ../../data/submission_NL.csv


In [318]:
sub_gnn = pd.read_csv('../../data/submission.csv')
sub_nl  = pd.read_csv('../../data/submission_NL.csv')

In [319]:
# join on id
cmp = sub_gnn.merge(sub_nl, on='id', suffixes=('_gnn', '_nl'))
cmp['match'] = cmp['root_gnn'] == cmp['root_nl']

# summary
print(f"Agreement rate: {cmp['match'].mean():.3%} ({cmp['match'].sum()}/{len(cmp)})")
# show a few disagreements
print("\nExamples of mismatches:")
print(cmp.loc[~cmp['match']].head())

Agreement rate: 9.649% (1003/10395)

Examples of mismatches:
   id  root_gnn  root_nl  match
0   1         2       17  False
1   2        17       11  False
2   3        15       34  False
5   6        15       19  False
6   7         2        7  False
