In [48]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [49]:
df = pd.read_csv('df_gnn.csv')

In [50]:
df['date'] = pd.to_datetime(df['tourney_date'].astype(str))
df.sort_values('date').reset_index(drop=True, inplace=True)

In [51]:
cut_val  = np.datetime64('2019-01-01')
cut_test = np.datetime64('2020-01-01')
idx_train = np.where(df['date'].values <  cut_val)[0]
idx_val   = np.where((df['date'].values >= cut_val) & (df['date'].values < cut_test))[0]
idx_test  = np.where(df['date'].values >= cut_test)[0]

In [52]:
y = df['target'].astype('int32').values 

In [53]:
for col in ['surface', 'tourney_level', 'player_A_hand', 'player_B_hand']:
    df[col] = df[col].fillna('UNK').astype('category')
surf_ohe   = pd.get_dummies(df['surface'], prefix='surf', drop_first=False)
level_ohe  = pd.get_dummies(df['tourney_level'], prefix='lvl', drop_first=False)
Ahand_ohe  = pd.get_dummies(df['player_A_hand'], prefix='Ah', drop_first=False)
Bhand_ohe  = pd.get_dummies(df['player_B_hand'], prefix='Bh', drop_first=False)

In [54]:
num_cols = ['player_A_ht','player_B_ht','player_A_rank','player_B_rank',
            'player_A_rank_points','player_B_rank_points',
            'A_Glicko1_Rating','B_Glicko1_Rating',
            'A_Glicko1_Surface_Rating','B_Glicko1_Surface_Rating',
            'A_H2H_Streak','B_H2H_Streak',
            'A_H2H_LevelWeighted_Wins','B_H2H_LevelWeighted_Wins',
            'A_Age30','B_Age30','A_AgeInt','B_AgeInt',
            'A_Elo_Overall','B_Elo_Overall','A_Elo_Surface','B_Elo_Surface',
            'best_of']
for c in num_cols:
    miss = df[c].isna().astype('int32')
    df[f'{c}_isna'] = miss
    df[c] = df[c].fillna(df[c].median())

In [55]:
A_ids = df['player_A_id'].astype('int64').values
B_ids = df['player_B_id'].astype('int64').values

In [56]:
all_players = np.unique(np.concatenate([A_ids, B_ids]))
pid2idx = {pid:i for i,pid in enumerate(all_players)}
n_nodes = len(all_players)
A_idx = np.array([pid2idx[p] for p in A_ids], dtype='int64')
B_idx = np.array([pid2idx[p] for p in B_ids], dtype='int64')

In [57]:
X_edge = pd.concat([surf_ohe, level_ohe, df[['best_of']].astype('float32')], axis=1).astype('float32').values

In [58]:
A_dyn_cols = ['player_A_ht','player_A_rank','player_A_rank_points',
              'A_Glicko1_Rating','A_Glicko1_Surface_Rating',
              'A_H2H_Streak','A_H2H_LevelWeighted_Wins',
              'A_Age30','A_AgeInt','A_Elo_Overall','A_Elo_Surface'] + \
             [c for c in df.columns if c.endswith('_isna') and c.startswith('player_A_') or c.startswith('A_')]


In [59]:
B_dyn_cols = ['player_B_ht','player_B_rank','player_B_rank_points',
              'B_Glicko1_Rating','B_Glicko1_Surface_Rating',
              'B_H2H_Streak','B_H2H_LevelWeighted_Wins',
              'B_Age30','B_AgeInt','B_Elo_Overall','B_Elo_Surface'] + \
             [c for c in df.columns if c.endswith('_isna') and c.startswith('player_B_') or c.startswith('B_')]


In [60]:
A_dyn = df[A_dyn_cols].astype('float32').values
B_dyn = df[B_dyn_cols].astype('float32').values

In [61]:
A_hand = Ahand_ohe.astype('float32').values
B_hand = Bhand_ohe.astype('float32').values

In [62]:
A_inp = np.concatenate([A_dyn, A_hand], axis=1).astype('float32')
B_inp = np.concatenate([B_dyn, B_hand], axis=1).astype('float32')

In [63]:
tr, va, te = idx_train, idx_val, idx_test
data_train = (A_idx[tr], B_idx[tr], A_inp[tr], B_inp[tr], X_edge[tr], y[tr])
data_val   = (A_idx[va], B_idx[va], A_inp[va], B_inp[va], X_edge[va], y[va])
data_test  = (A_idx[te], B_idx[te], A_inp[te], B_inp[te], X_edge[te], y[te])

In [64]:
sc_A = StandardScaler().fit(A_inp[tr])
sc_B = StandardScaler().fit(B_inp[tr])
sc_E = StandardScaler().fit(X_edge[tr])

A_inp = sc_A.transform(A_inp)
B_inp = sc_B.transform(B_inp)
X_edge = sc_E.transform(X_edge)

data_train = (A_idx[tr], B_idx[tr], A_inp[tr], B_inp[tr], X_edge[tr], y[tr])
data_val   = (A_idx[va], B_idx[va], A_inp[va], B_inp[va], X_edge[va], y[va])
data_test  = (A_idx[te], B_idx[te], A_inp[te], B_inp[te], X_edge[te], y[te])


In [65]:
import scipy.sparse as sp

src_tr = A_idx[tr]
dst_tr = B_idx[tr]
# Use an undirected graph to share signal both ways; you can try directed as well.
rows = np.concatenate([src_tr, dst_tr])
cols = np.concatenate([dst_tr, src_tr])
vals = np.ones_like(rows, dtype='float32')
A_csr = sp.csr_matrix((vals, (rows, cols)), shape=(n_nodes, n_nodes))

# Row-normalize for mean aggregation
deg = np.asarray(A_csr.sum(axis=1)).ravel()
deg[deg == 0] = 1.0
D_inv = sp.diags(1.0 / deg)
A_norm = D_inv @ A_csr  # simple mean aggregator


In [66]:
import tensorflow as tf

def scipy_csr_to_tf_sparse(mat):
    mat = mat.tocoo().astype(np.float32)
    idx = np.stack([mat.row, mat.col], axis=1)
    return tf.sparse.SparseTensor(indices=idx, values=mat.data, dense_shape=mat.shape)

A_tf = scipy_csr_to_tf_sparse(A_norm)
A_tf = tf.sparse.reorder(A_tf)

class SimpleMPN(tf.keras.layers.Layer):
    def __init__(self, n_nodes, emb_dim=64, mp_hidden=64, mp_layers=2, dropout=0.1):
        super().__init__()
        self.emb = tf.keras.layers.Embedding(n_nodes, emb_dim)
        self.ws_self = [tf.keras.layers.Dense(mp_hidden, use_bias=True) for _ in range(mp_layers)]
        self.ws_nei  = [tf.keras.layers.Dense(mp_hidden, use_bias=False) for _ in range(mp_layers)]
        self.act = tf.keras.layers.ReLU()
        self.drop = tf.keras.layers.Dropout(dropout)
        self.out_dim = mp_hidden

    def call(self, A_sparse, training=False):
        H = self.emb(tf.range(n_nodes))  # (N, emb_dim)
        for W_self, W_nei in zip(self.ws_self, self.ws_nei):
            neigh = tf.sparse.sparse_dense_matmul(A_sparse, H)  # mean aggregated neighbors
            H = self.act(W_self(H) + W_nei(neigh))
            H = self.drop(H, training=training)
        return H  # (N, out_dim)

class EdgeGNN(tf.keras.Model):
    def __init__(self, n_nodes, emb_dim=64, mp_hidden=64, mp_layers=2, mlp_hidden=128, dropout=0.1):
        super().__init__()
        self.mp = SimpleMPN(n_nodes, emb_dim=emb_dim, mp_hidden=mp_hidden, mp_layers=mp_layers, dropout=dropout)
        self.drop = tf.keras.layers.Dropout(dropout)
        self.edge_mlp = tf.keras.Sequential([
            tf.keras.layers.Dense(mlp_hidden, activation='relu'),
            tf.keras.layers.Dropout(dropout),
            tf.keras.layers.Dense(mlp_hidden, activation='relu'),
            tf.keras.layers.Dropout(dropout),
            tf.keras.layers.Dense(1, activation='sigmoid')
        ])

    def call(self, inputs, training=False):
        src, dst, A_feat, B_feat, E_feat, A_sparse = inputs
        H = self.mp(A_sparse, training=training)  # (N, D)
        h_src = tf.gather(H, src)
        h_dst = tf.gather(H, dst)
        z = tf.concat([h_src, h_dst, h_src - h_dst, h_src * h_dst, A_feat, B_feat, E_feat], axis=-1)
        z = self.drop(z, training=training)
        return self.edge_mlp(z, training=training)


In [71]:
def make_ds(Ai, Bi, Af, Bf, Ef, y, batch=4096, shuffle=True):
    ds = tf.data.Dataset.from_tensor_slices((
        {'src': Ai, 'dst': Bi, 'Af': Af, 'Bf': Bf, 'Ef': Ef}, y.astype('float32')
    ))
    if shuffle:
        ds = ds.shuffle(200_000, reshuffle_each_iteration=True)
    ds = ds.batch(batch).prefetch(tf.data.AUTOTUNE)
    return ds

Ai_tr, Bi_tr, Af_tr, Bf_tr, Ef_tr, y_tr = data_train
Ai_va, Bi_va, Af_va, Bf_va, Ef_va, y_va = data_val
Ai_te, Bi_te, Af_te, Bf_te, Ef_te, y_te = data_test

ds_tr = make_ds(Ai_tr, Bi_tr, Af_tr, Bf_tr, Ef_tr, y_tr, batch=4096, shuffle=True)
ds_va = make_ds(Ai_va, Bi_va, Af_va, Bf_va, Ef_va, y_va, batch=8192, shuffle=False)
ds_te = make_ds(Ai_te, Bi_te, Af_te, Bf_te, Ef_te, y_te, batch=8192, shuffle=False)

def attach_adj(ds):
    def _map(features, label):
        return ((features['src'], features['dst'], features['Af'], features['Bf'], features['Ef'], A_tf), label)
    return ds.map(_map)

ds_tr = attach_adj(ds_tr); ds_va = attach_adj(ds_va); ds_te = attach_adj(ds_te)

# Instantiate and train
model = EdgeGNN(n_nodes=n_nodes, emb_dim=256, mp_hidden=256, mp_layers=3, mlp_hidden=512, dropout=0.1)
model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
              loss='binary_crossentropy',
              metrics=[tf.keras.metrics.AUC(name='AUC'), 'accuracy'])


history = model.fit(ds_tr, validation_data=ds_va, epochs=100, verbose=2)

print("Test:")
model.evaluate(ds_te, verbose=2)


Epoch 1/100


1. The `call()` method of your layer may be crashing. Try to `__call__()` the layer eagerly on some test input first to see if it works. E.g. `x = np.random.random((3, 4)); y = layer(x)`
2. If the `call()` method is correct, then you may need to implement the `def build(self, input_shape)` method on your layer. It should create all variables used by the layer (e.g. by calling `layer.build()` on all its children layers).
Exception encountered: ''Only input tensors may be passed as positional arguments. The following argument value should be passed as a keyword argument: 2640 (of type <class 'int'>)''


ValueError: Exception encountered when calling EdgeGNN.call().

[1mOnly input tensors may be passed as positional arguments. The following argument value should be passed as a keyword argument: 2640 (of type <class 'int'>)[0m

Arguments received by EdgeGNN.call():
  • inputs=('tf.Tensor(shape=(None,), dtype=int64)', 'tf.Tensor(shape=(None,), dtype=int64)', 'tf.Tensor(shape=(None, 35), dtype=float32)', 'tf.Tensor(shape=(None, 35), dtype=float32)', 'tf.Tensor(shape=(None, 12), dtype=float32)', 'tf.Tensor(shape=(2640, 2640), dtype=float32)')
  • training=True

In [77]:
# ============================================
# Tennis Match GNN — End-to-end, single script
# Optimizes log loss (binary cross-entropy)
# ============================================

# ---- Imports ----
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import matplotlib.pyplot as plt

# ============================================
# 0) Load your DataFrame `df` with the 31 columns you shared
#    If you already have it in memory, comment the next two lines.
# ============================================
df = pd.read_csv("df_gnn.csv")  # Ensure it has the exact schema you shared

# ============================================
# 1) Chronological split and basic preprocessing (leakage-safe)
# ============================================
assert {'player_A_id','player_B_id','tourney_date','target'}.issubset(df.columns)

# Convert date and sort
df['date'] = pd.to_datetime(df['tourney_date'].astype(str), format='%Y%m%d')
df = df.sort_values('date').reset_index(drop=True)

# Splits: adjust as needed
cut_val  = np.datetime64('2019-01-01')
cut_test = np.datetime64('2020-01-01')
dates = df['date'].values
idx_train = np.where(dates <  cut_val)[0]
idx_val   = np.where((dates >= cut_val) & (dates < cut_test))[0]
idx_test  = np.where(dates >= cut_test)[0]

y = df['target'].astype('int32').values

# Categorical encodings
for col in ['surface', 'tourney_level', 'player_A_hand', 'player_B_hand']:
    df[col] = df[col].fillna('UNK').astype('category')

surf_ohe   = pd.get_dummies(df['surface'], prefix='surf', drop_first=False)
level_ohe  = pd.get_dummies(df['tourney_level'], prefix='lvl', drop_first=False)
Ahand_ohe  = pd.get_dummies(df['player_A_hand'], prefix='Ah', drop_first=False)
Bhand_ohe  = pd.get_dummies(df['player_B_hand'], prefix='Bh', drop_first=False)

# Numeric columns with potential missingness (per-endpoint + context)
A_num = [
    'player_A_ht','player_A_rank','player_A_rank_points',
    'A_Glicko1_Rating','A_Glicko1_Surface_Rating',
    'A_H2H_Streak','A_H2H_LevelWeighted_Wins',
    'A_Age30','A_AgeInt','A_Elo_Overall','A_Elo_Surface'
]
B_num = [
    'player_B_ht','player_B_rank','player_B_rank_points',
    'B_Glicko1_Rating','B_Glicko1_Surface_Rating',
    'B_H2H_Streak','B_H2H_LevelWeighted_Wins',
    'B_Age30','B_AgeInt','B_Elo_Overall','B_Elo_Surface'
]
edge_num = ['best_of']

for c in A_num + B_num + edge_num:
    miss = df[c].isna().astype('int32')
    df[f'{c}_isna'] = miss
    df[c] = df[c].fillna(df[c].median())

# Build per-endpoint dynamic features (numeric + hand one-hot + missing flags)
A_miss = [f'{c}_isna' for c in A_num]
B_miss = [f'{c}_isna' for c in B_num]

A_dyn = df[A_num + A_miss].astype('float32').values
B_dyn = df[B_num + B_miss].astype('float32').values
A_hand = Ahand_ohe.astype('float32').values
B_hand = Bhand_ohe.astype('float32').values

A_inp = np.concatenate([A_dyn, A_hand], axis=1).astype('float32')
B_inp = np.concatenate([B_dyn, B_hand], axis=1).astype('float32')

# Edge/context features: surface/level OHE + numeric + its missing flag
edge_feats = pd.concat([surf_ohe, level_ohe, df[edge_num], df[[f'{c}_isna' for c in edge_num]]], axis=1)
X_edge = edge_feats.astype('float32').values

# Map player ids to contiguous indices
A_ids = df['player_A_id'].astype('int64').values
B_ids = df['player_B_id'].astype('int64').values
all_players = np.unique(np.concatenate([A_ids, B_ids]))
pid2idx = {pid:i for i,pid in enumerate(all_players)}
n_nodes = len(all_players)
A_idx = np.array([pid2idx[p] for p in A_ids], dtype='int64')
B_idx = np.array([pid2idx[p] for p in B_ids], dtype='int64')

# Standardize continuous features using training stats only
tr = idx_train; va = idx_val; te = idx_test

sc_A = StandardScaler().fit(A_inp[tr])
sc_B = StandardScaler().fit(B_inp[tr])
sc_E = StandardScaler().fit(X_edge[tr])

A_inp = sc_A.transform(A_inp).astype('float32')
B_inp = sc_B.transform(B_inp).astype('float32')
X_edge = sc_E.transform(X_edge).astype('float32')

# Final split packs
data_train = (A_idx[tr], B_idx[tr], A_inp[tr], B_inp[tr], X_edge[tr], y[tr])
data_val   = (A_idx[va], B_idx[va], A_inp[va], B_inp[va], X_edge[va], y[va])
data_test  = (A_idx[te], B_idx[te], A_inp[te], B_inp[te], X_edge[te], y[te])

# ============================================
# 2) Build train adjacency (train edges only), row-normalized
# ============================================
def build_norm_adj_from_edges(src_idx, dst_idx, n_nodes, undirected=True):
    if undirected:
        rows = np.concatenate([src_idx, dst_idx])
        cols = np.concatenate([dst_idx, src_idx])
    else:
        rows, cols = src_idx, dst_idx
    vals = np.ones_like(rows, dtype=np.float32)
    A = sp.csr_matrix((vals, (rows, cols)), shape=(n_nodes, n_nodes))
    deg = np.asarray(A.sum(axis=1)).ravel()
    deg[deg == 0] = 1.0
    D_inv = sp.diags(1.0 / deg)
    return D_inv @ A

src_tr = A_idx[tr]; dst_tr = B_idx[tr]
A_norm = build_norm_adj_from_edges(src_tr, dst_tr, n_nodes=n_nodes, undirected=True)

def scipy_csr_to_tf_sparse(mat):
    mat = mat.tocoo().astype(np.float32)
    idx = np.stack([mat.row, mat.col], axis=1)
    st = tf.sparse.SparseTensor(indices=idx, values=mat.data, dense_shape=mat.shape)
    return tf.sparse.reorder(st)

A_tf = scipy_csr_to_tf_sparse(A_norm)

# ============================================
# 3) tf.data pipelines (no adjacency threaded)
# ============================================
def make_ds(Ai, Bi, Af, Bf, Ef, y, batch=4096, shuffle=True):
    X = (Ai, Bi, Af.astype('float32'), Bf.astype('float32'), Ef.astype('float32'))
    y2 = y.astype('float32').reshape(-1, 1)  # match model output shape (B, 1)
    ds = tf.data.Dataset.from_tensor_slices((X, y2))
    if shuffle:
        ds = ds.shuffle(min(len(Ai), 200_000), reshuffle_each_iteration=True)
    return ds.batch(batch).prefetch(tf.data.AUTOTUNE)

Ai_tr, Bi_tr, Af_tr, Bf_tr, Ef_tr, y_tr = data_train
Ai_va, Bi_va, Af_va, Bf_va, Ef_va, y_va = data_val
Ai_te, Bi_te, Af_te, Bf_te, Ef_te, y_te = data_test

ds_tr = make_ds(Ai_tr, Bi_tr, Af_tr, Bf_tr, Ef_tr, y_tr, batch=4096, shuffle=True)
ds_va = make_ds(Ai_va, Bi_va, Af_va, Bf_va, Ef_va, y_va, batch=8192, shuffle=False)
ds_te = make_ds(Ai_te, Bi_te, Af_te, Bf_te, Ef_te, y_te, batch=8192, shuffle=False)

# ============================================
# 4) Model: message passing + edge classifier (log-loss oriented)
# ============================================
class SimpleMPN(tf.keras.layers.Layer):
    def __init__(self, n_nodes, emb_dim=64, mp_hidden=64, mp_layers=2, dropout=0.0):
        super().__init__()
        self.n_nodes = n_nodes
        self.emb = tf.keras.layers.Embedding(n_nodes, emb_dim)
        self.ws_self = [tf.keras.layers.Dense(mp_hidden, use_bias=True) for _ in range(mp_layers)]
        self.ws_nei  = [tf.keras.layers.Dense(mp_hidden, use_bias=False) for _ in range(mp_layers)]
        self.act = tf.keras.layers.ReLU()
        self.drop = tf.keras.layers.Dropout(dropout)
        self.out_dim = mp_hidden if mp_layers > 0 else emb_dim
        self.mp_layers = mp_layers

    def call(self, A_sparse, training=False):
        H = self.emb(tf.range(self.n_nodes))
        if self.mp_layers == 0:
            return H
        for W_self, W_nei in zip(self.ws_self, self.ws_nei):
            neigh = tf.sparse.sparse_dense_matmul(A_sparse, H)  # mean-aggregated neighbors
            H = self.act(W_self(H) + W_nei(neigh))
            H = self.drop(H, training=training)
        return H

class EdgeGNN(tf.keras.Model):
    def __init__(self, n_nodes, A_sparse, emb_dim=64, mp_hidden=64, mp_layers=2, mlp_hidden=256, dropout=0.0):
        super().__init__()
        self.A_sparse = A_sparse
        self.mp = SimpleMPN(n_nodes, emb_dim=emb_dim, mp_hidden=mp_hidden, mp_layers=mp_layers, dropout=dropout)
        self.drop = tf.keras.layers.Dropout(dropout)
        self.edge_mlp = tf.keras.Sequential([
            tf.keras.layers.Dense(mlp_hidden, activation='relu'),
            tf.keras.layers.Dropout(dropout),
            tf.keras.layers.Dense(mlp_hidden, activation='relu'),
            tf.keras.layers.Dropout(dropout),
            tf.keras.layers.Dense(1, activation='sigmoid')
        ])

    def call(self, inputs, training=False):
        src, dst, A_feat, B_feat, E_feat = inputs
        H = self.mp(self.A_sparse, training=training)
        h_src = tf.gather(H, src)
        h_dst = tf.gather(H, dst)
        z = tf.concat([h_src, h_dst, h_src - h_dst, h_src * h_dst, A_feat, B_feat, E_feat], axis=-1)
        z = self.drop(z, training=training)
        logits = self.edge_mlp(z, training=training)   # (B, 1)
        return logits
    
def compile_for_logloss(model, lr=1e-3):
    model.compile(
        optimizer=tf.keras.optimizers.Adam(lr),
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
        metrics=[
            tf.keras.metrics.BinaryCrossentropy(name='log_loss', from_logits=False),
            tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.5),
        ],
    )
    return model

# Instantiate, compile, train, evaluate
tf.keras.backend.clear_session()
model = EdgeGNN(n_nodes=n_nodes, A_sparse=A_tf, emb_dim=64, mp_hidden=64, mp_layers=2, mlp_hidden=256, dropout=0.0)
compile_for_logloss(model, lr=1e-3)

history = model.fit(ds_tr, validation_data=ds_va, epochs=20, verbose=2)
print("Test evaluation:")
model.evaluate(ds_te, verbose=2)

# ============================================
# 5) Double-descent experiments (log-loss)
# ============================================
def capacity_sweep(
    data_train, data_val, data_test, A_tf,
    widths=(8,16,32,64,128,256,512,1024),
    emb_dims=(16,64),
    mp_layers_list=(0,1,2,3),
    mlp_multipliers=(1,),
    epochs=40,
    lr=1e-3,
    seed=7
):
    tf.keras.utils.set_random_seed(seed)
    Ai_tr, Bi_tr, Af_tr, Bf_tr, Ef_tr, y_tr = data_train
    Ai_va, Bi_va, Af_va, Bf_va, Ef_va, y_va = data_val
    Ai_te, Bi_te, Af_te, Bf_te, Ef_te, y_te = data_test

    ds_tr = make_ds(Ai_tr, Bi_tr, Af_tr, Bf_tr, Ef_tr, y_tr, batch=4096, shuffle=True)
    ds_va = make_ds(Ai_va, Bi_va, Af_va, Bf_va, Ef_va, y_va, batch=8192, shuffle=False)
    ds_te = make_ds(Ai_te, Bi_te, Af_te, Bf_te, Ef_te, y_te, batch=8192, shuffle=False)

    rows = []
    for emb_dim in emb_dims:
        for w in widths:
            for mp_layers in mp_layers_list:
                for mult in mlp_multipliers:
                    mlp_hidden = max(4, int(w * mult))
                    tf.keras.backend.clear_session()
                    m = EdgeGNN(n_nodes=n_nodes, A_sparse=A_tf,
                                emb_dim=emb_dim, mp_hidden=w, mp_layers=mp_layers,
                                mlp_hidden=mlp_hidden, dropout=0.0)
                    compile_for_logloss(m, lr=lr)
                    m.fit(ds_tr, epochs=epochs, verbose=0, validation_data=ds_va)
                    tr_loss, tr_log, tr_acc = m.evaluate(ds_tr, verbose=0)
                    va_loss, va_log, va_acc = m.evaluate(ds_va, verbose=0)
                    te_loss, te_log, te_acc = m.evaluate(ds_te, verbose=0)
                    rows.append({
                        'emb_dim': emb_dim, 'width': w, 'mp_layers': mp_layers, 'mlp_hidden': mlp_hidden,
                        'train_log_loss': float(tr_loss), 'val_log_loss': float(va_loss), 'test_log_loss': float(te_loss),
                        'train_acc': float(tr_acc), 'val_acc': float(va_acc), 'test_acc': float(te_acc)
                    })
                    print(f"[cap] emb={emb_dim} w={w} L={mp_layers} mlp={mlp_hidden} -> test_log={te_loss:.4f}")
    return pd.DataFrame(rows)

def size_sweep(
    data_train, data_val, data_test,
    fractions=(0.1,0.2,0.4,0.6,0.8,1.0),
    emb_dim=64, mp_hidden=256, mp_layers=2, mlp_hidden=512,
    epochs=40, lr=1e-3, seed=13
):
    tf.keras.utils.set_random_seed(seed)
    Ai_tr, Bi_tr, Af_tr, Bf_tr, Ef_tr, y_tr = data_train
    Ai_va, Bi_va, Af_va, Bf_va, Ef_va, y_va = data_val
    Ai_te, Bi_te, Af_te, Bf_te, Ef_te, y_te = data_test

    rows = []
    n_tr = len(Ai_tr)
    for frac in fractions:
        k = max(1000, int(n_tr * frac))
        rng = np.random.RandomState(seed)
        idx = rng.choice(n_tr, size=k, replace=False)

        Ai_sub, Bi_sub = Ai_tr[idx], Bi_tr[idx]
        Af_sub, Bf_sub, Ef_sub, y_sub = Af_tr[idx], Bf_tr[idx], Ef_tr[idx], y_tr[idx]

        # Rebuild adjacency from the subsampled training edges
        A_sub = build_norm_adj_from_edges(Ai_sub, Bi_sub, n_nodes=n_nodes, undirected=True)
        A_sub_tf = scipy_csr_to_tf_sparse(A_sub)

        ds_tr = make_ds(Ai_sub, Bi_sub, Af_sub, Bf_sub, Ef_sub, y_sub, batch=4096, shuffle=True)
        ds_va = make_ds(Ai_va, Bi_va, Af_va, Bf_va, Ef_va, y_va, batch=8192, shuffle=False)
        ds_te = make_ds(Ai_te, Bi_te, Af_te, Bf_te, Ef_te, y_te, batch=8192, shuffle=False)

        tf.keras.backend.clear_session()
        m = EdgeGNN(n_nodes=n_nodes, A_sparse=A_sub_tf,
                    emb_dim=emb_dim, mp_hidden=mp_hidden, mp_layers=mp_layers, mlp_hidden=mlp_hidden, dropout=0.0)
        compile_for_logloss(m, lr=lr)
        m.fit(ds_tr, epochs=epochs, verbose=0, validation_data=ds_va)
        tr_loss, tr_log, tr_acc = m.evaluate(ds_tr, verbose=0)
        va_loss, va_log, va_acc = m.evaluate(ds_va, verbose=0)
        te_loss, te_log, te_acc = m.evaluate(ds_te, verbose=0)
        rows.append({
            'fraction': float(frac), 'n_train_edges': int(k),
            'train_log_loss': float(tr_loss), 'val_log_loss': float(va_loss), 'test_log_loss': float(te_loss),
            'train_acc': float(tr_acc), 'val_acc': float(va_acc), 'test_acc': float(te_acc),
            'emb_dim': emb_dim, 'mp_hidden': mp_hidden, 'mp_layers': mp_layers, 'mlp_hidden': mlp_hidden
        })
        print(f"[size] frac={frac:.2f} k={k} -> test_log={te_loss:.4f}")
    return pd.DataFrame(rows)

# ============================================
# 6) Run the sweeps (optional; can be time-consuming)
# ============================================
# cap_results = capacity_sweep(data_train, data_val, data_test, A_tf,
#                              widths=(8,16,32,64,128,256,512,1024),
#                              emb_dims=(16,64),
#                              mp_layers_list=(0,1,2,3),
#                              mlp_multipliers=(1,),
#                              epochs=40, lr=1e-3, seed=7)
#
# size_results = size_sweep(data_train, data_val, data_test,
#                           fractions=(0.1,0.2,0.4,0.6,0.8,1.0),
#                           emb_dim=64, mp_hidden=256, mp_layers=2, mlp_hidden=512,
#                           epochs=40, lr=1e-3, seed=13)

# Example plot for capacity (uncomment after running cap_results)
# mask = (cap_results['mp_layers'] == 2) & (cap_results['emb_dim'] == 64)
# sub = cap_results[mask].sort_values('width')
# plt.figure()
# plt.plot(sub['width'], sub['test_log_loss'], marker='o')
# plt.xscale('log', base=2)
# plt.xlabel('Width (log2 scale)')
# plt.ylabel('Test log loss')
# plt.title('Double Descent: Capacity Sweep (mp_layers=2, emb_dim=64)')
# plt.show()

# Example plot for sample size (uncomment after running size_results)
# ssub = size_results.sort_values('n_train_edges')
# plt.figure()
# plt.plot(ssub['n_train_edges'], ssub['test_log_loss'], marker='o')
# plt.xlabel('# Training edges')
# plt.ylabel('Test log loss')
# plt.title('Double Descent: Sample-Size Sweep')
# plt.show()

# ============================================
# 7) Inference helper for a future match (P(A wins))
#    Provide pre-match A_features, B_features, edge_features in the same order used above.
# ============================================
def predict_match_proba(model, player_A_id, player_B_id, A_features_vec, B_features_vec, edge_features_vec):
    src = np.array([pid2idx[player_A_id]], dtype='int64')
    dst = np.array([pid2idx[player_B_id]], dtype='int64')
    Af = sc_A.transform(np.asarray(A_features_vec, dtype='float32').reshape(1, -1))
    Bf = sc_B.transform(np.asarray(B_features_vec, dtype='float32').reshape(1, -1))
    Ef = sc_E.transform(np.asarray(edge_features_vec, dtype='float32').reshape(1, -1))
    p = model((src, dst, Af.astype('float32'), Bf.astype('float32'), Ef.astype('float32')), training=False).numpy()[0,0]
    return float(p)


Epoch 1/20
15/15 - 3s - 180ms/step - accuracy: 0.6639 - log_loss: 0.6142 - loss: 0.6142 - val_accuracy: 0.6527 - val_log_loss: 0.6309 - val_loss: 0.6309
Epoch 2/20
15/15 - 1s - 46ms/step - accuracy: 0.6843 - log_loss: 0.5861 - loss: 0.5861 - val_accuracy: 0.6549 - val_log_loss: 0.6144 - val_loss: 0.6144
Epoch 3/20
15/15 - 1s - 52ms/step - accuracy: 0.6907 - log_loss: 0.5784 - loss: 0.5784 - val_accuracy: 0.6516 - val_log_loss: 0.6225 - val_loss: 0.6225
Epoch 4/20
15/15 - 1s - 46ms/step - accuracy: 0.6972 - log_loss: 0.5708 - loss: 0.5708 - val_accuracy: 0.6457 - val_log_loss: 0.6295 - val_loss: 0.6295
Epoch 5/20
15/15 - 1s - 40ms/step - accuracy: 0.7021 - log_loss: 0.5629 - loss: 0.5629 - val_accuracy: 0.6435 - val_log_loss: 0.6406 - val_loss: 0.6406
Epoch 6/20
15/15 - 1s - 40ms/step - accuracy: 0.7047 - log_loss: 0.5561 - loss: 0.5561 - val_accuracy: 0.6479 - val_log_loss: 0.6452 - val_loss: 0.6452
Epoch 7/20
15/15 - 1s - 49ms/step - accuracy: 0.7090 - log_loss: 0.5505 - loss: 0.5505 

In [78]:
def capacity_sweep_multi(
    data_train, data_val, data_test, A_tf,
    widths=(8,16,32,64,128,256,512,1024),
    emb_dims=(16,64),
    mp_layers_list=(0,1,2,3),
    mlp_multipliers=(1,),
    epochs=40, lr=1e-3, seeds=(1,2,3,4,5)
):
    Ai_tr, Bi_tr, Af_tr, Bf_tr, Ef_tr, y_tr = data_train
    Ai_va, Bi_va, Af_va, Bf_va, Ef_va, y_va = data_val
    Ai_te, Bi_te, Af_te, Bf_te, Ef_te, y_te = data_test

    rows = []
    for emb_dim in emb_dims:
        for w in widths:
            for mp_layers in mp_layers_list:
                for mult in mlp_multipliers:
                    mlp_hidden = max(4, int(w * mult))
                    for seed in seeds:
                        tf.keras.utils.set_random_seed(seed)
                        ds_tr = make_ds(Ai_tr, Bi_tr, Af_tr, Bf_tr, Ef_tr, y_tr, batch=4096, shuffle=True)
                        ds_va = make_ds(Ai_va, Bi_va, Af_va, Bf_va, Ef_va, y_va, batch=8192, shuffle=False)
                        ds_te = make_ds(Ai_te, Bi_te, Af_te, Bf_te, Ef_te, y_te, batch=8192, shuffle=False)

                        tf.keras.backend.clear_session()
                        m = EdgeGNN(
                            n_nodes=n_nodes, A_sparse=A_tf,
                            emb_dim=emb_dim, mp_hidden=w, mp_layers=mp_layers,
                            mlp_hidden=mlp_hidden, dropout=0.0
                        )
                        compile_for_logloss(m, lr=lr)
                        m.fit(ds_tr, epochs=epochs, verbose=0, validation_data=ds_va)

                        # evaluate returns [loss, log_loss(metric), accuracy]
                        tr_loss, tr_log, tr_acc = m.evaluate(ds_tr, verbose=0)
                        va_loss, va_log, va_acc = m.evaluate(ds_va, verbose=0)
                        te_loss, te_log, te_acc = m.evaluate(ds_te, verbose=0)

                        rows.append({
                            'seed': seed, 'emb_dim': emb_dim, 'width': w, 'mp_layers': mp_layers, 'mlp_hidden': mlp_hidden,
                            'train_log_loss': float(tr_log), 'val_log_loss': float(va_log), 'test_log_loss': float(te_log),
                            'train_acc': float(tr_acc), 'val_acc': float(va_acc), 'test_acc': float(te_acc)
                        })
                        print(f"[cap] seed={seed} emb={emb_dim} w={w} L={mp_layers} mlp={mlp_hidden} -> test_log={te_log:.4f}")
    return pd.DataFrame(rows)


In [79]:
def summarize_capacity(df, emb_dim=64, mp_layers=2):
    sub = df[(df['emb_dim']==emb_dim) & (df['mp_layers']==mp_layers)].copy()
    g = sub.groupby('width')['test_log_loss']
    summary = g.agg(['mean','std','count']).reset_index()
    # 95% CI assuming normality across seeds
    summary['sem'] = summary['std'] / np.sqrt(summary['count'].clip(lower=1))
    summary['ci95'] = 1.96 * summary['sem']
    # finite differences on the mean curve (log2 spacing is okay for widths)
    summary = summary.sort_values('width')
    diffs = np.diff(summary['mean'].values)
    widths_arr = summary['width'].values
    return summary, diffs, widths_arr

def has_double_descent(summary):
    # A minimal criterion: exists i<j<k with mean[i] < mean[j] > mean[k],
    # and both rises/drops exceed the local CI bands.
    m = summary['mean'].values
    c = summary['ci95'].values
    n = len(m)
    found = False
    where = None
    for j in range(1, n-1):
        if m[j] - max(m[j-1], m[j+1]) > max(c[j], c[j-1], c[j+1]):  # peak beyond uncertainty
            if (m[j] > m[j-1]) and (m[j] > m[j+1]):
                found = True
                where = j
                break
    return found, where


In [80]:
def plot_capacity(summary, train_summary=None, title="Double Descent: Capacity Sweep"):
    import matplotlib.pyplot as plt
    x = summary['width'].values
    y = summary['mean'].values
    ci = summary['ci95'].values
    plt.figure()
    plt.fill_between(x, y-ci, y+ci, alpha=0.2, linewidth=0)
    plt.plot(x, y, marker='o')
    if train_summary is not None:
        plt.plot(train_summary['width'].values, train_summary['mean'].values, marker='x', linestyle='--')
    plt.xscale('log', base=2)
    plt.xlabel('Width (log2 scale)')
    plt.ylabel('Test log loss')
    plt.title(title)
    plt.show()


In [81]:
# ============================================
# Double Descent — multi-seed sweeps, summaries, CI plots, detection
# Assumes your previous cells defined:
#   - data_train, data_val, data_test
#   - A_tf, n_nodes
#   - make_ds, build_norm_adj_from_edges, scipy_csr_to_tf_sparse
#   - EdgeGNN, compile_for_logloss
# ============================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

# ---------- Capacity sweep across multiple seeds ----------
def capacity_sweep_multi(
    data_train, data_val, data_test, A_tf,
    widths=(8,16,32,64,128,256,512,1024),
    emb_dims=(16,64),
    mp_layers_list=(0,1,2,3),
    mlp_multipliers=(1,),
    epochs=40, lr=1e-3, seeds=(1,2,3,4,5),
    batch_tr=4096, batch_eval=8192, dropout=0.0
):
    Ai_tr, Bi_tr, Af_tr, Bf_tr, Ef_tr, y_tr = data_train
    Ai_va, Bi_va, Af_va, Bf_va, Ef_va, y_va = data_val
    Ai_te, Bi_te, Af_te, Bf_te, Ef_te, y_te = data_test

    rows = []
    for emb_dim in emb_dims:
        for w in widths:
            for mp_layers in mp_layers_list:
                for mult in mlp_multipliers:
                    mlp_hidden = max(4, int(w * mult))
                    for seed in seeds:
                        tf.keras.utils.set_random_seed(seed)

                        ds_tr = make_ds(Ai_tr, Bi_tr, Af_tr, Bf_tr, Ef_tr, y_tr, batch=batch_tr, shuffle=True)
                        ds_va = make_ds(Ai_va, Bi_va, Af_va, Bf_va, Ef_va, y_va, batch=batch_eval, shuffle=False)
                        ds_te = make_ds(Ai_te, Bi_te, Af_te, Bf_te, Ef_te, y_te, batch=batch_eval, shuffle=False)

                        tf.keras.backend.clear_session()
                        m = EdgeGNN(
                            n_nodes=n_nodes, A_sparse=A_tf,
                            emb_dim=emb_dim, mp_hidden=w, mp_layers=mp_layers,
                            mlp_hidden=mlp_hidden, dropout=dropout
                        )
                        compile_for_logloss(m, lr=lr)
                        m.fit(ds_tr, epochs=epochs, verbose=0, validation_data=ds_va)

                        # evaluate() returns: [loss, log_loss(metric), accuracy] in our compile order
                        tr_loss, tr_log, tr_acc = m.evaluate(ds_tr, verbose=0)
                        va_loss, va_log, va_acc = m.evaluate(ds_va, verbose=0)
                        te_loss, te_log, te_acc = m.evaluate(ds_te, verbose=0)

                        rows.append({
                            'seed': seed, 'emb_dim': emb_dim, 'width': w, 'mp_layers': mp_layers,
                            'mlp_hidden': mlp_hidden, 'dropout': dropout, 'epochs': epochs, 'lr': lr,
                            'train_log_loss': float(tr_log), 'val_log_loss': float(va_log), 'test_log_loss': float(te_log),
                            'train_acc': float(tr_acc), 'val_acc': float(va_acc), 'test_acc': float(te_acc)
                        })
                        print(f"[cap] seed={seed} emb={emb_dim} w={w} L={mp_layers} mlp={mlp_hidden} -> test_log={te_log:.4f}")
    return pd.DataFrame(rows)

# ---------- Summaries, CI, and detection for capacity ----------
def summarize_capacity(df, emb_dim=64, mp_layers=2):
    sub = df[(df['emb_dim']==emb_dim) & (df['mp_layers']==mp_layers)].copy()
    sub = sub.sort_values(['width', 'seed'])
    g_test = sub.groupby('width')['test_log_loss']
    g_train = sub.groupby('width')['train_log_loss']

    def _agg(g):
        out = g.agg(['mean','std','count']).reset_index()
        out['sem'] = out['std'] / np.sqrt(out['count'].clip(lower=1))
        out['ci95'] = 1.96 * out['sem']
        return out

    test_summary = _agg(g_test)
    train_summary = _agg(g_train).rename(columns={'mean':'mean_train','std':'std_train','count':'count_train','sem':'sem_train','ci95':'ci95_train'})
    merged = pd.merge(test_summary, train_summary[['width','mean_train','ci95_train']], on='width', how='left')
    merged = merged.sort_values('width').reset_index(drop=True)
    return merged

def has_double_descent(summary_df):
    # Minimal criterion: exists j with mean[j] > mean[j-1] and mean[j] > mean[j+1],
    # and the peak height exceeds local uncertainty bands.
    m = summary_df['mean'].values
    c = summary_df['ci95'].values
    n = len(m)
    for j in range(1, n-1):
        rise = m[j] - m[j-1]
        drop = m[j] - m[j+1]
        if rise > 0 and drop > 0:
            # Require peak above neighbors by more than average of CIs in the triplet
            thr = max(c[j], c[j-1], c[j+1])
            if min(rise, drop) > thr:
                return True, j
    return False, None

def plot_capacity(summary_df, title="Capacity sweep: Test log loss vs width"):
    x = summary_df['width'].values
    y = summary_df['mean'].values
    ci = summary_df['ci95'].values
    ytr = summary_df['mean_train'].values
    citr = summary_df.get('ci95_train', pd.Series(np.zeros_like(y))).values

    plt.figure()
    plt.fill_between(x, y-ci, y+ci, alpha=0.2, linewidth=0)
    plt.plot(x, y, marker='o', label='Test log loss')
    plt.fill_between(x, ytr-citr, ytr+citr, alpha=0.15, linewidth=0)
    plt.plot(x, ytr, marker='x', linestyle='--', label='Train log loss')
    plt.xscale('log', base=2)
    plt.xlabel('Width (log2 scale)')
    plt.ylabel('Log loss')
    plt.title(title)
    plt.legend()
    plt.show()

# ---------- Sample-size sweep across multiple seeds ----------
def size_sweep_multi(
    data_train, data_val, data_test,
    fractions=(0.1,0.2,0.4,0.6,0.8,1.0),
    emb_dim=64, mp_hidden=256, mp_layers=2, mlp_hidden=512,
    epochs=40, lr=1e-3, seeds=(1,2,3,4,5),
    batch_tr=4096, batch_eval=8192, dropout=0.0
):
    Ai_tr, Bi_tr, Af_tr, Bf_tr, Ef_tr, y_tr = data_train
    Ai_va, Bi_va, Af_va, Bf_va, Ef_va, y_va = data_val
    Ai_te, Bi_te, Af_te, Bf_te, Ef_te, y_te = data_test

    rows = []
    n_tr = len(Ai_tr)
    for seed in seeds:
        rng = np.random.RandomState(seed)
        for frac in fractions:
            k = max(1000, int(n_tr * frac))
            idx = rng.choice(n_tr, size=k, replace=False)

            Ai_sub, Bi_sub = Ai_tr[idx], Bi_tr[idx]
            Af_sub, Bf_sub, Ef_sub, y_sub = Af_tr[idx], Bf_tr[idx], Ef_tr[idx], y_tr[idx]

            # Rebuild adjacency from subsampled training edges
            A_sub = build_norm_adj_from_edges(Ai_sub, Bi_sub, n_nodes=n_nodes, undirected=True)
            A_sub_tf = scipy_csr_to_tf_sparse(A_sub)

            ds_tr = make_ds(Ai_sub, Bi_sub, Af_sub, Bf_sub, Ef_sub, y_sub, batch=batch_tr, shuffle=True)
            ds_va = make_ds(Ai_va, Bi_va, Af_va, Bf_va, Ef_va, y_va, batch=batch_eval, shuffle=False)
            ds_te = make_ds(Ai_te, Bi_te, Af_te, Bf_te, Ef_te, y_te, batch=batch_eval, shuffle=False)

            tf.keras.backend.clear_session()
            m = EdgeGNN(n_nodes=n_nodes, A_sparse=A_sub_tf,
                        emb_dim=emb_dim, mp_hidden=mp_hidden, mp_layers=mp_layers,
                        mlp_hidden=mlp_hidden, dropout=dropout)
            compile_for_logloss(m, lr=lr)
            m.fit(ds_tr, epochs=epochs, verbose=0, validation_data=ds_va)
            tr_loss, tr_log, tr_acc = m.evaluate(ds_tr, verbose=0)
            va_loss, va_log, va_acc = m.evaluate(ds_va, verbose=0)
            te_loss, te_log, te_acc = m.evaluate(ds_te, verbose=0)

            rows.append({
                'seed': seed, 'fraction': float(frac), 'n_train_edges': int(k),
                'emb_dim': emb_dim, 'mp_hidden': mp_hidden, 'mp_layers': mp_layers, 'mlp_hidden': mlp_hidden,
                'epochs': epochs, 'lr': lr, 'dropout': dropout,
                'train_log_loss': float(tr_log), 'val_log_loss': float(va_log), 'test_log_loss': float(te_log),
                'train_acc': float(tr_acc), 'val_acc': float(va_acc), 'test_acc': float(te_acc),
            })
            print(f"[size] seed={seed} frac={frac:.2f} k={k} -> test_log={te_log:.4f}")
    return pd.DataFrame(rows)

def summarize_size(df):
    sub = df.copy()
    sub = sub.sort_values(['n_train_edges', 'seed'])
    g_test = sub.groupby('n_train_edges')['test_log_loss']
    g_train = sub.groupby('n_train_edges')['train_log_loss']

    def _agg(g):
        out = g.agg(['mean','std','count']).reset_index()
        out['sem'] = out['std'] / np.sqrt(out['count'].clip(lower=1))
        out['ci95'] = 1.96 * out['sem']
        return out

    test_summary = _agg(g_test)
    train_summary = _agg(g_train).rename(columns={'mean':'mean_train','std':'std_train','count':'count_train','sem':'sem_train','ci95':'ci95_train'})
    merged = pd.merge(test_summary, train_summary[['n_train_edges','mean_train','ci95_train']], on='n_train_edges', how='left')
    merged = merged.sort_values('n_train_edges').reset_index(drop=True)
    return merged

def plot_size(summary_df, title="Sample-size sweep: Test log loss vs # train edges"):
    x = summary_df['n_train_edges'].values
    y = summary_df['mean'].values
    ci = summary_df['ci95'].values
    ytr = summary_df['mean_train'].values
    citr = summary_df.get('ci95_train', pd.Series(np.zeros_like(y))).values

    plt.figure()
    plt.fill_between(x, y-ci, y+ci, alpha=0.2, linewidth=0)
    plt.plot(x, y, marker='o', label='Test log loss')
    plt.fill_between(x, ytr-citr, ytr+citr, alpha=0.15, linewidth=0)
    plt.plot(x, ytr, marker='x', linestyle='--', label='Train log loss')
    plt.xlabel('# Training edges')
    plt.ylabel('Log loss')
    plt.title(title)
    plt.legend()
    plt.show()

# ---------- Example usage ----------
# You can narrow the grids first to confirm wiring, then expand.
# Comment/uncomment as needed.

# Capacity sweep run (quick example settings)
cap_df = capacity_sweep_multi(
    data_train, data_val, data_test, A_tf,
    widths=(16,32,64,128,256,512),
    emb_dims=(64,),
    mp_layers_list=(0,1,2),
    mlp_multipliers=(1,),
    epochs=30, lr=1e-3, seeds=(1,2,3), dropout=0.0
)
cap_df.to_csv("cap_results_multi.csv", index=False)

cap_sum = summarize_capacity(cap_df, emb_dim=64, mp_layers=2)
flag, peak_idx = has_double_descent(cap_sum)
print(f"Double descent detected (capacity)? {flag}. Peak index: {peak_idx}.")
plot_capacity(cap_sum, title="Capacity sweep (mp_layers=2, emb_dim=64)")

# Sample-size sweep run (quick example settings)
size_df = size_sweep_multi(
    data_train, data_val, data_test,
    fractions=(0.2,0.4,0.6,0.8,1.0),
    emb_dim=64, mp_hidden=256, mp_layers=2, mlp_hidden=512,
    epochs=30, lr=1e-3, seeds=(1,2,3), dropout=0.0
)
size_df.to_csv("size_results_multi.csv", index=False)

size_sum = summarize_size(size_df)
# A loose check for a "bump then drop" as n_train_edges increases:
# We can reuse the same detector by renaming columns
tmp = size_sum.rename(columns={'n_train_edges':'width'})[['width','mean','ci95','mean_train','ci95_train']]
flag_size, peak_idx_size = has_double_descent(tmp)
print(f"Double descent detected (sample size)? {flag_size}. Peak index: {peak_idx_size}.")
plot_size(size_sum, title="Sample-size sweep")


[cap] seed=1 emb=64 w=16 L=0 mlp=16 -> test_log=0.8733
[cap] seed=2 emb=64 w=16 L=0 mlp=16 -> test_log=0.8516
[cap] seed=3 emb=64 w=16 L=0 mlp=16 -> test_log=0.8477
[cap] seed=1 emb=64 w=16 L=1 mlp=16 -> test_log=0.9070
[cap] seed=2 emb=64 w=16 L=1 mlp=16 -> test_log=0.8384
[cap] seed=3 emb=64 w=16 L=1 mlp=16 -> test_log=0.8024
[cap] seed=1 emb=64 w=16 L=2 mlp=16 -> test_log=0.9061
[cap] seed=2 emb=64 w=16 L=2 mlp=16 -> test_log=0.8674
[cap] seed=3 emb=64 w=16 L=2 mlp=16 -> test_log=0.9470
[cap] seed=1 emb=64 w=32 L=0 mlp=32 -> test_log=0.9376
[cap] seed=2 emb=64 w=32 L=0 mlp=32 -> test_log=0.9008
[cap] seed=3 emb=64 w=32 L=0 mlp=32 -> test_log=0.9584
[cap] seed=1 emb=64 w=32 L=1 mlp=32 -> test_log=0.9206
[cap] seed=2 emb=64 w=32 L=1 mlp=32 -> test_log=0.9514
[cap] seed=3 emb=64 w=32 L=1 mlp=32 -> test_log=0.9101
[cap] seed=1 emb=64 w=32 L=2 mlp=32 -> test_log=0.9872
[cap] seed=2 emb=64 w=32 L=2 mlp=32 -> test_log=1.0192
[cap] seed=3 emb=64 w=32 L=2 mlp=32 -> test_log=1.0913
[cap] seed

KeyboardInterrupt: 