In [21]:
# %pip install torch torch_geometric pytorch_lightning wandb scikit-learn

In [22]:
import torch
from torch_geometric.data import Data
import pandas as pd
import wandb
from sklearn.preprocessing import MultiLabelBinarizer
import ast
import numpy as np

pd.set_option('display.max_columns', None)

# Load the data

In [23]:
df = pd.read_csv('../get_lineup_target_score/nba_with_lineup_score.csv')
pre_df = pd.read_csv('../pre_season/pre_lineup_vs_lineup.csv')

In [24]:
pre_df.fillna(0, inplace=True)

In [25]:
df.dropna(subset=['net_score'], inplace=True)
df = df[~df['net_score'].isin([np.inf, -np.inf])]

In [26]:
df.size

91200

In [27]:
all_lineups = pd.concat([df['home_lineup'], df['away_lineup']]).unique()
pre_df_lineups = pd.concat([pre_df['home_lineup'], pre_df['away_lineup']]).unique()
filtered_lineups = [lineup for lineup in all_lineups if lineup in pre_df_lineups]
all_lineups = np.array(filtered_lineups)

In [28]:
df = df[df['home_lineup'].isin(all_lineups) & df['away_lineup'].isin(all_lineups)]
pre_df = pre_df[pre_df['home_lineup'].isin(all_lineups) & pre_df['away_lineup'].isin(all_lineups)]

In [29]:
lineups_df = pd.DataFrame(all_lineups, columns=['lineup'])
lineups_df["3pt_made"] = 0
lineups_df["points"] = 0
lineups_df["assists"] = 0
lineups_df["def_rebounds"] = 0
lineups_df["off_rebounds"] = 0
lineups_df["fouls"] = 0
lineups_df["2pt_made"] = 0
lineups_df["turnovers"] = 0
lineups_df["ft_made"] = 0
lineups_df["steals"] = 0
lineups_df["blocks"] = 0

In [30]:
cols = ['3pt_made', 'points', 'assists', 'def_rebounds', 'off_rebounds', 'fouls', '2pt_made', 'turnovers', 'ft_made', 'steals', 'blocks']
for index, row in pre_df.iterrows():
    home_lineup = row['home_lineup']
    away_lineup = row['away_lineup']
    
    for col in cols:
        home_col = 'home_' + col
        away_col = 'away_' + col
        if home_col in row and away_col in row:
            lineups_df.loc[lineups_df['lineup'] == home_lineup, col] += row[home_col]
            lineups_df.loc[lineups_df['lineup'] == away_lineup, col] += row[away_col]
        else:
            print(f'Column {home_col} or {away_col} not found in row')

In [31]:
def create_lineup_graph(df, pre_df):
    all_lineups = pd.concat([df['home_lineup'], df['away_lineup']]).unique()
    pre_df_lineups = pd.concat([pre_df['home_lineup'], pre_df['away_lineup']]).unique()
    filtered_lineups = [lineup for lineup in all_lineups if lineup in pre_df_lineups]
    all_lineups = np.array(filtered_lineups)

    df = df[df['home_lineup'].isin(all_lineups) & df['away_lineup'].isin(all_lineups)]
    pre_df = pre_df[pre_df['home_lineup'].isin(all_lineups) & pre_df['away_lineup'].isin(all_lineups)]

    lineups_df = pd.DataFrame(all_lineups, columns=['lineup'])
    lineups_df["3pt_made"] = 0
    lineups_df["points"] = 0
    lineups_df["assists"] = 0
    lineups_df["def_rebounds"] = 0
    lineups_df["off_rebounds"] = 0
    lineups_df["fouls"] = 0
    lineups_df["2pt_made"] = 0
    lineups_df["turnovers"] = 0
    lineups_df["ft_made"] = 0
    lineups_df["steals"] = 0
    lineups_df["blocks"] = 0

    cols = ['3pt_made', 'points', 'assists', 'def_rebounds', 'off_rebounds', 'fouls', '2pt_made', 'turnovers', 'ft_made', 'steals', 'blocks']
    for index, row in pre_df.iterrows():
        home_lineup = row['home_lineup']
        away_lineup = row['away_lineup']
        
        for col in cols:
            home_col = 'home_' + col
            away_col = 'away_' + col
            if home_col in row and away_col in row:
                lineups_df.loc[lineups_df['lineup'] == home_lineup, col] += row[home_col]
                lineups_df.loc[lineups_df['lineup'] == away_lineup, col] += row[away_col]
            else:
                print(f'Column {home_col} or {away_col} not found in row')
    
    # Create unique lineup nodes
    lineup2idx = {lineup: idx for idx, lineup in enumerate(lineups_df['lineup'])}
    
    # Create node features based on lineups_df
    x = torch.tensor(lineups_df.iloc[:, 1:].values, dtype=torch.float)
    
    # Create directed edges with score-based direction
    edge_index = []
    edge_attr = []
    
    for _, row in df.iterrows():
        home = row['home_lineup']
        away = row['away_lineup']
        home_idx = lineup2idx[home]
        away_idx = lineup2idx[away]
        
        # Determine edge direction based on normalized scores
        if row['normalized_home_score'] > row['normalized_away_score']:
            src, dst = home_idx, away_idx
        else:
            src, dst = away_idx, home_idx
            
        edge_index.append([src, dst])
        edge_attr.append(abs(row['net_score']))
    
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    edge_attr = torch.tensor(edge_attr, dtype=torch.float).unsqueeze(1)
    edge_attr = (edge_attr - edge_attr.mean()) / (edge_attr.std() + 1e-8)
    
    return Data(
        x=x,
        edge_index=edge_index,
        edge_attr=edge_attr,
        num_nodes=len(lineups_df))

In [32]:
data = create_lineup_graph(df, pre_df)

In [33]:
data.x.isnan().any()

tensor(False)

# GNN Model

In [34]:
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric.nn import GINEConv
from torch_geometric.loader import DataLoader
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl

In [35]:
class GNNModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GINEConv(
            nn.Sequential(
                nn.Linear(input_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, hidden_dim),
                nn.ReLU()
            ),
            train_eps=True,
            edge_dim=1  # Edge attribute dimension (scalar)
        )
        self.conv2 = GINEConv(
            nn.Sequential(
                nn.Linear(hidden_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, output_dim),
                nn.ReLU()
            ),
            train_eps=True,
            edge_dim=1
        )
    
    def forward(self, x, edge_index, edge_attr):
        x = self.conv1(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.conv2(x, edge_index, edge_attr)
        return x

# Link Prediction Decoder with Edge Attributes
class LinkPredictor(nn.Module):
    def __init__(self, node_emb_dim):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(2 * node_emb_dim + 1, node_emb_dim),  # +1 for edge_attr
            nn.ReLU(),
            nn.Linear(node_emb_dim, 1),
            nn.Sigmoid()
        )
    
    def forward(self, u_emb, v_emb, edge_attr):
        concatenated = torch.cat([u_emb, v_emb, edge_attr], dim=1)
        return self.mlp(concatenated).squeeze()

# Lightning Module for Training
class LitGNN(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim=64, output_dim=64):
        super().__init__()
        self.gnn = GNNModel(input_dim, hidden_dim, output_dim)
        self.link_predictor = LinkPredictor(output_dim)
        self.loss_fn = nn.BCELoss()
    
    def forward(self, x, edge_index, edge_attr):
        return self.gnn(x, edge_index, edge_attr)
    
    def training_step(self, batch, batch_idx):
        x, edge_index, edge_attr = batch.x, batch.edge_index, batch.edge_attr
        edge_label_index = batch.edge_label_index
        edge_label_attr = batch.edge_label_attr
        edge_label = batch.edge_label
        
        node_emb = self.gnn(x, edge_index, edge_attr)
        u_emb = node_emb[edge_label_index[0]]
        v_emb = node_emb[edge_label_index[1]]
        preds = self.link_predictor(u_emb, v_emb, edge_label_attr)
        
        loss = self.loss_fn(preds, edge_label)
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, edge_index, edge_attr = batch.x, batch.edge_index, batch.edge_attr
        edge_label_index = batch.edge_label_index
        edge_label_attr = batch.edge_label_attr
        edge_label = batch.edge_label
        
        node_emb = self.gnn(x, edge_index, edge_attr)
        u_emb = node_emb[edge_label_index[0]]
        v_emb = node_emb[edge_label_index[1]]
        preds = self.link_predictor(u_emb, v_emb, edge_label_attr)
        
        loss = self.loss_fn(preds, edge_label)
        self.log('val_loss', loss)
        acc = ((preds > 0.5).float() == edge_label).float().mean()
        self.log('val_acc', acc)
        return loss
    
    def test_step(self, batch, batch_idx):
        x, edge_index, edge_attr = batch.x, batch.edge_index, batch.edge_attr
        edge_label_index = batch.edge_label_index
        edge_label_attr = batch.edge_label_attr
        edge_label = batch.edge_label
        
        node_emb = self.gnn(x, edge_index, edge_attr)
        u_emb = node_emb[edge_label_index[0]]
        v_emb = node_emb[edge_label_index[1]]
        preds = self.link_predictor(u_emb, v_emb, edge_label_attr)
        
        loss = self.loss_fn(preds, edge_label)
        self.log('test_loss', loss)
        acc = ((preds > 0.5).float() == edge_label).float().mean()
        self.log('test_acc', acc)
        return loss
    
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters())

In [36]:
class LineupGINE(pl.LightningModule):
    def __init__(self,
                 input_dim,
                 edge_feature_dim,
                 hidden_dim=32,
                 lr=0.001,
                 num_conv_layers=2,
                 num_linear_layers=2,
                 dropout=0.5):
        super(LineupGINE, self).__init__()
        self.save_hyperparameters()
        self.lr = lr

        # Convolutional layers
        self.conv_layers = nn.ModuleList()
        for i in range(num_conv_layers):
            in_channels = input_dim if i == 0 else hidden_dim
            mlp = nn.Sequential(
                nn.Linear(in_channels, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, hidden_dim)
            )
            self.conv_layers.append(
                GINEConv(mlp,
                         edge_dim=edge_feature_dim)
            )

        self.dropout = nn.Dropout(dropout)
            
        # Decoder linear layers
        layers = []
        current_dim = 2 * hidden_dim  # Concatenated embeddings from src and tgt
        for i in range(num_linear_layers - 1):
            layers.append(nn.Linear(current_dim, hidden_dim))
            layers.append(nn.ReLU())
            current_dim = hidden_dim
        layers.append(nn.Linear(hidden_dim, 1))
        self.decoder = nn.Sequential(*layers)

    def forward(self, data):
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        for conv in self.conv_layers:
            x = conv(x, edge_index, edge_attr)
            x = F.relu(x)

        x = self.dropout(x)
        return x
    
    def training_step(self, batch, batch_idx):
        embeddings = self(batch)

        src, tgt = batch.edge_index
        print(f"Source: {src.isnan().any()}, Target: {tgt.isnan().any()}")
        src_embeddings = embeddings[src]
        tgt_embeddings = embeddings[tgt]
        edge_feature_input = torch.cat([src_embeddings, tgt_embeddings], dim=1)

        # print(f"Source embeddings: {src_embeddings}")
        # print(f"Target embeddings: {tgt_embeddings}")

        pred_scores = self.decoder(edge_feature_input)

        # print(f"Predicted scores: {pred_scores}")
        # print(f"Actual scores: {batch.edge_attr}")

        loss = F.mse_loss(pred_scores, batch.edge_attr)
        self.log('train_loss', loss, prog_bar=True, logger=True)
        
        return loss
    
    def validation_step(self, batch, batch_idx):
        embeddings = self(batch)

        src, tgt = batch.edge_index
        src_embeddings = embeddings[src]
        tgt_embeddings = embeddings[tgt]
        edge_feature_input = torch.cat([src_embeddings, tgt_embeddings], dim=1)
        pred_scores = self.decoder(edge_feature_input)
        loss = F.mse_loss(pred_scores, batch.edge_attr)
        self.log('val_loss', loss, prog_bar=True, logger=True)
        return loss
    
    def test_step(self, batch, batch_idx):
        embeddings = self(batch)

        src, tgt = batch.edge_index
        src_embeddings = embeddings[src]
        tgt_embeddings = embeddings[tgt]
        edge_feature_input = torch.cat([src_embeddings, tgt_embeddings], dim=1)
        pred_scores = self.decoder(edge_feature_input)
        loss = F.mse_loss(pred_scores, batch.edge_attr)
        self.log('test_loss', loss, prog_bar=True, logger=True)
        return loss
    
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)


In [37]:
# Function to add reversed edges as negatives to a Data object
def add_reversed_negatives(data):
    if data.edge_index.size(1) == 0:
        return data  # Skip if no edges
    original_edges = data.edge_index
    reversed_edges = torch.flip(original_edges, dims=[0])
    
    # Combine original and reversed edges
    edge_label_index = torch.cat([original_edges, reversed_edges], dim=1)
    edge_label = torch.cat([
        torch.ones(original_edges.size(1)),
        torch.zeros(reversed_edges.size(1))
    ])
    
    # Use original edge_attr for both directions
    edge_label_attr = torch.cat([data.edge_attr, data.edge_attr], dim=0)
    
    # Shuffle the combined data
    perm = torch.randperm(edge_label_index.size(1))
    edge_label_index = edge_label_index[:, perm]
    edge_label_attr = edge_label_attr[perm]
    edge_label = edge_label[perm]
    
    data.edge_label_index = edge_label_index
    data.edge_label_attr = edge_label_attr
    data.edge_label = edge_label
    return data

In [38]:
batch_size = 1
train_data, val_data, test_data = RandomLinkSplit(num_val=0.1, num_test=0.1)(data)
train_data = add_reversed_negatives(train_data)
val_data = add_reversed_negatives(val_data)
test_data = add_reversed_negatives(test_data)
train_loader = DataLoader([train_data], batch_size=batch_size, shuffle=True)
val_loader = DataLoader([val_data], batch_size=batch_size, shuffle=False)
test_loader = DataLoader([test_data], batch_size=batch_size, shuffle=False)

In [39]:
model = LitGNN(input_dim=data.x.shape[1])

trainer = pl.Trainer(
    max_epochs=10,
    gradient_clip_val=1.0
)

trainer.fit(model, train_loader, val_loader)
trainer.test(model, test_loader)

You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name           | Type          | Params | Mode 
---------------------------------------------------------
0 | gnn            | GNNModel      | 13.7 K | train
1 | link_predictor | LinkPredictor | 8.4 K  | train
2 | loss_fn        | BCELoss       | 0      | train
---------------------------------------------------------
22.0 K    Trainable params
0         Non-trainable params
22.0 K    Total params
0.088     Total estimated model params size (MB)
26        Modules in train mode
0         Modules in eval mode

  | Name           | Type          | Params | Mode 
---------------------------------------------------------
0 | gnn         

                                                                           

c:\Users\rokaa\egyetem\basketball_lineup_analysis\.venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
c:\Users\rokaa\egyetem\basketball_lineup_analysis\.venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
c:\Users\rokaa\egyetem\basketball_lineup_analysis\.venv\lib\site-packages\pytorch_lightning\loops\fit_loop.py:310: The number of training batches (1) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 9: 100%|██████████| 1/1 [00:00<00:00,  7.46it/s, v_num=51]

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 1/1 [00:00<00:00,  6.44it/s, v_num=51]

c:\Users\rokaa\egyetem\basketball_lineup_analysis\.venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.



Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 38.13it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc            0.7005689740180969
        test_loss           0.6319311857223511
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc            0.7005689740180969
        test_loss           0.6319311857223511
─────────────────────────────────────────────────

[{'test_loss': 0.6319311857223511, 'test_acc': 0.7005689740180969}]

In [16]:
sweep_config = {
    'method': 'bayes',  # bayes, grid, or random
    'metric': {
        'name': 'val_loss',
        'goal': 'minimize'   
    },
    'parameters': {
        'hidden_dim': {
            'values': [32, 64, 128]
        },
        'num_conv_layers': {
            'values': [2, 3, 4]
        },
        'num_linear_layers': {
            'values': [1, 2, 3]
        },
        'lr': {
            'distribution': 'log_uniform',
            'min': 1e-4,
            'max': 1e-2
        },
        'dropout': {
            'values': [0.0, 0.2, 0.4]
        }
    }
}

In [17]:
def train_sweep():
    with wandb.init() as run:
        config = wandb.config
        model = LineupGINE(
            input_dim=10,  # Number of features per lineup
            edge_feature_dim=1,  # Number of features per edge (e.g., score difference)
            hidden_dim=config.hidden_dim,
            lr=config.lr,
            num_conv_layers=config.num_conv_layers,
            num_linear_layers=config.num_linear_layers,
            dropout=config.dropout
        )

        trainer = pl.Trainer(
            max_epochs=50,
            logger=pl.loggers.WandbLogger(),
            callbacks=[
                pl.callbacks.EarlyStopping(monitor='val_loss', patience=10),
                pl.callbacks.ModelCheckpoint(monitor='val_loss')
            ]
        )
        trainer.fit(model, train_loader, val_loader)
        trainer.test(model, test_loader)
        wandb.finish()