In [36]:
import os
import math
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import DataLoader, TensorDataset

In [37]:
# Load embeddings
def load_embeddings(file_path):
    """
    Load embeddings and labels from the pickle file and extract MatchID from the PeriodID.
    """
    print(f"Loading embeddings from {file_path}...")
    merged_df = pd.read_pickle(file_path)
    if "ID" in merged_df.columns:
        merged_df["MatchID"] = merged_df["ID"].apply(lambda x: x.split("_")[0])
    print(f"Loaded {len(merged_df)} PeriodIDs.")
    return merged_df

train_embeddings_file = "aggregated_embeddings_with_labels.pkl"
test_embeddings_file = "aggregated_embeddings_with_labels_test.pkl"

train_df = load_embeddings(train_embeddings_file)
test_df = load_embeddings(test_embeddings_file)

Loading embeddings from aggregated_embeddings_with_labels.pkl...
Loaded 2137 PeriodIDs.
Loading embeddings from aggregated_embeddings_with_labels_test.pkl...
Loaded 516 PeriodIDs.


In [38]:
def create_game_based_data_loaders(df, batch_size, sequence_length=5, shuffle=True):
    """
    Create DataLoaders grouped by game. Each DataLoader contains sequences of embeddings per game.
    """
    game_loaders = {}
    grouped = df.groupby("MatchID")

    for match_id, group in grouped:
        all_embeddings = np.vstack(group["aggregated_embedding"].values).astype(np.float32)
        all_labels = group["EventType"].values.astype(int)

        sequences = []
        seq_labels = []

        # Create sequences
        for i in range(len(all_embeddings) - sequence_length + 1):
            seq_emb = all_embeddings[i:i+sequence_length]
            seq_label = all_labels[i+sequence_length-1]
            sequences.append(seq_emb)
            seq_labels.append(seq_label)

        if len(sequences) == 0:
            # If a game is too short, skip it
            continue

        sequences = np.array(sequences)
        seq_labels = np.array(seq_labels)

        dataset = TensorDataset(
            torch.tensor(sequences, dtype=torch.float32),
            torch.tensor(seq_labels, dtype=torch.float32)
        )
        loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
        game_loaders[match_id] = loader

    return game_loaders

In [39]:
# Manually split into train/val/test based on MatchID as requested
all_games = train_df["MatchID"].unique()
# Suppose we have at least 16 games as mentioned.
# Set a seed for reproducibility
np.random.seed(42)
np.random.shuffle(all_games)

# Take 3 games for final testing (unseen completely)
final_test_games = all_games[:3]
train_val_games = all_games[3:]

# Out of these 13 games, let's do a simple train/val split (e.g. 20% for validation)
val_ratio = 0.2
val_count = int(len(train_val_games) * val_ratio)
val_games = train_val_games[:val_count]
train_games = train_val_games[val_count:]

train_df_cv = train_df[train_df["MatchID"].isin(train_games)]
val_df_cv = train_df[train_df["MatchID"].isin(val_games)]

# Final test set (held-out before Kaggle submission)
final_test_df = train_df[train_df["MatchID"].isin(final_test_games)]

# Create data loaders
train_game_loaders = create_game_based_data_loaders(train_df_cv, batch_size=32, sequence_length=3)
val_game_loaders = create_game_based_data_loaders(val_df_cv, batch_size=32, sequence_length=3)

In [40]:
# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model=84, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (batch_size, seq_len, d_model)
        x = x + self.pe[:, :x.size(1), :]
        return x

In [41]:
# LSTM Model
class ChronologicalLSTM(nn.Module):
    def __init__(self, input_dim=768, hidden_dim=128, num_layers=2, bidirectional=True, dropout=0.3):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True,
            bidirectional=bidirectional
        )
        
        self.intermediate_dim = hidden_dim * 2 if bidirectional else hidden_dim
        self.fc1 = nn.Linear(self.intermediate_dim, self.intermediate_dim)
        self.activation = nn.ReLU()
        self.classifier = nn.Linear(self.intermediate_dim, 1)

    def forward(self, embeddings):
        _, (hidden, _) = self.lstm(embeddings)
        if self.lstm.bidirectional:
            hidden_state = torch.cat((hidden[-2], hidden[-1]), dim=-1)
        else:
            hidden_state = hidden[-1]
        x = self.fc1(hidden_state)
        x = self.activation(x)
        logits = self.classifier(x)
        return torch.sigmoid(logits).squeeze(-1)

In [42]:
# Transformer Encoder Model
class TransformerEncoder(nn.Module):
    def __init__(self, input_dim=768, num_heads=4, hidden_dim=128, num_layers=2, dropout=0.3):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.projection = nn.Linear(input_dim, hidden_dim)
        self.pos_encoder = PositionalEncoding(d_model=hidden_dim)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim, 
            nhead=num_heads, 
            dim_feedforward=hidden_dim * 4,
            dropout=dropout,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc1 = nn.Linear(hidden_dim, hidden_dim)
        self.activation = nn.ReLU()
        self.classifier = nn.Linear(hidden_dim, 1)

    def forward(self, embeddings):
        x = self.projection(embeddings)
        x = self.pos_encoder(x)
        x = self.transformer(x)
        pooled_output = x.mean(dim=1)
        x = self.fc1(pooled_output)
        x = self.activation(x)
        logits = self.classifier(x)
        return torch.sigmoid(logits).squeeze(-1)

In [43]:
def evaluate_model(model, val_game_loaders, device):
    model.eval()
    val_loss, val_accuracy = 0, 0
    total_val_batches = 0
    criterion = nn.BCELoss()

    with torch.no_grad():
        for match_id, val_loader in val_game_loaders.items():
            for embeddings, labels in val_loader:
                embeddings, labels = embeddings.to(device), labels.to(device)
                outputs = model(embeddings)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                preds = (outputs.cpu().numpy() > 0.5).astype(int)
                val_accuracy += accuracy_score(labels.cpu().numpy(), preds)
                total_val_batches += 1

    avg_val_loss = val_loss / total_val_batches if total_val_batches > 0 else 0
    avg_val_acc = val_accuracy / total_val_batches if total_val_batches > 0 else 0
    return avg_val_loss, avg_val_acc

In [44]:
def train_game_based_model(model, train_loaders, val_loaders, epochs, lr, device, patience=10, weight_decay=1e-5):
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.BCELoss()

    # Early stopping initialization
    best_val_acc = 0.0
    patience_counter = 0

    for epoch in range(epochs):
        model.train()
        train_loss, train_accuracy = 0, 0
        total_train_batches = 0

        print(f"Epoch {epoch + 1}/{epochs}...")
        for match_id, game_loader in train_loaders.items():
            for embeddings, labels in game_loader:
                embeddings, labels = embeddings.to(device), labels.float().to(device)
                optimizer.zero_grad()
                outputs = model(embeddings)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                train_loss += loss.item()
                preds = (outputs.detach().cpu().numpy() > 0.5).astype(int)
                train_accuracy += accuracy_score(labels.cpu().numpy(), preds)
                total_train_batches += 1

        avg_train_loss = train_loss / total_train_batches
        avg_train_acc = train_accuracy / total_train_batches

        # Validation
        avg_val_loss, avg_val_acc = evaluate_model(model, val_loaders, device)

        print(f"Epoch {epoch + 1}: Train Loss {avg_train_loss:.4f}, Train Acc {avg_train_acc:.4f}")
        print(f"Validation Loss: {avg_val_loss:.4f}, Validation Acc {avg_val_acc:.4f}")

        # Early stopping check
        if avg_val_acc > best_val_acc:
            best_val_acc = avg_val_acc
            patience_counter = 0
            # Save best model weights
            best_model_state = model.state_dict()
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

    if 'best_model_state' in locals():
        model.load_state_dict(best_model_state)
    return model

In [45]:
def evaluate_model_comparison(models, val_game_loaders, device):
    results = []
    for model_name, model in models.items():
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for match_id, val_loader in val_game_loaders.items():
                for embeddings, labels in val_loader:
                    embeddings, labels = embeddings.to(device), labels.to(device)
                    outputs = model(embeddings)
                    preds = (outputs > 0.5).int().cpu().numpy()
                    all_preds.extend(preds)
                    all_labels.extend(labels.cpu().numpy())

        accuracy = accuracy_score(all_labels, all_preds)
        precision = precision_score(all_labels, all_preds, zero_division=0)
        recall = recall_score(all_labels, all_preds, zero_division=0)
        f1 = f1_score(all_labels, all_preds, zero_division=0)
        results.append({
            "Model": model_name,
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1-Score": f1
        })

    return pd.DataFrame(results)

In [46]:
# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [47]:
import optuna   # Hyperparameter tuning library

# ===========================
# Hyperparameter Tuning Setup
# ===========================
def objective(trial, train_game_loaders, val_game_loaders, device):
    lr = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    hidden_dim = trial.suggest_int("hidden_dim", 64, 256)
    # Ensure hidden_dim is even
    if hidden_dim % 2 != 0:
        hidden_dim += 1
    model_type = trial.suggest_categorical("model_type", ["LSTM", "Transformer"])

    if model_type == "LSTM":
        model = ChronologicalLSTM(input_dim=768, hidden_dim=hidden_dim, num_layers=1, bidirectional=True, dropout=0.3)
    else:
        num_heads = trial.suggest_categorical("num_heads", [2,4])
        num_layers = trial.suggest_int("num_layers", 1, 3)
        # Ensure hidden_dim is divisible by num_heads if needed
        if hidden_dim % num_heads != 0:
            # Adjust hidden_dim to be divisible by num_heads
            hidden_dim += (num_heads - (hidden_dim % num_heads))
        model = TransformerEncoder(input_dim=768, hidden_dim=hidden_dim, num_heads=num_heads, num_layers=num_layers, dropout=0.3)

    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCELoss()

    epochs = 5
    best_val_acc = 0.0
    for _ in range(epochs):
        model.train()
        for match_id, game_loader in train_game_loaders.items():
            for embeddings, labels in game_loader:
                embeddings, labels = embeddings.to(device), labels.to(device)
                optimizer.zero_grad()
                outputs = model(embeddings)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

        # Validation
        model.eval()
        val_accuracy = 0
        total_val_batches = 0
        with torch.no_grad():
            for match_id, val_loader in val_game_loaders.items():
                for embeddings, labels in val_loader:
                    embeddings, labels = embeddings.to(device), labels.to(device)
                    outputs = model(embeddings)
                    preds = (outputs > 0.5).int().cpu().numpy()
                    val_accuracy += accuracy_score(labels.cpu().numpy(), preds)
                    total_val_batches += 1

        avg_val_acc = val_accuracy / total_val_batches if total_val_batches > 0 else 0
        if avg_val_acc > best_val_acc:
            best_val_acc = avg_val_acc

    return best_val_acc

def run_hyperparameter_tuning(train_game_loaders, val_game_loaders, device, n_trials=20):
    # Call this function after you have train_game_loaders and val_game_loaders defined
    study = optuna.create_study(direction="maximize")
    study.optimize(lambda trial: objective(trial, train_game_loaders, val_game_loaders, device), n_trials=n_trials)
    return study.best_params

# ===========================
# Training on Final Data
# ===========================
def train_on_final_data(model, final_test_loaders, epochs=5, lr=1e-4, device="cpu", weight_decay=1e-5):
    # This function will train the model on the final_test_loaders after evaluating on them,
    # incorporating all data into the model before submission.
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.BCELoss()

    for epoch in range(epochs):
        model.train()
        total_train_loss, total_train_acc = 0, 0
        total_train_batches = 0

        for match_id, loader in final_test_loaders.items():
            for embeddings, labels in loader:
                embeddings, labels = embeddings.to(device), labels.to(device).float()
                optimizer.zero_grad()
                outputs = model(embeddings)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                total_train_loss += loss.item()
                preds = (outputs.detach().cpu().numpy() > 0.5).astype(int)
                total_train_acc += accuracy_score(labels.cpu().numpy(), preds)
                total_train_batches += 1

        avg_train_loss = total_train_loss / total_train_batches if total_train_batches > 0 else 0
        avg_train_acc = total_train_acc / total_train_batches if total_train_batches > 0 else 0
        print(f"Final Data Training Epoch {epoch+1}/{epochs} - Train Loss: {avg_train_loss:.4f}, Train Acc: {avg_train_acc:.4f}")

    return model

In [48]:
# ===========================
# Main Training & Evaluation
# ===========================
print("Training Chronological LSTM...")
lstm_model = ChronologicalLSTM(input_dim=768, hidden_dim=128, num_layers=1, bidirectional=True, dropout=0.3)
trained_lstm_model = train_game_based_model(lstm_model, train_game_loaders, val_game_loaders, epochs=50, lr=1e-4, device=device, patience=5)

print("Training Transformer Encoder...")
transformer_model = TransformerEncoder(input_dim=768, hidden_dim=128, num_heads=4, num_layers=2, dropout=0.3)
trained_transformer_model = train_game_based_model(transformer_model, train_game_loaders, val_game_loaders, epochs=50, lr=1e-4, device=device, patience=5)

comparison_df = evaluate_model_comparison(
    {"Chronological LSTM": trained_lstm_model, "Transformer Encoder": trained_transformer_model},
    val_game_loaders, device
)
print(comparison_df)

Training Chronological LSTM...
Epoch 1/50...
Epoch 1: Train Loss 0.6902, Train Acc 0.5463
Validation Loss: 0.6879, Validation Acc 0.5508
Epoch 2/50...




Epoch 2: Train Loss 0.6871, Train Acc 0.5525
Validation Loss: 0.6875, Validation Acc 0.5508
Epoch 3/50...
Epoch 3: Train Loss 0.6840, Train Acc 0.5524
Validation Loss: 0.6861, Validation Acc 0.5508
Epoch 4/50...
Epoch 4: Train Loss 0.6751, Train Acc 0.5769
Validation Loss: 0.6813, Validation Acc 0.6289
Epoch 5/50...
Epoch 5: Train Loss 0.6616, Train Acc 0.6011
Validation Loss: 0.6751, Validation Acc 0.6055
Epoch 6/50...
Epoch 6: Train Loss 0.6436, Train Acc 0.6310
Validation Loss: 0.6813, Validation Acc 0.5195
Epoch 7/50...
Epoch 7: Train Loss 0.6350, Train Acc 0.6491
Validation Loss: 0.6440, Validation Acc 0.6406
Epoch 8/50...
Epoch 8: Train Loss 0.6320, Train Acc 0.6360
Validation Loss: 0.6437, Validation Acc 0.6133
Epoch 9/50...
Epoch 9: Train Loss 0.6261, Train Acc 0.6318
Validation Loss: 0.6410, Validation Acc 0.6172
Epoch 10/50...
Epoch 10: Train Loss 0.6324, Train Acc 0.6429
Validation Loss: 0.6754, Validation Acc 0.5391
Epoch 11/50...
Epoch 11: Train Loss 0.6111, Train Acc 0.65

In [49]:
# Evaluate on final unseen test set
final_test_loaders = create_game_based_data_loaders(final_test_df, batch_size=32, sequence_length=3, shuffle=False)
final_comparison_df = evaluate_model_comparison(
    {"Chronological LSTM": trained_lstm_model, "Transformer Encoder": trained_transformer_model},
    final_test_loaders, device
)
print("Final Unseen Test Set Results:")
print(final_comparison_df)

Final Unseen Test Set Results:
                 Model  Accuracy  Precision    Recall  F1-Score
0   Chronological LSTM  0.550691   0.677419  0.276316  0.392523
1  Transformer Encoder  0.490783   1.000000  0.030702  0.059574


In [None]:
# =============================================
# Hyperparameter Tuning (Call before final test)
# =============================================
best_params = run_hyperparameter_tuning(train_game_loaders, val_game_loaders, device, n_trials=20)
print("Best Hyperparameters found by Optuna:", best_params)

# After you get best_params, automatically re-initialize the best model based on model_type
model_type = best_params["model_type"]
if model_type == "LSTM":
    best_model = ChronologicalLSTM(
        input_dim=768, 
        hidden_dim=best_params["hidden_dim"], 
        num_layers=1, 
        bidirectional=True, 
        dropout=0.3
    )
else:
    best_model = TransformerEncoder(
        input_dim=768,
        hidden_dim=best_params["hidden_dim"],
        num_heads=best_params["num_heads"],
        num_layers=best_params["num_layers"],
        dropout=0.3
    )

# Re-train the best model on the training sets with best parameters
best_model = train_game_based_model(best_model, train_game_loaders, val_game_loaders, epochs=50, lr=best_params["lr"], device=device, patience=5)

# Evaluate the fine-tuned best model on the final_test_loaders (3 unseen games)
final_test_results = evaluate_model_comparison({"Best Model": best_model}, final_test_loaders, device)
print("Final Unseen Test Set Results with Finetuned Best Model:")

[I 2024-12-07 00:26:49,429] A new study created in memory with name: no-name-b82ab3af-9354-40c1-b930-df0a337dadd3
[I 2024-12-07 00:26:49,922] Trial 0 finished with value: 0.55078125 and parameters: {'lr': 0.00032934274357231303, 'hidden_dim': 69, 'model_type': 'LSTM'}. Best is trial 0 with value: 0.55078125.
[I 2024-12-07 00:26:51,866] Trial 1 finished with value: 0.55078125 and parameters: {'lr': 5.1943597765505735e-05, 'hidden_dim': 146, 'model_type': 'Transformer', 'num_heads': 2, 'num_layers': 3}. Best is trial 0 with value: 0.55078125.
[I 2024-12-07 00:26:52,931] Trial 2 finished with value: 0.55078125 and parameters: {'lr': 0.0004896847728778276, 'hidden_dim': 216, 'model_type': 'Transformer', 'num_heads': 2, 'num_layers': 2}. Best is trial 0 with value: 0.55078125.
[I 2024-12-07 00:26:54,723] Trial 3 finished with value: 0.55078125 and parameters: {'lr': 0.0008215026119212586, 'hidden_dim': 227, 'model_type': 'Transformer', 'num_heads': 2, 'num_layers': 3}. Best is trial 0 with 

Best Hyperparameters found by Optuna: {'lr': 9.483162463996305e-05, 'hidden_dim': 126, 'model_type': 'LSTM'}
Epoch 1/50...
Epoch 1: Train Loss 0.6904, Train Acc 0.5504
Validation Loss: 0.6885, Validation Acc 0.5508
Epoch 2/50...
Epoch 2: Train Loss 0.6874, Train Acc 0.5566
Validation Loss: 0.6872, Validation Acc 0.5508
Epoch 3/50...
Epoch 3: Train Loss 0.6857, Train Acc 0.5525
Validation Loss: 0.6870, Validation Acc 0.5508
Epoch 4/50...
Epoch 4: Train Loss 0.6808, Train Acc 0.5588
Validation Loss: 0.6842, Validation Acc 0.5508
Epoch 5/50...
Epoch 5: Train Loss 0.6737, Train Acc 0.5718
Validation Loss: 0.6799, Validation Acc 0.5586
Epoch 6/50...
Epoch 6: Train Loss 0.6610, Train Acc 0.5879
Validation Loss: 0.6791, Validation Acc 0.5781
Epoch 7/50...
Epoch 7: Train Loss 0.6471, Train Acc 0.6261
Validation Loss: 0.6557, Validation Acc 0.6133
Epoch 8/50...
Epoch 8: Train Loss 0.6426, Train Acc 0.6274
Validation Loss: 0.6525, Validation Acc 0.6094
Epoch 9/50...
Epoch 9: Train Loss 0.6426, T

In [None]:
# Now train the best model on the final_test_loaders to use your full dataset
best_model = train_on_final_data(best_model, final_test_loaders, epochs=5, lr=1e-4, device=device)

In [52]:
def predict_for_kaggle_submission(model, test_df, device):
    model.to(device)
    model.eval()
    predictions = []
    with torch.no_grad():
        grouped = test_df.groupby("MatchID")
        for match_id, group in grouped:
            embeddings = np.vstack(group["aggregated_embedding"].values).astype(np.float32)
            embeddings_tensor = torch.tensor(embeddings, dtype=torch.float32).to(device)
            # One-step predictions
            outputs = model(embeddings_tensor.unsqueeze(1)).squeeze(-1)
            preds = (outputs.cpu().numpy() > 0.5).astype(int)
            group["EventType"] = preds
            predictions.append(group[["ID", "EventType"]])
    return pd.concat(predictions)

submission_df = predict_for_kaggle_submission(trained_lstm_model, test_df, device)
submission_df.to_csv("final_submission.csv", index=False)
print("Submission file saved as final_submission.csv.")

Submission file saved as final_submission.csv.
