In [1]:
import os
import pandas as pd

def load_data_grouped(data_dir):
    """
    Load all CSV files from the given directory, grouself.self.self.self.self.self.ped by game.

    Args:
        data_dir (str): Path to the directory containing the dataset.

    Returns:
        dict: Dictionary with keys `train` and `eval`, each containing a dictionary of DataFrames, 
              where each key is the game name.
    """
    data = {"train": {}, "eval": {}}

    for category in data.keys():
        dir_path = os.path.join(data_dir, f"{category}_tweets")
        for root, _, files in os.walk(dir_path):
            for file in files:
                if file.endswith(".csv") and not any(ext in file for ext in [":Zone.Identifier", "_plots"]):
                    file_path = os.path.join(root, file)
                    game_name = os.path.splitext(file)[0]  # Extract game name from file
                    print(f"Loading {file_path} for game {game_name}...")
                    df = pd.read_csv(file_path)
                    data[category][game_name] = df
    
    return data

# Usage example
data_dir = "./"  # Replace with your dataset root directory
data_grouped = load_data_grouped(data_dir)

print(f"Train Games: {list(data_grouped['train'].keys())}")
print(f"Eval Games: {list(data_grouped['eval'].keys())}")


Loading ./train_tweets/MexicoCroatia37.csv for game MexicoCroatia37...
Loading ./train_tweets/AustraliaSpain34.csv for game AustraliaSpain34...
Loading ./train_tweets/GermanyBrazil74.csv for game GermanyBrazil74...
Loading ./train_tweets/GermanyUSA57.csv for game GermanyUSA57...
Loading ./train_tweets/FranceGermany70.csv for game FranceGermany70...
Loading ./train_tweets/PortugalGhana58.csv for game PortugalGhana58...
Loading ./train_tweets/CameroonBrazil36.csv for game CameroonBrazil36...
Loading ./train_tweets/NetherlandsChile35.csv for game NetherlandsChile35...
Loading ./train_tweets/BelgiumSouthKorea59.csv for game BelgiumSouthKorea59...
Loading ./train_tweets/USASlovenia2010.csv for game USASlovenia2010...
Loading ./train_tweets/AustraliaNetherlands29.csv for game AustraliaNetherlands29...
Loading ./train_tweets/ArgentinaBelgium72.csv for game ArgentinaBelgium72...
Loading ./train_tweets/HondurasSwitzerland54.csv for game HondurasSwitzerland54...
Loading ./train_tweets/FranceNige

In [2]:
data_grouped

{'train': {'MexicoCroatia37':             ID  MatchID  PeriodID  EventType      Timestamp  \
  0         19_0       19         0          0  1403553000000   
  1         19_0       19         0          0  1403553000000   
  2         19_0       19         0          0  1403553000000   
  3         19_0       19         0          0  1403553000000   
  4         19_0       19         0          0  1403553000000   
  ...        ...      ...       ...        ...            ...   
  155544  19_129       19       129          1  1403560800000   
  155545  19_129       19       129          1  1403560800000   
  155546  19_129       19       129          1  1403560800000   
  155547  19_129       19       129          1  1403560800000   
  155548  19_129       19       129          1  1403560800000   
  
                                                      Tweet  
  0       I have thee cutest Croatian following me (@Lar...  
  1       RT @worldsoccershop: If @MarioMandzukic9 score...  
  2

In [3]:
data_grouped['train']["ArgentinaBelgium72"].shape, data_grouped['train']["ArgentinaBelgium72"].head()

((313803, 6),
      ID  MatchID  PeriodID  EventType      Timestamp  \
 0  11_0       11         0          0  1404575400000   
 1  11_0       11         0          0  1404575400000   
 2  11_0       11         0          0  1404575400000   
 3  11_0       11         0          0  1404575400000   
 4  11_0       11         0          0  1404575400000   
 
                                                Tweet  
 0  RT @2014WorIdCup: Argentina vs Belgium\n\nWho ...  
 1  @elijahman_ time to focus on Belgium winning t...  
 2  RT @FIFAWorldCup: GLOBAL STADIUM: #Joinin with...  
 3  RT @CatholicNewsSvc: #PopeFrancis. Uh-oh. Arge...  
 4  RT @soccerdotcom: If he scores vs #BEL we'll a...  )

In [4]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/elio_samaha/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
from utils import preprocess_text  # Use or enhance the existing function

def clean_tweets(df, remove_retweets=True):
    """
    Clean a single DataFrame of tweets.

    Args:
        df (pd.DataFrame): DataFrame containing tweets.
        remove_retweets (bool): Whether to remove retweets.

    Returns:
        pd.DataFrame: Cleaned DataFrame.
    """
    print("Cleaning tweets...")
    if "Tweet" not in df.columns:
        raise ValueError("The DataFrame must contain a 'Tweet' column.")

    # Remove retweets if necessary
    if remove_retweets:
        df = df[~df["Tweet"].str.startswith("RT")]

    # Remove duplicates
    df = df.drop_duplicates(subset=["Tweet"])

    # Apply preprocessing to text
    df["cleaned_text"] = df["Tweet"].apply(preprocess_text)

    # Drop very short tweets if desired
    # df = df[df["cleaned_text"].str.len() > 10]  # Drop tweets with <= 10 characters

    return df

def clean_data_grouped(data_grouped, remove_retweets=True):
    """
    Clean the grouped dataset (grouped by games).

    Args:
        data_grouped (dict): Dictionary of DataFrames grouped by game.
        remove_retweets (bool): Whether to remove retweets.

    Returns:
        dict: Cleaned grouped dataset.
    """
    for category in data_grouped.keys():
        for game, df in data_grouped[category].items():
            print(f"Cleaning data for game: {game}")
            data_grouped[category][game] = clean_tweets(df, remove_retweets=remove_retweets)
    
    return data_grouped

# Usage
data_grouped = clean_data_grouped(data_grouped, remove_retweets=True)

2024-12-05 15:24:51.124896: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-05 15:24:51.349364: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-05 15:24:51.349399: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-05 15:24:51.404707: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-05 15:24:51.484215: I tensorflow/core/platform/cpu_feature_guar

Cleaning data for game: MexicoCroatia37
Cleaning tweets...
Cleaning data for game: AustraliaSpain34
Cleaning tweets...
Cleaning data for game: GermanyBrazil74
Cleaning tweets...
Cleaning data for game: GermanyUSA57
Cleaning tweets...
Cleaning data for game: FranceGermany70
Cleaning tweets...
Cleaning data for game: PortugalGhana58
Cleaning tweets...
Cleaning data for game: CameroonBrazil36
Cleaning tweets...
Cleaning data for game: NetherlandsChile35
Cleaning tweets...
Cleaning data for game: BelgiumSouthKorea59
Cleaning tweets...
Cleaning data for game: USASlovenia2010
Cleaning tweets...
Cleaning data for game: AustraliaNetherlands29
Cleaning tweets...
Cleaning data for game: ArgentinaBelgium72
Cleaning tweets...
Cleaning data for game: HondurasSwitzerland54
Cleaning tweets...
Cleaning data for game: FranceNigeria66
Cleaning tweets...
Cleaning data for game: ArgentinaGermanyFinal77
Cleaning tweets...
Cleaning data for game: GermanyAlgeria67
Cleaning tweets...
Cleaning data for game: N

In [28]:
import pickle
import pandas as pd
import numpy as np

def load_and_split_embeddings(embeddings_path):
    """
    Load precomputed embeddings from a pickle file and split them by game.

    Args:
        embeddings_path (str): Path to the pickle file containing embeddings.

    Returns:
        dict: A dictionary where each key is a MatchID (game) and the value is a DataFrame
              containing embeddings and labels for that game.
    """
    # Load the pickle file into a DataFrame
    print("Loading embeddings from pickle file...")
    merged_df = pd.read_pickle(embeddings_path)

    # Extract the MatchID from the ID column
    merged_df["MatchID"] = merged_df["ID"].apply(lambda x: x.split("_")[0])  # Extract the MatchID
    merged_df.drop("ID", axis=1, inplace=True)
    # merged_df.drop()
    merged_df.reset_index(drop=True, inplace=True)

    # Group by MatchID and create a dictionary of DataFrames
    grouped_data = {match_id: group for match_id, group in merged_df.groupby("MatchID")}

    print(f"Loaded and split data into {len(grouped_data)} games.")
    return grouped_data

# Example Usage
embeddings_path = "aggregated_embeddings_with_labels.pkl"
game_data = load_and_split_embeddings(embeddings_path)

# Example: Access data for a specific game
for match_id, df in game_data.items():
    print(f"Game: {match_id}, Number of Periods: {len(df)}")
    print(df.head())

Loading embeddings from pickle file...
Loaded and split data into 16 games.
Game: 0, Number of Periods: 130
                                aggregated_embedding  EventType MatchID
0  [0.038894046, 0.2056955, 0.1732219, -0.0507800...          0       0
1  [0.049072143, 0.20504853, 0.16913632, -0.04101...          0       0
2  [-0.010199165, 0.19084008, 0.142129, -0.027013...          1       0
3  [0.1076907, 0.18990463, 0.13662358, -0.0059914...          1       0
4  [0.110753596, 0.21662608, 0.14855266, -0.02414...          1       0
Game: 1, Number of Periods: 130
                                   aggregated_embedding  EventType MatchID
1187  [-0.02544716, 0.11435313, 0.106707424, -0.0815...          0       1
1188  [-0.028089345, 0.11905627, 0.11491479, -0.0808...          0       1
1189  [-0.035966076, 0.11795183, 0.12116859, -0.0814...          1       1
1190  [-0.07086158, 0.16810267, 0.1163678, -0.077644...          1       1
1191  [-0.073121004, 0.1698075, 0.11858397, -0.07978.

In [36]:
# game_data["0"].columns, game_data["0"].shape, game_data["0"].head()
# len(game_data["0"]["aggregated_embedding"][0])


---
---
---

In [1]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

In [2]:
def load_embeddings(file_path):
    """
    Load embeddings and labels from the pickle file and extract MatchID from the PeriodID.

    Args:
        file_path (str): Path to the pickle file.

    Returns:
        pd.DataFrame: Dataframe with MatchID, PeriodID, aggregated embeddings, and EventType.
    """
    print(f"Loading embeddings from {file_path}...")
    merged_df = pd.read_pickle(file_path)

    # Extract MatchID from ID (assuming the ID format is 'MatchID_PeriodID')
    if "ID" in merged_df.columns:
        merged_df["MatchID"] = merged_df["ID"].apply(lambda x: x.split("_")[0])

    print(f"Loaded {len(merged_df)} PeriodIDs.")
    return merged_df

# Load training and testing data
train_embeddings_file = "aggregated_embeddings_with_labels.pkl"
test_embeddings_file = "aggregated_embeddings_with_labels_test.pkl"

train_df = load_embeddings(train_embeddings_file)
test_df = load_embeddings(test_embeddings_file)

Loading embeddings from aggregated_embeddings_with_labels.pkl...
Loaded 2137 PeriodIDs.
Loading embeddings from aggregated_embeddings_with_labels_test.pkl...
Loaded 516 PeriodIDs.


In [3]:
train_df

Unnamed: 0,ID,aggregated_embedding,EventType,MatchID
0,0_0,"[0.038894046, 0.2056955, 0.1732219, -0.0507800...",0,0
1,0_1,"[0.049072143, 0.20504853, 0.16913632, -0.04101...",0,0
2,0_10,"[-0.010199165, 0.19084008, 0.142129, -0.027013...",1,0
3,0_100,"[0.1076907, 0.18990463, 0.13662358, -0.0059914...",1,0
4,0_101,"[0.110753596, 0.21662608, 0.14855266, -0.02414...",1,0
...,...,...,...,...
2132,8_95,"[-0.12547426, 0.10333069, 0.012717849, 0.00890...",0,8
2133,8_96,"[-0.13536794, 0.08814793, 0.013134736, -0.0146...",1,8
2134,8_97,"[-0.13311367, 0.09247862, 0.016269028, 0.00072...",1,8
2135,8_98,"[-0.12481897, 0.07730664, 0.014376387, 0.00727...",0,8


In [4]:
def create_game_based_data_loaders(df, batch_size, sequence_length=5):
    game_loaders = {}
    grouped = df.groupby("MatchID")

    for match_id, group in grouped:
        # Convert embeddings to array
        all_embeddings = np.vstack(group["aggregated_embedding"].values).astype(np.float32)  # shape: (num_periods, embedding_dim)
        all_labels = group["EventType"].values  # shape: (num_periods,)

        sequences = []
        seq_labels = []

        # Create sequences
        for i in range(len(all_embeddings) - sequence_length + 1):
            seq_emb = all_embeddings[i:i+sequence_length]  # shape: (seq_len, embedding_dim)
            seq_label = all_labels[i+sequence_length-1]    # Label of the last element in the sequence
            sequences.append(seq_emb)
            seq_labels.append(seq_label)

        sequences = np.array(sequences)        # shape: (num_samples, seq_len, embedding_dim)
        seq_labels = np.array(seq_labels)      # shape: (num_samples,)

        dataset = TensorDataset(
            torch.tensor(sequences, dtype=torch.float32), 
            torch.tensor(seq_labels, dtype=torch.float32)
        )
        loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
        game_loaders[match_id] = loader

    return game_loaders

# Split train data for cross-validation
train_df_cv, val_df_cv = train_test_split(
    train_df, test_size=0.2, random_state=42, stratify=train_df["MatchID"]
)

# Create game-based data loaders
train_game_loaders = create_game_based_data_loaders(train_df_cv, batch_size=32, sequence_length=3)
val_game_loaders = create_game_based_data_loaders(val_df_cv, batch_size=32, sequence_length=3)

In [5]:
import math
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    def __init__(self, d_model=84, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # shape: (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (batch_size, seq_len, d_model)
        x = x + self.pe[:, :x.size(1), :]
        return x

In [6]:
class ChronologicalLSTM(nn.Module):
    def __init__(self, input_dim=84, hidden_dim=126, num_layers=2, bidirectional=True):
        super().__init__()
        self.hidden_dim = hidden_dim # Hidden size of the LSTM
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers=num_layers,
            dropout=0.2,
            batch_first=True,
            bidirectional=bidirectional
        )
        
        # hidden_dim doubled due to bidirectionality
        self.intermediate_dim = hidden_dim * 2

        # Additional feed-forward layer
        self.fc1 = nn.Linear(self.intermediate_dim, self.intermediate_dim)
        self.activation = nn.ReLU()
        self.classifier = nn.Linear(self.intermediate_dim, 1)

    def forward(self, embeddings):
        # embeddings: (batch_size, seq_len, input_dim)
        _, (hidden, _) = self.lstm(embeddings)
        # hidden: (num_layers*2, batch_size, hidden_dim)
        # Extract the last layer's forward and backward hidden states:
        if self.lstm.bidirectional:
            hidden_state = torch.cat((hidden[-2], hidden[-1]), dim=-1)  # Concatenate forward and backward
        else:
            hidden_state = hidden[-1]

        x = self.fc1(hidden_state)
        x = self.activation(x)

        logits = self.classifier(x)  # (batch_size, 1)
        return torch.sigmoid(logits).squeeze(-1)  # (batch_size,)

In [7]:
class TransformerEncoder(nn.Module):
    def __init__(self, input_dim=84, num_heads=2, hidden_dim=84, num_layers=2):
        super().__init__()
        self.embedding = nn.Linear(input_dim, hidden_dim)  # Match input_dim with embedding
        self.projection = nn.Linear(input_dim, hidden_dim)
        self.pos_encoder = PositionalEncoding(d_model=hidden_dim)

        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim, 
            nhead=num_heads, 
            dim_feedforward=hidden_dim * 4,
            dropout=0.2,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)

        self.fc1 = nn.Linear(hidden_dim, hidden_dim)
        self.activation = nn.ReLU()
        self.classifier = nn.Linear(hidden_dim, 1)

    def forward(self, embeddings):
        # embeddings: (batch_size, seq_len, input_dim)
        x = self.projection(embeddings)    # (batch_size, seq_len, hidden_dim)
        x = self.pos_encoder(x)            # Add positional encoding
        x = self.transformer(x)            # (batch_size, seq_len, hidden_dim)

        pooled_output = x.mean(dim=1)      # (batch_size, hidden_dim)
        x = self.fc1(pooled_output)
        x = self.activation(x)
        
        logits = self.classifier(x)        # (batch_size, 1)
        return torch.sigmoid(logits).squeeze(-1)  # (batch_size,)

In [8]:
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score

def train_game_based_model(model, game_loaders, val_loader, epochs, lr, device):
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCELoss()

    # Calculate total number of training steps
    total_batches = sum(len(loader) for loader in game_loaders.values())
    total_steps = epochs * total_batches
    warmup_steps = int(0.1 * total_steps)

    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=warmup_steps, 
        num_training_steps=total_steps
    )

    best_val_acc = 0.0

    for epoch in range(epochs):
        model.train()
        train_loss, train_accuracy = 0, 0
        total_train_batches = 0

        print(f"Epoch {epoch + 1}/{epochs}...")
        for match_id, game_loader in game_loaders.items():
            for embeddings, labels in game_loader:
                embeddings, labels = embeddings.to(device), labels.float().to(device)
                optimizer.zero_grad()
                outputs = model(embeddings)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                scheduler.step()

                train_loss += loss.item()
                preds = (outputs.detach().cpu().numpy() > 0.5).astype(int)
                train_accuracy += accuracy_score(labels.cpu().numpy(), preds)
                total_train_batches += 1

        # Validation
        model.eval()
        val_loss, val_accuracy = 0, 0
        total_val_batches = 0
        with torch.no_grad():
            for match_id, val_game_loader in val_loader.items():
                for embeddings, labels in val_game_loader:
                    embeddings, labels = embeddings.to(device), labels.float().to(device)
                    outputs = model(embeddings)
                    loss = criterion(outputs, labels)
                    val_loss += loss.item()
                    preds = (outputs.cpu().numpy() > 0.5).astype(int)
                    val_accuracy += accuracy_score(labels.cpu().numpy(), preds)
                    total_val_batches += 1

        avg_train_loss = train_loss / total_train_batches
        avg_train_acc = train_accuracy / total_train_batches
        avg_val_loss = val_loss / total_val_batches
        avg_val_acc = val_accuracy / total_val_batches

        print(f"Epoch {epoch + 1}: Train Loss {avg_train_loss:.4f}, Train Acc {avg_train_acc:.4f}")
        print(f"Validation Loss: {avg_val_loss:.4f}, Validation Acc {avg_val_acc:.4f}")

        # Track best validation accuracy
        if avg_val_acc > best_val_acc:
            best_val_acc = avg_val_acc
            # Optionally save the model if you wish
            # torch.save(model.state_dict(), "best_model.pt")

    return model

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
def evaluate_model_comparison(models, val_game_loaders, device):
    """
    Evaluate and compare multiple models on the validation set.

    Args:
        models (dict): Dictionary where keys are model names and values are trained models.
        val_game_loaders (dict): Validation loaders grouped by game.
        device: Computation device (CPU/GPU).

    Returns:
        pd.DataFrame: DataFrame containing metrics for each model.
    """
    results = []

    for model_name, model in models.items():
        print(f"Evaluating {model_name}...")
        model.to(device)
        model.eval()

        all_preds, all_labels = [], []
        with torch.no_grad():
            for match_id, val_loader in val_game_loaders.items():
                for embeddings, labels in val_loader:
                    embeddings, labels = embeddings.to(device), labels.float().to(device)  # Ensure labels are float
                    outputs = model(embeddings).squeeze(-1)  # Ensure shape matches (batch_size,)
                    preds = (outputs > 0.5).int()

                    all_preds.extend(preds.cpu().numpy())
                    all_labels.extend(labels.cpu().numpy())

        # Compute metrics
        accuracy = accuracy_score(all_labels, all_preds)
        precision = precision_score(all_labels, all_preds, zero_division=0)
        recall = recall_score(all_labels, all_preds, zero_division=0)
        f1 = f1_score(all_labels, all_preds, zero_division=0)

        results.append({
            "Model": model_name,
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1-Score": f1
        })

    return pd.DataFrame(results)

In [10]:
# Train Models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lstm_model = ChronologicalLSTM(input_dim=768, hidden_dim=128)
transformer_model = TransformerEncoder(input_dim=768, hidden_dim=128, num_heads=4, num_layers=2)

print("Training Chronological LSTM...")
trained_lstm_model = train_game_based_model(lstm_model, train_game_loaders, val_game_loaders, epochs=80, lr=1e-4, device=device)

print("Training Transformer Encoder...")
trained_transformer_model = train_game_based_model(transformer_model, train_game_loaders, val_game_loaders, epochs=80, lr=1e-4, device=device)

# Compare Models
comparison_df = evaluate_model_comparison(
    {"Chronological LSTM": trained_lstm_model, "Transformer Encoder": trained_transformer_model},
    val_game_loaders, device
)

print(comparison_df)

Training Chronological LSTM...
Epoch 1/80...
Epoch 1: Train Loss 0.6940, Train Acc 0.4771
Validation Loss: 0.6952, Validation Acc 0.4216
Epoch 2/80...
Epoch 2: Train Loss 0.6935, Train Acc 0.4858
Validation Loss: 0.6934, Validation Acc 0.4265
Epoch 3/80...
Epoch 3: Train Loss 0.6926, Train Acc 0.5153
Validation Loss: 0.6910, Validation Acc 0.5509
Epoch 4/80...
Epoch 4: Train Loss 0.6923, Train Acc 0.5235
Validation Loss: 0.6888, Validation Acc 0.5784
Epoch 5/80...
Epoch 5: Train Loss 0.6914, Train Acc 0.5328
Validation Loss: 0.6891, Validation Acc 0.5509
Epoch 6/80...
Epoch 6: Train Loss 0.6882, Train Acc 0.5489
Validation Loss: 0.6824, Validation Acc 0.5784
Epoch 7/80...
Epoch 7: Train Loss 0.6911, Train Acc 0.5363
Validation Loss: 0.6876, Validation Acc 0.5509
Epoch 8/80...
Epoch 8: Train Loss 0.6905, Train Acc 0.5276
Validation Loss: 0.6877, Validation Acc 0.6085
Epoch 9/80...
Epoch 9: Train Loss 0.6905, Train Acc 0.5379
Validation Loss: 0.6910, Validation Acc 0.5376
Epoch 10/80...


In [16]:
import optuna

def objective(trial, train_game_loaders, val_game_loaders, device):
    """
    Objective function for hyperparameter tuning using Optuna.

    Args:
        trial (optuna.trial.Trial): A single trial for Optuna optimization.
        train_game_loaders (dict): Game-based training DataLoaders.
        val_game_loaders (dict): Game-based validation DataLoaders.
        device: Computation device (CPU or GPU).

    Returns:
        float: Validation accuracy.
    """
    # Suggest hyperparameters
    input_dim = 768
    lr = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    model_type = trial.suggest_categorical("model_type", ["LSTM", "Transformer"])

    if model_type == "LSTM":
        hidden_dim = trial.suggest_int("hidden_dim", 64, 256)
        model = ChronologicalLSTM(input_dim=input_dim, hidden_dim=hidden_dim)
    else:
        possible_heads = [h for h in [2, 4, 8] if input_dim % h == 0]  # Restrict to valid num_heads
        num_heads = trial.suggest_categorical("num_heads", possible_heads)
        hidden_dim = trial.suggest_int("hidden_dim", 64, 256)
        if hidden_dim % num_heads != 0:
            hidden_dim -= hidden_dim % num_heads  # Adjust hidden_dim to be divisible by num_heads
        num_layers = trial.suggest_int("num_layers", 1, 4)
        model = TransformerEncoder(input_dim=input_dim, hidden_dim=hidden_dim, num_heads=num_heads, num_layers=num_layers)

    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCELoss()

    # Train the model
    model.train()
    for epoch in range(10):  # For faster tuning, use fewer epochs
        train_loss = 0
        for match_id, game_loader in train_game_loaders.items():
            for embeddings, labels in game_loader:
                embeddings, labels = embeddings.to(device), labels.to(device)
                optimizer.zero_grad()
                outputs = model(embeddings).squeeze()
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                train_loss += loss.item()

    # Evaluate on validation set
    model.eval()
    val_accuracy = 0
    total_batches = 0
    with torch.no_grad():
        for match_id, val_loader in val_game_loaders.items():
            for embeddings, labels in val_loader:
                embeddings, labels = embeddings.to(device), labels.to(device)
                outputs = model(embeddings).squeeze()
                val_accuracy += accuracy_score(labels.cpu().numpy(), (outputs.cpu().numpy() > 0.5))
                total_batches += 1

    return val_accuracy / total_batches

# Run Optuna optimization
study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: objective(trial, train_game_loaders, val_game_loaders, device), n_trials=50)

# Print the best hyperparameters
print("Best hyperparameters:", study.best_params)

[I 2024-12-05 21:51:28,472] A new study created in memory with name: no-name-7eddd007-b1ea-4b9a-9661-a23ca7afb360
[I 2024-12-05 21:51:34,132] Trial 0 finished with value: 0.5784313725490196 and parameters: {'lr': 0.00017893271132792836, 'model_type': 'Transformer', 'num_heads': 4, 'hidden_dim': 126, 'num_layers': 3}. Best is trial 0 with value: 0.5784313725490196.
[I 2024-12-05 21:51:43,271] Trial 1 finished with value: 0.5508578431372548 and parameters: {'lr': 0.0002211757147703584, 'model_type': 'Transformer', 'num_heads': 8, 'hidden_dim': 178, 'num_layers': 4}. Best is trial 0 with value: 0.5784313725490196.
[I 2024-12-05 21:51:47,315] Trial 2 finished with value: 0.5784313725490196 and parameters: {'lr': 6.475451457387286e-05, 'model_type': 'Transformer', 'num_heads': 8, 'hidden_dim': 194, 'num_layers': 2}. Best is trial 0 with value: 0.5784313725490196.
[I 2024-12-05 21:51:50,843] Trial 3 finished with value: 0.5430964052287582 and parameters: {'lr': 6.603878395923621e-05, 'model_

Best hyperparameters: {'lr': 1.7738788851617893e-05, 'model_type': 'Transformer', 'num_heads': 2, 'hidden_dim': 84, 'num_layers': 2}


In [18]:
best_params = study.best_params
input_dim = 768
hidden_dim = best_params["hidden_dim"]
lr = best_params["lr"]

if best_params["model_type"] == "LSTM":
    best_model = ChronologicalLSTM(input_dim=input_dim, hidden_dim=hidden_dim)
    print(f"Best LSTM model with hidden_dim={hidden_dim}")
else:
    print(f"Best Transformer model with hidden_dim={hidden_dim}, num_heads={best_params['num_heads']}, num_layers={best_params['num_layers']}")
    best_model = TransformerEncoder(
        input_dim=input_dim, 
        hidden_dim=hidden_dim, 
        num_heads=best_params.get("num_heads", 4), 
        num_layers=best_params.get("num_layers", 2)
    )

# Train the best model
trained_best_model = train_game_based_model(best_model, train_game_loaders, val_game_loaders, epochs=60, lr=lr, device=device)


Best Transformer model with hidden_dim=84, num_heads=2, num_layers=2
Epoch 1/60...
Epoch 1: Train Loss 0.7228, Train Acc 0.4464
Validation Loss: 0.7270, Validation Acc 0.4216
Epoch 2/60...
Epoch 2: Train Loss 0.7045, Train Acc 0.4720
Validation Loss: 0.7083, Validation Acc 0.4216
Epoch 3/60...
Epoch 3: Train Loss 0.6958, Train Acc 0.4796
Validation Loss: 0.6939, Validation Acc 0.4957
Epoch 4/60...
Epoch 4: Train Loss 0.6924, Train Acc 0.5248
Validation Loss: 0.6900, Validation Acc 0.5509
Epoch 5/60...
Epoch 5: Train Loss 0.6913, Train Acc 0.5329
Validation Loss: 0.6887, Validation Acc 0.5509
Epoch 6/60...
Epoch 6: Train Loss 0.6927, Train Acc 0.5239
Validation Loss: 0.6892, Validation Acc 0.5509
Epoch 7/60...
Epoch 7: Train Loss 0.6910, Train Acc 0.5330
Validation Loss: 0.6856, Validation Acc 0.5784
Epoch 8/60...
Epoch 8: Train Loss 0.6909, Train Acc 0.5355
Validation Loss: 0.6880, Validation Acc 0.5509
Epoch 9/60...
Epoch 9: Train Loss 0.6920, Train Acc 0.5178
Validation Loss: 0.6895,

In [19]:
# Include fine-tuned model in comparison
comparison_df = evaluate_model_comparison(
    {
        "Chronological LSTM": trained_lstm_model,
        "Transformer Encoder": trained_transformer_model,
        "Fine-Tuned Model": trained_best_model
    },
    val_game_loaders, device
)

print(comparison_df)


Evaluating Chronological LSTM...
Evaluating Transformer Encoder...
Evaluating Fine-Tuned Model...
                 Model  Accuracy  Precision    Recall  F1-Score
0   Chronological LSTM  0.719697   0.789216  0.703057  0.743649
1  Transformer Encoder  0.714646   0.798969  0.676856  0.732861
2     Fine-Tuned Model  0.578283   0.623016  0.685590  0.652807


In [23]:
# Predict on the test set
def predict_for_kaggle_submission(model, test_df, device):
    model.to(device)
    model.eval()
    predictions = []

    with torch.no_grad():
        grouped = test_df.groupby("MatchID")
        for match_id, group in grouped:
            embeddings = np.vstack(group["aggregated_embedding"].values)
            embeddings_tensor = torch.tensor(embeddings, dtype=torch.float32).to(device)
            
            # Check for mismatched input size
            assert embeddings_tensor.shape[1] == model.hidden_dim, (
                f"Input size mismatch: {embeddings_tensor.shape[1]} vs model hidden_dim {model.hidden_dim}"
            )
            
            outputs = model(embeddings_tensor).squeeze(-1)  # Add `-1` to handle trailing dimensions
            preds = (outputs.cpu().numpy() > 0.5).astype(int)
            group["EventType"] = preds
            predictions.append(group[["ID", "EventType"]])

    return pd.concat(predictions)


# Generate predictions for submission
# submission_df = predict_for_kaggle_submission(trained_lstm_model, test_df, device)
submission_df = predict_for_kaggle_submission(trained_best_model, test_df, device)
submission_df.to_csv("submission.csv", index=False)
print("Submission file saved as submission.csv.")


AttributeError: 'TransformerEncoder' object has no attribute 'hidden_dim'

In [61]:
# Predict on the test set
def predict_for_kaggle_submission(model, test_df, device):
    model.to(device)
    model.eval()
    predictions = []

    with torch.no_grad():
        grouped = test_df.groupby("MatchID")
        for match_id, group in grouped:
            embeddings = np.vstack(group["aggregated_embedding"].values)
            embeddings_tensor = torch.tensor(embeddings, dtype=torch.float32).to(device)
            
            # Check for mismatched input size
            assert embeddings_tensor.shape[1] == model.hidden_dim, (
                f"Input size mismatch: {embeddings_tensor.shape[1]} vs model hidden_dim {model.hidden_dim}"
            )
            
            outputs = model(embeddings_tensor).squeeze(-1)  # Add `-1` to handle trailing dimensions
            preds = (outputs.cpu().numpy() > 0.5).astype(int)
            group["EventType"] = preds
            predictions.append(group[["ID", "EventType"]])

    return pd.concat(predictions)

# Generate predictions for submission
submission_df = predict_for_kaggle_submission(trained_lstm_model, test_df, device)
submission_df.to_csv("submission2.csv", index=False)
print("Submission file saved as submission2.csv.")


AttributeError: 'ChronologicalLSTM' object has no attribute 'hidden_dim'

In [None]:
# Predict on the test set
def predict_for_kaggle_submission(model, test_df, device):
    model.to(device)
    model.eval()
    predictions = []

    with torch.no_grad():
        grouped = test_df.groupby("MatchID")
        for match_id, group in grouped:
            embeddings = np.vstack(group["aggregated_embedding"].values)
            embeddings_tensor = torch.tensor(embeddings, dtype=torch.float32).to(device)
            
            # Check for mismatched input size
            assert embeddings_tensor.shape[1] == model.hidden_dim, (
                f"Input size mismatch: {embeddings_tensor.shape[1]} vs model hidden_dim {model.hidden_dim}"
            )
            
            outputs = model(embeddings_tensor).squeeze(-1)  # Add `-1` to handle trailing dimensions
            preds = (outputs.cpu().numpy() > 0.5).astype(int)
            group["EventType"] = preds
            predictions.append(group[["ID", "EventType"]])

    return pd.concat(predictions)

# Generate predictions for submission
submission_df = predict_for_kaggle_submission(trained_transformer_model, test_df, device)
submission_df.to_csv("submission3.csv", index=False)
print("Submission file saved as submission3.csv.")
