### Model for 2 players only.

In [115]:
# 1. Start by loading all the csv files of 2 players in a dataframe
import pandas as pd
from pathlib import Path

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Path structure in Google Drive
games_folder = Path("/content/drive/MyDrive/Colab Notebooks/DL2_model/2_games")

print("Checking folder:", games_folder)

#Load the Files

if not games_folder.exists():
    print("Error: Folder does not exist.")
else:
    csv_files = sorted(games_folder.glob("*.csv"))
    print(f"Found {len(csv_files)} CSV files")

    if len(csv_files) == 0:
        print("No csv files found in folder")
    else:
        dfs = [pd.read_csv(file) for file in csv_files]
        combined_df = pd.concat(dfs, ignore_index=True)

        print("Combined DataFrame shape:", combined_df.shape)
        display(combined_df.head())


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Checking folder: /content/drive/MyDrive/Colab Notebooks/DL2_model/2_games
Found 3500 CSV files
Combined DataFrame shape: (203568, 403)


Unnamed: 0,game_id,num_players,turn_number,current_player,gems_board_white,gems_board_blue,gems_board_green,gems_board_red,gems_board_black,gems_board_gold,...,gem_take2_green,gem_take2_red,gem_take2_black,noble_selection,gems_removed_white,gems_removed_blue,gems_removed_green,gems_removed_red,gems_removed_black,gems_removed_gold
0,1,2,1,0,4,4,4,4,4,5,...,0.0,2.0,0.0,-1,0,0,0,0,0,0
1,1,2,1,1,4,4,4,2,4,5,...,,,,-1,0,0,0,0,0,0
2,1,2,2,0,3,3,4,2,3,5,...,,,,-1,0,0,0,0,0,0
3,1,2,2,1,2,2,3,2,3,5,...,,,,-1,0,0,0,0,0,0
4,1,2,3,0,1,1,2,2,3,5,...,,,,-1,0,0,0,0,0,0


### 1. Feature engineering

In Splendor, optimal decision-making requires understanding both the current game state and strategic relationships between resources cards, and opponents. The 285 engineered features capture three key dimensions: (1) observable game state (gems, visible cards, nobles), (2) player capabilities (resources, reductions, victory points), and (3) derived strategic signals (card affordability, proximity to nobles, relative advantages). By pre-computing these relationships and normalizing all values to [0,1], we simplify the learning task for our MLP baseline, allowing it to focus on pattern recognition rather than complex mathematical reasoning. This approach encodes domain expertise directly into the feature space.

In [116]:
# Dictionary to store all normalized features
normalized_features = {}

#Copy of the combined dataframe
df = combined_df.copy()
features_df = pd.DataFrame()

In [117]:
# 1. GLOBAL features for capturing temporal context and available resources on the board ==> 10 features

# 1.1 Turn number for temporal context (early/mid/late game) ==> 1 feature
normalized_features['turn_number'] = df["turn_number"] / df['turn_number'].max()

# 1.2. Gems on board for resource availability ==> 6 features
for color in ['white', 'blue', 'green', 'red', 'black', 'gold']:
    normalized_features[f'gems_board_{color}'] = df[f'gems_board_{color}'] / df[f'gems_board_{color}'].max()

# 1.3. Normalize for remaining cards on deck
# Helps to learn about the progress of the game and time the strategies ==> 3 features
normalized_features['deck_level1_remaining'] = df['deck_level1_remaining'] / df['deck_level1_remaining'].max()
normalized_features['deck_level2_remaining'] = df['deck_level2_remaining'] / df['deck_level2_remaining'].max()
normalized_features['deck_level3_remaining'] = df['deck_level3_remaining'] / df['deck_level3_remaining'].max()

In [118]:
# 2. VISIBLE CARDS ==> 12 cards x 7 features (each color) = 84 features
# Cards on the board represent immediate purchasing opportunities

# 2.1. Normalize for victory points and cost for each color
# The maximum for vp across all card is 5
# There are 12 cards in total

for i in range(12):
  normalized_features[f'card{i}_vp'] = df[f'card{i}_vp'] / 5.0
  normalized_features[f'card{i}_level'] = df[f'card{i}_level'] / 3.0 # Level (ordinal: 1, 2, 3 → represents difficulty/tier)
  for color in ['white', 'blue', 'green', 'red', 'black']:
    normalized_features[f'card{i}_cost_{color}'] = df[f'card{i}_cost_{color}'] / df[f'card{i}_cost_{color}'].max()

In [119]:
#3. NOBLES ==> 5 nobles x 6 features = 30 features
# Represent long-term strategic objectives
for i in range(5):
    # VP (always 3, but normalized for consistency)
    normalized_features[f'noble{i}_vp'] = df[f'noble{i}_vp'] / 3.0

    # Requirements (reduction bonuses needed)
    for color in ['white', 'blue', 'green', 'red', 'black']:
        normalized_features[f'noble{i}_req_{color}'] = df[f'noble{i}_req_{color}'] / df[f'noble{i}_req_{color}'].max()

In [120]:
# 4. PLAYER STATES (2 players × 42 features = 84 features)
# Captures resources, reductions, VP, and reserved cards for both players
for player_idx in range(2):
    # 4.1. Gems (immediate purchasing power)
    for color in ['white', 'blue', 'green', 'red', 'black', 'gold']:
        normalized_features[f'player{player_idx}_gems_{color}'] = df[f'player{player_idx}_gems_{color}'] / df[f'player{player_idx}_gems_{color}'].max()

    # 4.2. Permanent reductions (the "engine" - permanent discounts)
    for color in ['white', 'blue', 'green', 'red', 'black']:
        normalized_features[f'player{player_idx}_reduction_{color}'] = df[f'player{player_idx}_reduction_{color}'] / df[f'player{player_idx}_reduction_{color}'].max()

    # 4.3. Victory points (goal is 15)
    normalized_features[f'player{player_idx}_vp'] = df[f'player{player_idx}_vp'] / df[f'player{player_idx}_vp'].max()

    # 4.4. Position (0 or 1 - already binary)
    normalized_features[f'player{player_idx}_position'] = df[f'player{player_idx}_position']

    # 4.5. Reserved cards (hidden strategic advantage - max 3 per player)
    for reserve_idx in range(3):
        normalized_features[f'player{player_idx}_reserved{reserve_idx}_vp'] = df[f'player{player_idx}_reserved{reserve_idx}_vp'] / df[f'player{player_idx}_reserved{reserve_idx}_vp'].max()
        normalized_features[f'player{player_idx}_reserved{reserve_idx}_level'] = df[f'player{player_idx}_reserved{reserve_idx}_level'] / 3.0

        for color in ['white', 'blue', 'green', 'red', 'black']:
            normalized_features[f'player{player_idx}_reserved{reserve_idx}_cost_{color}'] = df[f'player{player_idx}_reserved{reserve_idx}_cost_{color}'] / df[f'player{player_idx}_reserved{reserve_idx}_cost_{color}'].max()

In [121]:
# 5. DERIVED STRATEGIC FEATURES
# Pre-computed relationships that help the MLP understand strategic situations
import numpy as np

#5.1 Affordability
# Question: "Can the active player buy this card RIGHT NOW with current resources?"
# Logic: For each card, check if (gems + reductions + gold) >= card cost

# extract current_player for fast indexing
current_player = df['current_player'].values

# Loop through each of the 12 visible cards
for card_idx in range(12):

    #1.Calculate total gold needed for PLAYER 0 to buy this card
    total_gold_p0 = np.zeros(len(df))  # Start with 0 gold needed per row

    for color in ['white', 'blue', 'green', 'red', 'black']:
        # Get card cost for this color (same for all rows)
        cost = df[f'card{card_idx}_cost_{color}'].values

        # Calculate what player 0 has available = gems + permanent reductions
        available_p0 = (df[f'player0_gems_{color}'].values +
                       df[f'player0_reduction_{color}'].values)

        # Calculate : how much is missing? (0 if already enough)
        shortfall_p0 = np.maximum(0, cost - available_p0)

        # Accumulate shortfall across all 5 colors
        total_gold_p0 += shortfall_p0

    # Check if player 0 has enough gold to cover all shortfalls
    can_afford_p0 = (total_gold_p0 <= df['player0_gems_gold'].values).astype(float)

    #Same logic for the other player

    # 2. Calculate total gold needed for PLAYER 1 to buy this card
    total_gold_p1 = np.zeros(len(df))  # Start with 0 gold needed per row

    for color in ['white', 'blue', 'green', 'red', 'black']:
        # Get card cost for this color
        cost = df[f'card{card_idx}_cost_{color}'].values

        # Calculate what player 1 has available
        available_p1 = (df[f'player1_gems_{color}'].values +
                       df[f'player1_reduction_{color}'].values)

        # Calculate shortfall for player 1
        shortfall_p1 = np.maximum(0, cost - available_p1)

        # Accumulate shortfall across all 5 colors
        total_gold_p1 += shortfall_p1

    # Check if player 1 has enough gold to cover all shortfalls
    can_afford_p1 = (total_gold_p1 <= df['player1_gems_gold'].values).astype(float)


    # 3. Select the right affordability based on whose turn it is
    # For each row: if current_player=0, use can_afford_p0, else use can_afford_p1
    normalized_features[f'can_afford_card{card_idx}'] = np.where(
        current_player == 0,  # Condition: is it player 0's turn?
        can_afford_p0,        # If yes, use player 0's affordability
        can_afford_p1         # If no, use player 1's affordability
    )

In [122]:
# 5.2 Distances to Nobles (25 features: 5 nobles × 5 colors)
# How many more reductions needed to attract each noble?

current_player = df['current_player'].values

# Loop through each of the 5 nobles
for noble_idx in range(5):

    # Loop through each color requirement
    for color in ['white', 'blue', 'green', 'red', 'black']:

        # 1.Get noble requirement for this color (same across all rows)
        required = df[f'noble{noble_idx}_req_{color}'].values

        # 2.Get reductions owned by each player
        owned_p0 = df[f'player0_reduction_{color}'].values
        owned_p1 = df[f'player1_reduction_{color}'].values

        # 3.Calculate distance for each player
        # Distance = how many more reductions needed (0 if already satisfied)
        distance_p0 = np.maximum(0, required - owned_p0)
        distance_p1 = np.maximum(0, required - owned_p1)

        # 4.Select based on whose turn it is
        # For each row: if current_player=0, use distance_p0, else use distance_p1
        distance_active = np.where(
            current_player == 0,  # Condition: is it player 0's turn?
            distance_p0,          # If yes, use player 0's distance
            distance_p1           # If no, use player 1's distance
        )

        # 5.Normalize by max requirement (4)
        normalized_features[f'distance_noble{noble_idx}_{color}'] = distance_active / 4.0

In [123]:
# 5.3 Relative advantages
# Compare active player vs opponent on some key metrics
# Relative VP

# Pre-extract current_player as numpy array for fast indexing
current_player = df['current_player'].values

# --- F3.1: RELATIVE VICTORY POINTS ---
# Who is winning? Positive = active player ahead, Negative = opponent ahead
vp_p0 = df['player0_vp'].values
vp_p1 = df['player1_vp'].values

# Calculate difference: active player VP - opponent VP
relative_vp = np.where(
    current_player == 0,     # If player 0's turn
    vp_p0 - vp_p1,          # Player 0 - Player 1
    vp_p1 - vp_p0           # Player 1 - Player 0
)
normalized_features['relative_vp'] = relative_vp / 15.0  # Normalize by winning VP

In [124]:
# 5.4 RELATIVE GEMS (5 colors)
# Who has more gems of each color?
for color in ['white', 'blue', 'green', 'red', 'black']:
    gems_p0 = df[f'player0_gems_{color}'].values
    gems_p1 = df[f'player1_gems_{color}'].values

    # Calculate difference: active player gems - opponent gems
    relative_gems = np.where(
        current_player == 0,     # If player 0's turn
        gems_p0 - gems_p1,      # Player 0 - Player 1
        gems_p1 - gems_p0       # Player 1 - Player 0
    )
    normalized_features[f'relative_gems_{color}'] = relative_gems / relative_gems.max() # Normalize by max gems


In [125]:
# 5.5 RELATIVE REDUCTIONS (5 colors) ---
# Who has a stronger "engine" (more permanent bonuses)?
for color in ['white', 'blue', 'green', 'red', 'black']:
    reduction_p0 = df[f'player0_reduction_{color}'].values
    reduction_p1 = df[f'player1_reduction_{color}'].values

    # Calculate difference: active player reductions - opponent reductions
    relative_reductions = np.where(
        current_player == 0,     # If player 0's turn
        reduction_p0 - reduction_p1,  # Player 0 - Player 1
        reduction_p1 - reduction_p0   # Player 1 - Player 0
    )
    normalized_features[f'relative_reduction_{color}'] = relative_reductions / relative_reductions.max() # Normalize by typical max


In [126]:
# 5.6 GEM DIVERSITY
# How many different gem colors does the active player have?
# Get gems for each player and color
gems_p0_white = df['player0_gems_white'].values
gems_p0_blue = df['player0_gems_blue'].values
gems_p0_green = df['player0_gems_green'].values
gems_p0_red = df['player0_gems_red'].values
gems_p0_black = df['player0_gems_black'].values

gems_p1_white = df['player1_gems_white'].values
gems_p1_blue = df['player1_gems_blue'].values
gems_p1_green = df['player1_gems_green'].values
gems_p1_red = df['player1_gems_red'].values
gems_p1_black = df['player1_gems_black'].values

# Count how many colors each player has (binary: has gems or not)
diversity_p0 = ((gems_p0_white > 0).astype(int) +
                (gems_p0_blue > 0).astype(int) +
                (gems_p0_green > 0).astype(int) +
                (gems_p0_red > 0).astype(int) +
                (gems_p0_black > 0).astype(int))

diversity_p1 = ((gems_p1_white > 0).astype(int) +
                (gems_p1_blue > 0).astype(int) +
                (gems_p1_green > 0).astype(int) +
                (gems_p1_red > 0).astype(int) +
                (gems_p1_black > 0).astype(int))

# Select based on current player and normalize by max (5 colors)
gem_diversity = np.where(
    current_player == 0,
    diversity_p0,
    diversity_p1
) / 5.0

normalized_features['gem_diversity'] = gem_diversity

In [127]:
# 5.7 Total gems
# How many total gems does the active player have? (approaching 10 = must buy/reserve soon)
total_gems_p0 = (gems_p0_white + gems_p0_blue + gems_p0_green +
                 gems_p0_red + gems_p0_black + df['player0_gems_gold'].values)

total_gems_p1 = (gems_p1_white + gems_p1_blue + gems_p1_green +
                 gems_p1_red + gems_p1_black + df['player1_gems_gold'].values)

# Select based on current player and normalize by max (10 gems)
total_gems = np.where(
    current_player == 0,
    total_gems_p0,
    total_gems_p1
) / 10.0

normalized_features['total_gems'] = total_gems

In [128]:
# 5.8 Total reduction
# How powerful is the active player (more reductions = easier to buy cards)
total_reductions_p0 = (df['player0_reduction_white'].values +
                       df['player0_reduction_blue'].values +
                       df['player0_reduction_green'].values +
                       df['player0_reduction_red'].values +
                       df['player0_reduction_black'].values)

total_reductions_p1 = (df['player1_reduction_white'].values +
                       df['player1_reduction_blue'].values +
                       df['player1_reduction_green'].values +
                       df['player1_reduction_red'].values +
                       df['player1_reduction_black'].values)

# Select based on current player and normalize by theoretical max of 35
total_reductions = np.where(
    current_player == 0,
    total_reductions_p0,
    total_reductions_p1
) / 24.0

normalized_features['total_reductions'] = total_reductions

In [129]:
#6. Concatenate all features to avoid fragmentation of dataframe
# From dictionary to dataframe

features_df = pd.DataFrame(normalized_features)

In [130]:
# Remove redundant player0_position and player1_position from the dataframe
cols_to_remove = ["player0_position", "player1_position"]
features_df = features_df.drop(columns=cols_to_remove)

In [131]:
# Download first 57 rows as CSV to make some manual spotchecks
# features_df.head(57).to_csv('first_57_rows2.csv', index=False)

# from google.colab import files
# files.download('first_57_rows2.csv')

#### 2. Data cleaning

In [132]:
# 2.1 Delete the empty columns in the features_df
features_df = features_df.dropna(axis=1)

In [133]:
# 2.2 Find the final number of features
nb_features = features_df.columns.size
print("Number of features:", nb_features)

Number of features: 144


In [134]:
# 2.3 Define the Target Variable (Y): Extract the action the current player took (the label your model will try to predict) from the original combined DataFrame.
# Y is action_type in the df
Y = df["action_type"]

In [135]:
# 2.4 Print shape of target column and feature dataframe
rows, cols = features_df.shape

print("Number of rows in feature dataframe:", rows)
print("Number of columns in feature dataframe:", cols)
print('\n')
print("Number of rows in the target column:", Y.size)

Number of rows in feature dataframe: 203568
Number of columns in feature dataframe: 144


Number of rows in the target column: 203568


In [136]:
# 2.5 Combine the target column with the feature dataframe
Y = Y.rename("target")
final_df = pd.concat([features_df, Y], axis=1)

#### 3. Split the dataset into train and test dataset



In [137]:
from sklearn.model_selection import train_test_split

# 1. Separate features (X) and target (y)
X = final_df.drop(columns=["target"])
y = final_df["target"]

# 2. First split: 70% train, 30% temp (which will become 15% val + 15% test)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.3,           # 30% for temp (val + test)
    random_state=42,         # reproducibility
    stratify=y               # keeps class distribution balanced
)

# 3. Second split: split the 30% temp into 15% val and 15% test (50-50 split of temp)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,           # 50% of temp = 15% of total
    random_state=42,         # reproducibility
    stratify=y_temp          # keeps class distribution balanced
)

print("Train set shape:", X_train.shape, y_train.shape)
print("Validation set shape:", X_val.shape, y_val.shape)
print("Test set shape:", X_test.shape, y_test.shape)

Train set shape: (142497, 144) (142497,)
Validation set shape: (30535, 144) (30535,)
Test set shape: (30536, 144) (30536,)


In [138]:
# 3. Print the resulting sizes to verify the split
print(f"Original dataset size: {len(X)} rows")
print("-" * 40)
print(f"Training features (X_train) size:   {X_train.shape}")
print(f"Validation features (X_val) size:  {X_val.shape}")
print(f"Testing features (X_test) size:    {X_test.shape}")
print("-" * 40)
print(f"Training target (y_train) size:    {y_train.shape}")
print(f"Validation target (y_val) size:   {y_val.shape}")
print(f"Testing target (y_test) size:     {y_test.shape}")

# 4. Verify the class balance
print("\nClass distribution in y_train:")
print(y_train.value_counts(normalize=True))

print("\nClass distribution in y_val:")
print(y_val.value_counts(normalize=True))

print("\nClass distribution in y_test:")
print(y_test.value_counts(normalize=True))


Original dataset size: 203568 rows
----------------------------------------
Training features (X_train) size:   (142497, 144)
Validation features (X_val) size:  (30535, 144)
Testing features (X_test) size:    (30536, 144)
----------------------------------------
Training target (y_train) size:    (142497,)
Validation target (y_val) size:   (30535,)
Testing target (y_test) size:     (30536,)

Class distribution in y_train:
target
build            0.518193
take 3 tokens    0.454838
take 2 tokens    0.026176
reserve          0.000793
Name: proportion, dtype: float64

Class distribution in y_val:
target
build            0.518192
take 3 tokens    0.454855
take 2 tokens    0.026167
reserve          0.000786
Name: proportion, dtype: float64

Class distribution in y_test:
target
build            0.518208
take 3 tokens    0.454840
take 2 tokens    0.026166
reserve          0.000786
Name: proportion, dtype: float64


#### 4. Address class imbalance
>With reserve at only 0.08%, use class weights in your loss function (inversely proportional to class frequencies) to prevent the model from ignoring this rare but strategically important action.

### 5 . Training the MLP model

In [139]:
# Importing the right libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# 1. Convert to PyTorch tensors
# Convert features
X_train_tensor = torch.FloatTensor(X_train.values)
X_val_tensor   = torch.FloatTensor(X_val.values)
X_test_tensor  = torch.FloatTensor(X_test.values)

# Convert target
# Convert target labels (strings) to class indices
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train_indices = label_encoder.fit_transform(y_train)   # Fit on train
y_val_indices   = label_encoder.transform(y_val)         # Transform validation
y_test_indices  = label_encoder.transform(y_test)        # Transform test

# Convert to PyTorch tensors
y_train_tensor = torch.LongTensor(y_train_indices)
y_val_tensor   = torch.LongTensor(y_val_indices)
y_test_tensor  = torch.LongTensor(y_test_indices)

In [140]:
# 1.1 Verify encoding
print("Label encoding:")
for idx, label in enumerate(label_encoder.classes_):
    print(f"  {idx}: {label}")

Label encoding:
  0: build
  1: reserve
  2: take 2 tokens
  3: take 3 tokens


In [141]:
# 2. Create TensorDatasets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset   = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset  = TensorDataset(X_test_tensor, y_test_tensor)

# 3. Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [142]:
# 3. Compute the class weights (for imbalanced classes)
# This time we do it before training with the right label encoding

# Label encoding:
#   0: build
#   1: reserve
#   2: take 2 tokens
#   3: take 3 tokens

from sklearn.utils.class_weight import compute_class_weight

classes = np.array([0, 1, 2, 3])

# Compute weights based on training set only
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train_indices
)
class_weights_tensor = torch.FloatTensor(class_weights)

# Move class weights to GPU (for criterion)
class_weights_tensor = class_weights_tensor.to(device)

print("Class weights:")
class_names = ['build', 'reserve', 'take 2 tokens', 'take 3 tokens']
for cls_name, weight in zip(class_names, class_weights):
    print(f"  {cls_name}: {weight:.2f}")

Class weights:
  build: 0.48
  reserve: 315.26
  take 2 tokens: 9.55
  take 3 tokens: 0.55


In [143]:
# 4. Define the MLP Model

class SplendorMLP(nn.Module):
    def __init__(self, input_dim, hidden1, hidden2, num_classes, dropout):
        super(SplendorMLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden1)
        self.bn1 = nn.BatchNorm1d(hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.bn2 = nn.BatchNorm1d(hidden2)
        self.fc3 = nn.Linear(hidden2, num_classes)

        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.bn1(self.fc1(x)))
        x = self.dropout1(x)
        x = self.relu(self.bn2(self.fc2(x)))
        x = self.dropout2(x)
        x = self.fc3(x)  # No softmax here - CrossEntropyLoss handles it
        return x

In [None]:
# Check and set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

In [None]:
# Instantiate model
model = SplendorMLP(input_dim=144, hidden1=256, hidden2=128, num_classes=4, dropout=0.3)
model = model.to(device) # Move model to GPU

In [144]:
#5. Define the loss, the optimizr and the scheduler

# CrossEntropyLoss is used for multi-class classification.
# The 'weight' argument lets us handle class imbalance by giving rare classes more importance.
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)

# Adam optimizer is chosen for its adaptive learning rate and efficiency.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Scheduler automatically reduces the learning rate when validation loss stops improving.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

In [145]:
# 6. Training loop

num_epochs = 100
best_val_loss = float('inf')  # Track best validation loss
patience_counter = 0
early_stop_patience = 15

print("\nStarting training...")
print("="*80)

for epoch in range(num_epochs):
    # Training phase
    model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0

    for X_batch, y_batch in train_loader:  # Iterate through mini-batches
        # Move batch to GPU
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        # Forward pass
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Track metrics
        train_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        train_total += y_batch.size(0)
        train_correct += (predicted == y_batch).sum().item()

    avg_train_loss = train_loss / len(train_loader)
    train_accuracy = 100 * train_correct / train_total

    # Validation phase
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            # Move batch to GPU
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)

            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_total += y_batch.size(0)
            val_correct += (predicted == y_batch).sum().item()

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = 100 * val_correct / val_total

    # Learning rate scheduling
    scheduler.step(avg_val_loss)

    # Print progress
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}]")
        print(f"  Train Loss: {avg_train_loss:.4f}, Train Acc: {train_accuracy:.2f}%")
        print(f"  Val Loss:   {avg_val_loss:.4f}, Val Acc:   {val_accuracy:.2f}%")

    # Early stopping based on validation loss
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_splendor_model.pth')  # Save best model
    else:
        patience_counter += 1

    if patience_counter >= early_stop_patience:
        print(f"\nEarly stopping triggered at epoch {epoch+1}")
        break

print("\n" + "="*80)
print("Training complete!")
print(f"Best validation loss: {best_val_loss:.4f}")


Starting training...
Epoch [10/100]
  Train Loss: 0.6776, Train Acc: 63.06%
  Val Loss:   0.7004, Val Acc:   62.14%
Epoch [20/100]
  Train Loss: 0.5989, Train Acc: 66.50%
  Val Loss:   0.7163, Val Acc:   66.69%

Early stopping triggered at epoch 23

Training complete!
Best validation loss: 0.6894


In [146]:
# # Final Test Evaluation - no GPU
# PER-CLASS METRICS

from sklearn.metrics import classification_report, confusion_matrix

# Get predictions
model.load_state_dict(torch.load('best_splendor_model.pth'))
model = model.to(device)
model.eval()

all_predictions = []
all_targets = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        # Move batch to GPU
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        outputs = model(X_batch)
        _, predicted = torch.max(outputs, 1)

        # Move back to CPU for sklearn
        all_predictions.extend(predicted.cpu().numpy())
        all_targets.extend(y_batch.cpu().numpy())

all_predictions = np.array(all_predictions)
all_targets = np.array(all_targets)

# Get class names
class_names = label_encoder.classes_

# Print classification report
print("="*80)
print("CLASSIFICATION REPORT")
print("="*80)
print(classification_report(all_targets, all_predictions, target_names=class_names))

# Print confusion matrix
print("\n" + "="*80)
print("CONFUSION MATRIX")
print("="*80)
cm = confusion_matrix(all_targets, all_predictions)
cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)
print(cm_df)

# Check prediction distribution
print("\n" + "="*80)
print("PREDICTION vs ACTUAL DISTRIBUTION")
print("="*80)
for i, cls_name in enumerate(class_names):
    pred_count = (all_predictions == i).sum()
    actual_count = (all_targets == i).sum()
    print(f"{cls_name:20s} - Predicted: {pred_count:5d}, Actual: {actual_count:5d}")

CLASSIFICATION REPORT
               precision    recall  f1-score   support

        build       0.74      0.74      0.74     15824
      reserve       0.03      0.67      0.07        24
take 2 tokens       0.12      0.78      0.20       799
take 3 tokens       0.77      0.49      0.60     13889

     accuracy                           0.63     30536
    macro avg       0.41      0.67      0.40     30536
 weighted avg       0.73      0.63      0.66     30536


CONFUSION MATRIX
               build  reserve  take 2 tokens  take 3 tokens
build          11714      329           1808           1973
reserve            8       16              0              0
take 2 tokens     86        0            626             87
take 3 tokens   4031      116           2984           6758

PREDICTION vs ACTUAL DISTRIBUTION
build                - Predicted: 15839, Actual: 15824
reserve              - Predicted:   461, Actual:    24
take 2 tokens        - Predicted:  5418, Actual:   799
take 3 tokens    

Architecture 1: 256 → 128 → 4, dropout 0.3
Accuracy: 63%

Macro F1: 0.41 (weighted F1: 0.67)

Strengths:

Good performance on majority classes (build and take 3 tokens).

Recall for rare classes (reserve, take 2 tokens) is surprisingly high (0.83), meaning the model often flags them.

Weaknesses:

Precision for rare classes is extremely low (0.04 and 0.11), leading to floods of false positives.

Over-prediction: e.g., reserve predicted 500 times vs only 24 actual.

Interpretation: The larger model memorizes patterns and aggressively predicts minority classes, but without precision. Accuracy looks decent because majority classes dominate.

The model achieves 63% accuracy but this metric is misleading due to severe class imbalance issues. While it handles the dominant "build" and "take 3 tokens" classes reasonably well (77% and 75% precision), it catastrophically fails on minority classes.

The most critical failure is "reserve" (0.08% of data), which the model over-predicts by 21× (500 predictions vs 24 actual), achieving only 4% precision despite 83% recall. Similarly, "take 2 tokens" is over-predicted by 7.3× (5,819 vs 799 actual) with just 11% precision. These extreme over-predictions create floods of false positives that would make the model unusable in practice.

The confusion matrix shows systematic errors: 3,164 "take 3 tokens" misclassified as "build" and 3,111 as "take 2 tokens." The model is simultaneously biased toward common classes when uncertain, yet over-triggers on rare classes. The gap between macro F1 (0.41) and weighted F1 (0.67) confirms that minority class failures are masked by dominant class performance, making the 63% accuracy fundamentally unreliable as a quality metric.

#7. Training loop with MLP (Second attempt)`

Key Changes:

1. Capped weights to 10.0 - prevents "reserve" (315→10) from dominating
2. Smaller model (256→256 becomes 128→64) - less overfitting
3. Higher dropout (0.3→0.4) - better generalization
4. L2 regularization - penalizes large weights
5. F1-score for early stopping - better metric for imbalanced data
6. Better monitoring - shows F1-score during training

In [147]:
# Check and set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

Using device: cuda
GPU: Tesla T4
GPU Memory: 15.83 GB


In [148]:
# 1. COMPUTE CLASS WEIGHTS - WITH CAPPING TO PREVENT EXTREMES

classes = np.array([0, 1, 2, 3])
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train_indices
)

# CAP THE WEIGHTS: Prevent extreme values from dominating
class_weights_capped = np.clip(class_weights, 0.5, 10.0)

class_weights_tensor = torch.FloatTensor(class_weights_capped)

# Move class weights to GPU (for criterion)
class_weights_tensor = class_weights_tensor.to(device)

print("Original vs Capped Class Weights:")
class_names = label_encoder.classes_
for cls_name, orig, capped in zip(class_names, class_weights, class_weights_capped):
    print(f"  {cls_name:20s}: {orig:7.2f} -> {capped:5.2f}")

Original vs Capped Class Weights:
  build               :    0.48 ->  0.50
  reserve             :  315.26 -> 10.00
  take 2 tokens       :    9.55 ->  9.55
  take 3 tokens       :    0.55 ->  0.55


In [149]:
# Check device
print(class_weights_tensor.device)

cuda:0


In [150]:
# 3. Smaller model with higher dropout
model_v2 = SplendorMLP(input_dim=144, hidden1=128, hidden2=64, num_classes=4, dropout=0.4)
# Move model to GPU
model_v2 = model_v2.to(device)

In [151]:
print(f"\nModel architecture: 144 → 128 → 64 → 4")
print(f"Total parameters: {sum(p.numel() for p in model_v2.parameters()):,}")


Model architecture: 144 → 128 → 64 → 4
Total parameters: 27,460


In [152]:
#4. Define Loss, Optimizer with L2 Regularization and scheduler
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)
optimizer = optim.Adam(model_v2.parameters(), lr=0.001, weight_decay=1e-4)  # L2 regularization
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

In [153]:
# 5. TRAINING LOOP WITH GPU SUPPORT AND VALIDATION SET

num_epochs = 100
best_val_loss = float('inf')
best_val_f1 = 0.0
patience_counter = 0
early_stop_patience = 15

print("\nStarting training...")
print("="*80)

for epoch in range(num_epochs):
    # Training phase
    model_v2.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0

    for X_batch, y_batch in train_loader:
        # Move batch to GPU
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        outputs = model_v2(X_batch)
        loss = criterion(outputs, y_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        train_total += y_batch.size(0)
        train_correct += (predicted == y_batch).sum().item()

    avg_train_loss = train_loss / len(train_loader)
    train_accuracy = 100 * train_correct / train_total

    # Validation phase
    model_v2.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    all_val_preds = []
    all_val_targets = []

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            # Move batch to GPU
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            outputs = model_v2(X_batch)
            loss = criterion(outputs, y_batch)

            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_total += y_batch.size(0)
            val_correct += (predicted == y_batch).sum().item()

            # Move predictions back to CPU for sklearn
            all_val_preds.extend(predicted.cpu().numpy())
            all_val_targets.extend(y_batch.cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = 100 * val_correct / val_total

    # Calculate macro F1-score
    from sklearn.metrics import f1_score
    val_f1_macro = f1_score(all_val_targets, all_val_preds, average='macro')

    scheduler.step(avg_val_loss)

    # Print progress every 5 epochs
    if (epoch + 1) % 5 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}]")
        print(f"  Train Loss: {avg_train_loss:.4f}, Train Acc: {train_accuracy:.2f}%")
        print(f"  Val Loss:   {avg_val_loss:.4f}, Val Acc:   {val_accuracy:.2f}%, F1: {val_f1_macro:.4f}")

    # Early stopping based on validation F1
    if val_f1_macro > best_val_f1:
        best_val_f1 = val_f1_macro
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model_v2.state_dict(), 'best_splendor_model_v2.pth')
        print(f"  ✓ New best Val F1: {val_f1_macro:.4f} (saved)")
    else:
        patience_counter += 1

    if patience_counter >= early_stop_patience:
        print(f"\nEarly stopping at epoch {epoch+1}")
        break

print("\n" + "="*80)
print("Training complete!")
print(f"Best validation F1: {best_val_f1:.4f}")
print(f"Best validation loss: {best_val_loss:.4f}")


Starting training...
  ✓ New best Val F1: 0.3732 (saved)
  ✓ New best Val F1: 0.3873 (saved)
  ✓ New best Val F1: 0.4096 (saved)
  ✓ New best Val F1: 0.4512 (saved)
Epoch [5/100]
  Train Loss: 0.7777, Train Acc: 61.55%
  Val Loss:   0.7284, Val Acc:   62.68%, F1: 0.4413
  ✓ New best Val F1: 0.4641 (saved)
Epoch [10/100]
  Train Loss: 0.7150, Train Acc: 62.47%
  Val Loss:   0.6802, Val Acc:   61.10%, F1: 0.4068
Epoch [15/100]
  Train Loss: 0.6889, Train Acc: 63.29%
  Val Loss:   0.6496, Val Acc:   62.82%, F1: 0.4637
  ✓ New best Val F1: 0.4802 (saved)
  ✓ New best Val F1: 0.4892 (saved)
Epoch [20/100]
  Train Loss: 0.6731, Train Acc: 63.49%
  Val Loss:   0.6306, Val Acc:   62.97%, F1: 0.4658
  ✓ New best Val F1: 0.5072 (saved)
Epoch [25/100]
  Train Loss: 0.6556, Train Acc: 64.16%
  Val Loss:   0.6287, Val Acc:   65.13%, F1: 0.4855
Epoch [30/100]
  Train Loss: 0.6561, Train Acc: 63.92%
  Val Loss:   0.6271, Val Acc:   63.83%, F1: 0.4671
Epoch [35/100]
  Train Loss: 0.6475, Train Acc: 6

In [154]:
# Final evaluation on test set
print("\n" + "="*80)
print("Evaluating on test set...")
model_v2.load_state_dict(torch.load('best_splendor_model_v2.pth'))
model_v2.eval()

test_loss = 0.0
test_correct = 0
test_total = 0
all_test_preds = []
all_test_targets = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        outputs = model_v2(X_batch)
        loss = criterion(outputs, y_batch)

        test_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        test_total += y_batch.size(0)
        test_correct += (predicted == y_batch).sum().item()

        all_test_preds.extend(predicted.cpu().numpy())
        all_test_targets.extend(y_batch.cpu().numpy())

avg_test_loss = test_loss / len(test_loader)
test_accuracy = 100 * test_correct / test_total

# Import metrics
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score
import numpy as np

# Calculate various metrics
test_f1_macro = f1_score(all_test_targets, all_test_preds, average='macro')
test_f1_weighted = f1_score(all_test_targets, all_test_preds, average='weighted')
test_precision_macro = precision_score(all_test_targets, all_test_preds, average='macro')
test_recall_macro = recall_score(all_test_targets, all_test_preds, average='macro')

print(f"\nTest Loss: {avg_test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.2f}%")
print(f"\nMacro Metrics:")
print(f"  Precision: {test_precision_macro:.4f}")
print(f"  Recall:    {test_recall_macro:.4f}")
print(f"  F1-Score:  {test_f1_macro:.4f}")
print(f"\nWeighted F1-Score: {test_f1_weighted:.4f}")

# Classification report
print("\n" + "="*80)
print("CLASSIFICATION REPORT")
print("="*80)
class_names = ['build', 'reserve', 'take 2 tokens', 'take 3 tokens']
print(classification_report(all_test_targets, all_test_preds,
                          target_names=class_names,
                          digits=2))

# Confusion matrix
print("="*80)
print("CONFUSION MATRIX")
print("="*80)
cm = confusion_matrix(all_test_targets, all_test_preds)
print(f"{'':>20}", end='')
for name in class_names:
    print(f"{name:>15}", end='')
print()
for i, name in enumerate(class_names):
    print(f"{name:>20}", end='')
    for j in range(len(class_names)):
        print(f"{cm[i][j]:>15}", end='')
    print()

# Prediction vs Actual distribution
print("\n" + "="*80)
print("PREDICTION vs ACTUAL DISTRIBUTION")
print("="*80)
unique_actual, counts_actual = np.unique(all_test_targets, return_counts=True)
unique_pred, counts_pred = np.unique(all_test_preds, return_counts=True)

# Create dict for easy lookup
pred_dict = dict(zip(unique_pred, counts_pred))

for i, name in enumerate(class_names):
    actual_count = counts_actual[i] if i < len(counts_actual) else 0
    pred_count = pred_dict.get(i, 0)
    print(f"{name:>20} - Predicted: {pred_count:>5}, Actual: {actual_count:>5}")

print("="*80)


Evaluating on test set...

Test Loss: 0.6293
Test Accuracy: 63.74%

Macro Metrics:
  Precision: 0.5114
  Recall:    0.6676
  F1-Score:  0.5061

Weighted F1-Score: 0.6749

CLASSIFICATION REPORT
               precision    recall  f1-score   support

        build       0.78      0.74      0.76     15824
      reserve       0.39      0.54      0.46        24
take 2 tokens       0.11      0.88      0.20       799
take 3 tokens       0.76      0.51      0.61     13889

     accuracy                           0.64     30536
    macro avg       0.51      0.67      0.51     30536
 weighted avg       0.75      0.64      0.67     30536

CONFUSION MATRIX
                              build        reserve  take 2 tokens  take 3 tokens
               build          11716              7           1942           2159
             reserve             11             13              0              0
       take 2 tokens             56              0            705             38
       take 3 tokens  

Architecture 2 (128→64, dropout 0.4) is slightly better overall with a test accuracy of 63.74% compared to Architecture 1's 63.00%. While both models achieve similar weighted F1 scores (0.67 vs 0.66), Architecture 2 demonstrates significantly better handling of rare actions, particularly "reserve" moves where it achieves a precision of 0.39 versus Architecture 1's poor 0.03. Architecture 2 also shows improved recall on "take 2 tokens" actions (0.88 vs 0.78), though it trades off slightly worse performance on "take 3 tokens" (recall 0.51 vs 0.49). The simpler architecture with fewer parameters reduces overfitting risk while maintaining better balanced performance across all action types, making it more robust for actual gameplay despite having similar macro-level metrics.

## Focal Loss implementation

- Focus learning on hard-to-classify examples while being much less aggressive than inverse frequency weighting, preventing the massive overprediction. It avoids exploding gradients and massive overprediction of rare classes.

- Oversampling for only the two minority classes (reserve and take 2 tokens) by duplicating them 3-5x in the training set. This gives the model more exposure to rare patterns without distorting the distribution too much.

In [155]:
#1. Define Focal Loss class and calculate effective class weights
# Focal Loss modifies CrossEntropyLoss by adding a focusing term

import torch.nn.functional as F
class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=1.5, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none', weight=self.alpha)
        pt = torch.exp(-ce_loss)
        focal_loss = ((1 - pt) ** self.gamma) * ce_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        return focal_loss

In [156]:
#3. Oversample minority classes (reserve 5x, take 2 tokens 3x) to give the model more exposure without extreme weight changes
def get_effective_weights(class_counts, beta=0.999):
    effective_num = 1.0 - np.power(beta, class_counts)
    weights = (1.0 - beta) / effective_num
    weights = weights / weights.sum() * len(weights)
    return weights

# Get counts for each class using your y_train_indices
class_counts = np.array([
    (y_train_indices == 0).sum(),  # build
    (y_train_indices == 1).sum(),  # reserve
    (y_train_indices == 2).sum(),  # take 2 tokens
    (y_train_indices == 3).sum()   # take 3 tokens
])

In [157]:
print("Class counts in training set:")
for i, label in enumerate(label_encoder.classes_):
    print(f"  {i}: {label:20} {class_counts[i]:6d} samples")

Class counts in training set:
  0: build                 73841 samples
  1: reserve                 113 samples
  2: take 2 tokens          3730 samples
  3: take 3 tokens         64813 samples


In [158]:
# Calculate weights with beta=0.999
class_weights = get_effective_weights(class_counts, beta=0.999)
class_weights_tensor = torch.FloatTensor(class_weights).to(device)

print("\nClass weights (effective number method):")
for i, label in enumerate(label_encoder.classes_):
    print(f"  {i}: {label:20} weight: {class_weights[i]:.4f}")


Class weights (effective number method):
  0: build                weight: 0.3231
  1: reserve              weight: 3.0227
  2: take 2 tokens        weight: 0.3311
  3: take 3 tokens        weight: 0.3231


### Oversample Minority Classes

In [159]:
# Convert tensors back to numpy for oversampling
X_train_np = X_train_tensor.cpu().numpy()
y_train_np = y_train_tensor.cpu().numpy()

X_train_oversampled = X_train_np.copy()
y_train_oversampled = y_train_np.copy()

In [160]:
# Oversample reserve class (class 1) - duplicate 5x
reserve_indices = np.where(y_train_np == 1)[0]
print(f"Found {len(reserve_indices)} reserve samples, duplicating 5x...")
for _ in range(4):  # 4 more times = 5x total
    X_train_oversampled = np.vstack([X_train_oversampled, X_train_np[reserve_indices]])
    y_train_oversampled = np.concatenate([y_train_oversampled, y_train_np[reserve_indices]])


Found 113 reserve samples, duplicating 5x...


In [161]:
# Oversample take 2 tokens (class 2) - duplicate 3x
take2_indices = np.where(y_train_np == 2)[0]
print(f"Found {len(take2_indices)} take 2 tokens samples, duplicating 3x...")
for _ in range(2):  # 2 more times = 3x total
    X_train_oversampled = np.vstack([X_train_oversampled, X_train_np[take2_indices]])
    y_train_oversampled = np.concatenate([y_train_oversampled, y_train_np[take2_indices]])

Found 3730 take 2 tokens samples, duplicating 3x...


In [162]:
print(f"\nOriginal training set: {len(y_train_np)} samples")
print(f"Oversampled training set: {len(y_train_oversampled)} samples")
print(f"Increase: {len(y_train_oversampled) - len(y_train_np)} samples")


Original training set: 142497 samples
Oversampled training set: 150409 samples
Increase: 7912 samples


In [163]:
print("\nNew class distribution:")
for i, label in enumerate(label_encoder.classes_):
    count = (y_train_oversampled == i).sum()
    pct = 100 * count / len(y_train_oversampled)
    print(f"  {i}: {label:20} {count:6d} ({pct:5.2f}%)")


New class distribution:
  0: build                 73841 (49.09%)
  1: reserve                 565 ( 0.38%)
  2: take 2 tokens         11190 ( 7.44%)
  3: take 3 tokens         64813 (43.09%)


In [164]:
# Create new dataset with oversampled data
train_dataset_oversampled = TensorDataset(
    torch.FloatTensor(X_train_oversampled),
    torch.LongTensor(y_train_oversampled)
)

train_loader_oversampled = DataLoader(
    train_dataset_oversampled,
    batch_size=256,
    shuffle=True
)

print(f"\nNew train loader created with {len(train_loader_oversampled)} batches")


New train loader created with 588 batches


## Initialize Model with Focal Loss

In [165]:
# Instantiate model with my architecture
model_focal = SplendorMLP(
    input_dim=144,
    hidden1=128,
    hidden2=64,
    num_classes=4,
    dropout=0.4
).to(device)

In [166]:
# Initialize Focal Loss with gamma=1.5 and class weights
criterion_focal = FocalLoss(alpha=class_weights_tensor, gamma=1.5)

In [167]:
# Optimizer
optimizer_focal = optim.AdamW(
    model_focal.parameters(),
    lr=0.001,
    weight_decay=0.01
)

In [168]:
# Learning rate scheduler
scheduler_focal = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer_focal,
    mode='max',
    factor=0.5,
    patience=5
)

In [171]:
print("Model initialized with Focal Loss")
print(f"  Architecture: 144 -> 128 -> 64 -> 4")
print(f"  Gamma: 1.5")
print(f"  Dropout: 0.4")
print(f"  Using oversampled training data: {len(y_train_oversampled):,} samples")
print(f"  Model parameters: {sum(p.numel() for p in model_focal.parameters()):,}")

Model initialized with Focal Loss
  Architecture: 144 -> 128 -> 64 -> 4
  Gamma: 1.5
  Dropout: 0.4
  Using oversampled training data: 150,409 samples
  Model parameters: 27,460


## Training Loop with validation set

In [172]:
# ============================================================================
#Training Loop with Focal Loss (GPU)
# ============================================================================
num_epochs = 50
best_val_f1 = 0.0
patience_counter = 0
early_stop_patience = 20

train_losses = []
val_f1_scores = []
val_losses = []

print("Starting training...")
print(f"Device: {device}")
print(f"Epochs: {num_epochs}")
print(f"Batch size: 256")
print(f"Early stopping patience: {early_stop_patience}")
print("="*80)

for epoch in range(num_epochs):
    # ==================== Training Phase ====================
    model_focal.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0

    for X_batch, y_batch in train_loader_oversampled:
        # Move to GPU
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        # Forward pass
        optimizer_focal.zero_grad()
        outputs = model_focal(X_batch)
        loss = criterion_focal(outputs, y_batch)

        # Backward pass
        loss.backward()

        # Gradient clipping to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model_focal.parameters(), max_norm=1.0)

        # Update weights
        optimizer_focal.step()

        # Track metrics
        train_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        train_total += y_batch.size(0)
        train_correct += (predicted == y_batch).sum().item()

    avg_train_loss = train_loss / len(train_loader_oversampled)
    train_accuracy = 100 * train_correct / train_total
    train_losses.append(avg_train_loss)

    # ==================== Validation Phase ====================
    model_focal.eval()
    val_loss = 0.0
    val_preds = []
    val_targets = []

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            # Move to GPU
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            # Forward pass
            outputs = model_focal(X_batch)
            loss = criterion_focal(outputs, y_batch)

            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)

            # Store predictions for metrics
            val_preds.extend(predicted.cpu().numpy())
            val_targets.extend(y_batch.cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    val_losses.append(avg_val_loss)

    # Calculate metrics
    from sklearn.metrics import f1_score, accuracy_score
    val_f1_macro = f1_score(val_targets, val_preds, average='macro')
    val_f1_weighted = f1_score(val_targets, val_preds, average='weighted')
    val_accuracy = 100 * accuracy_score(val_targets, val_preds)
    val_f1_scores.append(val_f1_macro)

    # Learning rate scheduling
    scheduler_focal.step(val_f1_macro)
    current_lr = optimizer_focal.param_groups[0]['lr']

    # Print progress
    print(f"Epoch {epoch+1:2d}/{num_epochs} | "
          f"Train Loss: {avg_train_loss:.4f} | "
          f"Train Acc: {train_accuracy:.2f}% | "
          f"Val Loss: {avg_val_loss:.4f} | "
          f"Val Acc: {val_accuracy:.2f}% | "
          f"Val F1 (macro): {val_f1_macro:.4f} | "
          f"Val F1 (weighted): {val_f1_weighted:.4f} | "
          f"LR: {current_lr:.6f}")

    # Save best model
    if val_f1_macro > best_val_f1:
        best_val_f1 = val_f1_macro
        torch.save(model_focal.state_dict(), 'best_splendor_focal_model.pth')
        patience_counter = 0
        print(f"  ✓ New best model saved! (F1: {best_val_f1:.4f})")
    else:
        patience_counter += 1
        if patience_counter >= early_stop_patience:
            print(f"\nEarly stopping triggered at epoch {epoch+1}")
            print(f"Best validation F1: {best_val_f1:.4f}")
            break

print("\n" + "="*80)
print("Training complete!")
print(f"Best validation F1 (macro): {best_val_f1:.4f}")
print("="*80)

Starting training...
Device: cuda
Epochs: 50
Batch size: 256
Early stopping patience: 20
Epoch  1/50 | Train Loss: 0.0534 | Train Acc: 62.40% | Val Loss: 0.0345 | Val Acc: 68.89% | Val F1 (macro): 0.3617 | Val F1 (weighted): 0.6856 | LR: 0.001000
  ✓ New best model saved! (F1: 0.3617)
Epoch  2/50 | Train Loss: 0.0401 | Train Acc: 65.66% | Val Loss: 0.0311 | Val Acc: 70.42% | Val F1 (macro): 0.3993 | Val F1 (weighted): 0.6962 | LR: 0.001000
  ✓ New best model saved! (F1: 0.3993)
Epoch  3/50 | Train Loss: 0.0366 | Train Acc: 66.43% | Val Loss: 0.0302 | Val Acc: 70.66% | Val F1 (macro): 0.4363 | Val F1 (weighted): 0.7022 | LR: 0.001000
  ✓ New best model saved! (F1: 0.4363)
Epoch  4/50 | Train Loss: 0.0348 | Train Acc: 66.88% | Val Loss: 0.0306 | Val Acc: 71.60% | Val F1 (macro): 0.4062 | Val F1 (weighted): 0.7040 | LR: 0.001000
Epoch  5/50 | Train Loss: 0.0337 | Train Acc: 66.95% | Val Loss: 0.0298 | Val Acc: 71.20% | Val F1 (macro): 0.4128 | Val F1 (weighted): 0.7017 | LR: 0.001000
Epoc

## Test Set Evaluation

In [173]:
# Load best model
model_focal.load_state_dict(torch.load('best_splendor_focal_model.pth'))
model_focal.eval()

# Evaluation
test_loss = 0.0
test_preds = []
test_targets = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        # Move to GPU
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        # Forward pass
        outputs = model_focal(X_batch)
        loss = criterion_focal(outputs, y_batch)

        test_loss += loss.item()
        _, predicted = torch.max(outputs, 1)

        # Store predictions
        test_preds.extend(predicted.cpu().numpy())
        test_targets.extend(y_batch.cpu().numpy())

# Calculate metrics
avg_test_loss = test_loss / len(test_loader)
test_accuracy = 100 * accuracy_score(test_targets, test_preds)
test_f1_macro = f1_score(test_targets, test_preds, average='macro')
test_f1_weighted = f1_score(test_targets, test_preds, average='weighted')
test_precision_macro = precision_score(test_targets, test_preds, average='macro')
test_recall_macro = recall_score(test_targets, test_preds, average='macro')

print(f"\nTest Loss: {avg_test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.2f}%")
print(f"\nMacro Metrics:")
print(f"  Precision: {test_precision_macro:.4f}")
print(f"  Recall:    {test_recall_macro:.4f}")
print(f"  F1-Score:  {test_f1_macro:.4f}")
print(f"\nWeighted F1-Score: {test_f1_weighted:.4f}")


Test Loss: 0.0445
Test Accuracy: 73.62%

Macro Metrics:
  Precision: 0.4844
  Recall:    0.4951
  F1-Score:  0.4858

Weighted F1-Score: 0.7341


In [174]:
# Test Set Detailed Analysis

from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score

print("\n" + "="*80)
print("CLASSIFICATION REPORT (TEST SET)")
print("="*80)
print(classification_report(
    test_targets,
    test_preds,
    target_names=label_encoder.classes_,
    digits=4
))

print("="*80)
print("CONFUSION MATRIX (TEST SET)")
print("="*80)
cm = confusion_matrix(test_targets, test_preds)
print(f"{'':>20}", end='')
for label in label_encoder.classes_:
    print(f"{label:>15}", end='')
print()
for i, label in enumerate(label_encoder.classes_):
    print(f"{label:>20}", end='')
    for j in range(len(label_encoder.classes_)):
        print(f"{cm[i][j]:>15}", end='')
    print()

print("\n" + "="*80)
print("PREDICTION vs ACTUAL DISTRIBUTION (TEST SET)")
print("="*80)
unique_actual, counts_actual = np.unique(test_targets, return_counts=True)
unique_pred, counts_pred = np.unique(test_preds, return_counts=True)

pred_dict = dict(zip(unique_pred, counts_pred))

for i, label in enumerate(label_encoder.classes_):
    actual_count = counts_actual[i] if i < len(counts_actual) else 0
    pred_count = pred_dict.get(i, 0)
    ratio = pred_count / actual_count if actual_count > 0 else 0
    print(f"{label:>20} | Predicted: {pred_count:>5} | Actual: {actual_count:>5} | Ratio: {ratio:.2f}x")

print("="*80)


CLASSIFICATION REPORT (TEST SET)
               precision    recall  f1-score   support

        build     0.7415    0.8503    0.7922     15824
      reserve     0.1667    0.1667    0.1667        24
take 2 tokens     0.2565    0.3329    0.2898       799
take 3 tokens     0.7728    0.6304    0.6944     13889

     accuracy                         0.7362     30536
    macro avg     0.4844    0.4951    0.4858     30536
 weighted avg     0.7426    0.7362    0.7341     30536

CONFUSION MATRIX (TEST SET)
                              build        reserve  take 2 tokens  take 3 tokens
               build          13455              5            118           2246
             reserve             18              4              0              2
       take 2 tokens            207              0            266            326
       take 3 tokens           4465             15            653           8756

PREDICTION vs ACTUAL DISTRIBUTION (TEST SET)
               build | Predicted: 18145 | Ac

Focal Loss + Oversampling achieved target performance

...

# Fine-tuning the best model

- loss function tuning : adjusting the focal loss γ (between 1.5–2.0) and refining class weights based on effective sample counts might provide the most immediate gains in minority class performance.

1. CLASS WEIGHT CALCULATION METHODS

The following function calculates class weights using the “effective number of samples” method.

The idea: classes with fewer samples should get higher weights, but not in an extreme way.

A higher beta (like 0.9999) makes the correction more aggressive, meaning rare classes get boosted more strongly.

After computing, the weights are normalized so they sum up to the number of classes.

Use case: balances training by giving rare classes more importance without exploding values.

In [175]:
def get_effective_weights_v2(class_counts, beta=0.9999):
    """More aggressive effective number (higher beta = less correction)"""
    effective_num = 1.0 - np.power(beta, class_counts)
    weights = (1.0 - beta) / effective_num
    weights = weights / weights.sum() * len(weights)
    return weights

The following function computes inverse frequency weights: classes with fewer samples get larger weights.

To avoid extreme values (like hundreds for very rare classes), the weights are capped at a maximum (cap=8.0).

It also ensures weights don’t go below 0.5, so majority classes still retain some influence.

Use case: a simpler, more controlled way to handle imbalance compared to effective number weighting.

In [176]:
def get_inverse_freq_capped(class_counts, cap=8.0):
    """Capped inverse frequency"""
    total = class_counts.sum()
    weights = total / (len(class_counts) * class_counts)
    weights = np.clip(weights, 0.5, cap)
    return weights

2. CONFIGURATIONS TO TEST

This list defines different experimental setups for training with Focal Loss. Each dictionary represents one configuration, with parameters that control how the loss function handles class imbalance:

- Gamma → controls how strongly the loss focuses on hard examples.
- Beta → used in the effective number of samples weighting method.
- Cap → used in the capped inverse frequency method.

In [177]:
configs_to_test = [
    {'name': 'Gamma_1.5_Beta_0.999', 'gamma': 1.5, 'beta': 0.999, 'cap': None},
    {'name': 'Gamma_1.8_Beta_0.999', 'gamma': 1.8, 'beta': 0.999, 'cap': None},
    {'name': 'Gamma_2.0_Beta_0.999', 'gamma': 2.0, 'beta': 0.999, 'cap': None},
    {'name': 'Gamma_1.5_Cap_8', 'gamma': 1.5, 'beta': None, 'cap': 8.0},
    {'name': 'Gamma_2.0_Cap_6', 'gamma': 2.0, 'beta': None, 'cap': 6.0},
]

3. Training function
This is a compact training loop with early stopping based on macro F1‑score. It’s designed to quickly test models without running for too many epochs.

In [185]:
from copy import deepcopy
def quick_train(model, train_loader, val_loader, criterion, device,
                num_epochs=30, lr=0.001):
    """Quick training with early stopping"""
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max',
                                                      factor=0.5, patience=4)

    best_f1 = 0.0
    best_model = None
    patience = 0

    for epoch in range(num_epochs):
        # Train
        model.train()
        for X, y in train_loader:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            loss = criterion(model(X), y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        # Validate
        model.eval()
        preds, targets = [], []
        with torch.no_grad():
            for X, y in val_loader:
                X, y = X.to(device), y.to(device)
                _, pred = torch.max(model(X), 1)
                preds.extend(pred.cpu().numpy())
                targets.extend(y.cpu().numpy())

        f1 = f1_score(targets, preds, average='macro')
        scheduler.step(f1)

        if f1 > best_f1:
            best_f1 = f1
            best_model = deepcopy(model.state_dict())
            patience = 0
        else:
            patience += 1
            if patience >= 10:
                break

    return best_model, best_f1


4. MAIN GRID SEARCH LOOP

This block runs a grid search over different Focal Loss configurations to see which setup gives the best performance.



1. Loop through configs: Iterates over each configuration in configs_to_test, showing its parameters (gamma, beta, cap).
2. Weight calculation
3. Model creation: Builds a fresh SplendorMLP
4. Loss setup: Initializes Focal Loss with the chosen weights and gamma value.
5. Training: Runs quick_train on the oversampled training set, tracking the best validation F1.
6. Testing: Loads the best model weights, evaluates on the test set, and collects predictions.
7. Metrics: Computes macro F1 and accuracy, then stores results (including predictions and targets) in a list for later analysis.





In [186]:
print("FOCAL LOSS FINE-TUNING - GRID SEARCH")

results = []

for i, cfg in enumerate(configs_to_test):
    print(f"\n[{i+1}/{len(configs_to_test)}] Testing: {cfg['name']}")
    print(f"  Gamma: {cfg['gamma']}, Beta: {cfg['beta']}, Cap: {cfg['cap']}")

    # Calculate weights
    if cfg['cap'] is None:
        weights = get_effective_weights_v2(class_counts, beta=cfg['beta'])
    else:
        weights = get_inverse_freq_capped(class_counts, cap=cfg['cap'])

    print(f"  Weights: {weights.round(4)}")

    # Create fresh model
    model = SplendorMLP(144, 128, 64, 4, 0.4).to(device)
    criterion = FocalLoss(
        alpha=torch.FloatTensor(weights).to(device),
        gamma=cfg['gamma']
    )

    # Train
    best_model, val_f1 = quick_train(
        model, train_loader_oversampled, val_loader, criterion, device
    )

    # Test
    model.load_state_dict(best_model)
    model.eval()
    test_preds, test_targets = [], []

    with torch.no_grad():
        for X, y in test_loader:
            X, y = X.to(device), y.to(device)
            _, pred = torch.max(model(X), 1)
            test_preds.extend(pred.cpu().numpy())
            test_targets.extend(y.cpu().numpy())

    test_f1 = f1_score(test_targets, test_preds, average='macro')
    test_acc = 100 * accuracy_score(test_targets, test_preds)

    results.append({
        'name': cfg['name'],
        'gamma': cfg['gamma'],
        'val_f1': val_f1,
        'test_f1': test_f1,
        'test_acc': test_acc,
        'model': best_model,
        'preds': test_preds,
        'targets': test_targets
    })

    print(f"  Val F1: {val_f1:.4f} | Test F1: {test_f1:.4f} | Test Acc: {test_acc:.2f}%")

FOCAL LOSS FINE-TUNING - GRID SEARCH

[1/5] Testing: Gamma_1.5_Beta_0.999
  Gamma: 1.5, Beta: 0.999, Cap: None
  Weights: [0.3231 3.0227 0.3311 0.3231]
  Val F1: 0.4947 | Test F1: 0.5076 | Test Acc: 72.95%

[2/5] Testing: Gamma_1.8_Beta_0.999
  Gamma: 1.8, Beta: 0.999, Cap: None
  Weights: [0.3231 3.0227 0.3311 0.3231]
  Val F1: 0.4718 | Test F1: 0.4729 | Test Acc: 71.78%

[3/5] Testing: Gamma_2.0_Beta_0.999
  Gamma: 2.0, Beta: 0.999, Cap: None
  Weights: [0.3231 3.0227 0.3311 0.3231]
  Val F1: 0.4808 | Test F1: 0.4976 | Test Acc: 71.74%

[4/5] Testing: Gamma_1.5_Cap_8
  Gamma: 1.5, Beta: None, Cap: 8.0
  Weights: [0.5    8.     8.     0.5496]
  Val F1: 0.4269 | Test F1: 0.4343 | Test Acc: 60.20%

[5/5] Testing: Gamma_2.0_Cap_6
  Gamma: 2.0, Beta: None, Cap: 6.0
  Weights: [0.5    6.     6.     0.5496]
  Val F1: 0.4327 | Test F1: 0.4257 | Test Acc: 59.86%


5. Results Ranked by Test F1

Leaderboard of the grid search experiments

In [187]:
print("\n" + "="*80)
print("RESULTS RANKED BY TEST F1")
print("="*80)

sorted_results = sorted(results, key=lambda x: x['test_f1'], reverse=True)

for rank, r in enumerate(sorted_results, 1):
    print(f"{rank}. {r['name']:<25} | Val F1: {r['val_f1']:.4f} | "
          f"Test F1: {r['test_f1']:.4f} | Acc: {r['test_acc']:.2f}%")


RESULTS RANKED BY TEST F1
1. Gamma_1.5_Beta_0.999      | Val F1: 0.4947 | Test F1: 0.5076 | Acc: 72.95%
2. Gamma_2.0_Beta_0.999      | Val F1: 0.4808 | Test F1: 0.4976 | Acc: 71.74%
3. Gamma_1.8_Beta_0.999      | Val F1: 0.4718 | Test F1: 0.4729 | Acc: 71.78%
4. Gamma_1.5_Cap_8           | Val F1: 0.4269 | Test F1: 0.4343 | Acc: 60.20%
5. Gamma_2.0_Cap_6           | Val F1: 0.4327 | Test F1: 0.4257 | Acc: 59.86%


6. Best Model Evaluation

Full diagnostic view of the best model’s performance and preserves it for reuse.

In [188]:
best = sorted_results[0]
print("\n" + "="*80)
print(f"BEST: {best['name']} (Gamma={best['gamma']})")
print("="*80)

# Convert lists to numpy arrays
best_preds = np.array(best['preds'])
best_targets = np.array(best['targets'])

print("\nClassification Report:")
print(classification_report(
    best_targets, best_preds,
    target_names=label_encoder.classes_,
    digits=4
))

print("\nConfusion Matrix:")
cm = confusion_matrix(best_targets, best_preds)
print(f"{'':>20}", end='')
for label in label_encoder.classes_:
    print(f"{label:>15}", end='')
print()
for i, label in enumerate(label_encoder.classes_):
    print(f"{label:>20}", end='')
    for j in range(len(label_encoder.classes_)):
        print(f"{cm[i][j]:>15}", end='')
    print()

print("\nPrediction vs Actual:")
for i, label in enumerate(label_encoder.classes_):
    actual = (best_targets == i).sum()
    pred = (best_preds == i).sum()
    ratio = pred / actual if actual > 0 else 0
    print(f"{label:>20} | Pred: {pred:>5} | Actual: {actual:>5} | Ratio: {ratio:.2f}x")

# Save best model
torch.save(best['model'], 'best_finetuned_focal_model.pth')
print("\nBest model saved: best_finetuned_focal_model.pth")


BEST: Gamma_1.5_Beta_0.999 (Gamma=1.5)

Classification Report:
               precision    recall  f1-score   support

        build     0.7322    0.8533    0.7881     15824
      reserve     0.2105    0.3333    0.2581        24
take 2 tokens     0.2712    0.3442    0.3034       799
take 3 tokens     0.7688    0.6113    0.6811     13889

     accuracy                         0.7295     30536
    macro avg     0.4957    0.5355    0.5076     30536
 weighted avg     0.7364    0.7295    0.7263     30536


Confusion Matrix:
                              build        reserve  take 2 tokens  take 3 tokens
               build          13502              9             70           2243
             reserve             16              8              0              0
       take 2 tokens            214              0            275            310
       take 3 tokens           4709             21            669           8490

Prediction vs Actual:
               build | Pred: 18441 | Actual: 1