##### Imports

In [None]:
# Notebook-wide setup: imports, display options, seeding and device selection.
# NOTE(review): `sys` and `warnings` are each imported twice below; harmless but redundant.
import sys
from pathlib import Path
import warnings

import warnings
import pandas as pd
# Silences every warning category for the rest of the kernel session.
warnings.filterwarnings('ignore')

# Let pandas render up to 1000 columns / rows before truncating output.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

import sys
# Custom library paths: the parent directory must be on sys.path BEFORE the
# `scr` imports below, so keep this statement ahead of them.
sys.path.extend(['../'])

from scr.utils import set_seed
from scr.utils import read_words

# presumably seeds the RNGs used by the notebook — confirm in scr.utils.set_seed
set_seed(42)

import torch
import torch.nn as nn

# Allow reduced-precision float32 matmuls (trades accuracy for speed).
torch.set_float32_matmul_precision('medium')

# Device configuration: use the GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

##### Feature Engineering

In [None]:
# from scr.feature_engineering import process_single_word

# def get_ngrams(word, n=2):
#     if len(word) < n:
#         return []  # or return a special token, e.g., ['<short>']
#     return [word[i:i+n] for i in range(len(word)-n+1)]

# get_ngrams('w', n=2)

##### Data Reading and Feature Engineering

In [None]:
from scr.feature_engineering import process_single_word, \
    calculate_char_frequencies, char_to_idx, idx_to_char

from scr.utils import *

import random

# Masking probability passed to process_single_word as `mask_prob`.
MASK_PROB = 0.8

# Read the full training vocabulary; pass a number as `limit` to shrink it for debugging.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
word_list = read_words('/home/sayem/Desktop/Hangman/words_250000_train.txt', limit=None)

# Randomly select 1000 words to hold out as an unseen evaluation set.
unseen_words = random.sample(word_list, 1000)

# Remove the held-out words from the training list. A set gives O(1) membership
# tests; checking against the 1000-element list for every training word was
# needlessly quadratic.
_unseen_lookup = set(unseen_words)
word_list = [word for word in word_list if word not in _unseen_lookup]

# word_list = word_list[:10000]
# Statistics are computed on the training words only (after the hold-out removal).
char_frequency = calculate_char_frequencies(word_list)
max_word_length = max(len(word) for word in word_list)

# Smoke test on a single word
word = "sir"
features, labels, missed_chars = process_single_word(word, char_frequency, \
    max_word_length, mask_prob=MASK_PROB, normalize=True)

features

In [None]:
# Build per-word training examples. The four lists below stay index-aligned:
# entry i of each list describes word_list[i].
all_features, all_labels, all_missed_chars, original_words = [], [], [], []

for word in word_list:
    # Process each word to get its features, label, and missed characters
    feature_set, label, missed_chars = process_single_word(word, char_frequency, \
        max_word_length, mask_prob=MASK_PROB, normalize=True)

    all_features.append(feature_set)
    all_labels.append(label)
    all_missed_chars.append(missed_chars)
    original_words.append(word)  # Store the original word

# NOTE: despite the `_tensor` suffix these are plain Python lists with one tensor per word
# (variable-length sequences are only padded and stacked later, in collate_fn).
# squeeze(0) drops dimension 0 when it has size 1 — presumably the batch dimension
# added by process_single_word; confirm in scr.feature_engineering.
all_features_tensor = [features.squeeze(0) for features in all_features]  # Remove batch dimension
labels_tensor = [label.squeeze(0) for label in all_labels]  # Remove batch dimension
missed_chars_tensor = [missed_chars.squeeze(0) for missed_chars in all_missed_chars]  # Remove batch dimension

##### Dataset Building

In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset

class HangmanDataset(Dataset):
    """Index-aligned view over per-word features, labels, missed characters and words.

    Args:
        feature_tensors: sequence with one feature entry per word.
        label_tensors: sequence with one label entry per word.
        missed_chars_tensors: sequence with one missed-character entry per word.
        original_words: sequence with the source word for each example.

    Raises:
        ValueError: if the four sequences do not all have the same length.
            Without this check a mismatch would only surface later as an
            IndexError (or as silently unreachable trailing items).
    """

    def __init__(self, feature_tensors, label_tensors, missed_chars_tensors, original_words):
        sizes = {
            'feature_tensors': len(feature_tensors),
            'label_tensors': len(label_tensors),
            'missed_chars_tensors': len(missed_chars_tensors),
            'original_words': len(original_words),
        }
        if len(set(sizes.values())) > 1:
            raise ValueError(
                f"HangmanDataset inputs must all have the same length, got {sizes}"
            )

        self.features = feature_tensors
        self.labels = label_tensors
        self.missed_chars = missed_chars_tensors
        self.original_words = original_words

    def __len__(self):
        """Number of examples (one per word)."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``(features, label, missed_chars, original_word)`` for example ``idx``."""
        return self.features[idx], self.labels[idx], self.missed_chars[idx], self.original_words[idx]


import torch
import torch.nn.functional as F

def collate_fn(batch):
    """Collate ``(features, label, missed_chars, word)`` examples into one padded batch.

    Every feature tensor is zero-padded at the end of dimension 0 and every
    label tensor is zero-padded at the end of its last dimension, both up to
    the longest feature sequence in the batch; the results are then stacked.

    Returns:
        A 5-tuple ``(padded_features, padded_labels, missed_chars, lengths, words)``
        where ``lengths`` is a ``torch.long`` tensor holding each feature
        tensor's original size along dimension 0 and ``words`` is the tuple of
        original words in batch order.
    """
    features, labels, missed, words = zip(*batch)

    seq_lengths = [item.size(0) for item in features]
    longest = max(seq_lengths)
    pad = torch.nn.functional.pad

    # (0, 0, 0, k): leave the last dimension alone, append k zero rows to the one before it.
    feature_rows = []
    for item in features:
        feature_rows.append(pad(item, (0, 0, 0, longest - item.size(0))))

    # (0, k): append k zeros along the label's last dimension.
    label_rows = []
    for item in labels:
        label_rows.append(pad(item, (0, longest - item.size(0))))

    stacked_features = torch.stack(feature_rows, dim=0)
    stacked_labels = torch.stack(label_rows, dim=0)
    stacked_missed = torch.stack(missed, dim=0)

    lengths = torch.tensor(seq_lengths, dtype=torch.long)

    return stacked_features, stacked_labels, stacked_missed, lengths, words



# # Assuming missed_chars_tensor is a list of tensors for missed characters
# dataset = HangmanDataset(all_features_tensor, \
#     labels_tensor, missed_chars_tensor, original_words)

# data_loader = DataLoader(dataset, batch_size=32, \
#     shuffle=True, collate_fn=collate_fn)

# Wrap the per-word lists in a Dataset and display one example as a sanity check.
# The displayed item is the (features, label, missed_chars, original_word) tuple
# returned by HangmanDataset.__getitem__.
dataset = HangmanDataset(all_features_tensor, \
    labels_tensor, missed_chars_tensor, original_words)

dataset[140] 



In [None]:
# # Example of iterating over the DataLoader
# i = 0
# for i, batch in enumerate(data_loader):
#     inputs, labels, miss_chars, lengths, original_words = batch
#     print(f"Batch {i}: Inputs Shape: {inputs.shape}, Labels Shape: {labels.shape}, \
# Lengths: {lengths}, Miss Chars: {miss_chars}, Original Words: {original_words}")
#     break

##### Model Building

In [None]:
# Build the RNN and run a single next-character prediction as a smoke test.
from scr.rnn import RNN
from scr.feature_engineering import process_single_word, \
    char_to_idx, idx_to_char, calculate_char_frequencies, \
        get_missed_characters
from scr.game import simulate_game, \
    predict_next_character, predict_next_character_beam_search

# Configuration for the RNN model
config = {
    'rnn': 'LSTM',
    'vocab_size': 27,  # Assuming 26 letters + 1 for underscore
    'hidden_dim': 128,
    'num_layers': 2,
    'embedding_dim': 150,
    'output_mid_features': 100,
    'miss_linear_dim': 50,
    'dropout': 0.5,
    'use_embedding': True,
    'lr': 0.0001,
    'input_feature_size': 5 # Number of features excluding the embedding dimension
}

# Initialize RNN model
# NOTE: no training has run in this notebook at this point, so the prediction
# below comes from a freshly constructed model.
model = RNN(config)
model = model.to(device)

# Prepare your dataset, train the model, etc.

# Example of using predict_next_character in a game scenario
word = "exampleexample"
current_masked_word = "_xam__e_xam__e"
# NOTE(review): the full word (not the masked state) is passed here — confirm
# against get_missed_characters that this is the intended input.
missed_chars = get_missed_characters(word, char_to_idx)

predicted_index = predict_next_character(model, current_masked_word, \
    missed_chars, char_frequency, max_word_length, \
        device=device, normalize=True)

# predicted_index = predict_next_character_beam_search(model, current_masked_word, \
#     missed_chars, \
#     char_frequency, max_word_length, \
#     device, normalize=True, beam_width=3)
        
# Map the predicted vocabulary index back to its character for display.
predicted_char = idx_to_char[predicted_index]

predicted_char

In [None]:
# Import the necessary functions
from scr.feature_engineering import \
    process_single_word, get_missed_characters
    
from scr.game import predict_next_character, simulate_game
from scr.rnn import RNN
import random
# random.seed(400)
# Your existing code for initializing the model, etc.

def play_multiple_games(model, num_games, word_list,
                        char_to_idx, idx_to_char, char_frequency,
                        max_word_length, device):
    """Simulate ``num_games`` Hangman games on words drawn from ``word_list``.

    Each game's target is picked with ``random.choice``, i.e. with replacement,
    so a word may be played more than once and some may never be played.
    Games run under ``torch.no_grad()`` with ``normalize=True`` and
    ``max_attempts=6``.

    Returns:
        A list with one ``(won, final_word, attempts_used)`` tuple per game,
        in play order.
    """
    outcomes = []
    with torch.no_grad():
        for _ in range(num_games):
            target = random.choice(word_list)
            did_win, revealed, tries = simulate_game(
                model, target, char_to_idx, idx_to_char,
                char_frequency, max_word_length, device,
                normalize=True, max_attempts=6,
            )
            outcomes.append((did_win, revealed, tries))
    return outcomes

# Play games on the held-out words with the current `model`.
# NOTE: play_multiple_games draws words with random.choice (with replacement),
# so 1000 games do not necessarily cover each of the 1000 unseen words once.
num_games = 1000
results = play_multiple_games(model, num_games, \
    unseen_words, char_to_idx, idx_to_char, \
        char_frequency, max_word_length, device)

# Analyzing results: result[0] is the `won` flag of each game, so this counts wins.
total_wins = sum(result[0] for result in results)

total_wins

##### Train

In [None]:
from torch.nn.utils.rnn import pad_sequence
import torch

def train_one_epoch(model, data_loader, optimizer, device=device):
    """Train ``model`` for one pass over ``data_loader``.

    Each batch is expected to be the 5-tuple produced by ``collate_fn``:
    ``(inputs, labels, miss_chars, lengths, original_words)``. Only the
    ``actual_penalty`` returned by ``model.calculate_loss`` is back-propagated;
    ``miss_penalty`` is tracked for reporting only.

    Args:
        model: network exposing ``__call__(inputs, lengths, miss_chars)`` and
            ``calculate_loss(...)``.
        data_loader: iterable of collated batches.
        optimizer: optimizer over ``model``'s parameters.
        device: device the model and batch tensors are moved to. Defaults to
            the notebook-level ``device`` as it was when this function was defined.

    Returns:
        ``(avg_actual_penalty, avg_miss_penalty)`` averaged over batches, or
        ``(0, 0)`` when the loader yields no batches.
    """
    total_actual_penalty = 0
    total_miss_penalty = 0
    total_batches = 0

    # Derive the CUDA flag from the target device instead of hard-coding True,
    # so the loss helper is not told to use CUDA on a CPU-only run.
    use_cuda = torch.device(device).type == 'cuda'

    model.train()
    model.to(device)

    for inputs, labels, miss_chars, lengths, _ in data_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        miss_chars = miss_chars.to(device)
        # `lengths` is intentionally not moved (the original had `.to(device)`
        # commented out) — presumably because sequence packing wants CPU lengths;
        # confirm in scr.rnn.

        # Run the model
        outputs = model(inputs, lengths, miss_chars)

        # Calculate the custom loss
        actual_penalty, miss_penalty = model.calculate_loss(
            outputs, labels, lengths, miss_chars,
            vocab_size=27, use_cuda=use_cuda)

        total_actual_penalty += actual_penalty.item()
        total_miss_penalty += miss_penalty.item()
        total_batches += 1

        optimizer.zero_grad()
        actual_penalty.backward()  # Backpropagation for the actual_penalty only
        optimizer.step()

    avg_actual_penalty = total_actual_penalty / total_batches if total_batches > 0 else 0
    avg_miss_penalty = total_miss_penalty / total_batches if total_batches > 0 else 0
    return avg_actual_penalty, avg_miss_penalty


import random

def validate_one_epoch(model, val_loader, device=device, max_games_per_epoch=100):
    """Estimate the win rate by simulating games on words from ``val_loader``.

    The last element of every batch is taken as that batch's original words.
    Up to ``max_games_per_epoch`` of them are sampled without replacement and
    played with ``simulate_game`` (``normalize=True``, ``max_attempts=6``),
    using the notebook-level ``char_to_idx``, ``idx_to_char``,
    ``char_frequency`` and ``max_word_length``.

    Returns:
        Win rate as a percentage in ``[0, 100]``; ``0`` if no games were played.
    """
    model.eval()

    # Gather every validation word; batch[-1] holds the original words.
    candidate_words = [w for batch in val_loader for w in batch[-1]]

    # Cap the sample at the number of words actually available.
    sample_size = min(max_games_per_epoch, len(candidate_words))
    chosen_words = random.sample(candidate_words, sample_size)

    wins = 0
    games = 0
    with torch.no_grad():
        for target in chosen_words:
            did_win, _revealed, _tries = simulate_game(
                model, target, char_to_idx, idx_to_char,
                char_frequency, max_word_length, device,
                normalize=True, max_attempts=6,
            )
            wins += int(did_win)
            games += 1

    if games > 0:
        return (wins / games) * 100
    return 0


In [None]:
import optuna
from sklearn.model_selection import KFold
import torch
from torch.utils.data import DataLoader, Subset
from scr.rnn import RNN
# from scr.game import validate_one_epoch
# from scr.utils import EarlyStopping # , train_one_epoch  # Assuming these are in scr.utils

def objective(trial, dataset, static_config, num_epochs):
    """Optuna objective: mean validation win rate over 5-fold cross-validation.

    For each fold a fresh model is trained for up to ``num_epochs`` epochs and
    the validation accuracy of the last epoch run (a percentage from
    ``validate_one_epoch``) is recorded. The mean over folds is returned.

    Args:
        trial: Optuna trial used to sample the dynamic hyperparameters.
        dataset: dataset to split into folds.
        static_config: fixed model settings; sampled values override clashing keys.
        num_epochs: maximum number of epochs per fold; must be at least 1.

    Returns:
        Mean of the per-fold final validation accuracies.

    Raises:
        ValueError: if ``num_epochs < 1``. Previously this case fell through to
            an unbound ``validation_accuracy`` after the (empty) epoch loop.
    """
    if num_epochs < 1:
        raise ValueError(f"num_epochs must be >= 1, got {num_epochs}")

    # Get dynamic hyperparameters from the trial and merge them over the static ones.
    dynamic_config = optuna_dynamic_hyperparameters(trial)
    config = {**static_config, **dynamic_config}

    # k-Fold Cross-Validation setup.
    # NOTE(review): shuffle=True without random_state means folds differ between trials.
    kfold = KFold(n_splits=5, shuffle=True)
    val_accuracies = []

    for train_idx, val_idx in kfold.split(dataset):
        train_loader, val_loader = get_data_loaders(dataset, train_idx, val_idx)

        # Model initialization and training
        model, optimizer = initialize_model(config)
        # EarlyStopping is not imported in this cell; it presumably arrives via
        # `from scr.utils import *` earlier in the notebook — TODO confirm.
        # NOTE(review): it is fed an accuracy (higher is better); verify that
        # EarlyStopping does not expect a loss.
        early_stopping = EarlyStopping(patience=10, delta=0.001)

        for epoch in range(num_epochs):
            train_one_epoch(model, train_loader, optimizer)
            validation_accuracy = validate_one_epoch(model, val_loader)

            if early_stopping(validation_accuracy):
                break

        # Records the accuracy of the last epoch run, not the best epoch.
        val_accuracies.append(validation_accuracy)

    return sum(val_accuracies) / len(val_accuracies)

# Utility functions (for cleaner code)
def get_data_loaders(dataset, train_idx, val_idx):
    """Build ``(train_loader, val_loader)`` for one cross-validation fold.

    Both loaders use batch size 32 and ``collate_fn``; only the training
    loader shuffles.
    """
    train_loader, val_loader = (
        DataLoader(
            Subset(dataset, fold_indices),
            batch_size=32,
            shuffle=should_shuffle,
            collate_fn=collate_fn,
        )
        for fold_indices, should_shuffle in ((train_idx, True), (val_idx, False))
    )
    return train_loader, val_loader

def initialize_model(config):
    """Create an ``RNN`` from ``config`` and an Adam optimizer for it.

    The model is moved to CUDA when available (CPU otherwise) *before* the
    optimizer is constructed, which is the order the PyTorch optimizer
    documentation recommends.

    Args:
        config: model configuration; ``config['lr']`` is the Adam learning rate.

    Returns:
        ``(model, optimizer)``.
    """
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = RNN(config)
    model.to(target_device)
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])
    return model, optimizer

# Dynamic hyperparameters for Optuna trials
def optuna_dynamic_hyperparameters(trial):
    """Sample the tunable hyperparameters for one Optuna trial.

    Returns a dict with keys ``lr`` (log-uniform in [1e-5, 1e-3]),
    ``hidden_dim``, ``embedding_dim``, ``output_mid_features``,
    ``miss_linear_dim`` (categorical), ``dropout`` (uniform in [0.2, 0.5]) and
    ``num_layers`` (integer in [1, 3]), sampled in that order.
    """
    categorical_space = (
        ('hidden_dim', [128, 256, 512]),
        ('embedding_dim', [32, 50, 100]),
        ('output_mid_features', [50, 100, 200]),
        ('miss_linear_dim', [25, 50, 100]),
    )

    sampled = {}
    sampled['lr'] = trial.suggest_float('lr', 1e-5, 1e-3, log=True)
    for param_name, choices in categorical_space:
        sampled[param_name] = trial.suggest_categorical(param_name, choices)
    sampled['dropout'] = trial.suggest_float('dropout', 0.2, 0.5)
    sampled['num_layers'] = trial.suggest_int('num_layers', 1, 3)
    return sampled

In [None]:
# Rebuild the dataset and launch the hyperparameter search.
# Assuming missed_chars_tensor is a list of tensors for missed characters
dataset = HangmanDataset(all_features_tensor, \
    labels_tensor, missed_chars_tensor, original_words)

# data_loader = DataLoader(dataset, batch_size=32, \
#     shuffle=True, collate_fn=collate_fn)

# Static configuration: settings shared by every trial; the sampled
# hyperparameters are merged on top of these inside `objective`.
static_config = {
    'rnn': 'LSTM',
    'vocab_size': 27,  # 26 English alphabets + 1 (e.g., for underscore)
    'use_embedding': True,  # Typically a design choice
    'input_feature_size': 5,  # Based on your feature engineering strategy
}

# Device configuration
# NOTE: train_one_epoch / validate_one_epoch bind `device` as a default argument
# when they are defined, so re-assigning it here does not change those defaults.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define num_epochs for training in each fold
num_epochs = 1  # Adjust as needed

# Create Optuna study; the objective returns a validation accuracy, hence 'maximize'.
study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, dataset, \
    static_config, num_epochs), n_trials=1)

In [None]:
# NOTE(review): `STOP` is not defined anywhere in the visible notebook, so this
# cell raises NameError — presumably a deliberate barrier that halts "Run All"
# before the experimental cells below.
STOP

In [None]:
import optuna
from sklearn.model_selection import KFold
import torch
from torch.utils.data import DataLoader, Subset
from scr.utils import EarlyStopping

def objective(trial, dataset, static_config):
    """Unfinished objective sketch: samples hyperparameters and builds ``config``.

    The function currently stops after assembling ``config`` and therefore
    returns ``None``; ``dataset`` is not used yet.
    """
    # Hyperparameter space definition (sampling order: lr, hidden_dim, dropout, num_layers)
    sampled = {}
    sampled['lr'] = trial.suggest_float('lr', 1e-5, 1e-3, log=True)
    sampled['hidden_dim'] = trial.suggest_categorical('hidden_dim', [128, 256, 512])
    sampled['dropout'] = trial.suggest_float('dropout', 0.2, 0.5)
    sampled['num_layers'] = trial.suggest_int('num_layers', 1, 3)

    # Sampled values take precedence over clashing static entries.
    config = {**static_config, **sampled}

    # ... rest of the objective function (k-fold cross-validation, training, validation)
    # Use 'config' to initialize the RNN model
# Build the dataset from the per-word lists prepared earlier in the notebook.
# (The template placeholder `HangmanDataset(...)` has been removed: it passed
# Ellipsis as the only argument and raised TypeError for the three missing ones.)
# Assuming missed_chars_tensor is a list of tensors for missed characters
dataset = HangmanDataset(all_features_tensor, \
    labels_tensor, missed_chars_tensor, original_words)

data_loader = DataLoader(dataset, batch_size=32, \
    shuffle=True, collate_fn=collate_fn)

# Static configuration: model settings that are not tuned by Optuna.
# NOTE(review): unlike the static_config in the earlier study cell, this one has no 'rnn' key.
static_config = {
    'vocab_size': 27,
    'embedding_dim': 50,
    'output_mid_features': 100,
    'miss_linear_dim': 50,
    'use_embedding': True,
    'input_feature_size': 5
}

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def objective(trial):
    """Optuna objective: mean last-epoch validation accuracy over 5-fold CV.

    Relies on notebook-level names: ``dataset``, ``static_config``,
    ``num_epochs``, ``collate_fn``, ``RNN``, ``EarlyStopping``,
    ``train_one_epoch`` and ``validate_one_epoch``.

    Args:
        trial: Optuna trial used to sample ``lr``, ``hidden_dim``, ``dropout``
            and ``num_layers``.

    Returns:
        Mean over folds of the validation accuracy from the last epoch run.
    """
    # Hyperparameter space definition
    lr = trial.suggest_float('lr', 1e-5, 1e-3, log=True)
    hidden_dim = trial.suggest_categorical('hidden_dim', [128, 256, 512])
    dropout = trial.suggest_float('dropout', 0.2, 0.5)
    num_layers = trial.suggest_int('num_layers', 1, 3)

    # Model configuration: start from the static settings (vocab size, embedding
    # sizes, ...) and layer the sampled values on top. The sampled-only dict used
    # before omitted the static keys that the other configs in this notebook
    # pass to RNN.
    config = {
        **static_config,
        'lr': lr,
        'hidden_dim': hidden_dim,
        'dropout': dropout,
        'num_layers': num_layers,
    }

    # k-Fold Cross-Validation
    kfold = KFold(n_splits=5, shuffle=True)
    val_accuracies = []

    for fold, (train_idx, val_idx) in enumerate(kfold.split(dataset)):
        train_subset = Subset(dataset, train_idx)
        val_subset = Subset(dataset, val_idx)

        train_loader = DataLoader(train_subset, batch_size=32, shuffle=True, collate_fn=collate_fn)
        val_loader = DataLoader(val_subset, batch_size=32, shuffle=False, collate_fn=collate_fn)

        # Initialize the model with the current configuration
        model = RNN(config)
        optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)

        # Early Stopping and Training Logic
        early_stopping = EarlyStopping(patience=10, delta=0.001)
        for epoch in range(num_epochs):  # num_epochs is a notebook-level global
            avg_actual_penalty, avg_miss_penalty = train_one_epoch(model, train_loader, optimizer, device)

            # Validation step
            validation_accuracy = validate_one_epoch(model, val_loader, device, max_games_per_epoch=100)
            if early_stopping(validation_accuracy):
                break

        # Records the last epoch's accuracy for this fold.
        val_accuracies.append(validation_accuracy)

    # Calculate average validation accuracy over all folds
    avg_val_accuracy = sum(val_accuracies) / len(val_accuracies)
    return avg_val_accuracy

In [None]:
# import random

# def validate_one_epoch(model, val_loader, device, max_games_per_epoch=100):
#     model.eval()
#     total_wins = 0
#     total_games = 0

#     # Collect all words from the validation loader
#     all_words = []
#     for batch in val_loader:
#         batch_original_words = batch[-1]  # Adjust according to your batch structure
#         all_words.extend(batch_original_words)

#     # Randomly sample a set number of words for this epoch's validation
#     selected_words = random.sample(all_words, min(max_games_per_epoch, len(all_words)))

#     with torch.no_grad():
#         for word in selected_words:
#             won, final_word, attempts_used = simulate_game(
#                 model, 
#                 word, 
#                 char_to_idx, 
#                 idx_to_char, 
#                 char_frequency, 
#                 max_word_length, 
#                 device, 
#                 normalize=True, 
#                 max_attempts=6
#             )
#             total_wins += int(won)
#             total_games += 1

#     accuracy_percentage = (total_wins / total_games) * 100 if total_games > 0 else 0
#     return accuracy_percentage


# # Training loop
# optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])
# num_epochs = 20

# # Assuming missed_chars_tensor is a list of tensors for missed characters
# dataset = HangmanDataset(all_features_tensor, \
#     labels_tensor, missed_chars_tensor, original_words)

# data_loader = DataLoader(dataset, batch_size=32, \
#     shuffle=True, collate_fn=collate_fn)
# # Assuming the HangmanDataset, DataLoader, and model setup are already done

# for epoch in range(num_epochs):
#     # Training step
#     avg_actual_penalty, avg_miss_penalty = train_one_epoch(model, data_loader, optimizer, device)
#     print(f"Epoch {epoch+1}: Training - Avg Actual Penalty: {avg_actual_penalty}, Avg Miss Penalty: {avg_miss_penalty}")

#     # Validation step
#     validation_accuracy = validate_one_epoch(model, val_loader, device, max_games_per_epoch=100)
#     print(f"Epoch {epoch+1}: Validation - Accuracy: {validation_accuracy}%")


In [None]:
# Import the necessary functions
from scr.feature_engineering import \
    process_single_word, get_missed_characters
    
from scr.game import predict_next_character, simulate_game
from scr.rnn import RNN
import random
# random.seed(400)
# Your existing code for initializing the model, etc.

def play_multiple_games(model, num_games, word_list,
                        char_to_idx, idx_to_char, char_frequency,
                        max_word_length, device):
    """Simulate ``num_games`` Hangman games on words drawn from ``word_list``.

    NOTE(review): this notebook already defines an identical
    ``play_multiple_games`` in an earlier cell; this definition replaces it
    when the cell runs.

    Targets are picked with ``random.choice`` (with replacement). Games run
    under ``torch.no_grad()`` with ``normalize=True`` and ``max_attempts=6``.

    Returns:
        A list with one ``(won, final_word, attempts_used)`` tuple per game.
    """
    outcomes = []
    with torch.no_grad():
        for _ in range(num_games):
            target = random.choice(word_list)
            did_win, revealed, tries = simulate_game(
                model, target, char_to_idx, idx_to_char,
                char_frequency, max_word_length, device,
                normalize=True, max_attempts=6,
            )
            outcomes.append((did_win, revealed, tries))
    return outcomes

# Play games on the held-out words with whatever `model` is currently bound in
# the kernel. NOTE(review): the objectives above create their models locally,
# so `model` here is presumably still the one built in the model-building cell — confirm.
num_games = 1000

results = play_multiple_games(model, num_games, \
    unseen_words, char_to_idx, idx_to_char, \
        char_frequency, max_word_length, device)

# Analyzing results: result[0] is the `won` flag of each game, so this counts wins.
total_wins = sum(result[0] for result in results)

total_wins

In [None]:
# NOTE(review): undefined name — evaluating it raises NameError, presumably on
# purpose, so that "Run All" does not reach the legacy cells below.
STOP

In [None]:
from torch.nn.utils.rnn import pad_sequence
import torch

# Define the training function for one epoch
def train_one_epoch(model, data_loader, optimizer, loss_function, device):
    """Train ``model`` for one pass over ``data_loader`` and return the mean batch loss.

    NOTE(review): an earlier cell defines a ``train_one_epoch`` with a different
    signature; running this cell replaces it.

    Each batch is indexed positionally: ``batch[0]`` is a sequence of input
    tensors, ``batch[1]`` a sequence of label tensors and ``batch[2]`` the
    sequence lengths. Inputs and labels are zero-padded with ``pad_sequence``
    before the forward pass; the model must provide ``init_hidden(batch_size)``
    and return ``(outputs, hidden)`` when called.

    Returns:
        The average of the per-batch losses, or ``0`` for an empty loader.
    """
    batch_losses = []

    model.train()
    for batch in data_loader:
        inputs, labels, lengths = batch[0], batch[1], batch[2]

        padded_inputs = pad_sequence(inputs, batch_first=True).to(device)
        lengths = torch.tensor(lengths, dtype=torch.long)
        padded_labels = pad_sequence(labels, batch_first=True).to(device)

        hidden = model.init_hidden(padded_inputs.size(0))
        outputs, hidden = model(padded_inputs, lengths, hidden)

        # Flatten (batch, time, classes) -> (batch * time, classes) for the loss.
        flat_outputs = outputs.view(-1, outputs.shape[-1])
        flat_targets = padded_labels.view(-1).long()

        # TODO: force predict here?
        loss = loss_function(flat_outputs, flat_targets)
        batch_losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if not batch_losses:
        return 0
    return sum(batch_losses) / len(batch_losses)


import random

def validate_model(model, val_loader, device, num_games=2000):
    """Estimate the win rate (integer percentage) by simulating Hangman games.

    Words are collected from the last element of every batch in ``val_loader``
    (this works with the 5-tuple produced by ``collate_fn`` as well as with
    shorter batch tuples). Up to ``num_games`` of them are sampled without
    replacement and played with ``simulate_game``.

    Args:
        model: model to evaluate; switched to eval mode.
        val_loader: iterable of batches whose last element holds the original words.
        device: device forwarded to ``simulate_game``.
        num_games: maximum number of games; capped at the number of available words.

    Returns:
        ``int(100 * wins / games_played)``, or ``0`` when no game can be played
        (``num_games <= 0`` or an empty loader).
    """
    model.eval()

    # Collect words from the validation loader.
    words_for_validation = []
    for batch in val_loader:
        words_for_validation.extend(batch[-1])

    # random.sample raises if asked for more items than exist, so cap the count;
    # bail out before dividing when there is nothing to play.
    games_to_play = min(num_games, len(words_for_validation))
    if games_to_play <= 0:
        return 0

    selected_words = random.sample(words_for_validation, games_to_play)

    total_wins = 0
    with torch.no_grad():
        for word in selected_words:
            win_result, _, _ = simulate_game(model, word, char_to_idx, idx_to_char,
                                             char_frequency, max_word_length,
                                             device=device, normalize=True,
                                             max_attempts=6)
            total_wins += int(win_result)

    # Divide by the number of games actually played, then truncate to an int percentage.
    return int((total_wins / games_to_play) * 100)

In [None]:
import matplotlib.pyplot as plt
from pathlib import Path
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
import optuna
from sklearn.model_selection import KFold
from torch.optim.lr_scheduler import ReduceLROnPlateau


def objective(trial, dataset, input_size, output_size, num_epochs, device):
    """Optuna objective for the LSTM model: mean best win rate over 5 folds.

    For every fold a fresh ``HangmanLSTM`` is trained with Adam and
    ``ReduceLROnPlateau`` (stepped on the validation win rate), with early
    stopping after 5 epochs without improvement. Loss and win-rate curves are
    saved under ``./plots/trial_<n>/``.

    NOTE(review): ``HangmanLSTM`` is not defined or imported in the visible
    notebook — it must be in scope before this runs.

    Args:
        trial: Optuna trial; samples ``lr``, ``hidden_size`` and ``dropout_rate``.
        dataset: dataset to split with ``KFold``.
        input_size: model input feature size.
        output_size: model output size.
        num_epochs: maximum epochs per fold.
        device: device for the model and training.

    Returns:
        The average over folds of each fold's best validation win rate.
        (A leftover ``average_win_rate = 0`` previously forced this to 0.)
    """
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    hidden_size = trial.suggest_categorical("hidden_size", [64, 128, 256])
    dropout_rate = trial.suggest_float("dropout_rate", 0.0, 0.5)
    num_layers = 2  # Or any other integer value you wish to use

    total_win_rate = 0

    kf = KFold(n_splits=5)
    fold_number = 1  # Initialize fold counter
    trial_number = trial.number  # Get the current trial number

    # Create directory for this trial's plots if it doesn't exist
    plot_dir = Path(f'./plots/trial_{trial_number}')
    plot_dir.mkdir(parents=True, exist_ok=True)

    for train_index, val_index in kf.split(dataset):

        print(f"Fold {fold_number}")  # Print the current fold number
        train_dataset = Subset(dataset, train_index)
        val_dataset = Subset(dataset, val_index)

        print(f"Number of words in training dataset: {len(train_dataset)}")
        print(f"Number of words in validation dataset: {len(val_dataset)}")

        train_loader = DataLoader(train_dataset, \
            batch_size=256, shuffle=True, collate_fn=collate_fn)
        val_loader = DataLoader(val_dataset, \
            batch_size=256, shuffle=False, collate_fn=collate_fn)

        model = HangmanLSTM(input_size, hidden_size, \
            output_size, num_layers, dropout_rate=dropout_rate).to(device)

        optimizer = optim.Adam(model.parameters(), lr=lr)
        loss_function = nn.CrossEntropyLoss()
        # 'max' mode: the scheduler is stepped on the win rate (higher is better).
        scheduler = ReduceLROnPlateau(optimizer, 'max', patience=2, factor=0.5)

        best_win_rate = 0
        epochs_no_improve = 0
        early_stop_epochs = 5

        # Per-epoch metrics for plotting; exactly one entry per epoch.
        epoch_losses = []
        epoch_win_rates = []

        for epoch in range(num_epochs):
            avg_loss = train_one_epoch(model, train_loader, \
                optimizer, loss_function, device)

            print(f"Epoch {epoch}: Loss {avg_loss}")

            win_rate = validate_model(model, val_loader, device)

            print(f"Epoch {epoch}/{num_epochs}: Train Loss {avg_loss}, Win Rate {win_rate}")

            # Store metrics once per epoch (they used to be appended twice,
            # which doubled every point in the plots).
            epoch_losses.append(avg_loss)
            epoch_win_rates.append(win_rate)

            scheduler.step(win_rate)

            # NOTE: a clip_grad_norm_ call used to sit here. After the epoch's
            # optimizer steps it had no effect on training; clipping has to
            # happen between loss.backward() and optimizer.step() inside
            # train_one_epoch if it is wanted.

            if win_rate > best_win_rate:
                best_win_rate = win_rate
                epochs_no_improve = 0
            else:
                epochs_no_improve += 1

            if epochs_no_improve == early_stop_epochs:
                break  # Early stopping

        # Plotting after each fold
        plt.figure(figsize=(12, 5))

        # Plot Loss
        plt.subplot(1, 2, 1)
        plt.plot(range(1, len(epoch_losses) + 1), epoch_losses, label='Loss')
        plt.title(f'Fold {fold_number} - Training Loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.grid(True)
        # Saved before the second subplot is drawn, so this file shows the loss panel only.
        plt.savefig(plot_dir / f'fold_{fold_number}_loss.png')

        # Plot Win Rate
        plt.subplot(1, 2, 2)
        plt.plot(range(1, len(epoch_win_rates) + 1), epoch_win_rates, label='Win Rate')
        plt.title(f'Fold {fold_number} - Validation Win Rate')
        plt.xlabel('Epoch')
        plt.ylabel('Win Rate')
        plt.grid(True)
        # This file contains both panels.
        plt.savefig(plot_dir / f'fold_{fold_number}_win_rate.png')

        # Close the figure so figures do not accumulate across folds.
        plt.close()

        fold_number += 1

        total_win_rate += best_win_rate

    average_win_rate = total_win_rate / kf.get_n_splits()

    print("Avg win rate: ", average_win_rate)
    return average_win_rate

# Define your dataset, input_size, output_size, device, num_epochs
# HangmanDataset requires four sequences; the missed-character tensors were
# missing from this call, which raised TypeError.
# NOTE(review): the `train_one_epoch(model, data_loader, optimizer, loss_function, device)`
# defined above reads batch[2] as sequence lengths, whereas collate_fn puts the
# missed-character tensor in that position — reconcile before relying on this cell.
dataset = HangmanDataset(all_features_tensor, labels_tensor, \
    missed_chars_tensor, original_words)

input_size = 4  # Set your input size
output_size = 27  # Set your output size
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # Choose device
num_epochs = 10
n_trials = 1

# Create and run the Optuna study; the objective returns a win rate, hence 'maximize'.
study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, dataset, \
    input_size, output_size, num_epochs, device), n_trials=n_trials)

In [None]:
# Output the best hyperparameters found by the most recently run `study`:
# the objective value of the best trial followed by its sampled parameters.
print("Best trial:")
trial = study.best_trial
print(f"Value: {trial.value}")
print("Params: ")
# trial.params maps each sampled hyperparameter name to its value.
for key, value in trial.params.items():
    print(f"    {key}: {value}")