##### Imports

In [1]:
# --- Standard library -------------------------------------------------
import sys
import warnings
from pathlib import Path

# --- Third-party ------------------------------------------------------
import pandas as pd

# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Let pandas render up to 1000 columns / rows before truncating.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

# --- Project code -----------------------------------------------------
# Custom library paths (must be registered before importing from `scr`).
sys.path.extend(['../', './scr'])

from scr.utils import read_words, set_seed

# Fix the seed through the project's helper.
set_seed(42)

import torch
import torch.nn as nn

torch.set_float32_matmul_precision('medium')

# Device configuration: use the GPU when one is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

##### Feature Engineering

##### Data Reading and Feature Engineering

In [2]:
from scr.feature_engineering import build_feature_set, \
    process_single_word_inference
from scr.utils import *

import random

# MASK_PROB = 0.5

# Debug-sized run: only load a limited number of words (10**4).
# NOTE(review): absolute, machine-specific path — will not resolve on another machine.
word_list = read_words(
    '/home/sayem/Desktop/Hangman/words_250000_train.txt',
    limit=10_000,
)

# # # Randomly select 1000 words
# # unseen_words = random.sample(word_list, 1000)

In [3]:
word_list[0]

'aaa'

In [4]:
import torch
from scr.feature_engineering import add_features_for_training, calculate_char_frequencies
import random
# Corpus-level inputs shared by every add_features_for_training call.
char_frequency = calculate_char_frequencies(word_list)
max_word_length = max(len(word) for word in word_list)
MASK_PROB = 0.5
NGRAM_N = 2

# Per-word accumulators, initialised exactly once. Index i of every list
# refers to the same word. (The previous version initialised these twice
# and made one extra throw-away call for word_list[0] whose result was
# immediately overwritten by the loop below.)
all_features, all_labels, all_missed_chars, original_words = [], [], [], []

for word in word_list:
    # Process each word to get its features, label, and missed characters
    feature_set, label, missed_chars = add_features_for_training(
        word, char_frequency, max_word_length, MASK_PROB, NGRAM_N
    )

    all_features.append(feature_set)
    all_labels.append(torch.tensor(label, dtype=torch.float))
    all_missed_chars.append(missed_chars)
    original_words.append(word)  # Keep the source word alongside its features

# These remain Python lists (one tensor per word); squeeze(0) drops a leading
# size-1 dimension when present.
all_features_tensor = [features.squeeze(0) for features in all_features]
labels_tensor = [label.squeeze(0) for label in all_labels]
missed_chars_tensor = [missed_chars.squeeze(0) for missed_chars in all_missed_chars]

In [5]:
all_features_tensor[0]

tensor([[0.0000, 0.1200, 0.0000, 0.0000, 1.0000],
        [0.0000, 0.1200, 1.0000, 0.0000, 1.0000],
        [1.0000, 0.1200, 2.0000, 0.1791, 1.0000]])

In [6]:
from scr.feature_engineering import process_single_word_inference

def process_inference_word(word, char_frequency, max_word_length, ngram_n=2):
    """Featurise a single word for inference.

    Thin wrapper around ``process_single_word_inference``: forwards all
    arguments unchanged and applies ``squeeze(0)`` to both returned tensors.

    Args:
        word: The word pattern to featurise (e.g. ``"_a_"``).
        char_frequency: Passed through to ``process_single_word_inference``.
        max_word_length: Passed through to ``process_single_word_inference``.
        ngram_n: Passed through as the ``ngram_n`` keyword (default 2).

    Returns:
        Tuple ``(feature_set, missed_chars)`` after ``squeeze(0)``.
    """
    batched_features, batched_missed = process_single_word_inference(
        word, char_frequency, max_word_length, ngram_n=ngram_n
    )
    # Drop the leading size-1 dimension from each tensor.
    features = batched_features.squeeze(0)
    missed = batched_missed.squeeze(0)
    return features, missed

# Example usage: featurise a three-character pattern using the corpus
# statistics computed above.
inference_word = "_a_"
inference_features, inference_missed_chars = process_inference_word(
    inference_word, char_frequency, max_word_length, ngram_n=NGRAM_N
)

In [7]:
inference_features

tensor([[0.0000, 0.1200, 0.0000, 0.0000, 0.0000],
        [1.0000, 0.1200, 1.0000, 0.1791, 1.0000],
        [0.0000, 0.1200, 2.0000, 0.0000, 1.0000]])

In [8]:
inference_missed_chars

tensor([0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1.])

##### Dataset Building

In [9]:
from scr.dataset import HangmanDataset, collate_fn

# Bundle the per-word feature tensors, labels, missed-character vectors and
# the original words into one dataset object.
dataset = HangmanDataset(
    all_features_tensor,
    labels_tensor,
    missed_chars_tensor,
    original_words,
)

# dataset[100] 

In [10]:
# # Example of iterating over the DataLoader
# from torch.utils.data import DataLoader

# data_loader = DataLoader(dataset, batch_size=32, \
#         shuffle=True, collate_fn=collate_fn)
# i = 0
# for i, batch in enumerate(data_loader):
#     inputs, labels, miss_chars, lengths, original_words = batch
#     print(f"Batch {i}: Inputs Shape: {inputs.shape}, Labels Shape: {labels.shape}, \
# Lengths: {lengths}, Miss Chars: {miss_chars}, Original Words: {original_words}")
#     break

##### Model Building

In [11]:
# In your main script or Jupyter Notebook
from scr.model import RNN
from scr.feature_engineering import process_single_word_inference, \
    char_to_idx, idx_to_char, calculate_char_frequencies, \
        get_missed_characters
from scr.game import simulate_game, \
    predict_next_character

# Configuration for the RNN model
config = {
    'rnn': 'LSTM',
    'vocab_size': 27,  # Assuming 26 letters + 1 for underscore
    'hidden_dim': 128,
    'num_layers': 2,
    'embedding_dim': 150,
    'output_mid_features': 100,
    'miss_linear_dim': 50,
    'dropout': 0.5,
    'use_embedding': True,
    'lr': 0.0001,
    'input_feature_size': 5 # Number of features excluding the embedding dimension
}

# Initialize RNN model and move it to the selected device
model = RNN(config)
model = model.to(device)

# Ensure the checkpoint directory exists before the first save. The notebook
# only creates `models/` in a later cell, so on a fresh checkout this save
# could otherwise run before the folder exists.
# NOTE(review): save_model may already create parent directories — confirm.
Path('models').mkdir(parents=True, exist_ok=True)
model.save_model('models/model.pth')

# Prepare your dataset, train the model, etc.

# # Example of using predict_next_character in a game scenario
# word = "apple"

In [12]:
# Reload the checkpoint saved in the previous cell onto the active device.
model = model.load_model(
    RNN,
    filename='models/model.pth',
    device=device,
)

In [13]:
# Ask the model for its next guess on a partially revealed word.
current_masked_word = "_ppl_"

# missed_chars = get_missed_characters(word, char_to_idx)

predicted_char = predict_next_character(
    model, current_masked_word, char_frequency, max_word_length
)

# predicted_index = predict_next_character_beam_search(model, current_masked_word, \
#     missed_chars, \
#     char_frequency, max_word_length, \
#     device, normalize=True, beam_width=3)
        
# predicted_char = idx_to_char[predicted_index]

predicted_char

'n'

In [14]:
# # Import the necessary functions
# from scr.feature_engineering import \
#     process_single_word, get_missed_characters

from scr.game import predict_next_character, simulate_game
from scr.model import RNN
import random
# random.seed(400)
# Your existing code for initializing the model, etc.

def play_multiple_games(model, num_games, word_list,
                        char_to_idx, idx_to_char, char_frequency,
                        max_word_length, device):
    """Simulate ``num_games`` Hangman games on randomly drawn words.

    Each game picks its target with ``random.choice(word_list)`` (so words
    may repeat) and runs ``simulate_game`` under ``torch.no_grad()`` with
    ``normalize=True`` and ``max_attempts=6``.

    Args:
        model: Model handed to ``simulate_game``.
        num_games: Number of games to play.
        word_list: Sequence of candidate target words.
        char_to_idx, idx_to_char, char_frequency, max_word_length, device:
            Forwarded unchanged to ``simulate_game``.

    Returns:
        List of ``(won, final_word, attempts_used)`` tuples, one per game,
        in the order the games were played.
    """
    def _play_one(target_word):
        # Inference only: no gradients are needed while playing.
        with torch.no_grad():
            won, final_word, attempts_used = simulate_game(
                model,
                target_word,
                char_to_idx,
                idx_to_char,
                char_frequency,
                max_word_length,
                device,
                normalize=True,
                max_attempts=6,
            )
        return (won, final_word, attempts_used)

    return [_play_one(random.choice(word_list)) for _ in range(num_games)]

num_games = 1000
results = play_multiple_games(
    model, num_games, word_list, char_to_idx, idx_to_char,
    char_frequency, max_word_length, device
)

# Analyzing results: every entry is (won, final_word, attempts_used)
total_wins = sum(won for won, _final_word, _attempts in results)

# Win rate in percent (displayed as the cell output)
(total_wins / num_games) * 100

13.5

##### Train

In [15]:
from torch.nn.utils.rnn import pad_sequence
import torch

def train_one_epoch(model, data_loader, optimizer, device=None, vocab_size=27):
    """Run one optimisation pass over ``data_loader``.

    For every batch the model is run, ``model.calculate_loss`` returns
    ``(actual_penalty, miss_penalty)``, and only ``actual_penalty`` is
    back-propagated; ``miss_penalty`` is tracked for reporting.

    Changes from the previous version:
      * ``use_cuda`` passed to ``calculate_loss`` now follows ``device``
        instead of being hard-coded to ``True``, which was wrong on CPU.
      * ``device`` defaults to ``None`` and is resolved at call time to CUDA
        when available, else CPU. This is the same value the notebook's
        global ``device`` holds, without binding to that global when the
        function is defined.
      * ``vocab_size`` is a parameter (default 27, the old hard-coded value).

    Args:
        model: Module exposing ``train()``, ``to()``, ``__call__(inputs,
            lengths, miss_chars)`` and ``calculate_loss(...)``.
        data_loader: Iterable of 5-tuples
            ``(inputs, labels, miss_chars, lengths, _)``.
        optimizer: Optimizer over ``model``'s parameters.
        device: ``torch.device`` or device string; ``None`` auto-selects.
        vocab_size: Forwarded to ``model.calculate_loss``.

    Returns:
        ``(avg_actual_penalty, avg_miss_penalty)`` averaged over batches, or
        ``(0, 0)`` when ``data_loader`` yields no batches.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)
    use_cuda = device.type == "cuda"

    total_actual_penalty = 0.0
    total_miss_penalty = 0.0
    total_batches = 0

    model.train()
    model.to(device)

    for inputs, labels, miss_chars, lengths, _ in data_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        miss_chars = miss_chars.to(device)
        # `lengths` is intentionally left where the loader put it (the original
        # code never moved it) — presumably the model expects host-side
        # lengths; confirm against RNN.forward.

        outputs = model(inputs, lengths, miss_chars)

        # Custom loss: first value drives training, second is diagnostic only.
        actual_penalty, miss_penalty = model.calculate_loss(
            outputs, labels, lengths, miss_chars,
            vocab_size=vocab_size, use_cuda=use_cuda,
        )

        total_actual_penalty += actual_penalty.item()
        total_miss_penalty += miss_penalty.item()
        total_batches += 1

        optimizer.zero_grad()
        actual_penalty.backward()  # Only the actual penalty is optimised
        optimizer.step()

    if total_batches == 0:
        return 0, 0
    return (total_actual_penalty / total_batches,
            total_miss_penalty / total_batches)



# import random

# def validate_one_epoch(model, val_loader, device=device, max_games_per_epoch=1000):
#     model.eval()
#     total_wins = 0
#     total_games = 0

#     # Collect all words from the validation loader
#     all_words = []
#     for batch in val_loader:
#         batch_original_words = batch[-1]  # Adjust according to your batch structure
#         all_words.extend(batch_original_words)

#     # Randomly sample a set number of words for this epoch's validation
#     selected_words = random.sample(all_words, min(max_games_per_epoch, len(all_words)))

#     with torch.no_grad():
#         for word in selected_words:
#             won, final_word, attempts_used = simulate_game(
#                 model, 
#                 word, 
#                 char_to_idx, 
#                 idx_to_char, 
#                 char_frequency, 
#                 max_word_length, 
#                 device, 
#                 normalize=True, 
#                 max_attempts=6
#             )
#             total_wins += int(won)
#             total_games += 1

#     accuracy_percentage = (total_wins / total_games) \
#         * 100 if total_games > 0 else 0
        
#     return accuracy_percentage

import random
import collections

def validate_one_epoch(model, val_loader, char_to_idx, idx_to_char,
                       char_frequency, max_word_length, device,
                       max_games_per_epoch=1000):
    """Evaluate the model by playing simulated Hangman games.

    The original words are gathered from the last element of every batch in
    ``val_loader``; up to ``max_games_per_epoch`` of them are drawn without
    replacement via ``random.sample`` and each is played once with
    ``simulate_game`` (``normalize=True``, ``max_attempts=6``) under
    ``torch.no_grad()``. The model is put into eval mode first.

    Returns:
        Tuple ``(accuracy_percentage, average_attempts, win_rate_by_length)``:
        the win rate in percent, the mean ``attempts_used`` per game, and a
        dict mapping word length to the fraction of games won at that length.
        With no words to play, the result is ``(0, 0, {})``.
    """
    model.eval()

    # batch[-1] carries the original words (adjust if the batch layout changes)
    all_words = [word for batch in val_loader for word in batch[-1]]

    # Randomly sample a capped number of words for this epoch's validation
    sample_size = min(max_games_per_epoch, len(all_words))
    selected_words = random.sample(all_words, sample_size)

    wins_by_length = collections.Counter()
    games_by_length = collections.Counter()
    total_attempts = 0

    with torch.no_grad():
        for word in selected_words:
            won, _final_word, attempts_used = simulate_game(
                model,
                word,
                char_to_idx,
                idx_to_char,
                char_frequency,
                max_word_length,
                device,
                normalize=True,
                max_attempts=6,
            )
            total_attempts += attempts_used
            length = len(word)
            wins_by_length[length] += int(won)
            games_by_length[length] += 1

    total_games = sum(games_by_length.values())
    total_wins = sum(wins_by_length.values())

    if total_games > 0:
        accuracy_percentage = (total_wins / total_games) * 100
        average_attempts = total_attempts / total_games
    else:
        accuracy_percentage = 0
        average_attempts = 0

    win_rate_by_length = {
        length: (wins_by_length[length] / played)
        for length, played in games_by_length.items()
    }

    return accuracy_percentage, average_attempts, win_rate_by_length


In [16]:
import optuna
from sklearn.model_selection import KFold
import torch
from torch.utils.data import DataLoader, Subset
from scr.model import RNN
from torch.optim.lr_scheduler import StepLR
from optuna.pruners import MedianPruner

from scr.utils import  *


pruner = MedianPruner()

# from scr.game import validate_one_epoch
# from scr.utils import EarlyStopping # , train_one_epoch  
# # Assuming these are in scr.utils

def objective(trial, dataset, static_config, num_epochs):
    """Optuna objective: cross-validated Hangman win rate for one trial.

    The trial's hyperparameters are merged over ``static_config``. A fresh
    model is trained in each of 5 shuffled folds for up to ``num_epochs``
    epochs, and validation accuracy (percent of simulated games won) is
    measured after every epoch.

    Fixes relative to the previous version:
      * Removed the dangling ``average_attempt = `` statement, a SyntaxError
        that stopped the whole cell from parsing.
      * The returned value is now the mean of each fold's final validation
        accuracy. Previously every epoch's accuracy was pooled and the last
        epoch of each fold was counted twice.
      * ``trial.report`` uses a step that is unique across folds; Optuna keeps
        only the first value reported for a given step.
      * Scheduler hyperparameters are sampled once per trial, not re-requested
        in every fold. The sampling order is unchanged.
      * Accumulators that were written but never read have been dropped.

    Args:
        trial: Optuna trial used to sample hyperparameters and report progress.
        dataset: Dataset split by ``KFold`` and wrapped by ``get_data_loaders``.
        static_config: Fixed configuration entries; trial values override
            duplicates.
        num_epochs: Maximum epochs per fold; must be at least 1.

    Returns:
        Mean over folds of the last validation accuracy seen in each fold.

    Raises:
        ValueError: If ``num_epochs`` is smaller than 1.
        optuna.exceptions.TrialPruned: If the pruner stops the trial.
    """
    if num_epochs < 1:
        raise ValueError("num_epochs must be at least 1")

    # Dynamic hyperparameters, merged over the static configuration
    dynamic_config = optuna_dynamic_hyperparameters(trial)
    config = {**static_config, **dynamic_config}

    # Learning-rate scheduler hyperparameters (one draw per trial)
    step_size = trial.suggest_int("step_size", 5, 20)
    gamma = trial.suggest_float("gamma", 0.1, 0.5)

    print(f"Trial {trial.number}: Configuration - {config}")  # Debug print

    # k-Fold Cross-Validation setup
    kfold = KFold(n_splits=5, shuffle=True)

    fold_accuracies = []  # final validation accuracy of each fold
    best_accuracy = 0     # best accuracy seen so far, across all folds

    for fold, (train_idx, val_idx) in enumerate(kfold.split(dataset)):
        print(f"Starting Fold {fold+1}")  # Debug print
        train_loader, val_loader = get_data_loaders(dataset, train_idx, val_idx)

        # Fresh model, optimizer, early stopper and scheduler for every fold
        model, optimizer = initialize_model(config)
        early_stopping = EarlyStopping(patience=10, delta=0.001)
        scheduler = StepLR(optimizer, step_size=step_size, gamma=gamma)

        validation_accuracy = None
        for epoch in range(num_epochs):
            avg_actual_penalty, avg_miss_penalty = train_one_epoch(model, train_loader, optimizer)
            print(f"Epoch {epoch+1}/{num_epochs}, Fold {fold+1}: Avg. Actual Penalty - {avg_actual_penalty}, Avg. Miss Penalty - {avg_miss_penalty}")  # Debug print

            scheduler.step()

            validation_accuracy, _average_attempts, _win_rate_by_length = validate_one_epoch(
                model, val_loader, char_to_idx, idx_to_char,
                char_frequency, max_word_length, device)

            print(f"Epoch {epoch+1}: Validation Accuracy - {validation_accuracy}")

            # Unique step per (fold, epoch) so later folds are not discarded
            trial.report(validation_accuracy, fold * num_epochs + epoch)

            # Save the model if it is the best so far
            if validation_accuracy > best_accuracy:
                best_accuracy = validation_accuracy
                print(f"New best model found: Accuracy - {best_accuracy}")  # Debug print
                model.save_model()

            if early_stopping(validation_accuracy):
                print(f"Early stopping triggered at epoch {epoch+1}")  # Debug print
                break

            # Check if the trial should be pruned
            if trial.should_prune():
                print("Trial pruned.")  # Debug print
                raise optuna.exceptions.TrialPruned()

        fold_accuracies.append(validation_accuracy)

    average_accuracy = sum(fold_accuracies) / len(fold_accuracies)

    print(f"Average accuracy over folds: {average_accuracy}")  # Debug print

    return average_accuracy


# Utility functions (for cleaner code)
def get_data_loaders(dataset, train_idx, val_idx):
    """Build the train/validation loaders for one cross-validation fold.

    Both loaders wrap a ``Subset`` of ``dataset``, use batches of 32 and the
    notebook's ``collate_fn``; only the training loader shuffles.

    Args:
        dataset: Full dataset to slice.
        train_idx: Indices belonging to the training split.
        val_idx: Indices belonging to the validation split.

    Returns:
        Tuple ``(train_loader, val_loader)``.
    """
    def _make_loader(indices, shuffle):
        return DataLoader(
            Subset(dataset, indices),
            batch_size=32,
            shuffle=shuffle,
            collate_fn=collate_fn,
        )

    return _make_loader(train_idx, True), _make_loader(val_idx, False)

def initialize_model(config):
    """Create an ``RNN`` from ``config`` together with its Adam optimizer.

    The optimizer's learning rate is ``config['lr']``. After the optimizer is
    built, the model is moved to CUDA when available, otherwise to the CPU.

    Returns:
        Tuple ``(model, optimizer)``.
    """
    if torch.cuda.is_available():
        target_device = torch.device("cuda")
    else:
        target_device = torch.device("cpu")

    model = RNN(config)
    learning_rate = config['lr']
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    model.to(target_device)
    return model, optimizer

# Utility functions
def optuna_dynamic_hyperparameters(trial):
    """Sample the tunable model hyperparameters from ``trial``.

    Search space (sampled in this order):
      * ``lr``: float in [1e-5, 1e-3], log scale
      * ``hidden_dim``: one of 128, 256, 512
      * ``embedding_dim``: one of 50, 100, 150
      * ``output_mid_features``: one of 50, 100, 200
      * ``miss_linear_dim``: one of 50, 100, 150
      * ``dropout``: float in [0.2, 0.5]
      * ``num_layers``: int in [1, 3]

    Returns:
        Dict mapping each hyperparameter name to its sampled value.
    """
    # Dict literals evaluate their values top to bottom, so the suggest_*
    # calls happen in the order listed.
    return {
        'lr': trial.suggest_float('lr', 1e-5, 1e-3, log=True),
        'hidden_dim': trial.suggest_categorical('hidden_dim', [128, 256, 512]),
        'embedding_dim': trial.suggest_categorical('embedding_dim', [50, 100, 150]),
        'output_mid_features': trial.suggest_categorical('output_mid_features', [50, 100, 200]),
        'miss_linear_dim': trial.suggest_categorical('miss_linear_dim', [50, 100, 150]),
        'dropout': trial.suggest_float('dropout', 0.2, 0.5),
        'num_layers': trial.suggest_int('num_layers', 1, 3),
    }

In [17]:
from pathlib import Path

# Models directory path; create it if it does not exist yet
models_dir = Path('models')
models_dir.mkdir(parents=True, exist_ok=True)

# Assuming missed_chars_tensor is a list of tensors for missed characters
dataset = HangmanDataset(
    all_features_tensor, labels_tensor, missed_chars_tensor, original_words
)

# Static configuration (entries that are not tuned by Optuna)
static_config = {
    'rnn': 'LSTM',
    'vocab_size': 27,  # 26 English alphabets + 1 (e.g., for underscore)
    'use_embedding': True,  # Typically a design choice
    'input_feature_size': 5,  # Based on your feature engineering strategy
    'models': str(models_dir)
}

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define num_epochs for training in each fold
num_epochs = 2 # Adjust as needed
N_TRIAL = 2

# ====================================================== #
# Create Optuna study.
# The MedianPruner instantiated alongside `objective` is now passed in
# explicitly; before, that object was created but never handed to the study.
study = optuna.create_study(direction='maximize', pruner=pruner)
study.optimize(
    lambda trial: objective(trial, dataset, static_config, num_epochs),
    n_trials=N_TRIAL,
)
# ====================================================== #

[I 2023-11-16 01:05:29,568] A new study created in memory with name: no-name-fa1adeef-4cf8-41b2-b42f-fc4393879305


Trial 0: Configuration - {'rnn': 'LSTM', 'vocab_size': 27, 'use_embedding': True, 'input_feature_size': 5, 'models': 'models', 'lr': 0.0008684798330525045, 'hidden_dim': 256, 'embedding_dim': 50, 'output_mid_features': 200, 'miss_linear_dim': 50, 'dropout': 0.4314557296794439, 'num_layers': 3}
Starting Fold 1
Epoch 1/2, Fold 1: Avg. Actual Penalty - 0.08094604259729385, Avg. Miss Penalty - -3.7455215272903444
Epoch 1: Validation Accuracy - 3.6999999999999997
New best model found: Accuracy - 3.6999999999999997
Epoch 2/2, Fold 1: Avg. Actual Penalty - 0.07475450190901756, Avg. Miss Penalty - -4.132945240020752
Epoch 2: Validation Accuracy - 4.7
New best model found: Accuracy - 4.7
Starting Fold 2
Epoch 1/2, Fold 2: Avg. Actual Penalty - 0.08042967519164085, Avg. Miss Penalty - -3.74269801235199
Epoch 1: Validation Accuracy - 4.2
Epoch 2/2, Fold 2: Avg. Actual Penalty - 0.07409329175949096, Avg. Miss Penalty - -4.130918335914612
Epoch 2: Validation Accuracy - 2.4
Starting Fold 3
Epoch 1/2

[I 2023-11-16 01:07:07,189] Trial 0 finished with value: 3.6 and parameters: {'lr': 0.0008684798330525045, 'hidden_dim': 256, 'embedding_dim': 50, 'output_mid_features': 200, 'miss_linear_dim': 50, 'dropout': 0.4314557296794439, 'num_layers': 3, 'step_size': 13, 'gamma': 0.15451273875184893}. Best is trial 0 with value: 3.6.


Epoch 2: Validation Accuracy - 3.5999999999999996
Average accuracy over folds: 3.6
Trial 1: Configuration - {'rnn': 'LSTM', 'vocab_size': 27, 'use_embedding': True, 'input_feature_size': 5, 'models': 'models', 'lr': 9.000202955636949e-05, 'hidden_dim': 128, 'embedding_dim': 100, 'output_mid_features': 200, 'miss_linear_dim': 150, 'dropout': 0.28252560244380076, 'num_layers': 1}
Starting Fold 1
Epoch 1/2, Fold 1: Avg. Actual Penalty - 0.08673496228456497, Avg. Miss Penalty - -3.2235816831588746
Epoch 1: Validation Accuracy - 3.3000000000000003
New best model found: Accuracy - 3.3000000000000003
Epoch 2/2, Fold 1: Avg. Actual Penalty - 0.07925721889734268, Avg. Miss Penalty - -3.726433588027954
Epoch 2: Validation Accuracy - 4.3
New best model found: Accuracy - 4.3
Starting Fold 2
Epoch 1/2, Fold 2: Avg. Actual Penalty - 0.08700583252310753, Avg. Miss Penalty - -3.1996726665496826
Epoch 1: Validation Accuracy - 4.0
Epoch 2/2, Fold 2: Avg. Actual Penalty - 0.07891854158043861, Avg. Miss P

[I 2023-11-16 01:08:12,595] Trial 1 finished with value: 3.8 and parameters: {'lr': 9.000202955636949e-05, 'hidden_dim': 128, 'embedding_dim': 100, 'output_mid_features': 200, 'miss_linear_dim': 150, 'dropout': 0.28252560244380076, 'num_layers': 1, 'step_size': 5, 'gamma': 0.20047126915997204}. Best is trial 1 with value: 3.8.


Epoch 2: Validation Accuracy - 4.1000000000000005
Average accuracy over folds: 3.8


In [18]:
# Best hyperparameters and the trial that produced them
best_params = study.best_params
trial = study.best_trial

# Output the best hyperparameters
print("Best trial:", f"Value: {trial.value}", "Params: ", sep="\n")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

Best trial:
Value: 3.8
Params: 
    lr: 9.000202955636949e-05
    hidden_dim: 128
    embedding_dim: 100
    output_mid_features: 200
    miss_linear_dim: 150
    dropout: 0.28252560244380076
    num_layers: 1
    step_size: 5
    gamma: 0.20047126915997204


In [19]:
best_params

{'lr': 9.000202955636949e-05,
 'hidden_dim': 128,
 'embedding_dim': 100,
 'output_mid_features': 200,
 'miss_linear_dim': 150,
 'dropout': 0.28252560244380076,
 'num_layers': 1,
 'step_size': 5,
 'gamma': 0.20047126915997204}

In [20]:
# Single configuration: start from the static settings, then overlay the
# tuned hyperparameters (the tuned value wins on a key clash).
config = dict(static_config)
config.update(best_params)

config

{'rnn': 'LSTM',
 'vocab_size': 27,
 'use_embedding': True,
 'input_feature_size': 5,
 'models': 'models',
 'lr': 9.000202955636949e-05,
 'hidden_dim': 128,
 'embedding_dim': 100,
 'output_mid_features': 200,
 'miss_linear_dim': 150,
 'dropout': 0.28252560244380076,
 'num_layers': 1,
 'step_size': 5,
 'gamma': 0.20047126915997204}

In [21]:
# Combined configuration: static settings overlaid with the best trial's values
config = {}
config.update(static_config)
config.update(best_params)

# Build the RNN from the combined configuration and place it on the device
model = RNN(config).to(device)

# model.load_model(config['LSTM'], 2, 256, trial_number=10)

In [22]:
STOP  # NOTE(review): undefined name, raises NameError — presumably a deliberate "Run All" barrier before the cells below; confirm

NameError: name 'STOP' is not defined

##### Testing (note: the games below are sampled from `word_list`, i.e. the training words, not unseen data)

In [None]:
# # Import the necessary functions
# from scr.feature_engineering import \
#     process_single_word, get_missed_characters

from scr.game import predict_next_character, simulate_game
from scr.model import RNN
import random
# random.seed(400)
# Your existing code for initializing the model, etc.

def play_multiple_games(model, num_games, word_list,
                        char_to_idx, idx_to_char, char_frequency,
                        max_word_length, device):
    """Play ``num_games`` simulated Hangman games and collect the outcomes.

    Same contract as the identically named function defined earlier in this
    notebook. Each round draws a target with ``random.choice(word_list)`` and
    calls ``simulate_game`` inside ``torch.no_grad()`` with ``normalize=True``
    and ``max_attempts=6``.

    Returns:
        List of ``(won, final_word, attempts_used)`` tuples in play order.
    """
    # Settings shared by every simulated game
    game_options = {'normalize': True, 'max_attempts': 6}

    game_results = []
    for _ in range(num_games):
        target = random.choice(word_list)
        with torch.no_grad():
            outcome = simulate_game(
                model, target, char_to_idx, idx_to_char,
                char_frequency, max_word_length, device,
                **game_options
            )
        won, final_word, attempts_used = outcome
        game_results.append((won, final_word, attempts_used))

    return game_results

num_games = 1000
results = play_multiple_games(
    model, num_games, word_list, char_to_idx, idx_to_char,
    char_frequency, max_word_length, device
)

# Analyzing results: the first field of every result tuple is the win flag
total_wins = sum(won for won, _final_word, _attempts in results)

# Win rate in percent (displayed as the cell output)
(total_wins / num_games) * 100