##### Imports

In [1]:
import sys
from pathlib import Path
import warnings

import warnings
import pandas as pd
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

import sys
# Custom library paths
sys.path.extend(['../', './scr'])

from scr.utils import set_seed
from scr.utils import read_words
from pathlib import Path
import random
from collections import Counter, defaultdict
import pickle
from tqdm import tqdm
from torch.utils.data import Dataset

from scr.utils import read_words, save_words_to_file

import pickle
from pathlib import Path
from scr.dataset import *
from scr.utils import *
# # For inference
from scr.feature_engineering import *

import gc

set_seed(42)

import torch
import torch.nn as nn
from pathlib import Path
import random

from scr.utils import print_scenarios
torch.set_float32_matmul_precision('medium')
from pathlib import Path

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Read and Shuffle Word List
word_list = read_words('data/words_250000_train.txt') # , limit=10000)
# word_list = read_words('data/250k.txt', limit=10000)

random.shuffle(word_list)

# Calculate Frequencies and Max Word Length
word_frequencies = calculate_word_frequencies(word_list)
char_frequency = calculate_char_frequencies(word_list)
max_word_length = max(len(word) for word in word_list)

##### Data Dir

In [2]:
NUM_STRATIFIED_SAMPLES = 2_00 # This will be overwritten by Papermill

NUM_WORD_SAMPLE = 1_000 # words for testing

FAST_DEV_RUN = False

MAX_EPOCH = 250

In [3]:
from pathlib import Path
from scr.custom_sampler import *

# Define the base directory and the paths for training and validation parquet files
base_dataset_dir = Path("/media/sayem/510B93E12554BBD1/dataset/")
stratified_samples_dir = base_dataset_dir / str(NUM_STRATIFIED_SAMPLES)
parquet_path = stratified_samples_dir / 'parquets'

# Create directories for train and validation parquets if they don't exist
parquet_path.mkdir(parents=True, exist_ok=True)
# parquet_valid_path.mkdir(parents=True, exist_ok=True)

# Define and create the directory for models
models_dir = Path("/home/sayem/Desktop/Hangman/models")
models_dir.mkdir(parents=True, exist_ok=True)

# Define your output directory
# Define your output directory and logger directory
output_dir = Path("/home/sayem/Desktop/Hangman/training_outputs")
logger_dir = output_dir / "lightning_logs"

# Create the output and logger directories if they don't exist
output_dir.mkdir(parents=True, exist_ok=True)
logger_dir.mkdir(parents=True, exist_ok=True)

# Define the file path for saving the testing words
testing_words_file_path = stratified_samples_dir / "testing_words.txt"

try:
    testing_word_list = read_words(testing_words_file_path)
    print(f"Length of the testing word list: {len(testing_word_list)}")
    sampled_test_words = stratified_sample_by_length_and_uniqueness(testing_word_list, NUM_WORD_SAMPLE)
    print(f"Sampled {len(sampled_test_words)} unique words for testing.")
except FileNotFoundError:
    print(f"File not found: {testing_words_file_path}")

print(len(sampled_test_words))

Length of the testing word list: 10048
Sampled 1085 unique words for testing.
1085


##### Dataset Loading and train test split

In [4]:
# Create datasets directly from the saved parquet files
hangman_dataset = HangmanDataset(parquet_path)
# valid_dataset = HangmanDataset(parquet_valid_path)

from scr.utils import *

# print(len(train_dataset))
# print(len(valid_dataset))

# assert len(train_dataset) > len(valid_dataset)

# Assuming `hangman_dataset` is an instance of HangmanDataset
train_dataset, valid_dataset = split_hangman_dataset(hangman_dataset)

In [5]:
len(train_dataset)

9816

In [6]:
len(valid_dataset)

2454

In [7]:
train_group_info = train_dataset.get_group_info()
# len(train_group_info)  # This will show each group and the number of games in it

valid_group_info = valid_dataset.get_group_info()
# len(valid_group_info)  # This will show each group and the number of games in it

assert len(train_group_info) == len(valid_group_info)

##### Model Initialization

In [8]:
from scr.encoder import *
from scr.trainer_ import *
from scr.decoder import SimpleLSTM

# Fixed parameters
max_word_length = 29  # Maximum word length
num_embeddings = 28   # Vocabulary size (fixed, based on unique characters in the game)
num_features = 5      # Number of features per character
missed_char_dim = 28  # Additional dimension for missed characters
output_dim = 28       # Output dimension for the model

# Tunable hyperparameters
LEARNING_RATE = 0.57
HIDDEN_DIM = 32
NUM_LAYERS = 3
EMBEDDING_DIM = 15

# Initialize the Encoder
encoder = Encoder(num_embeddings, EMBEDDING_DIM, max_word_length, missed_char_dim)

# Calculate the input dimension for the LSTM (SimpleLSTM decoder)
# The input dimension should match the flattened output dimension of the Encoder
embedded_char_features = max_word_length * EMBEDDING_DIM
additional_features_per_char = (num_features - 1)  # Excluding the embedded character
additional_char_features = additional_features_per_char * max_word_length
combined_feature_size = embedded_char_features + additional_char_features + missed_char_dim
input_dim = combined_feature_size

# Initialize the SimpleLSTM decoder
decoder = SimpleLSTM(input_dim=input_dim, 
                     hidden_dim=HIDDEN_DIM, 
                     output_dim=output_dim, 
                     num_layers=NUM_LAYERS,
                     missed_char_dim=missed_char_dim)

# Other parts of the code flow...

In [9]:
# Initialize the HangmanModel
lightning_model = HangmanModel(encoder, decoder, 
                    LEARNING_RATE, char_frequency, 
                    max_word_length, test_words=sampled_test_words)

# Save the model's state dictionary
untrained_model_file = models_dir / f"{NUM_STRATIFIED_SAMPLES}_untrained_model.pth"
torch.save(lightning_model, untrained_model_file)
print(f"Model saved at {untrained_model_file}")

Model saved at /home/sayem/Desktop/Hangman/models/200_untrained_model.pth


##### Data Module

In [10]:
from scr.data_module import *

# Initialize Data Module
initial_batch_size = 128  # Set your initial batch size

# Initialize Data Module with the required arguments
data_module = HangmanDataModule(train_dataset, valid_dataset, 
                                initial_batch_size, 
                                custom_collate_fn)
                                # performance_metrics=None)

##### Testing on Untrained Model

In [11]:
## callbacks
from pytorch_lightning.callbacks import Callback, EarlyStopping
from scr.custom_callbacks import *

# Setup EarlyStopping to monitor the test_win_rate
early_stop_callback = EarlyStopping(
    monitor='win_rate',
    min_delta=0.00,
    patience=200,
    verbose=True,
    mode='max'  # Maximize the win rate
)

step_level_early_stopping = StepLevelEarlyStopping(
    monitor='val_miss_penalty',  # Metric to monitor
    min_delta=0.0,              # Minimum change to qualify as an improvement
    patience=5                  # Number of steps with no improvement after which training will be stopped
)

In [12]:
from pytorch_lightning.profilers import SimpleProfiler
import pytorch_lightning as pl
from scr.custom_callbacks import *
from scr.dataset import *
from scr.data_module import *
from scr.trainer_ import *

torch.cuda.empty_cache()

# # Create Callbacks
# loss_logging_callback = LossLoggingCallback()

callbacks = [early_stop_callback, step_level_early_stopping] # , SchedulerSetupCallback()] # , loss_logging_callback]

FAST_DEV_RUN = False
# # Calculate the minimum percentage of validation batches
# min_val_batches = 1 / len(data_module.val_dataloader())

# # Create Trainer with Callbacks
trainer = pl.Trainer(
    default_root_dir=output_dir,
    fast_dev_run=FAST_DEV_RUN, 
    max_epochs=MAX_EPOCH, 
    callbacks=callbacks,
    num_sanity_val_steps=0,
    reload_dataloaders_every_n_epochs=1,
    enable_progress_bar=True 
    # val_check_interval=0.5
    # limit_train_batches=2,  # Limit the number of training batches to 2
    # limit_val_batches=2     # Limit the number of validation batches to 2
)

# # # # # print(f"Running for {NUM_STRATIFIED_SAMPLES} samples...")
# # # print()
# # # # # # # Assuming combined_eval_metrics is a list of dictionaries
# combined_eval_metrics = trainer.validate(model=lightning_model, datamodule=data_module)
# # performance_metrics_dict = {k: v for d in combined_eval_metrics for k, v in d.items()}

# # # # # # print("Converted Performance Metrics Dictb ionary:", performance_metrics_dict)

# # trainer.datamodule.update_performance_metrics(combined_eval_metrics)

# data_module.update_performance_metrics(performance_metrics_dict)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [13]:
# # # # # Validate the model (if needed)
# # trainer.validate(model=lightning_model, datamodule=data_module)
# print(f"Training Begin for {NUM_STRATIFIED_SAMPLES} words: {len(train_dataset)} Games")
# # # # # # # Fit the model
# trainer.fit(lightning_model, data_module)

# # # Optionally print the profiler summary
# # print(profiler.summary())

# # Save the entire modell
# trained_model_file = models_dir / f"{NUM_STRATIFIED_SAMPLES}_trained_model.pth"
# torch.save(lightning_model, trained_model_file)
# print(f"Model saved at {trained_model_file}")

In [14]:
# STOP

In [15]:
# # Initialize PerformanceBasedSampler
# sampler = PerformanceBasedSampler(dataset=hangman_dataset, 
#                                   performance_metrics=performance_metrics_dict, 
#                                   batch_size=10)

# # Verify target pairs
# print("Target pairs:", sampler.target_pairs)

In [16]:
# print(f"Training Begin for {NUM_STRATIFIED_SAMPLES} words: {len(train_dataset)} Games")
# # # # # # # Fit the model
# trainer.fit(lightning_model, data_module)

##### Tuning: lr

In [17]:
# from pytorch_lightning.tuner.tuning import Tuner

# # Assuming lightning_model, train_loader, and val_loader are already defined
# # Initialize the tuner with your trainer
# tuner = Tuner(trainer)

# # Run the learning rate finder using the data module
# lr_finder = tuner.lr_find(model=lightning_model, 
#                         datamodule=data_module)

# # Plot the learning rate finder results
# fig = lr_finder.plot(suggest=True)
# fig.show()

# # Get the suggested learning rate
# new_lr = lr_finder.suggestion()
# print(f"Suggested Learning Rate: {new_lr}")

# # Update model's learning rate
# lightning_model.learning_rate = new_lr

# # Optionally, you can view the results of the LR finder
# print(lr_finder.results)

##### Tuning: Batch

In [18]:
# # Assuming lightning_model is already defined
# new_batch_size = tuner.scale_batch_size(
#     model=lightning_model,
#     datamodule=data_module,
#     mode='power',  # or 'binsearch'
#     steps_per_trial=10,
#     init_val=64,
#     max_trials=4
# )

# # Update the batch size in the data module
# data_module.batch_size = new_batch_size

# # print(f"Tune Batch size: ", new_batch_size)

##### Training

In [19]:
# # # # # # Validate the model (if needed)
# # trainer.validate(model=lightning_model, datamodule=data_module)
# print(f"Training Begin for {NUM_STRATIFIED_SAMPLES} words: {len(train_dataset)} Games")
# # # # # # # Fit the model
# trainer.fit(lightning_model, data_module)

# # # Optionally print the profiler summary
# # # print(profiler.summary())

# # # Save the entire model
# # trained_model_file = models_dir / f"{NUM_STRATIFIED_SAMPLES}_trained_model.pth"
# # torch.save(lightning_model, trained_model_file)
# # print(f"Model saved at {trained_model_file}")

##### Testing

In [20]:
# Finding the maximum word length in the list
max_word_length_in_list = max(len(word) for word in sampled_test_words)
max_word_length_in_list

29

In [21]:
# # Load the entire LSTM model object
# untrained_model_file_path = models_dir / f"{NUM_STRATIFIED_SAMPLES}_trained_model.pth"

untrained_model = torch.load(untrained_model_file)

# Example usage
result = play_games_and_calculate_stats(untrained_model, \
    sampled_test_words, char_frequency, max_word_length)

print(f"Untrained model performence: {result['overall_win_rate']} % win rate")

performance_metrics = result['length_wise_stats']

# print(performance_metrics)

from scr.utils import *

plot_hangman_stats(performance_metrics)


for length, data in result["length_wise_stats"].items():
    print(f"Length {length}: Win Rate: {data['win_rate']}%, Average Attempts: {data['average_attempts_used']}")

Processing words:   0%|          | 0/1085 [00:00<?, ?word/s]

Guessed characters: []
Batch features shape (input to Encoder): torch.Size([1, 1, 173])
Encoded characters shape: torch.Size([1, 1, 29])
Additional features shape: torch.Size([1, 1, 116])
Embedded characters shape: torch.Size([1, 1, 29, 15])
Reshaped embedded characters shape: torch.Size([1, 1, 435])
Combined features shape: torch.Size([1, 1, 551])
Flattened output shape: torch.Size([1, 1, 551])
torch.Size([1, 1, 551])
Input features shape: torch.Size([1, 1, 551])
Original Seq shape:  torch.Size([1])
Missed Chars shape:  torch.Size([1, 1, 28])

Packed input shape: torch.Size([1, 551])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x551 and 579x128)

In [None]:
# trained_model_file_path = models_dir / f"{NUM_STRATIFIED_SAMPLES}_trained_model.pth"
# trained_model = torch.load(trained_model_file_path)
# print(type(trained_model))

In [None]:
# # Load the entire LSTM model object

# trained_model_file_path = models_dir / f"{NUM_STRATIFIED_SAMPLES}_trained_model.pth"
# trained_model = torch.load(trained_model_file_path)

# # # If you want to use the model for inference
# # trained_model.eval()  # Set the model to evaluation mode

# from scr.game import *

# word = 'may'

# play_game_with_a_word(trained_model, \
#     word, char_frequency, max_word_length)

In [None]:
# Example usage
result = play_games_and_calculate_stats(trained_model, \
    sampled_test_words, char_frequency, max_word_length)

print(f"Overall Win Rate: {result['overall_win_rate']}%, Overall Average Attempts: {result['overall_avg_attempts']}")

# for length, data in result["length_wise_stats"].items():
#     print(f"Length {length}: Win Rate: {data['win_rate']}%, Average Attempts: {data['average_attempts_used']}")

In [None]:
performance_metrics = result['length_wise_stats']

plot_hangman_stats(performance_metrics)

In [None]:
performance_metrics