##### Imports

In [1]:
# Imports and global notebook setup.
#
# The original cell repeated several imports (warnings, sys, pathlib.Path,
# pickle, random, read_words). They are collapsed here; every name the
# original cell bound is still bound, and the relative order of explicit
# imports versus the wildcard imports is preserved.

# --- Standard library ---
import pickle
import sys
import warnings
from collections import Counter, defaultdict

# --- Third-party (pandas first so warnings can be silenced early) ---
import pandas as pd

# Silence library warnings before the heavier imports below are executed.
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

# Custom library paths: must be registered before any `scr` import.
sys.path.extend(['../', './scr'])

# --- Project-local helpers and remaining third-party names ---
from scr.utils import read_words, save_words_to_file, set_seed
from tqdm import tqdm
from torch.utils.data import Dataset

# Wildcard imports: later modules win on name clashes, so keep this order.
from scr.dataset import *
from scr.utils import *
# For inference
from scr.feature_engineering import *

# These names are bound AFTER the wildcard imports on purpose (as in the
# original cell), so nothing exported by the `scr` modules can shadow them.
import gc
import random
import torch
import torch.nn as nn
from pathlib import Path
from scr.utils import print_scenarios

# Seed the random number generators for reproducibility.
set_seed(42)

torch.set_float32_matmul_precision('medium')
# Device configuration: run on the GPU when CUDA is available, otherwise on CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the training vocabulary and shuffle it in place.
# (During development the call was also used with `limit=10000`, and with
#  'data/250k.txt' as an alternative word file.)
word_list = read_words('data/words_250000_train.txt')
random.shuffle(word_list)

# Corpus-level statistics reused by the model and the game helpers.
word_frequencies = calculate_word_frequencies(word_list)
char_frequency = calculate_char_frequencies(word_list)
# Length of the longest word in the list.
max_word_length = len(max(word_list, key=len))

##### Data Dir

In [2]:
# Run parameters for this notebook.

# Selects the stratified-sample dataset folder and prefixes the saved model
# file names. This will be overwritten by Papermill.
NUM_STRATIFIED_SAMPLES: int = 1_000

# Number of words sampled for testing.
NUM_WORD_SAMPLE: int = 20_000

# Forwarded to the Lightning Trainer as `fast_dev_run`.
FAST_DEV_RUN: bool = False

# Forwarded to the Lightning Trainer as `max_epochs`.
MAX_EPOCH: int = 15

In [3]:
import os

# Filesystem layout for this run.
#
# Each root defaults to the original machine-specific location, but can be
# redirected without editing the notebook by setting the corresponding
# environment variable before the kernel starts:
#   HANGMAN_DATASET_DIR - root folder holding one sub-folder per sample count
#   HANGMAN_MODELS_DIR  - folder where model files are written
#   HANGMAN_TEST_WORDS  - text file with the words used for testing
base_dataset_dir = Path(
    os.environ.get("HANGMAN_DATASET_DIR", "/media/sayem/510B93E12554BBD1/dataset/")
)

# Training and validation parquet locations for the selected sample count.
stratified_samples_dir = base_dataset_dir / str(NUM_STRATIFIED_SAMPLES)
parquet_train_path = stratified_samples_dir / 'train_parquets'
parquet_valid_path = stratified_samples_dir / 'valid_parquets'

# Define and create the directory for models (no error if it already exists).
models_dir = Path(
    os.environ.get("HANGMAN_MODELS_DIR", "/home/sayem/Desktop/Hangman/models")
)
models_dir.mkdir(parents=True, exist_ok=True)

test_data = Path(
    os.environ.get("HANGMAN_TEST_WORDS", "/home/sayem/Desktop/Hangman/data/20k.txt")
)

# Words used for testing the model.
testing_word_list = read_words(test_data)

In [4]:
# Load the test words and draw the sample used for win-rate evaluation.
try:
    testing_word_list = read_words(test_data)
except FileNotFoundError:
    # Report the path that was actually requested (the original handler
    # referenced an undefined name and failed with NameError), then re-raise:
    # later cells need `sampled_test_words`, so continuing would only move
    # the failure further down the notebook.
    print(f"File not found: {test_data}")
    raise

sampled_test_words = sample_words(testing_word_list, NUM_WORD_SAMPLE)
print(f"Sampled {len(sampled_test_words)} unique words for testing.")

Sampled 20000 unique words for testing.


##### Model Building

##### Dataset Loading (train/validation splits)

In [5]:
# Build one HangmanDataset per split, straight from the saved parquet files
# (training split first, then validation).
train_dataset, valid_dataset = (
    HangmanDataset(split_path)
    for split_path in (parquet_train_path, parquet_valid_path)
)

In [6]:
# Report the split sizes (training first, validation second), one per line.
print(len(train_dataset), len(valid_dataset), sep="\n")

# Sanity check: the validation split must be strictly smaller than training.
assert len(valid_dataset) < len(train_dataset)

36588
9150


In [7]:
# Model and trainer definitions from the project package.
from scr.model import *
from scr.trainer import *

# Constructor arguments of the LSTM, gathered in one place.
lstm_kwargs = dict(
    input_dim=145,
    hidden_dim=256,
    output_dim=28,
    num_layers=2,
    missed_char_dim=28,
)
lstm_model = SimpleLSTM(**lstm_kwargs)

# Persist the freshly initialised network. The whole module object is saved
# here (torch.save on the model itself), not only its state dictionary.
model_file = models_dir.joinpath(f"{NUM_STRATIFIED_SAMPLES}_untrained_model.pth")
torch.save(lstm_model, model_file)
print(f"Model saved at {model_file}")

Model saved at /home/sayem/Desktop/Hangman/models/1000_untrained_model.pth


##### Data Loaders

In [8]:
# Settings shared by both loaders.
loader_kwargs = dict(
    batch_size=512,
    collate_fn=custom_collate_fn,
    num_workers=15,     # Adjust based on your system
    prefetch_factor=2,  # Adjust based on your needs
)

# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)

# Validation batches keep a fixed order: shuffling brings no benefit for
# evaluation, and Lightning recommends turning it off for val/test loaders.
val_loader = DataLoader(valid_dataset, shuffle=False, **loader_kwargs)

In [9]:
# Learning rate handed to the Lightning module.
learning_rate = 1e-2

# Wrap the LSTM in the Lightning module, together with the corpus statistics
# and the sampled test words.
lightning_model = HangmanModel(
    lstm_model,
    learning_rate,
    char_frequency,
    max_word_length,
    test_words=sampled_test_words,
)

##### Baseline Validation on the Untrained Model, then Training

In [11]:
from pytorch_lightning.profilers import SimpleProfiler
from pathlib import Path
import torch
import pytorch_lightning as pl
from scr.custom_callbacks import *

# Profiler instance kept for ad-hoc inspection. It is NOT attached to the
# Trainer below; pass `profiler=profiler` to `pl.Trainer` to collect timings.
# (The original cell constructed it twice; once is enough.)
profiler = SimpleProfiler()

# Create the callbacks
loss_logging_callback = LossLoggingCallback()

# NOTE(review): `early_stop_callback` is not defined in any visible cell;
# presumably it is exported by the wildcard import from scr.custom_callbacks
# above - confirm.
callbacks = [early_stop_callback, loss_logging_callback]

# Create a PyTorch Lightning trainer with the callbacks
trainer = pl.Trainer(
    fast_dev_run=FAST_DEV_RUN,
    max_epochs=MAX_EPOCH,
    callbacks=callbacks,
)

# Baseline: run validation once on the still-untrained model.
trainer.validate(model=lightning_model, dataloaders=val_loader)

# Train the model.
trainer.fit(lightning_model, train_loader, val_loader)
# print(profiler.summary())

# Save the trained network. `lstm_model` is the object that was handed to
# `HangmanModel`; the whole module is saved here, not only its state
# dictionary, which is what the later `torch.load` cell expects.
trained_model_file = models_dir / f"{NUM_STRATIFIED_SAMPLES}_trained_model.pth"
torch.save(lstm_model, trained_model_file)
print(f"Model saved at {trained_model_file}")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: |          | 0/? [00:00<?, ?it/s]

Processing words: 100%|██████████| 20000/20000 [01:25<00:00, 232.77word/s]
Processing words: 100%|██████████| 20000/20000 [01:17<00:00, 259.29word/s]
Processing words: 100%|██████████| 20000/20000 [01:17<00:00, 257.65word/s]
Processing words: 100%|██████████| 20000/20000 [01:17<00:00, 258.57word/s]
Processing words: 100%|██████████| 20000/20000 [01:15<00:00, 265.40word/s]
Processing words: 100%|██████████| 20000/20000 [01:15<00:00, 263.58word/s]
Processing words: 100%|██████████| 20000/20000 [01:15<00:00, 265.23word/s]
Processing words: 100%|██████████| 20000/20000 [01:16<00:00, 260.14word/s]
Processing words: 100%|██████████| 20000/20000 [01:17<00:00, 256.82word/s]
Processing words: 100%|██████████| 20000/20000 [01:18<00:00, 255.38word/s]
Processing words: 100%|██████████| 20000/20000 [01:17<00:00, 258.43word/s]
Processing words: 100%|██████████| 20000/20000 [01:15<00:00, 265.85word/s]
Processing words: 100%|██████████| 20000/20000 [01:15<00:00, 265.52word/s]
Processing words: 100%|██

Epoch 0: Validation Loss: 0.07781682908535004, Miss Penalty: 0.05623424053192139, Win Rate: 0.7250001430511475
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_win_rate         0.7250001430511475
     val_loss_epoch         0.07781682908535004
 val_miss_penalty_epoch     0.05623424053192139
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Processing words: 100%|██████████| 20000/20000 [01:31<00:00, 219.22word/s]
Processing words: 100%|██████████| 20000/20000 [01:24<00:00, 238.00word/s]

Epoch 0: Validation Loss: 0.07819022983312607, Miss Penalty: 0.05665992200374603, Win Rate: 0.7250000238418579





Training: |          | 0/? [00:00<?, ?it/s]

##### Testing

In [None]:
# Load the entire LSTM model object saved after training.
trained_model_file_path = models_dir / f"{NUM_STRATIFIED_SAMPLES}_trained_model.pth"

# - map_location=device: a model saved from the GPU can still be loaded on a
#   CPU-only host.
# - weights_only=False: the file holds a fully pickled module, which recent
#   PyTorch versions refuse to load under their weights-only default.
# SECURITY: this unpickles arbitrary Python objects - only load model files
# produced by this project.
trained_model = torch.load(
    trained_model_file_path, map_location=device, weights_only=False
)

# Set the model to evaluation mode for inference.
trained_model.eval()

from scr.game import *

# Play a single game as a smoke test; the cell displays the call's result.
word = 'may'

play_game_with_a_word(trained_model, word, char_frequency, max_word_length)

In [None]:
# Display the stratified-sample count in effect for this run (the value set in
# the parameters cell may have been overwritten by Papermill).
NUM_STRATIFIED_SAMPLES

In [None]:
# Play a game for every sampled test word and gather the statistics.
result = play_games_and_calculate_stats(
    trained_model,
    sampled_test_words,
    char_frequency,
    max_word_length,
)

# Headline numbers across all words.
print(f"Overall Win Rate: {result['overall_win_rate']}%, Overall Average Attempts: {result['overall_avg_attempts']}")

# Breakdown per word length.
length_wise_stats = result["length_wise_stats"]
for length, data in length_wise_stats.items():
    print(f"Length {length}: Win Rate: {data['win_rate']}%, Average Attempts: {data['average_attempts_used']}")