##### Imports

In [1]:
# Notebook-wide imports and global configuration.
# NOTE: statement order matters here. The wildcard imports from `scr` can rebind any
# name that was imported before them, so the relative order of the imports is preserved
# and only provably redundant duplicate imports have been removed.
import sys
from pathlib import Path
import warnings

import pandas as pd

# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Show wide/long DataFrames without truncation.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

# Custom library paths: extend the import search path before the `scr` imports below.
sys.path.extend(['../', './scr'])

from scr.utils import set_seed, read_words, save_words_to_file
import random
from collections import Counter, defaultdict
import pickle
from tqdm import tqdm
from torch.utils.data import Dataset

# Wildcard imports: a later import wins whenever two modules export the same name.
from scr.dataset import *
from scr.utils import *
# For inference
from scr.feature_engineering import *

import gc

# Fixed seed for reproducibility (set_seed is provided by scr.utils; presumably it seeds
# the relevant RNGs -- confirm against its implementation).
set_seed(42)

import torch
import torch.nn as nn
# Deliberately re-imported after the wildcard imports so that `Path` and `random`
# refer to the standard-library objects even if an `scr` module exports the same names.
from pathlib import Path
import random

from scr.utils import print_scenarios
torch.set_float32_matmul_precision('medium')

# Device configuration: use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read the training vocabulary and shuffle it in place.
# (An earlier revision passed `limit=10000` to read_words for a smaller development subset.)
word_list = read_words('data/words_250000_train.txt')
random.shuffle(word_list)

# Statistics derived from the vocabulary: word and character frequencies,
# plus the length of the longest word.
word_frequencies = calculate_word_frequencies(word_list)
char_frequency = calculate_char_frequencies(word_list)
max_word_length = max(map(len, word_list))

##### Data Dir

In [2]:
# Run parameters for this notebook.
NUM_STRATIFIED_SAMPLES = 100  # Overwritten by Papermill; also names the dataset sub-directory
NUM_WORD_SAMPLE = 1000        # Number of words requested when sampling the testing words
FAST_DEV_RUN = False          # Not used in the cells shown here; presumably a quick smoke-run switch
MAX_EPOCH = 15                # Not used in the cells shown here; presumably the training epoch limit

In [3]:
from pathlib import Path
from scr.custom_sampler import *

# Dataset locations: the parquet files live under a sub-directory named after
# NUM_STRATIFIED_SAMPLES.
base_dataset_dir = Path("/media/sayem/510B93E12554BBD1/dataset/")
stratified_samples_dir = base_dataset_dir / str(NUM_STRATIFIED_SAMPLES)
parquet_path = stratified_samples_dir / 'parquets'

# Directory for models.
models_dir = Path("/home/sayem/Desktop/Hangman/models")

# Training outputs, with the logger directory nested inside.
output_dir = Path("/home/sayem/Desktop/Hangman/training_outputs")
logger_dir = output_dir / "lightning_logs"

# File holding the testing words for this stratified-sample configuration.
testing_words_file_path = stratified_samples_dir / "testing_words.txt"

# Create every directory that does not exist yet (parents included).
parquet_path.mkdir(parents=True, exist_ok=True)
models_dir.mkdir(parents=True, exist_ok=True)
output_dir.mkdir(parents=True, exist_ok=True)
logger_dir.mkdir(parents=True, exist_ok=True)

# Load the testing words and draw a stratified sample of them.
# Both names are initialised up front so that a missing file can neither raise a
# NameError at the final print nor silently reuse values left over from an earlier
# kernel run; in that case the lists simply stay empty.
testing_word_list = []
sampled_test_words = []

try:
    testing_word_list = read_words(testing_words_file_path)
    print(f"Length of the testing word list: {len(testing_word_list)}")
    sampled_test_words = stratified_sample_by_length_and_uniqueness(
        testing_word_list, NUM_WORD_SAMPLE)
    print(f"Sampled {len(sampled_test_words)} unique words for testing.")
except FileNotFoundError:
    # Best-effort: report the problem and continue with the empty defaults.
    print(f"File not found: {testing_words_file_path}")

print(len(sampled_test_words))

Length of the testing word list: 10048
Sampled 1085 unique words for testing.
1085


##### Dataset Loading

In [4]:
# Build the dataset directly from the parquet files saved on disk.
hangman_dataset = HangmanDataset(parquet_path)

from scr.utils import *

# Split into training and validation subsets using a 0.8 ratio
# (the recorded run produced 6924 / 1734 samples).
train_dataset, valid_dataset = split_hangman_dataset(hangman_dataset, 0.8)

print(len(train_dataset), len(valid_dataset), sep="\n")

# Sanity check: the training split must be the larger one.
assert len(train_dataset) > len(valid_dataset)

6924
1734


In [5]:
# Inspect the group labels exposed by the dataset. In the recorded run this returned a
# pair: a list of (difficulty, outcome) tuples and a list of integers
# (presumably the word lengths present in the data -- TODO confirm).
hangman_dataset.get_all_group_labels()

([('hard', 'lose'),
  ('easy', 'win'),
  ('medium', 'win'),
  ('hard', 'win'),
  ('medium', 'lose'),
  ('easy', 'lose')],
 [1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  29])

In [6]:
# Number of samples in the training split.
len(train_dataset)

6924

In [7]:
# Parentheses alone do not create a tuple: ('10') is just the string '10' (hence `str`).
# A one-element tuple needs a trailing comma, e.g. ('10',).
type(('10'))

str

In [8]:
# Request a sample with word length 10; the dataset is indexed with a one-element tuple.
word_length = 10
sample = hangman_dataset[(word_length,)]

In [9]:
# Display the retrieved sample record.
# NOTE(review): in the recorded output 'outcome' is 'win' while 'won' is False --
# confirm whether these two fields are expected to agree.
sample

{'game_id': 3884,
 'word': 'chemigraph',
 'initial_state': ['c__m___ap_'],
 'final_state': 'ch_m_g_aph',
 'guessed_states': ['c__m___ap_',
  'ch_m___aph',
  'ch_m_g_aph',
  'ch_m_g_aph',
  'ch_m_g_aph',
  'ch_m_g_aph',
  'ch_m_g_aph',
  'ch_m_g_aph'],
 'guessed_letters': ['h', 'g', 'b', 'd', 'w', 'z', 't', 'j'],
 'game_state': 'midRevealed',
 'difficulty': 'medium',
 'outcome': 'win',
 'word_length': 10,
 'won': False}

In [10]:
from scr.data_module import *
from scr.dataset import *

# Initial batch size handed to the data module.
initial_batch_size = 128

# Build the data module from the train/validation splits, the batch size
# and the custom collate function.
data_module = HangmanDataModule(
    train_dataset,
    valid_dataset,
    initial_batch_size,
    custom_collate_fn,
)

In [12]:
# Obtain the training loader from the data module.
train_loader = data_module.train_dataloader()

In [13]:
# Hand-written metrics used to exercise the sampler. The numeric suffix identifies the
# group each metric belongs to (presumably the word length -- TODO confirm).
sample_performance_metrics = {
    'win_rate_1': 45,
    'avg_attempts_1': 5,
    # Group 2: this should be filtered out
    'win_rate_2': 60,
    'avg_attempts_2': 3,
    'win_rate_3': 30,
    'avg_attempts_3': 6,
    # Group 4: this should be filtered out due to high win rate
    'win_rate_4': 55,
    'avg_attempts_4': 7,
}

In [15]:
# Build a PerformanceBasedSampler over the full dataset from the sample metrics.
sampler = PerformanceBasedSampler(
    dataset=hangman_dataset,
    performance_metrics=sample_performance_metrics,
    batch_size=10,
)

# Show which target pairs the sampler selected.
print("Target pairs:", sampler.target_pairs)


Target pairs: [(1,), (3,)]
