##### Imports

In [1]:
# --- Standard library --------------------------------------------------------
import gc
import pickle
import sys
import warnings
from collections import Counter, defaultdict

# --- Third-party -------------------------------------------------------------
import pandas as pd

# Silence library warnings before the heavier imports below can emit any.
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

from torch.utils.data import Dataset
from tqdm import tqdm

# --- Project-local -----------------------------------------------------------
# Custom library paths; must be registered before any `scr` import resolves.
sys.path.extend(['../', './scr'])

from scr.utils import read_words, save_words_to_file, set_seed
from scr.dataset import *
from scr.utils import *
# For inference
from scr.feature_engineering import *

# These names are bound AFTER the wildcard imports on purpose, so that the
# standard-library / torch objects win if a wildcard import exports a
# same-named symbol.
import random
from pathlib import Path

import torch
import torch.nn as nn

from scr.utils import print_scenarios

# Seed before any random operation below (the shuffle depends on it).
# NOTE(review): presumably seeds `random`, NumPy and torch - confirm in scr.utils.
set_seed(42)

torch.set_float32_matmul_precision('medium')

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Read and shuffle the word list
word_list = read_words('data/words_250000_train.txt') # , limit=10000)
# word_list = read_words('data/250k.txt', limit=10000)

random.shuffle(word_list)

# Calculate frequencies and the maximum word length of the corpus
word_frequencies = calculate_word_frequencies(word_list)
char_frequency = calculate_char_frequencies(word_list)
max_word_length = max(len(word) for word in word_list)

##### Data Dir

In [2]:
# Run parameters. NUM_STRATIFIED_SAMPLES selects the dataset sub-directory
# (base_dataset_dir / str(NUM_STRATIFIED_SAMPLES)) and is embedded in the
# saved model's file name.
NUM_STRATIFIED_SAMPLES = 100 # This will be overwritten by Papermill

# Requested size passed to stratified_sample_by_length_and_uniqueness().
# The recorded run returned 1085 words, so this is not a hard upper bound.
NUM_WORD_SAMPLE = 1_000 # words for testing

# Forwarded to pl.Trainer(fast_dev_run=...).
FAST_DEV_RUN = False

# Forwarded to pl.Trainer(max_epochs=...).
MAX_EPOCH = 250

In [3]:
from pathlib import Path
from scr.custom_sampler import *

# NOTE(review): the absolute paths below are machine-specific; consider
# deriving them from a configurable root so the notebook runs elsewhere.

# Base dataset directory and the parquet directory for this sample size
base_dataset_dir = Path("/media/sayem/510B93E12554BBD1/dataset/")
stratified_samples_dir = base_dataset_dir / str(NUM_STRATIFIED_SAMPLES)
parquet_path = stratified_samples_dir / 'parquets'

# Create the parquet directory if it doesn't exist
parquet_path.mkdir(parents=True, exist_ok=True)
# parquet_valid_path.mkdir(parents=True, exist_ok=True)

# Define and create the directory for models
models_dir = Path("/home/sayem/Desktop/Hangman/models")
models_dir.mkdir(parents=True, exist_ok=True)

# Define the output directory and the logger directory
output_dir = Path("/home/sayem/Desktop/Hangman/training_outputs")
logger_dir = output_dir / "lightning_logs"

# Create the output and logger directories if they don't exist
output_dir.mkdir(parents=True, exist_ok=True)
logger_dir.mkdir(parents=True, exist_ok=True)

# File holding the held-out testing words
testing_words_file_path = stratified_samples_dir / "testing_words.txt"

# Bind the name up front: if the file is missing the cell stays best-effort
# (prints a message) instead of failing with NameError on the final print,
# and a stale value from an earlier run is not silently reused.
sampled_test_words = []

try:
    testing_word_list = read_words(testing_words_file_path)
    print(f"Length of the testing word list: {len(testing_word_list)}")
    sampled_test_words = stratified_sample_by_length_and_uniqueness(testing_word_list, NUM_WORD_SAMPLE)
    print(f"Sampled {len(sampled_test_words)} unique words for testing.")
except FileNotFoundError:
    print(f"File not found: {testing_words_file_path}")

print(len(sampled_test_words))

Length of the testing word list: 10048
Sampled 1085 unique words for testing.
1085


##### Dataset loading and train/validation split

In [4]:
parquet_path

PosixPath('/media/sayem/510B93E12554BBD1/dataset/100/parquets')

In [5]:
# Create the dataset directly from the saved parquet files under parquet_path
hangman_dataset = HangmanDataset(parquet_path)
# valid_dataset = HangmanDataset(parquet_valid_path)

# Redundant with the wildcard import of scr.utils in the first cell.
from scr.utils import *

# Split the dataset into training and validation sets (20% requested for validation).
# NOTE(review): no seed is passed here; whether the split is reproducible
# depends on HangmanDataset.split - confirm.
train_dataset, valid_dataset = hangman_dataset.split(test_size=0.2)

In [6]:
assert len(train_dataset) > len(valid_dataset)

In [7]:
# Access the first element in the dataset
first_element = hangman_dataset[0]  # Using an integer index
print(first_element)

{'game_id': 4032, 'word': 'alta', 'initial_state': ['____'], 'final_state': 'alta', 'guessed_states': ['____', '__t_', '_lt_', 'alta'], 'guessed_letters': ['t', 'l', 'a'], 'game_state': 'allMasked', 'difficulty': 'easy', 'outcome': 'win', 'word_length': 4, 'won': True}


In [8]:
# Access the first row of the first file (assuming at least one row exists)
tuple_element = hangman_dataset[(0, 0)]  # Using a tuple (file index, row index)
print(tuple_element)

{'game_id': 4032, 'word': 'alta', 'initial_state': ['____'], 'final_state': 'alta', 'guessed_states': ['____', '__t_', '_lt_', 'alta'], 'guessed_letters': ['t', 'l', 'a'], 'game_state': 'allMasked', 'difficulty': 'easy', 'outcome': 'win', 'word_length': 4, 'won': True}


In [9]:
import numpy as np

# Fix the global NumPy seed so the penalties drawn below are reproducible.
np.random.seed(42)

# Sequence lengths 1..20 inclusive.
sequence_lengths = np.arange(1, 21)

# One uniform draw in [0.05, 0.3) per sequence length, rounded to 2 decimals.
miss_penalties = np.random.uniform(0.05, 0.3, sequence_lengths.size).round(2)

# Map each sequence length to its synthetic miss penalty.
performance_metrics = {
    length: penalty
    for length, penalty in zip(sequence_lengths, miss_penalties)
}

# Display the generated performance metrics
performance_metrics

{1: 0.14,
 2: 0.29,
 3: 0.23,
 4: 0.2,
 5: 0.09,
 6: 0.09,
 7: 0.06,
 8: 0.27,
 9: 0.2,
 10: 0.23,
 11: 0.06,
 12: 0.29,
 13: 0.26,
 14: 0.1,
 15: 0.1,
 16: 0.1,
 17: 0.13,
 18: 0.18,
 19: 0.16,
 20: 0.12}

In [10]:
# Hard-coded composite scores keyed by an integer (2..21). Insertion order is
# kept exactly as originally written.
# NOTE(review): key 3 is ~0.0139 while every other entry is ~1.0; this looks
# like a possible typo for 1.0138... - confirm against the source of these numbers.
composite_score = {
    13: 1.0144969265237789,
    14: 1.0139399278380974,
    12: 1.0145889222021265,
    11: 1.0149790846522682,
    15: 1.0137005413389382,
    10: 1.0149863660317038,
    8: 0.9827807551208211,
    7: 0.9876452759289711,
    9: 1.014814988760513,
    5: 1.0138412270631227,
    4: 1.013670403951156,
    2: 1.0134486201164408,
    3: 0.01386993304338964,
    6: 1.0140813705881533,
    17: 1.0140782178690035,
    16: 1.0145310966095122,
    18: 1.0138246885811288,
    19: 1.0127181950956583,
    21: 1.012694844044745,
    20: 1.01259642560035,
}

In [11]:
# # Specify the batch size for the sampler
# batch_size = 3

# # Initialize the PerformanceBasedSampler
# sampler = PerformanceBasedSampler(dataset=hangman_dataset, \
#     performance_metrics=composite_score, batch_size=batch_size)

In [12]:
# next(iter(sampler))

In [13]:
# from torch.utils.data import DataLoader

# # Specify the batch size for the sampler
# batch_size = 128

# # Initialize the PerformanceBasedSampler
# sampler = PerformanceBasedSampler(dataset=hangman_dataset, \
#     performance_metrics=performance_metrics, batch_size=batch_size)

# # Initialize DataLoader with the HangmanDataset and PerformanceBasedSampler
# data_loader = DataLoader(
#     dataset=hangman_dataset,
#     batch_sampler=sampler,  # Correct usage for custom batch handling
#     collate_fn=new_custom_collate_fn,  # Custom collate function, if needed
# )

# import torch  # Ensure torch is imported

# for batch in tqdm(data_loader):
#     states = batch['guessed_states']
#     guesses = batch['guessed_letters']
#     max_seq_length = batch['max_seq_len']
#     original_seq_lengths = batch['original_seq_lengths']


#     batch_features, batch_missed_chars = process_batch_of_games(
#                 states, guesses, char_frequency,
#                     max_word_length,
#                     max_seq_length)

#     print(f"{batch_features.shape}")

#     break


In [14]:
# hangman_dataset[(0, 34)]

In [15]:
from scr.data_module import *

# Batch size handed to the data module for the exploratory cells that follow
initial_batch_size = 1024 # Set your initial batch size

# Build the data module from the train/validation split, the batch size and
# the collate function `new_custom_collate_fn` (brought in by a wildcard import).
data_module = HangmanDataModule(train_dataset, valid_dataset, 
                                initial_batch_size, 
                                new_custom_collate_fn)

In [16]:
# performance_metrics_dict = {k: v for d in combined_eval_metrics for k, v in d.items()}

# # # # # print("Converted Performance Metrics Dictionary:", performance_metrics_dict)

# data_module.update_performance_metrics(composite_score)

In [17]:
from tqdm import tqdm

# Build the training DataLoader from the data module.
data_loader = data_module.train_dataloader()

# Number of distinct sequence lengths seen in each batch
batch_seq_length_diversity = []

for batch in tqdm(data_loader):

    states = batch['guessed_states']
    guesses = batch['guessed_letters']
    max_seq_length = batch['max_seq_len']
    original_seq_lengths = batch['original_seq_lengths']

    # Distinct sequence lengths in the current batch.
    # NOTE(review): assumes the lengths are hashable scalars (the recorded
    # output shows plain ints) - a set of tensors would not de-duplicate.
    unique_seq_lengths = set(original_seq_lengths)
    batch_seq_length_diversity.append(len(unique_seq_lengths))

    print(f"First state in batch: {states[0]}")
    # print(f"Original sequence lengths in batch: {original_seq_lengths}")
    print(f"Unique sequence lengths in batch: {unique_seq_lengths}")
    print(f"Diversity (number of unique sequence lengths) in batch: {len(unique_seq_lengths)}")

    print()

# Overall diversity; guarded so an empty loader reports 0 instead of raising
# ZeroDivisionError.
average_seq_length_diversity = (
    sum(batch_seq_length_diversity) / len(batch_seq_length_diversity)
    if batch_seq_length_diversity else 0
)
print(f"Average diversity of sequence lengths across batches: {average_seq_length_diversity}")


  0%|          | 0/7 [00:00<?, ?it/s]

100%|██████████| 7/7 [00:01<00:00,  3.59it/s]

First state in batch: ['____c__s__c', '____c__s__c', '____c__s__c', '____c__s__c', '____c__s_ic', '____c__s_ic', '____c__s_ic', '____c__s_ic', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21}
Diversity (number of unique sequence lengths) in batch: 20

First state in batch: ['_________a__', '_________a__', '_________a__', '_________a__', '_________a__', '_________a__', '_r___r___a__', '_r___r___a__', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}
Diversity (number of unique sequence lengths) in batch: 19

First state in batch: ['_____________________', '_r_________r_________', '_r_________r________s', '_r___n__n__r___n__n_s', '_ri__n_in__ri__n_in_s', '_ri__n_in__ri__n_in_s', '_ri__n_in__ri__n_in_s', '_ri__n_ine_ri__n_ines', 'bri__n_inebri__n_ines', 'bri__n_inebri__n_ines', 'br




In [18]:
from tqdm import tqdm

# Word-length argument passed to analyze_and_extract_features below.
maximum_word_length = 29
# Assuming data_module is an instance of HangmanDataModule
data_loader = data_module.train_dataloader()  # Call the method to get the DataLoader instance

# Human-readable descriptions of the extracted features.
# NOTE(review): this list is not referenced anywhere in this cell, and it has
# 14 entries while the recorded output shows 15 features (the last one,
# `vowel_consonant_ratio`, has no description here).
feature_descriptions = [
    "Uncovered Progress: Fraction of the word uncovered",
    "Missed Guesses: Total number of incorrect guesses",
    "Duplicate Guesses: Total number of duplicate guesses",
    "Incorrect Guesses: Total number of guesses that didn't reveal any new letters",
    "Endgame Proximity: Proportion of letters guessed to the maximum word length",
    "Guess Diversity: Diversity of guessed letters",
    "Initial Letter Reveal: Whether the first letter was revealed (1) or not (0)",
    "Critical Letter Uncover Rate: Rate at which critical vowels (aeiou) were uncovered",
    "Guess Efficiency: Proportion of correct guesses to total guesses made",
    "Overall Success Rate: Proportion of successful guesses to total guesses",
    "Remaining Attempts: Number of attempts left after all guesses",
    "Late Game Success Rate: Success rate in the second half of the game",
    "Longest Success Streak: Longest streak of consecutive correct guesses",
    "Final State Achieved: Whether the final state (the complete word) was achieved (1) or not (0)"
]

for batch in tqdm(data_loader):
    print(batch.keys())  # To verify the keys in each batch
    # NOTE: `game_states` is also the name of the state-label list used by
    # encode_game_state in a later cell; the two cells rebind the same global.
    game_states = batch['guessed_states']
    guessed_letters = batch['guessed_letters']

    # Analyze the first game in the batch for demonstration
    features = analyze_and_extract_features(game_states[0], guessed_letters[0],  maximum_word_length)

    # print("\nFirst game states in batch:")
    # for state in game_states[0]:
    #     print(state)

    print(f"{game_states[0]}")
    print(f"{guessed_letters[0]}")

    # print("\nGuessed letters for the first game in batch:")
    # print(guessed_letters[0])

    print("\nExtracted Features for the first game in batch:")
    for feature, value in features.items():
        # Check if the value is a tensor or a NumPy array and then call .item()
        if hasattr(value, 'item'):
            print(f"{feature}: {value.item()}")
        else:
            # Directly print the value if it's not a tensor/NumPy array
            print(f"{feature}: {value}")

    # print(features.shape)


    break  # Remove this break to analyze all batches


  0%|          | 0/7 [00:01<?, ?it/s]

dict_keys(['guessed_states', 'guessed_letters', 'max_seq_len', 'original_seq_lengths', 'difficulty', 'outcome', 'word_length', 'won', 'initial_state', 'final_state', 'game_state'])
['____', '____', '____', '____', 'p_p_', 'p_p_', 'p_p_', 'p_p_', '', '', '', '', '', '', '', '', '', '', '', '']
['o', 'c', 'q', 'p', 'y', 'w', 'j', '', '', '', '', '', '', '', '', '', '', '', '']

Extracted Features for the first game in batch:
uncovered_progress: 1
missed_guesses: 7
duplicate_guesses: 0
incorrect_guesses: 7
endgame_proximity: 0.2413793103448276
guess_diversity: 0.42105263157894735
initial_letter_reveal: 0
critical_letter_uncover_rate: 0
guess_efficiency: 0.0
overall_success_rate: 0.0
remaining_attempts: 0
late_game_success_rate: 0.0
longest_success_streak: 0
final_state_achieved: 0
vowel_consonant_ratio: 0.16666666666666666





In [19]:
# Assuming data_module is an instance of HangmanDataModule.
# `data_loader` is bound to the *method* itself here (no parentheses); the
# DataLoader is only created when it is called in the `for` statement below.
data_loader = data_module.train_dataloader  # bound method, not a DataLoader instance

for batch in tqdm(data_loader()):
    states = batch['guessed_states']
    guesses = batch['guessed_letters']
    max_seq_length = batch['max_seq_len']
    original_seq_lengths = batch['original_seq_lengths']

    # Turn the batch of games into feature / missed-character tensors
    batch_features, batch_missed_chars = process_batch_of_games(
        states, guesses, char_frequency, max_word_length, max_seq_length)

    print(f"Batch features shape: {batch_features.shape}")

    # print(states[0])

    # print(original_seq_lengths[0])

    # Only the first batch is needed for this shape check.
    break


  0%|          | 0/7 [00:02<?, ?it/s]

Batch features shape: torch.Size([1024, 21, 160])





In [20]:
STOP

NameError: name 'STOP' is not defined

In [None]:
from tqdm import tqdm
from collections import Counter

# Assuming data_module is an instance of HangmanDataModule
data_loader = data_module.train_dataloader()  # Call the method to get the DataLoader instance

# Per-batch counts of distinct sequence lengths and distinct word lengths
batch_seq_length_diversity = []
batch_word_length_diversity = []

for batch in tqdm(data_loader):
    # Extract original_seq_lengths and word_length from the batch
    original_seq_lengths = batch['original_seq_lengths']
    word_lengths = batch['word_length']

    # Distinct sequence lengths in the current batch
    # NOTE(review): set()/Counter assume hashable scalar entries - confirm
    # the collate function does not return tensors for these keys.
    unique_seq_lengths = set(original_seq_lengths)
    batch_seq_length_diversity.append(len(unique_seq_lengths))

    # Distinct word lengths in the current batch
    unique_word_lengths = set(word_lengths)
    batch_word_length_diversity.append(len(unique_word_lengths))

    # Count the number of examples under each unique sequence length
    seq_len_counts = Counter(original_seq_lengths)
    # Count the number of examples under each unique word length
    word_len_counts = Counter(word_lengths)

    print(f"First state in batch: {batch['guessed_states'][0]}")
    print(f"Unique sequence lengths in batch: {unique_seq_lengths}")
    print(f"Diversity (number of unique sequence lengths) in batch: {len(unique_seq_lengths)}")
    print(f"Number of examples under each unique sequence length in batch: {seq_len_counts}")
    print(f"Unique word lengths in batch: {unique_word_lengths}")
    print(f"Diversity (number of unique word lengths) in batch: {len(unique_word_lengths)}")
    print(f"Number of examples under each unique word length in batch: {word_len_counts}")

    print()

# Overall averages; each falls back to 0 when no batch was seen
average_seq_length_diversity = sum(batch_seq_length_diversity) / len(batch_seq_length_diversity) if batch_seq_length_diversity else 0
average_word_length_diversity = sum(batch_word_length_diversity) / len(batch_word_length_diversity) if batch_word_length_diversity else 0
print(f"Average diversity of sequence lengths across batches: {average_seq_length_diversity}")
print(f"Average diversity of word lengths across batches: {average_word_length_diversity}")

In [None]:
STOP

In [None]:
import numpy as np
from tqdm import tqdm

total_samples_processed = 0
batch_sizes = []

# Build the loader explicitly instead of calling the global `data_loader`:
# earlier cells rebind that name sometimes to the bound method and sometimes
# to a DataLoader instance, and calling an instance raises TypeError.
for batch in tqdm(data_module.train_dataloader()):
    # Batch is a dictionary; the number of games is the length of 'guessed_states'
    batch_size_current = len(batch['guessed_states'])
    total_samples_processed += batch_size_current
    batch_sizes.append(batch_size_current)

In [None]:
# Expected values derived from the dataset size and the configured batch size.
# NOTE(review): these formulas assume the loader keeps the last partial batch
# (no drop_last) - confirm in HangmanDataModule.
num_train_samples = len(data_module.train_dataset)
expected_num_batches = int(np.ceil(num_train_samples / data_module.batch_size))  # int, so it prints as 7 not 7.0
actual_num_batches = len(batch_sizes)
final_batch_size = num_train_samples % data_module.batch_size or data_module.batch_size
# Guard against an empty run so the report prints instead of raising IndexError
actual_final_batch_size = batch_sizes[-1] if batch_sizes else 0

# Sanity checks
print(f"Expected number of batches: {expected_num_batches}")
print(f"Actual number of batches: {actual_num_batches}")
print(f"Expected final batch size: {final_batch_size}")
print(f"Actual final batch size: {actual_final_batch_size}")
print(f"Total samples in dataset: {num_train_samples}")
print(f"Total samples processed: {total_samples_processed}")

# Verifying if the dataset was fully covered
assert total_samples_processed == num_train_samples, "Mismatch in the number of processed samples and dataset size."

In [None]:
# Bound method (not yet a DataLoader); it is called in the loop header below.
data_loader = data_module.val_dataloader

for batch in data_loader():
    states = batch['guessed_states']
    guesses = batch['guessed_letters']
    max_seq_length = batch['max_seq_len']
    original_seq_lengths = batch['original_seq_lengths']


    # Same feature extraction as for the training loader, on a validation batch
    batch_features, batch_missed_chars = process_batch_of_games(
                states, guesses, char_frequency,
                    max_word_length,
                    max_seq_length)

    print(f"{batch_features.shape}")

    # Only the first batch is inspected.
    break

In [None]:
 dataset_size = len(data_module.train_dataloader().dataset)

 dataset_size

In [None]:
batch_size = data_module.train_dataloader().batch_size

In [None]:
batch_size

In [None]:
# Tunable hyperparameters
LEARNING_RATE = 0.01
HIDDEN_DIM = 32 ### 32
NUM_LAYERS = 3
EMBEDDING_DIM = 30

In [None]:
from scr.encoder import *
from scr.trainer_ import *
from scr.decoder import SimpleLSTM

# Fixed parameters
# NOTE: this overrides the corpus-derived `max_word_length` computed in the
# first cell.
max_word_length = 29  # Maximum word length
num_embeddings = 28   # Vocabulary size (fixed, based on unique characters in the game)
# NOTE(review): the recorded batch-feature shape earlier in this notebook is
# (1024, 21, 160), i.e. 160 features per state, not 154 - confirm this value
# matches what process_batch_of_games currently produces.
num_features = 154     # Number of features per character
missed_char_dim = 28  # Additional dimension for missed characters
output_dim = 28       # Output dimension for the model

char_feature_dim = 5  # Features per character
# NOTE(review): the hyperparameter cell defines EMBEDDING_DIM = 30, which is
# not used here - confirm whether 50 or EMBEDDING_DIM is intended.
embedding_dim = 50
# Whatever remains after the per-character features is treated as
# state-level features.
additional_state_features = num_features \
    - max_word_length * char_feature_dim   # Additional features per state

# Fixed typos in the message ("Addition state fetatures").
print(f"Additional state features: {additional_state_features}")

# Initialize the Encoder
encoder = Encoder(num_embeddings, embedding_dim, max_word_length, \
    char_feature_dim, additional_state_features)

# Decoder input width: one embedding per character slot plus the state features
input_dim = max_word_length * embedding_dim + additional_state_features
# Initialize the SimpleLSTM decoder
decoder = SimpleLSTM(input_dim=input_dim, 
                     hidden_dim=HIDDEN_DIM, 
                     output_dim=output_dim, 
                     num_layers=NUM_LAYERS,
                     missed_char_dim=missed_char_dim)

# Other parts of the code flow...

In [None]:
# Initialize the HangmanModel
# Wrap encoder and decoder in the Lightning module; the sampled held-out words
# are handed over as `test_words`.
lightning_model = HangmanModel(encoder, decoder, 
                    LEARNING_RATE, char_frequency, 
                    max_word_length, test_words=sampled_test_words)

In [None]:
lightning_model.optimizer_type

In [None]:
from scr.data_module import *

# NOTE(review): this rebinds the global `data_module` with batch size 1 and
# `custom_collate_fn` (the earlier instance used 1024 and
# `new_custom_collate_fn`). Later cells, including training, use whichever
# instance was bound last - confirm that is intended.
initial_batch_size = 1 # Set your initial batch size

# Initialize Data Module with the required arguments
data_module = HangmanDataModule(train_dataset, valid_dataset, 
                                initial_batch_size, 
                                custom_collate_fn)
                                # performance_metrics=None)

In [None]:
from scr.custom_sampler import *
from scr.dataset import *

In [None]:
# from scr.feature_engineering import *


# Walk through one training batch and print every intermediate artefact.
for batch in data_module.train_dataloader():
    print(batch.keys())

    states = batch['guessed_states']
    guesses = batch['guessed_letters']
    max_seq_length = batch['max_seq_len']
    original_seq_lengths = batch['original_seq_lengths']

    batch_features, batch_missed_chars = process_batch_of_games(
                                    states, guesses, char_frequency,
                                    max_word_length,
                                    max_seq_length)

    print(f"Batch features shape {batch_features.shape}\n")

    encoded_guess = pad_and_reshape_labels(guesses, max_seq_length)

    print(f"states: {states}")
    print(f"Guesses: {guesses}\n")
    # The original line was labelled "Guesses shape" but printed the guesses
    # again; report the number of guess sequences in the batch instead.
    print(f"Number of guess sequences: {len(guesses)}\n")
    # print(f"batch missed chars: {batch_missed_chars}\n")

    # print(f"Encoded guess: {encoded_guess}\n")

    print(f"batch missed chars shape: {batch_missed_chars.shape}\n")

    print(f"Encoded guess shape: {encoded_guess.shape}\n")

    # Convert the batch to characters
    missed_chars = batch_to_chars(batch_missed_chars)

    print(f"batch missed chars : So far present characters (that NN should not guess again): {missed_chars}\n")

    # Only the first batch is inspected.
    break

In [None]:
STOP

In [None]:
## callbacks
from pytorch_lightning.callbacks import Callback, EarlyStopping
from scr.custom_callbacks import *

# Stop training when the logged metric 'win_rate' stops improving.
# NOTE(review): patience=200 against MAX_EPOCH = 250 means this callback can
# only trigger very late in a run - confirm that is intended.
early_stop_callback = EarlyStopping(
    monitor='win_rate',
    min_delta=0.00,
    patience=200,
    verbose=True,
    mode='max'  # Maximize the win rate
)

# Project-defined callback from scr.custom_callbacks.
step_level_early_stopping = StepLevelEarlyStopping(
    monitor='val_miss_penalty', # Metric to monitor
    min_delta=0.0,              # Minimum change to qualify as an improvement
    patience=5                  # Number of steps with no improvement after which training will be stopped
)

In [None]:
from pytorch_lightning.profilers import SimpleProfiler
import pytorch_lightning as pl
from scr.custom_callbacks import *
from scr.dataset import *
from scr.data_module import *
from scr.trainer_ import *

# Release cached GPU memory held by earlier exploratory cells.
torch.cuda.empty_cache()

# # Create Callbacks
# loss_logging_callback = LossLoggingCallback()

# , SchedulerSetupCallback()] # , loss_logging_callback]
callbacks = [early_stop_callback, step_level_early_stopping]

# FAST_DEV_RUN is deliberately NOT reassigned here: it is a run parameter set
# in the configuration cell, and overriding it in this cell would silently
# discard a value injected there.

# # Calculate the minimum percentage of validation batches
# min_val_batches = 1 / len(data_module.val_dataloader())

# Create Trainer with Callbacks
trainer = pl.Trainer(
    default_root_dir=output_dir,
    fast_dev_run=FAST_DEV_RUN,
    max_epochs=MAX_EPOCH,
    callbacks=callbacks,
    num_sanity_val_steps=0,               # skip the pre-training validation sanity check
    reload_dataloaders_every_n_epochs=1,  # rebuild the dataloaders every epoch
    enable_progress_bar=True
    # val_check_interval=0.5
    # limit_train_batches=2,  # Limit the number of training batches to 2
    # limit_val_batches=2     # Limit the number of validation batches to 2
)

# # # # # print(f"Running for {NUM_STRATIFIED_SAMPLES} samples...")
# # # print()
# # # # # # # Assuming combined_eval_metrics is a list of dictionaries
# combined_eval_metrics = trainer.validate(model=lightning_model, datamodule=data_module)

# # performance_metrics_dict = {k: v for d in combined_eval_metrics for k, v in d.items()}

# # # # # # print("Converted Performance Metrics Dictionary:", performance_metrics_dict)

# # trainer.datamodule.update_performance_metrics(combined_eval_metrics)

# data_module.update_performance_metrics(performance_metrics_dict)

In [None]:
mo

In [None]:
STOP

In [None]:
# Sample data
# Hand-written example game: eight successive board states and the letter
# guessed at each step.
sample_game = {
    'guessed_states': [
        '_d__d__e__', '_d__d__e__', '_d__d__e__', '_d__d_me__',
        '_d__d_me__', '_d__d_me__', '_d__d_men_', '_d__d_men_',
    ],
    'guessed_letters': ['c', 'r', 'm', 'v', 'i', 'n', 'w', 'o'],
}

In [None]:
from scr.feature_engineering import *

In [None]:
# Parameters for the single-game demo below (these rebind the notebook-level
# `max_word_length` / `max_seq_length`).
max_word_length = 29  # Set as per your model's requirement
max_seq_length = 10  # Set as per your model's requirement

# # Dummy character frequency (example, create as per your data)
# char_frequency = {char: 1 for char in 'abcdefghijklmnopqrstuvwxyz'}

# Encode the sample game; the function returns a pair that is unpacked into
# the character-sequence tensor and the missed-characters tensor.
char_sequence_tensor, missed_chars_tensor = process_game_sequence(
    game_states=sample_game['guessed_states'], 
    guessed_letters_sequence=sample_game['guessed_letters'],
    char_frequency=char_frequency, 
    max_word_length=max_word_length,
    max_seq_length=max_seq_length
)

# Print the shape of each tensor separately
print(f"Character Sequence Tensor Shape: {char_sequence_tensor.shape}")
print(f"Missed Chars Tensor Shape: {missed_chars_tensor.shape}")

In [None]:
from scr.feature_engineering import *

In [None]:
# Sample data for a batch of games
# Minimal batch: a single game with one board state and one guessed letter.
# The commented-out entries are additional games that are currently disabled.
batch_sample_games = {
    'guessed_states_batch': [
        ['_d__d__e__'],
        # ['_b__b__t__'],
        # ['__ll___p__'],
    ],
    
    'guessed_letters_batch': [
        ['c'],
        # ['a'],
        # ['i']
    ]
}

In [None]:
# # Sample data for a batch of games
# batch_sample_games = {
#     'guessed_states_batch': [
#         ['_d__d__e__', '_d__d__e__', '_d__d__e__', '_d__d_me__', '_d__d_me__', '_d__d_me__', '_d__d_men_', '_d__d_men_'],
#         ['_b__b__t__', '_b__b__t__', '_b__b__t__', '_b__b_t__', '_b__b_te__', '_b__b_te__', '_b__b_test', '_b__b_test'],
#         ['__ll___p__', '__ll___p__', '__ll___p__', '__ll___p__', '__ll___p__', '__ll__op_', '__ll__op_', '__ll__op_']
#     ],
    
#     'guessed_letters_batch': [
#         ['c', 'r', 'm', 'v', 'i', 'n', 'w', 'o'],
#         ['a', 's', 'e', 'f', 't', 'r', 'n', 'u'],
#         ['i', 'q', 'r', 't', 'p', 'o', 'a', 's']
#     ]
# }

# Define parameters
max_word_length = 29  # Set as per your model's requirement
# NOTE(review): this variable is not used by the call below, which passes the
# literal `max_seq_length=1` (matching the one-state sample batch) - confirm
# which value is intended before re-enabling the longer sample games.
max_seq_length = 10  # Set as per your model's requirement

# # Dummy character frequency (example, create as per your data)
# char_frequency = {char: 1 for char in 'abcdefghijklmnopqrstuvwxyz'}

# Encode the sample batch into feature and missed-character tensors
batch_features, batch_missed_chars = process_batch_of_games(
    batch_sample_games['guessed_states_batch'],
    batch_sample_games['guessed_letters_batch'],
    char_frequency,
    max_word_length,
    max_seq_length=1
)

print()
# Print shapes for sanity check
print(f"Batch Features Shape: {batch_features.shape}")
print(f"Batch Missed Chars Shape: {batch_missed_chars.shape}")

In [None]:
batch_sample_games['guessed_states_batch'][0]

In [None]:
batch_sample_games['guessed_letters_batch'][0]

In [None]:
# Sample data
# Example game used by the guess-outcome analysis below: eight board states
# paired with the letter guessed at each step.
sample_game = {
    'guessed_states': [
        '_d__d__e__', '_d__d__e__', '_d__d__e__', '_d__d_me__',
        '_d__d_me__', '_d__d_me__', '_d__d_men_', '_d__d_men_',
    ],
    'guessed_letters': ['c', 'r', 'm', 'v', 'i', 'n', 'w', 'o'],
}

In [None]:
# Analyse the sample game; the result is unpacked into an overall success
# rate and a per-guess outcome object (both displayed in the next cells).
overall_sucess_rate, guess_outcome = analyze_guess_outcomes(sample_game['guessed_states'], \
    sample_game['guessed_letters'], maximum_word_length=29)

In [None]:
overall_sucess_rate

In [None]:
guess_outcome

In [None]:
# Ordered game-phase labels and their position in the one-hot vector.
game_states = ['allMasked', 'early', 'quarterRevealed', 'midRevealed', 
               'midLateRevealed', 'lateRevealed', 'nearEnd']
game_state_to_idx = {state: idx for idx, state in enumerate(game_states)}

def encode_game_state(game_state):
    """One-hot encode a game-phase label.

    Args:
        game_state: one of the labels in ``game_state_to_idx``.

    Returns:
        A list of ints, one slot per known label, with a single 1 at the
        label's index. An unknown label yields an all-zero vector.
    """
    # Size the vector from the lookup table rather than the global
    # `game_states`: another cell in this notebook rebinds `game_states` to a
    # batch's guessed states, which would change the vector length.
    state_vector = [0] * len(game_state_to_idx)
    state_index = game_state_to_idx.get(game_state)
    if state_index is not None:
        state_vector[state_index] = 1
    return state_vector

In [None]:
from scr.data_module import *

# NOTE(review): this rebinds the global `data_module` with batch size 1;
# later cells (including training) use whichever instance was bound last.
initial_batch_size = 1  # Set your initial batch size

# Initialize Data Module with the required arguments
data_module = HangmanDataModule(train_dataset, valid_dataset, 
                                initial_batch_size, 
                                custom_collate_fn)
                                # performance_metrics=None)

# Inspect the first training batch in full.
for batch in data_module.train_dataloader():
    states = batch['guessed_states']
    guesses = batch['guessed_letters']
    max_seq_length = batch['max_seq_len']
    original_seq_lengths = batch['original_seq_lengths']

    # Print debug information
    print("DEBUG INFO:")
    print(f"Batch Word Lengths: {batch['word_length']}")
    print(f"Batch Difficulties: {batch['difficulty']}")
    print(f"Batch Outcomes: {batch['outcome']}")
    print(f"Batch Won Flags: {batch['won']}")

    # Print the entire batch
    print("Batch Contents:")
    for key, value in batch.items():
        print(f"{key}: {value}")

    # `guesses` was missing from this call, which shifted every positional
    # argument by one; the argument order now matches every other call to
    # process_batch_of_games in this notebook.
    batch_features, batch_missed_chars = process_batch_of_games(
        states, guesses, char_frequency,
        max_word_length,
        max_seq_length)

    print()
    print(f"{batch_features.shape}")
    print(f"{batch_missed_chars.shape}")
    break

In [None]:
miss_char.shape

In [None]:
# # # # Validate the model (if needed)
# trainer.validate(model=lightning_model, datamodule=data_module)
# NOTE(review): `data_module` here is whichever instance was bound last; the
# most recent cells above create it with batch size 1 - confirm before a real run.
print(f"Training Begin for {NUM_STRATIFIED_SAMPLES} words: {len(train_dataset)} Games")
# Fit the model
trainer.fit(lightning_model, data_module)

# # Optionally print the profiler summary
# print(profiler.summary())

# Save the entire model object (torch.save pickles the whole module rather
# than only its state_dict).
trained_model_file = models_dir / f"{NUM_STRATIFIED_SAMPLES}_trained_model.pth"
torch.save(lightning_model, trained_model_file)
print(f"Model saved at {trained_model_file}")

In [None]:
batch_features.shape

In [None]:
batch_missed_chars.shape

In [None]:
# NOTE(review): `sample` is not defined in any visible cell of this notebook
# (the example game above is named `sample_game`); unless a wildcard import
# provides it, this call raises NameError - confirm.
features, missed_chars = build_enhanced_feature_set(sample, \
    char_frequency, max_word_length)

In [None]:
features.shape

In [None]:
STOP

In [None]:
import torch
import torch.nn as nn

# Sample dimensions
missed_char_dim = 28  # Dimension of missed character vector
hidden_dim = 10       # Arbitrary hidden dimension for output

class TestModel(nn.Module):
    def __init__(self):
        super(TestModel, self).__init__()
        self.miss_linear = nn.Linear(missed_char_dim, hidden_dim)

    def forward(self, missed_chars):
        missed_chars_processed = self.miss_linear(missed_chars)
        return missed_chars_processed

# Create model
model = TestModel()

# Sample data: a batch of size 1 with 28 missed character indicators
# Creating a sample input with 28 values, each being 0 or 1
missed_chars = torch.tensor([0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, \
    1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1], dtype=torch.float)

# Forward pass
output = model(missed_chars)
output.shape

In [None]:
def calculate_difficulty_score(metrics, weight_win_rate=1.0, weight_miss_penalty=0.5):
    """
    Combine win rate and miss penalty into a single difficulty score.

    :param metrics: Dictionary read for 'performance_wins' (treated as a percentage,
        0-100) and 'miss_penalty_avg' (used as-is); missing keys default to 0.
    :param weight_win_rate: Multiplier applied to the inverted, normalized win rate.
    :param weight_miss_penalty: Multiplier applied to the miss penalty.
    :return: Weighted sum of the two terms; higher means more difficult.
    """
    win_rate = metrics.get('performance_wins', 0)
    miss_penalty = metrics.get('miss_penalty_avg', 0)

    # Invert the win rate: a lower win rate indicates higher difficulty.
    normalized_win_rate = (100 - win_rate) / 100

    # 'performance_total_attempts_used' is intentionally not read: it never
    # contributed to the score.
    return weight_win_rate * normalized_win_rate + weight_miss_penalty * miss_penalty

# Example usage: 0% wins combined with a miss penalty of 1.
metrics = {
    'performance_wins': 0,  # Example values
    # 'performance_total_attempts_used': 6,
    'miss_penalty_avg': 1
}

score = calculate_difficulty_score(metrics)
print("Difficulty Score:", score)

# Disabled follow-up: threshold the score to decide whether this word length
# becomes a target pair.
# if score >= 0.001:  # Define this threshold based on your game's difficulty scale
#     target_pairs.append((int(word_length),))


In [None]:
def calculate_difficulty_score(metrics, weight_win_rate=1.0, weight_miss_penalty=0.5):
    """
    Score how difficult a group of games was from its win rate and miss penalty.

    :param metrics: Dictionary read for 'performance_wins' (treated as a percentage,
        0-100) and 'miss_penalty_avg' (used as-is); missing keys default to 0.
    :param weight_win_rate: Weight for the inverted win-rate term (default 1.0).
    :param weight_miss_penalty: Weight for the miss-penalty term (default 0.5).
    :return: ``weight_win_rate * (100 - wins) / 100 + weight_miss_penalty * miss_penalty``.
    """
    # Extracting the metrics
    win_rate = metrics.get('performance_wins', 0)
    miss_penalty = metrics.get('miss_penalty_avg', 0)

    # Invert win rate, since a lower win rate indicates higher difficulty.
    normalized_win_rate = (100 - win_rate) / 100

    composite_score = (
        weight_win_rate * normalized_win_rate +
        weight_miss_penalty * miss_penalty
    )
    return composite_score


# Best-case scenario metrics: both terms of the score are zero for these inputs.
best_case_metrics = {
    'performance_wins': 100,  # Maximum win rate
    'miss_penalty_avg': 0     # Minimum miss penalty
}

best_case_score = calculate_difficulty_score(best_case_metrics)
print("Best-Case Difficulty Score:", best_case_score)

In [None]:
# Worst-case scenario metrics: both terms of the score are at their maximum for these inputs.
worst_case_metrics = {
    'performance_wins': 0,  # Minimum win rate
    'miss_penalty_avg': 1   # Maximum miss penalty
}

worst_case_score = calculate_difficulty_score(worst_case_metrics)
print("Worst-Case Difficulty Score:", worst_case_score)

In [None]:
def calculate_difficulty_score(metrics, weight_win_rate=1.0, weight_miss_penalty=0.5):
    """
    Weighted difficulty score built from a win rate and a miss penalty.

    :param metrics: Mapping read for 'performance_wins' and 'miss_penalty_avg';
        either key falls back to 0 when absent.
    :param weight_win_rate: Factor applied to the inverted win rate.
    :param weight_miss_penalty: Factor applied to the miss penalty.
    :return: Sum of the two weighted terms.
    """
    wins_pct = metrics.get('performance_wins', 0)
    penalty = metrics.get('miss_penalty_avg', 0)

    # Fewer wins means a harder group, so the win rate is flipped and scaled by 100.
    loss_share = (100 - wins_pct) / 100

    win_term = weight_win_rate * loss_share
    penalty_term = weight_miss_penalty * penalty  # penalty is used without rescaling
    return win_term + penalty_term

# Example usage with explicitly passed weights
metrics = {
    'performance_wins': 0,  # Example values
    'miss_penalty_avg': 1
}

# Custom weights (these happen to equal the function's defaults)
custom_weight_win_rate = 1.0
custom_weight_miss_penalty = 0.5

score = calculate_difficulty_score(metrics, custom_weight_win_rate, custom_weight_miss_penalty)
print("Difficulty Score with Custom Weights:", score)

In [None]:
# Hand-written example of per-word-length aggregates, keyed by word length.
aggregated_metrics = {
    5: {
        'total_games': 100,
        'wins': 60,
        'total_attempts_used': 300,
        'win_rate': 0.6,
        'average_attempts_used': 3.0,
        'miss_penalty': 0.02,
    },
    6: {
        'total_games': 150,
        'wins': 90,
        'total_attempts_used': 450,
        'win_rate': 0.6,
        'average_attempts_used': 3.0,
        'miss_penalty': 0.03,
    },
}

# Walk the nested dictionary and print each metric indented under its word length.
for word_len, metrics in aggregated_metrics.items():
    print(f"Word Length: {word_len}")
    for key, value in metrics.items():
        print(f"  {key}: {value}")

In [None]:
def select_target_pairs(performance_metrics, batch_size, max_win_rate=20, min_avg_attempts=4):
    """
    Pick the word lengths whose metrics mark them as hard, capped at ``batch_size``.

    A word length is selected when its 'win_rate' is at most ``max_win_rate`` AND its
    'average_attempts_used' is at least ``min_avg_attempts``. Missing keys count as 0.

    NOTE(review): the default ``max_win_rate`` of 20 suggests 'win_rate' is expected on a
    0-100 scale; a fractional win rate (e.g. 0.6) always passes that check — confirm
    which scale callers use.

    :param performance_metrics: Mapping of word length -> metrics dict. ``None`` or an
        empty mapping yields an empty list.
    :param batch_size: Maximum number of word lengths to return (applied as a slice).
    :param max_win_rate: Inclusive upper bound on 'win_rate' for selection.
    :param min_avg_attempts: Inclusive lower bound on 'average_attempts_used'.
    :return: List of selected word lengths, in the mapping's iteration order.
    """
    if not performance_metrics:
        return []

    target_pairs = [
        word_length
        for word_length, metrics in performance_metrics.items()
        if metrics.get('win_rate', 0) <= max_win_rate
        and metrics.get('average_attempts_used', 0) >= min_avg_attempts
    ]

    return target_pairs[:batch_size]

# Test data
# NOTE(review): 'win_rate' is 0.6 here (apparently a fraction), while select_target_pairs
# compares it against 20; as written both word lengths pass the win-rate check — confirm
# which scale is intended.
performance_metrics = {
    5: {'total_games': 100, 'wins': 60, 'total_attempts_used': 300, 'win_rate': 0.6, 
        'average_attempts_used': 4.0, 'miss_penalty': 0.02},
    6: {'total_games': 150, 'wins': 90, 'total_attempts_used': 450, 'win_rate': 0.6, 
        'average_attempts_used': 4.0, 'miss_penalty': 0.03}
}

# Test the function
batch_size = 10
target_pairs = select_target_pairs(performance_metrics, batch_size)
print("Target Pairs:", target_pairs)

In [None]:
# def calculate_miss_penalty(outputs, miss_chars):
#     if outputs.numel() == 0:
#         print("Empty outputs tensor")
#         return torch.tensor(0.0, device=outputs.device)

#     miss_penalty = torch.sum(outputs * miss_chars) / outputs.numel()
#     return miss_penalty

In [None]:
from scr.feature_engineering import *

In [None]:
# Encode a sample word with encode_word and display the result.
word = 'cat'

encoded = encode_word(word)

encoded

In [None]:
# Get the missed-characters output for `word` (presumably a tensor — confirm against
# get_missed_characters).
miss_chars = get_missed_characters(word)

In [None]:
# Display the value returned by get_missed_characters(word).
miss_chars

In [None]:
# Build a random stand-in for model output on the word 'cat'.
word = 'cat'

# Dimensions of the stand-in tensor
batch_size = 1               # one word in the batch
max_seq_len = len(word)      # one position per letter of the word
num_characters = 28          # size of the character vocabulary

# Uniform random values in [0, 1), shaped (batch, sequence, vocabulary).
outputs = torch.rand(batch_size, max_seq_len, num_characters)

(outputs.shape, outputs)

In [None]:
# probabilities = F.softmax(outputs, dim=-1)

# probabilities

In [None]:
# Two dummy output tensors for the word 'cat':
#   correct_outputs - all zeros except a 1 at the true letter of each position
#   wrong_outputs   - all ones except a 0 at the true letter of each position
correct_outputs = torch.zeros((batch_size, max_seq_len, num_characters))
wrong_outputs = torch.ones((batch_size, max_seq_len, num_characters))

# Position 0 holds 'c'
correct_outputs[0, 0, char_to_idx['c']] = 1
wrong_outputs[0, 0, char_to_idx['c']] = 0

# Position 1 holds 'a'
correct_outputs[0, 1, char_to_idx['a']] = 1
wrong_outputs[0, 1, char_to_idx['a']] = 0

# Position 2 holds 't'
correct_outputs[0, 2, char_to_idx['t']] = 1
wrong_outputs[0, 2, char_to_idx['t']] = 0

(correct_outputs.shape, wrong_outputs.shape)

In [None]:
# Calculate the miss penalty for the all-wrong dummy outputs.
# NOTE(review): the only definition of calculate_miss_penalty visible in this notebook is
# commented out in an earlier cell; presumably it is supplied by one of the star imports — confirm.
miss_penalty = calculate_miss_penalty(wrong_outputs, miss_chars)
miss_penalty

##### Data Dir

In [None]:
# Run configuration.
# NOTE: this cell re-declares the constants from the configuration cell near the top of
# the notebook; running it overrides those values (MAX_EPOCH is 250 there, 15 here).
NUM_STRATIFIED_SAMPLES = 100 # This will be overwritten by Papermill

NUM_WORD_SAMPLE = 1_000 # words for testing

FAST_DEV_RUN = False

MAX_EPOCH = 15

In [None]:
from pathlib import Path
from scr.custom_sampler import *

# Base dataset directory, the per-sample-size subdirectory, and its parquet folder.
# NOTE(review): the paths in this cell are absolute and machine-specific — adjust before
# running on another machine.
base_dataset_dir = Path("/media/sayem/510B93E12554BBD1/dataset/")
stratified_samples_dir = base_dataset_dir / str(NUM_STRATIFIED_SAMPLES)
parquet_path = stratified_samples_dir / 'parquets'

# Create the parquet directory (and any missing parents) if it doesn't exist
parquet_path.mkdir(parents=True, exist_ok=True)
# parquet_valid_path.mkdir(parents=True, exist_ok=True)

# Define and create the directory for models
models_dir = Path("/home/sayem/Desktop/Hangman/models")
models_dir.mkdir(parents=True, exist_ok=True)

# Define the output directory and the logger directory beneath it
output_dir = Path("/home/sayem/Desktop/Hangman/training_outputs")
logger_dir = output_dir / "lightning_logs"

# Create the output and logger directories if they don't exist
output_dir.mkdir(parents=True, exist_ok=True)
logger_dir.mkdir(parents=True, exist_ok=True)

# Path of the testing-words file for this sample size
testing_words_file_path = stratified_samples_dir / "testing_words.txt"

# Load the testing words and draw a stratified sample from them.
try:
    testing_word_list = read_words(testing_words_file_path)
except FileNotFoundError:
    print(f"File not found: {testing_words_file_path}")
    # Bind both names to empty lists so the cells below neither hit a NameError
    # nor silently reuse stale values left in the kernel by an earlier run.
    testing_word_list = []
    sampled_test_words = []
else:
    print(f"Length of the testing word list: {len(testing_word_list)}")
    sampled_test_words = stratified_sample_by_length_and_uniqueness(
        testing_word_list, NUM_WORD_SAMPLE)
    print(f"Sampled {len(sampled_test_words)} unique words for testing.")

print(len(sampled_test_words))

##### Dataset Loading

In [None]:
# Create the dataset directly from the saved parquet files
hangman_dataset = HangmanDataset(parquet_path)
# valid_dataset = HangmanDataset(parquet_valid_path)

# Star import kept in place: it may rebind names used below, so its position matters.
from scr.utils import *

# Split `hangman_dataset` into training and validation datasets.
# 0.8 is presumably the training fraction — confirm against split_hangman_dataset.
train_dataset, valid_dataset \
    = split_hangman_dataset(hangman_dataset, 0.8)

print(len(train_dataset))
print(len(valid_dataset))

# Sanity check: the training split must be the larger of the two.
assert len(train_dataset) > len(valid_dataset)

In [None]:
# Inspect the entry stored under the 1-tuple key (29,) in the full dataset
# (presumably word length 29, matching the `(word_length,)` lookup further down).
hangman_dataset[(29,)]

In [None]:
# Same lookup with key (29,), this time against the training split.
train_dataset[(29,)]

In [None]:
# Presumably lists every group label available in the training split — confirm
# against the dataset class.
train_dataset.get_all_group_labels()

In [None]:
# Length reported by the training split.
len(train_dataset)

In [None]:
# Request the sample for a given word length (here 5); the dataset is indexed
# with a 1-tuple key.
word_length = 5
sample = hangman_dataset[(word_length,)]
sample

In [None]:
# Display `sample` again.
sample

In [None]:
from scr.data_module import *
from scr.dataset import *

# Batch size handed to the data module at construction
initial_batch_size = 128  # Set your initial batch size

# Build the data module from the train/validation datasets, the batch size,
# and the collate function
data_module = HangmanDataModule(train_dataset, valid_dataset, 
                                initial_batch_size, 
                                custom_collate_fn)

In [None]:
# Fetch the training loader exposed by the data module.
train_loader = data_module.train_dataloader()

In [None]:
# Hand-written metrics keyed by word length, used to exercise the sampler below.
# 'win_rate' values here are written as whole numbers (45, 60, ...).
dummy_performance_metrics = {
    
    3: {'win_rate': 45, 'average_attempts_used': 5},
    4: {'win_rate': 60, 'average_attempts_used': 3},  # This won't be selected due to high win rate
    5: {'win_rate': 30, 'average_attempts_used': 6},
    6: {'win_rate': 48, 'average_attempts_used': 4},
    # ... add more dummy metrics as needed ...
}

In [None]:
# Initialize PerformanceBasedSampler over the full dataset with the dummy metrics
sampler = PerformanceBasedSampler(dataset=hangman_dataset, 
                                  performance_metrics=dummy_performance_metrics, 
                                  batch_size=10)

# Verify the target pairs the sampler exposes after construction
print("Target pairs:", sampler.target_pairs)

In [None]:
# Pull the first item yielded by iterating the sampler.
next(iter(sampler))

In [None]:
# Look up the 1-tuple key (1,) in the training split.
train_dataset[(1,)]