##### Imports

In [1]:
import sys
from pathlib import Path
import warnings

import warnings
import pandas as pd
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

import sys
# Custom library paths
sys.path.extend(['../', './scr'])

from scr.utils import set_seed
from scr.utils import read_words

set_seed(42)

import torch
import torch.nn as nn

torch.set_float32_matmul_precision('medium')

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

##### Data Reading and Feature Engineering

In [2]:
from pathlib import Path
import random
from collections import Counter, defaultdict
import pickle
from tqdm import tqdm
from torch.utils.data import Dataset
from scr.feature_engineering import \
    calculate_char_frequencies, calculate_word_frequencies
from scr.utils import read_words, save_words_to_file
from scr.dataset import HangmanDataset, \
    stratified_sample_by_length_and_frequency

# Constants and File Paths
MASK_PROB = 0.8
NGRAM_N = 3
NUM_STRATIFIED_SAMPLES = 1000
BATCH_SIZE = 64  # Example batch size, adjust as needed
# base_dataset_dir = Path('data/20k/')
# Define the base directory for the dataset
# pkls_dir = pkls_dir

base_dataset_dir = Path('dataset/20k/')

pkls_dir = base_dataset_dir / 'pkl'
base_dataset_dir.mkdir(parents=True, exist_ok=True)
pkls_dir.mkdir(parents=True, exist_ok=True)

# Read and Shuffle Word List
word_list = read_words('data/20k.txt', limit=None)
random.shuffle(word_list)

# Splitting Dataset Function
def split_dataset(word_list, train_ratio=0.7, val_ratio=0.15):
    total_words = len(word_list)
    train_size = int(total_words * train_ratio)
    val_size = int(total_words * val_ratio)
    random.shuffle(word_list)
    return word_list[:train_size], word_list[train_size:train_size + val_size], \
        word_list[train_size + val_size:]


# Splitting the word list
train_words, val_words, test_words = split_dataset(word_list)

# Save split datasets to files
save_words_to_file(train_words, base_dataset_dir / 'train_words.txt')
save_words_to_file(val_words, base_dataset_dir / 'val_words.txt')
save_words_to_file(test_words, base_dataset_dir / 'test_words.txt')

# Calculate Frequencies and Max Word Length
word_frequencies = calculate_word_frequencies(train_words)
char_frequency = calculate_char_frequencies(train_words)
max_word_length = max(len(word) for word in train_words)

In [3]:
# In your main script or Jupyter Notebook
from scr.model import RNN
from scr.feature_engineering import process_single_word_inference, \
    char_to_idx, idx_to_char, calculate_char_frequencies, \
        get_missed_characters

# Configuration for the RNN model
# Configuration for the RNN model
config = {
    'rnn': 'LSTM',
    'vocab_size': 27,  # Assuming 26 letters + 1 for underscore
    'hidden_dim': 256,
    'num_layers': 20,
    'embedding_dim': 200,
    'output_mid_features': 200,
    'miss_linear_dim': 50,
    'dropout': 0.5, 
    'use_embedding': True,
    'lr': .00001, 
    'input_feature_size': 3, # Number of features excluding the embedding dimension
    'step_size': 15, 
    'gamma': 0.341,
    'use_cuda': True  # Set to True to use CUDA, False to use CPU
}


In [4]:
from scr.game import guess_character
from scr.game import play_game_with_a_word

# Initialize RNN model
model = RNN(config)
model = model.to(device)

model.save_model('models/model.pth') # HangmanAPI will load it from here
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
from scr.game import simulate_game_progress, play_game_with_a_word

# proto_from_p_o_o_
word = 'proto'

a = play_game_with_a_word(model, word, char_frequency, \
    max_word_length, device, max_attempts=6, normalize=True) # Playing with NN
    
initial_state = '__ot_'
# Ensure that the arguments match the function definition
won, game_progress = simulate_game_progress(
    model=model, 
    word=word, 
    initial_state=initial_state, 
    char_frequency=char_frequency, 
    max_word_length=max_word_length, 
    device=device, 
    max_attempts=6, 
    normalize=True, 
    difficulty="medium", 
    outcome_preference='win'
)
         
print(game_progress)
print(a)

[('u', '__ot_', False), ('f', '__ot_', False), ('p', 'p_ot_', True), ('o', 'p_oto', True), ('h', 'p_oto', False), ('r', 'proto', True)]
(False, '_____', 6)


In [6]:
import pickle
from pathlib import Path
from scr.dataset import stratified_sample_by_length_and_frequency
from scr.game import simulate_game_progress, process_word
import gc

pkls_dir = pkls_dir

base_dataset_dir = Path('dataset/250k/')

pkls_dir = base_dataset_dir / 'pkl'
base_dataset_dir.mkdir(parents=True, exist_ok=True)
pkls_dir.mkdir(parents=True, exist_ok=True)


import random

# def sample_scenarios(scenarios, base_sample_size, \
#     max_samples_per_length=10, always_include_masked_state=None):
#     sampled = []
#     word_length_categories = set([len(s['word']) for s in scenarios])

#     for length in word_length_categories:
#         length_scenarios = [s for s in scenarios if len(s['word']) == length]
#         total_samples_for_length = 0

#         # Always include the fully masked state scenario if provided
#         if always_include_masked_state:
#             masked_state_scenarios = [s for s in length_scenarios \
#                 if s['initial_state'] == always_include_masked_state]

#             # Include all scenarios with the fully masked state and print debug message
#             for scenario in masked_state_scenarios:
#                 sampled.append(scenario)
#                 total_samples_for_length += 1
#                 print(f"Debug: Added fully masked state scenario for word '{scenario['word']}' with initial state '{scenario['initial_state']}'")

#         # Continue with other categories
#         for category in ["easy_win", "easy_lose", "medium_win", "medium_lose", "hard_win", "hard_lose"]:
#             cat_scenarios = [s for s in length_scenarios if s['difficulty'] \
#                 == category.split('_')[0] and s['outcome'] == category.split('_')[1]]

#             available_samples = max_samples_per_length - total_samples_for_length
#             if available_samples <= 0:
#                 break

#             sample_size = min(len(cat_scenarios), base_sample_size, available_samples)
#             sampled.extend(random.sample(cat_scenarios, sample_size))
#             total_samples_for_length += sample_size
    
#     return sampled


import random

import random

def sample_scenarios(scenarios, base_sample_size, max_samples_per_length=10, always_include_masked_state=None):
    sampled = []
    word_length_categories = set([len(s['word']) for s in scenarios])

    for length in word_length_categories:
        length_scenarios = [s for s in scenarios if len(s['word']) == length]
        total_samples_for_length = 0

        # Always include the fully masked state scenario if provided
        if always_include_masked_state:
            masked_state_scenarios = [s for s in length_scenarios \
                if s['initial_state'] == always_include_masked_state]

            for scenario in masked_state_scenarios:
                sampled.append(scenario)
                total_samples_for_length += 1

        # Continue with other categories
        for category in ["easy_win", "easy_lose", "medium_win", "medium_lose", "hard_win", "hard_lose"]:
            cat_scenarios = [s for s in length_scenarios if s['difficulty'] \
                == category.split('_')[0] and s['outcome'] == category.split('_')[1]]

            available_samples = max_samples_per_length - total_samples_for_length
            if available_samples <= 0:
                break

            sample_size = min(len(cat_scenarios), base_sample_size, available_samples)
            sampled.extend(random.sample(cat_scenarios, sample_size))
            total_samples_for_length += sample_size

    # Debug: Check for inclusion of fully masked state scenarios in the final sample
    for scenario in sampled:
        initial_state = scenario.get('initial_state')
        if initial_state == always_include_masked_state:
            print(f"Debug: Fully masked state scenario included for word '{scenario['word']}'")
        elif initial_state is not None:
            print(f"Debug: Other initial state scenario for word '{scenario['word']}'")
        else:
            print(f"Debug: No initial state provided for word '{scenario['word']}'")

    return sampled




# Function to print scenarios
def print_scenarios(scenarios):
    for scenario in scenarios:
        word = scenario['word']
        difficulty = scenario['difficulty']
        outcome = scenario['outcome']
        game_won = scenario['data'][0]
        guesses = scenario['data'][1]

        print(f"Word: {word}, Difficulty: {difficulty}, Outcome: {outcome}")
        for guess in guesses:
            letter, state, correct = guess
            print(f"  Guessed '{letter}', State: {state}, Correct: {correct}")
        print(f"  Game {'Won' if game_won else 'Lost'}")
        print("")

iteration = 0

# train_words = ['cat', 'banana']

print(f'train words len: {len(train_words)}')
# Main loop
iteration = 0
base_sample_size = 5  # Base number of samples per difficulty-outcome category

while train_words:
    # Stratify sample words
    sampled_words = stratified_sample_by_length_and_frequency(
        train_words, word_frequencies, NUM_STRATIFIED_SAMPLES)

    # Create a directory for the current batch
    current_batch_dir = pkls_dir / str(iteration)
    current_batch_dir.mkdir(parents=True, exist_ok=True)

    for word in sampled_words:
        all_scenarios = []
        # Process the word to get initial masked states
        initial_masked_states = process_word(word)

        # Sample scenarios, always include the fully masked state scenario
        fully_masked_state = '_' * len(word)
        sampled_scenarios = sample_scenarios(all_scenarios, \
            base_sample_size, always_include_masked_state=fully_masked_state)

        for initial_state in initial_masked_states:
            difficulties = ["easy", "medium", "hard"]
            outcomes = ["win", "lose"]

            for difficulty in difficulties:
                for outcome in outcomes:
                    # print(f'{word} with initial state: {initial_state}: 
                    # Difficulty: {difficulty}, Outcome: {outcome}')
                    won, game_progress = simulate_game_progress(
                                            model=model, 
                                            word=word, 
                                            initial_state=initial_state, 
                                            char_frequency=char_frequency, 
                                            max_word_length=max_word_length, 
                                            device=device, 
                                            max_attempts=6, 
                                            normalize=True, 
                                            difficulty=difficulty, 
                                            outcome_preference=outcome
                                        )

                    all_scenarios.append({'word': word, 'difficulty': difficulty, \
                        'outcome': outcome, 'data': (won, game_progress)})

        # Sample scenarios
        sampled_scenarios = sample_scenarios(all_scenarios, base_sample_size)

        # # print(sampled_scenarios)
        # print_scenarios(sampled_scenarios)

        # Save only sampled scenarios
        for scenario in sampled_scenarios:
            game_states = [scenario['data']]
            difficulty = scenario['difficulty']
            outcome = scenario['outcome']
            file_path = current_batch_dir / f"{word}_from_{initial_state}_{difficulty}_{outcome}.pkl"
            with open(file_path, 'wb') as file:
                pickle.dump(game_states, file)

            # Clear large objects to free memory
        del all_scenarios, sampled_scenarios

    # Manual garbage collection
    gc.collect()

    train_words = [word for word in train_words if word not in sampled_words]
    iteration += 1

train words len: 14000
Debug: Fully masked state scenario included for word 'tulip'
Debug: Fully masked state scenario included for word 'tulip'
Debug: Fully masked state scenario included for word 'tulip'
Debug: Fully masked state scenario included for word 'tulip'
Debug: Fully masked state scenario included for word 'tulip'
Debug: Fully masked state scenario included for word 'tulip'
Debug: Fully masked state scenario included for word 'tulip'
Debug: Fully masked state scenario included for word 'tulip'
Debug: Fully masked state scenario included for word 'tulip'
Debug: Fully masked state scenario included for word 'tulip'
Debug: Fully masked state scenario included for word 'plays'
Debug: Fully masked state scenario included for word 'plays'
Debug: Fully masked state scenario included for word 'plays'
Debug: Fully masked state scenario included for word 'plays'
Debug: Fully masked state scenario included for word 'plays'
Debug: Fully masked state scenario included for word 'plays'
D

KeyboardInterrupt: 

In [None]:
base_dataset_dir = Path('dataset/20k/pkl')

In [None]:
# Iterate over all batch directories
for batch_dir in base_dataset_dir.iterdir():
    if batch_dir.is_dir():
        # List all .pkl files in the current batch directory
        pkl_files = list(batch_dir.glob("*.pkl"))

        for pkl_file in pkl_files:
            with open(pkl_file, 'rb') as file:
                game_states = pickle.load(file)
                print(f"Contents of {pkl_file}: {game_states}")
                break
            break
        break

# Replace "path_to_your_dataset_directory" with the actual path

In [None]:
game_states