##### Imports

In [1]:
# --- Standard library ---
import random
import sys
import warnings
from pathlib import Path

# Blanket-silence warnings for the whole notebook. This also hides
# deprecation notices, so narrow or remove the filter when debugging.
warnings.filterwarnings('ignore')

# --- Third-party ---
import pandas as pd
import torch
import torch.nn as nn

# Custom library paths: must be on sys.path before the `scr` imports below.
sys.path.extend(['../', './scr'])

# --- Project-local ---
from scr.utils import read_words, set_seed

# --- Configuration ---
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

# Seed once, before any stochastic step in this notebook (the shuffle below).
set_seed(42)

torch.set_float32_matmul_precision('medium')

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Read and shuffle the word list.
# (The notebook previously also experimented with `limit=10000` and
# 'data/250k.txt' for quicker runs.)
word_list = read_words('data/words_250000_train.txt')
random.shuffle(word_list)

##### Data Reading and Feature Engineering

In [2]:
from pathlib import Path
import random
from collections import Counter, defaultdict
import pickle
from tqdm import tqdm
from torch.utils.data import Dataset
from scr.feature_engineering import \
    calculate_char_frequencies, calculate_word_frequencies
from scr.utils import read_words, save_words_to_file
from scr.dataset import HangmanDataset

# Tunable constants
MASK_PROB = 0.8
NGRAM_N = 3
BATCH_SIZE = 64  # Example batch size, adjust as needed

# Output locations: the split word lists are written under base_dataset_dir,
# the pickled game scenarios under pkls_dir.
base_dataset_dir = Path('./dataset/')
pkls_dir = base_dataset_dir / 'pkl'

# Create both directories up front; exist_ok=True keeps the cell re-runnable.
base_dataset_dir.mkdir(parents=True, exist_ok=True)
pkls_dir.mkdir(parents=True, exist_ok=True)


# Splitting Dataset Function
def split_dataset(word_list, train_ratio=0.7, val_ratio=0.15):
    """Randomly partition ``word_list`` into train / validation / test lists.

    Args:
        word_list: Sequence of words to split. It is not modified; the
            shuffle is applied to an internal copy.
        train_ratio: Fraction of the words assigned to the training split.
        val_ratio: Fraction of the words assigned to the validation split.
            Everything left after both splits becomes the test split.

    Returns:
        A ``(train_words, val_words, test_words)`` tuple of lists.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more
            than 1 (which would silently leave an empty or truncated split).
    """
    # Small tolerance so float sums such as 0.85 + 0.15 are not rejected.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            f"Invalid split ratios: train_ratio={train_ratio}, val_ratio={val_ratio}"
        )

    # Shuffle a copy so the caller's list keeps its order. For a given RNG
    # state the resulting permutation is the same as an in-place shuffle.
    shuffled = list(word_list)
    random.shuffle(shuffled)

    total_words = len(shuffled)
    train_end = int(total_words * train_ratio)
    val_end = train_end + int(total_words * val_ratio)
    return shuffled[:train_end], shuffled[train_end:val_end], shuffled[val_end:]


# Partition the (already shuffled) vocabulary into train / validation / test
train_words, val_words, test_words = split_dataset(word_list)

# Persist every split as a text file under base_dataset_dir
save_words_to_file(
    train_words, base_dataset_dir / 'train_words.txt')
save_words_to_file(
    val_words, base_dataset_dir / 'val_words.txt')
save_words_to_file(
    test_words, base_dataset_dir / 'test_words.txt')

# Statistics are derived from the training split only
word_frequencies = calculate_word_frequencies(train_words)
char_frequency = calculate_char_frequencies(train_words)
max_word_length = max(map(len, train_words))

In [3]:
from scr.game import simulate_game_progress, play_game_with_a_word, process_word

# Example: generate masked starting states for a single word.
# process_word is called with mask_prob=0.5 and at most 15 variants; the
# resulting list is inspected in the next two cells.
word = "mississippi"
# word = "cat"
initial_states = process_word(word, mask_prob=0.5, max_variants=15)

# Optional walkthrough (disabled): simulate one game from every generated
# initial state and print each guess. Uncomment to run.
#
# print("Generated Initial States:")
# for initial_state in initial_states:
#     print(f"For initial state: {initial_state}")
#     won, game_progress = simulate_game_progress(
#         model=None,  # Assuming model is not used in this example
#         word=word,
#         initial_state=initial_state,
#         char_frequency={},  # Assuming char_frequency is not used in this example
#         max_word_length=len(word),
#         device=None,  # Assuming device is not used in this example
#         max_attempts=6,
#         normalize=True,
#         difficulty="medium",
#         outcome_preference='win'
#     )
#
#     # Display game progress
#     for step in game_progress:
#         print(f"Guessed: '{step[0]}', New State: '{step[1]}', Correct: {step[2]}")
#
#     print("Game Result:", "Won" if won else "Lost")

In [4]:
len(initial_states)

11

In [5]:
initial_states

['___________',
 'm_ss_ss____',
 '________pp_',
 '_ississi__i',
 '__ss_ss_pp_',
 'mi__i__i__i',
 '_i__i__ippi',
 'm_______pp_',
 'm__________',
 '__ss_ss____',
 '_i__i__i__i']

In [6]:
# NOTE(review): `STOP` is not defined anywhere above, so evaluating it raises
# NameError (see the recorded output). This looks like a deliberate barrier to
# halt "Run All" before the long-running generation cells -- confirm, and
# consider replacing it with an explicit guard flag so the notebook can run
# top to bottom without an exception.
STOP

NameError: name 'STOP' is not defined

In [None]:
import pickle
from pathlib import Path
from scr.dataset import *
from scr.game import *
import gc
from scr.utils import print_scenarios

# NOTE(review): this self-assignment is a no-op. It only succeeds when
# pkls_dir was already created by the "Data Reading and Feature Engineering"
# cell; on a fresh kernel it raises NameError.
pkls_dir = pkls_dir

# Alternative dataset location, kept for reference (disabled):
# base_dataset_dir = Path('dataset/250k/')

# pkls_dir = base_dataset_dir / 'pkl'
# base_dataset_dir.mkdir(parents=True, exist_ok=True)
# pkls_dir.mkdir(parents=True, exist_ok=True)

import random

def sample_scenarios(scenarios, base_sample_size,
                     max_samples_per_length=15, always_include_masked_state=None):
    """Draw a capped, category-balanced sample of game scenarios per word length.

    Scenarios are grouped by ``len(scenario['word'])``. Within each group the
    requested masked-state scenarios are taken first; then, for each
    difficulty/outcome category in turn, up to ``base_sample_size`` scenarios
    are drawn at random until the group holds ``max_samples_per_length``
    scenarios.

    Args:
        scenarios: Iterable of dicts with the keys ``'word'``, ``'difficulty'``
            and ``'outcome'``. ``'initial_state'`` is read only when
            ``always_include_masked_state`` is set.
        base_sample_size: Maximum number of scenarios drawn per category.
        max_samples_per_length: Budget per word length for the random draws.
            Force-included masked scenarios count against this budget but are
            never dropped because of it.
        always_include_masked_state: Falsy to disable. ``True`` keeps every
            scenario whose initial state is fully masked, i.e. equal to
            ``'_' * len(word)``. A string keeps every scenario whose initial
            state equals that string exactly.

    Returns:
        A list of the selected scenario dicts (the same objects as in
        ``scenarios``), ordered by ascending word length.
    """
    categories = (
        ("easy", "win"), ("easy", "lose"),
        ("medium", "win"), ("medium", "lose"),
        ("hard", "win"), ("hard", "lose"),
    )

    # Group once instead of rescanning the whole list for every length.
    by_length = {}
    for scenario in scenarios:
        by_length.setdefault(len(scenario['word']), []).append(scenario)

    sampled = []
    # Sorted iteration keeps the order of random draws stable for a fixed seed.
    for length in sorted(by_length):
        length_scenarios = by_length[length]
        total_samples_for_length = 0
        forced_ids = set()  # identities of scenarios already force-included

        if always_include_masked_state:
            # `True` cannot be compared with a state string directly, so it is
            # translated into the fully masked pattern for this word length.
            if always_include_masked_state is True:
                target_state = '_' * length
            else:
                target_state = always_include_masked_state

            for scenario in length_scenarios:
                if scenario.get('initial_state') == target_state:
                    sampled.append(scenario)
                    forced_ids.add(id(scenario))
                    total_samples_for_length += 1

        for difficulty, outcome in categories:
            available_samples = max_samples_per_length - total_samples_for_length
            if available_samples <= 0:
                break

            # Exclude force-included scenarios so none is returned twice.
            cat_scenarios = [
                s for s in length_scenarios
                if s['difficulty'] == difficulty
                and s['outcome'] == outcome
                and id(s) not in forced_ids
            ]

            sample_size = min(len(cat_scenarios), base_sample_size, available_samples)
            sampled.extend(random.sample(cat_scenarios, sample_size))
            total_samples_for_length += sample_size

    return sampled
    
# Function to print scenarios

In [None]:
from scr.custom_sampler import stratified_sample_by_length_and_frequency

print(f'train words len: {len(train_words)}')

NUM_STRATIFIED_SAMPLES = 1000
base_sample_size = 10  # Base number of samples per difficulty-outcome category
difficulties = ["easy", "medium", "hard"]
outcomes = ["win", "lose"]

# Work on a copy so `train_words` is still intact after this cell has run
# and the cell can be re-executed.
remaining_words = list(train_words)
iteration = 0

# One progress bar for the whole run, advanced by the number of words that
# each batch consumes.
with tqdm(total=len(remaining_words), miniters=1, leave=False, mininterval=2.0) as progress:
    while remaining_words:
        sampled_words = stratified_sample_by_length_and_frequency(
            remaining_words, word_frequencies, NUM_STRATIFIED_SAMPLES)

        if not sampled_words:
            # Nothing was sampled, so the word pool cannot shrink: stop
            # instead of looping forever.
            print(f"Sampler returned no words; {len(remaining_words)} words left unprocessed")
            break

        # Collect the scenarios of ALL words in this batch. The list must be
        # created outside the per-word loop or only the last word survives.
        all_scenarios = []
        for word in tqdm(sampled_words, miniters=2, leave=False, mininterval=2.0):
            # Process the word to get initial masked states
            initial_masked_states = process_word(word, mask_prob=0.9, max_variants=10)

            for initial_state in initial_masked_states:
                for difficulty in difficulties:
                    for outcome in outcomes:
                        won, game_progress = simulate_game_progress(
                            model=None,
                            word=word,
                            initial_state=initial_state,
                            char_frequency=char_frequency,
                            max_word_length=max_word_length,
                            device=device,
                            max_attempts=6,
                            normalize=True,
                            difficulty=difficulty,
                            outcome_preference=outcome
                        )

                        all_scenarios.append({
                            'word': word,
                            'difficulty': difficulty,
                            'outcome': outcome,
                            'initial_state': initial_state,
                            'data': (won, game_progress)
                        })

        # Down-sample the batch before writing it to disk
        sampled_scenarios = sample_scenarios(all_scenarios, base_sample_size,
                                             always_include_masked_state=True)

        # Create a directory for the current batch
        current_batch_dir = pkls_dir / str(iteration)
        current_batch_dir.mkdir(parents=True, exist_ok=True)

        for scenario in sampled_scenarios:
            # Take every filename component from the scenario itself; the
            # loop variable `word` above only holds the last word of the batch.
            scenario_word = scenario['word']
            initial_state = scenario['initial_state']
            difficulty = scenario['difficulty']
            outcome = scenario['outcome']
            file_path = current_batch_dir / f"{scenario_word}_from_{initial_state}_{difficulty}_{outcome}.pkl"

            # Best effort: report a failed save and continue with the rest.
            try:
                with open(file_path, 'wb') as file:
                    pickle.dump([scenario['data']], file)
            except Exception as e:
                print(f"Error saving {file_path}: {e}")

        # Clear memory
        del all_scenarios, sampled_scenarios

        # Manual garbage collection
        gc.collect()

        # Drop the processed words; set membership keeps this linear.
        processed_words = set(sampled_words)
        still_remaining = [w for w in remaining_words if w not in processed_words]
        consumed = len(remaining_words) - len(still_remaining)
        if consumed == 0:
            # The sampled words were not part of the pool: no progress possible.
            print("Sampled words did not reduce the remaining pool; stopping")
            break

        progress.update(consumed)
        remaining_words = still_remaining
        iteration += 1

In [None]:
# Point base_dataset_dir at the pickle batch directory ('dataset/pkl') for the
# loader cell below.
# NOTE(review): this rebinds the name that was first set to Path('./dataset/')
# and used when saving the split word lists, so cell execution order matters.
# Consider a separate name for the pickle root.
base_dataset_dir = Path('dataset/pkl')

In [None]:
pkl_list = []

# Batch directories are named 0, 1, 2, ...; sort them numerically and push any
# non-numeric directory names to the end.
if base_dataset_dir.is_dir():
    batch_dirs = sorted(
        (path for path in base_dataset_dir.iterdir() if path.is_dir()),
        key=lambda x: int(x.name) if x.name.isdigit() else float('inf'),
    )
else:
    print(f"Pickle directory not found: {base_dataset_dir}")
    batch_dirs = []

for batch_dir in batch_dirs:
    # Sorted so that index_to_access below refers to the same file every run
    for pkl_file in sorted(batch_dir.glob("*.pkl")):
        # File names follow "<word>_from_<initial_state>_<difficulty>_<outcome>".
        # The initial state itself contains '_' characters, so the two trailing
        # fields are split off from the right.
        word, separator, remainder = pkl_file.stem.partition('_from_')
        fields = remainder.rsplit('_', 2)
        if not separator or len(fields) != 3:
            print(f"Skipping file with unexpected name: {pkl_file}")
            continue
        initial_state, difficulty, outcome = fields

        # SECURITY: pickle.load can execute arbitrary code. Only load files
        # that were produced by this notebook.
        with open(pkl_file, 'rb') as file:
            game_data = pickle.load(file)

        # game_data is a list of (game_won, guesses) tuples
        for game_won, guesses in game_data:
            # Create a scenario dictionary for each data tuple
            scenario = {
                'word': word,
                'difficulty': difficulty,
                'outcome': outcome,
                'initial_state': initial_state,
                'data': (game_won, guesses)
            }
            pkl_list.append((pkl_file, scenario))  # Add scenario to the list

# Accessing an individual pickle file's content by index
index_to_access = 0  # Change this index to access different files
if index_to_access < len(pkl_list):
    file_path, scenario = pkl_list[index_to_access]
    print(f"Contents of {file_path}:")
    print_scenarios([scenario])  # Wrap scenario in a list for the function
else:
    print(f"No pickle file at index {index_to_access}")
