##### Imports

In [1]:
import sys
from pathlib import Path
import warnings

import warnings
import pandas as pd
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

import pickle
from pathlib import Path
from scr.dataset import *
from scr.game import *
import gc
from scr.utils import print_scenarios


from pathlib import Path
import random
from collections import Counter, defaultdict
import pickle
from tqdm import tqdm
from torch.utils.data import Dataset
from scr.feature_engineering import \
    calculate_char_frequencies, calculate_word_frequencies
from scr.utils import read_words, save_words_to_file

import sys
# Custom library paths
sys.path.extend(['../', './scr'])

from scr.utils import set_seed
from scr.utils import read_words

set_seed(42)

import torch
import torch.nn as nn
from pathlib import Path
import random

torch.set_float32_matmul_precision('medium')

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Read and Shuffle Word List
word_list = read_words('data/words_250000_train.txt') # , limit=10000)
# word_list = read_words('data/250k.txt', limit=10000)
random.shuffle(word_list)

In [2]:
from pathlib import Path

NUM_STRATIFIED_SAMPLES = 150000

# Define the base directory on the specified drive
base_dataset_dir = Path(f"/media/sayem/510B93E12554BBD1/dataset/{NUM_STRATIFIED_SAMPLES}")

# Ensuring the directory exists
base_dataset_dir.mkdir(parents=True, exist_ok=True)

# Ensure the base directory exists
base_dataset_dir.mkdir(parents=True, exist_ok=True)


pkls_dir = base_dataset_dir / 'pkl'
pkls_dir.mkdir(parents=True, exist_ok=True)

In [3]:
base_dataset_dir

PosixPath('/media/sayem/510B93E12554BBD1/dataset/150000')

In [4]:
# Splitting Dataset Function
def split_dataset(word_list, train_ratio=0.7, val_ratio=0.15):
    total_words = len(word_list)
    train_size = int(total_words * train_ratio)
    val_size = int(total_words * val_ratio)
    random.shuffle(word_list)
    return word_list[:train_size], word_list[train_size:train_size + val_size], \
        word_list[train_size + val_size:]


# Splitting the word list
train_words, val_words, test_words = split_dataset(word_list)

# Save split datasets to files
save_words_to_file(train_words, base_dataset_dir / 'train_words.txt')
save_words_to_file(val_words, base_dataset_dir / 'val_words.txt')
save_words_to_file(test_words, base_dataset_dir / 'test_words.txt')

# Calculate Frequencies and Max Word Length
word_frequencies = calculate_word_frequencies(train_words)
char_frequency = calculate_char_frequencies(train_words)
max_word_length = max(len(word) for word in train_words)

##### Data Reading and Feature Engineering

In [5]:
def count_repetitive_characters(word):
    char_count = {}
    for char in word:
        char_count[char] = char_count.get(char, 0) + 1
    return sum(1 for count in char_count.values() if count > 1)

def adaptive_mask_prob_and_variants(word):
    word_length = len(word)
    unique_char_count = len(set(word))
    repetitive_char_count = count_repetitive_characters(word)

    # Lower mask probability for longer and more unique words
    if word_length <= 5:
        mask_prob = 0.8 if repetitive_char_count > (word_length / 2) else 0.9
    
    elif word_length <= 8:
        mask_prob = 0.6 if unique_char_count > 4 else 0.7
    
    else:
        mask_prob = 0.4 if unique_char_count > 6 else 0.5

    # Adjust max variants based on the complexity of the word
    if unique_char_count <= 3 or repetitive_char_count > (word_length / 2):
        max_variants = max(5, word_length)
    
    elif unique_char_count <= 6:
        max_variants = max(10, word_length)
    
    else:
        max_variants = min(15, word_length)

    return mask_prob, max_variants


In [6]:
word = "noon"
mask_prob, max_variants = adaptive_mask_prob_and_variants(word)

mask_prob

0.9

In [7]:
max_variants

5

##### Game State Simulation

In [8]:
from scr.game import simulate_game_progress, play_game_with_a_word, process_word

# Example word and initial state
# Example usage
# word = "mississippi"
# word = "cat"
mask_prob, max_variants = adaptive_mask_prob_and_variants(word)

initial_states = process_word(word, mask_prob=mask_prob, \
    max_variants=max_variants)

# Print generated initial states
print("Generated Initial States:")
for initial_state in initial_states:
    # Simulate the game
    print(initial_state)
    print(f"For initial state: {initial_state}")
    won, game_progress = simulate_game_progress(
        model=None,  # Assuming model is not used in this example
        word=word, 
        initial_state=initial_state, 
        char_frequency={},  # Assuming char_frequency is not used in this example
        max_word_length=len(word), 
        device=None,  # Assuming device is not used in this example
        max_attempts=6, 
        normalize=True,
        difficulty="medium", 
        outcome_preference='win'
    )

    # Display game progress
    for step in game_progress:
        print(f"Guessed: '{step[0]}', New State: '{step[1]}', Correct: {step[2]}")

        # break

    # break

    # print("Game Result:", "Won" if won else "Lost")

Generated Initial States:
____
For initial state: ____
Guessed: 'o', New State: '_oo_', Correct: True
Guessed: 'n', New State: 'noon', Correct: True
n__n
For initial state: n__n
Guessed: 'l', New State: 'n__n', Correct: False
Guessed: 'o', New State: 'noon', Correct: True
_oo_
For initial state: _oo_
Guessed: 'n', New State: 'noon', Correct: True


In [9]:
len(initial_states)

3

In [10]:
initial_states

['____', 'n__n', '_oo_']

##### Dataset Generation

In [11]:
from scr.custom_sampler import stratified_sample_by_length_and_frequency

# NUM_STRATIFIED_SAMPLES = 1000
# Main loop
iteration = 0

sampled_words = stratified_sample_by_length_and_frequency(train_words, \
    word_frequencies, \
    NUM_STRATIFIED_SAMPLES)

print(f'Number words for state generation: {len(sampled_words)}')

for word in tqdm(sampled_words, miniters=2, leave=False, mininterval=2.0): 
    # , miniters=2, leave=False, mininterval=2.0):
    # print(word)
    all_scenarios = []
    # Process the word to get initial masked states
    # initial_masked_states = process_word(word, mask_prob=0.9, max_variants=10)

    mask_prob, max_variants = adaptive_mask_prob_and_variants(word)

    game_states = process_word(word, mask_prob=mask_prob, \
        max_variants=max_variants)

    for initial_state in game_states:
        
        difficulties = ["easy", "medium", "hard"]
        outcomes = ["win", "lose"]

        for difficulty in difficulties:
            for outcome in outcomes:
                # print(f'{word} from initial state: {initial_state}: \
                # Difficulty: {difficulty}, Outcome: {outcome}')
                won, game_progress = simulate_game_progress(
                                        model=None, 
                                        word=word, 
                                        initial_state=initial_state, 
                                        char_frequency=char_frequency, 
                                        max_word_length=max_word_length, 
                                        device=device, 
                                        max_attempts=6, 
                                        normalize=True, 
                                        difficulty=difficulty, 
                                        outcome_preference=outcome
                                    )

                # all_scenarios.append({'word': word, 'difficulty': difficulty, \
                #     'outcome': outcome, 'data': (won, game_progress)})

                all_scenarios.append({
                            'word': word, 
                            'difficulty': difficulty,
                            'outcome': outcome, 
                            'initial_state': initial_state,  # Added 'initial_state' key
                            'data': (won, game_progress)
                        })  # all game state
    
    # Create a directory for the current strarified samples
    
    current_batch_dir = pkls_dir / str(iteration)
    current_batch_dir.mkdir(parents=True, exist_ok=True)

    # print(all_scenarios)

    # print(current_batch_dir)

    for scenario in all_scenarios:
        try:
            game_states = [scenario['data']]
            difficulty = scenario['difficulty']
            outcome = scenario['outcome']
            initial_state = scenario['initial_state']  # This should be the correct scope
            file_path = current_batch_dir / f"{word}_from_{initial_state}_{difficulty}_{outcome}.pkl"

            # print(f"Saving scenario for {word}: {file_path}")

            with open(file_path, 'wb') as file:
                pickle.dump(game_states, file)

            # print(f"Saved {file_path}")

        except Exception as e:
            print(f"Error saving {file_path}: {e}")

    # Clear memory
    del all_scenarios # , sampled_scenarios

    # Manual garbage collection
    gc.collect()

    # train_words = [word for word in train_words if word not in sampled_words]
    # print(len(train_words))
    # print(iteration)
    iteration += 1

Number words for state generation: 150001


  0%|          | 0/150001 [00:00<?, ?it/s]

                                                           

Error saving /media/sayem/510B93E12554BBD1/dataset/150000/pkl/144837/administratress_from_a________a______easy_lose.pkl: [Errno 28] No space left on device: '/media/sayem/510B93E12554BBD1/dataset/150000/pkl/144837/administratress_from_a________a______easy_lose.pkl'
Error saving /media/sayem/510B93E12554BBD1/dataset/150000/pkl/144837/administratress_from_a________a______medium_win.pkl: [Errno 28] No space left on device: '/media/sayem/510B93E12554BBD1/dataset/150000/pkl/144837/administratress_from_a________a______medium_win.pkl'
Error saving /media/sayem/510B93E12554BBD1/dataset/150000/pkl/144837/administratress_from_a________a______medium_lose.pkl: [Errno 28] No space left on device: '/media/sayem/510B93E12554BBD1/dataset/150000/pkl/144837/administratress_from_a________a______medium_lose.pkl'
Error saving /media/sayem/510B93E12554BBD1/dataset/150000/pkl/144837/administratress_from_a________a______hard_win.pkl: [Errno 28] No space left on device: '/media/sayem/510B93E12554BBD1/dataset/1



OSError: [Errno 28] No space left on device: '/media/sayem/510B93E12554BBD1/dataset/150000/pkl/144838'

##### Reading Checking

In [None]:
pkl_list = []

for batch_dir in sorted(pkls_dir.iterdir(), key=lambda x: int(x.name) if x.name.isdigit() else float('inf')):
    if batch_dir.is_dir():
        pkl_files = list(batch_dir.glob("*.pkl"))

        for pkl_file in pkl_files:
            with open(pkl_file, 'rb') as file:
                game_data = pickle.load(file)

                # Split the file name to extract word and initial state
                parts = pkl_file.stem.split('_from_')
                word = parts[0]  # The word is before '_from_'
                remaining_parts = parts[1].split('_')
                initial_state = remaining_parts[0]  # The initial state is right after '_from_'
                difficulty, outcome = remaining_parts[1], remaining_parts[2]

                for data in game_data:
                    game_won, guesses = data
                    scenario = {
                        'word': word,
                        'initial_state': initial_state,
                        'difficulty': difficulty,
                        'outcome': outcome,
                        'data': (game_won, guesses)
                    }
                    pkl_list.append((pkl_file, scenario))

index_to_access = 1000
if index_to_access < len(pkl_list):
    file_path, scenario = pkl_list[index_to_access]
    print(f"Contents of {file_path}:")
    print_scenarios([scenario])
else:
    print(f"No pickle file at index {index_to_access}")


In [None]:
# pkl_list = []

# # Iterate over all batch directories
# for batch_dir in sorted(pkls_dir.iterdir(), key=lambda x: int(x.name) \
#     if x.name.isdigit() else float('inf')):
#     if batch_dir.is_dir():
#         # List all .pkl files in the current batch directory
#         pkl_files = list(batch_dir.glob("*.pkl"))

#         for pkl_file in pkl_files:
#             with open(pkl_file, 'rb') as file:
#                 game_data = pickle.load(file)
#                 # Extract information from file name
#                 parts = pkl_file.stem.split('_from_')
#                 word_and_state = parts[0].split('_')
#                 word = '_'.join(word_and_state[:-1])
#                 initial_state = word_and_state[-1]
#                 difficulty, outcome = parts[1].split('_')[-2:]

#                 # Assuming game_data is a list of tuples (game_won, guesses)
#                 for data in game_data:
#                     game_won, guesses = data
#                     # Create a scenario dictionary for each data tuple
#                     scenario = {
#                         'word': word,
#                         'difficulty': difficulty,
#                         'outcome': outcome,
#                         'data': (game_won, guesses)
#                     }
#                     pkl_list.append((pkl_file, scenario))  # Add scenario to the list

# # Accessing an individual pickle file's content by index
# index_to_access = 1000  # Change this index to access different files
# if index_to_access < len(pkl_list):
#     file_path, scenario = pkl_list[index_to_access]
#     print(f"Contents of {file_path}:")
#     print_scenarios([scenario])  # Wrap scenario in a list for the function
# else:
#     print(f"No pickle file at index {index_to_access}")