##### Imports

In [1]:
import sys
from pathlib import Path
import warnings
from torch.utils.data import Dataset

import warnings
import pandas as pd
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

import pickle
from pathlib import Path
from scr.dataset import *
# from scr.game import *
from scr.feature_engineering import *
# from scr.plot_utils import *
import gc
from scr.utils import print_scenarios

from pathlib import Path
import random
from collections import Counter, defaultdict
import pickle
from tqdm import tqdm
from torch.utils.data import Dataset

from scr.utils import read_words, save_words_to_file

import sys
# Custom library paths
sys.path.extend(['../', './scr'])

from scr.utils import set_seed
from scr.utils import read_words

set_seed(42)

import torch
import torch.nn as nn
from pathlib import Path
import random

torch.set_float32_matmul_precision('medium')  # for tensor core

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Read and Shuffle Word List
word_list = read_words('data/words_250000_train.txt') # , limit=10000)
# word_list = read_words('data/250k.txt', limit=10000)

random.shuffle(word_list)

# Calculate Frequencies and Max Word Length
word_frequencies = calculate_word_frequencies(word_list)
char_frequency = calculate_char_frequencies(word_list)
max_word_length = max(len(word) for word in word_list)

##### Data Path

In [2]:
import shutil
from pathlib import Path

NUM_STRATIFIED_SAMPLES = 5000 # This will be overwritten by Papermill

# Define the base directory and the paths for training and validation parquet files
base_dataset_dir = Path("/media/sayem/510B93E12554BBD1/dataset/")

stratified_samples_dir = base_dataset_dir / str(NUM_STRATIFIED_SAMPLES)

parquet_path = stratified_samples_dir / 'parquets'
# parquet_valid_path = stratified_samples_dir / 'valid_parquets'

# Function to delete and recreate a directory
def recreate_directory(path):
    if path.exists():
        shutil.rmtree(path)  # Delete the directory and its contents
    path.mkdir(parents=True)  # Create the directory

# Recreate the train and valid directories
recreate_directory(parquet_path)
# recreate_directory(parquet_valid_path)

# print(f"Directories '{parquet_train_path}' and '{parquet_valid_path}' have been recreated.")

print(f"Directories '{parquet_path}' have been recreated.")

Directories '/media/sayem/510B93E12554BBD1/dataset/5000/parquets' have been recreated.


In [3]:
len(word_list)

227300

##### Testing Data

In [4]:
# Define the total number of words and the number of test samples
from scr.custom_sampler import *
NUM_TEST_SAMPLES = 10_000

# Assuming 'word_list' contains the 250,000 words
# First, separate 10,000 words for the final testing set
testing_words = stratified_sample_by_length_and_uniqueness(
    word_list, 
    NUM_TEST_SAMPLES
)

# Define the file path for saving the testing words
testing_words_file_path = stratified_samples_dir / "testing_words.txt"

# Save the testing words to a file
with open(testing_words_file_path, 'w') as file:
    for word in testing_words:
        file.write(word + '\n')

print(f"Testing words saved in {testing_words_file_path}")

# Now, remove these testing samples from the original word list
remaining_words = [word for word in word_list if word not in testing_words]

Testing words saved in /media/sayem/510B93E12554BBD1/dataset/5000/testing_words.txt


##### Stratified Sample Generation

In [5]:
## we are taking starified samples from train_words

from scr.custom_sampler import \
    stratified_sample_by_length_and_frequency, \
        stratified_sample_by_length, stratified_sample_by_length_and_uniqueness

print(f'Number of Statrified samples: {NUM_STRATIFIED_SAMPLES}')

# sampled_words_by_length_and_frequency \
#     = stratified_sample_by_length_and_frequency(train_words, \
#     word_frequencies, \
#     NUM_STRATIFIED_SAMPLES)

sampled_words_by_length = stratified_sample_by_length_and_uniqueness(remaining_words, \
    NUM_STRATIFIED_SAMPLES)

print(len(sampled_words_by_length))

Number of Statrified samples: 5000
5039


##### Intial State Simulation Testing

In [6]:
from scr.game import *

In [7]:
# word = "mississippi"
word = "mythopoetize"
# word = "cat"

initial_states = process_word_for_six_states(word)

initial_states

{'allMasked': '____________',
 'early': '____o_o_____',
 'quarterRevealed': '__t_o_o_t___',
 'midRevealed': '_yt_o_o_ti__',
 'midLateRevealed': '_ytho_o_tiz_',
 'lateRevealed': 'mytho_oetize',
 'nearEnd': 'mytho_oetize'}

In [8]:
len(initial_states)

7

In [9]:
initial_states

{'allMasked': '____________',
 'early': '____o_o_____',
 'quarterRevealed': '__t_o_o_t___',
 'midRevealed': '_yt_o_o_ti__',
 'midLateRevealed': '_ytho_o_tiz_',
 'lateRevealed': 'mytho_oetize',
 'nearEnd': 'mytho_oetize'}

##### Dataset Generation: Simulation

In [10]:
word = "mississippi"

initial_states = process_word_for_six_states(word)

# print(initial_states)
# Print generated initial states
print("Generated Initial States: ", initial_states)

Generated Initial States:  {'allMasked': '___________', 'early': '__ss_ss____', 'quarterRevealed': '__ss_ss____', 'midRevealed': '__ss_ss_pp_', 'midLateRevealed': 'm_ss_ss_pp_', 'lateRevealed': 'm_ss_ss_pp_', 'nearEnd': 'm_ss_ss_pp_'}


In [11]:
from scr.game import simulate_game_progress, \
    play_game_with_a_word, process_word

# Example word and initial state
# Example usage
word = "mississippi"
# word = "cat"

initial_states = process_word_for_six_states(word)

# print(initial_states)
# Print generated initial states
print("Generated Initial States:")
i = 1 
for state_name, initial_state in initial_states.items():
    # Simulate the game
    print(initial_state)
    print(f"{i}: For initial state: {initial_state}")
    won, game_progress = simulate_game_progress(
        model=None,  # Assuming model is not used in this example
        word=word, 
        initial_state=initial_state, 
        char_frequency={},  # Assuming char_frequency is not used in this example
        max_word_length=len(word), 
        device=None,  # Assuming device is not used in this example
        max_attempts=6, 
        normalize=True,
        difficulty="medium", 
        outcome_preference='win'
    )

    # Display game progress
    for step in game_progress:
        print(f"Guessed: '{step[0]}', New State: '{step[1]}', Correct: {step[2]}")

        # break

    i+=1

    # break

    # print("Game Result:", "Won" if won else "Lost")

Generated Initial States:
___________
1: For initial state: ___________
Guessed: 'm', New State: 'm__________', Correct: True
Guessed: 's', New State: 'm_ss_ss____', Correct: True
Guessed: 'i', New State: 'mississi__i', Correct: True
Guessed: 'o', New State: 'mississi__i', Correct: False
Guessed: 'p', New State: 'mississippi', Correct: True
__ss_ss____
2: For initial state: __ss_ss____
Guessed: 'n', New State: '__ss_ss____', Correct: False
Guessed: 'p', New State: '__ss_ss_pp_', Correct: True
Guessed: 'm', New State: 'm_ss_ss_pp_', Correct: True
Guessed: 'i', New State: 'mississippi', Correct: True
__ss_ss____
3: For initial state: __ss_ss____
Guessed: 'n', New State: '__ss_ss____', Correct: False
Guessed: 'c', New State: '__ss_ss____', Correct: False
Guessed: 'y', New State: '__ss_ss____', Correct: False
Guessed: 'z', New State: '__ss_ss____', Correct: False
Guessed: 'k', New State: '__ss_ss____', Correct: False
Guessed: 'm', New State: 'm_ss_ss____', Correct: True
Guessed: 'p', New S

##### Writing Parquet

In [12]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Assuming the function 'process_word_for_six_states' is defined elsewhere
from scr.game import simulate_game_progress, process_word_for_six_states

def process_batch_to_parquet(batch, file_path, start_game_counter):
    game_counter = start_game_counter
    data_for_parquet = []

    for game_data in batch:
        word, state_name, initial_state, difficulty, outcome, won, game_progress = game_data
        if not game_progress:
            continue

        final_state = game_progress[-1][1]
        guessed_states = [initial_state] + [state for _, state, _ in game_progress]
        guessed_letters = [letter for letter, _, _ in game_progress]

        data_for_parquet.append({
            'game_id': game_counter,
            'word': word,
            'initial_state': initial_state,
            'final_state': final_state,
            'guessed_states': ','.join(guessed_states),
            'guessed_letters': ','.join(guessed_letters),
            'game_state': state_name,
            'difficulty': difficulty,
            'outcome': outcome,
            'word_length': len(word),
            'won': won
        })

        game_counter += 1

    df = pd.DataFrame(data_for_parquet)
    table = pa.Table.from_pandas(df)
    pq.write_to_dataset(table, root_path=file_path, compression='snappy')

    return game_counter

def generate_batch_for_word(word):
    batch = []
    initial_states = process_word_for_six_states(word)
    games_generated = 0  # Counter for games generated

    for state_name, initial_state in initial_states.items():
        for difficulty in ["easy", "medium", "hard"]:
            for outcome in ["win", "lose"]:
                won, game_progress = simulate_game_progress(
                    model=None, word=word, initial_state=initial_state,
                    char_frequency={}, max_word_length=len(word),
                    device=None, max_attempts=6, normalize=True,
                    difficulty=difficulty, outcome_preference=outcome
                )
                batch.append((word, state_name, initial_state, \
                    difficulty, outcome, won, game_progress))
                games_generated += 1  # Increment game counter

    return batch, games_generated

def main_execution(words, parquet_path):
    game_counter = 0
    games_per_word = len(initial_states) * 3 * 2  # 7 states, 3 difficulties, 2 outcomes

    for word in tqdm(words, desc="Processing Words"):
        batch, games_generated = generate_batch_for_word(word)

        # Update game count
        game_counter += games_generated

        # Process and save batch
        process_batch_to_parquet(batch, parquet_path, game_counter)

    print(f"Total games processed: {game_counter}")

In [13]:
# =======================================================
# Execute the main function
main_execution(sampled_words_by_length, parquet_path)
# =======================================================

Processing Words:   0%|          | 0/5039 [00:00<?, ?it/s]

Processing Words: 100%|██████████| 5039/5039 [00:16<00:00, 298.85it/s]

Total games processed: 211638





##### Checking the dataset 

In [14]:
import pandas as pd
from pathlib import Path


# Find all Parquet files in the directory
parquet_files = list(parquet_path.glob('*.parquet'))

if parquet_files:
    total_game_sequences = 0

    # Iterate over each file and sum the number of game sequences
    for file in parquet_files:
        df = pd.read_parquet(file)
        total_game_sequences += len(df)

    print(f"Total number of game sequences across all files: {total_game_sequences}")
else:
    print("No Parquet files found in the specified directory.")


Total number of game sequences across all files: 211602


In [15]:
import pandas as pd
import glob
from pathlib import Path

# Use glob to find all Parquet files in the folder
parquet_files = parquet_path.glob('*.parquet')

# Read and concatenate all Parquet files into a single DataFrame
df = pd.concat([pd.read_parquet(file) for file in \
    parquet_files], ignore_index=True)

# # Display the first few rows of the DataFrame
# print(df.head())

# Get the total number of rows (games) in the DataFrame
total_games = len(df)
print(f"Total number of games in the dataset: {total_games}")

# Additional checks and summary statistics
print("Null values in each column:")
print(df.isnull().sum())

print("\nSummary statistics:")
print(df.describe())

# Count the number of unique words or game states
unique_words = df['word'].nunique()
print(f"\nNumber of unique words: {unique_words}")

# Inspect the distribution of game outcomes, difficulties, etc.
print("\nOutcome distribution:")
print(df['outcome'].value_counts())

print("\nDifficulty distribution:")
print(df['difficulty'].value_counts())

# Word length distribution
print("\nWord Length Distribution:")
print(df['word_length'].value_counts())

Total number of games in the dataset: 211602
Null values in each column:
game_id            0
word               0
initial_state      0
final_state        0
guessed_states     0
guessed_letters    0
game_state         0
difficulty         0
outcome            0
word_length        0
won                0
dtype: int64

Summary statistics:
             game_id    word_length
count  211602.000000  211602.000000
mean   105842.724232       9.411064
std     61084.765488       3.100986
min        42.000000       1.000000
25%     52942.250000       7.000000
50%    105842.500000       9.000000
75%    158742.750000      11.000000
max    211679.000000      29.000000

Number of unique words: 5039

Outcome distribution:
outcome
win     105801
lose    105801
Name: count, dtype: int64

Difficulty distribution:
difficulty
easy      70534
medium    70534
hard      70534
Name: count, dtype: int64

Word Length Distribution:
word_length
9     28602
8     28140
10    24906
7     23982
11    21042
6     18102

In [16]:
import pandas as pd

# # Replace this with the path to your Parquet file
# parquet_file_path = 'path/to/your/HangmanData.parquet'

# Read the Parquet file
df = pd.read_parquet(parquet_path)

# Display the first few rows of the DataFrame
df

Unnamed: 0,game_id,word,initial_state,final_state,guessed_states,guessed_letters,game_state,difficulty,outcome,word_length,won
0,193578,unproportionably,________________,unproportionably,"________________,______________l_,____________...","l,a,r,n,b,p,z,i,y,t,o,u",allMasked,easy,win,16,True
1,193579,unproportionably,________________,unproportionably,"________________,______________l_,__p__p______...","l,p,y,i,t,n,q,u,r,o,b,a",allMasked,easy,lose,16,True
2,193580,unproportionably,________________,unp_opo___on__ly,"________________,u_______________,u___________...","u,g,y,z,n,p,q,l,d,o,j,c",allMasked,medium,win,16,False
3,193581,unproportionably,________________,un_r___rti_na__y,"________________,_______________y,_________i__...","y,i,e,f,a,d,n,t,k,r,u,s,h",allMasked,medium,lose,16,False
4,193582,unproportionably,________________,____o_o__io_a_ly,"________________,_________i______,_________i__...","i,y,l,s,k,o,v,z,h,a,g",allMasked,hard,win,16,False
...,...,...,...,...,...,...,...,...,...,...,...
211597,115831,prediscourage,pred_scourage,prediscourage,"pred_scourage,prediscourage",i,nearEnd,easy,lose,13,True
211598,115832,prediscourage,pred_scourage,prediscourage,"pred_scourage,pred_scourage,prediscourage","x,i",nearEnd,medium,win,13,True
211599,115833,prediscourage,pred_scourage,prediscourage,"pred_scourage,prediscourage",i,nearEnd,medium,lose,13,True
211600,115834,prediscourage,pred_scourage,pred_scourage,"pred_scourage,pred_scourage,pred_scourage,pred...","j,q,t,y,k,l",nearEnd,hard,win,13,False


In [17]:
# Get the total number of rows (games) in the DataFrame
total_games = len(df)

print(f"Total number of games in the dataset: {total_games}")

# Additional checks you might want to perform:
# - Check for any null values or anomalies in the data
print(df.isnull().sum())

# - Get a summary of the DataFrame
print(df.describe())

# - Count the number of unique words or game states
unique_words = df['word'].nunique()
print(f"Number of unique words: {unique_words}")

# - Inspect the distribution of game outcomes, difficulties, etc.
print(df['outcome'].value_counts())
print(df['difficulty'].value_counts())
print(df['won'].value_counts())

Total number of games in the dataset: 211602
game_id            0
word               0
initial_state      0
final_state        0
guessed_states     0
guessed_letters    0
game_state         0
difficulty         0
outcome            0
word_length        0
won                0
dtype: int64
             game_id    word_length
count  211602.000000  211602.000000
mean   105842.724232       9.411064
std     61084.765488       3.100986
min        42.000000       1.000000
25%     52942.250000       7.000000
50%    105842.500000       9.000000
75%    158742.750000      11.000000
max    211679.000000      29.000000
Number of unique words: 5039
outcome
win     105801
lose    105801
Name: count, dtype: int64
difficulty
easy      70534
medium    70534
hard      70534
Name: count, dtype: int64
won
True     140814
False     70788
Name: count, dtype: int64


##### Dataset Creation and train test split

In [18]:
# Create datasets directly from the saved parquet files
hangman_dataset = HangmanDataset(parquet_path)
# valid_dataset = HangmanDataset(parquet_valid_path)

In [19]:
# Split the dataset into training and validation sets
train_dataset, valid_dataset = hangman_dataset.split(test_size=0.2)

In [20]:
def check_seq_len_distribution(dataset):
    distribution = {}
    for seq_len, indices in dataset.seq_len_index.items():
        distribution[seq_len] = len(indices)
    return distribution

def check_seq_len_distribution(dataset):
    distribution = {}
    for seq_len, indices in dataset.seq_len_index.items():
        distribution[seq_len] = len(indices)
    return distribution

# Check proportions
print(f"Total samples in original dataset: {len(hangman_dataset)}")
print(f"Samples in training dataset: {len(train_dataset)}")
print(f"Samples in validation dataset: {len(valid_dataset)}")

# Verify that the sum of train and validation samples equals the total samples
assert len(train_dataset) + len(valid_dataset) == len(hangman_dataset), "Mismatch in total sample count"

# Check sequence length distribution in each dataset
train_distribution = check_seq_len_distribution(train_dataset)
valid_distribution = check_seq_len_distribution(valid_dataset)

print("Training dataset sequence length distribution:", train_distribution)
print("Validation dataset sequence length distribution:", valid_distribution)

# Ensure each sequence length is represented in both datasets
assert set(train_distribution.keys()) == set(valid_distribution.keys()), "Mismatch in sequence length representation"

Total samples in original dataset: 211602
Samples in training dataset: 169273
Samples in validation dataset: 42329
Training dataset sequence length distribution: {13: 3523, 14: 2030, 12: 6014, 11: 8786, 15: 1017, 10: 12079, 8: 22252, 7: 25064, 9: 16881, 5: 10049, 4: 11620, 2: 25194, 3: 14766, 6: 9357, 17: 167, 16: 400, 18: 47, 19: 19, 21: 4, 20: 4}
Validation dataset sequence length distribution: {13: 881, 14: 508, 12: 1504, 11: 2197, 15: 255, 10: 3020, 8: 5564, 7: 6266, 9: 4221, 5: 2513, 4: 2906, 2: 6299, 3: 3692, 6: 2340, 17: 42, 16: 101, 18: 12, 19: 5, 21: 1, 20: 2}


In [21]:
def count_unique_seq_lens(dataset):
    # Count the unique sequence lengths in the dataset
    unique_seq_lens = len(dataset.seq_len_index)
    return unique_seq_lens

train_unique_seq_lens = count_unique_seq_lens(train_dataset)
valid_unique_seq_lens = count_unique_seq_lens(valid_dataset)

print(f"Unique sequence lengths in training dataset: {train_unique_seq_lens}")
print(f"Unique sequence lengths in validation dataset: {valid_unique_seq_lens}")

Unique sequence lengths in training dataset: 20
Unique sequence lengths in validation dataset: 20


In [22]:
hangman_dataset[0]

{'game_id': 193578,
 'word': 'unproportionably',
 'initial_state': ['________________'],
 'final_state': 'unproportionably',
 'guessed_states': ['________________',
  '______________l_',
  '____________a_l_',
  '___r___r____a_l_',
  '_n_r___r___na_l_',
  '_n_r___r___nabl_',
  '_npr_p_r___nabl_',
  '_npr_p_r___nabl_',
  '_npr_p_r_i_nabl_',
  '_npr_p_r_i_nably',
  '_npr_p_rti_nably',
  '_nproportionably',
  'unproportionably'],
 'guessed_letters': ['l',
  'a',
  'r',
  'n',
  'b',
  'p',
  'z',
  'i',
  'y',
  't',
  'o',
  'u'],
 'game_state': 'allMasked',
 'difficulty': 'easy',
 'outcome': 'win',
 'word_length': 16,
 'won': True}

In [23]:
import random

# Define the function simulate_game_progress here

# Simulate a game with the specified parameters
word = 'salicaceae'
initial_state = '__________'  # Adjusted from ['__________'] to '__________' to match expected input type
game_won, game_progress = simulate_game_progress(None, word, initial_state, None, None, None, difficulty='easy', outcome_preference='win')

# Extract the final state from the last entry in game_progress, if available
final_state = game_progress[-1][1] if game_progress else initial_state

# Extract guessed states and guessed letters from game_progress
guessed_states = [state for _, state, _ in game_progress]
guessed_letters = [guess for guess, _, _ in game_progress]

# Determine if the game was won based on the final state
won = final_state == word

# Output the results for comparison
print(f"Final State: {final_state}")
print(f"Guessed States: {guessed_states}")
print(f"Guessed Letters: {guessed_letters}")
print(f"Game Won: {won}")

# Compare the results with the expected outcomes
# Note: Direct comparison might not be feasible due to the randomness in guessed letters and states


Final State: salicaceae
Guessed States: ['__l_______', '__li______', 's_li______', 's_li___e_e', 's_lic_ce_e', 'salicaceae']
Guessed Letters: ['l', 'i', 's', 'e', 'c', 'a']
Game Won: True
