##### Imports

In [1]:
import sys
from pathlib import Path
import warnings
from torch.utils.data import Dataset

import warnings
import pandas as pd
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

import pickle
from pathlib import Path
from scr.dataset import *
# from scr.game import *
from scr.feature_engineering import *
# from scr.plot_utils import *
import gc
from scr.utils import print_scenarios


from pathlib import Path
import random
from collections import Counter, defaultdict
import pickle
from tqdm import tqdm
from torch.utils.data import Dataset

from scr.utils import read_words, save_words_to_file

import sys
# Custom library paths
sys.path.extend(['../', './scr'])

from scr.utils import set_seed
from scr.utils import read_words

set_seed(42)

import torch
import torch.nn as nn
from pathlib import Path
import random

torch.set_float32_matmul_precision('medium')

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Read and Shuffle Word List
word_list = read_words('data/words_250000_train.txt') # , limit=10000)
# word_list = read_words('data/250k.txt', limit=10000)

random.shuffle(word_list)

In [2]:
# import h5py
# import json

# # Sample Hangman game data
# games_data = [
#     {
#         'guessed_states': ['___b____e', 'a_aba___e', 'anaba__ne', 'anaba_ine'],
#         'guessed_letters': ['a', 'n', 'i', 's'],
#         'word': 'anabasine',
#         'initial_state': '___b_____',
#         'attributes': {'game_state': 'early', 'difficulty': 'easy', 'outcome': 'win', 
#                           'word_length': 9}
#     },
#     {
#         'guessed_states': ['__c______', '_a_a_____'],
#         'guessed_letters': ['c', 'a'],
#         'word': 'cacophony',
#         'initial_state': '__c______',
#         'attributes': {'game_state': 'mid', 'difficulty': 'hard', 'outcome': 'lose', 
#                             'word_length': 9}
#     }
    
#     # ... more games
# ]

# # Creating the HDF5 file
# with h5py.File('HangmanData.h5', 'w') as f:
#     for i, game in enumerate(games_data):
#         # Create a group for each game
#         game_group = f.create_group(f'game_{i+1}')

#         # Serialize and store each part of the game data
#         for key, value in game.items():
#             if isinstance(value, dict):
#                 # If the value is a dictionary, serialize it to JSON
#                 value = json.dumps(value)
#             game_group.create_dataset(key, data=value)

##### Data reading and Params Settings

In [3]:
#### Papermil if need
 
NUM_STRATIFIED_SAMPLES = 250_000 # This will be overwritten by Papermill

In [4]:
# Define the base directory
base_dataset_dir = Path("/media/sayem/510B93E12554BBD1/dataset/")

# Create a subdirectory for the stratified samples
stratified_samples_dir = base_dataset_dir / str(NUM_STRATIFIED_SAMPLES)
stratified_samples_dir.mkdir(parents=True, exist_ok=True)

print(stratified_samples_dir)

/media/sayem/510B93E12554BBD1/dataset/250000


In [5]:
# Splitting Dataset Function
import random

def split_dataset(word_list, train_ratio=0.8):
    total_words = len(word_list)
    train_size = int(total_words * train_ratio)
    random.shuffle(word_list)
    return word_list[:train_size], word_list[train_size:]

# Splitting the word list
train_words, test_words = split_dataset(word_list)

# Save split datasets to files
save_words_to_file(train_words, stratified_samples_dir / 'train_words.txt')
save_words_to_file(test_words, stratified_samples_dir / 'test_words.txt')

# Calculate Frequencies and Max Word Length
word_frequencies = calculate_word_frequencies(train_words)
char_frequency = calculate_char_frequencies(train_words)
max_word_length = max(len(word) for word in train_words)

In [6]:
len(train_words)

181840

##### Stratified Sample Generation

In [7]:
## we are taking starified samples from train_words

from scr.custom_sampler import \
    stratified_sample_by_length_and_frequency, \
        stratified_sample_by_length, stratified_sample_by_length_and_uniqueness

print(f'Number of Statrified samples: {NUM_STRATIFIED_SAMPLES}')

# sampled_words_by_length_and_frequency \
#     = stratified_sample_by_length_and_frequency(train_words, \
#     word_frequencies, \
#     NUM_STRATIFIED_SAMPLES)

sampled_words_by_length = stratified_sample_by_length_and_uniqueness(word_list, \
    NUM_STRATIFIED_SAMPLES)


print(len(sampled_words_by_length))

Number of Statrified samples: 250000
227300


##### Intial State Simulation Testing

In [8]:
from scr.game import *

In [9]:
# word = "mississippi"
word = "mythopoetize"
# word = "cat"

initial_states = process_word_for_six_states(word)

initial_states

{'allMasked': '____________',
 'early': '____o_o_____',
 'quarterRevealed': '__t_o_o_t___',
 'midRevealed': 'myt_o_o_t___',
 'midLateRevealed': 'mytho_oet__e',
 'lateRevealed': 'mythopoet_ze',
 'nearEnd': 'mythopoet_ze'}

In [10]:
len(initial_states)

7

In [11]:
initial_states

{'allMasked': '____________',
 'early': '____o_o_____',
 'quarterRevealed': '__t_o_o_t___',
 'midRevealed': 'myt_o_o_t___',
 'midLateRevealed': 'mytho_oet__e',
 'lateRevealed': 'mythopoet_ze',
 'nearEnd': 'mythopoet_ze'}

##### Dataset Generation: Simulation

In [12]:
word = "mississippi"

initial_states = process_word_for_six_states(word)

# print(initial_states)
# Print generated initial states
print("Generated Initial States: ", initial_states)

Generated Initial States:  {'allMasked': '___________', 'early': '__ss_ss____', 'quarterRevealed': '__ss_ss____', 'midRevealed': '_ississi__i', 'midLateRevealed': '_ississippi', 'lateRevealed': '_ississippi', 'nearEnd': '_ississippi'}


In [13]:
from scr.game import simulate_game_progress, \
    play_game_with_a_word, process_word

# Example word and initial state
# Example usage
word = "mississippi"
# word = "cat"

initial_states = process_word_for_six_states(word)

# print(initial_states)
# Print generated initial states
print("Generated Initial States:")
for state_name, initial_state in initial_states.items():
    # Simulate the game
    print(initial_state)
    print(f"For initial state: {initial_state}")
    won, game_progress = simulate_game_progress(
        model=None,  # Assuming model is not used in this example
        word=word, 
        initial_state=initial_state, 
        char_frequency={},  # Assuming char_frequency is not used in this example
        max_word_length=len(word), 
        device=None,  # Assuming device is not used in this example
        max_attempts=6, 
        normalize=True,
        difficulty="medium", 
        outcome_preference='win'
    )

    # Display game progress
    for step in game_progress:
        print(f"Guessed: '{step[0]}', New State: '{step[1]}', Correct: {step[2]}")

        # break

    # break

    # print("Game Result:", "Won" if won else "Lost")

Generated Initial States:
___________
For initial state: ___________
Guessed: 'c', New State: '___________', Correct: False
Guessed: 'd', New State: '___________', Correct: False
Guessed: 'o', New State: '___________', Correct: False
Guessed: 'g', New State: '___________', Correct: False
Guessed: 'i', New State: '_i__i__i__i', Correct: True
Guessed: 'j', New State: '_i__i__i__i', Correct: False
Guessed: 'q', New State: '_i__i__i__i', Correct: False
________pp_
For initial state: ________pp_
Guessed: 's', New State: '__ss_ss_pp_', Correct: True
Guessed: 'w', New State: '__ss_ss_pp_', Correct: False
Guessed: 'l', New State: '__ss_ss_pp_', Correct: False
Guessed: 'i', New State: '_ississippi', Correct: True
Guessed: 'm', New State: 'mississippi', Correct: True
________pp_
For initial state: ________pp_
Guessed: 'm', New State: 'm_______pp_', Correct: True
Guessed: 'l', New State: 'm_______pp_', Correct: False
Guessed: 'x', New State: 'm_______pp_', Correct: False
Guessed: 'z', New State: 

In [14]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm

# Function to process a batch of games
def process_batch_to_parquet(batch, file_path, start_game_counter):
    game_counter = start_game_counter
    data_for_parquet = []

    for game_data in batch:
        word, state_name, initial_state, difficulty, outcome, won, game_progress = game_data

        # Check if game_progress is not empty
        if not game_progress:
            continue  # Skip to the next game if game_progress is empty

        final_state = game_progress[-1][1]  # Adjust this based on your game_progress structure

        # Preparing data for storage
        guessed_states = [initial_state] + [state for _, state, _ in game_progress]
        guessed_letters = [letter for letter, _, _ in game_progress]

        # Add to data list for parquet
        data_for_parquet.append({
            'game_id': game_counter,
            'word': word,
            'initial_state': initial_state,
            'final_state': final_state,
            'guessed_states': ','.join(guessed_states[:-1]),
            'guessed_letters': ','.join(guessed_letters),
            'game_state': state_name,
            'difficulty': difficulty,
            'outcome': outcome,
            'word_length': len(word),
            'won': won
        })

        game_counter += 1

    # Create a DataFrame and write to Parquet
    df = pd.DataFrame(data_for_parquet)
    table = pa.Table.from_pandas(df)
    pq.write_table(table, file_path, compression='snappy')

    return game_counter

# Adjust the rest of the code to generate and process batches

# Function to generate a batch of game data
def generate_batch(start_index, end_index, sampled_words, game_states_func):
    batch = []
    for i in range(start_index, end_index):
        word = sampled_words[i]
        game_states = game_states_func(word)  # Your function to get game states
        for state_name, initial_state in game_states.items():
            for difficulty in ["easy", "medium", "hard"]:
                for outcome in ["win", "lose"]:
                    won, game_progress = simulate_game_progress(
                        model=None, word=word, initial_state=initial_state,
                        char_frequency=char_frequency, max_word_length=max_word_length,
                        device=device, max_attempts=6, normalize=True,
                        difficulty=difficulty, outcome_preference=outcome
                    )
                    batch.append((word, state_name, initial_state, \
                        difficulty, outcome, won, game_progress))
    return batch


# Main execution
batch_size = 1000  # Adjust based on your requirements
parquet_file_path = stratified_samples_dir / "HangmanData.parquet"


# ######################### Uncomment if necessary #####################################

game_counter = 1
total_words = len(sampled_words_by_length)

for start_index in tqdm(range(0, total_words, batch_size)):
    end_index = min(start_index + batch_size, total_words)
    batch = generate_batch(start_index, end_index, \
        sampled_words_by_length, process_word_for_six_states)
    game_counter = process_batch_to_parquet(batch, parquet_file_path, game_counter)

print(f"Total games processed: {game_counter - 1}")

# ######################### Uncomment if necessary #####################################

  0%|          | 0/228 [00:00<?, ?it/s]

100%|██████████| 228/228 [03:49<00:00,  1.01s/it]

Total games processed: 9546084





In [17]:
import pandas as pd

# # Replace this with the path to your Parquet file
# parquet_file_path = 'path/to/your/HangmanData.parquet'

# Read the Parquet file
df = pd.read_parquet(parquet_file_path)

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,game_id,word,initial_state,final_state,guessed_states,guessed_letters,game_state,difficulty,outcome,word_length,won
0,9533833,antivivisectionists,___________________,antivivisectionists,"___________________,_____________o_____,______...","o,s,t,c,v,n,i,e,a",allMasked,easy,win,19,True
1,9533834,antivivisectionists,___________________,antivivisectionists,"___________________,_n____________n____,_n____...","n,e,l,a,s,t,h,v,i,z,m,c,p,o",allMasked,easy,lose,19,True
2,9533835,antivivisectionists,___________________,__t_v_v_s__t____sts,"___________________,___________________,______...","y,g,s,h,t,f,v,u,r",allMasked,medium,win,19,False
3,9533836,antivivisectionists,___________________,antivivise_tionists,"___________________,___________________,______...","r,b,e,p,v,i,n,s,a,t,x,j,o,m",allMasked,medium,lose,19,False
4,9533837,antivivisectionists,___________________,__ti_i_i__cti__i_t_,"___________________,___________________,__t___...","g,t,y,i,c,w,b,m,z",allMasked,hard,win,19,False


In [18]:
# Get the total number of rows (games) in the DataFrame
total_games = len(df)

print(f"Total number of games in the dataset: {total_games}")

# Additional checks you might want to perform:
# - Check for any null values or anomalies in the data
print(df.isnull().sum())

# - Get a summary of the DataFrame
print(df.describe())

# - Count the number of unique words or game states
unique_words = df['word'].nunique()
print(f"Number of unique words: {unique_words}")

# - Inspect the distribution of game outcomes, difficulties, etc.
print(df['outcome'].value_counts())
print(df['difficulty'].value_counts())


Total number of games in the dataset: 12252
game_id            0
word               0
initial_state      0
final_state        0
guessed_states     0
guessed_letters    0
game_state         0
difficulty         0
outcome            0
word_length        0
won                0
dtype: int64
            game_id   word_length
count  1.225200e+04  12252.000000
mean   9.539958e+06     16.471596
std    3.536992e+03      6.614032
min    9.533833e+06      1.000000
25%    9.536896e+06     15.000000
50%    9.539958e+06     19.000000
75%    9.543021e+06     21.000000
max    9.546084e+06     29.000000
Number of unique words: 300
outcome
win     6126
lose    6126
Name: count, dtype: int64
difficulty
easy      4084
medium    4084
hard      4084
Name: count, dtype: int64


##### Reading Checking

In [None]:
from torch.utils.data import DataLoader
from scr.dataset import HangmanDataset # , custom_collate_fn

from scr.feature_engineering import process_batch_of_games

# Assuming HangmanDataset is defined as provided
hangman_dataset = HangmanDataset(parquet_file_path)  # Replace with your Parquet file path

hangman_dataset[0]

{'game_id': 9533701,
 'word': 'cachecache',
 'initial_state': ['__________'],
 'final_state': 'cachecache',
 'guessed_states': ['__________', '___h____h_', '___he___he', '_a_he_a_he'],
 'guessed_letters': ['h', 'e', 'a', 'c'],
 'game_state': 'allMasked',
 'difficulty': 'easy',
 'outcome': 'win',
 'word_length': 10,
 'won': False}

In [None]:
hangman_dataset[-1]

{'game_id': 9546084,
 'word': 'hydroxydesoxycorticosterone',
 'initial_state': ['hy_roxy_esoxycorticosterone'],
 'final_state': 'hydroxydesoxycorticosterone',
 'guessed_states': ['hy_roxy_esoxycorticosterone',
  'hy_roxy_esoxycorticosterone',
  'hy_roxy_esoxycorticosterone',
  'hy_roxy_esoxycorticosterone',
  'hy_roxy_esoxycorticosterone'],
 'guessed_letters': ['b', 'q', 'z', 'm', 'd'],
 'game_state': 'nearEnd',
 'difficulty': 'hard',
 'outcome': 'lose',
 'word_length': 27,
 'won': False}

In [None]:
len(hangman_dataset)

12384

In [None]:
# from torch.utils.data import DataLoader
# from scr.dataset import HangmanDataset # , custom_collate_fn

# from scr.feature_engineering import process_batch_of_games

# # Assuming HangmanDataset is defined as provided
# hangman_dataset = HangmanDataset(parquet_file_path)  # Replace with your Parquet file path

# dataloader = DataLoader(hangman_dataset, batch_size=32, \
#     collate_fn=custom_collate_fn, shuffle=True)

# for batch in dataloader:
#     # Now, directly use the keys of the batch dictionary
#     states = batch['guessed_states']
#     guesses = batch['guessed_letters']
#     max_seq_length = batch['max_seq_len']
#     original_seq_lengths =  batch['original_seq_lengths']

#     states, guesses, max_seq_length, original_seq_lengths = batch

#     assert len(states) == len(guesses)

#     print(len(states))
#     print(len(guesses))
#     print(max_seq_length)
#     print(original_seq_lengths)

#     batch_states, batch_missed_chars = process_batch_of_games(states, guesses,
#                            char_frequency, max_word_length, max_seq_length)

#     print(batch_states.shape)

#     print(batch_missed_chars.shape)

#     # return batch_states, batch_missed_chars, max_seq_length, original_seq_lengths 

#     break

In [None]:
# word = 'sassa_ras_s'
# fets = build_feature_set(word, char_frequency, \
#     max_word_length, ngram_n=3, normalize=True)