##### Imports

In [1]:
import sys
from pathlib import Path
import warnings
from torch.utils.data import Dataset

import warnings
import pandas as pd
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

import pickle
from pathlib import Path
from scr.dataset import *
# from scr.game import *
from scr.feature_engineering import *
# from scr.plot_utils import *
import gc
from scr.utils import print_scenarios


from pathlib import Path
import random
from collections import Counter, defaultdict
import pickle
from tqdm import tqdm
from torch.utils.data import Dataset

from scr.utils import read_words, save_words_to_file

import sys
# Custom library paths
sys.path.extend(['../', './scr'])

from scr.utils import set_seed
from scr.utils import read_words

set_seed(42)

import torch
import torch.nn as nn
from pathlib import Path
import random

torch.set_float32_matmul_precision('medium')

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Read and Shuffle Word List
word_list = read_words('data/words_250000_train.txt') # , limit=10000)
# word_list = read_words('data/250k.txt', limit=10000)

random.shuffle(word_list)

In [2]:
# import h5py
# import json

# # Sample Hangman game data
# games_data = [
#     {
#         'guessed_states': ['___b____e', 'a_aba___e', 'anaba__ne', 'anaba_ine'],
#         'guessed_letters': ['a', 'n', 'i', 's'],
#         'word': 'anabasine',
#         'initial_state': '___b_____',
#         'attributes': {'game_state': 'early', 'difficulty': 'easy', 'outcome': 'win', 
#                           'word_length': 9}
#     },
#     {
#         'guessed_states': ['__c______', '_a_a_____'],
#         'guessed_letters': ['c', 'a'],
#         'word': 'cacophony',
#         'initial_state': '__c______',
#         'attributes': {'game_state': 'mid', 'difficulty': 'hard', 'outcome': 'lose', 
#                             'word_length': 9}
#     }
    
#     # ... more games
# ]

# # Creating the HDF5 file
# with h5py.File('HangmanData.h5', 'w') as f:
#     for i, game in enumerate(games_data):
#         # Create a group for each game
#         game_group = f.create_group(f'game_{i+1}')

#         # Serialize and store each part of the game data
#         for key, value in game.items():
#             if isinstance(value, dict):
#                 # If the value is a dictionary, serialize it to JSON
#                 value = json.dumps(value)
#             game_group.create_dataset(key, data=value)

##### Data reading and Params Settings

In [3]:
#### Papermil if need
 
NUM_STRATIFIED_SAMPLES = 1_000 # This will be overwritten by Papermill

# Define the base directory
base_dataset_dir = Path("/media/sayem/510B93E12554BBD1/dataset/")

# Create a subdirectory for the stratified samples
stratified_samples_dir = base_dataset_dir / str(NUM_STRATIFIED_SAMPLES)
stratified_samples_dir.mkdir(parents=True, exist_ok=True)

print(stratified_samples_dir)

/media/sayem/510B93E12554BBD1/dataset/1000


In [4]:
# Splitting Dataset Function
import random

def split_dataset(word_list, train_ratio=0.8):
    total_words = len(word_list)
    train_size = int(total_words * train_ratio)
    random.shuffle(word_list)
    return word_list[:train_size], word_list[train_size:]

# Splitting the word list
train_words, test_words = split_dataset(word_list)

# Save split datasets to files
save_words_to_file(train_words, stratified_samples_dir / 'train_words.txt')
save_words_to_file(test_words, stratified_samples_dir / 'test_words.txt')

# Calculate Frequencies and Max Word Length
word_frequencies = calculate_word_frequencies(train_words)
char_frequency = calculate_char_frequencies(train_words)
max_word_length = max(len(word) for word in train_words)

In [5]:
len(train_words)

181840

##### Stratified Sample Generation

In [6]:
## we are taking starified samples from train_words

from scr.custom_sampler import \
    stratified_sample_by_length_and_frequency, \
        stratified_sample_by_length, stratified_sample_by_length_and_uniqueness

print(f'Number of Statrified samples: {NUM_STRATIFIED_SAMPLES}')

# sampled_words_by_length_and_frequency \
#     = stratified_sample_by_length_and_frequency(train_words, \
#     word_frequencies, \
#     NUM_STRATIFIED_SAMPLES)

sampled_words_by_length = stratified_sample_by_length_and_uniqueness(word_list, \
    NUM_STRATIFIED_SAMPLES)


print(len(sampled_words_by_length))

Number of Statrified samples: 1000
1091


##### Intial State Simulation Testing

In [7]:
from scr.game import *

In [8]:
# word = "mississippi"
word = "mythopoetize"
# word = "cat"

initial_states = process_word_for_six_states(word)

initial_states

{'allMasked': '____________',
 'early': '_______e___e',
 'quarterRevealed': '__t____et__e',
 'midRevealed': '_yt____eti_e',
 'midLateRevealed': '_yt__p_etize',
 'lateRevealed': 'myth_p_etize',
 'nearEnd': 'myth_p_etize'}

In [9]:
len(initial_states)

7

In [10]:
initial_states

{'allMasked': '____________',
 'early': '_______e___e',
 'quarterRevealed': '__t____et__e',
 'midRevealed': '_yt____eti_e',
 'midLateRevealed': '_yt__p_etize',
 'lateRevealed': 'myth_p_etize',
 'nearEnd': 'myth_p_etize'}

##### Dataset Generation: Simulation

In [11]:
word = "mississippi"

initial_states = process_word_for_six_states(word)

# print(initial_states)
# Print generated initial states
print("Generated Initial States: ", initial_states)

Generated Initial States:  {'allMasked': '___________', 'early': '________pp_', 'quarterRevealed': '________pp_', 'midRevealed': 'm_______pp_', 'midLateRevealed': 'mi__i__ippi', 'lateRevealed': 'mi__i__ippi', 'nearEnd': 'mi__i__ippi'}


In [12]:
from scr.game import simulate_game_progress, \
    play_game_with_a_word, process_word

# Example word and initial state
# Example usage
word = "mississippi"
# word = "cat"

initial_states = process_word_for_six_states(word)

# print(initial_states)
# Print generated initial states
print("Generated Initial States:")
for state_name, initial_state in initial_states.items():
    # Simulate the game
    print(initial_state)
    print(f"For initial state: {initial_state}")
    won, game_progress = simulate_game_progress(
        model=None,  # Assuming model is not used in this example
        word=word, 
        initial_state=initial_state, 
        char_frequency={},  # Assuming char_frequency is not used in this example
        max_word_length=len(word), 
        device=None,  # Assuming device is not used in this example
        max_attempts=6, 
        normalize=True,
        difficulty="medium", 
        outcome_preference='win'
    )

    # Display game progress
    for step in game_progress:
        print(f"Guessed: '{step[0]}', New State: '{step[1]}', Correct: {step[2]}")

        # break

    # break

    # print("Game Result:", "Won" if won else "Lost")

Generated Initial States:
___________
For initial state: ___________
Guessed: 'n', New State: '___________', Correct: False
Guessed: 'h', New State: '___________', Correct: False
Guessed: 't', New State: '___________', Correct: False
Guessed: 'd', New State: '___________', Correct: False
Guessed: 'b', New State: '___________', Correct: False
Guessed: 'm', New State: 'm__________', Correct: True
Guessed: 'c', New State: 'm__________', Correct: False
________pp_
For initial state: ________pp_
Guessed: 't', New State: '________pp_', Correct: False
Guessed: 'g', New State: '________pp_', Correct: False
Guessed: 'i', New State: '_i__i__ippi', Correct: True
Guessed: 'y', New State: '_i__i__ippi', Correct: False
Guessed: 'w', New State: '_i__i__ippi', Correct: False
Guessed: 'k', New State: '_i__i__ippi', Correct: False
Guessed: 'f', New State: '_i__i__ippi', Correct: False
________pp_
For initial state: ________pp_
Guessed: 's', New State: '__ss_ss_pp_', Correct: True
Guessed: 'i', New State

##### Writing Parquet

In [13]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm

# Assuming functions 'simulate_game_progress' and 'process_word_for_six_states' are defined elsewhere

def process_batch_to_parquet(batch, file_path, start_game_counter):
    game_counter = start_game_counter
    data_for_parquet = []

    for game_data in batch:
        word, state_name, initial_state, difficulty, outcome, won, game_progress = game_data
        if not game_progress:
            continue

        final_state = game_progress[-1][1]  # Get the final state from game_progress

        guessed_states = [initial_state] + [state for _, state, _ in game_progress]
        guessed_letters = [letter for letter, _, _ in game_progress]

        data_for_parquet.append({
            'game_id': game_counter,
            'word': word,
            'initial_state': initial_state,
            'final_state': final_state,
            'guessed_states': ','.join(guessed_states[:-1]),
            'guessed_letters': ','.join(guessed_letters),
            'game_state': state_name,
            'difficulty': difficulty,
            'outcome': outcome,
            'word_length': len(word),
            'won': won
        })

        game_counter += 1

    df = pd.DataFrame(data_for_parquet)
    table = pa.Table.from_pandas(df)
    pq.write_to_dataset(table, root_path=file_path, compression='snappy') #, existing_data_behavior='append')

    return game_counter

def generate_batch(start_index, end_index, sampled_words, game_states_func):
    batch = []
    for i in range(start_index, end_index):
        word = sampled_words[i]
        game_states = game_states_func(word)
        for state_name, initial_state in game_states.items():
            for difficulty in ["easy", "medium", "hard"]:
                for outcome in ["win", "lose"]:
                    won, game_progress = simulate_game_progress(
                        model=None, word=word, initial_state=initial_state,
                        char_frequency=char_frequency, max_word_length=max_word_length,
                        device=device, max_attempts=6, normalize=True,
                        difficulty=difficulty, outcome_preference=outcome
                    )
                    batch.append((word, state_name, initial_state, difficulty, outcome, won, game_progress))
    return batch

def main_execution(sampled_words, game_states_func, batch_size, parquet_file_path):
    game_counter = 1
    total_words = len(sampled_words)

    print(f"Total number of words: ", total_words)

    for start_index in tqdm(range(0, total_words, batch_size), desc="Processing Batches"):
        end_index = min(start_index + batch_size, total_words)
        batch = generate_batch(start_index, end_index, sampled_words, game_states_func)
        game_counter = process_batch_to_parquet(batch, parquet_file_path, game_counter)

    print(f"Total games processed: {game_counter - 1}")

# Execute the main function
# Main execution
batch_size = 1000  # Adjust based on your requirements

parquet_file_path = Path(stratified_samples_dir) / "parquets"
parquet_file_path.mkdir(exist_ok=True, parents=True)

main_execution(sampled_words_by_length, process_word_for_six_states, batch_size, parquet_file_path)

Total number of words:  1091


Processing Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Processing Batches: 100%|██████████| 2/2 [00:02<00:00,  1.09s/it]

Total games processed: 45738





In [14]:
parquet_file_path

PosixPath('/media/sayem/510B93E12554BBD1/dataset/1000/parquets')

In [15]:
import pandas as pd

# # Replace this with the path to your Parquet file
# parquet_file_path = 'path/to/your/HangmanData.parquet'

# Read the Parquet file
df = pd.read_parquet(parquet_file_path)

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,game_id,word,initial_state,final_state,guessed_states,guessed_letters,game_state,difficulty,outcome,word_length,won
0,42001,strawberryraspberry,___________________,strawberryraspberry,"___________________,______e________e___,______...","e,y,p,t,a,w,b,r,s",allMasked,easy,win,19,True
1,42002,strawberryraspberry,___________________,strawberryraspberry,"___________________,__r____rr_r_____rr_,__r___...","r,y,g,b,p,a,s,u,e,t,w",allMasked,easy,lose,19,True
2,42003,strawberryraspberry,___________________,s_raw_erryras__erry,"___________________,s___________s______,s__a__...","s,a,e,y,i,g,d,u,w,r,o,c",allMasked,medium,win,19,False
3,42004,strawberryraspberry,___________________,strawberryraspberry,"___________________,______e________e___,_t____...","e,t,r,u,o,y,p,b,w,v,s,a",allMasked,medium,lose,19,True
4,42005,strawberryraspberry,___________________,s_r__berryr_spberry,"___________________,_____________p_____,______...","p,q,o,y,m,g,b,s,e,r,l,c",allMasked,hard,win,19,False


In [16]:
# Get the total number of rows (games) in the DataFrame
total_games = len(df)

print(f"Total number of games in the dataset: {total_games}")

# Additional checks you might want to perform:
# - Check for any null values or anomalies in the data
print(df.isnull().sum())

# - Get a summary of the DataFrame
print(df.describe())

# - Count the number of unique words or game states
unique_words = df['word'].nunique()
print(f"Number of unique words: {unique_words}")

# - Inspect the distribution of game outcomes, difficulties, etc.
print(df['outcome'].value_counts())
print(df['difficulty'].value_counts())


Total number of games in the dataset: 2405109
game_id            0
word               0
initial_state      0
final_state        0
guessed_states     0
guessed_letters    0
game_state         0
difficulty         0
outcome            0
word_length        0
won                0
dtype: int64
            game_id   word_length
count  2.405109e+06  2.405109e+06
mean   2.287243e+04  1.002929e+01
std    1.320532e+04  4.003819e+00
min    1.000000e+00  1.000000e+00
25%    1.143400e+04  7.000000e+00
50%    2.287500e+04  9.000000e+00
75%    3.430800e+04  1.200000e+01
max    4.573800e+04  2.900000e+01
Number of unique words: 2141
outcome
win     1202714
lose    1202395
Name: count, dtype: int64
difficulty
medium    801774
hard      801723
easy      801612
Name: count, dtype: int64


##### Reading Checking

In [17]:
import os
from sklearn.model_selection import train_test_split

# Assuming the HangmanDataset class is already defined as above

def split_dataset(directory, test_size=0.20, random_state=42):
    # Get the list of all parquet files
    parquet_files = sorted([f for f in os.listdir(directory) if f.endswith('.parquet')])

    # Split the list of files
    train_files, valid_files = train_test_split(parquet_files, \
        test_size=test_size, random_state=random_state)

    # Construct the full path for each file
    train_files = [os.path.join(directory, f) for f in train_files]
    valid_files = [os.path.join(directory, f) for f in valid_files]

    return train_files, valid_files

# # Directory containing the dataset
# parquet_file_path = '/media/sayem/510B93E12554BBD1/dataset/250000/parquets'

# Split the dataset
train_files, valid_files = split_dataset(parquet_file_path)

# Create dataset instances
train_dataset = HangmanDataset(train_files)
valid_dataset = HangmanDataset(valid_files)

In [18]:
train_dataset[0]

{'game_id': 4389,
 'word': 'plectron',
 'initial_state': ['__ec_r_n'],
 'final_state': 'plectron',
 'guessed_states': ['__ec_r_n',
  '__ectr_n',
  'p_ectr_n',
  'p_ectron',
  'p_ectron',
  'p_ectron'],
 'guessed_letters': ['t', 'p', 'o', 'w', 'f', 'l'],
 'game_state': 'midRevealed',
 'difficulty': 'medium',
 'outcome': 'win',
 'word_length': 8,
 'won': False}

In [19]:
len(train_dataset)

1990075

In [20]:
# Assuming hangman_dataset is already loaded and its total size is known
total_size = len(train_dataset)
last_element_index = total_size - 1

# Access the last element
last_data = train_dataset[last_element_index]

In [21]:
last_data

{'game_id': 16948,
 'word': 'cocarde',
 'initial_state': ['c_ca_d_'],
 'final_state': 'cocarde',
 'guessed_states': ['c_ca_d_',
  'c_ca_d_',
  'c_ca_d_',
  'c_card_',
  'c_card_',
  'c_carde'],
 'guessed_letters': ['p', 'y', 'r', 'v', 'e', 'o'],
 'game_state': 'midRevealed',
 'difficulty': 'medium',
 'outcome': 'lose',
 'word_length': 7,
 'won': False}