##### Imports

In [1]:
import sys
from pathlib import Path
import warnings
from torch.utils.data import Dataset

import warnings
import pandas as pd
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

import pickle
from pathlib import Path
from scr.dataset import *
# from scr.game import *
from scr.feature_engineering import *
# from scr.plot_utils import *
import gc
from scr.utils import print_scenarios


from pathlib import Path
import random
from collections import Counter, defaultdict
import pickle
from tqdm import tqdm
from torch.utils.data import Dataset

from scr.utils import read_words, save_words_to_file

import sys
# Custom library paths
sys.path.extend(['../', './scr'])

from scr.utils import set_seed
from scr.utils import read_words

set_seed(42)

import torch
import torch.nn as nn
from pathlib import Path
import random

torch.set_float32_matmul_precision('medium')

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Read and Shuffle Word List
word_list = read_words('data/words_250000_train.txt') # , limit=10000)
# word_list = read_words('data/250k.txt', limit=10000)

random.shuffle(word_list)


# Calculate Frequencies and Max Word Length
word_frequencies = calculate_word_frequencies(word_list)
char_frequency = calculate_char_frequencies(word_list)
max_word_length = max(len(word) for word in word_list)

In [2]:
# import h5py
# import json

# # Sample Hangman game data
# games_data = [
#     {
#         'guessed_states': ['___b____e', 'a_aba___e', 'anaba__ne', 'anaba_ine'],
#         'guessed_letters': ['a', 'n', 'i', 's'],
#         'word': 'anabasine',
#         'initial_state': '___b_____',
#         'attributes': {'game_state': 'early', 'difficulty': 'easy', 'outcome': 'win', 
#                           'word_length': 9}
#     },
#     {
#         'guessed_states': ['__c______', '_a_a_____'],
#         'guessed_letters': ['c', 'a'],
#         'word': 'cacophony',
#         'initial_state': '__c______',
#         'attributes': {'game_state': 'mid', 'difficulty': 'hard', 'outcome': 'lose', 
#                             'word_length': 9}
#     }
    
#     # ... more games
# ]

# # Creating the HDF5 file
# with h5py.File('HangmanData.h5', 'w') as f:
#     for i, game in enumerate(games_data):
#         # Create a group for each game
#         game_group = f.create_group(f'game_{i+1}')

#         # Serialize and store each part of the game data
#         for key, value in game.items():
#             if isinstance(value, dict):
#                 # If the value is a dictionary, serialize it to JSON
#                 value = json.dumps(value)
#             game_group.create_dataset(key, data=value)

##### Data reading and Params Settings

In [3]:
import shutil
from pathlib import Path

NUM_STRATIFIED_SAMPLES = 100 # This will be overwritten by Papermill

# Define the base directory and the paths for training and validation parquet files
base_dataset_dir = Path("/media/sayem/510B93E12554BBD1/dataset/")

stratified_samples_dir = base_dataset_dir / str(NUM_STRATIFIED_SAMPLES)

parquet_train_path = stratified_samples_dir / 'train_parquets'
parquet_valid_path = stratified_samples_dir / 'valid_parquets'

# Function to delete and recreate a directory
def recreate_directory(path):
    if path.exists():
        shutil.rmtree(path)  # Delete the directory and its contents
    path.mkdir(parents=True)  # Create the directory

# Recreate the train and valid directories
recreate_directory(parquet_train_path)
recreate_directory(parquet_valid_path)

print(f"Directories '{parquet_train_path}' and '{parquet_valid_path}' have been recreated.")

Directories '/media/sayem/510B93E12554BBD1/dataset/100/train_parquets' and '/media/sayem/510B93E12554BBD1/dataset/100/valid_parquets' have been recreated.


In [4]:
len(word_list)

227300

##### Stratified Sample Generation

In [5]:
## we are taking starified samples from train_words

from scr.custom_sampler import \
    stratified_sample_by_length_and_frequency, \
        stratified_sample_by_length, stratified_sample_by_length_and_uniqueness

print(f'Number of Statrified samples: {NUM_STRATIFIED_SAMPLES}')

# sampled_words_by_length_and_frequency \
#     = stratified_sample_by_length_and_frequency(train_words, \
#     word_frequencies, \
#     NUM_STRATIFIED_SAMPLES)

sampled_words_by_length = stratified_sample_by_length_and_uniqueness(word_list, \
    NUM_STRATIFIED_SAMPLES)


print(len(sampled_words_by_length))

Number of Statrified samples: 100
230


##### Intial State Simulation Testing

In [6]:
from scr.game import *

In [7]:
# word = "mississippi"
word = "mythopoetize"
# word = "cat"

initial_states = process_word_for_six_states(word)

initial_states

{'allMasked': '____________',
 'early': '____o_o_____',
 'quarterRevealed': '____o_o__i__',
 'midRevealed': '_y__o_o__iz_',
 'midLateRevealed': 'my__opo__iz_',
 'lateRevealed': 'myt_opoetize',
 'nearEnd': 'myt_opoetize'}

In [8]:
len(initial_states)

7

In [9]:
initial_states

{'allMasked': '____________',
 'early': '____o_o_____',
 'quarterRevealed': '____o_o__i__',
 'midRevealed': '_y__o_o__iz_',
 'midLateRevealed': 'my__opo__iz_',
 'lateRevealed': 'myt_opoetize',
 'nearEnd': 'myt_opoetize'}

##### Dataset Generation: Simulation

In [10]:
word = "mississippi"

initial_states = process_word_for_six_states(word)

# print(initial_states)
# Print generated initial states
print("Generated Initial States: ", initial_states)

Generated Initial States:  {'allMasked': '___________', 'early': '________pp_', 'quarterRevealed': '________pp_', 'midRevealed': '_i__i__ippi', 'midLateRevealed': '_ississippi', 'lateRevealed': '_ississippi', 'nearEnd': '_ississippi'}


In [11]:
from scr.game import simulate_game_progress, \
    play_game_with_a_word, process_word

# Example word and initial state
# Example usage
word = "mississippi"
# word = "cat"

initial_states = process_word_for_six_states(word)

# print(initial_states)
# Print generated initial states
print("Generated Initial States:")
for state_name, initial_state in initial_states.items():
    # Simulate the game
    print(initial_state)
    print(f"For initial state: {initial_state}")
    won, game_progress = simulate_game_progress(
        model=None,  # Assuming model is not used in this example
        word=word, 
        initial_state=initial_state, 
        char_frequency={},  # Assuming char_frequency is not used in this example
        max_word_length=len(word), 
        device=None,  # Assuming device is not used in this example
        max_attempts=6, 
        normalize=True,
        difficulty="medium", 
        outcome_preference='win'
    )

    # Display game progress
    for step in game_progress:
        print(f"Guessed: '{step[0]}', New State: '{step[1]}', Correct: {step[2]}")

        # break

    # break

    # print("Game Result:", "Won" if won else "Lost")

Generated Initial States:
___________
For initial state: ___________
Guessed: 's', New State: '__ss_ss____', Correct: True
Guessed: 'r', New State: '__ss_ss____', Correct: False
Guessed: 'l', New State: '__ss_ss____', Correct: False
Guessed: 'p', New State: '__ss_ss_pp_', Correct: True
Guessed: 'g', New State: '__ss_ss_pp_', Correct: False
Guessed: 'm', New State: 'm_ss_ss_pp_', Correct: True
Guessed: 'i', New State: 'mississippi', Correct: True
__ss_ss____
For initial state: __ss_ss____
Guessed: 'h', New State: '__ss_ss____', Correct: False
Guessed: 'o', New State: '__ss_ss____', Correct: False
Guessed: 'z', New State: '__ss_ss____', Correct: False
Guessed: 'm', New State: 'm_ss_ss____', Correct: True
Guessed: 'i', New State: 'mississi__i', Correct: True
Guessed: 'c', New State: 'mississi__i', Correct: False
Guessed: 'b', New State: 'mississi__i', Correct: False
Guessed: 'e', New State: 'mississi__i', Correct: False
__ss_ss____
For initial state: __ss_ss____
Guessed: 'p', New State: '

##### Writing Parquet

In [12]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Assuming the function 'process_word_for_six_states' is defined elsewhere
from scr.game import simulate_game_progress, process_word_for_six_states

def process_batch_to_parquet(batch, file_path, start_game_counter):
    game_counter = start_game_counter
    data_for_parquet = []

    for game_data in batch:
        word, state_name, initial_state, difficulty, outcome, won, game_progress = game_data
        if not game_progress:
            continue

        final_state = game_progress[-1][1]
        guessed_states = [initial_state] + [state for _, state, _ in game_progress]
        guessed_letters = [letter for letter, _, _ in game_progress]

        data_for_parquet.append({
            'game_id': game_counter,
            'word': word,
            'initial_state': initial_state,
            'final_state': final_state,
            'guessed_states': ','.join(guessed_states[:-1]),
            'guessed_letters': ','.join(guessed_letters),
            'game_state': state_name,
            'difficulty': difficulty,
            'outcome': outcome,
            'word_length': len(word),
            'won': won
        })

        game_counter += 1

    df = pd.DataFrame(data_for_parquet)
    table = pa.Table.from_pandas(df)
    pq.write_to_dataset(table, root_path=file_path, compression='snappy')

    return game_counter

def generate_batch_for_word(word):
    batch = []
    initial_states = process_word_for_six_states(word)
    games_generated = 0  # Counter for games generated

    for state_name, initial_state in initial_states.items():
        for difficulty in ["easy", "medium", "hard"]:
            for outcome in ["win", "lose"]:
                won, game_progress = simulate_game_progress(
                    model=None, word=word, initial_state=initial_state,
                    char_frequency={}, max_word_length=len(word),
                    device=None, max_attempts=6, normalize=True,
                    difficulty=difficulty, outcome_preference=outcome
                )
                batch.append((word, state_name, initial_state, difficulty, outcome, won, game_progress))
                games_generated += 1  # Increment game counter

    return batch, games_generated


def main_execution(words, parquet_train_path, parquet_valid_path, test_size=0.20):
    train_game_counter = 0
    valid_game_counter = 0
    games_per_word = 6 * 3 * 2  # 6 states, 3 difficulties, 2 outcomes

    train_words, valid_words = train_test_split(words, test_size=test_size)

    # Process each word set
    for word_set, path in [(train_words, parquet_train_path), (valid_words, parquet_valid_path)]:
        total_words_processed = 0
        total_games_generated = 0

        for word in tqdm(word_set, desc="Processing Words"):
            batch, games_generated = generate_batch_for_word(word)

            # Update word count and game count
            total_words_processed += 1
            total_games_generated += games_generated

            # Update game counters
            if word_set is train_words:
                train_game_counter += games_generated
                process_batch_to_parquet(batch, path, train_game_counter)
            else:
                valid_game_counter += games_generated
                process_batch_to_parquet(batch, path, valid_game_counter)

    print(f"Final Total games processed in training set: {train_game_counter}")
    print(f"Final Total games processed in validation set: {valid_game_counter}")

# Execute the main function
main_execution(sampled_words_by_length, parquet_train_path, parquet_valid_path, test_size=0.20)


Processing Words: 100%|██████████| 184/184 [00:02<00:00, 82.55it/s]
Processing Words: 100%|██████████| 46/46 [00:00<00:00, 82.79it/s]

Final Total games processed in training set: 7728
Final Total games processed in validation set: 1932





##### Checking Train

In [13]:
import pandas as pd
from pathlib import Path

# Assuming parquet_train_path is already defined as a Path object
# If not, define it here
# parquet_train_path = Path('path_to_your_train_parquet_directory')

# Use glob to find all Parquet files in the folder
parquet_files = parquet_train_path.glob('*.parquet')

# Count the number of files
file_count = sum(1 for _ in parquet_files)

print(f"Number of Parquet files: {file_count}")

Number of Parquet files: 184


In [14]:
import pandas as pd
from pathlib import Path


# Find all Parquet files in the directory
parquet_files = list(parquet_train_path.glob('*.parquet'))

if parquet_files:
    total_game_sequences = 0

    # Iterate over each file and sum the number of game sequences
    for file in parquet_files:
        df = pd.read_parquet(file)
        total_game_sequences += len(df)

    print(f"Total number of game sequences across all files: {total_game_sequences}")
else:
    print("No Parquet files found in the specified directory.")


Total number of game sequences across all files: 7668


In [15]:
import pandas as pd
import glob
from pathlib import Path

# Use glob to find all Parquet files in the folder
parquet_files = parquet_train_path.glob('*.parquet')

# Read and concatenate all Parquet files into a single DataFrame
df = pd.concat([pd.read_parquet(file) for file in parquet_files], ignore_index=True)

# # Display the first few rows of the DataFrame
# print(df.head())

# Get the total number of rows (games) in the DataFrame
total_games = len(df)
print(f"Total number of games in the dataset: {total_games}")

# Additional checks and summary statistics
print("Null values in each column:")
print(df.isnull().sum())

print("\nSummary statistics:")
print(df.describe())

# Count the number of unique words or game states
unique_words = df['word'].nunique()
print(f"\nNumber of unique words: {unique_words}")

# Inspect the distribution of game outcomes, difficulties, etc.
print("\nOutcome distribution:")
print(df['outcome'].value_counts())

print("\nDifficulty distribution:")
print(df['difficulty'].value_counts())


Total number of games in the dataset: 7668
Null values in each column:
game_id            0
word               0
initial_state      0
final_state        0
guessed_states     0
guessed_letters    0
game_state         0
difficulty         0
outcome            0
word_length        0
won                0
dtype: int64

Summary statistics:
           game_id  word_length
count  7668.000000  7668.000000
mean   3912.241784    13.090767
std    2233.193521     5.716223
min      42.000000     1.000000
25%    1958.750000     8.000000
50%    3923.500000    12.000000
75%    5840.250000    18.000000
max    7769.000000    27.000000

Number of unique words: 184

Outcome distribution:
outcome
win     3834
lose    3834
Name: count, dtype: int64

Difficulty distribution:
difficulty
easy      2556
medium    2556
hard      2556
Name: count, dtype: int64


In [16]:
import pandas as pd

# # Replace this with the path to your Parquet file
# parquet_file_path = 'path/to/your/HangmanData.parquet'

# Read the Parquet file
df = pd.read_parquet(parquet_train_path)

# Display the first few rows of the DataFrame
df

Unnamed: 0,game_id,word,initial_state,final_state,guessed_states,guessed_letters,game_state,difficulty,outcome,word_length,won
0,6006,overindustrializing,___________________,overindustrializing,"___________________,_________t_________,___r__...","t,r,h,m,q,a,o,f,e,s,i,l,z,n,d,v,u,g",allMasked,easy,win,19,True
1,6007,overindustrializing,___________________,overindustrializing,"___________________,_________t_________,_____n...","t,n,a,s,f,r,i,m,u,z,w,o,c,e,d,h,g,v,l",allMasked,easy,lose,19,True
2,6008,overindustrializing,___________________,o__r_ndu_tr_a____n_,"___________________,_________t_________,______...","t,m,r,a,d,o,b,u,w,n,k,p,h",allMasked,medium,win,19,False
3,6009,overindustrializing,___________________,overindustrializing,"___________________,_________t_________,______...","t,u,s,y,i,v,b,p,q,l,a,o,d,n,e,g,z,h,r",allMasked,medium,lose,19,True
4,6010,overindustrializing,___________________,__e__n__s___a____n_,"___________________,_____n___________n_,_____n...","n,k,e,b,a,p,s,j,x,f",allMasked,hard,win,19,False
...,...,...,...,...,...,...,...,...,...,...,...
7663,3775,scientificophilosophical,scientificophi_osophica_,scientificophilosophical,scientificophi_osophica_,l,nearEnd,easy,lose,24,True
7664,3776,scientificophilosophical,scientificophi_osophica_,scientificophilosophical,scientificophi_osophica_,l,nearEnd,medium,win,24,True
7665,3777,scientificophilosophical,scientificophi_osophica_,scientificophilosophical,"scientificophi_osophica_,scientificophi_osophica_","z,l",nearEnd,medium,lose,24,True
7666,3778,scientificophilosophical,scientificophi_osophica_,scientificophilosophical,scientificophi_osophica_,l,nearEnd,hard,win,24,True


In [17]:
# Get the total number of rows (games) in the DataFrame
total_games = len(df)

print(f"Total number of games in the dataset: {total_games}")

# Additional checks you might want to perform:
# - Check for any null values or anomalies in the data
print(df.isnull().sum())

# - Get a summary of the DataFrame
print(df.describe())

# - Count the number of unique words or game states
unique_words = df['word'].nunique()
print(f"Number of unique words: {unique_words}")

# - Inspect the distribution of game outcomes, difficulties, etc.
print(df['outcome'].value_counts())
print(df['difficulty'].value_counts())

Total number of games in the dataset: 7668
game_id            0
word               0
initial_state      0
final_state        0
guessed_states     0
guessed_letters    0
game_state         0
difficulty         0
outcome            0
word_length        0
won                0
dtype: int64
           game_id  word_length
count  7668.000000  7668.000000
mean   3912.241784    13.090767
std    2233.193521     5.716223
min      42.000000     1.000000
25%    1958.750000     8.000000
50%    3923.500000    12.000000
75%    5840.250000    18.000000
max    7769.000000    27.000000
Number of unique words: 184
outcome
win     3834
lose    3834
Name: count, dtype: int64
difficulty
easy      2556
medium    2556
hard      2556
Name: count, dtype: int64


##### Checking the Valid

In [18]:
import pandas as pd

# # Replace this with the path to your Parquet file
# parquet_file_path = 'path/to/your/HangmanData.parquet'

# Read the Parquet file
df = pd.read_parquet(parquet_valid_path)

# Display the first few rows of the DataFrame
df

Unnamed: 0,game_id,word,initial_state,final_state,guessed_states,guessed_letters,game_state,difficulty,outcome,word_length,won
0,1176,publicheartedness,_________________,publicheartedness,"_________________,_________________,_______e__...","f,e,r,a,t,i,z,d,l,n,b,c,p,u,h,s",allMasked,easy,win,17,True
1,1177,publicheartedness,_________________,publicheartedness,"_________________,___l_____________,___l______...","l,j,a,r,t,s,q,u,p,e,c,i,d,h,b,n",allMasked,easy,lose,17,True
2,1178,publicheartedness,_________________,pu_lic__art____ss,"_________________,_________________,_____c____...","j,c,l,s,y,a,k,t,r,i,o,z,p,u,m",allMasked,medium,win,17,False
3,1179,publicheartedness,_________________,p_______a___dn___,"_________________,_________________,p_________...","f,p,d,x,k,w,n,v,a,z",allMasked,medium,lose,17,False
4,1180,publicheartedness,_________________,_________________,"_________________,_________________,__________...","q,w,o,x,v,y",allMasked,hard,win,17,False
...,...,...,...,...,...,...,...,...,...,...,...
1903,1843,hypogastrium,hypoga_trium,hypogastrium,hypoga_trium,s,nearEnd,easy,lose,12,True
1904,1844,hypogastrium,hypoga_trium,hypogastrium,hypoga_trium,s,nearEnd,medium,win,12,True
1905,1845,hypogastrium,hypoga_trium,hypogastrium,hypoga_trium,s,nearEnd,medium,lose,12,True
1906,1846,hypogastrium,hypoga_trium,hypogastrium,"hypoga_trium,hypoga_trium,hypoga_trium,hypoga_...","b,j,w,s",nearEnd,hard,win,12,True


In [19]:
# Get the total number of rows (games) in the DataFrame
total_games = len(df)

print(f"Total number of games in the dataset: {total_games}")

# Additional checks you might want to perform:
# - Check for any null values or anomalies in the data
print(df.isnull().sum())

# - Get a summary of the DataFrame
print(df.describe())

# - Count the number of unique words or game states
unique_words = df['word'].nunique()
print(f"Number of unique words: {unique_words}")

# - Inspect the distribution of game outcomes, difficulties, etc.
print(df['outcome'].value_counts())
print(df['difficulty'].value_counts())

Total number of games in the dataset: 1908
game_id            0
word               0
initial_state      0
final_state        0
guessed_states     0
guessed_letters    0
game_state         0
difficulty         0
outcome            0
word_length        0
won                0
dtype: int64
           game_id  word_length
count  1908.000000  1908.000000
mean   1010.481132    13.625786
std     560.681541     6.718507
min      42.000000     3.000000
25%     518.750000     9.000000
50%    1019.500000    13.000000
75%    1496.250000    18.000000
max    1973.000000    29.000000
Number of unique words: 46
outcome
win     954
lose    954
Name: count, dtype: int64
difficulty
easy      636
medium    636
hard      636
Name: count, dtype: int64


##### Reading Checking

In [20]:
# Create datasets directly from the saved parquet files
train_dataset = HangmanDataset(parquet_train_path)
valid_dataset = HangmanDataset(parquet_valid_path)

In [21]:
train_dataset[0]

{'game_id': 6006,
 'word': 'overindustrializing',
 'initial_state': ['___________________'],
 'final_state': 'overindustrializing',
 'guessed_states': ['___________________',
  '_________t_________',
  '___r_____tr________',
  '___r_____tr________',
  '___r_____tr________',
  '___r_____tr________',
  '___r_____tr_a______',
  'o__r_____tr_a______',
  'o__r_____tr_a______',
  'o_er_____tr_a______',
  'o_er____str_a______',
  'o_eri___stria_i_i__',
  'o_eri___striali_i__',
  'o_eri___strializi__',
  'o_erin__strializin_',
  'o_erind_strializin_',
  'overind_strializin_',
  'overindustrializin_'],
 'guessed_letters': ['t',
  'r',
  'h',
  'm',
  'q',
  'a',
  'o',
  'f',
  'e',
  's',
  'i',
  'l',
  'z',
  'n',
  'd',
  'v',
  'u',
  'g'],
 'game_state': 'allMasked',
 'difficulty': 'easy',
 'outcome': 'win',
 'word_length': 19,
 'won': False}

In [22]:
train_loader = DataLoader(train_dataset, batch_size=512, 
                          collate_fn=custom_collate_fn, 
                          shuffle=True, 
                          num_workers=15,  # Adjust based on your system
                          prefetch_factor=2)  # Adjust based on your needs
                          
val_loader = DataLoader(valid_dataset, batch_size=512, 
                          collate_fn=custom_collate_fn, 
                          shuffle=True, 
                          num_workers=15,  # Adjust based on your system
                          prefetch_factor=2)  # Adjust based on your needs

In [23]:
for batch in train_dataset:
    pass

In [24]:
len(train_dataset)

7668