##### Imports

In [1]:
import sys
from pathlib import Path
import warnings
from torch.utils.data import Dataset

import warnings
import pandas as pd
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

import pickle
from pathlib import Path
from scr.dataset import *
# from scr.game import *
from scr.feature_engineering import *
# from scr.plot_utils import *
import gc
from scr.utils import print_scenarios

from pathlib import Path
import random
from collections import Counter, defaultdict
import pickle
from tqdm import tqdm
from torch.utils.data import Dataset

from scr.utils import read_words, save_words_to_file

import sys
# Custom library paths
sys.path.extend(['../', './scr'])

from scr.utils import set_seed
from scr.utils import read_words

set_seed(42)

import torch
import torch.nn as nn
from pathlib import Path
import random

torch.set_float32_matmul_precision('medium')

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Read and Shuffle Word List
word_list = read_words('data/words_250000_train.txt') # , limit=10000)
# word_list = read_words('data/250k.txt', limit=10000)

random.shuffle(word_list)

# Calculate Frequencies and Max Word Length
word_frequencies = calculate_word_frequencies(word_list)
char_frequency = calculate_char_frequencies(word_list)
max_word_length = max(len(word) for word in word_list)

##### Data Path

In [2]:
import shutil
from pathlib import Path

NUM_STRATIFIED_SAMPLES = 25_000 # This will be overwritten by Papermill

# Define the base directory and the paths for training and validation parquet files
base_dataset_dir = Path("/media/sayem/510B93E12554BBD1/dataset/")

stratified_samples_dir = base_dataset_dir / str(NUM_STRATIFIED_SAMPLES)

parquet_train_path = stratified_samples_dir / 'train_parquets'
parquet_valid_path = stratified_samples_dir / 'valid_parquets'

# Function to delete and recreate a directory
def recreate_directory(path):
    if path.exists():
        shutil.rmtree(path)  # Delete the directory and its contents
    path.mkdir(parents=True)  # Create the directory

# Recreate the train and valid directories
recreate_directory(parquet_train_path)
recreate_directory(parquet_valid_path)

print(f"Directories '{parquet_train_path}' and '{parquet_valid_path}' have been recreated.")

Directories '/media/sayem/510B93E12554BBD1/dataset/2500/train_parquets' and '/media/sayem/510B93E12554BBD1/dataset/2500/valid_parquets' have been recreated.


In [3]:
len(word_list)

227300

##### Testing Data

In [4]:
# Define the total number of words and the number of test samples
from scr.custom_sampler import *
NUM_TEST_SAMPLES = 10_000

# Assuming 'word_list' contains the 250,000 words
# First, separate 10,000 words for the final testing set
testing_words = stratified_sample_by_length_and_uniqueness(
    word_list, 
    NUM_TEST_SAMPLES
)

# Define the file path for saving the testing words
testing_words_file_path = stratified_samples_dir / "testing_words.txt"

# Save the testing words to a file
with open(testing_words_file_path, 'w') as file:
    for word in testing_words:
        file.write(word + '\n')

print(f"Testing words saved in {testing_words_file_path}")

# Now, remove these testing samples from the original word list
remaining_words = [word for word in word_list if word not in testing_words]

Testing words saved in /media/sayem/510B93E12554BBD1/dataset/2500/testing_words.txt


##### Stratified Sample Generation

In [5]:
## we are taking starified samples from train_words

from scr.custom_sampler import \
    stratified_sample_by_length_and_frequency, \
        stratified_sample_by_length, stratified_sample_by_length_and_uniqueness

print(f'Number of Statrified samples: {NUM_STRATIFIED_SAMPLES}')

# sampled_words_by_length_and_frequency \
#     = stratified_sample_by_length_and_frequency(train_words, \
#     word_frequencies, \
#     NUM_STRATIFIED_SAMPLES)

sampled_words_by_length = stratified_sample_by_length_and_uniqueness(remaining_words, \
    NUM_STRATIFIED_SAMPLES)

print(len(sampled_words_by_length))

Number of Statrified samples: 2500
2554


##### Intial State Simulation Testing

In [6]:
from scr.game import *

In [7]:
# word = "mississippi"
word = "mythopoetize"
# word = "cat"

initial_states = process_word_for_six_states(word)

initial_states

{'allMasked': '____________',
 'early': '__________z_',
 'quarterRevealed': 'm_________z_',
 'midRevealed': 'm______e_ize',
 'midLateRevealed': 'm_th___etize',
 'lateRevealed': 'm_thopoetize',
 'nearEnd': 'm_thopoetize'}

In [8]:
len(initial_states)

7

In [9]:
initial_states

{'allMasked': '____________',
 'early': '__________z_',
 'quarterRevealed': 'm_________z_',
 'midRevealed': 'm______e_ize',
 'midLateRevealed': 'm_th___etize',
 'lateRevealed': 'm_thopoetize',
 'nearEnd': 'm_thopoetize'}

##### Dataset Generation: Simulation

In [10]:
word = "mississippi"

initial_states = process_word_for_six_states(word)

# print(initial_states)
# Print generated initial states
print("Generated Initial States: ", initial_states)

Generated Initial States:  {'allMasked': '___________', 'early': 'm__________', 'quarterRevealed': 'm__________', 'midRevealed': 'm_ss_ss____', 'midLateRevealed': 'm_ss_ss_pp_', 'lateRevealed': 'm_ss_ss_pp_', 'nearEnd': 'm_ss_ss_pp_'}


In [11]:
from scr.game import simulate_game_progress, \
    play_game_with_a_word, process_word

# Example word and initial state
# Example usage
word = "mississippi"
# word = "cat"

initial_states = process_word_for_six_states(word)

# print(initial_states)
# Print generated initial states
print("Generated Initial States:")
for state_name, initial_state in initial_states.items():
    # Simulate the game
    print(initial_state)
    print(f"For initial state: {initial_state}")
    won, game_progress = simulate_game_progress(
        model=None,  # Assuming model is not used in this example
        word=word, 
        initial_state=initial_state, 
        char_frequency={},  # Assuming char_frequency is not used in this example
        max_word_length=len(word), 
        device=None,  # Assuming device is not used in this example
        max_attempts=6, 
        normalize=True,
        difficulty="medium", 
        outcome_preference='win'
    )

    # Display game progress
    for step in game_progress:
        print(f"Guessed: '{step[0]}', New State: '{step[1]}', Correct: {step[2]}")

        # break

    # break

    # print("Game Result:", "Won" if won else "Lost")

Generated Initial States:
___________
For initial state: ___________
Guessed: 'm', New State: 'm__________', Correct: True
Guessed: 's', New State: 'm_ss_ss____', Correct: True
Guessed: 'i', New State: 'mississi__i', Correct: True
Guessed: 'p', New State: 'mississippi', Correct: True
__ss_ss____
For initial state: __ss_ss____
Guessed: 'q', New State: '__ss_ss____', Correct: False
Guessed: 'p', New State: '__ss_ss_pp_', Correct: True
Guessed: 'i', New State: '_ississippi', Correct: True
Guessed: 'b', New State: '_ississippi', Correct: False
Guessed: 'y', New State: '_ississippi', Correct: False
Guessed: 't', New State: '_ississippi', Correct: False
Guessed: 'm', New State: 'mississippi', Correct: True
__ss_ss____
For initial state: __ss_ss____
Guessed: 'i', New State: '_ississi__i', Correct: True
Guessed: 'v', New State: '_ississi__i', Correct: False
Guessed: 'b', New State: '_ississi__i', Correct: False
Guessed: 't', New State: '_ississi__i', Correct: False
Guessed: 'k', New State: '_i

##### Writing Parquet

In [12]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Assuming the function 'process_word_for_six_states' is defined elsewhere
from scr.game import simulate_game_progress, process_word_for_six_states

def process_batch_to_parquet(batch, file_path, start_game_counter):
    game_counter = start_game_counter
    data_for_parquet = []

    for game_data in batch:
        word, state_name, initial_state, difficulty, outcome, won, game_progress = game_data
        if not game_progress:
            continue

        final_state = game_progress[-1][1]
        guessed_states = [initial_state] + [state for _, state, _ in game_progress]
        guessed_letters = [letter for letter, _, _ in game_progress]

        data_for_parquet.append({
            'game_id': game_counter,
            'word': word,
            'initial_state': initial_state,
            'final_state': final_state,
            'guessed_states': ','.join(guessed_states[:-1]),
            'guessed_letters': ','.join(guessed_letters),
            'game_state': state_name,
            'difficulty': difficulty,
            'outcome': outcome,
            'word_length': len(word),
            'won': won
        })

        game_counter += 1

    df = pd.DataFrame(data_for_parquet)
    table = pa.Table.from_pandas(df)
    pq.write_to_dataset(table, root_path=file_path, compression='snappy')

    return game_counter

def generate_batch_for_word(word):
    batch = []
    initial_states = process_word_for_six_states(word)
    games_generated = 0  # Counter for games generated

    for state_name, initial_state in initial_states.items():
        for difficulty in ["easy", "medium", "hard"]:
            for outcome in ["win", "lose"]:
                won, game_progress = simulate_game_progress(
                    model=None, word=word, initial_state=initial_state,
                    char_frequency={}, max_word_length=len(word),
                    device=None, max_attempts=6, normalize=True,
                    difficulty=difficulty, outcome_preference=outcome
                )
                batch.append((word, state_name, initial_state, difficulty, outcome, won, game_progress))
                games_generated += 1  # Increment game counter

    return batch, games_generated


def main_execution(words, parquet_train_path, parquet_valid_path, test_size=0.20):
    train_game_counter = 0
    valid_game_counter = 0
    games_per_word = 6 * 3 * 2  # 6 states, 3 difficulties, 2 outcomes

    train_words, valid_words = train_test_split(words, test_size=test_size)

    # Process each word set
    for word_set, path in [(train_words, parquet_train_path), (valid_words, parquet_valid_path)]:
        total_words_processed = 0
        total_games_generated = 0

        for word in tqdm(word_set, desc="Processing Words"):
            batch, games_generated = generate_batch_for_word(word)

            # Update word count and game count
            total_words_processed += 1
            total_games_generated += games_generated

            # Update game counters
            if word_set is train_words:
                train_game_counter += games_generated
                process_batch_to_parquet(batch, path, train_game_counter)
            else:
                valid_game_counter += games_generated
                process_batch_to_parquet(batch, path, valid_game_counter)

    print(f"Final Total games processed in training set: {train_game_counter}")
    print(f"Final Total games processed in validation set: {valid_game_counter}")

# Execute the main function
main_execution(sampled_words_by_length, parquet_train_path, parquet_valid_path, test_size=0.20)


Processing Words: 100%|██████████| 2043/2043 [00:06<00:00, 292.80it/s]
Processing Words: 100%|██████████| 511/511 [00:01<00:00, 311.82it/s]

Final Total games processed in training set: 85806
Final Total games processed in validation set: 21462





##### Checking Train

In [13]:
import pandas as pd
from pathlib import Path

# Assuming parquet_train_path is already defined as a Path object
# If not, define it here
# parquet_train_path = Path('path_to_your_train_parquet_directory')

# Use glob to find all Parquet files in the folder
parquet_files = parquet_train_path.glob('*.parquet')

# Count the number of files
file_count = sum(1 for _ in parquet_files)

print(f"Number of Parquet files: {file_count}")

Number of Parquet files: 2043


In [14]:
import pandas as pd
from pathlib import Path


# Find all Parquet files in the directory
parquet_files = list(parquet_train_path.glob('*.parquet'))

if parquet_files:
    total_game_sequences = 0

    # Iterate over each file and sum the number of game sequences
    for file in parquet_files:
        df = pd.read_parquet(file)
        total_game_sequences += len(df)

    print(f"Total number of game sequences across all files: {total_game_sequences}")
else:
    print("No Parquet files found in the specified directory.")


Total number of game sequences across all files: 85782


In [15]:
import pandas as pd
import glob
from pathlib import Path

# Use glob to find all Parquet files in the folder
parquet_files = parquet_train_path.glob('*.parquet')

# Read and concatenate all Parquet files into a single DataFrame
df = pd.concat([pd.read_parquet(file) for file in parquet_files], ignore_index=True)

# # Display the first few rows of the DataFrame
# print(df.head())

# Get the total number of rows (games) in the DataFrame
total_games = len(df)
print(f"Total number of games in the dataset: {total_games}")

# Additional checks and summary statistics
print("Null values in each column:")
print(df.isnull().sum())

print("\nSummary statistics:")
print(df.describe())

# Count the number of unique words or game states
unique_words = df['word'].nunique()
print(f"\nNumber of unique words: {unique_words}")

# Inspect the distribution of game outcomes, difficulties, etc.
print("\nOutcome distribution:")
print(df['outcome'].value_counts())

print("\nDifficulty distribution:")
print(df['difficulty'].value_counts())


Total number of games in the dataset: 85782
Null values in each column:
game_id            0
word               0
initial_state      0
final_state        0
guessed_states     0
guessed_letters    0
game_state         0
difficulty         0
outcome            0
word_length        0
won                0
dtype: int64

Summary statistics:
            game_id   word_length
count  85782.000000  85782.000000
mean   42946.299538      9.549206
std    24770.523971      3.281541
min       42.000000      2.000000
25%    21499.250000      7.000000
50%    42944.500000      9.000000
75%    64401.750000     11.000000
max    85847.000000     25.000000

Number of unique words: 2043

Outcome distribution:
outcome
win     42891
lose    42891
Name: count, dtype: int64

Difficulty distribution:
difficulty
easy      28594
medium    28594
hard      28594
Name: count, dtype: int64


In [16]:
import pandas as pd

# # Replace this with the path to your Parquet file
# parquet_file_path = 'path/to/your/HangmanData.parquet'

# Read the Parquet file
df = pd.read_parquet(parquet_train_path)

# Display the first few rows of the DataFrame
df

Unnamed: 0,game_id,word,initial_state,final_state,guessed_states,guessed_letters,game_state,difficulty,outcome,word_length,won
0,25116,amiability,__________,amiability,"__________,______l___,____b_l___,a__ab_l___,a_...","l,b,a,i,x,y,m,d,s,t",allMasked,easy,win,10,True
1,25117,amiability,__________,amiability,"__________,_m________,_mi__i_i__,_mi_bi_i__,am...","m,i,b,a,l,t,y",allMasked,easy,lose,10,True
2,25118,amiability,__________,__i__i_i__,"__________,__________,__i__i_i__,__i__i_i__,__...","u,i,z,n,g,e,q",allMasked,medium,win,10,False
3,25119,amiability,__________,a__a_____y,"__________,__________,__________,__________,__...","p,f,k,s,y,a,v,d",allMasked,medium,lose,10,False
4,25120,amiability,__________,__________,"__________,__________,__________,__________,__...","j,n,s,c,q,z",allMasked,hard,win,10,False
...,...,...,...,...,...,...,...,...,...,...,...
85777,4279,cheapjohn,_heapjohn,cheapjohn,_heapjohn,c,nearEnd,easy,lose,9,True
85778,4280,cheapjohn,_heapjohn,cheapjohn,_heapjohn,c,nearEnd,medium,win,9,True
85779,4281,cheapjohn,_heapjohn,cheapjohn,"_heapjohn,_heapjohn,_heapjohn,_heapjohn,_heapj...","q,f,k,b,d,c",nearEnd,medium,lose,9,True
85780,4282,cheapjohn,_heapjohn,cheapjohn,"_heapjohn,_heapjohn,_heapjohn","k,l,c",nearEnd,hard,win,9,True


In [17]:
# Get the total number of rows (games) in the DataFrame
total_games = len(df)

print(f"Total number of games in the dataset: {total_games}")

# Additional checks you might want to perform:
# - Check for any null values or anomalies in the data
print(df.isnull().sum())

# - Get a summary of the DataFrame
print(df.describe())

# - Count the number of unique words or game states
unique_words = df['word'].nunique()
print(f"Number of unique words: {unique_words}")

# - Inspect the distribution of game outcomes, difficulties, etc.
print(df['outcome'].value_counts())
print(df['difficulty'].value_counts())

Total number of games in the dataset: 85782
game_id            0
word               0
initial_state      0
final_state        0
guessed_states     0
guessed_letters    0
game_state         0
difficulty         0
outcome            0
word_length        0
won                0
dtype: int64
            game_id   word_length
count  85782.000000  85782.000000
mean   42946.299538      9.549206
std    24770.523971      3.281541
min       42.000000      2.000000
25%    21499.250000      7.000000
50%    42944.500000      9.000000
75%    64401.750000     11.000000
max    85847.000000     25.000000
Number of unique words: 2043
outcome
win     42891
lose    42891
Name: count, dtype: int64
difficulty
easy      28594
medium    28594
hard      28594
Name: count, dtype: int64


##### Checking the Valid

In [18]:
import pandas as pd

# # Replace this with the path to your Parquet file
# parquet_file_path = 'path/to/your/HangmanData.parquet'

# Read the Parquet file
df = pd.read_parquet(parquet_valid_path)

# Display the first few rows of the DataFrame
df

Unnamed: 0,game_id,word,initial_state,final_state,guessed_states,guessed_letters,game_state,difficulty,outcome,word_length,won
0,5040,encup,_____,encup,"_____,__c__,e_c__,e_cu_,encu_","c,e,u,n,p",allMasked,easy,win,5,True
1,5041,encup,_____,encup,"_____,__c__,e_c__,e_c_p,e_cup,e_cup,e_cup","c,e,p,u,y,d,n",allMasked,easy,lose,5,True
2,5042,encup,_____,encup,"_____,_____,___u_,___up,__cup,__cup,e_cup,e_cup","b,u,p,c,q,e,k,n",allMasked,medium,win,5,True
3,5043,encup,_____,encup,"_____,_n___,_n__p,_n__p,_n__p,en__p,en_up","n,p,b,w,e,u,c",allMasked,medium,lose,5,True
4,5044,encup,_____,___u_,"_____,___u_,___u_,___u_,___u_,___u_,___u_","u,s,l,i,f,w,d",allMasked,hard,win,5,False
...,...,...,...,...,...,...,...,...,...,...,...
21445,14401,unpetticoated,unpettico_ted,unpetticoated,unpettico_ted,a,nearEnd,easy,lose,13,True
21446,14402,unpetticoated,unpettico_ted,unpetticoated,unpettico_ted,a,nearEnd,medium,win,13,True
21447,14403,unpetticoated,unpettico_ted,unpetticoated,"unpettico_ted,unpettico_ted,unpettico_ted,unpe...","k,z,s,y,a",nearEnd,medium,lose,13,True
21448,14404,unpetticoated,unpettico_ted,unpetticoated,"unpettico_ted,unpettico_ted","v,a",nearEnd,hard,win,13,True


In [19]:
# Get the total number of rows (games) in the DataFrame
total_games = len(df)

print(f"Total number of games in the dataset: {total_games}")

# Additional checks you might want to perform:
# - Check for any null values or anomalies in the data
print(df.isnull().sum())

# - Get a summary of the DataFrame
print(df.describe())

# - Count the number of unique words or game states
unique_words = df['word'].nunique()
print(f"Number of unique words: {unique_words}")

# - Inspect the distribution of game outcomes, difficulties, etc.
print(df['outcome'].value_counts())
print(df['difficulty'].value_counts())

Total number of games in the dataset: 21450
game_id            0
word               0
initial_state      0
final_state        0
guessed_states     0
guessed_letters    0
game_state         0
difficulty         0
outcome            0
word_length        0
won                0
dtype: int64
            game_id   word_length
count  21450.000000  21450.000000
mean   10776.533007      9.372587
std     6195.075458      3.168436
min       42.000000      1.000000
25%     5416.250000      7.000000
50%    10778.500000      9.000000
75%    16140.750000     11.000000
max    21503.000000     29.000000
Number of unique words: 511
outcome
win     10725
lose    10725
Name: count, dtype: int64
difficulty
easy      7150
medium    7150
hard      7150
Name: count, dtype: int64


##### Reading Checking

In [20]:
# Create datasets directly from the saved parquet files
train_dataset = HangmanDataset(parquet_train_path)
valid_dataset = HangmanDataset(parquet_valid_path)

In [21]:
train_dataset[0]

{'game_id': 25116,
 'word': 'amiability',
 'initial_state': ['__________'],
 'final_state': 'amiability',
 'guessed_states': ['__________',
  '______l___',
  '____b_l___',
  'a__ab_l___',
  'a_iabili__',
  'a_iabili__',
  'a_iabili_y',
  'amiabili_y',
  'amiabili_y',
  'amiabili_y'],
 'guessed_letters': ['l', 'b', 'a', 'i', 'x', 'y', 'm', 'd', 's', 't'],
 'game_state': 'allMasked',
 'difficulty': 'easy',
 'outcome': 'win',
 'word_length': 10,
 'won': False}

In [22]:
train_loader = DataLoader(train_dataset, batch_size=512, 
                          collate_fn=custom_collate_fn, 
                          shuffle=True, 
                          num_workers=2,  # Adjust based on your system
                          prefetch_factor=2)  # Adjust based on your needs
                          
val_loader = DataLoader(valid_dataset, batch_size=512, 
                          collate_fn=custom_collate_fn, 
                          shuffle=True, 
                          num_workers=15,  # Adjust based on your system
                          prefetch_factor=2)  # Adjust based on your needs