##### Imports

In [1]:
import sys
from pathlib import Path
import warnings
from torch.utils.data import Dataset

import warnings
import pandas as pd
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

import pickle
from pathlib import Path
from scr.dataset import *
# from scr.game import *
from scr.feature_engineering import *
# from scr.plot_utils import *
import gc
from scr.utils import print_scenarios

from pathlib import Path
import random
from collections import Counter, defaultdict
import pickle
from tqdm import tqdm
from torch.utils.data import Dataset

from scr.utils import read_words, save_words_to_file

import sys
# Custom library paths
sys.path.extend(['../', './scr'])

from scr.utils import set_seed
from scr.utils import read_words

set_seed(42)

import torch
import torch.nn as nn
from pathlib import Path
import random

torch.set_float32_matmul_precision('medium')

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Read and Shuffle Word List
word_list = read_words('data/words_250000_train.txt') # , limit=10000)
# word_list = read_words('data/250k.txt', limit=10000)

random.shuffle(word_list)

# Calculate Frequencies and Max Word Length
word_frequencies = calculate_word_frequencies(word_list)
char_frequency = calculate_char_frequencies(word_list)
max_word_length = max(len(word) for word in word_list)

##### Data Path

In [2]:
import shutil
from pathlib import Path

NUM_STRATIFIED_SAMPLES = 200 # This will be overwritten by Papermill

# Define the base directory and the paths for training and validation parquet files
base_dataset_dir = Path("/media/sayem/510B93E12554BBD1/dataset/")

stratified_samples_dir = base_dataset_dir / str(NUM_STRATIFIED_SAMPLES)

parquet_train_path = stratified_samples_dir / 'train_parquets'
parquet_valid_path = stratified_samples_dir / 'valid_parquets'

# Function to delete and recreate a directory
def recreate_directory(path):
    if path.exists():
        shutil.rmtree(path)  # Delete the directory and its contents
    path.mkdir(parents=True)  # Create the directory

# Recreate the train and valid directories
recreate_directory(parquet_train_path)
recreate_directory(parquet_valid_path)

print(f"Directories '{parquet_train_path}' and '{parquet_valid_path}' have been recreated.")

Directories '/media/sayem/510B93E12554BBD1/dataset/200/train_parquets' and '/media/sayem/510B93E12554BBD1/dataset/200/valid_parquets' have been recreated.


In [3]:
len(word_list)

227300

##### Testing Data

In [4]:
# Define the total number of words and the number of test samples
from scr.custom_sampler import *
NUM_TEST_SAMPLES = 10_000

# Assuming 'word_list' contains the 250,000 words
# First, separate 10,000 words for the final testing set
testing_words = stratified_sample_by_length_and_uniqueness(
    word_list, 
    NUM_TEST_SAMPLES
)

# Define the file path for saving the testing words
testing_words_file_path = stratified_samples_dir / "testing_words.txt"

# Save the testing words to a file
with open(testing_words_file_path, 'w') as file:
    for word in testing_words:
        file.write(word + '\n')

print(f"Testing words saved in {testing_words_file_path}")

# Now, remove these testing samples from the original word list
remaining_words = [word for word in word_list if word not in testing_words]

Testing words saved in /media/sayem/510B93E12554BBD1/dataset/200/testing_words.txt


##### Stratified Sample Generation

In [5]:
## we are taking starified samples from train_words

from scr.custom_sampler import \
    stratified_sample_by_length_and_frequency, \
        stratified_sample_by_length, stratified_sample_by_length_and_uniqueness

print(f'Number of Statrified samples: {NUM_STRATIFIED_SAMPLES}')

# sampled_words_by_length_and_frequency \
#     = stratified_sample_by_length_and_frequency(train_words, \
#     word_frequencies, \
#     NUM_STRATIFIED_SAMPLES)

sampled_words_by_length = stratified_sample_by_length_and_uniqueness(remaining_words, \
    NUM_STRATIFIED_SAMPLES)

print(len(sampled_words_by_length))

Number of Statrified samples: 200
293


##### Intial State Simulation Testing

In [6]:
from scr.game import *

In [7]:
# word = "mississippi"
word = "mythopoetize"
# word = "cat"

initial_states = process_word_for_six_states(word)

initial_states

{'allMasked': '____________',
 'early': '__________z_',
 'quarterRevealed': '_____p____z_',
 'midRevealed': '_____p_e_ize',
 'midLateRevealed': '_y__opoe_ize',
 'lateRevealed': '_ythopoetize',
 'nearEnd': '_ythopoetize'}

In [8]:
len(initial_states)

7

In [9]:
initial_states

{'allMasked': '____________',
 'early': '__________z_',
 'quarterRevealed': '_____p____z_',
 'midRevealed': '_____p_e_ize',
 'midLateRevealed': '_y__opoe_ize',
 'lateRevealed': '_ythopoetize',
 'nearEnd': '_ythopoetize'}

##### Dataset Generation: Simulation

In [10]:
word = "mississippi"

initial_states = process_word_for_six_states(word)

# print(initial_states)
# Print generated initial states
print("Generated Initial States: ", initial_states)

Generated Initial States:  {'allMasked': '___________', 'early': '_i__i__i__i', 'quarterRevealed': '_i__i__i__i', 'midRevealed': 'mi__i__i__i', 'midLateRevealed': 'mississi__i', 'lateRevealed': 'mississi__i', 'nearEnd': 'mississi__i'}


In [11]:
from scr.game import simulate_game_progress, \
    play_game_with_a_word, process_word

# Example word and initial state
# Example usage
word = "mississippi"
# word = "cat"

initial_states = process_word_for_six_states(word)

# print(initial_states)
# Print generated initial states
print("Generated Initial States:")
for state_name, initial_state in initial_states.items():
    # Simulate the game
    print(initial_state)
    print(f"For initial state: {initial_state}")
    won, game_progress = simulate_game_progress(
        model=None,  # Assuming model is not used in this example
        word=word, 
        initial_state=initial_state, 
        char_frequency={},  # Assuming char_frequency is not used in this example
        max_word_length=len(word), 
        device=None,  # Assuming device is not used in this example
        max_attempts=6, 
        normalize=True,
        difficulty="medium", 
        outcome_preference='win'
    )

    # Display game progress
    for step in game_progress:
        print(f"Guessed: '{step[0]}', New State: '{step[1]}', Correct: {step[2]}")

        # break

    # break

    # print("Game Result:", "Won" if won else "Lost")

Generated Initial States:
___________
For initial state: ___________
Guessed: 'r', New State: '___________', Correct: False
Guessed: 's', New State: '__ss_ss____', Correct: True
Guessed: 'p', New State: '__ss_ss_pp_', Correct: True
Guessed: 'j', New State: '__ss_ss_pp_', Correct: False
Guessed: 'h', New State: '__ss_ss_pp_', Correct: False
Guessed: 'x', New State: '__ss_ss_pp_', Correct: False
Guessed: 'd', New State: '__ss_ss_pp_', Correct: False
Guessed: 'c', New State: '__ss_ss_pp_', Correct: False
__ss_ss____
For initial state: __ss_ss____
Guessed: 'p', New State: '__ss_ss_pp_', Correct: True
Guessed: 'w', New State: '__ss_ss_pp_', Correct: False
Guessed: 'i', New State: '_ississippi', Correct: True
Guessed: 'm', New State: 'mississippi', Correct: True
__ss_ss____
For initial state: __ss_ss____
Guessed: 'd', New State: '__ss_ss____', Correct: False
Guessed: 't', New State: '__ss_ss____', Correct: False
Guessed: 'q', New State: '__ss_ss____', Correct: False
Guessed: 'z', New State: 

##### Writing Parquet

In [12]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Assuming the function 'process_word_for_six_states' is defined elsewhere
from scr.game import simulate_game_progress, process_word_for_six_states

def process_batch_to_parquet(batch, file_path, start_game_counter):
    game_counter = start_game_counter
    data_for_parquet = []

    for game_data in batch:
        word, state_name, initial_state, difficulty, outcome, won, game_progress = game_data
        if not game_progress:
            continue

        final_state = game_progress[-1][1]
        guessed_states = [initial_state] + [state for _, state, _ in game_progress]
        guessed_letters = [letter for letter, _, _ in game_progress]

        data_for_parquet.append({
            'game_id': game_counter,
            'word': word,
            'initial_state': initial_state,
            'final_state': final_state,
            'guessed_states': ','.join(guessed_states[:-1]),
            'guessed_letters': ','.join(guessed_letters),
            'game_state': state_name,
            'difficulty': difficulty,
            'outcome': outcome,
            'word_length': len(word),
            'won': won
        })

        game_counter += 1

    df = pd.DataFrame(data_for_parquet)
    table = pa.Table.from_pandas(df)
    pq.write_to_dataset(table, root_path=file_path, compression='snappy')

    return game_counter

def generate_batch_for_word(word):
    batch = []
    initial_states = process_word_for_six_states(word)
    games_generated = 0  # Counter for games generated

    for state_name, initial_state in initial_states.items():
        for difficulty in ["easy", "medium", "hard"]:
            for outcome in ["win", "lose"]:
                won, game_progress = simulate_game_progress(
                    model=None, word=word, initial_state=initial_state,
                    char_frequency={}, max_word_length=len(word),
                    device=None, max_attempts=6, normalize=True,
                    difficulty=difficulty, outcome_preference=outcome
                )
                batch.append((word, state_name, initial_state, difficulty, outcome, won, game_progress))
                games_generated += 1  # Increment game counter

    return batch, games_generated

# def main_execution(words, parquet_train_path, parquet_valid_path, test_size=0.20):
#     train_game_counter = 0
#     valid_game_counter = 0
#     games_per_word = 6 * 3 * 2  # 6 states, 3 difficulties, 2 outcomes

#     train_words, valid_words = train_test_split(words, test_size=test_size)

#     # Process each word set
#     for word_set, path in [(train_words, parquet_train_path), (valid_words, parquet_valid_path)]:
#         total_words_processed = 0
#         total_games_generated = 0

#         for word in tqdm(word_set, desc="Processing Words"):
#             batch, games_generated = generate_batch_for_word(word)

#             # Update word count and game count
#             total_words_processed += 1
#             total_games_generated += games_generated

#             # Update game counters
#             if word_set is train_words:
#                 train_game_counter += games_generated
#                 process_batch_to_parquet(batch, path, train_game_counter)
#             else:
#                 valid_game_counter += games_generated
#                 process_batch_to_parquet(batch, path, valid_game_counter)

#     print(f"Final Total games processed in training set: {train_game_counter}")
#     print(f"Final Total games processed in validation set: {valid_game_counter}")

# # Execute the main function
# main_execution(sampled_words_by_length, parquet_train_path, parquet_valid_path, test_size=0.20)

def main_execution(words, parquet_path):
    game_counter = 0
    games_per_word = 6 * 3 * 2  # 6 states, 3 difficulties, 2 outcomes

    for word in tqdm(words, desc="Processing Words"):
        batch, games_generated = generate_batch_for_word(word)

        # Update game count
        game_counter += games_generated

        # Process and save batch
        process_batch_to_parquet(batch, parquet_path, game_counter)

    print(f"Total games processed: {game_counter}")

# Execute the main function
main_execution(sampled_words_by_length, parquet_path)

NameError: name 'parquet_path' is not defined

##### Checking Train

In [None]:
import pandas as pd
from pathlib import Path

# Assuming parquet_train_path is already defined as a Path object
# If not, define it here
# parquet_train_path = Path('path_to_your_train_parquet_directory')

# Use glob to find all Parquet files in the folder
parquet_files = parquet_train_path.glob('*.parquet')

# Count the number of files
file_count = sum(1 for _ in parquet_files)

print(f"Number of Parquet files: {file_count}")

Number of Parquet files: 234


In [None]:
import pandas as pd
from pathlib import Path


# Find all Parquet files in the directory
parquet_files = list(parquet_train_path.glob('*.parquet'))

if parquet_files:
    total_game_sequences = 0

    # Iterate over each file and sum the number of game sequences
    for file in parquet_files:
        df = pd.read_parquet(file)
        total_game_sequences += len(df)

    print(f"Total number of game sequences across all files: {total_game_sequences}")
else:
    print("No Parquet files found in the specified directory.")


Total number of game sequences across all files: 9792


In [None]:
import pandas as pd
import glob
from pathlib import Path

# Use glob to find all Parquet files in the folder
parquet_files = parquet_train_path.glob('*.parquet')

# Read and concatenate all Parquet files into a single DataFrame
df = pd.concat([pd.read_parquet(file) for file in parquet_files], ignore_index=True)

# # Display the first few rows of the DataFrame
# print(df.head())

# Get the total number of rows (games) in the DataFrame
total_games = len(df)
print(f"Total number of games in the dataset: {total_games}")

# Additional checks and summary statistics
print("Null values in each column:")
print(df.isnull().sum())

print("\nSummary statistics:")
print(df.describe())

# Count the number of unique words or game states
unique_words = df['word'].nunique()
print(f"\nNumber of unique words: {unique_words}")

# Inspect the distribution of game outcomes, difficulties, etc.
print("\nOutcome distribution:")
print(df['outcome'].value_counts())

print("\nDifficulty distribution:")
print(df['difficulty'].value_counts())


# Word length distribution
print("\nWord Length Distribution:")
print(df['word_length'].value_counts())

Total number of games in the dataset: 9792
Null values in each column:
game_id            0
word               0
initial_state      0
final_state        0
guessed_states     0
guessed_letters    0
game_state         0
difficulty         0
outcome            0
word_length        0
won                0
dtype: int64

Summary statistics:
           game_id  word_length
count  9792.000000  9792.000000
mean   4954.544118    11.449142
std    2837.314543     5.073242
min      42.000000     1.000000
25%    2501.750000     8.000000
50%    4961.500000    10.000000
75%    7409.250000    15.000000
max    9869.000000    29.000000

Number of unique words: 234

Outcome distribution:
outcome
win     4896
lose    4896
Name: count, dtype: int64

Difficulty distribution:
difficulty
easy      3264
medium    3264
hard      3264
Name: count, dtype: int64

Word Length Distribution:
word_length
8     966
9     966
10    966
11    714
7     714
6     714
12    588
14    462
13    462
5     378
15    336
16    3

In [None]:
import pandas as pd

# # Replace this with the path to your Parquet file
# parquet_file_path = 'path/to/your/HangmanData.parquet'

# Read the Parquet file
df = pd.read_parquet(parquet_train_path)

# Display the first few rows of the DataFrame
df

Unnamed: 0,game_id,word,initial_state,final_state,guessed_states,guessed_letters,game_state,difficulty,outcome,word_length,won
0,3696,unsanguinariness,________________,unsanguinariness,"________________,________________,_n__n___n___...","t,n,x,d,i,a,r,v,g,e,u,s",allMasked,easy,win,16,True
1,3697,unsanguinariness,________________,unsanguinariness,"________________,________________,_______i___i...","c,i,r,a,m,n,s,g,v,u,e",allMasked,easy,lose,16,True
2,3698,unsanguinariness,________________,_ns_n__in_riness,"________________,________________,_______i___i...","v,i,n,b,e,p,r,s,d,z,t",allMasked,medium,win,16,False
3,3699,unsanguinariness,________________,unsanguinariness,"________________,________________,_______i___i...","h,i,k,u,e,r,m,w,a,s,n,g",allMasked,medium,lose,16,True
4,3700,unsanguinariness,________________,u_____u_________,"________________,________________,u_____u_____...","f,u,c,j,m,l,k",allMasked,hard,win,16,False
...,...,...,...,...,...,...,...,...,...,...,...
9787,5371,vituperance,vitupera_ce,vituperance,vitupera_ce,n,nearEnd,easy,lose,11,True
9788,5372,vituperance,vitupera_ce,vituperance,vitupera_ce,n,nearEnd,medium,win,11,True
9789,5373,vituperance,vitupera_ce,vituperance,"vitupera_ce,vitupera_ce","d,n",nearEnd,medium,lose,11,True
9790,5374,vituperance,vitupera_ce,vituperance,"vitupera_ce,vitupera_ce","d,n",nearEnd,hard,win,11,True


In [None]:
# Get the total number of rows (games) in the DataFrame
total_games = len(df)

print(f"Total number of games in the dataset: {total_games}")

# Additional checks you might want to perform:
# - Check for any null values or anomalies in the data
print(df.isnull().sum())

# - Get a summary of the DataFrame
print(df.describe())

# - Count the number of unique words or game states
unique_words = df['word'].nunique()
print(f"Number of unique words: {unique_words}")

# - Inspect the distribution of game outcomes, difficulties, etc.
print(df['outcome'].value_counts())
print(df['difficulty'].value_counts())

Total number of games in the dataset: 9792
game_id            0
word               0
initial_state      0
final_state        0
guessed_states     0
guessed_letters    0
game_state         0
difficulty         0
outcome            0
word_length        0
won                0
dtype: int64
           game_id  word_length
count  9792.000000  9792.000000
mean   4954.544118    11.449142
std    2837.314543     5.073242
min      42.000000     1.000000
25%    2501.750000     8.000000
50%    4961.500000    10.000000
75%    7409.250000    15.000000
max    9869.000000    29.000000
Number of unique words: 234
outcome
win     4896
lose    4896
Name: count, dtype: int64
difficulty
easy      3264
medium    3264
hard      3264
Name: count, dtype: int64


##### Checking the Valid

In [None]:
import pandas as pd

# # Replace this with the path to your Parquet file
# parquet_file_path = 'path/to/your/HangmanData.parquet'

# Read the Parquet file
df = pd.read_parquet(parquet_valid_path)

# Display the first few rows of the DataFrame
df

Unnamed: 0,game_id,word,initial_state,final_state,guessed_states,guessed_letters,game_state,difficulty,outcome,word_length,won
0,1764,pyrenocarp,__________,pyrenocarp,"__________,____n_____,____n_____,____n_____,__...","n,h,u,c,o,a,p,r,e,s,x,y",allMasked,easy,win,10,True
1,1765,pyrenocarp,__________,pyrenocarp,"__________,__________,p________p,py_______p,py...","q,p,y,a,n,e,r,c,o",allMasked,easy,lose,10,True
2,1766,pyrenocarp,__________,_y__noca__,"__________,__________,__________,_______a__,_y...","b,z,a,y,k,m,t,n,c,o,j",allMasked,medium,win,10,False
3,1767,pyrenocarp,__________,pyrenocarp,"__________,____n_____,____n_____,__r_n___r_,__...","n,l,r,w,o,p,c,a,y,q,x,e",allMasked,medium,lose,10,True
4,1768,pyrenocarp,__________,____n_____,"__________,__________,__________,__________,__...","h,i,u,g,n,z,w",allMasked,hard,win,10,False
...,...,...,...,...,...,...,...,...,...,...,...
2473,2473,desmodont,des_odont,desmodont,des_odont,m,nearEnd,easy,lose,9,True
2474,2474,desmodont,des_odont,desmodont,des_odont,m,nearEnd,medium,win,9,True
2475,2475,desmodont,des_odont,desmodont,des_odont,m,nearEnd,medium,lose,9,True
2476,2476,desmodont,des_odont,desmodont,"des_odont,des_odont,des_odont,des_odont,des_odont","j,f,r,l,m",nearEnd,hard,win,9,True


In [None]:
# Get the total number of rows (games) in the DataFrame
total_games = len(df)

print(f"Total number of games in the dataset: {total_games}")

# Additional checks you might want to perform:
# - Check for any null values or anomalies in the data
print(df.isnull().sum())

# - Get a summary of the DataFrame
print(df.describe())

# - Count the number of unique words or game states
unique_words = df['word'].nunique()
print(f"Number of unique words: {unique_words}")

# - Inspect the distribution of game outcomes, difficulties, etc.
print(df['outcome'].value_counts())
print(df['difficulty'].value_counts())

Total number of games in the dataset: 2478
game_id            0
word               0
initial_state      0
final_state        0
guessed_states     0
guessed_letters    0
game_state         0
difficulty         0
outcome            0
word_length        0
won                0
dtype: int64
           game_id  word_length
count  2478.000000  2478.000000
mean   1280.500000    11.508475
std     715.481307     5.054540
min      42.000000     3.000000
25%     661.250000     8.000000
50%    1280.500000    11.000000
75%    1899.750000    14.000000
max    2519.000000    25.000000
Number of unique words: 59
outcome
win     1239
lose    1239
Name: count, dtype: int64
difficulty
easy      826
medium    826
hard      826
Name: count, dtype: int64


##### Reading Checking

In [None]:
# Create datasets directly from the saved parquet files
train_dataset = HangmanDataset(parquet_train_path)
valid_dataset = HangmanDataset(parquet_valid_path)

In [None]:
train_dataset[0]

{'game_id': 3696,
 'word': 'unsanguinariness',
 'initial_state': ['________________'],
 'final_state': 'unsanguinariness',
 'guessed_states': ['________________',
  '________________',
  '_n__n___n___n___',
  '_n__n___n___n___',
  '_n__n___n___n___',
  '_n__n__in__in___',
  '_n_an__ina_in___',
  '_n_an__inarin___',
  '_n_an__inarin___',
  '_n_ang_inarin___',
  '_n_ang_inarine__',
  'un_anguinarine__'],
 'guessed_letters': ['t',
  'n',
  'x',
  'd',
  'i',
  'a',
  'r',
  'v',
  'g',
  'e',
  'u',
  's'],
 'game_state': 'allMasked',
 'difficulty': 'easy',
 'outcome': 'win',
 'word_length': 16,
 'won': False}

In [None]:
import os 

train_loader = DataLoader(train_dataset, batch_size=128, 
                          collate_fn=custom_collate_fn, 
                          shuffle=True, 
                          num_workers=os.cpu_count() or 1,  # Adjust based on your system
                          prefetch_factor=2)  # Adjust based on your needs
                          
val_loader = DataLoader(valid_dataset, batch_size=512, 
                          collate_fn=custom_collate_fn, 
                          shuffle=False, 
                          num_workers=os.cpu_count() or 1,  # Adjust based on your system
                          prefetch_factor=2)  # Adjust based on your needs

In [None]:
for batch in train_loader:
    pass