##### Imports

In [1]:
import sys
from pathlib import Path
import warnings

import warnings
import pandas as pd
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

import sys
# Custom library paths
sys.path.extend(['../', './scr'])

from scr.utils import set_seed
from scr.utils import read_words
from pathlib import Path
import random
from collections import Counter, defaultdict
import pickle
from tqdm import tqdm
from torch.utils.data import Dataset

from scr.utils import read_words, save_words_to_file

import pickle
from pathlib import Path
from scr.dataset import *
from scr.utils import *
# # For inference
from scr.feature_engineering import *

import gc

set_seed(42)

import torch
import torch.nn as nn
from pathlib import Path
import random

from scr.utils import print_scenarios
torch.set_float32_matmul_precision('medium')
from pathlib import Path

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Read and Shuffle Word List
word_list = read_words('data/words_250000_train.txt') # , limit=10000)
# word_list = read_words('data/250k.txt', limit=10000)

random.shuffle(word_list)

# Calculate Frequencies and Max Word Length
word_frequencies = calculate_word_frequencies(word_list)
char_frequency = calculate_char_frequencies(word_list)
max_word_length = max(len(word) for word in word_list)

##### Data Dir

In [2]:
NUM_STRATIFIED_SAMPLES = 50_000 # This will be overwritten by Papermill

NUM_WORD_SAMPLE = 1_000 # words for testing

FAST_DEV_RUN = False

MAX_EPOCH = 250

In [3]:
from pathlib import Path
from scr.custom_sampler import *

# Define the base directory and the paths for training and validation parquet files
base_dataset_dir = Path("/media/sayem/510B93E12554BBD1/dataset/")
stratified_samples_dir = base_dataset_dir / str(NUM_STRATIFIED_SAMPLES)
parquet_path = stratified_samples_dir / 'parquets'

# Create directories for train and validation parquets if they don't exist
parquet_path.mkdir(parents=True, exist_ok=True)
# parquet_valid_path.mkdir(parents=True, exist_ok=True)

# Define and create the directory for models
models_dir = Path("/home/sayem/Desktop/Hangman/models")
models_dir.mkdir(parents=True, exist_ok=True)

# Define your output directory
# Define your output directory and logger directory
output_dir = Path("/home/sayem/Desktop/Hangman/training_outputs")
logger_dir = output_dir / "lightning_logs"

# Create the output and logger directories if they don't exist
output_dir.mkdir(parents=True, exist_ok=True)
logger_dir.mkdir(parents=True, exist_ok=True)

# Define the file path for saving the testing words
testing_words_file_path = stratified_samples_dir / "testing_words.txt"

try:
    testing_word_list = read_words(testing_words_file_path)
    print(f"Length of the testing word list: {len(testing_word_list)}")
    sampled_test_words = stratified_sample_by_length_and_uniqueness(testing_word_list, NUM_WORD_SAMPLE)
    print(f"Sampled {len(sampled_test_words)} unique words for testing.")
except FileNotFoundError:
    print(f"File not found: {testing_words_file_path}")

print(len(sampled_test_words))

Length of the testing word list: 10048
Sampled 1085 unique words for testing.
1085


##### Dataset Loading and train test split

In [4]:
parquet_path

PosixPath('/media/sayem/510B93E12554BBD1/dataset/50000/parquets')

In [5]:
# Create datasets directly from the saved parquet files
hangman_dataset = HangmanDataset(parquet_path)
# valid_dataset = HangmanDataset(parquet_valid_path)

from scr.utils import *

# Split the dataset into training and validation sets
train_dataset, valid_dataset = hangman_dataset.split(test_size=0.2)

In [6]:
assert len(train_dataset) > len(valid_dataset)

In [7]:
# Access the first element in the dataset
first_element = hangman_dataset[0]  # Using an integer index
print(first_element)

{'game_id': 112728, 'word': 'midseason', 'initial_state': ['_________'], 'final_state': 'midseason', 'guessed_states': ['_________', '_________', '__d______', '__d_____n', '__d_e___n', 'm_d_e___n', 'm_dse_s_n', 'm_dseas_n', 'm_dseas_n', 'midseas_n', 'midseason'], 'guessed_letters': ['g', 'd', 'n', 'e', 'm', 's', 'a', 'v', 'i', 'o'], 'game_state': 'allMasked', 'difficulty': 'easy', 'outcome': 'win', 'word_length': 9, 'won': True}


In [8]:
# Access the first row of the first file (assuming at least one row exists)
tuple_element = hangman_dataset[(0, 0)]  # Using a tuple (file index, row index)
print(tuple_element)

{'game_id': 112728, 'word': 'midseason', 'initial_state': ['_________'], 'final_state': 'midseason', 'guessed_states': ['_________', '_________', '__d______', '__d_____n', '__d_e___n', 'm_d_e___n', 'm_dse_s_n', 'm_dseas_n', 'm_dseas_n', 'midseas_n', 'midseason'], 'guessed_letters': ['g', 'd', 'n', 'e', 'm', 's', 'a', 'v', 'i', 'o'], 'game_state': 'allMasked', 'difficulty': 'easy', 'outcome': 'win', 'word_length': 9, 'won': True}


In [9]:
import numpy as np

# Set the random seed for reproducible results
np.random.seed(42)

# Generate sequence lengths from 1 to 20
sequence_lengths = np.arange(1, 21)  # 1 to 20 inclusive

# Generate random miss penalties between 0.05 and 0.3 for each sequence length
miss_penalties = np.round(np.random.uniform(0.05, 0.3, len(sequence_lengths)), 2)

# Combine sequence lengths and miss penalties into a dictionary
performance_metrics = dict(zip(sequence_lengths, miss_penalties))

# Display the generated performance metrics
performance_metrics

{1: 0.14,
 2: 0.29,
 3: 0.23,
 4: 0.2,
 5: 0.09,
 6: 0.09,
 7: 0.06,
 8: 0.27,
 9: 0.2,
 10: 0.23,
 11: 0.06,
 12: 0.29,
 13: 0.26,
 14: 0.1,
 15: 0.1,
 16: 0.1,
 17: 0.13,
 18: 0.18,
 19: 0.16,
 20: 0.12}

In [10]:
composite_score = \
    {13: 1.0144969265237789, 14: 1.0139399278380974, 12: 1.0145889222021265, 11: 1.0149790846522682, \
        15: 1.0137005413389382, 10: 1.0149863660317038, 8: 0.9827807551208211, 7: 0.9876452759289711, \
            9: 1.014814988760513, 5: 1.0138412270631227, 4: 1.013670403951156, 2: 1.0134486201164408, \
                3: 0.01386993304338964, 6: 1.0140813705881533, 17: 1.0140782178690035, 16: 1.0145310966095122, \
                    18: 1.0138246885811288, 19: 1.0127181950956583, 21: 1.012694844044745, 20: 1.01259642560035}

In [11]:
# # Specify the batch size for the sampler
# batch_size = 3

# # Initialize the PerformanceBasedSampler
# sampler = PerformanceBasedSampler(dataset=hangman_dataset, \
#     performance_metrics=composite_score, batch_size=batch_size)

In [12]:
# next(iter(sampler))

In [13]:
# from torch.utils.data import DataLoader

# # Specify the batch size for the sampler
# batch_size = 128

# # Initialize the PerformanceBasedSampler
# sampler = PerformanceBasedSampler(dataset=hangman_dataset, \
#     performance_metrics=performance_metrics, batch_size=batch_size)

# # Initialize DataLoader with the HangmanDataset and PerformanceBasedSampler
# data_loader = DataLoader(
#     dataset=hangman_dataset,
#     batch_sampler=sampler,  # Correct usage for custom batch handling
#     collate_fn=new_custom_collate_fn,  # Custom collate function, if needed
# )

# import torch  # Ensure torch is imported

# for batch in tqdm(data_loader):
#     states = batch['guessed_states']
#     guesses = batch['guessed_letters']
#     max_seq_length = batch['max_seq_len']
#     original_seq_lengths = batch['original_seq_lengths']


#     batch_features, batch_missed_chars = process_batch_of_games(
#                 states, guesses, char_frequency,
#                     max_word_length,
#                     max_seq_length)

#     print(f"{batch_features.shape}")

#     break


In [14]:
# hangman_dataset[(0, 34)]

In [15]:
from scr.data_module import *

# Initialize Data Module
initial_batch_size = 1024 # Set your initial batch size

# Initialize Data Module with the required arguments
data_module = HangmanDataModule(train_dataset, valid_dataset, 
                                initial_batch_size, 
                                new_custom_collate_fn)

In [16]:
# performance_metrics_dict = {k: v for d in combined_eval_metrics for k, v in d.items()}

# # # # # print("Converted Performance Metrics Dictb ionary:", performance_metrics_dict)

# data_module.update_performance_metrics(composite_score)

In [17]:
from tqdm import tqdm

# Assuming data_module is an instance of HangmanDataModule
data_loader = data_module.train_dataloader()  # Call the method to get the DataLoader instance

# Initialize a list to store the diversity of sequence lengths in each batch
batch_seq_length_diversity = []

for batch in tqdm(data_loader):
    # Extract original_seq_lengths from the batch
    original_seq_lengths = batch['original_seq_lengths']

    # Analyze the diversity of sequence lengths in the current batch
    unique_seq_lengths = set(original_seq_lengths)  # Use a set to find unique sequence lengths
    batch_seq_length_diversity.append(len(unique_seq_lengths))  # Store the count of unique lengths

    print(f"First state in batch: {batch['guessed_states'][0]}")
    # print(f"Original sequence lengths in batch: {original_seq_lengths}")
    print(f"Unique sequence lengths in batch: {unique_seq_lengths}")
    print(f"Diversity (number of unique sequence lengths) in batch: {len(unique_seq_lengths)}")

    # break  # Remove this break to analyze all batches

    print()

# After collecting data, analyze the overall diversity
print(f"Average diversity of sequence lengths across batches: {sum(batch_seq_length_diversity) / len(batch_seq_length_diversity)}")


  0%|          | 0/1641 [00:00<?, ?it/s]

  0%|          | 2/1641 [00:03<44:08,  1.62s/it]  

First state in batch: ['d____________', 'd____________', 'd_____a______', 'd_____a______', 'd_____a______', 'd_____a______', 'd_____a______', 'd__e__a__e___', 'd__e__a__e___', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_av_s__', '_av_so_', '_av_son', '_aveson', 'waveson', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18}
Diversity (number of unique sequence lengths) in batch: 16



  0%|          | 3/1641 [00:04<30:57,  1.13s/it]

First state in batch: ['un__maniz_', 'un_omaniz_', 'unwomaniz_', 'unwomanize', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['kaw_ah', 'kaw_ah', 'kaw_ah', 'kaweah', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['b_i__l___', 'b_i__l___', 'b_i__l_ss', 'b_i__l_ss', 'b_i_gl_ss', 'b_i_gl_ss', 'b_i_gl_ss', 'b_i_gl_ss', 'b_i_gl_ss', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['_____________', '___t_________', '___t_____n___', '___t_____n_e_', 'c__t_____n_e_', 'c__t_____n_e

  2%|▏         | 25/1641 [00:06<04:47,  5.62it/s]

First state in batch: ['___e__te_ic', '___e__teric', '_o_e_oteric', '_o_esoteric', 'nonesoteric', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15



  3%|▎         | 43/1641 [00:07<02:08, 12.39it/s]

First state in batch: ['_________', '_________', '_________', '_________', '_________', '_________', '_________', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['___________', '__rr_______', '__rr_______', '__rr___i___', '__rr___i___', '__rra__i___', '__rra__i___', '__rra__i___', '__rra_vi___', '__rra_vi___', '__rra_vi___', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['bridec_ke', 'bridecake', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['c_c_________c___', 'c_c_________c___', '

  3%|▎         | 49/1641 [00:09<04:24,  6.01it/s]

First state in batch: ['telec_emic', 'telechemic', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15



  3%|▎         | 53/1641 [00:10<04:35,  5.76it/s]

First state in batch: ['sev_gne', 'sev_gne', 'sev_gne', 'sev_gne', 'sev_gne', 'sev_gne', 'sev_gne', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_________', '_________', '_________', '_________', '_________', '_________', '_________', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_ethib', 'kethib', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_______', '_______', '__m____', '__m____', '__m____', '__m____', '__m____', '__m____', '', '', '', '', '', '', '', '', '']

  4%|▍         | 73/1641 [00:13<03:46,  6.92it/s]

First state in batch: ['br__e__a_ed', 'br__e__a_ed', 'br__e__a_ed', 'br__e__a_ed', 'br__e__a_ed', 'br__e__a_ed', 'br__e__a_ed', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16



  6%|▌         | 94/1641 [00:14<02:13, 11.61it/s]

First state in batch: ['___r____', '___r____', '__ur____', '__ur____', '__ur____', '__ur____', '__ur____', '__ur____', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}
Diversity (number of unique sequence lengths) in batch: 18

First state in batch: ['multi_arously', 'multi_arously', 'multi_arously', 'multi_arously', 'multi_arously', 'multifarously', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['coenos_eal', 'coenos_eal', 'coenosteal', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['re_ed__e_e', 're_eda_eae', 'reseda_eae', 'resedaceae', '', '', '', '',

  7%|▋         | 111/1641 [00:17<03:00,  8.46it/s]

First state in batch: ['_________', '_________', '_________', '_________', '_________', '__e____e_', '__e____e_', '__e____e_', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['__r_am', '__r_am', '__r_am', '__r_am', '__r_am', '_cr_am', '_cr_am', '_cr_am', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['___________', '______i__i_', '__m___i__i_', '__mo_oi__i_', '__mo_oi_ti_', '__mo_oi_ti_', '__mopoi_ti_', '_emopoieti_', '_emopoieti_', '_emopoieti_', '_emopoietic', '_emopoietic', 'hemopoietic', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in 

  7%|▋         | 121/1641 [00:20<04:11,  6.04it/s]

First state in batch: ['___________s___', 'i__________si__', 'i__________si__', 'i__________si__', 'i__________si__', 'in_____n___si_n', 'in_____n_e_si_n', 'int____n_e_si_n', 'int__c_n_e_si_n', 'intr_c_n_ersi_n', 'introcon_ersion', 'introconversion', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
Diversity (number of unique sequence lengths) in batch: 14



  8%|▊         | 126/1641 [00:20<03:57,  6.37it/s]

First state in batch: ['thrapp_e', 'thrapp_e', 'thrapple', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['i_terlocutory', 'interlocutory', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['_a_onom__a__y', '_a_onomi_a__y', '_axonomi_a__y', 'taxonomi_a__y', 'taxonomica__y', 'taxonomically', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['o_serva_le', 'observable', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {

  9%|▊         | 142/1641 [00:21<02:26, 10.23it/s]

First state in batch: ['__on___ng', '__on___ng', '__on___ng', '__on___ng', '__on___ng', '__on___ng', '__on___ng', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_i____', '_i____', '_i_l__', '_i_lt_', '_ielt_', 'kielt_', 'kielty', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
Diversity (number of unique sequence lengths) in batch: 14

First state in batch: ['herrgrds_st', 'herrgrdsost', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16



  9%|▉         | 154/1641 [00:23<03:09,  7.84it/s]

First state in batch: ['re__en', 're_sen', 're_sen', 'remsen', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['a_t_elect____', 'a_t_elect_o__', 'a_t_electro__', 'a_t_electro__', 'ant_electron_', 'ant_electron_', 'antielectron_', 'antielectrons', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 19}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['_y__a', 'wy__a', 'wyo_a', 'wyola', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['___e__ee_e_', '___e__ee_e_', '___e__ee_e_', '___e__ee_e_', '___e__ee_e_', '___e__ee_e_', '___e__ee

 10%|█         | 166/1641 [00:24<02:34,  9.56it/s]

First state in batch: ['le__type', 'le_mtype', 'le_mtype', 'le_mtype', 'le_mtype', 'le_mtype', 'le_mtype', 'le_mtype', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['_________', '___b_____', '___b_____', '___b___o_', '_i_b___o_', '_i_b___o_', 'wi_b___o_', 'wi_bu__o_', 'wilbu__o_', 'wilbur_o_', 'wilburto_', 'wilburton', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['s__ps', 's__ps', 'sk_ps', 'skeps', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15



 10%|█         | 169/1641 [00:27<05:01,  4.89it/s]

First state in batch: ['______l_l________', 's_____l_l_s___s_s', 's____yl_l_s___s_s', 's____yl_l_s_h_s_s', 's___dyl_l_s_h_s_s', 's___dyl_l_s_h_s_s', 's___dyl_l_s_h_s_s', 'sp__dyl_l_s_h_s_s', 'sp__dyl_lis_h_sis', 'sp_ndyl_lis_h_sis', 'sp_ndyl_listh_sis', 'sp_ndyl_listh_sis', 'spondylolisth_sis', 'spondylolisthesis', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['p_edete_t', 'p_edetest', 'predetest', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['me_a', 'me_a', 'me_a', 'me_a', 'me_a', 'meda', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batc

 12%|█▏        | 190/1641 [00:28<02:42,  8.92it/s]

First state in batch: ['s_l__rgr_s_h__', 's_l__rgr_s_h__', 's_l__rgr_s_h__', 's_l__rgr_s_h__', 's_l__rgr_s_h__', 's_lb_rgr_s_h__', 's_lb_rgr_s_h_n', 's_lb_rgr_s_h_n', 's_lb_rgr_s_h_n', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['hornblen_ophyre', 'hornblendophyre', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['_____r____', '_____r____', '_____rn_n_', '__t__rn_n_', '_ut_urn_n_', 'out_urn_n_', 'out_urn_n_', 'out_urn_n_', 'out_urnin_', 'out_urnin_', 'out_urnin_', 'out_urnin_', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) 

 12%|█▏        | 193/1641 [00:31<04:39,  5.18it/s]

First state in batch: ['____o_', '____o_', '__r_o_', '__r_o_', '__r_o_', '__r_o_', '__r_o_', '__r_o_', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['__si_____i_', '__si____hi_', '__si____hic', 'p_si___phic', 'p_si___phic', 'p_sig__phic', 'p_sig__phic', 'pasig_aphic', 'pasigraphic', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['met_psychology', 'met_psychology', 'met_psychology', 'met_psychology', 'met_psychology', 'metapsychology', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_______

 13%|█▎        | 214/1641 [00:31<02:46,  8.57it/s]

First state in batch: ['i__e__al__la_', 'i__e__al_ula_', 'in_e__al_ula_', 'in_e__al_ula_', 'in_er_al_ular', 'inter_al_ular', 'inter_al_ular', 'intervalvular', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['sur_i_oress', 'survivoress', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['______________', '______________', '_____p________', '__oo_po_______', '__oo_po_______', '__oo_po_______', '__oo_po__u____', '__oo_po__u_i__', '__oo_po__u_i__', '__oo_po__u_i__', '__oo_po__u_i__', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15



 13%|█▎        | 219/1641 [00:34<03:51,  6.14it/s]

First state in batch: ['sup_rag_nci_s', 'superagencies', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['com__ara_t', 'com__ara_t', 'com__ara_t', 'com__ara_t', 'com__ara_t', 'com__ara_t', 'com__ara_t', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_e_er__l', '_e_er__l', '_e_erb_l', '_e_erb_l', 'de_erb_l', 'de_erb_l', 'de_erb_l', 'de_erbal', 'de_erbal', 'de_erbal', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['_____r', '_i___r', '_i___r', '_i_k_r', 'bi_k_r', 'bi_k_r', 'bi_ker', 'bilk

 14%|█▍        | 229/1641 [00:34<02:32,  9.27it/s]

First state in batch: ['tr_p_y_ess', 'tr_p_y_ess', 'tr_p_y_ess', 'tr_p_y_ess', 'trop_y_ess', 'trop_y_ess', 'trop_y_ess', 'trophy_ess', 'trophy_ess', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['_i__tow_r_d', 'hi_htow_r_d', 'hightow_r_d', 'hightowered', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['___l___', '___la__', '___la__', '___la__', '___la__', '___la__', '___la__', '___la_t', '___la_t', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['qu_tr_i_s', 'qu_tr_i_s', 'qu_tr_i_s', 'qu_tr_i_s', 'qu_tr_i_s', 'qu_tr_i_

 15%|█▍        | 238/1641 [00:35<02:50,  8.23it/s]

First state in batch: ['i_is_man', 'i_is_man', 'i_ishman', 'i_ishman', 'i_ishman', 'i_ishman', 'i_ishman', 'i_ishman', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['__lfne__e_', 'h_lfne_he_', 'halfne_he_', 'halfnephe_', 'halfnephew', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['__u__u_____', '__u__u_b___', '__u__u_bi__', '__u_nu_bin_', '__u_nu_bing', '_ou_nu_bing', 'sou_nu_bing', 'soulnu_bing', 'soulnu_bing', 'soulnumbing', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16



 15%|█▍        | 243/1641 [00:37<04:01,  5.78it/s]

First state in batch: ['e_fa__a', 'e_fa_la', 'eufaula', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['e____________', 'e__________m_', 'e__________my', 'e__________my', 'e__________my', 'e__________my', 'e__________my', 'e_____g____my', 'e_____g____my', 'e__p__g____my', 'e__p__g____my', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['__t__li_e', 'o_to_li_e', 'o_to_line', 'outo_line', 'outofline', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['simplici_ies', 'simplici_ies', 's

 15%|█▌        | 253/1641 [00:37<02:19,  9.92it/s]

First state in batch: ['p____g__', 'p____g__', 'p____g__', 'p____g__', 'p____g__', 'p____g__', 'p____ge_', 'po___ge_', 'po___ge_', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['t_dric', 't_dric', 't_dric', 't_dric', 'tedric', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15



 16%|█▌        | 257/1641 [00:37<02:12, 10.48it/s]

First state in batch: ['b____y_e_i_', 'b___hy_e_i_', 'br__hy_eri_', 'br_chyceric', 'brachyceric', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['s____', 's___k', 's__ck', 'sm_ck', 'sm_ck', 'smack', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['____m____um', '____m____um', 'z___m____um', 'z___m____um', 'z___m____um', 'z___m____um', 'z___m____um', 'z___m____um', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
Diversity (number of unique sequence lengths) in batch: 14

First state in batch: ['_ulciform', '_ulciform', '_ulciform', 'sulciform', '', '', '', '', '', '', '', '', '', '', '']
Unique sequenc

 16%|█▌        | 262/1641 [00:39<03:11,  7.20it/s]

First state in batch: ['_____b__i', 'k____b_ki', 'k____b_ki', 'k____bski', 'k__z_bski', 'ko_z_bski', 'ko_zybski', 'korzybski', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['e__er___me', 'e__er___me', 'e__er___me', 'e__ero_ome', 'e__ero_ome', 'e__ero_ome', 'e__ero_ome', 'e__ero_ome', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['______', '______', '______', '______', '______', '______', '_o____', '_o____', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18}
Diversity (number of unique sequence lengths) in batch: 16



 16%|█▋        | 270/1641 [00:41<03:44,  6.11it/s]

First state in batch: ['____________', '____________', '____________', 'h___________', 'h___________', 'h______ll___', 'h______ll___', 'h______ll___', 'h______ll___', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['mu_i_ity', 'mu_i_ity', 'mu_idity', 'mu_idity', 'mu_idity', 'mucidity', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['mi__m_____', 'mi__m_____', 'mi__m_____', 'mi__m_____', 'mi__m_____', 'mi__m_____', 'mi__m_____', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_h_____', '_h_____',

 17%|█▋        | 279/1641 [00:41<02:17,  9.87it/s]

First state in batch: ['___d__', '___d__', '___d__', '___d__', '___d__', 'w__d__', 'w__d__', 'w__d_n', 'w_yd_n', 'w_yd_n', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['r__cc_ssion', 'r__cc_ssion', 'r__cc_ssion', 'r_acc_ssion', 'reaccession', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['_____b__k', '_____b__k', '_____b__k', '_____b__k', '_____b__k', '_____b_ck', '_____b_ck', '_____b_ck', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_r_____', '_ry____', '_ry_i__', '_ry_i__', '_ry_i__', '_ry_i__', 'dry_i

 17%|█▋        | 286/1641 [00:43<03:14,  6.98it/s]

First state in batch: ['___________', '___________', '___________', '___________', '__________y', '__________y', '_______e__y', '_______e__y', '_______e__y', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['pr_____r_____', 'pr_____r____n', 'pr_____r____n', 'pr_____r____n', 'pr_____r____n', 'pr_m___r____n', 'pr_m___r_t__n', 'pr_m___r_t__n', 'pr_m___r_t__n', 'pr_m___r_t__n', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 19}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['ant_', 'anth', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16



 18%|█▊        | 291/1641 [00:44<04:15,  5.28it/s]

First state in batch: ['_________', '_________', '_________', '_________', '_______l_', '_______l_', '_______l_', '_______ls', '_______ls', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['i_terstim_lati__', 'i_terstim_lati__', 'i_terstimulati__', 'i_terstimulati__', 'i_terstimulati__', 'interstimulati_n', 'interstimulation', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['__p__gm_ti___', 'u_p__gm_ti___', 'u_pr_gm_ti___', 'u_pr_gm_ti__l', 'unpr_gm_ti__l', 'unpr_gm_tic_l', 'unpragmatical', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

 19%|█▉        | 310/1641 [00:46<02:59,  7.43it/s]

First state in batch: ['mu_exid', 'murexid', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['___________', '__________l', '_n________l', '_n________l', '_n___s____l', 'in___s____l', 'in___s____l', 'in__rs__r_l', 'int_rs__r_l', 'int_rs__r_l', 'int_rsp_r_l', 'intersp_r_l', 'intersp_r_l', 'intersp_r_l', 'interspor_l', 'interspor_l', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['_______', '____e__', '____e__', '____e__', '____e__', '____e__', '____e__', '____e__', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15



 19%|█▉        | 313/1641 [00:47<03:26,  6.43it/s]

First state in batch: ['quasimi_h__', 'quasimi_h__', 'quasimi_h__', 'quasimi_h__', 'quasimi_h__', 'quasimi_h_y', 'quasimi_h_y', 'quasimi_hty', 'quasimi_hty', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16



 19%|█▉        | 315/1641 [00:48<04:01,  5.50it/s]

First state in batch: ['d_______', 'd___a___', 'd___a___', 'd___a___', 'd_f_a___', 'd_f_a___', 'd_f_at__', 'd_f_at__', 'd_f_at__', 'defeatee', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['k_ttl_ng', 'kittling', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['gl___r', 'gl__or', 'gl__or', 'gl__or', 'gl__or', 'gl__or', 'gl__or', 'gl__or', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['__o______', '__o____e_', '__o____e_', '_wo____e_', '_wof___e_', '_wof___e_', '_wofi__e_', '_wo

 20%|██        | 334/1641 [00:50<02:54,  7.48it/s]

First state in batch: ['s_____e___s', 's_____e___s', 's_____e___s', 's_____e___s', 's_____e___s', 's_____e___s', 's_____e___s', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_o________', '_o_____a__', '_o_____a__', '_o_____a__', 'co____ca__', 'co____ca__', 'co_n__ca__', 'co_n__ca__', 'co_nd_ca__', 'co_nd_ca__', 'co_nd_ca__', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_______', '_______', 't______', 't______', 't______', 't______', 't______', 't__s___', 't__s___', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15



 21%|██        | 337/1641 [00:50<02:58,  7.31it/s]

First state in batch: ['_vo__t_o_a_', '_vo_ut_o_a_', 'evo_ut_o_a_', 'evo_utio_a_', 'evolutio_al', 'evolutional', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16



 21%|██        | 338/1641 [00:51<03:42,  5.85it/s]

First state in batch: ['__e__g_am', '_ie__g_am', '_ie__g_am', '_ie__g_am', '_ie__g_am', '_ier_gram', '_ier_gram', '_ierogram', 'hierogram', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['hydro_enisation', 'hydro_enisation', 'hydro_enisation', 'hydrogenisation', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['__________', '________u_', '________u_', '_____i__u_', '_____i__up', '_____i_eup', 'd____ideup', 'do___ideup', 'dow__ideup', 'down_ideup', 'down_ideup', 'downsideup', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18}
Diversity (number of unique sequence lengths) in batch: 16

First sta

 22%|██▏       | 358/1641 [00:54<03:18,  6.48it/s]

First state in batch: ['____rgoo__', '____rgoo__', '____rgoo__', '____rgoo__', '____rgoo__', '____rgoo__', '____rgoo__', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['____________', 'c____c______', 'c____c______', 'c____c_____e', 'c____c_____e', 'c____c_s___e', 'c____c_sa__e', 'c__t_c_sa__e', 'c__t_c_sa__e', 'c__t_c_sa_le', 'c__t_c_sable', 'c__t_c_sable', 'c__t_c_sable', 'cr_t_c_sable', 'cr_t_c_sable', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['the_a', 'the_a', 'thema', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in ba

 22%|██▏       | 362/1641 [00:55<03:20,  6.36it/s]

First state in batch: ['phyl_genetic', 'phylogenetic', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['___________c', '_______l___c', '_______l___c', '____o__lo__c', '_m__om_lo__c', '_m_nom_lon_c', '_minom_lonic', 'aminomalonic', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 19}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['___a___', 'r__a___', 'r__a___', 'r__a_p_', 're_a_p_', 're_a_ps', 're_amps', 'revamps', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['p_____t_nt_y', 'p__s_st_nt_y', 'pe_s_stent_y', 'pers_stent

 23%|██▎       | 382/1641 [00:57<02:52,  7.30it/s]

First state in batch: ['pe_iu_ula_', 'pe_iuvula_', 'pe_iuvula_', 'pe_iuvula_', 'periuvular', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['______t___ht', '__r___t___ht', 's_r___t___ht', 'spr___t___ht', 'spri__t___ht', 'spri__t___ht', 'spri__ta__ht', 'spri_gta_ght', 'spri_gta_ght', 'spri_gtaught', 'springtaught', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['_t______', '_t______', '_t____g_', 'st____g_', 'st__d_g_', 'st__d_g_', 'st__d_g_', 'st__d_ge', 'st__d_ge', 'st_nd_ge', 'standage', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15



 23%|██▎       | 385/1641 [00:57<02:44,  7.64it/s]

First state in batch: ['s__s_________', 's__sa___a____', 's__sa___a____', 's__sa___a____', 's__sa___a____', 's__sa___a____', 's__sa___a____', 's__sa___a____', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16



 24%|██▎       | 386/1641 [00:58<03:43,  5.61it/s]

First state in batch: ['ergogra_h', 'ergograph', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['__r_______', '__r_______', '__r_______', '__r_______', '__r_______', '__r_______', '_ar_______', '_ar_______', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['_____s_', '_____s_', '____os_', 'c___os_', 'c___os_', 'c__pos_', 'c__pos_', 'cu_pos_', 'cu_pos_', 'cu_pos_', 'cu_pos_', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_________e_', '_________e_', '_________e_', '_________e_', '___

 25%|██▍       | 406/1641 [01:01<02:55,  7.04it/s]

First state in batch: ['_______n_', '_______n_', '_______n_', '_______n_', '_______n_', '_______n_', '_______n_', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['i_e_apped', 'icecapped', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['________', '________', '________', '________', '________', '________', '________', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['reti_ulas', 'reti_ulas', 'retinulas', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence

 25%|██▍       | 410/1641 [01:02<03:30,  5.86it/s]

First state in batch: ['_______', '_______', '_______', 'b______', 'b______', 'b______', 'b______', 'ba_____', 'ba__e__', 'ba__e__', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_a___a_ue', '_a_d_a_ue', '_a_dva_ue', '_a_dva_ue', '_a_dva_ue', 'la_dvalue', 'la_dvalue', 'landvalue', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['________i___', '________i___', '________i___', '________i___', '_______ai___', '_______ai___', '_______ai___', '_______ai___', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['o_e_f_own', 

 26%|██▌       | 430/1641 [01:04<02:50,  7.12it/s]

First state in batch: ['_____u_v_v___', '_____u_v_v___', '_____u_v_v___', '_____u_v_v__g', '_____u_v_v_ng', 's___su_v_v_ng', 'se__su_v_v_ng', 'se__su_viving', 'sel_su_viving', 'sel_surviving', 'selfsurviving', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['__________________', '__________________', '__________________', '__________________', '________________t_', '________________ty', '________________ty', '__i__________i_ity', '_nin____n____i_ity', '_nin____n____i_ity', '_nin____n____i_ity', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['_e__ee', '_e__ee', 'fe__ee', 'fe__ee', 'fe_cee', 'fe_cee', 'fe_cee', 'fe_cee', 'fe_cee', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: 

 26%|██▋       | 434/1641 [01:06<03:39,  5.50it/s]

First state in batch: ['checker_erry', 'checkerberry', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_i___i_i____', '_i___i_i_a__', '_i___i_i_a__', '_i___i_i_a__', '_i___i_i_a__', '_i___i_i_a__', '_i___i_i_a__', '_is__i_i_a__', '_is__i_i_a__', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['b_s_uth_c', 'b_smuth_c', 'bismuthic', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_rosswords', '_rosswords', '_rosswords', 'crosswords', '', '', '', '', '', '', '', '', '', 

 28%|██▊       | 454/1641 [01:08<02:45,  7.17it/s]

First state in batch: ['karl_ng', 'karl_ng', 'karling', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['lim_', 'lime', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
Diversity (number of unique sequence lengths) in batch: 14

First state in batch: ['rorifl_ent', 'rorifl_ent', 'rorifluent', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['un_hicken', 'unthicken', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch:

 28%|██▊       | 458/1641 [01:09<03:10,  6.22it/s]

First state in batch: ['buffe_er', 'buffe_er', 'buffe_er', 'buffe_er', 'buffe_er', 'buffe_er', 'buffe_er', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['____________', '_h__________', '_h________r_', 'ch________r_', 'ch________r_', 'ch________rs', 'chi_____i_rs', 'chi__o__i_rs', 'chiffo__i_rs', 'chiffo__iers', 'chiffonniers', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['c_i___h__e', 'c_i___h__e', 'c_i__wh__e', 'c_i__wh__e', 'c_i__wh__e', 'c_i_gwh__e', 'c_i_gwh__e', 'c_i_gwh__e', 'c_i_gwh__e', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19}
Diversity (number of unique sequence lengths) in batch: 17



 29%|██▉       | 478/1641 [01:12<02:49,  6.87it/s]

First state in batch: ['____i_', '____i_', 'p___i_', 'p___i_', 'p__ai_', 'p__ai_', 'p__ail', 'p_tail', 'p_tail', 'potail', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['al_id', 'algid', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 19}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['____________', '__v_________', '__v_____t___', '__v_____t___', '__v_____t___', '__va___at___', '__va___at___', '__va___at___', '__va___at___', '__va___at___', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['const_r', 'const_r', 'const_r', 'conster

 29%|██▉       | 482/1641 [01:13<02:54,  6.64it/s]

First state in batch: ['_a________', '_a________', '_a________', '_a________', '_a________', '_a_l______', '_a_l______', '_a_l______', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['i________i_ili__', 'i____p___i_ili__', 'i____p___i_ili_y', 'i____p___i_ili_y', 'i____p___i_ili_y', 'i____p___i_ili_y', 'i____p___i_ili_y', 'i____p___i_ili_y', 'i____p_ssi_ili_y', 'i____p_ssi_ili_y', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 19}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['micro_ymenoptera', 'micro_ymenoptera', 'micro_ymenoptera', 'microhymenoptera', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique se

 31%|███       | 502/1641 [01:16<02:49,  6.71it/s]

First state in batch: ['_________', '_n_______', '_n_______', '_n_______', '_n_______', '_n_______', '_n_______', '_n_______', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_________', '______i__', '____a_i__', '____a_i_o', '____a_i_o', '_e__a_i_o', '_e__ani_o', 'me__ani_o', 'mec_anico', 'mechanico', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_u_el', '_u_el', '_u_el', '_u_el', '_urel', '_urel', '_urel', '_urel', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['__ri_at_ticat_', 'p_ripat_ticat_', 'peri

 31%|███       | 506/1641 [01:16<02:46,  6.81it/s]

First state in batch: ['m_ri____s', 'm_ri____s', 'myri____s', 'myri_p__s', 'myri_p__s', 'myriopo_s', 'myriopo_s', 'myriopo_s', 'myriopods', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['__rr_a__', '__rr_a__', '__rr_a__', '__rr_a__', '__rr_a__', 'b_rr_a__', 'b_rr_a__', 'b_rr_a__', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['__oo_______', '__oo_______', '__oo_______', '__oo_______', '__oo_______', '__oo_______', '__oo_______', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['__________', '_____e__e_', '

 32%|███▏      | 526/1641 [01:19<02:33,  7.25it/s]

First state in batch: ['_____', '_____', '_____', '_____', '_____', '___n_', '__un_', '__un_', '__uns', '__uns', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['_a_________', '_a_________', '_a_________', '_a__o__o___', '_a_pop_o___', '_arpop_o___', '_arpop_o___', '_arpop_o___', '_arpop_o___', '_arpop_o___', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['___fu_i__', '___fu_i__', '_o_fu_io_', '_o_fusio_', 'nonfusion', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['ew_', 'ew_', 'ew_', 'ew_', 'ew_', 'ew_', 

 32%|███▏      | 530/1641 [01:20<02:49,  6.56it/s]

First state in batch: ['__ee_hibito_', '_ree_hibitor', '_ree_hibitor', '_reexhibitor', 'preexhibitor', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['___', 'n__', 'n_e', 'nae', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['____', 'e__e', 'e__e', 'er_e', 'erme', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['a____', 'as___', 'asc__', 'ascr_', 'ascry', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10,

 34%|███▎      | 550/1641 [01:22<02:35,  7.02it/s]

First state in batch: ['googo_p_exes', 'googo_p_exes', 'googo_p_exes', 'googo_p_exes', 'googo_p_exes', 'googo_p_exes', 'googo_p_exes', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['electrod_namometer', 'electrod_namometer', 'electrodynamometer', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['procollect_v_st_c', 'procollect_v_st_c', 'procollectivistic', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['misadmeasu_ement', 'misadmeasu_ement', 'misadmeasurem

 34%|███▍      | 554/1641 [01:23<02:42,  6.69it/s]

First state in batch: ['hi_e', 'hite', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['_____e_e__', 'a____e_e__', 'a____e_e__', 'a____e_e__', 'a____e_e__', 'a____e_e__', 'a____e_e__', 'ar___ere__', 'ar___ere__', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['__________', 'i__i______', 'i__i___o__', 'i__i___o__', 'i__i___o__', 'i__i___o__', 'i__i___o__', 'i__i___o__', 'i__i___o__', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['__diff____ti_t_d', '__diff____ti_t_d', '__diff____ti_t_d', '__diffe_e_ti_ted',

 35%|███▍      | 574/1641 [01:26<02:36,  6.80it/s]

First state in batch: ['_e_____b_e_e__e_', '_e_____b_e_e__e_', 're_____b_e_e__e_', 're_____b_e_e__e_', 're_____b_e_e__e_', 're_____b_e_e__e_', 're__o__b_e_e__e_', 're__o__b_e_e__e_', 're__o__b_e_e__e_', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['omnis_ient', 'omniscient', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['______________', '_______o____o_', '_______o__t_o_', '__ee___o__t_o_', 'p_ee__po__t_o_', 'p_ee__po__t_on', 'p_ee_apo_at_on', 'p_eevapo_at_on', 'p_eevapo_ation', 'preevaporation', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequen

 35%|███▌      | 578/1641 [01:27<02:35,  6.84it/s]

First state in batch: ['___t_g__', '___t_g__', '___t_ge_', '_o_t_ge_', '_o_tage_', '_o_tage_', '_o_taged', '_ontaged', 'montaged', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['__________', '__________', '__c_______', '__c_______', '__c_______', '__c_______', '__c_______', '__c_______', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['____a__', '____a__', '____a__', '____a__', '_o__a__', '_o__a__', '_o__a__', '_o__a__', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
Diversity (number of unique sequence lengths) in batch: 14

First state in batch: ['hypsibrachycephalis_', 'hypsibrachycephalis_', 'hypsibrachycephali

 37%|███▋      | 602/1641 [01:30<02:22,  7.32it/s]

First state in batch: ['c__e_a_e', 'cu_e_a_e', 'cu_e_a_e', 'cu_e_a_e', 'cu_e_a_e', 'cu_era_e', 'cu_era_e', 'cu_erage', 'cu_erage', 'cu_erage', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['t_________', 't_____j___', 't___e_j___', 't___e_j___', 't___e_j___', 't___e_j___', 't___e_j___', 't___e_j___', 't___e_ja__', 'ti__e_ja__', 'ti__e_jac_', 'ti__e_jac_', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['______', '_____l', 'h____l', 'h_mm_l', 'h_mm_l', 'h_mm_l', 'h_mm_l', 'h_mm_l', 'h_mm_l', 'hemmel', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 19}
Diversity (number of unique sequence lengths) in batch: 16

First state in

 38%|███▊      | 622/1641 [01:34<02:44,  6.18it/s]

First state in batch: ['nonapplicati_e', 'nonapplicative', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['______a', '_____da', '____nda', '_e__nda', '_e__nda', '_e__nda', '_e_inda', 'se_inda', 'selinda', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['_omm_n_ations', 'comm_n_ations', 'comm_n_ations', 'comm_n_ations', 'comm_n_ations', 'comm_n_ations', 'comm_n_ations', 'comm_n_ations', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['___', '___', '___', 'c__', 'c__', 'c_w', 'cbw', '', '', '', '', '',

 39%|███▉      | 646/1641 [01:37<02:32,  6.54it/s]

First state in batch: ['ironsi_es', 'ironsi_es', 'ironsides', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
Diversity (number of unique sequence lengths) in batch: 14

First state in batch: ['kno_ula_', 'kno_ula_', 'kno_ula_', 'knobula_', 'knobula_', 'knobular', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['woo_fie__', 'woo_fie__', 'woo_fiel_', 'woodfield', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['__d___d', '__du__d', '_edu_ed', '_edu_ed', '_educed', 'reduced', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5,

 41%|████      | 670/1641 [01:41<02:28,  6.55it/s]

First state in batch: ['__anslu___us', '__anslu___us', '__anslu___us', '__anslu___us', '__anslu___us', '__anslu___us', '__anslu___us', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['___________', '_________e_', '_______a_e_', '_______a_e_', '_______a_e_', '_______a_e_', '_______a_e_', '_____t_a_e_', '___i_t_a_e_', '___i_tma_e_', '___i_tma_e_', '__ri_tma_e_', '__ri_tma_e_', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_______', '_______', 'c______', 'c__g___', 'c__gu__', 'co_gu__', 'co_gu__', 'coagu_a', 'coagu_a', 'coagula', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Diversity (number

 42%|████▏     | 694/1641 [01:44<02:23,  6.59it/s]

First state in batch: ['eucl_des', 'eucl_des', 'euclides', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['hope_ale', 'hope_ale', 'hope_ale', 'hopedale', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['grai_ier', 'grainier', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 19}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['dor_', 'dor_', 'dori', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}


 44%|████▍     | 718/1641 [01:48<02:19,  6.60it/s]

First state in batch: ['_es__ea_', 'wes__ea_', 'wes_hea_', 'westhea_', 'westhead', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['_______s_', '__p____s_', '__p____s_', '__pee__s_', '__peev_s_', 'i_peevis_', 'i_peevish', 'impeevish', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['_y______e_', 'hy______e_', 'hy_____te_', 'hyp____te_', 'hyp__r_ter', 'hyp__r_ter', 'hyp__rater', 'hyp_crater', 'hyp_crater', 'hypocrater', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['________a___', '________a___', '________a___', '______

 45%|████▌     | 742/1641 [01:52<02:16,  6.60it/s]

First state in batch: ['_eterodromous', '_eterodromous', 'heterodromous', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['wa__iga_', 'wa__iga_', 'wannigan', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_yxopodou_', '_yxopodou_', '_yxopodous', '_yxopodous', 'myxopodous', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['radarsco_e', 'radarscope', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 

 47%|████▋     | 766/1641 [01:55<02:12,  6.59it/s]

First state in batch: ['_______', '_______', '_______', '_______', '_______', '_______', '_______', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['pr_r__urn', 'pr_r_turn', 'prereturn', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['_________', 'c________', 'c____d___', 'c___idi__', 'c___idiu_', 'cy__idiu_', 'cy_bidiu_', 'cymbidium', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['_a__ina_itian', '_ar_ina_itian', '_ar_ina_itian', '_ardina_itian', '_ardinalitian', '_ardinalitian', '_ardinalitian', 

 48%|████▊     | 790/1641 [01:59<02:14,  6.34it/s]

First state in batch: ['w_ngp_st', 'w_ngpost', 'wingpost', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['____________', '__e_________', '__e_________', '__e_________', '__e_________', '__e_________', '__e_________', '__e_________', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
Diversity (number of unique sequence lengths) in batch: 14

First state in batch: ['___', 'g__', 'g__', 'g__', 'g__', 'g_r', 'g_r', 'g_r', 'g_r', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['_x_r_v_____', '_x_r_v_s___', '_x_r_v_s___', '_x_r_v_s___', '_xtr_v_s_t_', '_xtr_v_s_t_', '_xtr_v_s_t_', '_xtr_v_s_t_', 

 49%|████▊     | 798/1641 [02:00<02:00,  7.02it/s]

First state in batch: ['_________', '______e__', '______ea_', '______ea_', '____rrea_', '_o_orrea_', '_o_orrean', '_o_orrean', 'go_orrean', 'gomorrean', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['_p___h______', '_p___h______', '_p___h______', '_p___h_____c', '_p___h_____c', '_p___h_____c', '_p___h___t_c', '_p___h___t_c', '_p___h___t_c', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['b_eakp_i_t', 'b_eakp_i_t', 'b_eakpoi_t', 'breakpoi_t', 'breakpoi_t', 'breakpoint', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch

 50%|████▉     | 814/1641 [02:03<02:14,  6.14it/s]

First state in batch: ['_ynasticism', 'dynasticism', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['___i__i_', '___i__i_', '_r_ir_i_', 'ur_ir_i_', 'ur_ir_i_', 'ur_ir_i_', 'ur_ir_i_', 'ur_ir_i_', 'ur_ir_i_', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['da_tee_', 'da_tee_', 'da_teel', 'da_teel', 'dasteel', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['___m_z_', '__em_ze', 'i_emize', 'itemize', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence length

 50%|█████     | 822/1641 [02:03<01:56,  7.01it/s]

First state in batch: ['_________b__', '_________b__', '_cc______b__', '_cco_____b__', '_cco_____b__', '_cco_____b__', '_cco_____b__', '_cco_____b__', '_ccom____b__', '_ccom____b__', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['on_jo_nt_d', 'on_jo_nt_d', 'on_jo_nt_d', 'on_jo_nt_d', 'onejo_nted', 'onejo_nted', 'onejo_nted', 'onejo_nted', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['tr_nkgeld', 'trinkgeld', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['h_m_ch_amyde__s', 'h_m_chlamyde__s', 'h_m_chlamy

 51%|█████     | 838/1641 [02:07<02:16,  5.87it/s]

First state in batch: ['n_n_aised', 'n_n_aised', 'n_n_aised', 'n_n_aised', 'n_n_aised', 'n_nraised', 'nonraised', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['h____d_um', 'hi__idium', 'hippidium', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['fas_ly', 'fastly', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['t__u__ost__s', 't__u__ost__s', 't__u__ost__s', 'te_u__ost_es', 'te_u__ost_es', 'te_u__ost_es', 'te_u__ost_es', 'te_u__ost_es', '', '', '', '', '', '', '

 53%|█████▎    | 862/1641 [02:10<02:01,  6.40it/s]

First state in batch: ['dermosy_ov___s', 'dermosy_ov___s', 'dermosy_ov___s', 'dermosy_ov___s', 'dermosy_ovi_is', 'dermosy_ovi_is', 'dermosy_ovi_is', 'dermosy_ovi_is', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['tr_vor', 'trevor', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['__rderi_gly', '_urderi_gly', 'murderi_gly', 'murderingly', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['adrenalectomi_ing', 'adrenalectomizing', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '

 53%|█████▎    | 870/1641 [02:11<01:47,  7.18it/s]

First state in batch: ['angl_dutchman', 'anglodutchman', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_yscras_c', '_yscras_c', '_yscrasic', '_yscrasic', '_yscrasic', '_yscrasic', '_yscrasic', '_yscrasic', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_ridra', 'tridra', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_______', '_______', '_______', '_____o_', '_____o_', '_____on', '_____on', '_____on', '_____on', '', '', '', '', '', '', '', '', '']
Unique

 54%|█████▍    | 886/1641 [02:14<02:02,  6.18it/s]

First state in batch: ['y______', 'y______', 'y______', 'y______', 'y______', 'y______', 'yo_____', 'yo__n__', 'yo__n__', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['dark_uited', 'darksuited', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['_hastisable', 'chastisable', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['____d__u___d', '__r_d_ru___d', '__r_d_ru___d', '__r_d_ru___d', '__r_dcru___d', '__r_dcru_h_d', '__r_dcru_h_d', '__r_dcru_h_d', '_or_dcru_h_d', '_or_dcru_h_d'

 54%|█████▍    | 894/1641 [02:14<01:44,  7.17it/s]

First state in batch: ['_________', '_________', 'c________', 'c________', 'c____e___', 'c____et__', 'co___et__', 'co___et__', 'co__ret__', 'co__ret__', 'co__ret__', 'co__retu_', 'com_retum', 'combretum', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['________', '_v______', '_v______', '_v______', '_v______', '_v______', '_v______', '_v______', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['a_rv', 'a_rv', 'acrv', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['__p_p_lat__n', 'r_p_p_lat__n', 'r_p_p_lat__n', 'r_p_p_lat__n', '

 56%|█████▌    | 920/1641 [02:18<01:32,  7.77it/s]

First state in batch: ['undronel_ke', 'undronel_ke', 'undronel_ke', 'undronel_ke', 'undronel_ke', 'undronel_ke', 'undronelike', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_______d__', '_______d__', '_______d__', '_______d__', 't______d__', 't______d__', 't______d_r', 't______d_r', 't______der', 't___o__der', 't___o__der', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['rayn_r', 'rayner', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['____o___o____', '____ot__o____', '____ot__o____', '

 57%|█████▋    | 934/1641 [02:22<02:02,  5.76it/s]

First state in batch: ['____y___', '____y___', '____y___', '____yi__', '____yi__', '____yi__', '____yi_g', 'p___yi_g', 'p___yi_g', 'p___yi_g', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['_p_______', '_p_______', '_p__g____', '_ph_g____', '_ph_g____', '_ph_g____', '_ph_g____', '_ph_g____', '_ph_g____', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['____r___', '____r_t_', '____r_t_', '____r_t_', '___er_t_', '_n_er_t_', '_n_er_t_', '_ncer_t_', '_ncer_t_', 'ancerata', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
Diversity (number of unique sequence lengths) in batch: 14

First state in batch: ['unne____our__ne__', 'unne____our__ne__', 

 58%|█████▊    | 958/1641 [02:25<01:48,  6.27it/s]

First state in batch: ['g__up', 'g__up', 'g__up', 'geeup', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_o_vick', '_o_vick', '_onvick', 'gonvick', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['spondylolist_esis', 'spondylolisthesis', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['s_______s_____________', 's______ls_l_________l_', 's______ls_l_________l_', 's______ls_l__a___a__l_', 's______ls_lp_a___a__l_', 's______ls_lp_a___a_ol_', 's______ls_lp_a__

 61%|██████    | 1004/1641 [02:29<01:07,  9.46it/s]

First state in batch: ['ch_r_', 'ch_r_', 'ch_r_', 'ch_r_', 'ch_r_', 'ch_r_', 'ch_r_', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_____________', '_y___y_______', '_y___y_______', '_y_t_y_______', 'dy_t_y___d___', 'dy_t_y___d___', 'dy_t_y__idi__', 'dy_t_y__idi__', 'dy_t_y__idi__', 'dy_t_y_oidi__', 'dyst_y_oidis_', 'dyst_y_oidism', 'dysthy_oidism', 'dysthy_oidism', 'dysthy_oidism', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['____________', '___tt_______', '___tt____e__', '___ttr___e__', '__sttr___e__', '__sttra__ea_', 'p_sttra__ea_', 'posttra__ea_', 'posttrac_ea_', 'posttrac_ea_', 'posttrachea_', 'posttracheal', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8,

 62%|██████▏   | 1012/1641 [02:33<01:41,  6.19it/s]

First state in batch: ['_u______o_____', '_u__r___o_____', '_u__r___oc__c_', '_u__ri__oc__c_', '_u__rinnoc_nc_', '_u__rinnoc_nc_', 'su__rinnoc_nc_', 'su_erinnocence', 'su_erinnocence', 'superinnocence', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['a_odis', 'apodis', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['g_sser', 'g_sser', 'gosser', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['uncur_ously', 'uncuriously', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']

 63%|██████▎   | 1030/1641 [02:36<01:42,  5.95it/s]

First state in batch: ['deca_bu_ation', 'decarburation', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['roselippe_', 'roselippe_', 'roselippe_', 'roselippe_', 'roselippe_', 'roselippe_', 'roselippe_', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['ille_t', 'illect', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['v_letudin_ri_ni__', 'v_letudin_ri_nis_', 'v_letudin_ri_nism', 'v_letudin_ri_nism', 'valetudinarianism', '', '', '', '', '', '', '', '', '', '', '', ''

 64%|██████▍   | 1054/1641 [02:39<01:32,  6.33it/s]

First state in batch: ['su________', 'summ______', 'summ______', 'summ____l_', 'summ_r__l_', 'summer__le', 'summerd_le', 'summerdale', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_urie_', '_urie_', '_uries', '_uries', 'curies', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['____l', 'r___l', 'r___l', 'r___l', 'r___l', 'r_y_l', 'roy_l', 'roy_l', 'roy_l', 'roy_l', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['em__ss_ge', 'em__ss_ge', 'emb_ss_ge', 'embassage', '', '', '', '', '', '', '',

 66%|██████▌   | 1078/1641 [02:42<01:19,  7.09it/s]

First state in batch: ['__o_a', '__o_a', '__o_a', '__o_a', '__o_a', '__o_a', '__o_a', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['______a', '______a', '______a', '_o____a', '_os___a', '_os___a', '_os__ta', '_osp_ta', 'hosp_ta', 'hospita', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_________', '_________', '________s', '________s', '________s', '________s', '____h___s', '_r__h___s', '_r__h___s', 'tr__h_t_s', 'tr_ch_t_s', 'tr_ch_t_s', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['_rchduke', '_rchduke', 'archduke', '', ''

 67%|██████▋   | 1102/1641 [02:45<01:15,  7.13it/s]

First state in batch: ['_______', '_______', '_______', '_______', '_______', 'h______', 'h______', 'h______', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['ambitio_i_g', 'ambitio_i_g', 'ambitio_i_g', 'ambitio_i_g', 'ambitio_i_g', 'ambitio_i_g', 'ambitio_i_g', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['ouranop_obia', 'ouranophobia', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['__t__nost__s', '__t__nost__s', '_at__nost__s', '_at__nost__s', 'pat__nost__s', 'pat__nost__s', 'pat__

 69%|██████▊   | 1126/1641 [02:51<01:25,  6.05it/s]

First state in batch: ['_r_____ld', '_r_____ld', '_r____old', 'gr___gold', 'gree_gold', 'gree_gold', 'greengold', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['shop_ille', 'shop_ille', 'shop_ille', 'shop_ille', 'shop_ille', 'shop_ille', 'shop_ille', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['___l_____ll_', 'u__l_____ll_', 'u__l____all_', 'u__l____all_', 'u__l____all_', 'u__l____all_', 'u__l____all_', 'u_cl___call_', 'u_cl___call_', 'uncl___call_', 'uncle__call_', 'uncler_call_', 'unclericall_', 'unclerically', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
Diversity (number of unique sequence lengths) in batch: 14

Firs

 70%|███████   | 1150/1641 [02:54<01:20,  6.13it/s]

First state in batch: ['_h__h___', '_hy_h___', '_hy_h___', 'rhy_h___', 'rhy_h_a_', 'rhy_hma_', 'rhy_hmal', 'rhy_hmal', 'rhythmal', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['a_arya', 'anarya', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['______g______', '______g______', '______g______', '______g______', '______g______', '______g______', '______g______', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['irr_concil_m_nt', 'irr_concil_m_nt', 'irr_concil_m_nt', 'ir

 72%|███████▏  | 1174/1641 [02:58<01:13,  6.32it/s]

First state in batch: ['_____o___', 'w___wo___', 'w___wom__', 'w___wom_n', 'w_s_wom_n', 'w_s_wom_n', 'w_s_wom_n', 'w_shwom_n', 'washwoman', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['ma_e', 'ma_e', 'made', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['_ac__iz__', '_ac__iz__', 'rac__iz__', 'rac_miz__', 'racemize_', 'racemizes', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['f__o_____', 'fa_o_a___', 'fa_o_ab__', 'fa_o_ab__', 'fa_o_abl_', 'favo_abl_', 'favo_ably', 'favorably', '', '', 

 73%|███████▎  | 1195/1641 [02:58<00:53,  8.29it/s]

First state in batch: ['ra__anim', 'rabbanim', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['sauce_ot', 'sauce_ot', 'saucepot', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15



 73%|███████▎  | 1198/1641 [03:02<01:20,  5.52it/s]

First state in batch: ['__y_i_', '_hy_i_', '_hymi_', '_hymi_', 'chymic', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['ch___e', 'ch_b_e', 'ch_b_e', 'ch_b_e', 'ch_b_e', 'ch_b_e', 'ch_bie', 'ch_bie', 'chobie', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['_____f__', '_____f__', '____rf__', '__m_rf__', '__m_rf__', '__m_rf__', '_um_rfu_', 'hum_rfu_', 'hum_rful', 'humorful', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['_npat__nt', '_npat__nt', '_npat__nt', 'inpati_nt', 'inpati_nt', 'inpatient', '', '

 74%|███████▍  | 1222/1641 [03:06<01:10,  5.93it/s]

First state in batch: ['_________n_', '_________n_', '_________n_', 'e________ne', 'e________ne', 'e________ne', 'e________ne', 'e________ne', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['__________', '__________', '__________', '__________', '__________', '__________', '__________', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_______________', '_______h_______', '___o___h_______', '_u_o___h_______', '_u_o___h_______', '_u_o___h____i__', '_u_o___h___ri__', '_u_o___h___ri__', '_u_o___h___ri__', '_u_o___h___ri__', '_u_o___h___ri__', '_u_o___h___ri__', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 

 76%|███████▌  | 1246/1641 [03:10<01:06,  5.93it/s]

First state in batch: ['adr_a___', 'adr_a___', 'adr_at__', 'adriati_', 'adriatic', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['brachygra_hic', 'brachygraphic', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_armership', 'farmership', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['whi__en', 'whiffen', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 

 77%|███████▋  | 1270/1641 [03:14<01:04,  5.79it/s]

First state in batch: ['_____________', '_____________', '_______s_____', '____u__s_____', '____u__s_____', '____u__s___l_', '____u__s___l_', '____u__s___l_', '____u__s___l_', '____u__s__bl_', '__n_u__s__bl_', '__n_u_is_ibl_', '__n_u_is_ibl_', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['lamplight_d', 'lamplighted', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['h_atspot', 'heatspot', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['____a_a_', '____a_a_', '____a_a_', '____axa

 79%|███████▉  | 1294/1641 [03:17<00:53,  6.52it/s]

First state in batch: ['_ll____', '_ll____', '_ll____', '_ll____', '_ll___y', 'elle_ey', 'ellerey', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['sadis_', 'sadist', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
Diversity (number of unique sequence lengths) in batch: 14

First state in batch: ['________', '_o___o__', '_o__ho__', '_o__hoe_', '_o__hoe_', '_o__hoe_', '_o__hoe_', 'po__hoe_', 'po__hoe_', 'po_choe_', 'ponchoe_', 'ponchoe_', 'ponchoe_', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_tyl_ne', '_tyline', 'styline', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths i

 80%|████████  | 1318/1641 [03:20<00:48,  6.69it/s]

First state in batch: ['___', '___', '___', 't__', 't_s', 't_s', 't_s', 't_s', 't_s', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['pa__oa___s_t_', 'pa__oa___sit_', 'pa_eoa__esite', 'pa_eoa_desite', 'pa_eoandesite', 'pa_eoandesite', 'paleoandesite', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['aca__micals', 'acad_micals', 'acad_micals', 'academicals', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['p_e_ehea_sal', 'p_e_ehea_sal', 'p_e_ehea_sal', 'p_e_ehea_sal', 'p_e_ehea_sal', 'prerehea

 82%|████████▏ | 1342/1641 [03:24<00:44,  6.78it/s]

First state in batch: ['ro_ge', 'rouge', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['________', '__h_____', '__h__i__', '_ph__i__', '_ph__iu_', '_phe_iu_', 'aphe_iu_', 'aphesius', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
Diversity (number of unique sequence lengths) in batch: 14

First state in batch: ['______________', '______________', '______________', '______________', '______________', '______________', '__u___________', '__u____n______', '__u____n______', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_nowie_t', 'snowiest', '', '', '', '', '', '', '', '', '', '', '', '', '',

 83%|████████▎ | 1366/1641 [03:27<00:39,  6.88it/s]

First state in batch: ['__ccat_ng', '_iccating', 'siccating', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['_uc_t_n', '_uc_t_n', '_uc_t_n', '_uc_t_n', '_uc_t_n', '_uc_t_n', '_uc_t_n', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['_n__r__ic', '_n__r__ic', '_n__r__ic', '_n__r_pic', '_n__ropic', '_n_hropic', '_nthropic', 'anthropic', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['ca_sarian', 'ca_sarian', 'ca_sarian', 'ca_sarian', 'ca_sarian', 'ca_sarian', 'ca_sarian', '', '', '', '', '', '', 

 85%|████████▍ | 1390/1641 [03:31<00:36,  6.85it/s]

First state in batch: ['______m__________', '___f__m__________', '___f__m__a_____a_', '___f__mp_a_____a_', '___f_omp_a_____a_', '___f_omp_a_____a_', '___f_omp_a___t_a_', 's__f_omp_a___t_a_', 'se_f_omp_a_e_t_a_', 'se_f_omp_a_e_t_a_', 'se_f_omp_a_e_tia_', 'se_f_omp_a_e_tia_', 'se_f_omp_a_e_tia_', 'se_fcomp_ace_tia_', 'se_fcomp_ace_tia_', 'se_fcomp_acentia_', 'se_fcomp_acentia_']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['seawo__', 'seawo__', 'seawo__', 'seawor_', 'seawor_', 'seawor_', 'seaworn', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_n___________l_', '_n___________l_', '_n_____s____sl_', '_n_____s____sl_', '_n_____s____sl_', '_n_____s____sly', '_n_____s____sly', '_n_____s____sly',

 86%|████████▌ | 1414/1641 [03:33<00:30,  7.37it/s]

First state in batch: ['_o_______y__', '_o____r__y__', 'co____r__y__', 'co____r__y__', 'co____r__y__', 'co____r__y__', 'co____r__yl_', 'co____rs_yl_', 'cou___rs_yl_', 'cou_t_rstyl_', 'cou_t_rstyl_', 'cou_terstyle', 'counterstyle', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}
Diversity (number of unique sequence lengths) in batch: 18

First state in batch: ['_a_____', '_a_____', '_a_h___', '_a_h___', '_agh___', '_aghl__', '_aghl__', '_aghl__', '_aghl__', '_aghli_', '_aghli_', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['____s___s', '____s___s', '__r_s___s', '__r_s__ns', '_br_s__ns', '_br_si_ns', '_br_sions', 'abrasions', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique s

 88%|████████▊ | 1438/1641 [03:39<00:33,  6.06it/s]

First state in batch: ['_____i_', '____hi_', '____hi_', '___chic', '__achic', 'n_achic', 'n_achic', 'noachic', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['___________________', '___________________', '___c____________c__', '___c____________c__', 'e_ec_____e______c__', 'e_ec__o__e______c__', 'e_ec__o__e___u__c__', 'e_ec__o__e_a_u__ca_', 'elec__o__e_a_u__cal', 'elec__o__e_apu__cal', 'elect_ot_e_aput_cal', 'elect_othe_aput_cal', 'elect_othe_aputical', 'elect_othe_aputical', 'electrotheraputical', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['_o__e_t_e__', '_o__e_t_e__', '_o__e_t_e__', '_o__e_t_ess', '_o__e_t_ess', '_o__e_t_ess', '_o__e_t_ess', '_o__e_t_ess', '', '', '', '', '', '', '', '', '']
Uniq

 89%|████████▉ | 1462/1641 [03:45<00:33,  5.35it/s]

First state in batch: ['_____r__', '_____r__', '_i__ir__', '_i__ir__', '_i__ire_', '_i__ired', '_i__ired', '_i__ired', '_i__ired', '_i__ired', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['________k', '________k', '___e____k', '___e____k', 'p_pe____k', 'p_pe____k', 'p_per__rk', 'p_perw_rk', 'paperw_rk', 'paperwork', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['_______', '____n__', '____n__', '__a_n__', '__a_n__', 's_a_n__', 's_a_n__', 's_a_ni_', 's_ayni_', 's_ayni_', 'stayni_', 'stayni_', 'staynil', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First

 91%|█████████ | 1486/1641 [03:49<00:29,  5.20it/s]

First state in batch: ['_errates', 'ferrates', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['_ase_ul', '_aseful', 'vaseful', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['_______________', '_______________', '__________l____', '__________l____', '__________l_x__', '__________l_x__', 'a___a_a___lax__', 'a__ia_a___laxi_', 'a__ia_a___laxis', 'a__ia_ap__laxis', 'a__ia_ap__laxis', 'a__ia_ap_ylaxis', 'a__ia_ap_ylaxis', 'an_ianap_ylaxis', 'an_ianap_ylaxis', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state 

 92%|█████████▏| 1510/1641 [03:54<00:24,  5.40it/s]

First state in batch: ['____p_asis', 'd___p_asis', 'd___phasis', 'd__mphasis', 'deemphasis', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['supertutelar_', 'supertutelary', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['_v___t', '_v___t', '_vo__t', '_vo__t', '_vo__t', '_vo__t', '_vo__t', '_vo__t', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
Diversity (number of unique sequence lengths) in batch: 14

First state in batch: ['brighe___', 'brighell_', 'brighella', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 

 93%|█████████▎| 1534/1641 [03:57<00:18,  5.66it/s]

First state in batch: ['eth_noyl', 'eth_noyl', 'eth_noyl', 'ethanoyl', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['_____e__', '_____e__', '_____e__', '_____ess', '____ness', '____ness', '_i__ness', '_il_ness', '_il_ness', 'mil_ness', 'mil_ness', 'mil_ness', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['t___a', 't_u_a', 't_uja', 't_uja', 'thuja', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17}
Diversity (number of unique sequence lengths) in batch: 15

First state in batch: ['ct_nidia', 'ct_nidia', 'ct_nidia', 'ctenidia', '', '', '', '', '', '', '', '', '', '', '', '', '', ''

 95%|█████████▍| 1558/1641 [04:01<00:14,  5.89it/s]

First state in batch: ['ken_ick', 'kenrick', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19}
Diversity (number of unique sequence lengths) in batch: 17

First state in batch: ['gha_tlily', 'gha_tlily', 'gha_tlily', 'ghastlily', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['m____', 'm____', 'm____', 'm____', 'm____', 'm____', 'm____', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
Diversity (number of unique sequence lengths) in batch: 16

First state in batch: ['pauca_', 'pauca_', 'paucal', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 1

In [None]:
from tqdm import tqdm
from collections import Counter

# Assuming data_module is an instance of HangmanDataModule
data_loader = data_module.train_dataloader()  # Call the method to get the DataLoader instance

# Initialize a list to store the diversity of sequence lengths in each batch
batch_seq_length_diversity = []

for batch in tqdm(data_loader):
    # Extract original_seq_lengths from the batch
    original_seq_lengths = batch['original_seq_lengths']

    # Analyze the diversity of sequence lengths in the current batch
    unique_seq_lengths = set(original_seq_lengths)  # Use a set to find unique sequence lengths
    batch_seq_length_diversity.append(len(unique_seq_lengths))  # Store the count of unique lengths

    # Count the number of examples under each unique sequence length
    seq_len_counts = Counter(original_seq_lengths)

    print(f"First state in batch: {batch['guessed_states'][0]}")
    # print(f"Original sequence lengths in batch: {original_seq_lengths}")
    print(f"Unique sequence lengths in batch: {unique_seq_lengths}")
    print(f"Diversity (number of unique sequence lengths) in batch: {len(unique_seq_lengths)}")
    print(f"Number of examples under each unique sequence length in batch: {seq_len_counts}")

    # break  # Remove this break to analyze all batches

    print()

# After collecting data, analyze the overall diversity
average_diversity = sum(batch_seq_length_diversity) / len(batch_seq_length_diversity) if batch_seq_length_diversity else 0
print(f"Average diversity of sequence lengths across batches: {average_diversity}")

100%|██████████| 7/7 [00:02<00:00,  3.36it/s]

First state in batch: ['____', '____', '____', '____', 'p_p_', 'p_p_', 'p_p_', 'p_p_', '', '', '', '', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}
Diversity (number of unique sequence lengths) in batch: 19
Number of examples under each unique sequence length in batch: Counter({2: 161, 8: 135, 7: 135, 9: 112, 4: 81, 10: 76, 3: 74, 11: 51, 5: 51, 6: 48, 12: 39, 13: 22, 15: 12, 14: 11, 17: 5, 16: 5, 18: 4, 19: 1, 20: 1})

First state in batch: ['i___rr_c_io_a__y', 'i___rr_c_io_a__y', 'i___rr_c_io_a__y', 'in__rr_c_iona__y', 'in__rr_c_iona__y', 'ins_rr_c_iona__y', 'ins_rr_c_iona__y', 'ins_rrec_iona__y', 'ins_rrec_ionally', 'ins_rrec_ionally', 'insurrec_ionally', 'insurrec_ionally', '', '', '', '', '', '', '', '']
Unique sequence lengths in batch: {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}
Diversity (number of unique sequence lengths) in batch: 19
Number of examples under each




In [None]:
STOP

NameError: name 'STOP' is not defined

In [None]:
# Assuming data_module is an instance of HangmanDataModule
data_loader = data_module.train_dataloader  # Call the method to get the DataLoader

for batch in tqdm(data_loader()):
    states = batch['guessed_states']
    guesses = batch['guessed_letters']
    max_seq_length = batch['max_seq_len']
    original_seq_lengths = batch['original_seq_lengths']

    # # Process the batch using your custom function
    # batch_features, batch_missed_chars = process_batch_of_games(
    #     states, guesses, char_frequency, max_word_length, max_seq_length)

    # print(f"Batch features shape: {batch_features.shape}")
    # break  # Remove this break to process all batches, it's here just to process the first batch

    print(states[0])

    print(original_seq_lengths[0])

    break


In [None]:
import numpy as np
from tqdm import tqdm

# Assuming `data_loader` is already defined and configured
total_samples_processed = 0
batch_sizes = []

for batch in tqdm(data_loader()):
    batch_size_current = len(batch['guessed_states'])  # Assuming batch is a dictionary with 'guessed_states'
    total_samples_processed += batch_size_current
    batch_sizes.append(batch_size_current)

In [None]:
# Calculate expected values
expected_num_batches = np.ceil(len(data_module.train_dataset) / data_module.batch_size)
actual_num_batches = len(batch_sizes)
final_batch_size = len(data_module.train_dataset) % data_module.batch_size or data_module.batch_size

# Sanity checks
print(f"Expected number of batches: {expected_num_batches}")
print(f"Actual number of batches: {actual_num_batches}")
print(f"Expected final batch size: {final_batch_size}")
print(f"Actual final batch size: {batch_sizes[-1]}")
print(f"Total samples in dataset: {len(data_module.train_dataset)}")
print(f"Total samples processed: {total_samples_processed}")

# Verifying if the dataset was fully covered
assert total_samples_processed == len(data_module.train_dataset), "Mismatch in the number of processed samples and dataset size."

In [None]:
data_loader = data_module.val_dataloader

for batch in data_loader():
    states = batch['guessed_states']
    guesses = batch['guessed_letters']
    max_seq_length = batch['max_seq_len']
    original_seq_lengths = batch['original_seq_lengths']


    batch_features, batch_missed_chars = process_batch_of_games(
                states, guesses, char_frequency,
                    max_word_length,
                    max_seq_length)

    print(f"{batch_features.shape}")

    break

In [None]:
 dataset_size = len(data_module.train_dataloader().dataset)

 dataset_size

In [None]:
batch_size = data_module.train_dataloader().batch_size

In [None]:
batch_size

In [None]:
# Tunable hyperparameters
LEARNING_RATE = 0.01
HIDDEN_DIM = 32 ### 32
NUM_LAYERS = 3
EMBEDDING_DIM = 30

In [None]:
from scr.encoder import *
from scr.trainer_ import *
from scr.decoder import SimpleLSTM

# Fixed parameters
max_word_length = 29  # Maximum word length
num_embeddings = 28   # Vocabulary size (fixed, based on unique characters in the game)
num_features = 154     # Number of features per character
missed_char_dim = 28  # Additional dimension for missed characters
output_dim = 28       # Output dimension for the model

char_feature_dim = 5  # Features per character
embedding_dim = 50
additional_state_features = num_features \
    - max_word_length * char_feature_dim   # Additional features per state

print(f"Addition state fetatures: {additional_state_features}")
# Initialize the Encoder

encoder = Encoder(num_embeddings, embedding_dim, max_word_length, \
    char_feature_dim, additional_state_features)

input_dim = max_word_length * embedding_dim + additional_state_features
# Initialize the SimpleLSTM decoder
decoder = SimpleLSTM(input_dim=input_dim, 
                     hidden_dim=HIDDEN_DIM, 
                     output_dim=output_dim, 
                     num_layers=NUM_LAYERS,
                     missed_char_dim=missed_char_dim)

# Other parts of the code flow...

In [None]:
# Initialize the HangmanModel
lightning_model = HangmanModel(encoder, decoder, 
                    LEARNING_RATE, char_frequency, 
                    max_word_length, test_words=sampled_test_words)

In [None]:
lightning_model.optimizer_type

In [None]:
from scr.data_module import *

# Initialize Data Module
initial_batch_size = 1 # Set your initial batch size

# Initialize Data Module with the required arguments
data_module = HangmanDataModule(train_dataset, valid_dataset, 
                                initial_batch_size, 
                                custom_collate_fn)
                                # performance_metrics=None)

In [None]:
from scr.custom_sampler import *
from scr.dataset import *

In [None]:
# from scr.feature_engineering import *


for batch in data_module.train_dataloader():
    print(batch.keys())

    states = batch['guessed_states']
    guesses = batch['guessed_letters']
    max_seq_length = batch['max_seq_len']
    original_seq_lengths = batch['original_seq_lengths']

    batch_features, batch_missed_chars = process_batch_of_games(
                                    states, guesses, char_frequency,
                                    max_word_length,
                                    max_seq_length)

    print(f"Batch features shape {batch_features.shape}\n")


    encoded_guess = pad_and_reshape_labels(guesses, max_seq_length)

    print(f"states: {states}")
    print(f"Guesses: {guesses}\n")
    print(f"Guesses shape: {guesses}\n")
    # print(f"batch missed chars: {batch_missed_chars}\n")

    # print(f"Encoded guess: {encoded_guess}\n")


    print(f"batch missed chars shape: {batch_missed_chars.shape}\n")

    print(f"Encoded guess shape: {encoded_guess.shape}\n")


    # Convert the batch to characters
    missed_chars = batch_to_chars(batch_missed_chars)

    print(f"batch missed chars : So far present characters (that NN should not guess again): {missed_chars}\n")

    break

In [None]:
STOP

In [None]:
## callbacks
from pytorch_lightning.callbacks import Callback, EarlyStopping
from scr.custom_callbacks import *

# Setup EarlyStopping to monitor the test_win_rate
early_stop_callback = EarlyStopping(
    monitor='win_rate',
    min_delta=0.00,
    patience=200,
    verbose=True,
    mode='max'  # Maximize the win rate
)

step_level_early_stopping = StepLevelEarlyStopping(
    monitor='val_miss_penalty', # Metric to monitor
    min_delta=0.0,              # Minimum change to qualify as an improvement
    patience=5                  # Number of steps with no improvement after which training will be stopped
)

In [None]:
from pytorch_lightning.profilers import SimpleProfiler
import pytorch_lightning as pl
from scr.custom_callbacks import *
from scr.dataset import *
from scr.data_module import *
from scr.trainer_ import *

torch.cuda.empty_cache()

# # Create Callbacks
# loss_logging_callback = LossLoggingCallback()

# , SchedulerSetupCallback()] # , loss_logging_callback]
callbacks = [early_stop_callback, step_level_early_stopping] 
FAST_DEV_RUN = False
# # Calculate the minimum percentage of validation batches
# min_val_batches = 1 / len(data_module.val_dataloader())

# # Create Trainer with Callbacks
trainer = pl.Trainer(
    default_root_dir=output_dir,
    fast_dev_run=FAST_DEV_RUN, 
    max_epochs=MAX_EPOCH, 
    callbacks=callbacks,
    num_sanity_val_steps=0,
    reload_dataloaders_every_n_epochs=1,
    enable_progress_bar=True 
    # val_check_interval=0.5
    # limit_train_batches=2,  # Limit the number of training batches to 2
    # limit_val_batches=2     # Limit the number of validation batches to 2
)

# # # # # print(f"Running for {NUM_STRATIFIED_SAMPLES} samples...")
# # # print()
# # # # # # # Assuming combined_eval_metrics is a list of dictionaries
# combined_eval_metrics = trainer.validate(model=lightning_model, datamodule=data_module)

# # performance_metrics_dict = {k: v for d in combined_eval_metrics for k, v in d.items()}

# # # # # # print("Converted Performance Metrics Dictb ionary:", performance_metrics_dict)

# # trainer.datamodule.update_performance_metrics(combined_eval_metrics)

# data_module.update_performance_metrics(performance_metrics_dict)

In [None]:
mo

In [None]:
STOP

In [None]:
# Sample data
sample_game = {
    'guessed_states': ['_d__d__e__', '_d__d__e__', '_d__d__e__', '_d__d_me__', \
        '_d__d_me__', '_d__d_me__', '_d__d_men_', '_d__d_men_'],
    'guessed_letters': ['c', 'r', 'm', 'v', 'i', 'n', 'w', 'o']
}

In [None]:
from scr.feature_engineering import *

In [None]:
# Define parameters
max_word_length = 29  # Set as per your model's requirement
max_seq_length = 10  # Set as per your model's requirement

# # Dummy character frequency (example, create as per your data)
# char_frequency = {char: 1 for char in 'abcdefghijklmnopqrstuvwxyz'}

# Call the function and unpack the returned tuple into two variables
char_sequence_tensor, missed_chars_tensor = process_game_sequence(
    game_states=sample_game['guessed_states'], 
    guessed_letters_sequence=sample_game['guessed_letters'],
    char_frequency=char_frequency, 
    max_word_length=max_word_length,
    max_seq_length=max_seq_length
)

# Now you can print the shape of each tensor separately
print(f"Character Sequence Tensor Shape: {char_sequence_tensor.shape}")
print(f"Missed Chars Tensor Shape: {missed_chars_tensor.shape}")

In [None]:
from scr.feature_engineering import *

In [None]:
# Sample data for a batch of games
batch_sample_games = {
    'guessed_states_batch': [
        ['_d__d__e__'],
        # ['_b__b__t__'],
        # ['__ll___p__'],
    ],
    
    'guessed_letters_batch': [
        ['c'],
        # ['a'],
        # ['i']
    ]
}

In [None]:
# # Sample data for a batch of games
# batch_sample_games = {
#     'guessed_states_batch': [
#         ['_d__d__e__', '_d__d__e__', '_d__d__e__', '_d__d_me__', '_d__d_me__', '_d__d_me__', '_d__d_men_', '_d__d_men_'],
#         ['_b__b__t__', '_b__b__t__', '_b__b__t__', '_b__b_t__', '_b__b_te__', '_b__b_te__', '_b__b_test', '_b__b_test'],
#         ['__ll___p__', '__ll___p__', '__ll___p__', '__ll___p__', '__ll___p__', '__ll__op_', '__ll__op_', '__ll__op_']
#     ],
    
#     'guessed_letters_batch': [
#         ['c', 'r', 'm', 'v', 'i', 'n', 'w', 'o'],
#         ['a', 's', 'e', 'f', 't', 'r', 'n', 'u'],
#         ['i', 'q', 'r', 't', 'p', 'o', 'a', 's']
#     ]
# }

# Define parameters
max_word_length = 29  # Set as per your model's requirement
max_seq_length = 10  # Set as per your model's requirement

# # Dummy character frequency (example, create as per your data)
# char_frequency = {char: 1 for char in 'abcdefghijklmnopqrstuvwxyz'}

# Call the function
batch_features, batch_missed_chars = process_batch_of_games(
    batch_sample_games['guessed_states_batch'],
    batch_sample_games['guessed_letters_batch'],
    char_frequency,
    max_word_length,
    max_seq_length=1
)

print()
# Print shapes for sanity check
print(f"Batch Features Shape: {batch_features.shape}")
print(f"Batch Missed Chars Shape: {batch_missed_chars.shape}")

In [None]:
batch_sample_games['guessed_states_batch'][0]

In [None]:
batch_sample_games['guessed_letters_batch'][0]

In [None]:
# Sample data
sample_game = {
    'guessed_states': ['_d__d__e__', '_d__d__e__', '_d__d__e__', '_d__d_me__', \
        '_d__d_me__', '_d__d_me__', '_d__d_men_', '_d__d_men_'],
    'guessed_letters': ['c', 'r', 'm', 'v', 'i', 'n', 'w', 'o']
}

In [None]:
overall_sucess_rate, guess_outcome = analyze_guess_outcomes(sample_game['guessed_states'], \
    sample_game['guessed_letters'], maximum_word_length=29)

In [None]:
overall_sucess_rate

In [None]:
guess_outcome

In [None]:
game_states = ['allMasked', 'early', 'quarterRevealed', 'midRevealed', 
               'midLateRevealed', 'lateRevealed', 'nearEnd']
game_state_to_idx = {state: idx for idx, state in enumerate(game_states)}

def encode_game_state(game_state):
    state_vector = [0] * len(game_states)
    state_index = game_state_to_idx.get(game_state, -1)
    if state_index >= 0:
        state_vector[state_index] = 1
    return state_vector

In [None]:
from scr.data_module import *

# Initialize Data Module
initial_batch_size = 1  # Set your initial batch size

# Initialize Data Module with the required arguments
data_module = HangmanDataModule(train_dataset, valid_dataset, 
                                initial_batch_size, 
                                custom_collate_fn)
                                # performance_metrics=None)

# for batch in data_module.train_dataloader():
#     # print(batch)

# batch_features, batch_missed_chars = process_batch(next(iter(data_module.train_dataloader())), \
#     char_frequency, max_word_length)

# print(batch_features.shape)
# print(batch_missed_chars.shape)

# # break

for batch in data_module.train_dataloader():
    states = batch['guessed_states']
    # print(states.shape)
    guesses = batch['guessed_letters']
    max_seq_length = batch['max_seq_len']
    original_seq_lengths = batch['original_seq_lengths']

    # Print debug information
    print("DEBUG INFO:")
    # Uncomment these if needed
    # print(f"Miss Penalty: {miss_penalty}")
    # print(f"Type of Miss Penalty: {type(miss_penalty)}")
    print(f"Batch Word Lengths: {batch['word_length']}")
    print(f"Batch Difficulties: {batch['difficulty']}")
    print(f"Batch Outcomes: {batch['outcome']}")
    print(f"Batch Won Flags: {batch['won']}")

    # Print the entire batch
    print("Batch Contents:")
    for key, value in batch.items():
        print(f"{key}: {value}")

    batch_features, batch_missed_chars = process_batch_of_games(
        states, char_frequency,
        max_word_length,
        max_seq_length)

    print()
    print(f"{batch_features.shape}")
    print(f"{batch_missed_chars.shape}")
    break

In [None]:
miss_char.shape

In [None]:
# # # # Validate the model (if needed)
# trainer.validate(model=lightning_model, datamodule=data_module)
print(f"Training Begin for {NUM_STRATIFIED_SAMPLES} words: {len(train_dataset)} Games")
# # # # # # Fit the model
trainer.fit(lightning_model, data_module)

# # Optionally print the profiler summary
# print(profiler.summary())

# Save the entire modell
trained_model_file = models_dir / f"{NUM_STRATIFIED_SAMPLES}_trained_model.pth"
torch.save(lightning_model, trained_model_file)
print(f"Model saved at {trained_model_file}")

In [None]:
batch_features.shape

In [None]:
batch_missed_chars.shape

In [None]:
features, missed_chars = build_enhanced_feature_set(sample, \
    char_frequency, max_word_length)

In [None]:
features.shape

In [None]:
STOP

In [None]:
import torch
import torch.nn as nn

# Sample dimensions
missed_char_dim = 28  # Dimension of missed character vector
hidden_dim = 10       # Arbitrary hidden dimension for output

class TestModel(nn.Module):
    def __init__(self):
        super(TestModel, self).__init__()
        self.miss_linear = nn.Linear(missed_char_dim, hidden_dim)

    def forward(self, missed_chars):
        missed_chars_processed = self.miss_linear(missed_chars)
        return missed_chars_processed

# Create model
model = TestModel()

# Sample data: a batch of size 1 with 28 missed character indicators
# Creating a sample input with 28 values, each being 0 or 1
missed_chars = torch.tensor([0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, \
    1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1], dtype=torch.float)

# Forward pass
output = model(missed_chars)
output.shape

In [None]:
def calculate_difficulty_score(metrics):
    # Extracting the metrics
    win_rate = metrics.get('performance_wins', 0)
    avg_attempts = metrics.get('performance_total_attempts_used', 0)
    miss_penalty = metrics.get('miss_penalty_avg', 0)

    # Weights for each metric (these can be adjusted)
    weight_win_rate = 1.0   # Higher weight as win rate is a strong indicator of difficulty
    weight_avg_attempts = 0.5  # Moderate weight
    weight_miss_penalty = 0.5  # Moderate weight

    # Normalize the metrics (invert win rate as lower win rate indicates higher difficulty)
    normalized_win_rate = (100 - win_rate) / 100
    # normalized_avg_attempts = avg_attempts / 6  # Assuming max avg_attempts is 6
    normalized_miss_penalty = miss_penalty  # Already in range 0 to 1

    # Calculate the composite score
    composite_score = (
        weight_win_rate * normalized_win_rate +
        weight_miss_penalty * normalized_miss_penalty
    )

    return composite_score

# Example usage
metrics = {
    'performance_wins': 0,  # Example values
    # 'performance_total_attempts_used': 6,
    'miss_penalty_avg': 1
}

score = calculate_difficulty_score(metrics)
print("Difficulty Score:", score)

# # Use the score to determine if the word length should be added to target pairs
# if score >= 0.001:  # Define this threshold based on your game's difficulty scale
#     target_pairs.append((int(word_length),))


In [None]:
def calculate_difficulty_score(metrics):
    # Extracting the metrics
    win_rate = metrics.get('performance_wins', 0)
    miss_penalty = metrics.get('miss_penalty_avg', 0)

    # Weights for each metric
    weight_win_rate = 1.0   # Higher weight for win rate
    weight_miss_penalty = 0.5  # Weight for miss penalty

    # Normalize the metrics (invert win rate as lower win rate indicates higher difficulty)
    normalized_win_rate = (100 - win_rate) / 100
    normalized_miss_penalty = miss_penalty  # Already in range 0 to 1

    # Calculate the composite score
    composite_score = (
        weight_win_rate * normalized_win_rate +
        weight_miss_penalty * normalized_miss_penalty
    )

    return composite_score


# Best-case scenario metrics
best_case_metrics = {
    'performance_wins': 100,  # Maximum win rate
    'miss_penalty_avg': 0     # Minimum miss penalty
}

best_case_score = calculate_difficulty_score(best_case_metrics)
print("Best-Case Difficulty Score:", best_case_score)

In [None]:
# Worst-case scenario metrics
worst_case_metrics = {
    'performance_wins': 0,  # Minimum win rate
    'miss_penalty_avg': 1   # Maximum miss penalty
}

worst_case_score = calculate_difficulty_score(worst_case_metrics)
print("Worst-Case Difficulty Score:", worst_case_score)

In [None]:
def calculate_difficulty_score(metrics, weight_win_rate=1.0, weight_miss_penalty=0.5):
    """
    Calculates the difficulty score based on win rate and miss penalty.
    
    :param metrics: Dictionary containing 'performance_wins' and 'miss_penalty_avg'.
    :param weight_win_rate: Weight for the win rate metric.
    :param weight_miss_penalty: Weight for the miss penalty metric.
    :return: Calculated difficulty score.
    """
    # Extracting the metrics
    win_rate = metrics.get('performance_wins', 0)
    miss_penalty = metrics.get('miss_penalty_avg', 0)

    # Normalize the metrics (invert win rate as lower win rate indicates higher difficulty)
    normalized_win_rate = (100 - win_rate) / 100
    normalized_miss_penalty = miss_penalty  # Already in range 0 to 1

    # Calculate the composite score
    composite_score = (
        weight_win_rate * normalized_win_rate +
        weight_miss_penalty * normalized_miss_penalty
    )

    return composite_score

# Example usage with custom weights
metrics = {
    'performance_wins': 0,  # Example values
    'miss_penalty_avg': 1
}

# Custom weights
custom_weight_win_rate = 1.0
custom_weight_miss_penalty = 0.5

score = calculate_difficulty_score(metrics, custom_weight_win_rate, custom_weight_miss_penalty)
print("Difficulty Score with Custom Weights:", score)

In [None]:
aggregated_metrics = {
    5: {'total_games': 100, 'wins': 60, 'total_attempts_used': 300, 'win_rate': 0.6, \
        'average_attempts_used': 3.0, 'miss_penalty': 0.02},
    6: {'total_games': 150, 'wins': 90, 'total_attempts_used': 450, 'win_rate': 0.6, \
        'average_attempts_used': 3.0, 'miss_penalty': 0.03}
}

# Iterating over the dictionary
for word_len, metrics in aggregated_metrics.items():
    print(f"Word Length: {word_len}")
    for key, value in metrics.items():
        print(f"  {key}: {value}")

In [None]:
def select_target_pairs(performance_metrics, batch_size):
    target_pairs = []

    for word_length, metrics in performance_metrics.items():
        # print(metrics)
        win_rate = metrics.get('win_rate', 0)
        avg_attempts = metrics.get('average_attempts_used', 0)

        if win_rate <= 20 and avg_attempts >= 4:
            target_pairs.append(word_length)

    return target_pairs[:batch_size]

# Test data
performance_metrics = {
    5: {'total_games': 100, 'wins': 60, 'total_attempts_used': 300, 'win_rate': 0.6, 
        'average_attempts_used': 4.0, 'miss_penalty': 0.02},
    6: {'total_games': 150, 'wins': 90, 'total_attempts_used': 450, 'win_rate': 0.6, 
        'average_attempts_used': 4.0, 'miss_penalty': 0.03}
}

# Test the function
batch_size = 10
target_pairs = select_target_pairs(performance_metrics, batch_size)
print("Target Pairs:", target_pairs)

In [None]:
# def calculate_miss_penalty(outputs, miss_chars):
#     if outputs.numel() == 0:
#         print("Empty outputs tensor")
#         return torch.tensor(0.0, device=outputs.device)

#     miss_penalty = torch.sum(outputs * miss_chars) / outputs.numel()
#     return miss_penalty

In [None]:
from scr.feature_engineering import *

In [None]:
word = 'cat'

encoded = encode_word(word)

encoded

In [None]:
# Get missed characters tensor
miss_chars = get_missed_characters(word)

In [None]:
miss_chars

In [None]:
# Creating a dummy output tensor for the word 'cat'
word = 'cat'

# Assumptions for the dummy output
batch_size = 1  # Number of words in the batch
max_seq_len = len(word)  # Maximum sequence length (length of the word)
num_characters = 28  # Total number of characters in the vocabulary

# Creating the dummy output tensor
# For simplicity, filling it with random values between 0 and 1
outputs = torch.rand((batch_size, max_seq_len, num_characters))

outputs.shape, outputs

In [None]:
# probabilities = F.softmax(outputs, dim=-1)

# probabilities

In [None]:
# Creating two dummy output tensors for the word 'cat'
# One where all characters are accurately predicted and another where all are wrong

# Accurate Predictions: Setting the confidence for the correct characters ('c', 'a', 't') to 1
# and others to 0 for each position in the word
correct_outputs = torch.zeros((batch_size, max_seq_len, num_characters))
correct_outputs[0, 0, char_to_idx['c']] = 1  # High confidence for 'c' in the first position
correct_outputs[0, 1, char_to_idx['a']] = 1  # High confidence for 'a' in the second position
correct_outputs[0, 2, char_to_idx['t']] = 1  # High confidence for 't' in the third position

# Wrong Predictions: Setting the confidence for incorrect characters to 1 and for correct ones to 0
wrong_outputs = torch.ones((batch_size, max_seq_len, num_characters))
wrong_outputs[0, 0, char_to_idx['c']] = 0  # Zero confidence for 'c' in the first position
wrong_outputs[0, 1, char_to_idx['a']] = 0  # Zero confidence for 'a' in the second position
wrong_outputs[0, 2, char_to_idx['t']] = 0  # Zero confidence for 't' in the third position

correct_outputs.shape, wrong_outputs.shape

In [None]:
# Calculate miss penalty
miss_penalty = calculate_miss_penalty(wrong_outputs, miss_chars)
miss_penalty

##### Data Dir

In [None]:
NUM_STRATIFIED_SAMPLES = 100 # This will be overwritten by Papermill

NUM_WORD_SAMPLE = 1_000 # words for testing

FAST_DEV_RUN = False

MAX_EPOCH = 15

In [None]:
from pathlib import Path
from scr.custom_sampler import *

# Define the base directory and the paths for training and validation parquet files
base_dataset_dir = Path("/media/sayem/510B93E12554BBD1/dataset/")
stratified_samples_dir = base_dataset_dir / str(NUM_STRATIFIED_SAMPLES)
parquet_path = stratified_samples_dir / 'parquets'

# Create directories for train and validation parquets if they don't exist
parquet_path.mkdir(parents=True, exist_ok=True)
# parquet_valid_path.mkdir(parents=True, exist_ok=True)

# Define and create the directory for models
models_dir = Path("/home/sayem/Desktop/Hangman/models")
models_dir.mkdir(parents=True, exist_ok=True)

# Define your output directory
# Define your output directory and logger directory
output_dir = Path("/home/sayem/Desktop/Hangman/training_outputs")
logger_dir = output_dir / "lightning_logs"

# Create the output and logger directories if they don't exist
output_dir.mkdir(parents=True, exist_ok=True)
logger_dir.mkdir(parents=True, exist_ok=True)

# Define the file path for saving the testing words
testing_words_file_path = stratified_samples_dir / "testing_words.txt"

try:
    testing_word_list = read_words(testing_words_file_path)
    print(f"Length of the testing word list: {len(testing_word_list)}")
    sampled_test_words = stratified_sample_by_length_and_uniqueness(testing_word_list, \
        NUM_WORD_SAMPLE)
    print(f"Sampled {len(sampled_test_words)} unique words for testing.")
except FileNotFoundError:
    print(f"File not found: {testing_words_file_path}")

print(len(sampled_test_words))

##### Dataset Loading

In [None]:
# Create datasets directly from the saved parquet files
hangman_dataset = HangmanDataset(parquet_path)
# valid_dataset = HangmanDataset(parquet_valid_path)

from scr.utils import *

# Assuming `hangman_dataset` is an instance of HangmanDataset
# Usage
train_dataset, valid_dataset \
    = split_hangman_dataset(hangman_dataset, 0.8)

print(len(train_dataset))
print(len(valid_dataset))

assert len(train_dataset) > len(valid_dataset)

In [None]:
hangman_dataset[(29,)]

In [None]:
train_dataset[(29,)]

In [None]:
train_dataset.get_all_group_labels()

In [None]:
len(train_dataset)

In [None]:
# Request a sample with word length 29
word_length = 5
sample = hangman_dataset[(word_length,)]
sample

In [None]:
sample

In [None]:
from scr.data_module import *
from scr.dataset import *

# Initialize Data Module
initial_batch_size = 128  # Set your initial batch size

# Initialize Data Module with the required arguments
data_module = HangmanDataModule(train_dataset, valid_dataset, 
                                initial_batch_size, 
                                custom_collate_fn)

In [None]:
train_loader = data_module.train_dataloader()

In [None]:
dummy_performance_metrics = {
    
    3: {'win_rate': 45, 'average_attempts_used': 5},
    4: {'win_rate': 60, 'average_attempts_used': 3},  # This won't be selected due to high win rate
    5: {'win_rate': 30, 'average_attempts_used': 6},
    6: {'win_rate': 48, 'average_attempts_used': 4},
    # ... add more dummy metrics as needed ...
}

In [None]:
# Initialize PerformanceBasedSampler
sampler = PerformanceBasedSampler(dataset=hangman_dataset, 
                                  performance_metrics=dummy_performance_metrics, 
                                  batch_size=10)

# Verify target pairs
print("Target pairs:", sampler.target_pairs)

In [None]:
next(iter(sampler))

In [None]:
train_dataset[(1,)]