##### Imports

In [1]:
# --- Imports needed by the configuration statements below ------------------
import pickle
import sys
import warnings
from collections import Counter, defaultdict

import pandas as pd

# Silence all warnings before the heavier libraries are imported.
warnings.filterwarnings('ignore')

# Display wide / long DataFrames without truncation.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

# Custom library paths -- must be registered before the `scr` imports below.
sys.path.extend(['../', './scr'])

# --- Third-party and project imports -----------------------------------------
from torch.utils.data import Dataset
from tqdm import tqdm

from scr.utils import read_words, save_words_to_file, set_seed
from scr.dataset import *

# These names are deliberately (re)bound AFTER the star import above, exactly
# as in the original cell, so that a same-named export of `scr.dataset` can
# never shadow them.
import gc
import random
from pathlib import Path

import torch
import torch.nn as nn

from scr.utils import print_scenarios

# --- Global configuration ----------------------------------------------------
# NOTE(review): presumably seeds the RNGs used below (incl. `random`) --
# confirm in scr.utils.
set_seed(42)

# Allow float32 matmuls to use reduced-precision internal arithmetic
# (faster on supported GPUs at a small cost in precision).
torch.set_float32_matmul_precision('medium')

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Read and shuffle the word list (shuffle happens in place)
word_list = read_words('data/words_250000_train.txt') # , limit=10000)
# word_list = read_words('data/250k.txt', limit=10000)
random.shuffle(word_list)

# base_dataset_dir = Path('dataset/pkl')

##### Reading Data

In [2]:
#### Parameters (may be overridden by Papermill)
# NOTE(review): if this is the cell tagged `parameters`, Papermill injects its
# overrides in a new cell placed *after* this one, so `stratified_samples_dir`
# below would still be derived from the default value -- confirm the tagging,
# or derive the paths in a later cell.

NUM_STRATIFIED_SAMPLES = 50_000 # Default value; this will be overwritten by Papermill

# Base directory that holds the generated datasets.
# NOTE(review): absolute, machine-specific path -- the notebook cannot run on
# another machine without editing this line.
base_dataset_dir = Path("/media/sayem/510B93E12554BBD1/dataset/")

# One subdirectory per sample count; created on demand together with any
# missing parents, and left untouched if it already exists.
stratified_samples_dir = base_dataset_dir / str(NUM_STRATIFIED_SAMPLES)
stratified_samples_dir.mkdir(parents=True, exist_ok=True)

print(stratified_samples_dir)

/media/sayem/510B93E12554BBD1/dataset/50000


In [3]:
# Directory holding the parquet files of this sample set (created if missing)
parquet_file_path = Path(stratified_samples_dir).joinpath("parquets")
parquet_file_path.mkdir(parents=True, exist_ok=True)

# The word lists live directly inside the sample directory
train_words_file_path = stratified_samples_dir.joinpath('train_words.txt')
test_words_file_path = stratified_samples_dir.joinpath('test_words.txt')

# Load the training words; a missing file is reported but is not fatal here
try:
    train_words = read_words(train_words_file_path)
except FileNotFoundError:
    print(f"File not found: {train_words_file_path}")
else:
    print(f"Loaded {len(train_words)} train words from {train_words_file_path}")

Loaded 181840 train words from /media/sayem/510B93E12554BBD1/dataset/50000/train_words.txt


In [4]:
# # For inference
from scr.feature_engineering import *

# Corpus-level statistics computed once from the full word list;
# `char_frequency` and `max_word_length` are passed to the model wrapper and
# to the game helpers further down.
word_frequencies = calculate_word_frequencies(word_list)
char_frequency = calculate_char_frequencies(word_list)
max_word_length = len(max(word_list, key=len))

##### Model Building

##### Dataset Loading and train-test split

In [5]:
# from torch.utils.data import DataLoader
# from scr.dataset import HangmanDataset  # Assuming HangmanDataset is your custom dataset class
# from sklearn.model_selection import train_test_split
# from scr.feature_engineering import process_batch_of_games
# from pathlib import Path

# import pandas as pd
# import os
# from sklearn.model_selection import train_test_split

# import gc


# def combine_and_split_dataset(directory, test_size=0.20, random_state=42):
#     # Read all parquet files into one DataFrame
#     all_data = pd.concat([pd.read_parquet(os.path.join(directory, f)) for f in \
#         os.listdir(directory) if f.endswith('.parquet')], ignore_index=True)

#     # Split the DataFrame into train and validation DataFrames
#     train_df, valid_df = train_test_split(all_data, test_size=test_size, random_state=random_state)

#     # Optionally, save these DataFrames to new parquet files
#     train_df.to_parquet(os.path.join(directory, 'train_combined.parquet'))
#     valid_df.to_parquet(os.path.join(directory, 'valid_combined.parquet'))

#     return 'train_combined.parquet', 'valid_combined.parquet'

# # Split the dataset
# train_file, valid_file = combine_and_split_dataset(parquet_file_path)

# # Create dataset instances
# train_dataset = HangmanDataset([os.path.join(parquet_file_path, train_file)])
# valid_dataset = HangmanDataset([os.path.join(parquet_file_path, valid_file)])


# gc.collect()

In [6]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

import pandas as pd
import os
from sklearn.model_selection import train_test_split

def split_each_parquet(directory, test_size=0.20, random_state=42):
    """Split every source parquet file in ``directory`` into train/valid files.

    Each ``<name>.parquet`` is read, split row-wise with ``train_test_split``
    and written back into the same directory as ``train_<name>.parquet`` and
    ``valid_<name>.parquet`` (existing split files are overwritten).

    Files whose name already starts with ``train_`` or ``valid_`` are treated
    as the output of an earlier run and are skipped. Without this, re-running
    the function would split its own output again, creating
    ``train_train_*`` / ``valid_train_*`` files and mixing training rows into
    the validation set.

    Args:
        directory: Folder containing the parquet files (str or path-like).
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        tuple[list[str], list[str]]: Paths of the written train files and of
        the written valid files, in matching order. Source files with at most
        one row are skipped and contribute to neither list.
    """
    split_prefixes = ('train_', 'valid_')
    train_files, valid_files = [], []

    # Sorted so that the order of the returned lists is reproducible;
    # os.listdir() returns entries in arbitrary order.
    for file in sorted(os.listdir(directory)):
        if not file.endswith('.parquet'):
            continue
        # Output of a previous run -- never split it a second time.
        if file.startswith(split_prefixes):
            continue

        df = pd.read_parquet(os.path.join(directory, file))

        # A frame with zero or one row cannot be split into two non-empty parts.
        if len(df) <= 1:
            continue

        train_df, valid_df = train_test_split(
            df, test_size=test_size, random_state=random_state
        )

        # Save both parts next to the source file.
        train_file = os.path.join(directory, f'train_{file}')
        valid_file = os.path.join(directory, f'valid_{file}')
        train_df.to_parquet(train_file)
        valid_df.to_parquet(valid_file)

        train_files.append(train_file)
        valid_files.append(valid_file)

    return train_files, valid_files

# Split every parquet file found in `parquet_file_path`.
# NOTE: this call writes train_*/valid_* parquet files into that directory.
train_files, valid_files = split_each_parquet(parquet_file_path)

# Create dataset instances from the lists of split files
# NOTE(review): HangmanDataset is not imported explicitly in this notebook;
# presumably it is provided by `from scr.dataset import *` -- confirm.
train_dataset = HangmanDataset(train_files)
valid_dataset = HangmanDataset(valid_files)

In [None]:
# Sanity check: displays whether the training set is larger than the
# validation set (expected True for the default 80/20 split, assuming the
# dataset length follows the row counts -- TODO confirm).
len(train_dataset) > len(valid_dataset)

In [None]:
# Build the network, its Lightning wrapper and the data loaders.
# NOTE(review): SimpleLSTM, HangmanModel, DataLoader and custom_collate_fn are
# not imported explicitly; presumably they are provided by the `scr` star
# imports -- confirm.
from scr.model import *

lstm_model = SimpleLSTM(input_dim=145, hidden_dim=256, output_dim=28,
                        num_layers=2, missed_char_dim=28)

from scr.trainer import *

lightning_model = HangmanModel(lstm_model, learning_rate=0.0001,
                               char_frequency=char_frequency,
                               max_word_length=max_word_length)

train_loader = DataLoader(train_dataset, batch_size=1024,
                          collate_fn=custom_collate_fn,
                          shuffle=True,
                          num_workers=15,  # Adjust based on your system
                          prefetch_factor=2)  # Adjust based on your needs

# Validation data is iterated in a fixed order: shuffling brings no benefit
# for evaluation and makes the batches (and therefore the monitored metric)
# differ from epoch to epoch for reasons unrelated to the model.
val_loader = DataLoader(valid_dataset, batch_size=512,
                        collate_fn=custom_collate_fn,
                        shuffle=False,
                        num_workers=15,  # Adjust based on your system
                        prefetch_factor=2)  # Adjust based on your needs

In [None]:
from scr.game import *
# Read the held-out test words; the evaluation below cannot run without them.
try:
    test_words = read_words(test_words_file_path)
    print(f"Loaded {len(test_words)} test words from {test_words_file_path}")
except FileNotFoundError:
    print(f"File not found: {test_words_file_path}")
    # Re-raise: carrying on would either fail with a NameError on a fresh
    # kernel or silently evaluate on a stale `test_words` from an earlier run.
    raise

# As laid out, this cell comes before `trainer.fit`, so it reports the
# performance of the freshly constructed network.
result = play_games_and_calculate_stats(lstm_model, test_words, char_frequency, max_word_length)

print(f"Overall Win Rate: {result['overall_win_rate']}%, Overall Average Attempts: {result['overall_avg_attempts']}")
# Per-word-length breakdown
for length, data in result["length_wise_stats"].items():
    print(f"Length {length}: Win Rate: {data['win_rate']}%, Average Attempts: {data['average_attempts_used']}")

In [None]:
from pytorch_lightning.callbacks import EarlyStopping

# EarlyStopping callback: monitors 'val_miss_penalty' (not 'val_loss') and
# stops training once it has failed to improve for 3 consecutive validation
# checks (one check per epoch under the default Trainer settings).
early_stop_callback = EarlyStopping(
    monitor='val_miss_penalty',  # Metric name to watch; NOTE(review): presumably logged by HangmanModel -- confirm
    min_delta=0.00,  # Minimum change to qualify as an improvement
    patience=3,  # Number of checks with no improvement after which training will be stopped
    verbose=True,
    mode='min'  # 'min' mode means training will stop when the quantity monitored has stopped decreasing
)

In [None]:
# `pl` is used below, but the notebook only imported it in a later cell;
# import it here so this cell does not depend on a star import or on
# out-of-order execution.
import pytorch_lightning as pl
from pytorch_lightning.profilers import SimpleProfiler

# The profiler is created but only takes effect when passed to the Trainer
# (currently disabled, see the commented-out argument below).
profiler = SimpleProfiler()

# Train for at most 15 epochs; early stopping may end the run sooner.
trainer = pl.Trainer(max_epochs=15, callbacks=[early_stop_callback]) # , profiler=profiler)
trainer.fit(lightning_model, train_loader, val_loader)
# print(profiler.summary())

In [None]:
from pathlib import Path
import torch
import pytorch_lightning as pl

from pathlib import Path
import torch

# Requires `lightning_model` (the HangmanModel instance built above) and
# `NUM_STRATIFIED_SAMPLES` (used as a file-name prefix).

# Create a 'models' directory relative to the current working directory
# (not inside `base_dataset_dir`); an existing directory is reused.
models_dir = Path('./models')
models_dir.mkdir(exist_ok=True)

# Path of the saved model; the sample count is the prefix so that runs with
# different NUM_STRATIFIED_SAMPLES do not overwrite each other.
torch_model_save_path = models_dir / f'{NUM_STRATIFIED_SAMPLES}_full_simple_lstm_model.pth'

# Save the entire wrapped model object (a full pickle, not just a state_dict),
# so loading it later requires the model's class to be importable.
torch.save(lightning_model.model, torch_model_save_path)

print(f"Entire LSTM model saved at: {torch_model_save_path}")

##### Testing

In [None]:
# Load the entire LSTM model object (a full pickle, not a state_dict).
# `weights_only=False` is required on PyTorch >= 2.6, where `torch.load`
# defaults to `weights_only=True` and refuses to unpickle arbitrary classes.
# SECURITY: unpickling can execute arbitrary code -- only load checkpoints
# that were written by this notebook or come from a trusted source.
loaded_lstm_model = torch.load(torch_model_save_path, weights_only=False)

# If you want to use the model for inference
loaded_lstm_model.eval()  # Set the model to evaluation mode

In [None]:
from scr.game import *

# Quick smoke test: play one game on a single hand-picked word with the
# reloaded model.
word = 'may'

play_game_with_a_word(loaded_lstm_model, word, char_frequency, max_word_length)

In [None]:
from scr.game import *
# Read the held-out test words; the evaluation below cannot run without them.
try:
    test_words = read_words(test_words_file_path)
    print(f"Loaded {len(test_words)} test words from {test_words_file_path}")
except FileNotFoundError:
    print(f"File not found: {test_words_file_path}")
    # Re-raise: carrying on would either fail with a NameError on a fresh
    # kernel or silently evaluate on a stale `test_words` from an earlier run.
    raise

# Evaluate the model that was reloaded from disk on every test word.
result = play_games_and_calculate_stats(loaded_lstm_model, test_words, char_frequency, max_word_length)

print(f"Overall Win Rate: {result['overall_win_rate']}%, Overall Average Attempts: {result['overall_avg_attempts']}")
# Per-word-length breakdown
for length, data in result["length_wise_stats"].items():
    print(f"Length {length}: Win Rate: {data['win_rate']}%, Average Attempts: {data['average_attempts_used']}")