##### Imports

In [None]:
# Notebook setup cell: imports, display options, seeding, device selection and
# loading of the raw word list. Duplicate imports from the original cell have
# been collapsed; every name that was imported is still imported.

# --- Standard library / third-party needed before the project imports -------
import pickle
import sys
import warnings
from collections import Counter, defaultdict

import pandas as pd

# Suppress all warnings. This stays ahead of the remaining imports (as in the
# original cell) so warnings raised while importing them are silenced too.
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

# Custom library paths -- must be on sys.path before any `scr.*` import below.
sys.path.extend(['../', './scr'])

from scr.utils import read_words, save_words_to_file, set_seed
from tqdm import tqdm
from torch.utils.data import Dataset

# Wildcard import: may rebind any of the names imported above.
from scr.dataset import *

# The names below were (re-)imported after the wildcard import in the original
# cell, so they are bound here to keep these explicit bindings authoritative.
import gc
import random
from pathlib import Path

import torch
import torch.nn as nn

from scr.utils import print_scenarios

# Seed before any randomised step (the shuffle below relies on it, assuming
# set_seed seeds Python's `random` -- TODO confirm in scr.utils).
set_seed(42)

torch.set_float32_matmul_precision('medium')

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Read and shuffle the word list.
# Alternative source kept from the original cell (commented out there):
#   read_words('data/250k.txt', limit=10000)
word_list = read_words('data/words_250000_train.txt')
random.shuffle(word_list)

# base_dataset_dir = Path('dataset/pkl')

##### Reading Data

In [None]:
from pathlib import Path

# Sample count; also used as the name of the dataset sub-directory.
NUM_STRATIFIED_SAMPLES = 250_000

# Base directory of the prepared dataset (machine-specific absolute path).
base_dataset_dir = Path(f"/media/sayem/510B93E12554BBD1/dataset/{NUM_STRATIFIED_SAMPLES}")

parquet_file_path = base_dataset_dir / "HangmanData.parquet"

# Paths to the words files
train_words_file_path = base_dataset_dir / 'train_words.txt'
test_words_file_path = base_dataset_dir / 'test_words.txt'

# Read the train words. A missing file is reported rather than raised, and
# `train_words` is still bound (to an empty list) so later cells do not fail
# with a NameError.
try:
    train_words = read_words(train_words_file_path)
except FileNotFoundError:
    train_words = []
    print(f"File not found: {train_words_file_path}")
else:
    print(f"Loaded {len(train_words)} train words from {train_words_file_path}")

In [None]:
# Notebook display cell: echo the dataset directory path as the cell output.
base_dataset_dir

In [None]:
# Statistics derived from the raw word list; char_frequency and
# max_word_length are later handed to the Lightning model.
from scr.feature_engineering import *

word_frequencies = calculate_word_frequencies(word_list)
char_frequency = calculate_char_frequencies(word_list)

# Length of the longest word in the list.
max_word_length = max(map(len, word_list))

##### Model Building

##### Dataset Loading and train-test split

In [None]:
from torch.utils.data import DataLoader

from scr.dataset import HangmanDataset # , custom_collate_fn

from scr.feature_engineering import process_batch_of_games

from sklearn.model_selection import train_test_split

# Build the dataset from the Parquet file located in the data-reading cell.
hangman_dataset = HangmanDataset(parquet_file_path)

# Hold out 20% of the samples for validation; the fixed random_state keeps the
# split reproducible between runs.
# NOTE(review): train_test_split indexes the dataset to build both parts, so
# the results are presumably plain sequences of samples -- confirm.
train_dataset, valid_dataset = train_test_split(
    hangman_dataset,
    test_size=0.20,
    random_state=42,
)

# train_dataset feeds training; valid_dataset feeds validation.

In [None]:
# Notebook display cell: show the dataset item at index 10 as a spot check.
hangman_dataset[10] # item at index 10

In [None]:
# SimpleLSTM, HangmanModel, custom_collate_fn, create_val_loader and `pl` are
# not imported explicitly in this cell; presumably a wildcard import provides
# them -- verify.
from scr.trainer import *

# Underlying network.
lstm_model = SimpleLSTM(
    input_dim=145,
    hidden_dim=256,
    output_dim=28,
    num_layers=2,
    missed_char_dim=28,
)

# Lightning wrapper around the network, given the word-list statistics
# computed in the feature cell.
lightning_model = HangmanModel(
    lstm_model,
    learning_rate=1e-3,
    char_frequency=char_frequency,
    max_word_length=max_word_length,
)

# Training batches are reshuffled every epoch; the validation loader is built
# by the project helper.
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    collate_fn=custom_collate_fn,
    shuffle=True,
)
val_loader = create_val_loader(valid_dataset)

# Train for ten epochs.
trainer = pl.Trainer(max_epochs=10)
trainer.fit(lightning_model, train_loader, val_loader)