##### Imports

In [1]:
import sys
from pathlib import Path
import warnings

import warnings
import pandas as pd
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

import sys
# Custom library paths
sys.path.extend(['../', './scr'])

from scr.utils import set_seed
from scr.utils import read_words
from pathlib import Path
import random
from collections import Counter, defaultdict
import pickle
from tqdm import tqdm
from torch.utils.data import Dataset
from scr.feature_engineering import \
    calculate_char_frequencies, calculate_word_frequencies
from scr.utils import read_words, save_words_to_file

import pickle
from pathlib import Path
from scr.dataset import *
from scr.game import *
from scr.plot_utils import *

import gc

set_seed(42)

import torch
import torch.nn as nn
from pathlib import Path
import random

from scr.utils import print_scenarios
torch.set_float32_matmul_precision('medium')

# Device configuration: use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Read the full training word list, then shuffle it in place.
# set_seed(42) ran earlier in this cell — presumably it seeds `random`, which
# would make this shuffle reproducible; confirm in scr.utils.
# The trailing `limit=10000` remnant is a leftover from quick experiments.
word_list = read_words('data/words_250000_train.txt') # , limit=10000)
# word_list = read_words('data/250k.txt', limit=10000)
random.shuffle(word_list)

# base_dataset_dir = Path('dataset/pkl')

##### Reading Data

In [2]:
from pathlib import Path

NUM_STRATIFIED_SAMPLES = 15000
# Base directory of the dataset; the folder is named after the sample count.
base_dataset_dir = Path(f'./dataset/{NUM_STRATIFIED_SAMPLES}')
# base_dataset_dir = Path(f"/media/sayem/510B93E12554BBD1/dataset/{NUM_STRATIFIED_SAMPLES}")

# Make sure the base directory and its 'pkl' subdirectory exist
base_dataset_dir.mkdir(parents=True, exist_ok=True)
pkls_dir = base_dataset_dir / 'pkl'
pkls_dir.mkdir(parents=True, exist_ok=True)

# Paths to the words files
train_words_file_path = base_dataset_dir / 'train_words.txt'
test_words_file_path = base_dataset_dir / 'test_words.txt'

# Read the training words. Only the read itself is guarded; on a missing file
# `train_words` is still bound (to an empty list) so that later cells fail on
# "no data" rather than on an unrelated NameError.
try:
    train_words = read_words(train_words_file_path)
except FileNotFoundError:
    train_words = []
    print(f"File not found: {train_words_file_path}")
else:
    print(f"Loaded {len(train_words)} train words from {train_words_file_path}")

Loaded 181840 train words from dataset/15000/train_words.txt


In [3]:
base_dataset_dir

PosixPath('dataset/15000')

In [4]:
# # For inference
from scr.feature_engineering import *

# Corpus statistics derived from the shuffled word list; the last two are
# passed on to the dataset and to validation further down.
word_frequencies = calculate_word_frequencies(word_list)
char_frequency = calculate_char_frequencies(word_list)
# Length of the longest word in the corpus.
max_word_length = max(map(len, word_list))

##### Model Building

In [5]:
from scr.simple_model import SimpleLSTM
from scr.base_model import BaseModel
from pathlib import Path
import torch
from scr.feature_engineering import *

# max_word_length = 29 # TODO will remove later
# Hyperparameters handed to SimpleLSTM. `max_word_length` is the value computed
# from the word list in an earlier cell.
config = {
    'embedding_dim': 200,
    'hidden_dim': 256,
    'num_layers': 2,
    'vocab_size': 27,
    'max_word_length': max_word_length,
    'input_feature_size': 5,
    'use_embedding': True,
    'miss_linear_dim': 50
}

model = SimpleLSTM(config)
optimizer = model.optimizer

# Save/load round trip. NOTE: `model` is freshly initialised at this point, so
# the checkpoint written here holds untrained weights; this only checks that
# persistence works.
model_file_path = 'models/model.pth'
# Create the target folder first so the save does not depend on a pre-existing
# 'models/' directory.
Path(model_file_path).parent.mkdir(parents=True, exist_ok=True)
model.save_model(file_path=model_file_path)

# Recomputed for convenience; note that it is not passed to load_model below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the checkpoint back through the BaseModel helper.
loaded_model = BaseModel.load_model(SimpleLSTM, model_file_path)

# model = loaded_model

##### Dataset Loading and train-test split

In [6]:
pkls_dir

PosixPath('dataset/15000/pkl')

In [7]:
from scr.dataset import ProcessedHangmanDataset
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import random
from scr.model_training import *

# Build the dataset from the pickle folder; files_limit=None is passed through
# unchanged (presumably "no cap on the number of files" — confirm in scr.dataset).
processed_dataset = ProcessedHangmanDataset(
    pkls_dir,
    char_frequency,
    max_word_length,
    files_limit=None,
)

print(len(processed_dataset))

In [None]:
processed_dataset[100]

IndexError: Index out of range

In [None]:
# train_test_split needs an indexable sequence, so pull every sample out of the
# PyTorch dataset into a plain list first.
dataset_list = [
    processed_dataset[sample_idx]
    for sample_idx in range(len(processed_dataset))
]

# 80% train / 20% validation, with a fixed seed for a repeatable split.
train_data, val_data = train_test_split(
    dataset_list,
    test_size=0.2,
    random_state=42,
)

val_loader = processed_dataset.create_val_loader(val_data)

# The full list is no longer needed once both splits exist.
del dataset_list

In [None]:
# ## Data Balance Sanity check
# from scr.dataset import *

# analyze_dataset_sanity(processed_dataset, batch_size)

In [None]:
# ## Data Balance Sanity check
# from scr.dataset import *

# analyze_word_length_balance(processed_dataset, batch_size)

##### Untrained Model Performance

In [None]:
# # Call the validation function
# validation_results = validate_hangman(model, val_loader, \
#     char_frequency, max_word_length, device, max_attempts=6, normalize=True, \
#         max_games_per_epoch=1000)

# # Access the results
# character_level_results = validation_results["character_level"]
# game_simulation_results = validation_results["game_simulation"]

# from scr.plot_utils import *

# save_path = Path(f"plots/{NUM_STRATIFIED_SAMPLES}_untrained_model_performence_word_stats_plot.png")
# plot_word_stats(game_simulation_results["length_stats"], save_path)

# # # Print results for character-level validation
# # print("Character Level Validation:")
# print(f"Average Loss: {character_level_results['avg_loss']}")
# print(f"Miss Penalty: {character_level_results['avg_miss_penalty']}")
# # print(f"Character Accuracy: {character_level_results['char_accuracy']}")
# # print(f"Word Accuracy: {character_level_results['word_accuracy']}")
# # print("Word Statistics:", character_level_results["word_stats"])  # Updated key

# # # Print results for game simulation
# # print("\nGame Simulation:")
# print(f"Win Rate: {game_simulation_results['win_rate']}")
# print(f"Average Attempts: {game_simulation_results['average_attempts']}")
# print(f"Total Games: {game_simulation_results['total_games']}")
# print(f"Total Wins: {game_simulation_results['total_wins']}")
# print(f"Total Losses: {game_simulation_results['total_losses']}")
# # print("Word Stats:", game_simulation_results["game_stats"])
# # print("Word Length Stats:", game_simulation_results["length_stats"])

In [None]:
# print(f"Total Games: {game_simulation_results['total_games']}")

In [None]:
# game_simulation_results["length_stats"]


In [None]:
# init_performance_dict = game_simulation_results["length_stats"]

# init_performance_dict = {
#     length: {"wins": 1, "losses": 1, "total_attempts": 2, "games": 2}
#     for length in range(1, max_word_length + 1)
# }

In [None]:
# def initialize_performance_dict(dataset, default_games=10):
#     """
#     Initializes the performance dictionary based on the word lengths in the dataset.

#     :param dataset: Dataset object.
#     :param default_games: Total default games for each word length.
#     :return: A dictionary with initial performance metrics for each word length in the dataset.
#     """
#     unique_word_lengths = set()
#     for _, _, additional_info in dataset:
#         word_length = int(additional_info['word_length'])
#         unique_word_lengths.add(word_length)

#     performance_dict = {}
#     default_wins = 1
#     default_losses = 1
#     remaining_games = default_games - default_wins - default_losses

#     for length in unique_word_lengths:
#         performance_dict[length] = {
#             "wins": default_wins + remaining_games // 2,
#             "losses": default_losses + remaining_games // 2,
#             "total_attempts": default_games,
#             "games": default_games
#         }

#     return performance_dict

# # Example usage
# init_performance_dict = initialize_performance_dict(processed_dataset, default_games=10)

In [None]:
# init_performance_dict = game_simulation_results["length_stats"]

In [None]:
# Mini-batch size for this plain (non-sampler) DataLoader check.
# NOTE(review): `batch_size` is reassigned in later cells (128, then 64).
batch_size = 256 # hyperparams

# custom_sampler = PerformanceBasedSampler(train_data, batch_size, init_performance_dict)
# Loader over the training split; no shuffle argument or sampler is passed, and
# batches are assembled by the dataset's own collate function.
train_loader = DataLoader(train_data, \
    batch_size=batch_size, \
        collate_fn=processed_dataset.custom_collate_fn)

# Drain the loader once without using the batches: this only checks that every
# batch can be produced and collated without raising.
for batch in train_loader:
    pass

In [None]:
# batch_size = 256 # hyperparams

# custom_sampler = PerformanceBasedSampler(train_data, batch_size, init_performance_dict)
# train_loader = DataLoader(train_data, batch_sampler=custom_sampler, \
#     collate_fn=processed_dataset.custom_collate_fn)

# for batch in train_loader:
#     pass

In [None]:
STOP

In [None]:
# for batch in train_loader:
#     pass

In [None]:
# init_performance_dict

In [None]:
# import matplotlib.pyplot as plt
# from collections import Counter

# def plot_word_length_distribution(batch, batch_idx):
#     # Extracting word lengths from the second tensor in the batch
#     word_lengths = batch[1].tolist()  # Assuming batch[1] contains word lengths
#     counter = Counter(word_lengths)
#     plt.bar(counter.keys(), counter.values())
#     plt.title(f"Word Length Distribution in Batch {batch_idx}")
#     plt.xlabel("Word Length")
#     plt.ylabel("Count")
#     plt.show()


# # Inspect the first batch
# for i, batch in enumerate(train_loader):
#     plot_word_length_distribution(batch, i)
#     break

##### Training

In [None]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

def train_model(model, train_data, init_performance_dict, val_loader,
                num_epochs, optimizer, batch_size, scheduler=None, device=None,
                n_epochs_stop=10):
    """Train `model` with a performance-based batch sampler and early stopping.

    Each epoch builds a PerformanceBasedSampler from the current performance
    dictionary, trains once over `train_data`, validates with
    `validate_hangman`, and then replaces the performance dictionary with the
    validation result's ["game_simulation"]["length_stats"] for the next epoch.

    This function relies on notebook-level names: `processed_dataset` (for its
    `custom_collate_fn`), `char_frequency`, `max_word_length`,
    `PerformanceBasedSampler`, `train_on_data_loader` and `validate_hangman`.

    :param model: Model to train; switched between train() and eval() here.
    :param train_data: Training samples handed to the sampler and DataLoader.
    :param init_performance_dict: Performance dictionary used for the first epoch.
    :param val_loader: Loader passed to `validate_hangman`.
    :param num_epochs: Maximum number of epochs.
    :param optimizer: Optimizer passed to `train_on_data_loader`.
    :param batch_size: Batch size passed to the sampler.
    :param scheduler: Optional scheduler stepped with the validation loss.
    :param device: Target device. Defaults to CUDA when available, else CPU,
        resolved at call time instead of capturing a notebook global when the
        function is defined.
    :param n_epochs_stop: Consecutive epochs without a new best validation loss
        after which training stops early.
    :return: None.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    best_val_loss = float('inf')
    epochs_no_improve = 0
    performance_dict = init_performance_dict

    for epoch in range(num_epochs):
        # Rebuild the sampler so it sees the latest performance dictionary
        custom_sampler = PerformanceBasedSampler(train_data, batch_size,
                                                 performance_dict)
        train_loader = DataLoader(train_data, batch_sampler=custom_sampler,
                                  collate_fn=processed_dataset.custom_collate_fn)

        model.train()
        train_loss, train_miss_penalty = train_on_data_loader(
            model, train_loader, device, optimizer)

        print(f"Epoch {epoch}: Training Loss: {train_loss}, "
              f"Miss Penalty: {train_miss_penalty}")

        model.eval()
        with torch.no_grad():
            validation_results = validate_hangman(model, val_loader,
                                                  char_frequency,
                                                  max_word_length, device)
        val_loss = validation_results["character_level"]['avg_loss']
        # Feed this epoch's validation statistics into the next epoch's sampler
        performance_dict = validation_results["game_simulation"]["length_stats"]

        # Report before the early-stopping check so the final epoch is logged too
        print(f"Epoch {epoch}: val loss: {val_loss}")
        print()

        if scheduler is not None:
            scheduler.step(val_loss)

        # Early stopping check
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= n_epochs_stop:
                print("Early stopping triggered")
                break


# Starting performance dictionary for the first epoch's sampler, one entry per
# word length. train_model takes it as a required argument, so it must exist
# before the call below. The counts are deliberately non-zero — presumably this
# keeps ratios computed by PerformanceBasedSampler away from a division by
# zero; TODO confirm in scr.custom_sampler.
init_performance_dict = {
    length: {"wins": 1, "losses": 1, "total_attempts": 2, "games": 2}
    for length in range(1, max_word_length + 1)
}

# Reduce the learning rate by 10x after 5 epochs without validation improvement.
scheduler = ReduceLROnPlateau(optimizer, 'min', \
    patience=5, factor=0.1, verbose=True)

num_epochs = 10
batch_size = 128

train_model(model, train_data, init_performance_dict, \
    val_loader, num_epochs, optimizer, batch_size, scheduler)

In [None]:
# from sklearn.model_selection import KFold
# import torch
# from torch.utils.data import DataLoader, Subset

# def train_model(model, train_data, val_data, num_epochs, optimizer, scheduler, device):
#     train_loader = DataLoader(train_data, batch_size=batch_size, \
#         collate_fn=custom_collate_fn)
#     val_loader = DataLoader(val_data, batch_size=batch_size, \
#         collate_fn=custom_collate_fn)

#     for epoch in range(num_epochs):
#         train_loss, train_miss_penalty = train_epoch(model, train_loader, \
#             optimizer, device)
#         val_loss, val_miss_penalty = validate_epoch(model, \
#             val_loader, device)

#         scheduler.step(val_loss)  # Adjust LR based on validation loss

#         print(f"Epoch {epoch}: Training Loss: {train_loss}, Miss Penalty: \
#             {train_miss_penalty}, Validation Loss: {val_loss}, Validation Miss Penalty: {val_miss_penalty}")
#         # Save model checkpoints if needed

# def k_fold_cross_validate(model, dataset, k, num_epochs, optimizer, scheduler_class, device):
#     kfold = KFold(n_splits=k, shuffle=True, random_state=42)
    
#     for fold, (train_idx, val_idx) in enumerate(kfold.split(dataset)):
#         print(f"Fold {fold}")
#         train_subset = Subset(dataset, train_idx)
#         val_subset = Subset(dataset, val_idx)

#         # Initialize a new model for each fold
#         model = SimpleLSTM(config)
#         model.to(device)
#         optimizer = model.optimizer
#         scheduler = scheduler_class(optimizer)

#         train_subset = Subset(dataset, train_idx)
#         val_subset = Subset(dataset, val_idx)
#         train_model(model, train_subset, val_subset, num_epochs, optimizer, scheduler, device)


#         train_model(model, train_subset, val_subset, num_epochs, optimizer, scheduler, device)

# # Usage example
# num_epochs = 10
# batch_size = 64
# scheduler_class = torch.optim.lr_scheduler.ReduceLROnPlateau  # Example scheduler class
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# k_fold_cross_validate(model, processed_dataset, \
#     5, num_epochs, optimizer, scheduler_class, device)

##### Trained Model Performance

In [None]:
val_loader = processed_dataset.create_val_loader(val_data)


# Run validation the same way the training loop does: in eval mode and without
# gradient tracking.
model.eval()
with torch.no_grad():
    validation_results = validate_hangman(model, val_loader, \
        char_frequency, max_word_length, device)

# Access the results
character_level_results = validation_results["character_level"]
game_simulation_results = validation_results["game_simulation"]

from scr.plot_utils import *

save_path = Path(f"plots/{NUM_STRATIFIED_SAMPLES}_trained_model_performence_word_stats_plot.png")
# Create the output folder first so saving does not depend on a pre-existing
# 'plots/' directory.
save_path.parent.mkdir(parents=True, exist_ok=True)
plot_word_stats(game_simulation_results["length_stats"], save_path)

# # Print results for character-level validation
# print("Character Level Validation:")
print(f"Average Loss: {character_level_results['avg_loss']}")
print(f"Miss Penalty: {character_level_results['avg_miss_penalty']}")
# print(f"Character Accuracy: {character_level_results['char_accuracy']}")
# print(f"Word Accuracy: {character_level_results['word_accuracy']}")
# print("Word Statistics:", character_level_results["word_stats"])  # Updated key

# # Print results for game simulation
# print("\nGame Simulation:")
print(f"Win Rate: {game_simulation_results['win_rate']}")
print(f"Average Attempts: {game_simulation_results['average_attempts']}")
print(f"Total Games: {game_simulation_results['total_games']}")
print(f"Total Wins: {game_simulation_results['total_wins']}")
print(f"Total Losses: {game_simulation_results['total_losses']}")
# print("Word Stats:", game_simulation_results["game_stats"])
# print("Word Length Stats:", game_simulation_results["length_stats"])

In [None]:
print("Word Stats:", game_simulation_results["game_stats"])

In [None]:
STOP

In [None]:
batch_features_tensor.shape

In [None]:
len(val_loader)

In [None]:
batch_features_tensor.shape

In [None]:
STOP

In [None]:
STOP

In [None]:
def train_epoch(model, data_loader, optimizer, device, vocab_size=27):
    """Run one optimisation pass over `data_loader`.

    Batches whose first element is None are skipped. The returned averages are
    taken over the batches that were actually processed, so skipped batches do
    not deflate the result and an empty loader no longer divides by zero.

    :param model: Model exposing train(), __call__(states, lengths, missed)
        and calculate_loss(outputs, labels, lengths, missed, vocab_size).
    :param data_loader: Iterable of 4-tuples
        (game_states, lengths, missed_chars, labels).
    :param optimizer: Optimizer stepped once per processed batch.
    :param device: Device the game states, missed chars and labels are moved to.
    :param vocab_size: Last argument forwarded to `model.calculate_loss`
        (27 by default, the value previously hard-coded here).
    :return: (average loss, average miss penalty); (0.0, 0.0) when no batch
        was processed.
    """
    model.train()  # Set the model to training mode
    total_loss = 0.0
    total_miss_penalty = 0.0
    batches_seen = 0

    for batch in data_loader:
        if batch[0] is None:
            continue  # Skip empty batches

        game_states_batch, lengths_batch, missed_chars_batch, labels_batch = batch
        # Only the states and missed chars are moved; lengths_batch is passed
        # on exactly as the loader produced it.
        game_states_batch = game_states_batch.to(device)
        missed_chars_batch = missed_chars_batch.to(device)

        optimizer.zero_grad()

        outputs = model(game_states_batch, lengths_batch, missed_chars_batch)
        # Labels are padded/reshaped to the model's output shape before the loss
        reshaped_labels = pad_and_reshape_labels(labels_batch, outputs.shape).to(device)

        loss, miss_penalty = model.calculate_loss(
            outputs, reshaped_labels, lengths_batch, missed_chars_batch, vocab_size)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_miss_penalty += miss_penalty.item()  # Accumulate miss penalty
        batches_seen += 1

    if batches_seen == 0:
        return 0.0, 0.0
    return total_loss / batches_seen, total_miss_penalty / batches_seen



def validate_epoch(model, data_loader, device, vocab_size=27):
    """Evaluate `model` over `data_loader` without gradient tracking.

    Batches whose first element is None are skipped. The returned averages are
    taken over the batches that were actually processed, so skipped batches do
    not deflate the result and an empty loader no longer divides by zero.

    :param model: Model exposing eval(), __call__(states, lengths, missed)
        and calculate_loss(outputs, labels, lengths, missed, vocab_size).
    :param data_loader: Iterable of 4-tuples
        (game_states, lengths, missed_chars, labels).
    :param device: Device the game states, missed chars and labels are moved to.
    :param vocab_size: Last argument forwarded to `model.calculate_loss`
        (27 by default, the value previously hard-coded here).
    :return: (average loss, average miss penalty); (0.0, 0.0) when no batch
        was processed.
    """
    model.eval()  # Set the model to evaluation mode
    total_loss = 0.0
    total_miss_penalty = 0.0
    batches_seen = 0

    with torch.no_grad():
        for batch in data_loader:
            if batch[0] is None:
                continue  # Skip empty batches

            game_states_batch, lengths_batch, missed_chars_batch, labels_batch = batch
            # Only the states and missed chars are moved; lengths_batch is
            # passed on exactly as the loader produced it.
            game_states_batch = game_states_batch.to(device)
            missed_chars_batch = missed_chars_batch.to(device)

            outputs = model(game_states_batch, lengths_batch, missed_chars_batch)
            # Labels are padded/reshaped to the model's output shape before the loss
            reshaped_labels = pad_and_reshape_labels(labels_batch, outputs.shape).to(device)

            loss, miss_penalty = model.calculate_loss(
                outputs, reshaped_labels, lengths_batch, missed_chars_batch, vocab_size)

            total_loss += loss.item()
            total_miss_penalty += miss_penalty.item()  # Accumulate miss penalty
            batches_seen += 1

    if batches_seen == 0:
        return 0.0, 0.0
    return total_loss / batches_seen, total_miss_penalty / batches_seen

# for epoch in range(num_epochs):
#     train_loss = train_epoch(model, train_loader, optimizer, device)
#     val_loss = validate_epoch(model, val_loader, device)
    
#     print(f"Epoch {epoch}: Training Loss: {train_loss}, Validation Loss: {val_loss}")
#     # You can add code to save model checkpoints if needed

In [None]:
from sklearn.model_selection import KFold
import torch
from torch.utils.data import DataLoader, Subset

def train_model(model, train_data, val_data, num_epochs, optimizer, scheduler, device):
    """Train and validate `model` for `num_epochs` on fixed train/val datasets.

    NOTE: this definition reuses the name of the sampler-based `train_model`
    defined earlier in the notebook and replaces it once this cell has run.

    Relies on the notebook-level names `batch_size` and `custom_collate_fn`
    as well as `train_epoch` / `validate_epoch`.
    NOTE(review): other cells use `processed_dataset.custom_collate_fn`; confirm
    that a bare `custom_collate_fn` is actually in scope.

    :param model: Model passed to `train_epoch` and `validate_epoch`.
    :param train_data: Dataset wrapped in the training DataLoader.
    :param val_data: Dataset wrapped in the validation DataLoader.
    :param num_epochs: Number of epochs to run.
    :param optimizer: Optimizer passed to `train_epoch`.
    :param scheduler: Scheduler stepped with the validation loss each epoch.
    :param device: Device passed to `train_epoch` and `validate_epoch`.
    :return: None.
    """
    train_loader = DataLoader(train_data, batch_size=batch_size,
                              collate_fn=custom_collate_fn)
    val_loader = DataLoader(val_data, batch_size=batch_size,
                            collate_fn=custom_collate_fn)

    for epoch in range(num_epochs):
        train_loss, train_miss_penalty = train_epoch(model, train_loader,
                                                     optimizer, device)
        val_loss, val_miss_penalty = validate_epoch(model, val_loader, device)

        scheduler.step(val_loss)  # Adjust LR based on validation loss

        # Adjacent f-strings keep the log on one line without the run of
        # indentation spaces that a backslash continuation inside the literal
        # would embed in the message.
        print(f"Epoch {epoch}: Training Loss: {train_loss}, "
              f"Miss Penalty: {train_miss_penalty}, "
              f"Validation Loss: {val_loss}, "
              f"Validation Miss Penalty: {val_miss_penalty}")
        # Save model checkpoints if needed

def k_fold_cross_validate(model, dataset, k, num_epochs, optimizer, scheduler_class, device):
    """Run k-fold cross-validation, training a fresh SimpleLSTM on every fold.

    Each fold is trained exactly once. The `model` and `optimizer` arguments
    are kept for interface compatibility but are not used: a new
    `SimpleLSTM(config)` (with the notebook-level `config`) and its own
    optimizer are created per fold so that folds do not share weights.

    :param model: Unused; kept for backward compatibility.
    :param dataset: Dataset to split; wrapped in `Subset` per fold.
    :param k: Number of folds.
    :param num_epochs: Epochs to train on each fold.
    :param optimizer: Unused; kept for backward compatibility.
    :param scheduler_class: Callable building a scheduler from an optimizer.
    :param device: Device each fold's model is moved to and trained on.
    :return: None.
    """
    kfold = KFold(n_splits=k, shuffle=True, random_state=42)

    for fold, (train_idx, val_idx) in enumerate(kfold.split(dataset)):
        print(f"Fold {fold}")
        train_subset = Subset(dataset, train_idx)
        val_subset = Subset(dataset, val_idx)

        # Initialize a new model (and matching optimizer/scheduler) for each fold
        fold_model = SimpleLSTM(config)
        fold_model.to(device)
        fold_optimizer = fold_model.optimizer
        fold_scheduler = scheduler_class(fold_optimizer)

        train_model(fold_model, train_subset, val_subset, num_epochs,
                    fold_optimizer, fold_scheduler, device)

# Usage example
# NOTE: these assignments overwrite the notebook-level num_epochs / batch_size /
# device set by earlier cells.
num_epochs = 10
batch_size = 64
scheduler_class = torch.optim.lr_scheduler.ReduceLROnPlateau  # Example scheduler class
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 5-fold cross-validation over the full processed dataset. The `model` and
# `optimizer` passed here are not the ones trained: k_fold_cross_validate
# builds a new SimpleLSTM and takes its optimizer for every fold.
k_fold_cross_validate(model, processed_dataset, \
    5, num_epochs, optimizer, scheduler_class, device)

In [None]:
STOP

In [None]:
from scr.dataset import ProcessedHangmanDataset
from scr.custom_sampler import PerformanceBasedSampler


print(f"Total number of data points in the dataset: {len(processed_dataset)}")

# Initialize the sampler
# NOTE(review): `performance_metrics` is not defined anywhere in the visible
# notebook, and this call shape (dataset, metrics, max_word_length=...) differs
# from the (train_data, batch_size, performance_dict) form used in train_model.
# Confirm which signature scr.custom_sampler actually exposes.
sampler = PerformanceBasedSampler(
    processed_dataset, 
    performance_metrics,
    max_word_length=max_word_length
)

print(f"Sampler created with {len(sampler)} indices.")

In [None]:
# Take the first 10 items the sampler yields
sample_indices = list(sampler)[:10]
print("Sampled indices:", sample_indices)

# Show the data point behind the first sampled index only
for idx in sample_indices[:1]:
    data_point = processed_dataset[idx]
    print(f"Data at index {idx}: {data_point}")

In [None]:
from scr.utils import print_scenarios

def process_pkl_files(base_dir):
    """Collect one scenario per game record from every pickle under `base_dir`.

    Immediate sub-directories of `base_dir` are visited in numeric order of
    their names (non-numeric names last), and the ``*.pkl`` files inside each
    are visited in sorted order so the result is reproducible. Files that
    cannot be read or unpickled are reported and skipped instead of aborting
    the whole scan.

    :param base_dir: pathlib.Path whose sub-directories hold the pickle files.
    :return: List of (pkl_file, scenario) tuples as built by `process_pkl_file`.
    """
    def _batch_order(path):
        # Numeric directory names sort by value; everything else goes last.
        return int(path.name) if path.name.isdigit() else float('inf')

    pkl_list = []

    for batch_dir in sorted(base_dir.iterdir(), key=_batch_order):
        if not batch_dir.is_dir():
            continue

        for pkl_file in sorted(batch_dir.glob("*.pkl")):
            try:
                with open(pkl_file, 'rb') as file:
                    # SECURITY: pickle.load can execute arbitrary code; only
                    # point this at dataset files you generated yourself.
                    game_data = pickle.load(file)
            except (OSError, pickle.UnpicklingError, EOFError) as e:
                # Truncated or corrupt pickles raise EOFError/UnpicklingError,
                # not OSError, so they are handled here as well.
                print(f"Error reading file {pkl_file}: {e}")
                continue

            pkl_list.extend(process_pkl_file(pkl_file, game_data))

    return pkl_list

def process_pkl_file(pkl_file, game_data):
    """Turn the records of one pickle file into (pkl_file, scenario) tuples.

    :param pkl_file: Path of the pickle; its file name carries the metadata.
    :param game_data: Iterable of (game_won, guesses) pairs.
    :return: One (pkl_file, scenario) tuple per record, where scenario has the
        keys 'word', 'difficulty', 'outcome' and 'data'. Empty input gives [].
    """
    records = list(game_data)
    if not records:
        # Nothing to describe, so the file name is not parsed at all.
        return []

    # The file name is the same for every record, so parse it only once.
    word, _initial_state, difficulty, outcome = extract_info_from_filename(pkl_file)

    return [
        (pkl_file, {
            'word': word,
            'difficulty': difficulty,
            'outcome': outcome,
            'data': (game_won, guesses)
        })
        for game_won, guesses in records
    ]

def extract_info_from_filename(pkl_file):
    """Parse ``<word>_<initial_state>_from_<...>_<difficulty>_<outcome>``.

    The stem is split on '_from_'. In the first part the last '_'-separated
    token is the initial state and everything before it is the word. In the
    second part the last two '_'-separated tokens are difficulty and outcome.

    :param pkl_file: pathlib.Path of the pickle file.
    :return: (word, initial_state, difficulty, outcome) as strings.
    :raises IndexError: If the stem contains no '_from_' separator.
    :raises ValueError: If the second part has fewer than two tokens.
    """
    parts = pkl_file.stem.split('_from_')
    word_and_state = parts[0].split('_')
    word = '_'.join(word_and_state[:-1])
    initial_state = word_and_state[-1]
    difficulty, outcome = parts[1].split('_')[-2:]
    return word, initial_state, difficulty, outcome

# def print_scenarios(scenarios):
#     # Assuming this function is defined elsewhere
#     pass

# Process all pickle files
# NOTE(review): process_pkl_files looks for *.pkl files one level below the
# directory it is given. The dataset itself is loaded from `pkls_dir`
# (base_dataset_dir / 'pkl'), so the batch folders may live there rather than
# directly under base_dataset_dir — confirm which directory should be passed.
pkl_list = process_pkl_files(base_dataset_dir)

# Accessing an individual pickle file's content by index
index_to_access = 0  # Change this index to access different files
if index_to_access < len(pkl_list):
    # Each entry is a (pkl_file, scenario) tuple
    file_path, scenario = pkl_list[index_to_access]
    print(f"Contents of {file_path}:")
    print_scenarios([scenario])  # Wrap scenario in a list for the function
else:
    print(f"No pickle file at index {index_to_access}")

No pickle file at index 0



In [None]:
pkl_list = []

# Iterate over all batch directories, numerically named ones first in numeric order
for batch_dir in sorted(base_dataset_dir.iterdir(), \
    key=lambda x: int(x.name) if x.name.isdigit() else float('inf')):
    if batch_dir.is_dir():
        # List all .pkl files in the current batch directory
        pkl_files = list(batch_dir.glob("*.pkl"))

        for pkl_file in pkl_files:
            with open(pkl_file, 'rb') as file:
                # SECURITY: pickle.load can execute arbitrary code; only use it
                # on dataset files you generated yourself.
                game_data = pickle.load(file)

            # Extract information from the file name:
            # <word>_<initial_state>_from_<...>_<difficulty>_<outcome>
            parts = pkl_file.stem.split('_from_')
            word_and_state = parts[0].split('_')
            word = '_'.join(word_and_state[:-1])
            initial_state = word_and_state[-1]
            difficulty, outcome = parts[1].split('_')[-2:]

            # Assuming game_data is a list of tuples (game_won, guesses)
            for data in game_data:
                game_won, guesses = data
                # Create a scenario dictionary for each data tuple
                scenario = {
                    'word': word,
                    'difficulty': difficulty,
                    'outcome': outcome,
                    'data': (game_won, guesses)
                }
                pkl_list.append((pkl_file, scenario))  # Add scenario to the list

# Accessing an individual pickle file's content by index
index_to_access = 0  # Change this index to access different files
if index_to_access < len(pkl_list):
    file_path, scenario = pkl_list[index_to_access]
    print(f"Contents of {file_path}:")
    print_scenarios([scenario])  # Wrap scenario in a list for the function
else:
    print(f"No pickle file at index {index_to_access}")