In [206]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [207]:
import sys
from pathlib import Path

# Get the parent directory (i.e. project root)
project_root = Path().resolve().parent.parent 
sys.path.insert(0, str(project_root))

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from functools import partial

from tokenization.byte_pair_encoding.get_tokenizers import train_and_save_tokenizer_for, load_tokenizer_from

from pre_training.text_summarization.dataset import TextSummarizationDataset

from src.embedding import CustomEmbedding
from src.transformer import EncoderDecoderTransformer
from src.utils import padding_collate_fn

from src.train_utils import run_train_epoch
from src.validation_utils import run_gold_validation_loop, run_autoregressive_validation_loop

In [208]:
DF_DATA_PATH = '../../data/SAMSum/'

BPE_IN_PATH = '../../data/SAMSum/train_summary_and_dialogue.txt'
BPE_OUT_PATH = '../../tokenization/trained_tokenizers/SAMSum_BPE'

In [209]:
MAX_CONTEXT_WINDOW = 100

BATCH_SIZE = 64

D_MODEL = 16

In [210]:
train_df = pd.read_json(DF_DATA_PATH + 'train_df.json', orient = 'records', lines = True)
val_df = pd.read_json(DF_DATA_PATH + 'val_df.json', orient = 'records', lines = True)
test_df = pd.read_json(DF_DATA_PATH + 'test_df.json', orient = 'records', lines = True)

In [211]:
bpe_tokenizer = train_and_save_tokenizer_for(in_file_paths = [BPE_IN_PATH], out_file_dir_path = BPE_OUT_PATH, vocab_size = 4_000)
pretrained_bpe_tokenizer = load_tokenizer_from(dir_path = BPE_OUT_PATH, model_max_length = 10000)

VOCAB_SIZE = pretrained_bpe_tokenizer.vocab_size
PAD_TOKEN_IDX = pretrained_bpe_tokenizer.pad_token_id

print(f'The vocab size is {VOCAB_SIZE}.')
print(f'The pad token index is {PAD_TOKEN_IDX}.')




The vocab size is 4000.
The pad token index is 2.


In [212]:
embeddings = CustomEmbedding(VOCAB_SIZE, D_MODEL)

In [213]:
def normalize_prefix_space(texts: list[str]):
    return [' ' + text.lstrip() for text in texts]

In [214]:
FILTER_tokenized_train_sources = pretrained_bpe_tokenizer(
    normalize_prefix_space(train_df['dialogue'].tolist()),
    add_special_tokens = False,
    return_attention_mask = False,
    return_token_type_ids = False
)

FILTER_tokenized_train_targets = pretrained_bpe_tokenizer(
    ('<SOS> ' + train_df['summary']).tolist(),
    add_special_tokens = False,
    return_attention_mask = False,
    return_token_type_ids = False
)

FILTER_tokenized_val_sources = pretrained_bpe_tokenizer(
    normalize_prefix_space(val_df['dialogue'].tolist()),
    add_special_tokens = False,
    return_attention_mask = False,
    return_token_type_ids = False
)

FILTER_tokenized_val_targets = pretrained_bpe_tokenizer(
    ('<SOS> ' + val_df['summary']).tolist(),
    add_special_tokens = False,
    return_attention_mask = False,
    return_token_type_ids = False
)

FILTER_tokenized_test_sources = pretrained_bpe_tokenizer(
    normalize_prefix_space(test_df['dialogue'].tolist()),
    add_special_tokens = False,
    return_attention_mask = False,
    return_token_type_ids = False
)

FILTER_tokenized_test_targets = pretrained_bpe_tokenizer(
    ('<SOS> ' + test_df['summary']).tolist(),
    add_special_tokens = False,
    return_attention_mask = False,
    return_token_type_ids = False
)

valid_src_train_indices = np.array([len(example) <= MAX_CONTEXT_WINDOW for example in FILTER_tokenized_train_sources.data['input_ids']])
valid_src_val_indices = np.array([len(example) <= MAX_CONTEXT_WINDOW for example in FILTER_tokenized_val_sources.data['input_ids']])
valid_src_test_indices = np.array([len(example) <= MAX_CONTEXT_WINDOW for example in FILTER_tokenized_test_sources.data['input_ids']])

valid_tgt_train_indices = np.array([len(example) <= MAX_CONTEXT_WINDOW for example in FILTER_tokenized_train_targets.data['input_ids']])
valid_tgt_val_indices = np.array([len(example) <= MAX_CONTEXT_WINDOW for example in FILTER_tokenized_val_targets.data['input_ids']])
valid_tgt_test_indices = np.array([len(example) <= MAX_CONTEXT_WINDOW for example in FILTER_tokenized_test_targets.data['input_ids']])

valid_train_df = train_df.iloc[valid_src_train_indices & valid_tgt_train_indices]
valid_val_df = val_df.iloc[valid_src_val_indices & valid_tgt_val_indices]
valid_test_df = test_df.iloc[valid_src_test_indices & valid_tgt_test_indices]

print(f'With a max_context_window of {MAX_CONTEXT_WINDOW}...')
print(f'The number of training samples went from {train_df.shape[0]} to {valid_train_df.shape[0]}')
print(f'The number of validation samples went from {val_df.shape[0]} to {valid_val_df.shape[0]}')
print(f'The number of test samples went from {test_df.shape[0]} to {valid_test_df.shape[0]}')

With a max_context_window of 100...
The number of training samples went from 14732 to 5561
The number of validation samples went from 818 to 325
The number of test samples went from 819 to 306


In [215]:
tokenized_train_sources = pretrained_bpe_tokenizer(
    normalize_prefix_space(valid_train_df['dialogue'].tolist()),
    add_special_tokens = False,
    return_attention_mask = False,
    return_token_type_ids = False
)

tokenized_train_targets = pretrained_bpe_tokenizer(
    ('<SOS> ' + valid_train_df['summary']).tolist(),
    add_special_tokens = False,
    return_attention_mask = False,
    return_token_type_ids = False
)

tokenized_train_labels = pretrained_bpe_tokenizer(
    normalize_prefix_space((valid_train_df['summary'] + '<EOS>').tolist()),
    add_special_tokens = False,
    return_attention_mask = False,
    return_token_type_ids = False
)

tokenized_val_sources = pretrained_bpe_tokenizer(
    normalize_prefix_space(valid_val_df['dialogue'].tolist()),
    add_special_tokens = False,
    return_attention_mask = False,
    return_token_type_ids = False
)

tokenized_val_targets = pretrained_bpe_tokenizer(
    ('<SOS> ' + valid_val_df['summary']).tolist(),
    add_special_tokens = False,
    return_attention_mask = False,
    return_token_type_ids = False
)

tokenized_val_labels = pretrained_bpe_tokenizer(
    normalize_prefix_space((valid_val_df['summary'] + '<EOS>').tolist()),
    add_special_tokens = False,
    return_attention_mask = False,
    return_token_type_ids = False
)

tokenized_test_sources = pretrained_bpe_tokenizer(
    normalize_prefix_space(valid_test_df['dialogue'].tolist()),
    add_special_tokens = False,
    return_attention_mask = False,
    return_token_type_ids = False
)

tokenized_test_targets = pretrained_bpe_tokenizer(
    ('<SOS> ' + valid_test_df['summary']).tolist(),
    add_special_tokens = False,
    return_attention_mask = False,
    return_token_type_ids = False
)

tokenized_test_labels = pretrained_bpe_tokenizer(
    normalize_prefix_space((valid_test_df['summary'] + '<EOS>').tolist()),
    add_special_tokens = False,
    return_attention_mask = False,
    return_token_type_ids = False
)

In [216]:
train_ds = TextSummarizationDataset(tokenized_train_sources.data['input_ids'], tokenized_train_targets.data['input_ids'], tokenized_train_labels.data['input_ids'])
val_ds = TextSummarizationDataset(tokenized_val_sources.data['input_ids'], tokenized_val_targets.data['input_ids'], tokenized_val_labels.data['input_ids'])
test_ds = TextSummarizationDataset(tokenized_test_sources.data['input_ids'], tokenized_test_targets.data['input_ids'], tokenized_test_labels.data['input_ids'])

# NOTE: Option to use HuggingFace DataCollatorWithPadding : requires changing TextSummarizationDataset __getitem__
train_dataloader = DataLoader(train_ds, batch_size = BATCH_SIZE, shuffle = True, collate_fn = partial(padding_collate_fn, pad_token_idx = PAD_TOKEN_IDX))
val_dataloader = DataLoader(val_ds, batch_size = BATCH_SIZE, collate_fn = partial(padding_collate_fn, pad_token_idx = PAD_TOKEN_IDX))
test_dataloader = DataLoader(test_ds, batch_size = BATCH_SIZE, collate_fn = partial(padding_collate_fn, pad_token_idx = PAD_TOKEN_IDX))

In [217]:
(source, target), label = next(iter(train_dataloader))
print(source)
print(target)
print(label)

tensor([[1868,  270,   30,  ...,    2,    2,    2],
        [ 525, 1831,   30,  ...,    2,    2,    2],
        [1798,   30,  488,  ...,    2,    2,    2],
        ...,
        [ 367, 1078,   30,  ...,    2,    2,    2],
        [ 393,  627,   74,  ...,    2,    2,    2],
        [1818,  334,   30,  ...,    2,    2,    2]])
tensor([[   0, 1868,  270,  ...,    2,    2,    2],
        [   0,  461,  738,  ...,    2,    2,    2],
        [   0, 1798,  785,  ...,    2,    2,    2],
        ...,
        [   0, 1365, 3535,  ...,    2,    2,    2],
        [   0, 2572,   73,  ...,    2,    2,    2],
        [   0, 1818,  334,  ...,    2,    2,    2]])
tensor([[1868,  270,  309,  ...,    2,    2,    2],
        [ 461,  738,  314,  ...,    2,    2,    2],
        [1798,  785,  280,  ...,    2,    2,    2],
        ...,
        [1365, 3535,  785,  ...,    2,    2,    2],
        [2572,   73,  323,  ...,    2,    2,    2],
        [1818,  334, 1339,  ...,    2,    2,    2]])


In [218]:
loss_fn = nn.CrossEntropyLoss(ignore_index = PAD_TOKEN_IDX, reduction = 'sum')

model = EncoderDecoderTransformer(
    embeddings = embeddings,
    vocab_size = VOCAB_SIZE,
    d_model = D_MODEL,
    num_attention_heads = 4,
    num_encoder_layers = 1,
    num_decoder_layers = 1,
    dim_feedforward = 32,
    dropout = 0.0,
    max_context_window = MAX_CONTEXT_WINDOW,
    use_pre_lnorm = True
)

optim = torch.optim.SGD(model.parameters(), lr = 1e-4, momentum = 0.9, weight_decay = 1e-4)

In [219]:
EPOCHS = 15

training_losses = list()
training_sequence_accuracies = list()
training_token_accuracies = list()

gold_validation_losses = list()
gold_validation_sequence_accuracies = list()
gold_validation_token_accuracies = list()

for i in range(EPOCHS):
    # print(f'Running epoch {i+1}...')

    training_loss, training_sequence_accuracy, training_token_accuracy = run_train_epoch(train_dataloader, model, loss_fn, optim, calculate_sequence_accuracy = True, calculate_token_accuracy = True)

    training_losses.append(training_loss)
    training_sequence_accuracies.append(training_sequence_accuracy)
    training_token_accuracies.append(training_token_accuracy)

    gold_val_loss, gold_val_sequence_accuracy, gold_val_token_accuracy = run_gold_validation_loop(val_dataloader, model, loss_fn, calculate_sequence_accuracy = True, calculate_token_accuracy = True)
    
    gold_validation_losses.append(gold_val_loss)
    gold_validation_sequence_accuracies.append(gold_val_sequence_accuracy)
    gold_validation_token_accuracies.append(gold_val_token_accuracy)

print(training_losses)
print(training_sequence_accuracies)
print(training_token_accuracies)

print()

print(gold_validation_losses)
print(gold_validation_sequence_accuracies)
print(gold_validation_token_accuracies)

  6%|▌         | 5/87 [00:00<00:07, 10.67it/s]


ValueError: Expected input batch_size (2624) to match target batch_size (2560).