In [328]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [329]:
import sys
from pathlib import Path

# Resolve the directory two levels above the current working directory and
# put it at the front of sys.path so the project-local packages imported in
# this cell (tokenization, pre_training, src) can be found.
project_root = Path.cwd().resolve().parent.parent
sys.path.insert(0, str(project_root))

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from functools import partial

from tokenization.byte_pair_encoding.get_tokenizers import train_and_save_tokenizer_for, load_tokenizer_from

from pre_training.text_summarization.dataset import TextSummarizationDataset

from src.embedding import CustomEmbedding
from src.transformer import EncoderDecoderTransformer
from src.utils import padding_collate_fn

from src.train_utils import run_train_epoch
from src.validation_utils import run_gold_validation_loop, run_autoregressive_validation_loop, temp_run_autoregressive_validation_loop

In [None]:
"""
Gliwa, Bogdan, et al. SAMSUM Corpus: A Human-Annotated Dialogue Dataset for Abstractive Summarization. Proceedings of the 2nd Workshop on New Frontiers in Summarization, Association for Computational Linguistics, Nov. 2019, Hong Kong, China, pp. 70–79. https://www.aclweb.org/anthology/D19-5409. doi:10.18653/v1/D19-5409.
"""

# Directory that holds the SAMSum split files (train/val/test) read further down.
DF_DATA_PATH = '../../data/SAMSum/'

# Text file handed to the BPE tokenizer trainer; it lives in the same data directory.
BPE_IN_PATH = DF_DATA_PATH + 'train_summary_and_dialogue.txt'
# Directory the tokenizer is written to and later loaded back from.
BPE_OUT_PATH = '../../tokenization/trained_tokenizers/SAMSum_BPE'

In [331]:
# Maximum number of tokens allowed per example: longer dialogues/summaries are
# filtered out below, and the same value is passed to the model as
# max_context_window.
MAX_CONTEXT_WINDOW = 100

# Number of examples per batch for the train/val/test DataLoaders.
BATCH_SIZE = 128

# Width passed to both CustomEmbedding and the transformer's d_model argument.
D_MODEL = 64

In [332]:
# Read the three dataset splits from DF_DATA_PATH. Each file is parsed as
# JSON lines (one record per line) into its own DataFrame.
train_df, val_df, test_df = (
    pd.read_json(DF_DATA_PATH + split_file, orient = 'records', lines = True)
    for split_file in ('train_df.json', 'val_df.json', 'test_df.json')
)

In [333]:
# Build the BPE tokenizer from the training text with a 4,000-entry vocabulary.
# Going by the helper's name, it both trains the tokenizer and saves it under
# BPE_OUT_PATH.
# NOTE(review): this runs on every execution of the cell; consider skipping it
# when BPE_OUT_PATH already holds a trained tokenizer.
bpe_tokenizer = train_and_save_tokenizer_for(
    in_file_paths = [BPE_IN_PATH],
    out_file_dir_path = BPE_OUT_PATH,
    vocab_size = 4_000,
)

# Load the tokenizer back from disk; this is the object used by the rest of
# the notebook.
# NOTE(review): model_max_length is presumably set well above any example
# length so the tokenizer itself never limits sequences - confirm.
pretrained_bpe_tokenizer = load_tokenizer_from(
    dir_path = BPE_OUT_PATH,
    model_max_length = 10_000,
)






In [334]:
# Pull the vocabulary size and the special-token ids off the loaded tokenizer
# so later cells can refer to them by short constant names.
VOCAB_SIZE = pretrained_bpe_tokenizer.vocab_size
PAD_TOKEN_IDX = pretrained_bpe_tokenizer.pad_token_id
# The start-of-sequence id is taken from the tokenizer's bos_token_id.
SOS_TOKEN_IDX = pretrained_bpe_tokenizer.bos_token_id
EOS_TOKEN_IDX = pretrained_bpe_tokenizer.eos_token_id

# Quick sanity check of the values the model and loss will be configured with.
print(f'The vocab size is {VOCAB_SIZE}.')
print(f'The pad token index is {PAD_TOKEN_IDX}.')

The vocab size is 4000.
The pad token index is 2.


In [335]:
# Embedding module sized by the tokenizer vocabulary and D_MODEL; it is handed
# to EncoderDecoderTransformer through its `embeddings` argument further down.
embeddings = CustomEmbedding(VOCAB_SIZE, D_MODEL)

In [336]:
def normalize_prefix_space(texts: list[str], include_SOS: bool = False):
    """Give every text exactly one leading space, optionally after an '<SOS>' marker.

    Leading whitespace of each text is stripped and replaced by a single
    space. Trailing whitespace is left untouched.

    Args:
        texts: Strings to normalize.
        include_SOS: When True, the literal '<SOS>' is placed in front of the
            single leading space of every text.

    Returns:
        A new list with one normalized string per input text, in input order.
    """
    sos_marker = '<SOS>' if include_SOS else ''

    normalized_texts = []
    for raw_text in texts:
        normalized_texts.append(sos_marker + ' ' + raw_text.lstrip())
    return normalized_texts

In [337]:
def _tokenize_for_length_check(texts, include_SOS = False):
    """Tokenize ``texts`` after prefix-space normalization, without special tokens.

    Args:
        texts: List of raw strings (dialogues or summaries).
        include_SOS: Forwarded to ``normalize_prefix_space``; when True the
            literal '<SOS>' marker is prepended to every text.

    Returns:
        The tokenizer's output for the whole list; token ids are read from
        its ``.data['input_ids']``.
    """
    return pretrained_bpe_tokenizer(
        normalize_prefix_space(texts, include_SOS = include_SOS),
        add_special_tokens = False,
        return_attention_mask = False,
        return_token_type_ids = False
    )


def _fits_context_window(tokenized):
    """Return a boolean mask that is True where an example has at most MAX_CONTEXT_WINDOW tokens."""
    # dtype = bool keeps the mask combinable with `&` even when a split is
    # empty (np.array([]) would otherwise default to float64).
    return np.array(
        [len(example) <= MAX_CONTEXT_WINDOW for example in tokenized.data['input_ids']],
        dtype = bool
    )


# Tokenize every split once, only to measure sequence lengths. Summaries are
# measured with the '<SOS>' marker included.
FILTER_tokenized_train_sources = _tokenize_for_length_check(train_df['dialogue'].tolist())
FILTER_tokenized_train_targets = _tokenize_for_length_check(train_df['summary'].tolist(), include_SOS = True)

FILTER_tokenized_val_sources = _tokenize_for_length_check(val_df['dialogue'].tolist())
FILTER_tokenized_val_targets = _tokenize_for_length_check(val_df['summary'].tolist(), include_SOS = True)

FILTER_tokenized_test_sources = _tokenize_for_length_check(test_df['dialogue'].tolist())
FILTER_tokenized_test_targets = _tokenize_for_length_check(test_df['summary'].tolist(), include_SOS = True)

# Per-row masks: True where the tokenized dialogue / summary fits the window.
valid_src_train_indices = _fits_context_window(FILTER_tokenized_train_sources)
valid_src_val_indices = _fits_context_window(FILTER_tokenized_val_sources)
valid_src_test_indices = _fits_context_window(FILTER_tokenized_test_sources)

valid_tgt_train_indices = _fits_context_window(FILTER_tokenized_train_targets)
valid_tgt_val_indices = _fits_context_window(FILTER_tokenized_val_targets)
valid_tgt_test_indices = _fits_context_window(FILTER_tokenized_test_targets)

# Keep a row only when BOTH its dialogue and its summary fit.
valid_train_df = train_df.iloc[valid_src_train_indices & valid_tgt_train_indices]
valid_val_df = val_df.iloc[valid_src_val_indices & valid_tgt_val_indices]
valid_test_df = test_df.iloc[valid_src_test_indices & valid_tgt_test_indices]

print(f'With a max_context_window of {MAX_CONTEXT_WINDOW}...')
print(f'The number of training samples went from {train_df.shape[0]} to {valid_train_df.shape[0]}')
print(f'The number of validation samples went from {val_df.shape[0]} to {valid_val_df.shape[0]}')
print(f'The number of test samples went from {test_df.shape[0]} to {valid_test_df.shape[0]}')

With a max_context_window of 100...
The number of training samples went from 14732 to 5561
The number of validation samples went from 818 to 325
The number of test samples went from 819 to 306


In [338]:
# Options shared by every tokenizer call in this cell: no automatically added
# special tokens, and only the input ids are requested back.
_TOKENIZE_KWARGS = dict(
    add_special_tokens = False,
    return_attention_mask = False,
    return_token_type_ids = False
)

# For each split three sequences are produced from the length-filtered frame:
#   sources - the dialogue text,
#   targets - the summary prefixed with the '<SOS>' marker,
#   labels  - the summary with '<EOS>' appended.

# Train split
tokenized_train_sources = pretrained_bpe_tokenizer(
    normalize_prefix_space(valid_train_df['dialogue'].tolist()),
    **_TOKENIZE_KWARGS
)
tokenized_train_targets = pretrained_bpe_tokenizer(
    normalize_prefix_space(valid_train_df['summary'].tolist(), include_SOS = True),
    **_TOKENIZE_KWARGS
)
tokenized_train_labels = pretrained_bpe_tokenizer(
    normalize_prefix_space((valid_train_df['summary'] + '<EOS>').tolist()),
    **_TOKENIZE_KWARGS
)

# Validation split
tokenized_val_sources = pretrained_bpe_tokenizer(
    normalize_prefix_space(valid_val_df['dialogue'].tolist()),
    **_TOKENIZE_KWARGS
)
tokenized_val_targets = pretrained_bpe_tokenizer(
    normalize_prefix_space(valid_val_df['summary'].tolist(), include_SOS = True),
    **_TOKENIZE_KWARGS
)
tokenized_val_labels = pretrained_bpe_tokenizer(
    normalize_prefix_space((valid_val_df['summary'] + '<EOS>').tolist()),
    **_TOKENIZE_KWARGS
)

# Test split
tokenized_test_sources = pretrained_bpe_tokenizer(
    normalize_prefix_space(valid_test_df['dialogue'].tolist()),
    **_TOKENIZE_KWARGS
)
tokenized_test_targets = pretrained_bpe_tokenizer(
    normalize_prefix_space(valid_test_df['summary'].tolist(), include_SOS = True),
    **_TOKENIZE_KWARGS
)
tokenized_test_labels = pretrained_bpe_tokenizer(
    normalize_prefix_space((valid_test_df['summary'] + '<EOS>').tolist()),
    **_TOKENIZE_KWARGS
)

In [339]:
# One dataset per split, each built from the (source, target, label) token-id lists.
train_ds = TextSummarizationDataset(
    tokenized_train_sources.data['input_ids'],
    tokenized_train_targets.data['input_ids'],
    tokenized_train_labels.data['input_ids'],
)
val_ds = TextSummarizationDataset(
    tokenized_val_sources.data['input_ids'],
    tokenized_val_targets.data['input_ids'],
    tokenized_val_labels.data['input_ids'],
)
test_ds = TextSummarizationDataset(
    tokenized_test_sources.data['input_ids'],
    tokenized_test_targets.data['input_ids'],
    tokenized_test_labels.data['input_ids'],
)

# NOTE: Option to use HuggingFace DataCollatorWithPadding : requires changing TextSummarizationDataset __getitem__
# The same collate function is used by all three loaders; it is given the
# tokenizer's pad id through pad_token_idx.
collate_with_padding = partial(padding_collate_fn, pad_token_idx = PAD_TOKEN_IDX)

# Only the training loader shuffles; validation and test keep dataset order.
train_dataloader = DataLoader(train_ds, batch_size = BATCH_SIZE, shuffle = True, collate_fn = collate_with_padding)
val_dataloader = DataLoader(val_ds, batch_size = BATCH_SIZE, collate_fn = collate_with_padding)
test_dataloader = DataLoader(test_ds, batch_size = BATCH_SIZE, collate_fn = collate_with_padding)

In [340]:
# Pull one batch from the training loader to eyeball the collated tensors.
# A batch unpacks as ((source, target), label).
(source, target), label = next(iter(train_dataloader))

# One tensor per line, in the order source, target, label.
print(source, target, label, sep = '\n')

tensor([[2256, 2984,   30,  ...,    2,    2,    2],
        [ 525, 2640,   30,  ...,    2,    2,    2],
        [ 980,  265,   30,  ...,    2,    2,    2],
        ...,
        [3034, 1542,   30,  ...,    2,    2,    2],
        [1798,   30, 1123,  ...,    2,    2,    2],
        [ 409, 1002,   30,  ...,    2,    2,    2]])
tensor([[   0, 2256, 2984,  ...,    2,    2,    2],
        [   0, 3207,  382,  ...,    2,    2,    2],
        [   0,  980,  265,  ...,    2,    2,    2],
        ...,
        [   0, 3034, 1542,  ...,    2,    2,    2],
        [   0, 1798,  861,  ...,    2,    2,    2],
        [   0,  636,  323,  ...,    2,    2,    2]])
tensor([[2256, 2984,  765,  ...,    2,    2,    2],
        [3207,  382,  314,  ...,    2,    2,    2],
        [ 980,  265,  323,  ...,    2,    2,    2],
        ...,
        [3034, 1542, 1144,  ...,    2,    2,    2],
        [1798,  861, 2376,  ...,    2,    2,    2],
        [ 636,  323, 2723,  ...,    2,    2,    2]])


In [341]:
# Cross-entropy summed (not averaged) over all positions; positions whose
# label equals the pad id do not contribute to the loss.
loss_fn = nn.CrossEntropyLoss(reduction = 'sum', ignore_index = PAD_TOKEN_IDX)

# All transformer hyperparameters in one place; the embedding module and the
# context window are the ones defined earlier in the notebook.
transformer_config = dict(
    embeddings = embeddings,
    vocab_size = VOCAB_SIZE,
    d_model = D_MODEL,
    num_attention_heads = 8,
    num_encoder_layers = 2,
    num_decoder_layers = 2,
    dim_feedforward = 256,
    dropout = 0.1,
    max_context_window = MAX_CONTEXT_WINDOW,
    use_pre_lnorm = True
)
model = EncoderDecoderTransformer(**transformer_config)

# SGD with momentum and weight decay over every model parameter.
optim = torch.optim.SGD(
    model.parameters(),
    lr = 1e-4,
    momentum = 0.9,
    weight_decay = 1e-4
)

In [None]:
EPOCHS = 5

# Per-epoch histories: one entry is appended to each list at the end of every epoch.
training_losses, training_token_accuracies = [], []
gold_validation_losses, gold_validation_token_accuracies = [], []

for i in range(EPOCHS):
    print(f'Running epoch {i+1}...')

    # Training pass. The helper returns (loss, sequence accuracy, token
    # accuracy); only the loss and token accuracy are recorded.
    (
        training_loss,
        training_sequence_accuracy,
        training_token_accuracy,
    ) = run_train_epoch(train_dataloader, model, loss_fn, optim, calculate_token_accuracy = True)
    training_losses.append(training_loss)
    training_token_accuracies.append(training_token_accuracy)

    # Validation pass on the validation loader with the same loss function;
    # it returns the same three metrics.
    (
        gold_val_loss,
        gold_val_sequence_accuracy,
        gold_val_token_accuracy,
    ) = run_gold_validation_loop(val_dataloader, model, loss_fn, calculate_token_accuracy = True)
    gold_validation_losses.append(gold_val_loss)
    gold_validation_token_accuracies.append(gold_val_token_accuracy)

# Training histories, a blank line, then validation histories - one list per line.
print(training_losses, training_token_accuracies, sep = '\n')
print()
print(gold_validation_losses, gold_validation_token_accuracies, sep = '\n')

Running epoch 1...


100%|██████████| 44/44 [00:27<00:00,  1.62it/s]
100%|██████████| 3/3 [00:00<00:00,  5.21it/s]


Running epoch 2...


100%|██████████| 44/44 [00:25<00:00,  1.72it/s]
100%|██████████| 3/3 [00:00<00:00,  5.20it/s]


Running epoch 3...


100%|██████████| 44/44 [00:26<00:00,  1.67it/s]
100%|██████████| 3/3 [00:00<00:00,  5.40it/s]


Running epoch 4...


100%|██████████| 44/44 [00:25<00:00,  1.69it/s]
100%|██████████| 3/3 [00:00<00:00,  5.22it/s]


Running epoch 5...


100%|██████████| 44/44 [00:26<00:00,  1.67it/s]
100%|██████████| 3/3 [00:00<00:00,  4.76it/s]

[138.32659948328762, 122.23588269238334, 115.54853002846274, 110.07702709157526, 105.51277389759598]
[None, None, None, None, None]
[0.04404061686126474, 0.060904859754725066, 0.06758039743256074, 0.07115966981046948, 0.07759141846063108]

[128.48322115384616, 120.41591045673077, 114.58672025240385, 110.29928034855769, 106.43189152644231]
[None, None, None, None, None]
[0.055895406021350603, 0.061892767182439724, 0.068909679740914, 0.0729279117188437, 0.07796569509415857]





In [343]:
# Invert the tokenizer vocabulary (token -> id) into an id -> token lookup.
IDS_TO_TOKENS = {
    token_id: token
    for token, token_id in pretrained_bpe_tokenizer.get_vocab().items()
}

# Special-token ids bundled under the key names the validation helper is given.
special_token_idxs = dict(
    SOS_TOKEN_IDX = SOS_TOKEN_IDX,
    EOS_TOKEN_IDX = EOS_TOKEN_IDX,
    PAD_TOKEN_IDX = PAD_TOKEN_IDX,
)

# Run the autoregressive validation helper over the validation loader. The
# recorded output prints each source dialogue next to its predicted target.
# NOTE(review): in the recorded output the prediction is identical for
# different sources - worth checking after longer training.
temp_run_autoregressive_validation_loop(
    pretrained_bpe_tokenizer,
    val_dataloader,
    model,
    VOCAB_SIZE,
    IDS_TO_TOKENS,
    special_token_idxs,
    MAX_CONTEXT_WINDOW,
)

  0%|          | 0/3 [00:00<?, ?it/s]

Source:               Robert: Hey give me the address of this music shop you mentioned before
Robert: I have to buy guitar cable
Fred: <file_other>
Fred: Catch it on google maps
Robert: thx m8
Fred: ur welcome<PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>
Predicted Target:    <SOS> Lily will come to buy some beer.<EOS><PAD>

Source:               Keith: Meg, pls buy some milk and cereals, I see now we've run out of them
Megan: hm, sure, I can do that
Megan: but did you check in the drawer next to the fridge?
Keith: nope, let me have a look
Keith: ok, false alarm, we have cereal and milk :D
Megan: <file_gif>
Predicted Target:    <SOS> Lily will come to buy some beer.<EOS><PAD>

Source:               Samantha: <file_video>
Evelyn: LOL
Holly: Is SHE making that noise??
Samatha: Yes (＾▽＾)
Holly: How possible?? :o
Samantha: Idk, I'm also surprised!!
Evelyn: x


