In [145]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [146]:
import sys
import os
from pathlib import Path

# Get the parent directory (i.e. project root)
project_root = Path().resolve().parent.parent 
sys.path.insert(0, str(project_root))

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from functools import partial

from tqdm import tqdm

from tokenization.byte_pair_encoding.get_tokenizers import train_and_save_tokenizer_for, load_tokenizer_from

from pre_training.text_summarization.dataset import TextSummarizationDataset

from src.embedding import CustomEmbedding
from src.transformer import EncoderDecoderTransformer
from src.utils import list_padding_collate_fn

In [147]:
DF_DATA_PATH = '../../data/SAMSum/'

BPE_IN_PATH = '../../data/SAMSum/train_summary_and_dialogue.txt'
BPE_OUT_PATH = '../../tokenization/trained_tokenizers/SAMSum_BPE'

In [148]:
MAX_CONTEXT_WINDOW = 100

BATCH_SIZE = 2

In [149]:
train_df = pd.read_json(DF_DATA_PATH + 'train_df.json', orient = 'records', lines = True)
val_df = pd.read_json(DF_DATA_PATH + 'val_df.json', orient = 'records', lines = True)
test_df = pd.read_json(DF_DATA_PATH + 'test_df.json', orient = 'records', lines = True)

In [150]:
bpe_tokenizer = train_and_save_tokenizer_for(in_file_paths = [BPE_IN_PATH], out_file_dir_path = BPE_OUT_PATH, vocab_size = 4_000)
pretrained_bpe_tokenizer = load_tokenizer_from(dir_path = BPE_OUT_PATH, model_max_length = 10000)

PAD_TOKEN_IDX = pretrained_bpe_tokenizer.pad_token_id
print(f'The pad token index is {PAD_TOKEN_IDX}.')




The pad token index is 2.


In [151]:
FILTER_tokenized_train_sources = pretrained_bpe_tokenizer(
    train_df['dialogue'].tolist(),
    add_special_tokens = False
)

FILTER_tokenized_train_targets = pretrained_bpe_tokenizer(
    train_df['summary'].tolist(),
    add_special_tokens = False
)

FILTER_tokenized_val_sources = pretrained_bpe_tokenizer(
    val_df['dialogue'].tolist(),
    add_special_tokens = False
)

FILTER_tokenized_val_targets = pretrained_bpe_tokenizer(
    val_df['summary'].tolist(),
    add_special_tokens = False
)

FILTER_tokenized_test_sources = pretrained_bpe_tokenizer(
    test_df['dialogue'].tolist(),
    add_special_tokens = False
)

FILTER_tokenized_test_targets = pretrained_bpe_tokenizer(
    test_df['summary'].tolist(),
    add_special_tokens = False
)

valid_src_train_indices = np.array([len(example) <= MAX_CONTEXT_WINDOW for example in FILTER_tokenized_train_sources.data['input_ids']])
valid_src_val_indices = np.array([len(example) <= MAX_CONTEXT_WINDOW for example in FILTER_tokenized_val_sources.data['input_ids']])
valid_src_test_indices = np.array([len(example) <= MAX_CONTEXT_WINDOW for example in FILTER_tokenized_test_sources.data['input_ids']])

valid_tgt_train_indices = np.array([len(example) <= MAX_CONTEXT_WINDOW - 1 for example in FILTER_tokenized_train_targets.data['input_ids']])
valid_tgt_val_indices = np.array([len(example) <= MAX_CONTEXT_WINDOW - 1 for example in FILTER_tokenized_val_targets.data['input_ids']])
valid_tgt_test_indices = np.array([len(example) <= MAX_CONTEXT_WINDOW - 1 for example in FILTER_tokenized_test_targets.data['input_ids']])

valid_train_df = train_df.iloc[valid_src_train_indices & valid_tgt_train_indices]
valid_val_df = val_df.iloc[valid_src_val_indices & valid_tgt_val_indices]
valid_test_df = test_df.iloc[valid_src_test_indices & valid_tgt_test_indices]

print(f'With a max_context_window of {MAX_CONTEXT_WINDOW}...')
print(f'The number of training samples went from {train_df.shape[0]} to {valid_train_df.shape[0]}')
print(f'The number of validation samples went from {val_df.shape[0]} to {valid_val_df.shape[0]}')
print(f'The number of test samples went from {test_df.shape[0]} to {valid_test_df.shape[0]}')

With a max_context_window of 100...
The number of training samples went from 14732 to 5580
The number of validation samples went from 818 to 325
The number of test samples went from 819 to 308


In [152]:
tokenized_train_sources = pretrained_bpe_tokenizer(
    valid_train_df['dialogue'].tolist(),
    add_special_tokens = False
)

tokenized_train_targets = pretrained_bpe_tokenizer(
    ('<SOS> ' + valid_train_df['summary']).tolist(),
    add_special_tokens = False
)

tokenized_train_labels = pretrained_bpe_tokenizer(
    (valid_train_df['summary'] + ' <EOS>').tolist(),
    add_special_tokens = False
)

tokenized_val_sources = pretrained_bpe_tokenizer(
    valid_val_df['dialogue'].tolist(),
    add_special_tokens = False
)

tokenized_val_targets = pretrained_bpe_tokenizer(
    ('<SOS> ' + valid_val_df['summary']).tolist(),
    add_special_tokens = False
)

tokenized_val_labels = pretrained_bpe_tokenizer(
    (valid_val_df['summary'] + ' <EOS>').tolist(),
    add_special_tokens = False
)

tokenized_test_sources = pretrained_bpe_tokenizer(
    valid_test_df['dialogue'].tolist(),
    add_special_tokens = False
)

tokenized_test_targets = pretrained_bpe_tokenizer(
    ('<SOS> ' + valid_test_df['summary']).tolist(),
    add_special_tokens = False
)

tokenized_test_labels = pretrained_bpe_tokenizer(
    (valid_test_df['summary'] + ' <EOS>').tolist(),
    add_special_tokens = False
)

In [153]:
print(tokenized_train_sources.data['input_ids'][0])

[1607, 30, 273, 274, 598, 307, 225, 3972, 18, 822, 286, 449, 441, 35, 206, 203, 2115, 30, 893, 5, 206, 203, 1607, 30, 273, 419, 933, 286, 606, 1677]


In [154]:
train_ds = TextSummarizationDataset(tokenized_train_sources.data['input_ids'], tokenized_train_targets.data['input_ids'], tokenized_train_labels.data['input_ids'])
val_ds = TextSummarizationDataset(tokenized_val_sources.data['input_ids'], tokenized_val_targets.data['input_ids'], tokenized_val_labels.data['input_ids'])
test_ds = TextSummarizationDataset(tokenized_test_sources.data['input_ids'], tokenized_test_targets.data['input_ids'], tokenized_test_labels.data['input_ids'])

# train_dataloader = DataLoader(train_ds, batch_size = BATCH_SIZE, shuffle = True)
# val_dataloader = DataLoader(val_ds, batch_size = BATCH_SIZE)
# test_dataloader = DataLoader(test_ds, batch_size = BATCH_SIZE)

train_dataloader = DataLoader(train_ds, batch_size = BATCH_SIZE, shuffle = True, collate_fn = partial(list_padding_collate_fn, pad_token_idx = PAD_TOKEN_IDX))
val_dataloader = DataLoader(val_ds, batch_size = BATCH_SIZE, collate_fn = partial(list_padding_collate_fn, pad_token_idx = PAD_TOKEN_IDX))
test_dataloader = DataLoader(test_ds, batch_size = BATCH_SIZE, collate_fn = partial(list_padding_collate_fn, pad_token_idx = PAD_TOKEN_IDX))

In [155]:
(source, target), label = next(iter(train_dataloader))
print(source)
print(target)
print(label)

([850, 30, 2291, 326, 928, 317, 275, 1089, 2090, 996, 35, 206, 203, 3306, 30, 273, 413, 333, 468, 360, 206, 203, 2043, 389, 30, 610, 16, 275, 704, 1026, 1492, 372, 3323, 350], [0, 460, 704, 1026, 1588, 1565, 372, 285, 330, 1274, 3497, 16, 360, 2204, 3836, 298, 309, 1187, 75, 389, 413, 333, 362, 280, 928, 317, 1089, 366, 529, 319, 996, 18]) [1142, 704, 1026, 1588, 1565, 372, 285, 330, 1274, 3497, 16, 360, 2204, 3836, 298, 309, 1187, 75, 389, 413, 333, 362, 280, 928, 317, 1089, 366, 529, 319, 996, 18, 225, 1]
([1996, 30, 3040, 16, 372, 286, 476, 324, 441, 284, 93, 71, 1587, 655, 35, 206, 203, 703, 30, 504, 383, 655, 377, 798, 606, 35, 206, 203, 1336, 30, 689, 606, 323, 820, 16, 798, 931, 815, 1347, 35, 206, 203, 1996, 30, 360, 286, 372, 1428, 1293, 655, 1175, 944, 286, 981, 660, 1730, 338, 35, 206, 203, 703, 30, 272, 635, 482, 280, 275, 3874, 707, 16, 1352, 1057, 35, 754, 354, 40, 206, 203, 1336, 30, 984, 338, 495, 354, 40, 417, 263, 284, 746, 71, 332, 2175], [0, 1306, 309, 3577, 372, 14

In [None]:
print(type(tokenized_train_sources))
print(type(tokenized_train_sources[0]))

<class 'transformers.tokenization_utils_base.BatchEncoding'>
<class 'tokenizers.Encoding'>


In [None]:
print(tokenized_train_sources.data.keys())
print(len(tokenized_train_sources.data['input_ids']))
print(tokenized_train_sources.data['input_ids'][0])
print(tokenized_train_sources.encodings[0].tokens)

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
14732
[1607, 30, 273, 274, 598, 307, 225, 3972, 18, 822, 286, 449, 441, 35, 206, 203, 2115, 30, 893, 5, 206, 203, 1607, 30, 273, 419, 933, 286, 606, 1677]
['Amanda', ':', 'ĠI', 'Ġb', 'ak', 'ed', 'Ġ', 'Ġcookies', '.', 'ĠDo', 'Ġyou', 'Ġwant', 'Ġsome', '?', 'č', 'Ċ', 'Jerry', ':', 'ĠSure', '!', 'č', 'Ċ', 'Amanda', ':', 'ĠI', "'ll", 'Ġbring', 'Ġyou', 'Ġtomorrow', 'Ġ:-)']


In [47]:
print(train_encoded[14731].tokens)
print(train_encoded[0])

['Georg', 'ia', ':', 'Ġare', 'Ġyou', 'Ġready', 'Ġfor', 'Ġhotel', 'Ġhun', 'ting', '?', 'ĠWe', 'Ġneed', 'Ġto', 'Ġbook', 'Ġsomething', 'Ġfinally', 'Ġfor', 'ĠL', 'is', 'b', 'on', 'č', 'Ċ', 'Jul', 'i', 'ette', ':', 'Ġsure', 'Ġwe', 'Ġcan', 'Ġgo', 'Ġon', ',', 'Ġshow', 'Ġme', 'Ġwhat', 'Ġyou', 'Ġfound', 'č', 'Ċ', 'Georg', 'ia', ':', 'Ġ<', 'file', '_', 'photo', '>', 'č', 'Ċ', 'Jul', 'i', 'ette', ':', 'Ġn', 'ah', '...', 'Ġit', 'Ġlooks', 'Ġlike', 'Ġan', 'Ġold', 'Ġlady', "'s", 'Ġroom', 'Ġlol', 'č', 'Ċ', 'Georg', 'ia', ':', 'Ġ<', 'file', '_', 'photo', '>', 'č', 'Ċ', 'Jul', 'i', 'ette', ':', 'Ġthat', "'s", 'Ġbetter', '...', 'Ġbut', 'Ġthe', 'Ġbed', 'Ġdoesn', "'t", 'Ġlook', 'Ġvery', 'Ġcomfort', 'able', 'č', 'Ċ', 'Georg', 'ia', ':', 'Ġi', 'Ġkind', 'Ġof', 'Ġlike', 'Ġit', 'Ġand', 'Ġit', "'s", 'Ġreally', 'Ġclose', 'Ġto', 'Ġthe', 'Ġcity', 'Ġc', 'en', 'ter', 'č', 'Ċ', 'Jul', 'i', 'ette', ':', 'Ġshow', 'Ġme', 'Ġthe', 'Ġothers', 'Ġplease', 'č', 'Ċ', 'Georg', 'ia', ':', 'Ġ<', 'file', '_', 'photo', '>', 'č', 'Ċ'