In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all modules (except
# those excluded via %aimport) before each cell executes, so edits to the
# project's .py files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
from pathlib import Path

# Resolve the project root as two directory levels above the kernel's current
# working directory (Path() is "."), so that `src` and the other top-level
# packages imported below can be found.
# NOTE(review): this depends on where the kernel was started -- confirm the
# notebook is always launched from its own directory.
project_root = Path().resolve().parent.parent

# Keep exactly one copy of the root, at the front of sys.path. A plain
# sys.path.insert(0, ...) would prepend another duplicate every time this cell
# is re-run; rebuilding the list in place makes the cell idempotent.
sys.path[:] = [str(project_root)] + [
    entry for entry in sys.path if entry != str(project_root)
]

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from functools import partial

from pre_training_encoder_only.sorted_integer_list_MLM.dataset import RandomSortedIntegerDataset

from src.embedding import CustomEmbedding
from src.transformer import EncoderTransformer
from src.utils import padding_collate_fn

from src.train_utils import run_train_epoch
from src.validation_utils import run_gold_validation_loop, run_autoregressive_validation_loop



  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Number of ordinary (non-special) tokens in the vocabulary.
n_real_tokens = 10

# The four special tokens take the next consecutive ids after the real tokens:
# PAD = n, CLS = n + 1, SEP = n + 2, MASK = n + 3.
PAD_TOKEN_IDX, CLS_TOKEN_IDX, SEP_TOKEN_IDX, MASK_TOKEN_IDX = range(
    n_real_tokens, n_real_tokens + 4
)

# Real tokens plus the four specials (PAD, CLS, SEP, MASK).
vocab_size = MASK_TOKEN_IDX + 1

# Passed to CustomEmbedding as d_model.
D_MODEL = 64

embeddings = CustomEmbedding(vocab_size, d_model=D_MODEL)

In [7]:
# Hyperparameters for the data pipeline.
MAX_CONTEXT_WINDOW = 50  # not referenced in this cell

BATCH_SIZE = 1
# First positional argument to RandomSortedIntegerDataset.
# NOTE(review): presumably the minimum generated sequence length -- confirm
# against the dataset class.
MIN_SEQ_LEN = 2

# Second positional argument to RandomSortedIntegerDataset for the train /
# validation datasets respectively.
NUM_TRAINING_SEQUENCES = 100
NUM_VALIDATION_SEQUENCES = 10

# Real tokens only: does not include PAD, CLS, SEP, MASK.
VOCAB = list(range(n_real_tokens))

# Maps token id -> token for the real tokens (id i maps to VOCAB[i]), followed
# by the special-token ids mapped to their printable names. Built as a single
# literal so no loop variables leak into the notebook namespace.
# NOTE: Revisit this step for encoder-decoder sort task later.
VOCAB_MAP = {
    **dict(enumerate(VOCAB)),
    PAD_TOKEN_IDX: '<PAD>',
    CLS_TOKEN_IDX: '<CLS>',
    SEP_TOKEN_IDX: '<SEP>',
    MASK_TOKEN_IDX: '<MASK>',
}

# Both loaders use padding_collate_fn with pad_token_idx bound to PAD_TOKEN_IDX.
pad_collate = partial(padding_collate_fn, pad_token_idx=PAD_TOKEN_IDX)

# NOTE(review): no random seed is set in the visible cells; if
# RandomSortedIntegerDataset samples randomly, results will differ between
# runs -- confirm and seed if reproducibility matters.

# Training data: batches are reshuffled by the loader.
train_rand_ds = RandomSortedIntegerDataset(MIN_SEQ_LEN, NUM_TRAINING_SEQUENCES, VOCAB)
train_dataloader = DataLoader(
    train_rand_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=pad_collate,
)

# Validation data: `shuffle` is left at the DataLoader default.
val_rand_ds = RandomSortedIntegerDataset(MIN_SEQ_LEN, NUM_VALIDATION_SEQUENCES, VOCAB)
val_dataloader = DataLoader(
    val_rand_ds,
    batch_size=BATCH_SIZE,
    collate_fn=pad_collate,
)