In [1]:
import os
import pandas as pd

## data imports
from sklearn.model_selection import train_test_split

## torch imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

## transformer related imports
from transformers import AutoTokenizer
from transformers_fmt.model_blocks.transformer import Transformers

## constants
from constants import ROOT_DIR

  from .autonotebook import tqdm as notebook_tqdm


# Dataset

### Prepare Tokenizer

In [2]:
# Load the pretrained multilingual BERT tokenizer and register '[BOS]' / '[EOS]'
# as its bos/eos special tokens. The call to add_special_tokens is the last
# expression of the cell, so its return value is what the notebook displays.
# NOTE(review): per the Hugging Face docs, tokens added this way are counted by
# len(tokenizer) but not by tokenizer.vocab_size - keep that in mind wherever the
# vocabulary size is consumed.
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
tokenizer.add_special_tokens({
    'bos_token' : '[BOS]',
    'eos_token' : '[EOS]'
})

2

In [3]:
def get_data(file_location, chunksize=1000, n_chunks = 100) :
    """Read the leading rows of a CSV file into a single DataFrame.

    The file is streamed in chunks of ``chunksize`` rows and at most
    ``n_chunks`` chunks are kept, so no more than ``chunksize * n_chunks``
    rows are returned.

    Args:
        file_location: Path of the CSV file; passed straight to ``pd.read_csv``.
        chunksize: Number of rows per chunk.
        n_chunks: Maximum number of chunks to keep. ``None`` reads the whole file.

    Returns:
        A DataFrame holding the rows that were read, re-indexed 0..len-1.
        When nothing is read (``n_chunks <= 0`` or a file without data rows)
        an empty frame carrying only the CSV header columns is returned.
    """
    read_everything = n_chunks is None

    if not read_everything and n_chunks <= 0 :
        return pd.read_csv(file_location, nrows=0)

    chunks = []

    # The chunked reader keeps the file handle open; the context manager closes
    # it even though we usually stop before the file is exhausted.
    with pd.read_csv(file_location, chunksize=chunksize) as reader :
        for chunk in reader :
            chunks.append(chunk)
            # Stop as soon as exactly n_chunks chunks are collected, without
            # pulling one extra chunk from the reader.
            if not read_everything and len(chunks) >= n_chunks :
                break

    if not chunks :
        # pd.concat raises on an empty list; fall back to a header-only frame.
        return pd.read_csv(file_location, nrows=0)

    return pd.concat(chunks, ignore_index=True)

In [4]:
# Load only the leading chunks of data/en-fr.csv (resolved under ROOT_DIR) into a
# DataFrame; n_chunks caps how much of the file get_data reads.
data = get_data(file_location=os.path.join(ROOT_DIR, 'data/en-fr.csv'), n_chunks = 2)

### PyTorch Dataset

In [5]:
class Data(Dataset) :
    """Map-style dataset that serves the 'en' and 'fr' columns of a DataFrame.

    Rows are addressed by position (0 .. len-1), which is what a DataLoader
    sampler produces, so the frame's index labels do not need to be a
    contiguous 0..n-1 range (e.g. after a train/test split or filtering).
    """

    def __init__(self, data) :
        # data: DataFrame that must contain the columns 'en' and 'fr'.
        self.data = data

    def __len__(self) :
        return len(self.data)

    def __getitem__(self, index) -> dict:
        """Return ``{'en': ..., 'fr': ...}`` for the row at position ``index``."""
        # .iloc is positional; .loc would look ``index`` up as an index label.
        row = self.data.iloc[index]
        return {'en' : row['en'], 'fr' : row['fr']}

In [6]:
def preprocess_batch(batch, type = 'input') :
    """Add BOS/EOS markers to each token-id sequence and pad them into one tensor.

    Reads the notebook-level ``tokenizer`` for the BOS, EOS and PAD ids.

    Args:
        batch: Mapping whose ``'input_ids'`` entry is an iterable of token-id
            sequences (one per sentence).
        type: Which role the batch plays:
            ``'input'``  - encoder input:  BOS + tokens + EOS
            ``'output'`` - decoder input:  BOS + tokens
            ``'label'``  - decoder target: tokens + EOS
            For the same sentence, position t of the ``'label'`` sequence is
            therefore the token that follows position t of the ``'output'``
            sequence.

    Returns:
        A long tensor of shape (number of sequences, longest sequence length),
        right-padded with ``tokenizer.pad_token_id``.

    Raises:
        ValueError: If ``type`` is not one of the three supported values.
    """
    bos = torch.tensor([tokenizer.bos_token_id], dtype=torch.long)
    eos = torch.tensor([tokenizer.eos_token_id], dtype=torch.long)

    if type == 'input' :
        prefix, suffix = [bos], [eos]
    elif type == 'output' :
        prefix, suffix = [bos], []
    elif type == 'label' :
        prefix, suffix = [], [eos]
    else :
        # Without this branch an unknown value surfaced later as an
        # UnboundLocalError that hid the real cause.
        raise ValueError(
            "type must be 'input', 'output' or 'label', got {!r}".format(type)
        )

    # dtype is pinned to long so that an empty id list does not turn the
    # concatenated sequence into a float tensor.
    input_token_ids = [
        torch.cat(prefix + [torch.tensor(inp, dtype=torch.long)] + suffix)
        for inp in batch['input_ids']
    ]

    ## pad every sequence to the length of the longest one in the batch
    input_token_ids = pad_sequence(input_token_ids, batch_first=True, padding_value = tokenizer.pad_token_id)

    return input_token_ids

# def collate_fn(samples):
    
#     eng_samples = [items['en'] for items in samples]
#     fr_samples = [items['fr'] for items in samples]

#     batch = {}

#     for language, sample in {'en' : eng_samples, 'fr' : fr_samples}.items() :

#         sample = tokenizer.batch_encode_plus(sample)
#         batch[language] = preprocess_batch(sample)

#     # samples['fr'] = tokenizer.batch_encode_plus(samples['fr'])
#     return batch  

In [7]:
# Wrap the loaded frame in the Dataset and serve it in shuffled batches of 16 rows.
# No collate_fn is passed, so DataLoader applies its default collation to the
# {'en': ..., 'fr': ...} dicts returned by Data.__getitem__.
dataset = Data(data)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# Modeling

### Model

In [10]:
# The vocabulary is sized with len(tokenizer) rather than tokenizer.vocab_size:
# vocab_size only counts the base vocabulary, while '[BOS]' and '[EOS]' were
# appended afterwards with add_special_tokens and therefore receive ids at and
# beyond vocab_size. len(tokenizer) includes those added tokens.
# NOTE(review): assumes Transformers uses vocab_size to size its token embedding /
# output projection - confirm against transformers_fmt.
model = Transformers(
    n_layer = 6,
    n_heads = 8,
    d_model = 512, 
    d_ff = 2048,
    max_seq_len = 128,
    vocab_size = len(tokenizer)
)

In [12]:
# Convert the model with .half() and rebind the name to the result.
# NOTE(review): the Adam optimizer created further down is built from these
# parameters; confirm that training directly in float16 (without mixed precision
# or loss scaling) is intended.
model = model.half()

In [1]:
# Notebook-level length setting. generate() takes max_seq_len as an explicit
# argument, so this value only matters where a caller passes it in.
max_seq_len = 10
# Debug switch. NOTE(review): no active code in the visible cells reads it.
debug = True

def generate(model, enc_output, tokenizer, max_seq_len):
    """Greedily decode one token sequence per batch element.

    Every sequence starts with ``tokenizer.bos_token_id``. At each step the
    sequences generated so far are passed through ``model.embedding``,
    ``model.positonal_embedding`` and ``model.decoder`` (together with
    ``enc_output``), and the highest-scoring token at the last position is
    appended. A sequence is finished once it has emitted
    ``tokenizer.eos_token_id``; from then on it only receives
    ``tokenizer.pad_token_id``. Decoding stops when every sequence is finished
    or after ``max_seq_len`` generated tokens.

    Args:
        model: Object exposing ``embedding``, ``positonal_embedding`` and
            ``decoder`` callables. The decoder result is indexed as
            ``[:, -1, :]`` - presumably (batch, sequence, vocabulary) scores.
        enc_output: Encoder output tensor; its first dimension is used as the
            batch size and its device is used for all tensors created here.
        tokenizer: Object providing ``bos_token_id``, ``eos_token_id`` and
            ``pad_token_id``.
        max_seq_len: Maximum number of tokens to generate after BOS.

    Returns:
        A long tensor of shape (batch, length) with ``length <= max_seq_len + 1``,
        including the leading BOS and, for finished sequences, their EOS.
    """
    # Follow the encoder output's device instead of depending on a notebook
    # global named `device`.
    device = enc_output.device
    batch_size = enc_output.size(0)

    input_ids = torch.full(
        (batch_size, 1), tokenizer.bos_token_id, dtype=torch.long, device=device
    )

    # 1 while a sequence is still being generated, 0 once it has emitted EOS.
    # Kept as an integer (batch, 1) column so token ids never pass through floats.
    unfinished_sequences = torch.ones(batch_size, 1, dtype=torch.long, device=device)

    sentence_length = input_ids.size(1)

    # Decoding only needs forward passes; skip autograd bookkeeping.
    with torch.no_grad():
        while sentence_length <= max_seq_len :

            x = model.embedding(input_ids)
            x = model.positonal_embedding(x)
            x = model.decoder(x, enc_output)
            next_token_logits = x[:, -1, :]

            # Softmax is monotonic, so the argmax of the raw scores is the
            # greedy choice; no normalisation is required.
            next_token_indices = torch.argmax(next_token_logits, dim = 1).unsqueeze(1)

            # Finished sequences are extended with the pad token only.
            next_token_indices = (
                next_token_indices * unfinished_sequences
                + tokenizer.pad_token_id * (1 - unfinished_sequences)
            )

            # Append before updating the finished mask so the EOS token itself
            # is part of the returned sequence.
            input_ids = torch.cat((input_ids, next_token_indices), dim = 1)
            sentence_length += 1

            # Per-sequence update: only rows that just produced EOS are retired.
            unfinished_sequences = unfinished_sequences * next_token_indices.ne(
                tokenizer.eos_token_id
            ).long()

            if unfinished_sequences.max() == 0 :
                break

    return input_ids


### Optimizer and Loss Function

In [15]:
# Label batches are right-padded with tokenizer.pad_token_id (see preprocess_batch),
# so those positions are excluded from the loss; otherwise the model is also
# trained - and the mean loss diluted - on predicting padding.
# NOTE(review): assumes the training loop feeds padded label ids to this criterion.
criterion = nn.CrossEntropyLoss(ignore_index = tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr = 1e-4)

### Training Loop

# Epoch bounds for the training loop - presumably iterated as
# range(start_epochs, end_epochs); the loop itself is not shown in this section.
start_epochs = 0
end_epochs = 10






