In [19]:
import os
import pandas as pd
from tqdm import tqdm
from datetime import datetime

## data imports
from sklearn.model_selection import train_test_split

## torch imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

## transformer related imports
from transformers import AutoTokenizer
from transformers_fmt.model_blocks.transformer import Transformers

## constants
from constants import ROOT_DIR, DEVICE, LOGS_DIR
from utils.logging import logs

In [20]:
max_seq_len = 20
writer = SummaryWriter(log_dir=LOGS_DIR)

# Dataset

### Prepare Tokenizer

In [21]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
tokenizer.add_special_tokens({
    'bos_token' : '[BOS]',
    'eos_token' : '[EOS]'
})

2

In [22]:
def get_data(file_location, chunksize=1000, n_chunks = 100) :

    data = []


    for i, items in enumerate(pd.read_csv(file_location, chunksize=chunksize)) :

        data.append(items)
        if i == n_chunks - 1 :
            break

    data = pd.concat(data)
    data.index = range(len(data))

    return data

In [23]:
data = get_data(file_location=os.path.join(ROOT_DIR, 'data/en_fr_100K.csv'), n_chunks = 50)

In [24]:
def remove_long_sentence(max_seq_len, data):
    """
    Remove sentences that are longer than max_seq_len
    """
    data['en_sentence_length'] = data['en'].apply(lambda x : len(x.split()) if type(x) == str else max_seq_len + 1)
    data['fr_sentence_length'] = data['fr'].apply(lambda x : len(x.split()) if type(x) == str else max_seq_len + 1)

    data = data.drop(
        data[(data['en_sentence_length'] > max_seq_len) | (data['fr_sentence_length'] > max_seq_len)].index
    )

    data.index = range(len(data))
    
    return data

In [25]:
data = remove_long_sentence(max_seq_len, data)

In [26]:
data

Unnamed: 0,en,fr,en_sentence_length,fr_sentence_length
0,Site map,Plan du site,2,3
1,Feedback,Rétroaction,1,1
2,Credits,Crédits,1,1
3,Français,English,1,1
4,What is light ?,Qu’est-ce que la lumière?,4,4
...,...,...,...,...
25016,(% change),Tableau A-9A.,2,2
25017,YEAR,ANNÉE,1,1
25018,CONSUMER GOODS & SERVICES (% change),Biens et services de consommation ( % de chang...,6,9
25019,FOOD & NONALCOHOLIC BEVERAGES (% change),Produits alimentaires et boissons non alcoolis...,6,10


In [27]:
train_data, val_test_data = train_test_split(data, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(val_test_data, test_size=0.5, random_state=42)

train_data.index = range(len(train_data))
val_data.index = range(len(val_data))
test_data.index = range(len(test_data))

### Pytorch Dataset

In [28]:
class Data(Dataset) :

    def __init__(self, data) :
        self.data = data

    def __len__(self) :
        return len(self.data)
    
    def __getitem__(self, index) -> any:
        row = self.data.loc[index]
        return {'en' : row['en'], 'fr' : row['fr']}

In [29]:
def preprocess_batch(batch, type = 'input') :

    ## Append the BOS and EOS token based on wether the batch is the encoder input, decoder input(output shifted left)
    ## or the label (output shifted right)
    if type == 'input' :
        input_token_ids = [
            torch.cat(
                (
                    torch.tensor([tokenizer.bos_token_id]), 
                    torch.tensor(inp), 
                    torch.tensor([tokenizer.eos_token_id])
                ),
            ) for inp in batch['input_ids']
        ]

    elif type == 'output' :
        input_token_ids = [
            torch.cat(
                (torch.tensor([tokenizer.bos_token_id]), torch.tensor(inp)),
            ) for inp in batch['input_ids']
        ]

    elif type == 'label' :
        input_token_ids = [
            torch.cat(
                (torch.tensor(inp), torch.tensor([tokenizer.eos_token_id])),
            ) for inp in batch['input_ids']
        ]

    ## pad the token to the maxiumum sentence length
    input_token_ids = pad_sequence(input_token_ids, batch_first=True, padding_value = tokenizer.pad_token_id)

    return input_token_ids

# def collate_fn(samples):
    
#     eng_samples = [items['en'] for items in samples]
#     fr_samples = [items['fr'] for items in samples]

#     batch = {}

#     for language, sample in {'en' : eng_samples, 'fr' : fr_samples}.items() :

#         sample = tokenizer.batch_encode_plus(sample)
#         batch[language] = preprocess_batch(sample)

#     # samples['fr'] = tokenizer.batch_encode_plus(samples['fr'])
#     return batch  

In [30]:
train_dataset = Data(train_data)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

val_dataset = Data(val_data)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=True)

test_dataset = Data(test_data)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=True)

data_loaders = {
    'train': train_dataloader,
    'val': val_dataloader,
    'test': test_dataloader
}

# Modeling

### Model

In [31]:
model = Transformers(
    n_layer = 6,
    n_heads = 8,
    d_model = 512, 
    d_ff = 2048,
    max_seq_len = 512,
    vocab_size = tokenizer.vocab_size,
    device = DEVICE
).to(DEVICE)

In [32]:
def generate(model, enc_output, tokenizer, max_seq_len, device):
    
    input_ids = torch.tensor([[tokenizer.bos_token_id] for _ in range(enc_output.size(0))]).to(device)

    unfinished_sequences = torch.ones(input_ids.size(0), 1).to(device)
    eos_token_id_tensor = torch.tensor([tokenizer.eos_token_id]).to(device)
    # logs('unfinished_sequences size: {}'.format(unfinished_sequences.size()), debug)

    sentence_length = input_ids.size(1)

    while sentence_length <= max_seq_len :

        x = model.embedding(input_ids)
        x = model.positonal_embedding(x)
        x = model.decoder(x, enc_output)
        next_token_logits = x[:, -1, :]
        
        # logs(f'next_token_logits size: {next_token_logits.size()}', debug)

        next_token_logits = F.softmax(next_token_logits, dim=1)
        next_token_indices = torch.argmax(next_token_logits, dim = 1)

        # logs(f'next_token_indices post softmax size: {next_token_indices.size()}', debug)

        # logs(f'next_token_indices * unfinished_sequences size: {(next_token_indices.unsqueeze(1) * unfinished_sequences).size()}', debug)

        # logs(f'tokenizer.pad_token_id * (1 - unfinished_sequences) size: {(tokenizer.pad_token_id * (1 - unfinished_sequences)).size()}', debug)

        next_token_indices = (
            next_token_indices.unsqueeze(1) * unfinished_sequences + tokenizer.pad_token_id * (1 - unfinished_sequences)
        )
        unfinished_sequences = unfinished_sequences.mul(
            next_token_indices.tile(
                eos_token_id_tensor.shape[0]
            ).ne(eos_token_id_tensor).prod(dim = 0)
        )

        if unfinished_sequences.max() == 0 :
            break

        # print(input_ids.size())
        # print(next_token_indices.size())

        input_ids = torch.cat(
            (
                input_ids, 
                next_token_indices
            ), 
            dim = 1).long()

        sentence_length += 1
    
        print(input_ids.size(1))

    return input_ids


### Train / val loop

In [33]:
debug = False

def train_model(model, data_loader, optimizer, criterion, device, epoch, mode = 'train') :

    EPOCH_LOSS = 0

    assert mode in ['train', 'val'], 'Mode should be either "train" or "val"'

    if mode == 'train' :
        model.train()
    elif mode == 'val' :
        model.eval()

    for i, rows in enumerate(data_loader[mode]) :

        try :

            ## preprocess batch for training
            en_token_ids = tokenizer.batch_encode_plus(rows['en'], add_special_tokens = False)
            fr_token_ids = tokenizer.batch_encode_plus(rows['fr'], add_special_tokens = False)
            encoder_inp = preprocess_batch(en_token_ids, type='input').to(device)
            logs(f'encoder_inp size : {encoder_inp.size()}', debug)
            decoder_inp = preprocess_batch(fr_token_ids, type='output').to(device)
            logs(f'decoder_inp size : {decoder_inp.size()}', debug)
            label = preprocess_batch(fr_token_ids, type='label').to(device)

            ## forward pass through the model
            attention_scores, output = model(encoder_inp, decoder_inp)

            ## calculate loss
            loss = criterion(output, label.reshape(-1))

            ## optimize model
            if mode == 'train' :
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            ## accumulate loss
            EPOCH_LOSS += loss.item()

            writer.add_scalar(f'{mode} loss', EPOCH_LOSS/(i + 1), epoch * len(data_loader[mode]) + i)

        except Exception as e :
            print(f'{epoch}_{i} | Exception : {e}')
        
        torch.cuda.empty_cache()

    return EPOCH_LOSS / len(data_loader[mode])

### Test loop

In [34]:
def test_model(model, valid_loader, device, tokenizer, max_seq_len) :

    EPOCH_LOSS = 0

    model.eval()

    for i, rows in enumerate(valid_loader) :

        ## preprocess batch for training
        en_token_ids = tokenizer.batch_encode_plus(rows['en'], add_special_tokens = False)
        fr_token_ids = tokenizer.batch_encode_plus(rows['fr'], add_special_tokens = False)
        encoder_inp = preprocess_batch(en_token_ids, type='input').to(device)
        decoder_inp = preprocess_batch(fr_token_ids, type='output').to(device)
        label = preprocess_batch(fr_token_ids, type='label').to(device)

        ## encode the input
        enc_output, attention_scores = model.encoder_pass(encoder_inp)

        input_tokens = generate(model, enc_output, tokenizer, max_seq_len)

### Optimizer and Loss Function

In [35]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = 1e-4)

### Model training

In [36]:
least_val_loss = 1000
MODEL_DIR = os.path.join(ROOT_DIR, 'model_weights')

In [37]:
start_epochs = 0
end_epochs = 100

for epochs in range(start_epochs, end_epochs) :

    print(f'Epochs {epochs + 1}')

    train_loss = train_model(model, data_loaders, optimizer, criterion, DEVICE, epochs, mode = 'train')
    val_loss = train_model(model, data_loaders, optimizer, criterion, DEVICE, epochs, mode = 'val')
    
    if val_loss < least_val_loss :
        try :
            least_val_loss = val_loss
            torch.save(
                model.state_dict(), 
                os.path.join(MODEL_DIR, f'transformer_ep-{epochs + 1}_val-loss-{val_loss:.4f}.pt')
            )
        except Exception as e :
            print(f'{epochs} | Problem in saving model\nException : {e}')

    print(f'Epochs : {epochs + 1} | Train Loss : {train_loss:.4f} | Val Loss : {val_loss:.4f}')
    print('----------------------------------------------------')

Epochs 1
0_0 | Exception : float division by zero


KeyboardInterrupt: 

In [36]:
test_sentence = ['my name is kong', 'this is aditya rustagi', 'i like to eat ice cream']
test_sentence_tokens = tokenizer.batch_encode_plus(test_sentence, add_special_tokens = False)
encoder_inp = preprocess_batch(test_sentence_tokens, type='input').to(DEVICE)

In [37]:
enc_output, attention_scores = model.encoder_pass(encoder_inp)
output_ids = generate(model, enc_output, tokenizer, max_seq_len, DEVICE)

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21


In [38]:
tokenizer.batch_decode(output_ids)

['[BOS] Æ \\ Â s [unused67] \\ | Â 5 å [unused87] å [unused87] 5 ̍ [unused87] 5 В Œ |',
 '[BOS] ť s Ό å σ å [unused94] å [unused94] å [unused94] å [unused94] å [unused94] | Â 5 å å',
 '[BOS] Æ s Ό Â \\ Â | Â Τ [unused97] | Â 5 5 В Œ | Â 5 5']