# Import

In [1]:
## Enable blocking CUDA calls to get a proper stack trace for exceptions raised while using the GPU

# from helper import cuda_blocking
# cuda_blocking()

In [2]:
## Standard Libraries

import gc
import os

In [3]:
## 3rd party libraries

import nltk
# nltk.download('wordnet')

import numpy as np

import spacy
from spacy.lang.en import English
# !python3 -m spacy download en_core_web_sm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as tdata
import torch.nn.functional as F

In [4]:
## My custom classes and functions from .py files

from path import Path
from utils import TokenizedDataset, train_epoch, dev_epoch, save_model, load_model
from helper import split_df, iter_batch, csv_data

from model import Transformer
from lang import load_lang_and_data, compute_lang_and_data, save_lang_and_data, compute_test

## Path object

In [5]:
## `paths` resolves the locations of input files and generated output files.
## `Path` is the project-local class imported from `path` above.
## NOTE(review): phase=4 and out_ver=3 appear to select the "phase4" and "output3"
## sub-directories listed in the cell output — confirm against path.py.

paths = Path('./AssignmentNLP', phase=4, out_ver=3)

I: Path "./AssignmentNLP/output3" already exist
I: Path "./AssignmentNLP/phase4" already exist
I: Path "./AssignmentNLP/phase4/answer" already exist
I: Path "./AssignmentNLP/output3/model" already exist


# Preprocessing

In [6]:
## Parse 'train.csv' (columns 'hindi' and 'english') and split it into a train set and a dev set.
## NOTE(review): split=0.8 is presumably the train fraction — confirm in helper.split_df.
## `train` and `dev` are re-assigned further down from the data returned by load_lang_and_data.

df = csv_data(paths('train.csv'), cols=['hindi', 'english'])
train, dev = split_df(df, split=0.8)

In [7]:
## The default spaCy pipeline has many components that slow down parsing.
## Hence, load only the components that are needed.
## NOTE(review): 'tok2vec' is disabled below; if the tagger in 'en_core_web_sm' depends on it,
## the POS tags used for lemmatization may degrade — confirm against the spaCy version in use.

en_sentencizer = English()                    # blank English pipeline; components are added explicitly
en_sentencizer.add_pipe('sentencizer')
en_lemmatizer = spacy.load('en_core_web_sm', disable=['tok2vec', 'parser', 'ner'])

In [8]:
## Compute, save and load helpers for:
##    - the Lang objects (hi_lang, en_lang),
##    - the split train and dev sets,
##    - the parsed tokens of the train and dev sets.
##
## The first two (commented-out) lines compute everything from scratch and save it;
## the active line loads the previously saved results instead.


# (train, dev), (hi_tkns, en_tkns), (dev_hi_tkns, dev_en_tkns), (hi_lang, en_lang) = compute_lang_and_data(train, dev, en_sentencizer, en_lemmatizer)
# save_lang_and_data(train, dev, hi_tkns, en_tkns, dev_hi_tkns, dev_en_tkns, hi_lang, en_lang, paths)
(train, dev), (hi_tkns, en_tkns), (dev_hi_tkns, dev_en_tkns), (hi_lang, en_lang) = load_lang_and_data(paths)

# Model

In [9]:
## Run configuration and model hyperparameters

# misc.
# model_load:    load saved parameters into the model before training
# model_save:    save the model after every epoch and once more after training
# data_parallel: wrap the model in nn.DataParallel (also forwarded to load_model / save_model)
model_load, model_save, data_parallel = False, True, True
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# hyperparameters
# Training schedule
num_epochs    = 20
batch_size    = 32
learning_rate = 1e-4

# Values forwarded to the Transformer constructor
nhead          = 8
embedding_dim  = 512
# Index of the "[PAD]" token in the Hindi (source) vocabulary
src_pad_idx    = hi_lang["[PAD]"]

# Vocabulary sizes are taken from the Lang objects
src_vocab_size     = len(hi_lang)
tgt_vocab_size     = len(en_lang)
num_encoder_layers = 6
num_decoder_layers = 6

# Maximum sequence lengths are the second dimension of the train token arrays
max_src_seq_len = hi_tkns.shape[1]
max_tgt_seq_len = en_tkns.shape[1]
dim_feedforward = 2048
dropout_p       = 0.10

In [10]:
## Create the train and dev DataLoaders

# Truncate the dev set so that it does not exceed the max sequence length of the train set.
# Slicing only shortens longer sequences; shorter ones are left as they are.
d_hi_tkns, d_en_tkns = dev_hi_tkns[:, :max_src_seq_len], dev_en_tkns[:, :max_tgt_seq_len]

# DataLoader
# NOTE(review): shuffle=True on the dev loader is likely unnecessary for evaluation — confirm
# that dev_epoch does not rely on it.
train_loader = tdata.DataLoader(dataset=TokenizedDataset(  hi_tkns,   en_tkns), batch_size=batch_size, shuffle=True, pin_memory=True)
dev_loader   = tdata.DataLoader(dataset=TokenizedDataset(d_hi_tkns, d_en_tkns), batch_size=batch_size, shuffle=True, pin_memory=True)

In [11]:
## In case objects from a previous run are still holding GPU memory, release them.

# Each name is checked on its own: a previous run may have failed after creating
# the model but before creating the optimizer or scheduler, and the model must
# still be released in that case.
if 'scheduler' in globals():
    del scheduler
    print('I: del scheduler')
if 'optimizer' in globals():
    del optimizer
    print('I: del optimizer')
if 'model' in globals():
    del model
    print('I: del model')

# Collect the now-unreferenced objects first, then release the cached CUDA blocks
gc.collect()
torch.cuda.empty_cache()

In [12]:
## Transformer model, objective and optimizer

model = Transformer(embedding_dim, src_pad_idx, nhead,
                src_vocab_size, tgt_vocab_size,
                num_encoder_layers, num_decoder_layers,
                max_src_seq_len, max_tgt_seq_len,
                dim_feedforward, dropout_p)

# Either restore saved parameters (load_model receives the data_parallel flag)
# or wrap the fresh model for multi-GPU use.
if model_load:
    model = load_model(model, paths('model.bkp', 10), data_parallel)
elif data_parallel:
    model = nn.DataParallel(model)

model = model.to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=10, verbose=True)

# The model produces logits over the target (English) vocabulary, so the padding
# index ignored by the loss must come from the target vocabulary, not the source one.
tgt_pad_idx = en_lang["[PAD]"]
criterion = nn.CrossEntropyLoss(ignore_index=tgt_pad_idx)

## Train

In [None]:
## Train the model, print the epoch loss for both the train and dev sets,
## and save the model parameters.

for epoch_id in range(num_epochs):
    # The scheduler is handed to train_epoch; it is not stepped anywhere in this cell.
    # NOTE(review): the meaning of `minibatch` (8 for train, 0 for dev) is defined in utils — confirm there.
    train_mean_loss = train_epoch(model, train_loader, optimizer, scheduler, criterion, device, paths, minibatch=8)
    # No gradients are needed while evaluating on the dev set
    with torch.no_grad():
        dev_mean_loss = dev_epoch(model, dev_loader, criterion, device, minibatch=0)
    print(f'I: Epoch - {epoch_id}\t\tTrainMeanLoss - {train_mean_loss:.3f}\t\tDevMeanLoss - {dev_mean_loss:.3f}')
    # Per-epoch checkpoint, keyed by the epoch id
    if model_save:
        save_model(model, paths('model.bkp', epoch_id), data_parallel)

# Final save after the last epoch
if model_save:
    save_model(model, paths('model'), data_parallel)

## Test

In [None]:
## Parse the test-phase csv (only the 'hindi' column) and compute its tokens
## using the Hindi Lang object.

test      = csv_data(paths('test.csv'), cols=['hindi'])
test_tkns = compute_test(test, hi_lang)

In [None]:
## Greedy strategy for next token prediction

def greedy_prediction(model, src, seq_len):
    """Greedily decode target token ids for a batch of source token sequences.

    Starting from a single '[SOS]' token per sequence, the model is called
    repeatedly with the source batch and the target generated so far; the
    arg-max token at the last position is appended each time. A sequence is
    finished (and dropped from the batch) as soon as it produces '[EOS]'.

    Relies on the notebook globals `device` and `en_lang`.

    Args:
        model: callable as ``model(src, tgt)``; its output is indexed as
            ``out.argmax(2)[:, -1]`` and it is switched to eval mode here.
        src: 2-D array/tensor of source token ids, one row per sentence.
        seq_len: maximum number of tokens to generate per sentence.

    Returns:
        A list with one entry per input row, in input order. Each entry is a
        list of token ids starting with '[SOS]'; it ends with '[EOS]' when the
        model produced one within ``seq_len`` steps.
    """
    model.eval()
    src_len = src.shape[0]
    predictions = [None for _ in range(src_len)]
    if src_len == 0:
        # Nothing to decode; avoid calling the model with an empty batch.
        return predictions

    sos_idx, eos_idx = en_lang['[SOS]'], en_lang['[EOS]']
    src = torch.as_tensor(src, dtype=torch.int64, device=device)
    tgt = torch.full((src_len, 1), sos_idx, dtype=torch.int64, device=device)
    # Original row position of every sequence that is still being decoded
    idx = torch.arange(src_len, dtype=torch.int64, device=device)

    for step in range(seq_len):
        print(f'{step:03d}/{seq_len}', end='\r')
        out = model(src, tgt)
        best_guesses = out.argmax(2)[:, -1]
        del out

        tgt = torch.cat((tgt, best_guesses.unsqueeze(1)), dim=1)

        # One vectorised comparison instead of a per-sequence Python loop
        finished = best_guesses == eos_idx
        if finished.any():
            for row, tkns in zip(idx[finished].tolist(), tgt[finished].tolist()):
                predictions[row] = tkns
            active = ~finished
            src, tgt, idx = src[active], tgt[active], idx[active]

        if idx.numel() == 0:
            break

    # Sequences that never produced '[EOS]' are returned as generated so far
    for row, tkns in zip(idx.tolist(), tgt.tolist()):
        predictions[row] = tkns
    return predictions

In [None]:
## Function that uses 'greedy_prediction' function and untokenizes the predicted tokens

def pred(model, tkns, seq_len):
    """Translate a batch of source token sequences into sentences.

    Runs `greedy_prediction` and turns each predicted list of token ids back
    into text via the notebook global `en_lang`: tokens after the leading
    '[SOS]' are looked up until the first '[EOS]', the first word is
    capitalized and the words are joined with single spaces.

    Args:
        model: forwarded to `greedy_prediction`.
        tkns: source token ids, forwarded to `greedy_prediction`.
        seq_len: maximum number of generated tokens, forwarded as well.

    Returns:
        A list of strings, one per input row. A prediction that holds no
        tokens before '[EOS]' yields an empty string.
    """
    sos_idx, eos_idx = en_lang['[SOS]'], en_lang['[EOS]']
    result = []
    for p in greedy_prediction(model, tkns, seq_len):
        assert p[0] == sos_idx
        words = []
        # Start right after '[SOS]' so that an immediate '[EOS]' is honoured too
        for tok in p[1:]:
            if tok == eos_idx:
                break
            words.append(en_lang[tok])
        if words:
            words[0] = words[0].capitalize()
        result.append(' '.join(words))
    return result

In [None]:
## Translate all the Hindi sentences of the test-phase csv, batch by batch.
## iter_batch yields (start, end) pairs that are used as slice bounds into test_tkns.
## NOTE(review): 100 is presumably the batch size — confirm in helper.iter_batch.

result = []
# Inference only: no gradients are needed
with torch.no_grad():
    for s,e in iter_batch(len(test_tkns), 100):
        p = pred(model, test_tkns[s:e], max_tgt_seq_len)
        result.extend(p)

In [None]:
## Write the translations to the answer file, one sentence per line

answer_text = '\n'.join(result)
with open(paths('answer.txt', 0), 'w') as outputfile:
    outputfile.write(answer_text)