In [None]:
!pip3 install sentencepiece



In [None]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import sentencepiece as spm

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch.autograd import Variable

import math

from tqdm import tqdm

# **Data**

## **Read Data**

### **Create Pairs**

In [None]:
def create_pairs(src_path, tgt_path):
  """Read a parallel corpus into a list of ``[source, target]`` sentence pairs.

  Line ``i`` of ``src_path`` is paired with line ``i`` of ``tgt_path``. Leading
  and trailing whitespace (including the newline) is stripped from both sides.

  Args:
    src_path: Path to the source-side text file, one sentence per line.
    tgt_path: Path to the target-side text file, one sentence per line.

  Returns:
    A list of two-element lists ``[source_sentence, target_sentence]``.

  Raises:
    ValueError: If the two files do not have the same number of lines, since
      the pairs would then be misaligned.
  """
  print("Reading lines...")

  # Explicit UTF-8 so non-ASCII text decodes the same way on every platform;
  # context managers so the handles are closed as soon as the lines are read.
  with open(src_path, encoding="utf-8") as src_file:
    src = src_file.readlines()
  with open(tgt_path, encoding="utf-8") as tgt_file:
    tgt = tgt_file.readlines()

  if len(src) != len(tgt):
    raise ValueError(
        f"Parallel files differ in length: {src_path} has {len(src)} lines, "
        f"{tgt_path} has {len(tgt)} lines")

  return [[s.strip(), t.strip()] for s, t in zip(src, tgt)]

### **Create Vocabulary**
Write the source and target training text files, then train subword tokenizers on them with Byte Pair Encoding (BPE) using SentencePiece.

In [None]:
def create_vocab(pairs, vocab_size=2000):
  """Train BPE SentencePiece models for both sides of the corpus and load them.

  The source sentences are written to ``src.txt`` and the target sentences to
  ``tgt.txt`` (one per line) in the current working directory. A SentencePiece
  BPE model is trained on each file (model prefixes ``s`` and ``t``) with
  pad/unk/bos/eos ids fixed to 0/1/2/3.

  Args:
    pairs: Sequence of ``[source_sentence, target_sentence]`` pairs.
    vocab_size: Vocabulary size passed to the SentencePiece trainer for both
      models. Defaults to 2000.

  Returns:
    A tuple ``(source_vocab, target_vocab)`` of loaded
    ``spm.SentencePieceProcessor`` objects.
  """
  corpora = (
      ("src.txt", "s", [pair[0] for pair in pairs]),
      ("tgt.txt", "t", [pair[1] for pair in pairs]),
  )

  vocabs = []
  for corpus_path, model_prefix, sentences in corpora:
    # Write as UTF-8 explicitly: the platform default encoding is not
    # guaranteed to be UTF-8, which is what the trainer reads.
    with open(corpus_path, 'w', encoding='utf-8') as f:
      f.writelines("%s\n" % sentence for sentence in sentences)

    spm.SentencePieceTrainer.train(
        f'--input={corpus_path} --model_prefix={model_prefix} --vocab_size={vocab_size} '
        '--model_type=bpe --normalization_rule_name=nmt_nfkc_cf '
        '--pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3')

    vocab = spm.SentencePieceProcessor()
    vocab.load(f'{model_prefix}.model')
    vocabs.append(vocab)

  source_vocab, target_vocab = vocabs
  return source_vocab, target_vocab


In [None]:
def prepare_data(src_path, tgt_path):
  """Load the sentence pairs for one data split and report how many were read.

  Args:
    src_path: Path to the source-side file.
    tgt_path: Path to the target-side file.

  Returns:
    Whatever ``create_pairs`` returns for the two paths.
  """
  pairs = create_pairs(src_path, tgt_path)
  print(f"Read {len(pairs)} sentence pairs")
  return pairs

In [None]:
# Source (.en) and target (.mr) file for each split.
train_src_path, train_tgt_path = "train.en", "train.mr"
valid_src_path, valid_tgt_path = "tun.en", "tun.mr"
test_src_path, test_tgt_path = "test.en", "test.mr"

# The subword vocabularies are trained on the training split only.
train_pairs = prepare_data(train_src_path, train_tgt_path)
source_vocab, target_vocab = create_vocab(train_pairs)

valid_pairs = prepare_data(valid_src_path, valid_tgt_path)
test_pairs = prepare_data(test_src_path, test_tgt_path)

Reading lines...
Read 46277 sentence pairs
Reading lines...
Read 500 sentence pairs
Reading lines...
Read 2000 sentence pairs


In [None]:
# Show every training pair whose source side has fewer than 7
# space-separated tokens.
for pair in train_pairs:
  if len(pair[0].split(" ")) < 7:
    print(pair)

### **Tokenizer**

In [None]:
def source_tokenizer(sentence):
  """Encode a source sentence to subword ids wrapped in BOS ... EOS.

  Args:
    sentence: Raw source-side text.

  Returns:
    A list of ids: ``source_vocab``'s BOS id, the encoded sentence, then its
    EOS id.
  """
  bos, eos = source_vocab.bos_id(), source_vocab.eos_id()
  return [bos, *source_vocab.encode_as_ids(sentence), eos]

def target_tokenizer(sentence):
  """Encode a target sentence to subword ids wrapped in BOS ... EOS.

  Args:
    sentence: Raw target-side text.

  Returns:
    A list of ids: ``target_vocab``'s BOS id, the encoded sentence, then its
    EOS id.
  """
  bos, eos = target_vocab.bos_id(), target_vocab.eos_id()
  return [bos, *target_vocab.encode_as_ids(sentence), eos]

## **Prepare Data**

In [None]:
def data_process(pairs):
  """Turn sentence pairs into ``(src, tgt)`` id tensors, dropping long examples.

  Args:
    pairs: Iterable of ``[source_sentence, target_sentence]`` pairs.

  Returns:
    A list of ``(src, tgt)`` tuples of 1-D ``torch.long`` tensors. A pair is
    kept only when both tensors (BOS/EOS included) have at most 100 ids.
  """
  def encode(pair):
    # Source is encoded before target, matching the order of the pair.
    src_ids = torch.tensor(source_tokenizer(pair[0]), dtype=torch.long)
    tgt_ids = torch.tensor(target_tokenizer(pair[1]), dtype=torch.long)
    return src_ids, tgt_ids

  encoded = (encode(pair) for pair in pairs)
  return [(src_ids, tgt_ids) for src_ids, tgt_ids in encoded
          if len(src_ids) <= 100 and len(tgt_ids) <= 100]

# Tensorise the three splits, in train / valid / test order.
train_data, valid_data, test_data = (
    data_process(split) for split in (train_pairs, valid_pairs, test_pairs)
)


In [None]:
# Padding id (taken from the target vocabulary) and mini-batch size.
PAD_IDX = target_vocab.pad_id()
BATCH_SIZE = 256


def generate_batch(pairs):
  """Collate ``(src, tgt)`` tensor pairs into two padded batch tensors.

  Args:
    pairs: List of ``(src, tgt)`` 1-D tensors, as supplied by the DataLoader.

  Returns:
    A tuple ``(src_batch, tgt_batch)``. Each is the result of ``pad_sequence``
    (default layout) over the respective side, padded with ``PAD_IDX``.
  """
  sources = [src for src, _ in pairs]
  targets = [tgt for _, tgt in pairs]

  return (pad_sequence(sources, padding_value=PAD_IDX),
          pad_sequence(targets, padding_value=PAD_IDX))

# Both loaders share the same batching options: shuffled mini-batches of
# BATCH_SIZE examples collated (padded) by generate_batch.
_loader_kwargs = dict(batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)
train_iter = DataLoader(train_data, **_loader_kwargs)
valid_iter = DataLoader(valid_data, **_loader_kwargs)


# **Model Architecture**

## **Encoder**

### **Encoder Model**

In [None]:
class Encoder(nn.Module):
  """Source-side encoder: token embedding, positional encoding, then a stack
  of ``nn.TransformerEncoderLayer`` blocks followed by a final LayerNorm.

  Args:
    embed_size: Embedding / model dimension.
    src_vocab_size: Number of rows in the source embedding table.
    src_pad_idx: Token id treated as padding when building the key mask.
    nheads: Number of attention heads per layer.
    nhid: Feed-forward hidden size of each layer.
    nencl: Number of encoder layers.
    dropout: Dropout probability for the positional encoding and the layers.
    max_len: Maximum sequence length passed to the positional encoding.
    device: Device the embedded input and padding mask are moved to.
  """

  def __init__(self, embed_size, src_vocab_size,
               src_pad_idx,
               nheads, nhid, nencl,
               dropout, max_len, device):
    super().__init__()
    self.device = device
    self.src_pad_idx = src_pad_idx
    self.embed_size = embed_size

    # Embedding table, initialised uniformly in [-0.1, 0.1].
    self.src_word_embed = nn.Embedding(src_vocab_size, embed_size)
    self.src_word_embed.weight.data.uniform_(-0.1, 0.1)

    self.src_pos_enc = PositionalEncoding(embed_size, dropout, max_len)

    layer = nn.TransformerEncoderLayer(
        d_model=embed_size, nhead=nheads, dim_feedforward=nhid, dropout=dropout)
    self.transformer_encoder = nn.TransformerEncoder(
        layer, nencl, nn.LayerNorm(embed_size))

  def forward(self, src):
    """Encode a batch of source token ids.

    Args:
      src: Token-id tensor. The mask is built from ``src.transpose(0, 1)``,
        so this assumes a (seq_len, batch) layout — TODO confirm with callers.

    Returns:
      The output of the Transformer encoder stack.
    """
    # True where the token is padding; such keys are ignored by attention.
    pad_mask = (src.transpose(0, 1) == self.src_pad_idx).to(self.device)

    # Scale embeddings by sqrt(embed_size) before adding positions.
    embedded = self.src_word_embed(src) * math.sqrt(self.embed_size)
    encoder_input = self.src_pos_enc(embedded).to(self.device)

    return self.transformer_encoder(encoder_input, src_key_padding_mask=pad_mask)


### **Positional Encoding**

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to an input, then apply dropout.

    Even feature columns hold ``sin(position * div_term)`` and odd columns hold
    ``cos(position * div_term)``. The table is stored as the buffer ``pe`` with
    shape ``(max_len, 1, d_model)``.

    Args:
        d_model: Feature dimension of the input. May be odd.
        dropout: Dropout probability applied after the addition.
        max_len: Number of positions precomputed in the table.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim the angles to the number of cosine slots actually present.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:x.size(0)])``.

        Args:
            x: Input whose first dimension indexes positions; the table is
                sliced to ``x.size(0)`` rows and broadcast over dimension 1.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

## **Decoder**

### **Decoder Model**

In [None]:
class Decoder(nn.Module):
  """Target-side decoder: token embedding, positional encoding, then a stack
  of ``nn.TransformerDecoderLayer`` blocks followed by a final LayerNorm.

  Args:
    embed_size: Embedding / model dimension.
    tgt_vocab_size: Number of rows in the target embedding table.
    tgt_pad_idx: Accepted for signature symmetry with the encoder; not used.
    nheads: Number of attention heads per layer.
    nhid: Feed-forward hidden size of each layer.
    ndecl: Number of decoder layers.
    dropout: Dropout probability for the positional encoding and the layers.
    max_len: Maximum sequence length passed to the positional encoding.
    device: Device the embedded target and the causal mask are placed on.
  """

  def __init__(self, embed_size, tgt_vocab_size, 
               tgt_pad_idx,
               nheads, nhid, ndecl,
               dropout, max_len, device):
    super(Decoder, self).__init__()
    self.embed_size = embed_size
    self.device = device

    self.tgt_word_embed = nn.Embedding(tgt_vocab_size, embed_size)
    initrange = 0.1
    self.tgt_word_embed.weight.data.uniform_(-initrange, initrange)

    self.tgt_pos_enc = PositionalEncoding(embed_size, dropout, max_len)

    decoder_layer = nn.TransformerDecoderLayer(d_model=embed_size, nhead=nheads, dim_feedforward=nhid, dropout=dropout)

    decoder_norm = nn.LayerNorm(embed_size)

    self.transformer_decoder = nn.TransformerDecoder(decoder_layer, ndecl, decoder_norm)

  def forward(self, tgt, memory):
    """Decode target token ids against the encoder output.

    Args:
      tgt: Target token-id tensor; its first dimension is used as the
        sequence length for the causal mask.
      memory: Encoder output passed to the decoder layers.

    Returns:
      The output of the Transformer decoder stack.
    """
    tgt_seq_len = tgt.size(0)

    tgt = self.tgt_word_embed(tgt) * math.sqrt(self.embed_size)
    tgt = self.tgt_pos_enc(tgt)

    # Boolean causal mask: True strictly above the diagonal marks the future
    # positions each step is not allowed to attend to.
    tgt_mask = torch.triu(
        torch.ones(tgt_seq_len, tgt_seq_len, device=self.device), diagonal=1).bool()

    # Use the device given to the constructor, not a notebook-level global.
    tgt = tgt.to(self.device)

    # NOTE(review): no memory_key_padding_mask is passed, so cross-attention
    # can also attend to padded source positions.
    return self.transformer_decoder(tgt=tgt, memory=memory, tgt_mask=tgt_mask)

## **Transformer**

In [None]:
class Transformer(nn.Module):
  """Encoder-decoder wrapper with a bias-free linear projection to the
  target vocabulary.

  Args:
    embed_size: Feature size of the decoder output.
    tgt_vocab_size: Output size of the final projection.
    encoder: Callable applied to the source batch.
    decoder: Callable applied to ``(tgt, encoder_output)``.
  """

  def __init__(self, embed_size, tgt_vocab_size,
               encoder, decoder):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder

    self.fc_out = nn.Linear(embed_size, tgt_vocab_size, bias=False)

  def forward(self, src, tgt):
    """Return the vocabulary projection of the decoder output for ``(src, tgt)``."""
    memory = self.encoder(src)
    return self.fc_out(self.decoder(tgt, memory))

# **Making Model**

In [None]:
# device: use the first GPU when CUDA is available, otherwise fall back to the
# CPU so the notebook can still run on a machine without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Training Values
num_epochs = 500
# NOTE(review): get_std_opt builds Adam with lr=0 and a hard-coded warmup of
# 4000, so `learning_rate` here and `warmup` below are not read by it.
learning_rate = 1e-4

# Vocabulary Sizes (match the --vocab_size used when training the tokenizers)
src_vocab_size = 2000
tgt_vocab_size = 2000

# Transformer Values
embed_size = 512
nheads = 8
nhid = 512
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.1
MAX_LEN = 100
warmup = 4000

# Pad Indexes
src_pad_idx = PAD_IDX
tgt_pad_idx = PAD_IDX

In [None]:
# Assemble the encoder, decoder and the wrapping Transformer from the
# hyper-parameters defined above.
encoder = Encoder(
    embed_size=embed_size, src_vocab_size=src_vocab_size,
    src_pad_idx=src_pad_idx,
    nheads=nheads, nhid=nhid, nencl=num_encoder_layers,
    dropout=dropout, max_len=MAX_LEN, device=device,
)

decoder = Decoder(
    embed_size=embed_size, tgt_vocab_size=tgt_vocab_size,
    tgt_pad_idx=tgt_pad_idx,
    nheads=nheads, nhid=nhid, ndecl=num_decoder_layers,
    dropout=dropout, max_len=MAX_LEN, device=device,
)

transformer = Transformer(
    embed_size=embed_size, tgt_vocab_size=tgt_vocab_size,
    encoder=encoder, decoder=decoder,
)

# `model` is the name used by the rest of the notebook.
model = transformer

In [None]:
# Xavier-uniform initialisation for every parameter with more than one
# dimension; 1-D parameters (biases, norm weights) are left untouched.
for weight in (p for p in model.parameters() if p.dim() > 1):
  nn.init.xavier_uniform_(weight)


In [None]:
# Count all parameters, and separately those that receive gradients.
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total number of Model Parameters: {total_params}")
# The second line reports the trainable subset, so label it as such.
print(f"Total number of Trainable Model Parameters: {trainable_params}")

Total number of Model Parameters: 15696896
Total number of Model Parameters: 15696896


## **Noam Optimizer**

In [None]:
class NoamOpt:
    """Optimizer wrapper that sets the learning rate on every step.

    The rate at step ``s`` is
    ``factor * model_size ** -0.5 * min(s ** -0.5, s * warmup ** -1.5)``.

    Args:
        model_size: Model dimension used in the rate formula.
        factor: Constant multiplier applied to the rate.
        warmup: Warmup step count used in the rate formula.
        optimizer: Wrapped optimizer; must expose ``param_groups`` and ``step()``.
    """
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0
        
    def step(self):
        "Advance the step counter, write the new rate into every param group, then step the optimizer."
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()
        
    def rate(self, step = None):
        """Return the learning rate for ``step`` (defaults to the current step).

        Returns 0.0 for ``step <= 0``: the formula would otherwise raise zero
        to a negative power, and its linear-warmup term is 0 at step 0.
        """
        if step is None:
            step = self._step
        if step <= 0:
            return 0.0
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))
        
def get_std_opt(model):
    """Build the notebook's standard optimizer for ``model``.

    Returns a ``NoamOpt`` (model size ``embed_size``, factor 2, warmup 4000)
    wrapping Adam with ``lr=0``, ``betas=(0.9, 0.98)`` and ``eps=1e-9``.
    """
    adam = torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
    return NoamOpt(embed_size, 2, 4000, adam)

In [None]:
# Move the model to the target device before the optimizer is built over
# its parameters.
model.to(device)

# Padding positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = get_std_opt(model)


# **Translation**

In [None]:
def translate(model, sentence):
  """Greedily decode a translation of ``sentence`` with ``model``.

  The source is tokenized with ``source_tokenizer``. Decoding starts from the
  target BOS id and appends the arg-max token of the last position for at most
  ``MAX_LEN`` steps, stopping early once the target EOS id is produced. The
  generated id list is printed before being decoded to text.

  Args:
    model: Callable taking ``(source, target)`` id tensors and returning scores
      whose third dimension is arg-maxed to pick a token.
    sentence: Raw source-side text.

  Returns:
    The decoded string, or ``""`` when the tokenized source has more than
    100 ids.
  """
  source = torch.tensor(source_tokenizer(sentence), dtype=torch.long)
  if len(source) > 100:
    return ""

  # Add a batch dimension of size 1 as the second axis.
  source = source.unsqueeze(1).to(device)
  eos = target_vocab.eos_id()
  outputs = [target_vocab.bos_id()]

  for _ in range(MAX_LEN):
    target = torch.tensor(outputs, dtype=torch.long).unsqueeze(1).to(device)

    with torch.no_grad():
      scores = model(source, target)

    # Most likely token at the last decoded position.
    next_id = scores.argmax(2)[-1, :].item()
    outputs.append(next_id)

    if next_id == eos:
      break

  print(outputs)
  return target_vocab.decode_ids(outputs)

In [None]:
# Sanity check: translate one fixed sentence with the model in eval mode
# (dropout disabled).
model.eval()
sentence = "Digestion becomes fast from walk"
translated_sentence = translate(model, sentence)
print(translated_sentence)

[2, 815, 59, 187, 1279, 63, 1553, 1279, 1913, 1437, 1254, 148, 18, 279, 217, 187, 1279, 1913, 1437, 1254, 148, 47, 18, 279, 217, 1745, 1279, 1913, 1437, 1553, 1279, 1913, 1437, 1553, 1279, 1913, 1437, 1553, 1279, 1913, 1437, 1553, 1279, 1914, 1706, 1553, 1279, 1913, 1437, 1553, 1279, 1914, 1706, 1553, 1279, 1913, 1437, 1553, 1279, 1914, 1706, 561, 20, 3]
फिरण्यापासून पचना लवकर पचनक्रिया तीव्र होते अक्रोडपासून पचनक्रिया तीव्र होते आणि अक्रोड वेगाने पचनक्रिया लवकर पचनक्रिया लवकर पचनक्रिया लवकर पचनक्रिया लवकर पचवढी लवकर पचनक्रिया लवकर पचवढी लवकर पचनक्रिया लवकर पचवढी येते .


# **Training**

In [None]:
# Fixed probe sentence, translated at the start of every epoch to eyeball progress.
sentence = "Digestion becomes fast from walk"
for epoch in range(num_epochs):
  total_loss = 0
  num_batches = 0

  print(f"Epoch {epoch} / {num_epochs}")
  # TODO: checkpointing (saving model and optimizer state) was sketched here
  # but is not implemented.

  # Translate the probe sentence with dropout disabled, then return to train mode.
  model.eval()
  translated_sentence = translate(model, sentence)

  print(translated_sentence)

  model.train()

  for batch_idx, (source, target) in enumerate(train_iter):
    if batch_idx%200 == 1:
      print(batch_idx)

    source = source.to(device)
    target = target.to(device)

    # Forward: the decoder is fed the target without its last position ...
    output = model(source, target[:-1])

    # ... and is scored against the target shifted by one position.
    output = output.reshape(-1, output.shape[2])
    target = target[1:].reshape(-1)

    loss = criterion(output, target)
    total_loss += loss.item()
    num_batches += 1

    loss.backward()

    # Clip after backward() and before step(): clipping after the gradients
    # have been applied and zeroed has no effect on the update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

    optimizer.step()
    optimizer.optimizer.zero_grad()

  # Each batch loss is already a mean over its non-padding tokens, so the
  # epoch summary is the mean of those per-batch losses.
  print(total_loss)
  print(total_loss / max(num_batches, 1))


In [None]:
# Scratch check: slicing with [:-1] drops the last row along the first
# dimension (the same slicing applied to `target` during training).
a = torch.tensor([[1, 2, 3], [4, 5, 6]])
a[:-1]

tensor([[1, 2, 3]])

# **Evaluation**

In [None]:
# Path to test files
# Read as UTF-8 inside context managers so the handles are closed, and strip
# the trailing newline from every line.
with open("test.mr", 'r', encoding="utf-8") as ref_file:
  ref = [line.strip() for line in ref_file]
with open("test.en", 'r', encoding="utf-8") as src_file:
  src = [line.strip() for line in src_file]

# The training loop leaves the model in train mode; switch to eval mode so
# dropout is disabled while decoding.
model.eval()

hyp = []
refs = []
for i in range(len(src)):
  if i % 100 == 0:
    print(i)
  h = translate(model, src[i])
  # translate() returns "" for over-long sources; skip those and keep
  # `refs` aligned index-for-index with `hyp`.
  if h != "":
    refs.append(ref[i])
    hyp.append(h)

In [None]:
!pip3 install sacrebleu

In [None]:
import sacrebleu
# Score against `refs` (the references kept in step with `hyp`), not the
# unfiltered `ref` list, which is misaligned whenever a sentence was skipped.
bleu = sacrebleu.corpus_bleu(hyp, [refs])
print(bleu.score)