In [1]:
import torch
import numpy as np
import math
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TORCH_USE_CUDA_DSA'] = '1'

In [33]:
def get_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    if torch.cuda.is_available():
        return torch.device('cuda')
    return torch.device('cpu')

def scaled_dot_product(q, k, v, mask=None):
    """Scaled dot-product attention.

    Args:
        q, k, v: query, key and value tensors. The attention modules in this
            notebook pass 4-D tensors laid out as (batch, heads, seq, head_dim).
        mask: optional additive mask (0 keeps a position, a large negative
            value hides it). It is added while the first two axes of the
            scores are swapped, so a (batch, seq, seq) mask is broadcast
            over the heads axis.

    Returns:
        A ``(values, attention)`` pair: the attention-weighted values and the
        softmax-normalised attention weights.
    """
    head_dim = q.shape[-1]
    scores = (q @ k.transpose(-2, -1)) / math.sqrt(head_dim)
    if mask is not None:
        # (batch, heads, q, k) -> (heads, batch, q, k) lines the batch axis up
        # with the mask's leading axis; the second permute restores the layout.
        scores = (scores.permute(1, 0, 2, 3) + mask).permute(1, 0, 2, 3)
    weights = torch.softmax(scores, dim=-1)
    return weights @ v, weights

class PositionalEncoding(nn.Module):
    """Sinusoidal positional-encoding table.

    Column ``2i`` holds ``sin(pos / 10000^(2i / d_model))`` and column
    ``2i + 1`` holds the matching cosine. The module has no parameters; the
    table is rebuilt on every call.

    Args:
        d_model: width of each encoding vector.
        max_sequence_length: number of positions (rows) in the table.
    """
    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

    def forward(self):
        """Return the ``(max_sequence_length, d_model)`` encoding table."""
        even_i = torch.arange(0, self.d_model, 2).float()
        denominator = torch.pow(10000, even_i/self.d_model)
        position = (torch.arange(self.max_sequence_length)
                          .reshape(self.max_sequence_length, 1))
        even_PE = torch.sin(position / denominator)
        odd_PE = torch.cos(position / denominator)
        # Interleave sin/cos: (positions, pairs, 2) -> (positions, 2 * pairs).
        stacked = torch.stack([even_PE, odd_PE], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)
        # With an odd d_model the interleave has d_model + 1 columns; drop the
        # surplus cosine column so the table always matches the embedding
        # width. For an even d_model this slice keeps every column.
        return PE[:, :self.d_model]

class SentenceEmbedding(nn.Module):
    """Character-level tokenizer and embedding for a batch of sentences.

    Each sentence is split into single characters, mapped to indices through
    ``language_to_index``, optionally wrapped in start/end tokens, padded to
    ``max_sequence_length`` and embedded. A positional encoding is added and
    dropout (p=0.1) is applied.

    Args:
        max_sequence_length: fixed token length every sentence is padded to.
        d_model: embedding width.
        language_to_index: mapping from token string to integer index.
        START_TOKEN, END_TOKEN, PADDING_TOKEN: the special token strings;
            each must be a key of ``language_to_index`` when it is used.
    """
    def __init__(self, max_sequence_length, d_model, language_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN):
        super().__init__()
        self.vocab_size = len(language_to_index)
        self.max_sequence_length = max_sequence_length
        self.embedding = nn.Embedding(self.vocab_size, d_model)
        self.language_to_index = language_to_index
        self.position_encoder = PositionalEncoding(d_model, max_sequence_length)
        self.dropout = nn.Dropout(p=0.1)
        self.START_TOKEN = START_TOKEN
        self.END_TOKEN = END_TOKEN
        self.PADDING_TOKEN = PADDING_TOKEN

    def batch_tokenize(self, batch, start_token, end_token):
        """Convert a batch of sentences into a padded index tensor.

        Args:
            batch: sequence of sentences (strings).
            start_token: when truthy, prepend the START token index.
            end_token: when truthy, append the END token index.

        Returns:
            Integer tensor of shape ``(len(batch), max_sequence_length)`` on
            the device that holds the embedding weights.

        Raises:
            KeyError: a character is missing from ``language_to_index``.
            ValueError: a sentence, including its special tokens, is longer
                than ``max_sequence_length``.
        """

        def tokenize(sentence):
            indices = [self.language_to_index[token] for token in sentence]
            if start_token:
                indices.insert(0, self.language_to_index[self.START_TOKEN])
            if end_token:
                indices.append(self.language_to_index[self.END_TOKEN])
            missing = self.max_sequence_length - len(indices)
            if missing < 0:
                # Fail here with a clear message instead of a shape mismatch
                # later in torch.stack or in the positional-encoding add.
                raise ValueError(
                    f"sentence needs {len(indices)} tokens but max_sequence_length "
                    f"is {self.max_sequence_length}"
                )
            if missing:
                indices.extend([self.language_to_index[self.PADDING_TOKEN]] * missing)
            return torch.tensor(indices)

        tokenized = torch.stack([tokenize(sentence) for sentence in batch])
        # Follow the module's own device rather than "CUDA if available":
        # a model kept on the CPU of a CUDA machine would otherwise receive
        # CUDA indices and fail inside the embedding lookup.
        return tokenized.to(self.embedding.weight.device)

    def forward(self, x, start_token, end_token): # sentence
        """Tokenize, embed and position-encode a batch of sentences.

        Returns a ``(batch, max_sequence_length, d_model)`` tensor.
        """
        x = self.batch_tokenize(x, start_token, end_token)
        x = self.embedding(x)
        # Keep the positional table on the same device as the embeddings.
        pos = self.position_encoder().to(x.device)
        x = self.dropout(x + pos)
        return x


class MultiHeadAttention(nn.Module):
    """Multi-head self-attention with a single fused Q/K/V projection.

    Args:
        d_model: input and output feature width.
        num_heads: number of attention heads; each head works on
            ``d_model // num_heads`` features.
    """
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.qkv_layer = nn.Linear(d_model , 3 * d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, mask):
        """Attend over ``x`` of shape (batch, seq, d_model); returns the same shape."""
        n_batch, n_tokens, _ = x.size()
        # Project once, then give every head its own 3 * head_dim slice.
        fused = self.qkv_layer(x).reshape(n_batch, n_tokens, self.num_heads, 3 * self.head_dim)
        q, k, v = fused.permute(0, 2, 1, 3).chunk(3, dim=-1)
        values, _ = scaled_dot_product(q, k, v, mask)
        # (batch, heads, seq, head_dim) -> (batch, seq, heads * head_dim)
        merged = values.permute(0, 2, 1, 3).reshape(n_batch, n_tokens, self.num_heads * self.head_dim)
        return self.linear_layer(merged)


class LayerNormalization(nn.Module):
    """Layer normalisation over the trailing ``len(parameters_shape)`` axes.

    Args:
        parameters_shape: shape of the learnable scale (``gamma``, initialised
            to ones) and shift (``beta``, initialised to zeros).
        eps: constant added to the variance before the square root.
    """
    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        self.parameters_shape = parameters_shape
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta = nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, inputs):
        """Normalise ``inputs`` to zero mean / unit variance, then scale and shift."""
        # -1, -2, ... : one axis per entry of parameters_shape.
        trailing = list(range(-1, -len(self.parameters_shape) - 1, -1))
        mean = inputs.mean(dim=trailing, keepdim=True)
        centered = inputs - mean
        variance = centered.pow(2).mean(dim=trailing, keepdim=True)
        normalized = centered / torch.sqrt(variance + self.eps)
        return self.gamma * normalized + self.beta


class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: input and output feature width.
        hidden: width of the intermediate layer.
        drop_prob: dropout probability applied after the ReLU.
    """
    def __init__(self, d_model, hidden, drop_prob=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """Apply the block to the last axis of ``x``; the shape is preserved."""
        expanded = self.dropout(self.relu(self.linear1(x)))
        return self.linear2(expanded)


class EncoderLayer(nn.Module):
    """One encoder layer: self-attention and feed-forward sub-layers.

    Each sub-layer is followed by dropout, a residual add and a layer norm
    (post-norm arrangement).
    """
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super().__init__()
        self.attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm1 = LayerNormalization(parameters_shape=[d_model])
        self.dropout1 = nn.Dropout(p=drop_prob)
        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = LayerNormalization(parameters_shape=[d_model])
        self.dropout2 = nn.Dropout(p=drop_prob)

    def forward(self, x, self_attention_mask):
        """Run both sub-layers on ``x``; the output has the same shape as ``x``."""
        # Sub-layer 1: masked self-attention with a residual connection.
        skip = x.clone()
        attended = self.dropout1(self.attention(x, mask=self_attention_mask))
        x = self.norm1(attended + skip)

        # Sub-layer 2: position-wise feed-forward with a residual connection.
        skip = x.clone()
        transformed = self.dropout2(self.ffn(x))
        return self.norm2(transformed + skip)

class SequentialEncoder(nn.Sequential):
    """``nn.Sequential`` variant that hands the same mask to every layer."""
    def forward(self, *inputs):
        """Expect exactly ``(x, self_attention_mask)`` and thread ``x`` through the layers."""
        hidden, mask = inputs
        for layer in self:
            hidden = layer(hidden, mask)
        return hidden

class Encoder(nn.Module):
    """Sentence embedding followed by a stack of ``num_layers`` encoder layers."""
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob, num_layers,
                 max_sequence_length, language_to_index,
                 START_TOKEN, END_TOKEN, PADDING_TOKEN):
        super().__init__()
        self.sentence_embedding = SentenceEmbedding(
            max_sequence_length, d_model, language_to_index,
            START_TOKEN, END_TOKEN, PADDING_TOKEN)
        encoder_layers = [
            EncoderLayer(d_model, ffn_hidden, num_heads, drop_prob)
            for _ in range(num_layers)
        ]
        self.layers = SequentialEncoder(*encoder_layers)

    def forward(self, x, self_attention_mask, start_token, end_token):
        """Embed the raw sentences in ``x`` and encode them.

        ``start_token`` / ``end_token`` are forwarded to the sentence
        embedding and control whether those tokens are added.
        """
        embedded = self.sentence_embedding(x, start_token, end_token)
        return self.layers(embedded, self_attention_mask)


class MultiHeadCrossAttention(nn.Module):
    """Multi-head encoder-decoder attention.

    Keys and values are projected from the encoder output, queries from the
    decoder state.

    Args:
        d_model: feature width of both inputs and of the output.
        num_heads: number of attention heads; each head works on
            ``d_model // num_heads`` features.
    """
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.kv_layer = nn.Linear(d_model , 2 * d_model)
        self.q_layer = nn.Linear(d_model , d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, y, mask):
        """Attend from the decoder state ``y`` to the encoder output ``x``.

        Args:
            x: encoder output, shape (batch, source_len, d_model).
            y: decoder state, shape (batch, target_len, d_model).
            mask: optional additive mask passed straight to
                ``scaled_dot_product``; it must broadcast against
                (batch, target_len, source_len).

        Returns:
            Tensor of shape (batch, target_len, d_model).
        """
        batch_size, source_length, d_model = x.size()
        # Queries follow the decoder's length. This notebook pads both sides
        # to the same length, but taking the length from y keeps the layer
        # correct when the two differ.
        target_length = y.size(1)
        kv = self.kv_layer(x)
        q = self.q_layer(y)
        kv = kv.reshape(batch_size, source_length, self.num_heads, 2 * self.head_dim)
        q = q.reshape(batch_size, target_length, self.num_heads, self.head_dim)
        kv = kv.permute(0, 2, 1, 3)
        q = q.permute(0, 2, 1, 3)
        k, v = kv.chunk(2, dim=-1)
        # The mask is applied here; pass None to attend without masking.
        values, attention = scaled_dot_product(q, k, v, mask)
        values = values.permute(0, 2, 1, 3).reshape(batch_size, target_length, d_model)
        out = self.linear_layer(values)
        return out


class DecoderLayer(nn.Module):
    """One decoder layer: self-attention, cross-attention and feed-forward.

    Every sub-layer is followed by dropout, a residual add and a layer norm.
    """
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super().__init__()
        self.self_attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.layer_norm1 = LayerNormalization(parameters_shape=[d_model])
        self.dropout1 = nn.Dropout(p=drop_prob)

        self.encoder_decoder_attention = MultiHeadCrossAttention(d_model=d_model, num_heads=num_heads)
        self.layer_norm2 = LayerNormalization(parameters_shape=[d_model])
        self.dropout2 = nn.Dropout(p=drop_prob)

        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.layer_norm3 = LayerNormalization(parameters_shape=[d_model])
        self.dropout3 = nn.Dropout(p=drop_prob)

    def forward(self, x, y, self_attention_mask, cross_attention_mask):
        """Update the decoder state ``y`` using the encoder output ``x``."""
        # Sub-layer 1: masked self-attention over the decoder state.
        skip = y.clone()
        y = self.dropout1(self.self_attention(y, mask=self_attention_mask))
        y = self.layer_norm1(y + skip)

        # Sub-layer 2: attention from the decoder state to the encoder output.
        skip = y.clone()
        y = self.dropout2(self.encoder_decoder_attention(x, y, mask=cross_attention_mask))
        y = self.layer_norm2(y + skip)

        # Sub-layer 3: position-wise feed-forward.
        skip = y.clone()
        y = self.dropout3(self.ffn(y))
        return self.layer_norm3(y + skip)


class SequentialDecoder(nn.Sequential):
    """``nn.Sequential`` variant for decoder layers.

    The encoder output and both masks are passed unchanged to every layer;
    only the decoder state is threaded from one layer to the next.
    """
    def forward(self, *inputs):
        """Expect exactly ``(x, y, self_attention_mask, cross_attention_mask)``."""
        encoder_out, state, self_mask, cross_mask = inputs
        for layer in self:
            state = layer(encoder_out, state, self_mask, cross_mask)
        return state

class Decoder(nn.Module):
    """Target-sentence embedding followed by a stack of ``num_layers`` decoder layers."""
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob, num_layers,
                 max_sequence_length, language_to_index,
                 START_TOKEN, END_TOKEN, PADDING_TOKEN):
        super().__init__()
        self.sentence_embedding = SentenceEmbedding(
            max_sequence_length, d_model, language_to_index,
            START_TOKEN, END_TOKEN, PADDING_TOKEN)
        decoder_layers = [
            DecoderLayer(d_model, ffn_hidden, num_heads, drop_prob)
            for _ in range(num_layers)
        ]
        self.layers = SequentialDecoder(*decoder_layers)

    def forward(self, x, y, self_attention_mask, cross_attention_mask, start_token, end_token):
        """Embed the raw target sentences ``y`` and decode against the encoder output ``x``."""
        embedded = self.sentence_embedding(y, start_token, end_token)
        return self.layers(x, embedded, self_attention_mask, cross_attention_mask)


class Transformer(nn.Module):
    """Encoder-decoder transformer mapping English sentences to Marathi logits.

    Args:
        d_model, ffn_hidden, num_heads, drop_prob, num_layers: sizes shared by
            the encoder and the decoder.
        max_sequence_length: fixed token length of every padded sentence.
        mr_vocab_size: size of the output (Marathi) vocabulary.
        english_to_index / marathi_to_index: token-to-index maps for the
            encoder and the decoder respectively.
        START_TOKEN, END_TOKEN, PADDING_TOKEN: the special token strings.
    """
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob, num_layers,
                 max_sequence_length, mr_vocab_size,
                 english_to_index, marathi_to_index,
                 START_TOKEN, END_TOKEN, PADDING_TOKEN):
        super().__init__()
        self.encoder = Encoder(
            d_model, ffn_hidden, num_heads, drop_prob, num_layers, max_sequence_length,
            english_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.decoder = Decoder(
            d_model, ffn_hidden, num_heads, drop_prob, num_layers, max_sequence_length,
            marathi_to_index, START_TOKEN, END_TOKEN, PADDING_TOKEN)
        self.linear = nn.Linear(d_model, mr_vocab_size)
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')

    def forward(self,
                x,
                y,
                encoder_self_attention_mask=None,
                decoder_self_attention_mask=None,
                decoder_cross_attention_mask=None,
                enc_start_token=False,
                enc_end_token=False,
                dec_start_token=True,
                dec_end_token=True):
        """Translate a batch of sentence pairs into per-position vocabulary logits.

        ``x`` and ``y`` are batches of raw source/target sentences. The
        ``*_start_token`` / ``*_end_token`` flags choose whether those tokens
        are added during tokenization.

        Returns a ``(batch, max_sequence_length, mr_vocab_size)`` tensor of
        raw logits; no softmax is applied.
        """
        encoded = self.encoder(x, encoder_self_attention_mask,
                               start_token=enc_start_token, end_token=enc_end_token)
        decoded = self.decoder(encoded, y, decoder_self_attention_mask, decoder_cross_attention_mask,
                               start_token=dec_start_token, end_token=dec_end_token)
        return self.linear(decoded)

In [4]:
from google.colab import files

# files.upload() returns a dict of {filename: file bytes}. Bind it to a name
# so the notebook does not echo it as the cell result: echoing kaggle.json
# writes the API key into the saved notebook output.
uploaded_files = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [5]:
# -p keeps this cell re-runnable: plain mkdir errors when ~/.kaggle already exists.
! mkdir -p ~/.kaggle

! cp kaggle.json ~/.kaggle/

In [6]:
! chmod 600 ~/.kaggle/kaggle.json

In [7]:
! kaggle datasets download mathurinache/samanantar

Dataset URL: https://www.kaggle.com/datasets/mathurinache/samanantar
License(s): CC-BY-NC-SA-4.0
Downloading samanantar.zip to /content
100% 3.44G/3.44G [03:36<00:00, 18.4MB/s]
100% 3.44G/3.44G [03:36<00:00, 17.1MB/s]


In [8]:
! unzip samanantar.zip

Archive:  samanantar.zip
  inflating: final_data/en-as/train.as  
  inflating: final_data/en-as/train.en  
  inflating: final_data/en-bn/train.bn  
  inflating: final_data/en-bn/train.en  
  inflating: final_data/en-gu/train.en  
  inflating: final_data/en-gu/train.gu  
  inflating: final_data/en-hi/train.en  
  inflating: final_data/en-hi/train.hi  
  inflating: final_data/en-kn/train.en  
  inflating: final_data/en-kn/train.kn  
  inflating: final_data/en-ml/train.en  
  inflating: final_data/en-ml/train.ml  
  inflating: final_data/en-mr/train.en  
  inflating: final_data/en-mr/train.mr  
  inflating: final_data/en-or/train.en  
  inflating: final_data/en-or/train.or  
  inflating: final_data/en-pa/train.en  
  inflating: final_data/en-pa/train.pa  
  inflating: final_data/en-ta/train.en  
  inflating: final_data/en-ta/train.ta  
  inflating: final_data/en-te/train.en  
  inflating: final_data/en-te/train.te  


In [34]:
english_file = '/content/final_data/en-mr/train.en'
marathi_file = '/content/final_data/en-mr/train.mr'

In [35]:
START_TOKEN = '<START>'
PADDING_TOKEN = '<PADDING>'
END_TOKEN = '<END>'

In [36]:
# Character-level vocabularies. An entry's position in the list is its token
# index, so reordering, adding or removing entries changes the model's
# embedding and output sizes.
marathi_vocab = [
    START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?',

    # Devanagari independent vowels and consonants.
    # NOTE(review): the last two entries on the next line ('अं', 'अः') are two
    # code points each. Sentences are tokenized one code point at a time, so
    # these entries can never match a token; they only occupy index slots.
    'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'अं', 'अः',
    'क', 'ख', 'ग', 'घ', 'ङ',
    'च', 'छ', 'ज', 'झ', 'ञ',
    'ट', 'ठ', 'ड', 'ढ', 'ण',
    'त', 'थ', 'द', 'ध', 'न',
    'प', 'फ', 'ब', 'भ', 'म',
    'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह',

    # Dependent vowel signs (matras), virama, and nasalisation/visarga marks
    'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'ॄ', 'े', 'ै', 'ो', 'ौ', '्', 'ँ', 'ं', 'ः',

    # Special tokens
    PADDING_TOKEN, END_TOKEN
]

# English side: lowercase ASCII letters, digits and punctuation (the corpus is
# lowercased when it is loaded, so no uppercase entries are needed).
english_vocabulary = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
                        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                        ':', '<', '=', '>', '?', '@',
                        '[', '\\', ']', '^', '_', '`',
                        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                        'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
                        'y', 'z',
                        '{', '|', '}', '~', PADDING_TOKEN, END_TOKEN]

In [37]:
# Two-way lookup tables between tokens and their positions in each vocabulary.
index_to_marathi = dict(enumerate(marathi_vocab))
marathi_to_index = {token: position for position, token in enumerate(marathi_vocab)}
index_to_english = dict(enumerate(english_vocabulary))
english_to_index = {token: position for position, token in enumerate(english_vocabulary)}

In [38]:
# Limit Number of sentences
TOTAL_SENTENCES = 200000

# Read only the first TOTAL_SENTENCES lines of each file instead of loading
# the whole corpus and slicing it afterwards. zip() stops as soon as the range
# is exhausted, and simply yields fewer lines if a file is shorter.
# The encoding is explicit so the Devanagari text decodes the same way on
# every platform.
with open(english_file, 'r', encoding='utf-8') as file:
    english_sentences = [line for _, line in zip(range(TOTAL_SENTENCES), file)]
with open(marathi_file, 'r', encoding='utf-8') as file:
    marathi_sentences = [line for _, line in zip(range(TOTAL_SENTENCES), file)]

# The English side is lowercased because its vocabulary has no uppercase letters.
english_sentences = [sentence.rstrip('\n').lower() for sentence in english_sentences]
marathi_sentences = [sentence.rstrip('\n') for sentence in marathi_sentences]

In [39]:
english_sentences[:10]

['next few months are really crucial for us.',
 "sharad pawar, supriya sule, ajit pawar, praful patel, chhagan bhujbal and nawab malik from the ncp and ahmed patel, mallikarjun kharge, balasaheb thorat, prithviraj chavan from the congress were present in the meeting at the ncp chief's residence in delhi.",
 '"kanwal singh chauhan, the president of sonepat progressive farmers club, said, ""farmers are being misled."',
 'his demise is a huge loss for india.',
 'photographer and raconteur.',
 'he has won several competitions at the state and national level.',
 'this is a worrying increase.',
 'most of the victims were dalits.',
 'this has shocked the congress.',
 'this was not the idea.']

In [40]:
marathi_sentences[:10]

['पुढील तीन महिने आमच्यासाठी खूप महत्त्वपूर्ण आहेत.',
 'या बैठकील पक्षाध्यक्ष शरद पवार, केंद्रीय मंत्री प्रफुल्ल पटेल, उपमुख्यमंत्री अजित पवार, छगन भुजबळ, मधुकरराव पिचड, आर. आर. पाटील, सुनिल तटकरे, भास्कर जाधव आणि जितेंद्र आव्हाड हे नेते उपस्थित होते.',
 'वृत्तसंस्थेने दिलेल्या माहितीनुसार, विरोध करण्याऱ्या शेतकऱ्यांची दिशाभूल केली जात असल्याचा आरोप प्रगतीशील शेतकरी संघटना, सेनीपतचे अध्यक्ष कंवलसिंग चौहान यांनी केला.',
 'त्यांच्या मृत्यूमुळे भारताचे खूप मोठे नुकसान झाले.',
 'छायाचित्रकार आणि छायाचित्रकार.',
 'राज्य व राष्ट्रीय पातळीवरील अनेक मल्ल त्यांनी मेहनतीने घडविले.',
 'ही वाढ चिंताजनक तर आहेच.',
 'त्यातील बहुतांशी घटनांत पीडित व्यक्ती या दलित समाजाच्या होत्या.',
 'त्यामुळे काँग्रेसला मोठा धक्का बसला.',
 'हा कल्पनाविलास नव्हता.']

In [41]:
import numpy as np
PERCENTILE = 97
print( f"{PERCENTILE}th percentile length Marathi: {np.percentile([len(x) for x in marathi_sentences], PERCENTILE)}" )
print( f"{PERCENTILE}th percentile length English: {np.percentile([len(x) for x in english_sentences], PERCENTILE)}" )

97th percentile length Marathi: 183.0
97th percentile length English: 193.0


In [42]:
max_sequence_length = 200

def is_valid_tokens(sentence, vocab):
    """Return True when every distinct item of ``sentence`` is contained in ``vocab``."""
    return all(token in vocab for token in set(sentence))

def is_valid_length(sentence, max_sequence_length):
    """Return True when ``sentence`` has fewer than ``max_sequence_length - 1`` items.

    The margin leaves room for the special tokens added during tokenization.
    """
    item_count = sum(1 for _ in sentence)
    limit = max_sequence_length - 1
    return item_count < limit

# Sets make the per-character membership test constant-time.
marathi_charset = set(marathi_vocab)
english_charset = set(english_vocabulary)

# Keep a sentence pair only when BOTH sides fit the length budget and BOTH
# sides use in-vocabulary characters. The English check matters because the
# tokenizer looks characters up with a plain dict index, so a single unknown
# English character would raise KeyError in the middle of training.
valid_sentence_indicies = []
for index in range(len(marathi_sentences)):
    marathi_sentence, english_sentence = marathi_sentences[index], english_sentences[index]
    if is_valid_length(marathi_sentence, max_sequence_length) \
      and is_valid_length(english_sentence, max_sequence_length) \
      and is_valid_tokens(marathi_sentence, marathi_charset) \
      and is_valid_tokens(english_sentence, english_charset):
        valid_sentence_indicies.append(index)

print(f"Number of sentences: {len(marathi_sentences)}")
print(f"Number of valid sentences: {len(valid_sentence_indicies)}")

Number of sentences: 200000
Number of valid sentences: 116105


In [43]:
marathi_sentences = [marathi_sentences[i] for i in valid_sentence_indicies]
english_sentences = [english_sentences[i] for i in valid_sentence_indicies]

In [44]:
marathi_sentences[:3]

['पुढील तीन महिने आमच्यासाठी खूप महत्त्वपूर्ण आहेत.',
 'छायाचित्रकार आणि छायाचित्रकार.',
 'ही वाढ चिंताजनक तर आहेच.']

In [45]:
d_model = 512
batch_size = 30
ffn_hidden = 2048
num_heads = 8
drop_prob = 0.1
num_layers = 5
max_sequence_length = 200
mr_vocab_size = len(marathi_vocab)

In [46]:
transformer = Transformer(d_model,
                          ffn_hidden,
                          num_heads,
                          drop_prob,
                          num_layers,
                          max_sequence_length,
                          mr_vocab_size,
                          english_to_index,
                          marathi_to_index,
                          START_TOKEN,
                          END_TOKEN,
                          PADDING_TOKEN)

In [47]:
transformer

Transformer(
  (encoder): Encoder(
    (sentence_embedding): SentenceEmbedding(
      (embedding): Embedding(71, 512)
      (position_encoder): PositionalEncoding()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): SequentialEncoder(
      (0): EncoderLayer(
        (attention): MultiHeadAttention(
          (qkv_layer): Linear(in_features=512, out_features=1536, bias=True)
          (linear_layer): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNormalization()
        (dropout1): Dropout(p=0.1, inplace=False)
        (ffn): PositionwiseFeedForward(
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (relu): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (norm2): LayerNormalization()
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): EncoderLayer(
        (attention): MultiHeadAt

In [48]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """Parallel corpus exposed as ``(english, marathi)`` sentence pairs."""

    def __init__(self, english_sentences, marathi_sentences):
        self.english_sentences = english_sentences
        self.marathi_sentences = marathi_sentences

    def __len__(self):
        # The English list defines the dataset size.
        return len(self.english_sentences)

    def __getitem__(self, idx):
        english = self.english_sentences[idx]
        marathi = self.marathi_sentences[idx]
        return english, marathi

In [49]:
dataset = TextDataset(english_sentences, marathi_sentences)

In [50]:
len(dataset)

116105

In [51]:
dataset[5]

('the funds collected are used for social purposes.',
 'त्या निधीचा वापर सामाजिक कार्यासाठी करतो.')

In [52]:
train_loader = DataLoader(dataset, batch_size)
iterator = iter(train_loader)

In [53]:
for batch_num, batch in enumerate(iterator):
  print(batch)
  if batch_num > 3:
    break

[('next few months are really crucial for us.', 'photographer and raconteur.', 'this is a worrying increase.', 'most of the victims were dalits.', 'this was not the idea.', 'the funds collected are used for social purposes.', '"she said, ""i needed to have my counsellor on the sets of the movie with me."', 'dont remember.', 'the court then reserved its judgment.', 'heavy contingents of police have been deployed across the route of the rally.', 'everyones work is being appreciated.', 'both are currently out on bail.', 'the blast also smashed some windows of the building.', 'cant be tolerated.', 'to an extent!', 'the new zealand...', 'this video is going viral on social media.', 'so, where is that heat coming from?', "the indian music influences a person's thought process, his mind and his mind-set.", 'three persons are missing.', 'dengue cases on the rise in city', 'pawar was speaking at a press conference.', 'just heard something.', 'the event is still going on, reports ani.', 'saroj a

In [54]:
from torch import nn

criterian = nn.CrossEntropyLoss(ignore_index=marathi_to_index[PADDING_TOKEN],
                                reduction='none')

# When computing the loss, we are ignoring cases when the label is the padding token
for params in transformer.parameters():
    if params.dim() > 1:
        nn.init.xavier_uniform_(params)

optim = torch.optim.Adam(transformer.parameters(), lr=1e-4)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [55]:
NEG_INFTY = -1e9

def create_masks(eng_batch, mr_batch, max_len=None):
    """Build the additive attention masks for one batch of sentence pairs.

    A position counts as padding when its index is greater than the
    sentence's character length (index ``length`` itself stays visible).
    Masked entries hold ``NEG_INFTY`` and visible entries hold 0.

    Args:
        eng_batch: sequence of source sentences.
        mr_batch: sequence of target sentences, at least as long as
            ``eng_batch``.
        max_len: padded sequence length; defaults to the notebook-level
            ``max_sequence_length``.

    Returns:
        ``(encoder_self_attention_mask, decoder_self_attention_mask,
        decoder_cross_attention_mask)``, each of shape
        ``(len(eng_batch), max_len, max_len)`` and indexed as
        ``[sentence, query position, key position]``. The decoder
        self-attention mask additionally hides future positions.
    """
    if max_len is None:
        max_len = max_sequence_length
    num_sentences = len(eng_batch)

    eng_lengths = torch.tensor([len(eng_batch[idx]) for idx in range(num_sentences)], dtype=torch.long)
    mr_lengths = torch.tensor([len(mr_batch[idx]) for idx in range(num_sentences)], dtype=torch.long)

    # (num_sentences, max_len) flags, True where the position is padding.
    positions = torch.arange(max_len).unsqueeze(0)
    eng_is_padding = positions > eng_lengths.unsqueeze(1)
    mr_is_padding = positions > mr_lengths.unsqueeze(1)

    # unsqueeze(1) spreads the flags over key positions (columns),
    # unsqueeze(2) over query positions (rows); broadcasting replaces the
    # per-sentence Python loop.
    encoder_padding_mask = eng_is_padding.unsqueeze(1) | eng_is_padding.unsqueeze(2)
    decoder_padding_mask_self_attention = mr_is_padding.unsqueeze(1) | mr_is_padding.unsqueeze(2)
    decoder_padding_mask_cross_attention = eng_is_padding.unsqueeze(1) | mr_is_padding.unsqueeze(2)

    # Strict upper triangle: a query may not look at later positions.
    look_ahead_mask = torch.triu(torch.full([max_len, max_len], True), diagonal=1)

    encoder_self_attention_mask = torch.where(encoder_padding_mask, NEG_INFTY, 0)
    decoder_self_attention_mask = torch.where(look_ahead_mask | decoder_padding_mask_self_attention, NEG_INFTY, 0)
    decoder_cross_attention_mask = torch.where(decoder_padding_mask_cross_attention, NEG_INFTY, 0)
    return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask

In [56]:
transformer.train()
transformer.to(device)
total_loss = 0
num_epochs = 10

# Fixed probe sentence that is greedily translated every 100 batches.
EVAL_SENTENCE = "the funding will be utilised for product development and technology"

# START never appears in the labels and PADDING is ignored by the loss, so
# neither is a meaningful prediction. If greedy decoding picked one, its
# literal text (e.g. "<PADDING>") would be appended to the running sentence
# and re-tokenized character by character on the next step, which raises
# KeyError for characters outside the Marathi vocabulary.
blocked_token_indices = [marathi_to_index[START_TOKEN], marathi_to_index[PADDING_TOKEN]]

for epoch in range(num_epochs):
  print(f"Epoch Number {epoch}")
  iterator = iter(train_loader)
  for batch_num, batch in enumerate(iterator):
    # The periodic evaluation below switches to eval mode, so re-enter
    # training mode at the start of every batch.
    transformer.train()
    eng_batch, mr_batch = batch
    encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_batch, mr_batch)
    optim.zero_grad()
    mr_predictions =  transformer(eng_batch,
                                  mr_batch,
                                  encoder_self_attention_mask.to(device),
                                  decoder_self_attention_mask.to(device),
                                  decoder_cross_attention_mask.to(device),
                                  enc_start_token=False,
                                  enc_end_token = False,
                                  dec_start_token=True,
                                  dec_end_token = True)
    # Labels are the target sentences without START, i.e. the decoder input
    # shifted left by one position.
    labels = transformer.decoder.sentence_embedding.batch_tokenize(mr_batch, start_token=False, end_token=True)
    loss = criterian(mr_predictions.view(-1, mr_vocab_size).to(device), labels.view(-1).to(device)).to(device)
    # Average the per-token loss over the non-padding positions only.
    valid_indices = torch.where(labels.view(-1) == marathi_to_index[PADDING_TOKEN], False, True)
    loss = loss.sum() / valid_indices.sum()
    loss.backward()
    optim.step()

    if batch_num % 100 == 0:
      print(f"Iteration {batch_num} : {loss.item()}")
      print(f"English: {eng_batch[0]}")
      print(f"Marathi: {mr_batch[0]}")
      mr_sentence_predicted = torch.argmax(mr_predictions[0], axis=-1)
      predicted_sentence = ""
      for idx in mr_sentence_predicted:
        if idx == marathi_to_index[END_TOKEN]:
          break
        predicted_sentence += index_to_marathi[idx.item()]
      print(f"Predicted Marathi: {predicted_sentence}")

      transformer.eval()
      mr_sentence = ("",)
      eng_sentence = (EVAL_SENTENCE,)
      # Inference only: skip autograd bookkeeping for the many forward passes.
      with torch.no_grad():
        for word_counter in range(max_sequence_length):
          encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask= create_masks(eng_sentence, mr_sentence)
          predictions = transformer(eng_sentence,
                                    mr_sentence,
                                    encoder_self_attention_mask.to(device),
                                    decoder_self_attention_mask.to(device),
                                    decoder_cross_attention_mask.to(device),
                                    enc_start_token=False,
                                    enc_end_token=False,
                                    dec_start_token=True,
                                    dec_end_token=False)
          next_token_scores = predictions[0][word_counter].clone() # raw logits, not probabilities
          next_token_scores[blocked_token_indices] = float('-inf')
          next_token_index = torch.argmax(next_token_scores).item()
          next_token = index_to_marathi[next_token_index]
          mr_sentence = (mr_sentence[0] + next_token, )
          if next_token == END_TOKEN:
            break

      print(f"Evaluation translation ({EVAL_SENTENCE}) : {mr_sentence}")
      print("-------------------------------------------")

Epoch Number 0
Iteration 0 : 5.482481956481934
English: next few months are really crucial for us.
Marathi: पुढील तीन महिने आमच्यासाठी खूप महत्त्वपूर्ण आहेत.
Predicted Marathi: म"म)ठमथमम"मम"म
Evaluation translation (the funding will be utilised for product development and technology) : ('ाााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााााा',)
-------------------------------------------
Iteration 100 : 3.466341495513916
English: i am very happy to have won this prize.
Marathi: हा पुरस्कार जाहीर झाल्याचा मला खूप आनंद झाला.
Predicted Marathi:           ाा  ाा    ा  ा   ा ा       ा ाा  ा                ा  ा   ा ा ा  ा   ाा   ा    ाा ा  ा  ा  ा   ा्         ा ा    ा ाा   ा ा ा       ा ा       ा ा ा         ा     ााा                 ाा   ा   
Evaluation translation (the funding will be utilised for product development and technology) : ('        

In [57]:
transformer.eval()

Transformer(
  (encoder): Encoder(
    (sentence_embedding): SentenceEmbedding(
      (embedding): Embedding(71, 512)
      (position_encoder): PositionalEncoding()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): SequentialEncoder(
      (0): EncoderLayer(
        (attention): MultiHeadAttention(
          (qkv_layer): Linear(in_features=512, out_features=1536, bias=True)
          (linear_layer): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNormalization()
        (dropout1): Dropout(p=0.1, inplace=False)
        (ffn): PositionwiseFeedForward(
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (relu): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (norm2): LayerNormalization()
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): EncoderLayer(
        (attention): MultiHeadAt

In [60]:
transformer.eval()
def translate(eng_sentence, translation_model=None):
  """Greedily translate one English sentence, one character per step.

  Args:
    eng_sentence: source sentence; every character must be in the English
      vocabulary.
    translation_model: model to decode with; defaults to the notebook-level
      ``transformer``.

  Returns:
    The generated Marathi string. When the model emits END_TOKEN, its
    literal text is the final part of the returned string.
  """
  if translation_model is None:
    translation_model = transformer
  # START and PADDING are not valid outputs. If one were picked, its literal
  # text would be appended and re-tokenized per character on the next step,
  # raising KeyError for characters outside the Marathi vocabulary.
  blocked_token_indices = [marathi_to_index[START_TOKEN], marathi_to_index[PADDING_TOKEN]]
  eng_sentence = (eng_sentence,)
  mr_sentence = ("",)
  # Inference only: no autograd graph is needed.
  with torch.no_grad():
    for word_counter in range(max_sequence_length):
      encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask= create_masks(eng_sentence, mr_sentence)
      predictions = translation_model(eng_sentence,
                                      mr_sentence,
                                      encoder_self_attention_mask.to(device),
                                      decoder_self_attention_mask.to(device),
                                      decoder_cross_attention_mask.to(device),
                                      enc_start_token=False,
                                      enc_end_token=False,
                                      dec_start_token=True,
                                      dec_end_token=False)
      # Logits for the position right after the characters generated so far.
      next_token_scores = predictions[0][word_counter].clone()
      next_token_scores[blocked_token_indices] = float('-inf')
      next_token_index = torch.argmax(next_token_scores).item()
      next_token = index_to_marathi[next_token_index]
      mr_sentence = (mr_sentence[0] + next_token, )
      if next_token == END_TOKEN:
        break
  return mr_sentence[0]

In [61]:
translation = translate("hii how are you?")
translation

'त्याची काय आहे?<END>'

In [62]:
translation = translate("how is this the truth?")
print(translation)

त्याची कशी कशी आहे?<END>


In [63]:
translation = translate("my name is ajay")
print(translation)

माझ्या पत्नीची भूमिका<END>


In [67]:
# NOTE(review): saving the module object pickles the whole model, so loading
# it back needs the class definitions from this notebook to be in scope.
# Saving transformer.state_dict() instead would be the more portable option.
torch.save(transformer, 'transformer.pt')

In [68]:
translation = translate("i cannot stand this smell")
print(translation)

मी काय करत नाही त्यांच्यावर काही करत नाही.<END>


In [69]:
translation = translate("i am here")
print(translation)

मी माझ्या मागे आहे.<END>


In [70]:
translation = translate("click this")
print(translation)

त्यांनी काय केले आहे.<END>


In [71]:
translation = translate("i am well.")
print(translation)

मी तुमच्या मागे आहे.<END>


In [73]:
# NOTE(review): weights_only=False performs a full unpickle, which can run
# arbitrary code; only load checkpoint files you created yourself.
model = torch.load('transformer.pt', weights_only=False)

In [74]:
model

Transformer(
  (encoder): Encoder(
    (sentence_embedding): SentenceEmbedding(
      (embedding): Embedding(71, 512)
      (position_encoder): PositionalEncoding()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): SequentialEncoder(
      (0): EncoderLayer(
        (attention): MultiHeadAttention(
          (qkv_layer): Linear(in_features=512, out_features=1536, bias=True)
          (linear_layer): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNormalization()
        (dropout1): Dropout(p=0.1, inplace=False)
        (ffn): PositionwiseFeedForward(
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (relu): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (norm2): LayerNormalization()
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): EncoderLayer(
        (attention): MultiHeadAt

In [75]:
model.eval()
def translate(eng_sentence, translation_model=None):
  """Greedily translate one English sentence, one character per step.

  Args:
    eng_sentence: source sentence; every character must be in the English
      vocabulary.
    translation_model: model to decode with; defaults to the notebook-level
      ``model`` loaded from the checkpoint.

  Returns:
    The generated Marathi string. When the model emits END_TOKEN, its
    literal text is the final part of the returned string.
  """
  if translation_model is None:
    translation_model = model
  # START and PADDING are not valid outputs. If one were picked, its literal
  # text would be appended and re-tokenized per character on the next step,
  # raising KeyError for characters outside the Marathi vocabulary.
  blocked_token_indices = [marathi_to_index[START_TOKEN], marathi_to_index[PADDING_TOKEN]]
  eng_sentence = (eng_sentence,)
  mr_sentence = ("",)
  # Inference only: no autograd graph is needed.
  with torch.no_grad():
    for word_counter in range(max_sequence_length):
      encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask= create_masks(eng_sentence, mr_sentence)
      predictions = translation_model(eng_sentence,
                                      mr_sentence,
                                      encoder_self_attention_mask.to(device),
                                      decoder_self_attention_mask.to(device),
                                      decoder_cross_attention_mask.to(device),
                                      enc_start_token=False,
                                      enc_end_token=False,
                                      dec_start_token=True,
                                      dec_end_token=False)
      # Logits for the position right after the characters generated so far.
      next_token_scores = predictions[0][word_counter].clone()
      next_token_scores[blocked_token_indices] = float('-inf')
      next_token_index = torch.argmax(next_token_scores).item()
      next_token = index_to_marathi[next_token_index]
      mr_sentence = (mr_sentence[0] + next_token, )
      if next_token == END_TOKEN:
        break
  return mr_sentence[0]

In [76]:
translation = translate("i am well.")
print(translation)

मी तुमच्या मागे आहे.<END>


In [77]:
translation = translate("i cannot stand this smell")
print(translation)

मी काय करत नाही त्यांच्यावर काही करत नाही.<END>
