# Loading data 

In [1]:
# Mount Google Drive at /content/drive so its files are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd drive/MyDrive

/content/drive/MyDrive


In [3]:
import pandas as pd
# Read the grapheme/phoneme table from data.csv in the current working directory.
data=pd.read_csv("data.csv",encoding="utf-8")

In [4]:
# Show the number of rows, then preview the first 30 rows.
print(len(data))
data.head(30)

75182


Unnamed: 0,grapheme,phoneme
0,ره,rahmatollAh'alayh
1,ره,rahmatollAh
2,ص,salavAtollAh'alayh
3,ع,'alayhessalAm
4,عج,'ajalallAhota'AlA
5,ق.ا.,qanune'asAsi
6,ق.م.,qanunemadani
7,ق.م.,qablazmilAd
8,ق.م,qablazmilAd
9,ق‌م,qablazmilAd


## Deleting rows 0 to 19
These rows are removed because they are abbreviations in Persian.

In [5]:
# Remove the rows labelled 0-19 (the Persian abbreviations).
# Dropping by label with errors="ignore" keeps this cell idempotent: re-running
# it is a no-op, whereas the positional `data.index[:20]` would remove a further
# 20 rows on every run. This relies on the default integer index that read_csv
# assigns when no index column is given.
data = data.drop(index=range(20), errors="ignore")
print(len(data))
data.head(30)

75162


Unnamed: 0,grapheme,phoneme
20,اب,'ab
21,ابا,'ebA
22,ابا,'abA
23,اباء,ebA'
24,اباالفضل‌العباس,abalfazle'abbAs
25,ابابکر,'abAbakr
26,ابابیل,'abAbil
27,اباجعفر,'abAja'far
28,اباحتی,'ebAhati
29,اباحه,'ebAhe


## Splitting data into train, validation, and test sets

In [6]:
# Pull the two columns out of the DataFrame as plain Python lists.
graphemes = data["grapheme"].tolist()
phonemes = data["phoneme"].tolist()
len(graphemes), len(phonemes)

(75162, 75162)

In [7]:
# Pair every grapheme with its phoneme as a (grapheme, phoneme) tuple.
all_data = list(zip(graphemes, phonemes))

In [8]:
# Sanity check: the number of pairs and one sample pair.
len(all_data),all_data[70000]

(75162, ('نگاشتن', 'negAStan'))

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 20% of the pairs as the test set; the fixed seed makes the shuffle repeatable.
train_iter, test_iter = train_test_split(
    all_data,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [11]:
# Sizes of the train and test splits.
len(train_iter),len(test_iter)

(60129, 15033)

# Tokenizer
Since neither the graphemes nor the phonemes contain space characters, the tokenizer simply splits each string into a list of its characters.

In [14]:
def my_tokenizer(word : str):
    """Split a word into its individual characters.

    Args:
        word: The grapheme or phoneme string to tokenize.

    Returns:
        A list holding one single-character string per character of ``word``,
        in order.
    """
    return [char for char in word]

# Building vocab and token transform

In [15]:
from torchtext.vocab import build_vocab_from_iterator
from typing import Iterable, List

SRC_LANGUAGE = 'grapheme'
TGT_LANGUAGE = 'phoneme'

# Both sides are tokenized character by character with the same tokenizer.
token_transform = {
    SRC_LANGUAGE: my_tokenizer,
    TGT_LANGUAGE: my_tokenizer,
}
# Will hold one vocabulary per language.
vocab_transform = {}

# Yield token lists for the build_vocab_from_iterator function.
def yield_tokens(data_iter: Iterable, language: str) -> Iterable[List[str]]:
    """Yield the token list for one side of every (grapheme, phoneme) pair.

    Args:
        data_iter: Iterable of samples indexable as ``sample[0]`` (source side)
            and ``sample[1]`` (target side).
        language: Either ``SRC_LANGUAGE`` or ``TGT_LANGUAGE``; selects which
            side of each sample is tokenized.

    Yields:
        The list of tokens produced by ``token_transform[language]`` for the
        selected side of each sample.

    Raises:
        KeyError: If ``language`` is not one of the two known languages.
    """
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}
    # Both lookups are loop-invariant, so resolve them once up front.
    column = language_index[language]
    tokenize = token_transform[language]

    for data_sample in data_iter:
        yield tokenize(data_sample[column])

# Special symbols; each symbol's position in this list is its index constant.
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = range(len(special_symbols))

# The vocabularies are built from the complete dataset (all_data), not only
# from the training split.
all_iter = all_data
for ln in (SRC_LANGUAGE, TGT_LANGUAGE):
    # Create torchtext's Vocab object for this language, with the special
    # symbols placed first.
    vocab_transform[ln] = build_vocab_from_iterator(
        yield_tokens(all_iter, ln),
        min_freq=1,
        specials=special_symbols,
        special_first=True,
    )

# Creating Transformer Model

In [19]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## PositionalEncoding

In [20]:
# Helper module that adds positional encoding to the token embedding to introduce a notion of token order.
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to token embeddings, then apply dropout.

    Even embedding columns hold sines and odd columns hold cosines of the
    position scaled by a geometric range of frequencies.

    Args:
        emb_size: Size of the embedding dimension. Odd sizes are supported.
        dropout: Dropout probability applied to the summed embedding.
        maxlen: Number of positions for which encodings are precomputed.
    """

    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super().__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        # With an odd emb_size there is one fewer odd column than even columns,
        # so trim the frequencies to the number of odd columns. For an even
        # emb_size the slice keeps every frequency.
        pos_embedding[:, 1::2] = torch.cos(pos * den[: emb_size // 2])
        # Shape (maxlen, 1, emb_size): the middle axis broadcasts in forward().
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        # A buffer moves with the module across devices but is not a trained parameter.
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        """Return ``dropout(token_embedding + positional encoding)``.

        Args:
            token_embedding: Embedded tokens whose first dimension indexes the
                position in the sequence (presumably shaped
                ``(seq_len, batch, emb_size)`` - confirm against the caller).
        """
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])


## TokenEmbedding

In [21]:
# Helper module that maps a tensor of token indices to the matching tensor of token embeddings.
class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is scaled by ``sqrt(emb_size)``.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_size: Width of each embedding vector.
    """

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        """Look up the embeddings of ``tokens`` and scale them."""
        # Indices are cast to int64 before the lookup.
        indices = tokens.long()
        scale = math.sqrt(self.emb_size)
        embedded = self.embedding(indices)
        return embedded * scale


## Seq2Seq Transformer model

In [22]:
# Seq2Seq network
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder Transformer mapping source token indices to target-vocabulary logits.

    Source and target tokens are embedded with separate ``TokenEmbedding``
    tables, share one ``PositionalEncoding`` module, pass through
    ``torch.nn.Transformer`` and are projected to ``tgt_vocab_size`` scores by
    a linear layer.

    Args:
        num_encoder_layers: Number of encoder layers.
        num_decoder_layers: Number of decoder layers.
        emb_size: Embedding width, used as the Transformer's ``d_model``.
        nhead: Number of attention heads.
        src_vocab_size: Size of the source vocabulary.
        tgt_vocab_size: Size of the target vocabulary.
        dim_feedforward: Hidden width of the feed-forward sub-layers.
        dropout: Dropout probability for the Transformer and positional encoding.
    """

    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super().__init__()
        # Submodules are created in the same order as before so parameter
        # initialization and state_dict ordering are unchanged.
        self.transformer = Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        """Run the full encoder-decoder pass and return target-vocabulary logits.

        No memory mask is applied (``memory_mask`` is ``None``).
        """
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(
            src_emb,
            tgt_emb,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            memory_mask=None,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Embed ``src`` and run only the encoder stack, returning its output."""
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        return self.transformer.encoder(src_emb, mask=src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Embed ``tgt`` and run only the decoder stack over ``memory``.

        The result is the decoder output before the ``generator`` projection.
        """
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(tgt))
        return self.transformer.decoder(tgt_emb, memory, tgt_mask=tgt_mask)

# Masking mechanism
During training, we need a subsequent-character mask that prevents the model from looking at future characters when making predictions. We also need masks to hide the source and target padding tokens. Below, we define a function that takes care of both.

Reference : https://pytorch.org/tutorials/beginner/translation_transformer.html

In [23]:
def generate_square_subsequent_mask(sz, device=None):
    """Build the additive causal mask that hides later positions from earlier ones.

    Args:
        sz: Side length of the square mask.
        device: Device on which to create the mask. When omitted, the
            notebook-wide ``DEVICE`` is used, matching the previous behaviour.

    Returns:
        A float32 tensor of shape ``(sz, sz)`` holding ``0.0`` on and below the
        main diagonal and ``-inf`` strictly above it.
    """
    if device is None:
        device = DEVICE
    # Fill with -inf and keep only the strictly upper triangle; triu zeroes the rest.
    blocked = torch.full((sz, sz), float('-inf'), dtype=torch.float32, device=device)
    return torch.triu(blocked, diagonal=1)


def create_mask(src, tgt):
    """Create the attention and padding masks for one source/target batch.

    Args:
        src: Source index tensor; its first dimension is taken as the source
            sequence length.
        tgt: Target index tensor; its first dimension is taken as the target
            sequence length.

    Returns:
        A tuple ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)``:
        an all-False boolean ``(S, S)`` source mask, the causal target mask
        from ``generate_square_subsequent_mask``, and two boolean masks that
        are True where the input equals ``PAD_IDX``, with dimensions 0 and 1
        swapped.
    """
    src_len, tgt_len = src.shape[0], tgt.shape[0]

    # Causal mask for the target side; the source side masks nothing.
    tgt_mask = generate_square_subsequent_mask(tgt_len)
    src_mask = torch.zeros((src_len, src_len), dtype=torch.bool, device=DEVICE)

    # Mark padding positions, then swap the first two dimensions.
    src_padding_mask = src.eq(PAD_IDX).transpose(0, 1)
    tgt_padding_mask = tgt.eq(PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

# Building Model