# Loading data 

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd drive/MyDrive

/content/drive/MyDrive


In [3]:
import pandas as pd
data=pd.read_csv("data.csv",encoding="utf-8")

In [4]:
print(len(data))
data.head(30)

75182


Unnamed: 0,grapheme,phoneme
0,ره,rahmatollAh'alayh
1,ره,rahmatollAh
2,ص,salavAtollAh'alayh
3,ع,'alayhessalAm
4,عج,'ajalallAhota'AlA
5,ق.ا.,qanune'asAsi
6,ق.م.,qanunemadani
7,ق.م.,qablazmilAd
8,ق.م,qablazmilAd
9,ق‌م,qablazmilAd


## Deleting 0 to 19 rows:
because they are abbreviations in persian

In [5]:
data=data.drop(data.index[:20])
print(len(data))
data.head(30)

75162


Unnamed: 0,grapheme,phoneme
20,اب,'ab
21,ابا,'ebA
22,ابا,'abA
23,اباء,ebA'
24,اباالفضل‌العباس,abalfazle'abbAs
25,ابابکر,'abAbakr
26,ابابیل,'abAbil
27,اباجعفر,'abAja'far
28,اباحتی,'ebAhati
29,اباحه,'ebAhe


## Splitting data to train,validation,test

In [6]:
graphemes=list(data["grapheme"])
phonemes=list(data["phoneme"])
len(graphemes),len(phonemes)

(75162, 75162)

In [7]:
all_data=[]
for i in range(len(graphemes)):
  all_data.append((graphemes[i],phonemes[i]))

In [8]:
len(all_data),all_data[70000]

(75162, ('نگاشتن', 'negAStan'))

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
train_iter,test_iter=train_test_split(all_data,test_size=0.2,random_state=42,shuffle=True)

In [11]:
len(train_iter),len(test_iter)

(60129, 15033)

# Tokenizer:
Since both graphemes and phonemes have no space character we just list the string for tokenizer

In [14]:
def my_tokenizer(word : str):
  return list(word)

# Building vocab and token transform

In [15]:
from torchtext.vocab import build_vocab_from_iterator
from typing import Iterable, List

token_transform = {}
vocab_transform = {}

SRC_LANGUAGE = 'grapheme'
TGT_LANGUAGE = 'phoneme'

token_transform[SRC_LANGUAGE] = my_tokenizer
token_transform[TGT_LANGUAGE] = my_tokenizer

# yield token for build_vocab_from_iterator function
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}

    for data_sample in data_iter:
        yield token_transform[language](data_sample[language_index[language]])

#special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    # Training data Iterator
    all_iter=all_data
    # Create torchtext's Vocab object
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(all_data, ln),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)

# Creating Transformer Model

In [19]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## PositionalEncoding

In [20]:
# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])


## TokenEmbedding

In [21]:
# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)


## Seq2Seq Transformer model

In [22]:
# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

# Masking mechanism
During training, we need a subsequent character mask that will prevent model to look into the future characters when making predictions. We will also need masks to hide source and target padding tokens. Below, let’s define a function that will take care of both.

Reference : https://pytorch.org/tutorials/beginner/translation_transformer.html

In [23]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

# Building Model

In [25]:
torch.manual_seed(42)

# Hyperparameters
SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

# Instantiating Model
g2p_model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

# weights and biases initialization
for p in g2p_model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

g2p_model = g2p_model.to(DEVICE)

# CrossEntropy Loss function
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# Adam optimizer
optimizer = torch.optim.Adam(g2p_model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

# Collation
our data iterator yields a pair of raw strings. We need to convert these string pairs into the batched tensors that can be processed by our Seq2Seq network defined previously. Below we define our collate function that convert batch of raw strings into batch tensors that can be fed directly into our model.

Reference : https://pytorch.org/tutorials/beginner/translation_transformer.html

In [29]:
from torch.nn.utils.rnn import pad_sequence

# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

# src and tgt language text transforms to convert raw strings into tensors indices
text_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
                                               vocab_transform[ln], #Numericalization
                                               tensor_transform) # Add BOS/EOS and create tensor


# function to collate data samples into batch tesors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")))
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

# Training

In [30]:
from torch.utils.data import DataLoader

## Training function

In [31]:
def train_function(model, optimizer):
    model.train()
    losses = 0
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in train_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / len(train_dataloader)

## Evaluating function

In [32]:
def evaluate_function(model):
    model.eval()
    losses = 0

    val_iter=test_iter
    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()

    return losses / len(val_dataloader)

## Train model for N epochs

In [35]:
from timeit import default_timer as timer
NUM_EPOCHS = 7

stats_for_plotting={"train_losses":[],"test_losses":[]}
for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_function(g2p_model, optimizer)
    end_time = timer()
    val_loss = evaluate_function(g2p_model)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))
    stats_for_plotting["train_losses"].append(train_loss)
    stats_for_plotting["test_losses"].append(val_loss)


Epoch: 1, Train loss: 1.037, Val loss: 0.383, Epoch time = 35.727s
Epoch: 2, Train loss: 0.406, Val loss: 0.282, Epoch time = 33.293s
Epoch: 3, Train loss: 0.316, Val loss: 0.243, Epoch time = 33.062s
Epoch: 4, Train loss: 0.273, Val loss: 0.220, Epoch time = 32.980s
Epoch: 5, Train loss: 0.245, Val loss: 0.202, Epoch time = 33.119s
Epoch: 6, Train loss: 0.225, Val loss: 0.191, Epoch time = 32.974s
Epoch: 7, Train loss: 0.209, Val loss: 0.184, Epoch time = 33.032s


In [47]:
for epoch in range(8, NUM_EPOCHS+8):
    start_time = timer()
    train_loss = train_function(g2p_model, optimizer)
    end_time = timer()
    val_loss = evaluate_function(g2p_model)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))
    stats_for_plotting["train_losses"].append(train_loss)
    stats_for_plotting["test_losses"].append(val_loss)
    torch.save(g2p_model.state_dict(),f"./G2P/epoch_{epoch}.pth")

Epoch: 8, Train loss: 0.195, Val loss: 0.176, Epoch time = 33.421s
Epoch: 9, Train loss: 0.184, Val loss: 0.168, Epoch time = 33.136s
Epoch: 10, Train loss: 0.174, Val loss: 0.161, Epoch time = 32.910s
Epoch: 11, Train loss: 0.165, Val loss: 0.155, Epoch time = 33.200s
Epoch: 12, Train loss: 0.157, Val loss: 0.153, Epoch time = 32.937s
Epoch: 13, Train loss: 0.150, Val loss: 0.150, Epoch time = 33.033s
Epoch: 14, Train loss: 0.142, Val loss: 0.146, Epoch time = 33.139s


In [49]:
for epoch in range(15, NUM_EPOCHS+25):
    start_time = timer()
    train_loss = train_function(g2p_model, optimizer)
    end_time = timer()
    val_loss = evaluate_function(g2p_model)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))
    stats_for_plotting["train_losses"].append(train_loss)
    stats_for_plotting["test_losses"].append(val_loss)
    torch.save(g2p_model.state_dict(),f"./G2P/epoch_{epoch}.pth")

Epoch: 15, Train loss: 0.138, Val loss: 0.144, Epoch time = 33.162s
Epoch: 16, Train loss: 0.132, Val loss: 0.142, Epoch time = 33.229s
Epoch: 17, Train loss: 0.127, Val loss: 0.139, Epoch time = 32.894s
Epoch: 18, Train loss: 0.123, Val loss: 0.139, Epoch time = 33.078s
Epoch: 19, Train loss: 0.118, Val loss: 0.138, Epoch time = 32.969s
Epoch: 20, Train loss: 0.115, Val loss: 0.138, Epoch time = 33.005s
Epoch: 21, Train loss: 0.110, Val loss: 0.136, Epoch time = 33.046s
Epoch: 22, Train loss: 0.107, Val loss: 0.137, Epoch time = 33.071s
Epoch: 23, Train loss: 0.103, Val loss: 0.137, Epoch time = 32.967s
Epoch: 24, Train loss: 0.100, Val loss: 0.139, Epoch time = 32.964s
Epoch: 25, Train loss: 0.097, Val loss: 0.139, Epoch time = 32.965s
Epoch: 26, Train loss: 0.094, Val loss: 0.138, Epoch time = 32.968s
Epoch: 27, Train loss: 0.092, Val loss: 0.134, Epoch time = 32.990s
Epoch: 28, Train loss: 0.089, Val loss: 0.136, Epoch time = 33.099s
Epoch: 29, Train loss: 0.087, Val loss: 0.136, E

In [50]:
stats_for_plotting

{'train_losses': [1.0367890465766825,
  0.4061702524727963,
  0.31627280591650214,
  0.2732579821918873,
  0.2451669233276489,
  0.22498793123250313,
  0.2087269459316071,
  0.1949072889825131,
  0.1836262784105666,
  0.17351482285464065,
  0.1647067409246526,
  0.15725765250464704,
  0.15038710295202884,
  0.14218230090559797,
  0.1376039984536932,
  0.13193702467895568,
  0.12733872214213332,
  0.12256287157218507,
  0.11806193495050389,
  0.11464042195931394,
  0.11045886295589995,
  0.10683923786941996,
  0.10347719591190206,
  0.10037906991040453,
  0.0967609409797699,
  0.09378337535927904,
  0.09201003277396902,
  0.08884370896093388,
  0.08716795068789036,
  0.08473875898313014,
  0.0826206015304048],
 'test_losses': [0.38281249747437945,
  0.2818042916261544,
  0.24295662469783072,
  0.2201470841290587,
  0.2019746884451074,
  0.1910256447933488,
  0.18391394589917134,
  0.17639153374958846,
  0.1677941234182503,
  0.16115214940855058,
  0.15461906563427488,
  0.15309493553082

In [36]:
torch.save(g2p_model.state_dict(),"./G2P/epoch_7.pth")

In [44]:
from tqdm import tqdm
trues=0
all_preds=0
for gs,ps in tqdm(test_iter):
  gt=ps
  myoutput=inference(g2p_model,gs)
  if myoutput.replace(" ","")==gt:
    trues+=1
  all_preds+=1
print(1.0*trues/all_preds)

100%|██████████| 15033/15033 [09:55<00:00, 25.23it/s]

0.5769972726667997





# Inference

## decoder function
it is called greedy decoder because we don't do any beam search for results

In [33]:
# function to generate output sequence
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


## Inference function
gives a grapheme and outputs the phoneme

In [34]:
def inference(model: torch.nn.Module, src_sentence: str):
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

## Some example: