In [1]:
import numpy as np
import pandas as pd
import re
import string
from string import digits
import torch

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE='cpu'

In [2]:
eng_file_path='eng_trans.txt'
fr_file_path='french_trans.txt'

def read_file_to_list(path):
    with open(path,encoding="utf8") as f:
        content_list = f.readlines()
    content_list = [x.strip() for x in content_list]
    return content_list

In [3]:
'''
read CSV files and push to dataframe
'''
raw_eng=read_file_to_list(eng_file_path)
raw_fr=read_file_to_list(fr_file_path)
data=pd.DataFrame({'english':raw_eng,'french':raw_fr})

In [4]:
#preProcess the data
data['english'] = data.english.apply(lambda x: re.sub("'", '',x).lower())
data['french'] = data.french.apply(lambda x: re.sub("'", '', x).lower())
    
#remove special chars
exclude = set(string.punctuation)#set of all special chars
#remove all the special chars
data['english'] = data.english.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
data['french'] = data.french.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
    
remove_digits = str.maketrans('','',digits)
data['english'] = data.english.apply(lambda x: x.translate(remove_digits))
data['french'] = data.french.apply(lambda x: x.translate(remove_digits))

data['french'] = data.french.apply(lambda x: x.translate(remove_digits))


# Remove extra spaces
data['english']=data['english'].apply(lambda x: x.strip())
data['french']=data['french'].apply(lambda x: x.strip())
data['english']=data['english'].apply(lambda x: re.sub(" +", " ", x))
data['french']=data['french'].apply(lambda x: re.sub(" +", " ", x))


In [5]:
val_frac = 0.1 #precentage data in val
val_split_idx = int(len(data)*val_frac) #index on which to split
data_idx = list(range(len(data))) #create a list of ints till len of data
np.random.shuffle(data_idx)
train_idx=data_idx[:val_split_idx]
val_idx=data_idx[val_split_idx:]
print()




In [6]:
train_data=data.iloc[train_idx].reset_index().drop('index',axis=1)
val_data=data.iloc[val_idx].reset_index().drop('index',axis=1)

In [7]:
train_data[:10]

Unnamed: 0,english,french
0,hell be back soon,il sera bientôt de retour
1,he came in through the window,il est entré par la fenêtre
2,we think too much and feel too little,nous pensons trop et ressentons trop peu
3,tom has a meeting tomorrow,tom a une réunion demain
4,by the way have you seen him lately,à ce propos lastu vu dernièrement
5,i thought your house was downtown,je pensais que ta maison se trouvait au centre...
6,i was eighteen then,javais alors ans
7,they met on a blind date,ils se sont rencontrés à une rencontre surprise
8,your wife is here,ta femme est ici
9,i cant believe im talking to you about this,je narrive pas à croire que je sois en train d...


In [8]:
SPECIAL_CHAR={'<UNX>':0,'<SOS>':1,'<EOS>':2,'<PAD>':3}

In [9]:
class Vocab:
    def __init__(self,max_size=10000,min_frequency=0):
        self.max_size=max_size
        self.min_frequency=min_frequency
        self.itos={0:'<UNX>',1:'<SOS>',2:'<EOS>',3:'<PAD>'}
        self.stoi={j:i for i,j in self.itos.items()}
        
    def tokenize(self,sentance):
        return sentance.strip().split(' ')
    
    def build_vocab(self,sentance_list):
        freq={}
        idx=4
        for sentance in sentance_list:
            # print(sentance)
            # print(self.tokenize(sentance))
            for word in self.tokenize(sentance):
                # print(word)
                if word in freq.keys():
                    freq[word]+=1
                else:
                    freq[word]=1
        # print(freq)
        # print("##########")
        freq={k:v for k,v in freq.items() if v>=self.min_frequency }
        # print(freq)
        freq = dict(sorted(freq.items(), key = lambda x: -x[1])[:self.max_size-idx])
        # print(freq)
        for i in freq:
            self.itos[idx]=i
            self.stoi[i]=idx
            idx+=1
            
    def numericalize(self,sentance,use_sos_n_eos=True,):
        tokens=self.tokenize(sentance)
        number=[] 
        if use_sos_n_eos:
            number.append(1)
        for token in tokens:
            if token in self.stoi.keys():
                number.append(self.stoi[token])
            else:
                number.append(0)
        if use_sos_n_eos:
            number.append(2)
        return number
    
    def stringify(self,num_list):
        str_ret=[]
        for i in num_list:
            if i in self.itos.keys():
                str_ret.append(self.itos[i])
        return str_ret

In [10]:
vocab=Vocab(max_size=50000,min_frequency=1)
vocab.build_vocab(["quick brown fox jumped the wine"])
vocab.numericalize("quick brown fox jumped the wine")

[1, 4, 5, 6, 7, 8, 9, 2]

In [11]:
vocab=Vocab(max_size=50000,min_frequency=1)
vocab.build_vocab(train_data.english)
vocab.numericalize(val_data.english[4])

[1, 131, 16, 715, 18, 2]

In [12]:
val_data.english[4]

'let me explain this'

In [13]:
vocab.stringify(vocab.numericalize(val_data.english[4]))

['<SOS>', 'let', 'me', 'explain', 'this', '<EOS>']

In [14]:
from torch.utils.data import Dataset

class CusDataset(Dataset):
    def __init__(self,lang1,lang2,lang1_vocab=None,lang2_vocab=None):
        
        self.lang1_vocab=Vocab(max_size=40000,min_frequency=1) if lang1_vocab is None else lang1_vocab
        self.lang2_vocab=Vocab(max_size=40000,min_frequency=1) if lang2_vocab is None else lang2_vocab
        self.lang1=lang1
        self.lang2=lang2
        if lang1_vocab is not None:
            self.lang1_vocab.build_vocab(lang1)
        if lang2_vocab is not None:
            self.lang2_vocab.build_vocab(lang2)
       
    def __getitem__(self,n):
        return torch.Tensor(self.lang1_vocab.numericalize(self.lang1[n])),torch.Tensor(self.lang2_vocab.numericalize(self.lang2[n]))
    def __len__(self):
        return len(self.lang2)

In [15]:
# lang1_vocab=Vocab(max_size=40000,min_frequency=1) 
# lang2_vocab=Vocab(max_size=40000,min_frequency=1)
# lang1_vocab.build_vocab(train_data.english)
# lang2_vocab.build_vocab(train_data.french)

trainDataset=CusDataset(train_data.english,train_data.french)
valDataset=CusDataset(val_data.english,val_data.french,lang1_vocab=trainDataset.lang1_vocab,lang2_vocab=trainDataset.lang2_vocab)

In [16]:
for i,cur_data in enumerate(trainDataset):
    print(cur_data)
    print(trainDataset)
    if i==5:
        break

(tensor([  1., 767.,  29., 120., 266.,   2.]), tensor([  1.,  13., 314., 423.,   5., 777.,   2.]))
<__main__.CusDataset object at 0x0000023953DD09D0>
(tensor([  1.,  10., 239.,  16., 499.,   7., 460.,   2.]), tensor([1.0000e+00, 1.3000e+01, 1.5000e+01, 1.8940e+03, 8.1000e+01, 1.0000e+01,
        5.3600e+02, 2.0000e+00]))
<__main__.CusDataset object at 0x0000023953DD09D0>
(tensor([  1.,  28.,  48., 102., 107.,  50., 141., 102., 166.,   2.]), tensor([1.0000e+00, 2.0000e+01, 3.4680e+03, 1.0200e+02, 4.8000e+01, 0.0000e+00,
        1.0200e+02, 9.8000e+01, 2.0000e+00]))
<__main__.CusDataset object at 0x0000023953DD09D0>
(tensor([  1.,  11.,  69.,   8., 361., 183.,   2.]), tensor([  1.,  14.,  18.,  23., 426., 175.,   2.]))
<__main__.CusDataset object at 0x0000023953DD09D0>
(tensor([1.0000e+00, 8.2000e+01, 7.0000e+00, 1.3700e+02, 1.9000e+01, 5.0000e+00,
        2.5000e+02, 4.3000e+01, 1.7250e+03, 2.0000e+00]), tensor([1.0000e+00, 8.0000e+00, 1.7000e+01, 5.2600e+02, 9.0100e+02, 1.3700e+02,
   

In [17]:
trainDataset.lang1_vocab.stringify(trainDataset[0][0].to(dtype=torch.int32).numpy())

['<SOS>', 'hell', 'be', 'back', 'soon', '<EOS>']

In [18]:
trainDataset.lang2_vocab.stringify(trainDataset[0][1].to(dtype=torch.int32).numpy())

['<SOS>', 'il', 'sera', 'bientôt', 'de', 'retour', '<EOS>']

In [19]:
train_data.french[0]

'il sera bientôt de retour'

In [20]:
from torch.nn.utils.rnn import pad_sequence

class Collate:
    def __init__(self,pad_idx):
        self.pad_idx=pad_idx
    def __call__(self,batch_data):
#         breakpoint()
        source_data=[item[0] for item in batch_data]
        target_data=[item[1] for item in batch_data]
        source = pad_sequence(source_data, batch_first=False, padding_value = self.pad_idx) 
        target = pad_sequence(target_data, batch_first=False, padding_value = self.pad_idx)
        return source,target
        

In [21]:
#temp=next(iter(train_loader))

In [22]:
# for i,cur_batch in enumerate(val_loader):
#     print(cur_batch[0].size())
#     print(cur_batch[0])
#     print(cur_batch[1].size())
#     break

In [23]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout,
                                        batch_first=False)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

In [50]:
# def masking(src,tgt,pad_idx):
#     src_mask=torch.zeros_like(src)
#     src_mask[src==pad_idx]=-inf
#     tgt_mask=torch.zeros_like(tgt)


def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)
    #breakpoint()
    src_padding_mask = (src == SPECIAL_CHAR['<PAD>']).transpose(0, 1)
    tgt_padding_mask = (tgt == SPECIAL_CHAR['<PAD>']).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [51]:
torch.triu(torch.ones(3,3),diagonal=1)

tensor([[0., 1., 1.],
        [0., 0., 1.],
        [0., 0., 0.]])

In [52]:
model=Seq2SeqTransformer( num_encoder_layers= 3,
                 num_decoder_layers= 3,
                 emb_size= 256,
                 nhead= 4,
                 src_vocab_size= 40000,
                 tgt_vocab_size= 40000,
                 dim_feedforward = 256,
                 dropout = 0.1,
               )

for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

model = model.to(DEVICE)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=SPECIAL_CHAR['<PAD>'])
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

In [53]:
from torch.utils.data import DataLoader, Dataset
batch_size=10
num_workers=0

In [54]:
q=3.44
print(f"test {q}")

test 3.44


In [55]:
def train(model,loss_fn,optimizer):
    model.train()
    total_loss=0
    train_loader=DataLoader(trainDataset,batch_size = batch_size, num_workers = num_workers,
                        shuffle=True, collate_fn = Collate(pad_idx=SPECIAL_CHAR['<PAD>']),pin_memory=True)
    for j,(src,tgt) in enumerate(train_loader):
        src=src.to(device=DEVICE)
        tgt=tgt.to(device=DEVICE)
        tgt_input=tgt[:-1,:]
        tgt_output=tgt[1:,:].type(torch.LongTensor).to(device=DEVICE)
        
        src_mask,tgt_mask,src_padding_mask,tgt_padding_mask=create_mask(src,tgt_input)
        src_mask=src_mask.to(device=DEVICE)
        tgt_mask=tgt_mask.to(device=DEVICE)
        src_padding_mask=src_padding_mask.to(device=DEVICE)
        tgt_padding_mask=tgt_padding_mask.to(device=DEVICE)
        
        predicted=model(src=src,
                        trg=tgt_input,
                        src_mask= src_mask,
                        tgt_mask= tgt_mask,
                        src_padding_mask=src_padding_mask,
                        tgt_padding_mask=tgt_padding_mask,
                       memory_key_padding_mask=src_padding_mask)
        
#         breakpoint()

        optimizer.zero_grad()
        loss=loss_fn(predicted.reshape(-1,predicted.size()[-1]),tgt_output.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss+=loss
        
    return total_loss/len(train_loader)

def val(model):
    val_loader=DataLoader(valDataset,batch_size = batch_size, num_workers = num_workers,
                        shuffle=True, collate_fn = Collate(pad_idx=SPECIAL_CHAR['<PAD>']),pin_memory=True)
    model.eval()
    val_total_loss=0
    for j,(src,tgt) in enumerate(val_loader):
        src=src.to(device=DEVICE)
        tgt=tgt.to(device=DEVICE)
        tgt_input=tgt[:-1,:]
        tgt_output=tgt[1:,:].type(torch.LongTensor).to(device=DEVICE)
        
        src_mask,tgt_mask,src_padding_mask,tgt_padding_mask=create_mask(src,tgt_input)
        src_mask=src_mask.to(device=DEVICE)
        tgt_mask=tgt_mask.to(device=DEVICE)
        src_padding_mask=src_padding_mask.to(device=DEVICE)
        tgt_padding_mask=tgt_padding_mask.to(device=DEVICE)
        
        predicted=model(src=src,
                        trg=tgt_input,
                        src_mask= src_mask,
                        tgt_mask= tgt_mask,
                        src_padding_mask=src_padding_mask,
                        tgt_padding_mask=tgt_padding_mask,
                       memory_key_padding_mask=src_padding_mask)
        
#         breakpoint()

        val_loss=loss_fn(predicted.reshape(-1,predicted.size()[-1]),tgt_output.reshape(-1))
        val_total_loss+=val_loss
    return val_total/len(val_loader)

In [89]:
EPOCHS=5
for i in range(EPOCHS):
    train_loss=train(model,loss_fn,optimizer)
#     val_loss=val(model)
    val_loss=0
    
    
        
    
    print(f"training loss ---- {train_loss} ::: testing loss ---- {val_loss}")
        
        
        

training loss ---- 4.913414001464844 ::: testing loss ---- 0
training loss ---- 4.404131889343262 ::: testing loss ---- 0
training loss ---- 4.047504425048828 ::: testing loss ---- 0
training loss ---- 3.7489264011383057 ::: testing loss ---- 0
training loss ---- 3.5136923789978027 ::: testing loss ---- 0


In [91]:
def greedy_decode(src,src_mask,start_index,end_index,max_len,model):
    src=src.to(device=DEVICE)
    src_mask=src_mask.to(device=DEVICE)
    mem=model.encode(src,src_mask).to(device=DEVICE)
    ys=torch.Tensor([[start_index]]).type(torch.long).to(device=DEVICE)
    for i in range(max_len):
        tgt_mask=generate_square_subsequent_mask(len(ys)).type(torch.bool).to(DEVICE)
        pred=model.decode(ys,mem,tgt_mask)
        output=model.generator(pred)
        ys=torch.cat((ys,output))
        if output==stop_index:
            break
    
    return ys
    
    
def greedy_decode1(model, src, src_mask, max_len, start_symbol,end_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == end_symbol:
            break
    return ys
    
    

In [94]:
temp="Comment allez vous"
temp="what is your name"
test_in=torch.Tensor(trainDataset.lang1_vocab.numericalize(temp)).to(device=DEVICE)
test_mask=torch.zeros(test_in.size()[0],test_in.size()[0]).to(device=DEVICE)
test_in=test_in.reshape(-1,1)
print()

#greedy_decode(test_in,test_mask,SPECIAL_CHAR['<SOS>'],SPECIAL_CHAR['<EOS>'],50,model)
values=greedy_decode1(model,test_in,test_mask,50,SPECIAL_CHAR['<SOS>'],SPECIAL_CHAR['<EOS>'])




In [95]:
trainDataset.lang2_vocab.stringify(values.reshape(-1).to(dtype=torch.int32).cpu().numpy())

['<SOS>', 'que', 'ton', 'nom', '<EOS>']

In [96]:
values.reshape(-1).to(dtype=torch.int32)

tensor([  1,   7,  77, 267,   2], device='cuda:0', dtype=torch.int32)