In [54]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import re
import unicodedata
import time

In [3]:
INPUT_FILE = '../../Data/language_data/eng-fra.txt'

with open(INPUT_FILE,'r',encoding='utf-8') as f:
    lines = f.read().strip().split('\n')
    
lines[:10]

['Go.\tVa !',
 'Run!\tCours\u202f!',
 'Run!\tCourez\u202f!',
 'Wow!\tÇa alors\u202f!',
 'Fire!\tAu feu !',
 "Help!\tÀ l'aide\u202f!",
 'Jump.\tSaute.',
 'Stop!\tÇa suffit\u202f!',
 'Stop!\tStop\u202f!',
 'Stop!\tArrête-toi !']

In [4]:
len(lines)

135842

In [5]:
SAMPLE = 30000

original_word_pairs = [[w for w in l.split('\t')] for l in lines[:SAMPLE]]

In [6]:
data = pd.DataFrame(original_word_pairs,columns=["en","fr"])

In [7]:
data.head(5)

Unnamed: 0,en,fr
0,Go.,Va !
1,Run!,Cours !
2,Run!,Courez !
3,Wow!,Ça alors !
4,Fire!,Au feu !


In [10]:
def unicodeToAscii(s):
    '''
        convert unicode characters to ascii
    '''
    return ''.join(c for c in unicodedata.normalize('NFD',s) if unicodedata.category(c)!='Mn')


def preprocess_sentence(w):
    '''
        pre-process the sentence.
    '''
    w = unicodeToAscii(w.lower().strip()) 
    w = re.sub(r"([?.!,¿])", r" \1 ", w)  ## creating a space between a word and the punctuation following it
    w = re.sub(r'[" "]+', " ", w)
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w) ## replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    
    w = w.rstrip().strip()
    
    w = '<start> ' + w + ' <end>'
    
    return w

In [11]:
data["en"] = data.en.apply(lambda x: preprocess_sentence(x))
data["fr"] = data.fr.apply(lambda x: preprocess_sentence(x))

In [12]:
data.head(10)

Unnamed: 0,en,fr
0,<start> go . <end>,<start> va ! <end>
1,<start> run ! <end>,<start> cours ! <end>
2,<start> run ! <end>,<start> courez ! <end>
3,<start> wow ! <end>,<start> ca alors ! <end>
4,<start> fire ! <end>,<start> au feu ! <end>
5,<start> help ! <end>,<start> a l aide ! <end>
6,<start> jump . <end>,<start> saute . <end>
7,<start> stop ! <end>,<start> ca suffit ! <end>
8,<start> stop ! <end>,<start> stop ! <end>
9,<start> stop ! <end>,<start> arrete toi ! <end>


In [20]:
### Building vocabulary index

class LangIndex():
    def __init__(self, lang):
        '''
            lang are the list of phrases from each language
        '''
        self.lang = lang
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = set()
        
        self.create_index()
        
    
    def create_index(self):
        
        for phrase in self.lang:
            '''update the indivisual token'''
            self.vocab.update(phrase.split(' '))
            
        self.vocab = sorted(self.vocab)
        
        self.word2idx['<pad>'] = 0  ## padd mapping
        
        ## word to index mapping
        for index, word in enumerate(self.vocab):
            self.word2idx[word] = index + 1
            
        for index, word in enumerate(self.vocab):
            self.idx2word[index] = word
            
        

In [23]:
input_lang = LangIndex(data['fr'].values.tolist())
output_lang = LangIndex(data['en'].values.tolist())

In [25]:
## vectorise the input and output langs

input_tensor = [[input_lang.word2idx[f] for f in fr.split(' ')] for fr in data['fr'].values.tolist()]
output_tensor = [[output_lang.word2idx[e] for e in en.split(' ')] for en in data['en'].values.tolist()]

In [27]:
input_tensor[:10]

[[5, 7451, 1, 4],
 [5, 1678, 1, 4],
 [5, 1670, 1, 4],
 [5, 994, 279, 1, 4],
 [5, 599, 3150, 1, 4],
 [5, 7, 4159, 181, 1, 4],
 [5, 6541, 3, 4],
 [5, 994, 6935, 1, 4],
 [5, 6901, 1, 4],
 [5, 458, 7216, 1, 4]]

In [31]:
output_tensor[:10]

[[5, 1686, 3, 4],
 [5, 3279, 1, 4],
 [5, 3279, 1, 4],
 [5, 4410, 1, 4],
 [5, 1477, 1, 4],
 [5, 1850, 1, 4],
 [5, 2147, 3, 4],
 [5, 3731, 1, 4],
 [5, 3731, 1, 4],
 [5, 3731, 1, 4]]

In [37]:
print(output_tensor[0])

for idx in output_tensor[0]:
    print(output_lang.idx2word[idx])

[5, 1686, 3, 4]
?
goal
<end>
<start>


In [38]:
### calculate the max length of input and output tensors

def max_length(tensor):
    return max(len(t) for t in tensor)

max_length_input, max_length_output = max_length(input_tensor), max_length(output_tensor)

print(f'max_length_input: {max_length_input} \t max_length_output: {max_length_output}')

max_length_input: 17 	 max_length_output: 10


In [43]:
def pad_sequences(x, max_len):
    padd_seq = np.zeros((max_len), dtype=np.int64)
    
    if len(x) > max_len:
        padd_seq[:] = x[:max_len]
    else:
        padd_seq[:len(x)] = x
    
    return padd_seq

In [40]:
pad = np.zeros((10), dtype=np.int64)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [41]:
pad[:4] = [5, 1686, 3, 4]

In [42]:
pad

array([   5, 1686,    3,    4,    0,    0,    0,    0,    0,    0])

In [44]:
### add padding to the input and output tensors to make it similar

input_pad_tensor = [pad_sequences(tensor, max_length_input) for tensor in input_tensor]
output_pad_tensor = [pad_sequences(tensor, max_length_output) for tensor in output_tensor]

In [48]:
print(input_pad_tensor[1].tolist())
print(len(input_pad_tensor[1]))
print(len(output_tensor))

[5, 1678, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
17
30000


In [53]:
### split the data into train and test

X_train, X_test, y_train, y_test = train_test_split(input_pad_tensor, output_pad_tensor, test_size=0.2)

print(f'Length of X_train: {len(X_train)}  y_Train: {len(y_train)}')
print(f'Length of X_test : {len(X_test)}    y_test: {len(y_test)}')

Length of X_train: 24000  y_Train: 24000
Length of X_test : 6000    y_test: 6000


In [55]:
### Create pytorch dataset

class NMTDataset(Dataset):  
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.length = [ np.sum(1 - np.equal(x, 0)) for x in X]
        
        
    def __len__(self):
        return len(self.X)
        
    def __getitem__(self, index):
        x = self.X[index]
        y = self.y[index]
        x_len = self.length[index]
        
        return x,y,x_len
        

In [57]:
train_ds = NMTDataset(X_train, y_train)
test_ds = NMTDataset(X_test, y_test)

In [58]:
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True, drop_last=True)
test_ds = DataLoader(test_ds, shuffle=False, drop_last=False)

In [160]:
class Encoder(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, enc_unit, batch_sz):
        super(Encoder, self).__init__()
        
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.enc_unit = enc_unit
        self.batch_sz = batch_sz
        
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.gru = nn.GRU(self.embedding_dim, self.enc_unit)
        
        
    def forward(self, x, lens, device):
        ## x: batch_size, max_length
        
        x = self.embedding(x) ## x: batch_size, max_length, embedding_dim
        x = nn.utils.rnn.pack_padded_sequence(x, lens) ## unpad
        
        self.hidden = self.initialize_hidden_state(device)
        
        # output: max_length, batch_size, enc_units
        # self.hidden: 1, batch_size, enc_units
        output, self.hidden = self.gru(x, self.hidden)
        output , _ = nn.utils.rnn.pad_packed_sequence(output) ## pad to the max value of output
        
        return output, self.hidden
        
    
    def initialize_hidden_state(self, device):
        return torch.zeros((1, self.batch_sz, self.enc_unit)).to(device)

In [161]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [162]:
BUFFER_SIZE = len(train_ds)
BATCH_SIZE = 64
N_BATCH = BUFFER_SIZE//BATCH_SIZE
EMBEDDING_DIM = 256
UNITS = 1024

vocab_inp_size = len(input_lang.word2idx)
vocab_out_size = len(output_lang.word2idx)

print(f'{BUFFER_SIZE} \t {BATCH_SIZE} \t {N_BATCH} \t {EMBEDDING_DIM}')
print(f'{vocab_inp_size} \t {vocab_tar_size}')

24000 	 64 	 375 	 256
7727 	 4445


In [163]:
def sort_batch(x,y, lengths):
    lengths, indx = lengths.sort(dim=0, descending=True)
    x = x[indx]
    y = y[indx]
    return x.transpose(0,1), y, lengths

In [164]:
## Testing the encoder

encoder = Encoder(vocab_inp_size, EMBEDDING_DIM, UNITS, BATCH_SIZE)
encoder.to(device)

it = iter(train_loader)
x1, y1, l1 = next(it)

xs1, ys1, ls1 = sort_batch(x1, y1, l1)

encoder_output, encoder_hidden = encoder(xs1.to(device), ls1, device)

encoder_output.size()

torch.Size([12, 64, 1024])

In [173]:
it = iter(train_loader)
x1, y1, l1 = next(it)

x1[10]

tensor([   5, 4040,  179, 4950, 4160, 5627, 5401,    3,    4,    0,    0,    0,
           0,    0,    0,    0,    0])

In [174]:
y1[10]

tensor([   5, 1965, 1703, 1482, 2870,    3,    4,    0,    0,    0])

In [175]:
l1[10]

tensor(9)

In [138]:
torch.zeros((1, BATCH_SIZE, vocab_inp_size)).shape

torch.Size([1, 64, 7727])

In [182]:
input_lang.idx2word[4040]

'jalouse'

In [183]:
output_lang.idx2word[1965]

'ibaraki'