In [2]:
import torch 
from torch import nn
from nltk import ngrams

## Lectura y preprocesamiento del corpus

In [3]:
with open('Corpus/data_test.txt', 'r') as file:
    corpus_raw_training = file.read()

In [4]:
test_corpus = [sentence.split('\t') for sentence in corpus_raw_training.split('\n')]
test_corpus[0]

['we therefore respect whatever parliament may decide ',
 'quindi noi rispettiamo le eventuali decisioni in materia del parlamento']

In [5]:
en_corpus = []
it_corpus = []

for i, string_pair in enumerate(test_corpus):
    if i != 900:
        en_corpus.append('<BOS> ' + string_pair[0] + ' <EOS>')
        it_corpus.append('<BOS> ' + string_pair[1] + ' <EOS>')
        
print('Ingles:\n', en_corpus[0])
print('\nItaliano:\n', it_corpus[0])

Ingles:
 <BOS> we therefore respect whatever parliament may decide  <EOS>

Italiano:
 <BOS> quindi noi rispettiamo le eventuali decisioni in materia del parlamento <EOS>


In [6]:
# Crear los alfabetos de ambos corpus
en_dict = []
it_dict = []

for en_sent, it_sent in zip(en_corpus, it_corpus):
    for en_word, it_word in zip(en_sent.split(), it_sent.split()):
        if (en_word not in en_dict) and (en_word not in ['<BOS>', '<EOS>']):
            en_dict.append(en_word)
        if (it_word not in it_dict) and (it_word not in ['<BOS>', '<EOS>']):
            it_dict.append(it_word)

en_dict.append('<BOS>')
it_dict.append('<BOS>')
en_dict.append('<EOS>')
it_dict.append('<EOS>')    

print('Palabras en corpus ingles = ', len(en_dict)-2)
print('Palabras en corpus italiano = ', len(it_dict)-2)

Palabras en corpus ingles =  3157
Palabras en corpus italiano =  4318


In [7]:
# Creacion de diccionarios

word2ENindx = {}
indx2ENword = {}

for idx, en_word in enumerate(en_dict):
    word2ENindx[en_word] = idx
    indx2ENword[idx] = en_word 

word2ITindx = {}
indx2ITword = {}

for idx, it_word in enumerate(it_dict):
    word2ITindx[it_word] = idx
    indx2ITword[idx] = it_word     

### Obtener los bigramas de ambos corpus

In [8]:
en_bigrams = []

for en_sent in en_corpus:
    en_bigrams.append(list(ngrams(en_sent.split(), 2)))
    
en_bigrams[0]

[('<BOS>', 'we'),
 ('we', 'therefore'),
 ('therefore', 'respect'),
 ('respect', 'whatever'),
 ('whatever', 'parliament'),
 ('parliament', 'may'),
 ('may', 'decide'),
 ('decide', '<EOS>')]

In [9]:
it_bigrams = []

for it_sent in it_corpus:
    it_bigrams.append(list(ngrams(it_sent.split(), 2)))
    
it_bigrams[0]

[('<BOS>', 'quindi'),
 ('quindi', 'noi'),
 ('noi', 'rispettiamo'),
 ('rispettiamo', 'le'),
 ('le', 'eventuali'),
 ('eventuali', 'decisioni'),
 ('decisioni', 'in'),
 ('in', 'materia'),
 ('materia', 'del'),
 ('del', 'parlamento'),
 ('parlamento', '<EOS>')]

In [10]:
# Vector one hot dado indice 
def oneHot(idx, N):
    vector = torch.zeros(N, dtype=torch.long)
    vector[idx] = 1
    return vector

In [11]:
oneHot(word2ENindx['<EOS>'], len(en_dict))

tensor([0, 0, 0,  ..., 0, 0, 1])

## Implementacion de red neuronal

In [35]:
class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, num_layers):
        super(Encoder, self).__init__()
        
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim) 
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers)
        
    def forward(self, x):
        emb_x = self.embedding(x)
        outs, (hidden, cell) = self.lstm(emb_x)
        return hidden, cell

In [40]:
class Decoder(nn.Module):
    def __init__(self, out_dim, embedding_dim, hidden_dim, num_layers):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=out_dim, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers)
        self.out = nn.Linear(in_features=hidden_dim, out_features=out_dim)
        
    def forward(self, x, hidden):
        emb_x = self.embedding(x.unsqueeze(0))
        out, hidden = self.lstm(emb_x, (hidden, cell))
        y_pred = self.out(out.squeeze(0))
        return y_pred, hidden, cell

In [41]:
input_dim = len(en_dict)
out_dim = len(it_dict)
embedding_dim = 300              
hidden_dim = 100 
num_layers = 4

In [42]:
encoder = Encoder(input_dim, embedding_dim, hidden_dim, num_layers)

decoder = Decoder(out_dim, embedding_dim, hidden_dim, num_layers)

In [43]:
print('Encoder layers\n')
print(encoder.embedding)
print(encoder.lstm)

Encoder layers

Embedding(3159, 300)
LSTM(300, 100, num_layers=4)


In [46]:
print('Decoder layers\n')
print(decoder.embedding)
print(decoder.lstm)
print(decoder.out)

Decoder layers

Embedding(4320, 300)
LSTM(300, 100, num_layers=4)
Linear(in_features=100, out_features=4320, bias=True)


In [18]:
x1 = oneHot(word2ENindx['<EOS>'], len(en_dict))

out, hidden = encoder.forward(x1)

print(out.shape)
print(hidden.shape)

torch.Size([1, 3159, 100])
torch.Size([100, 3159, 100])


In [19]:
x2 = oneHot(word2ITindx['<EOS>'], len(it_dict))

print(x2.shape)

out2, hidden2 = decoder.forward(x2, hidden)

torch.Size([4320])


RuntimeError: Expected hidden size (100, 4320, 100), got (100, 3159, 100)

In [31]:
torch.tensor([1,2,3,4]).view(1,1,-1).shape

torch.Size([1, 1, 4])

In [30]:
torch.tensor([1,2,3,4]).unsqueeze(0).shape

torch.Size([1, 4])