In [1]:
import os
import math
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)


class Corpus(object):
    def __init__(self, path,fname):
        self.sentences = []
        self.vocab_idx = 0
        self.vocab_map = {'<pad>': 0}
        self.dictionary = Dictionary()
        self.file = os.path.join(path, fname)
        #self.train = self.tokenize(self.file)
        #self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        #self.test = self.tokenize(os.path.join(path, 'test.txt'))
        self.longest_sent = self.longestSentLength(self.file)
        self.data = self.shape_data()
        

    def longestSentLength(self,file):
        assert os.path.exists(file)
        # Add words to the dictionary
        max_len = 0
        with open(file, 'r') as f:
            for line in f:
                self.sentences.append(line) 
                words = line.split()
                if max_len < len(words):
                    max_len = len(words)
        return max_len

    def padding(self,sentence):
        new_sentence = []
        for i in range(0 , self.longest_sent):
            new_sentence.append('<pad>')
        j = 1
        for i in range((self.longest_sent - len(sentence) + 1) , self.longest_sent+1):
            new_sentence[i-1] = sentence[j-1]
            j = j + 1
        return new_sentence
    
    def shape_data(self):
        x = torch.zeros(len(self.sentences),self.longest_sent)
        for i in range(0,len(self.sentences)):
            words = self.sentences[i].split()
            words = self.padding(words)
            for j in range(0,len(words)):
                if self.vocab_map.get(words[j]) == None:
                    self.vocab_idx = self.vocab_idx + 1
                    self.vocab_map[words[j]] = self.vocab_idx
                x[i][j] = self.vocab_map[words[j]]
        print("Number of words = %d" % self.vocab_idx)
        self.no_of_words = self.vocab_idx
        return x.long()
        
    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r') as f:
            tokens = 0
            for line in f:
                words = line.split() + ['<eos>']
                tokens += len(words)
                for word in words:
                    self.dictionary.add_word(word)

        # Tokenize file content
        with open(path, 'r') as f:
            ids = torch.LongTensor(tokens)
            token = 0
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    ids[token] = self.dictionary.word2idx[word]
                    token += 1

        return ids

class args(object):
    pass
    

In [3]:
###############################################################################
# Load data
###############################################################################
corpusEng = Corpus('./','english.1000.tok')
corpusTr = Corpus('./','turkish.1000.tok')

Number of words = 3059
Number of words = 5252


In [4]:
corpusEng.data[0:20]


    0     0     0  ...      1     2     3
    0     0     0  ...     56    26    42
    0     0     0  ...     58    23    59
       ...          ⋱          ...       
    0     0     0  ...    184    91    42
    0     0     0  ...    190    80    42
    0     0     0  ...    163    10    42
[torch.LongTensor of size 20x105]

In [5]:
vocab_size_pri = corpusEng.vocab_idx
vocab_size_sec = corpusTr.vocab_idx
embedding_dim = 64
batch_size = 1000
max_epoch = 1 # 500

In [6]:
class BiLingual(nn.Module):
    
    def __init__(self, vocab_size_pri,vocab_size_sec ,embedding_dim,batch_size):
        super(BiLingual, self).__init__()
        self.embeddings_pri = nn.Embedding(vocab_size_pri, embedding_dim)
        self.embeddings_sec = nn.Embedding(vocab_size_sec, embedding_dim)
        
    def cAdd(self,embeds):
        btch_len = embeds.size()[0]
        sntc_len = embeds.size()[1]
        ret = []
        for i in range(btch_len):
            splt=torch.split(embeds[i],sntc_len,1)
            tot = autograd.Variable(torch.zeros(embedding_dim),requires_grad=False)
            for j in range(sntc_len):
                tot = tot + embeds[i][j]
            ret.append(tot)
        ret=torch.stack(ret,0)
        return ret
       
    def forwardPri(self, inputs):
        embeds_pri = self.embeddings_pri(inputs)
        out_pri = self.cAdd(embeds_pri)
        return out_pri
    
    def forwardSec(self, inputs):
        embeds_sec = self.embeddings_sec(inputs)
        out_sec = self.cAdd(embeds_sec)
        return out_sec

In [None]:
def testtest():
    inds = torch.range(1, number_of_sentences,batch_size).long()
    shuffle = torch.randperm(inds.size()[0])
    start = inds[shuffle[0]]-1
    endd = inds[shuffle[0]]+batch_size-1
    inputEng = autograd.Variable(corpusEng.data[start:endd])
    inputTr = autograd.Variable(corpusTr.data[start:endd])
    
    embeddings_pri = nn.Embedding(vocab_size_pri, embedding_dim)
    embeddings_sec = nn.Embedding(vocab_size_sec, embedding_dim)
    
    embeds_pri = embeddings_pri(inputEng)
    embeds_sec = embeddings_sec(inputTr)
    
    btch_len = embeds_pri.size()[0]
    sntc_len = embeds_pri.size()[1]
    ret = []
    for i in range(btch_len):
            splt=torch.split(embeds_pri[i],sntc_len,1)
            tot = autograd.Variable(torch.zeros(embedding_dim))
            for j in range(sntc_len):
                tot = tot + embeds_pri[i][j]
            ret.append(tot)
    ret=torch.stack(ret,0)

    

In [7]:
# Set the random seed manually for reproducibility.
torch.manual_seed(1111)
if torch.cuda.is_available():
    torch.cuda.manual_seed(1111)
torch.cuda.is_available(),torch.cuda.device_count()

(True, 1)

In [None]:
inputEng

In [None]:
embeddings_pri = nn.Embedding(vocab_size_pri, embedding_dim) 

In [None]:
vocab_size_pri, embedding_dim

In [None]:
embeddings_pri

In [None]:
embeds_pri = embeddings_pri(inputEng)

In [8]:
loss_function = nn.L1Loss()
losses = []
model = BiLingual(vocab_size_pri+1,vocab_size_sec+1,embedding_dim,batch_size)
optimizer = optim.SGD(model.parameters(), lr=0.001)
number_of_sentences = math.floor((len(corpusEng.sentences)/batch_size)*batch_size)
number_of_sentences/batch_size,number_of_sentences

(1.0, 1000)

In [None]:
for epoch in range(max_epoch):
    total_loss = 0.0
    inds = torch.range(1, number_of_sentences,batch_size).long()
    shuffle = torch.randperm(inds.size()[0])
    for j in range(int(number_of_sentences/batch_size)):
    
        start = inds[shuffle[j]]-1
        endd = inds[shuffle[j]]+batch_size-1
        print(' start %d end %d' % (start,endd))
        inputEng = autograd.Variable(corpusEng.data[start:endd])
        inputTr = autograd.Variable(corpusTr.data[start:endd])
        print("a")
        model.zero_grad()
        print("b")
        outputPri = model.forwardPri(inputEng)
        print("c")
        outputSec = model.forwardSec(inputTr)
        print("d")
        outputSec2 = autograd.Variable(outputSec.data.float(),requires_grad=False)
        outputSec3 = autograd.Variable(torch.Tensor([2.0]))
        print("e")
        lossPri = loss_function(outputPri,outputSec2)
        print("f")
        lossPri.backward()
        print('step %d ' % (j))
        print(lossPri.data)
#        outputPri = model.forwardPri(inputEng)
#        outputSec = model.forwardSec(inputTr)
#        lossSec = loss_function(outputSec,outputPri)
#        lossSec.backward()
        
        #optimizer.step()
        total_loss += lossPri.data

    losses.append(total_loss)
print (losses) # The loss decreased every iteration over the training data!

 start 0 end 1000
a
b
c
d
e
f


In [None]:
total_loss

In [None]:
losses = []

In [10]:
autograd.Variable(torch.Tensor([2.0]))

Variable containing:
 2
[torch.FloatTensor of size 1]