In [1]:
import torch
import torchnlp

In [2]:
from torchtext import data

In [3]:
from torch import nn
from torch.nn import functional as F
from torch.nn.utils import clip_grad_norm
from torchnlp.encoders import LabelEncoder
from torchnlp.encoders.text import CharacterEncoder
from torch.utils.data import Dataset, DataLoader

In [4]:
from torch.autograd import Variable

In [5]:
import csv
import sys
import os
import numpy as np
import pandas as pd
import pickle as pi
import time
import math

In [6]:
from collections import defaultdict
import random

In [7]:
import gc
gc.collect()

40

In [8]:
# Notebook-wide configuration.
# NOTE(review): sentences_path is not read by any of the visible cells.
sentences_path = './sentences.csv'
# Directory holding one "<lang>.txt" file of tab-separated sentence pairs per language.
pairs_path = './paired-langs/'
# Enable the GPU code paths only when CUDA is actually available, instead of
# assuming a GPU is always present.
use_cuda = torch.cuda.is_available()
# Number of full passes over the training data.
epochs = 20

In [9]:
# Collect one language code per "<code>.txt" file found in pairs_path.
# - The listing is sorted because os.listdir returns entries in arbitrary
#   order, and the order of pairs_list is what the label encoder is built from.
# - The code is the file name without its ".txt" suffix, so that
#   `code + '.txt'` (used by the loading functions) always points back at the
#   file it came from.
pairs_list = [
    os.path.splitext(file_name)[0]
    for file_name in sorted(os.listdir(pairs_path))
    if file_name.endswith('.txt')
]

In [10]:
def showall():
    """Yield a preview of every language-pair file.

    For each language code in ``pairs_list`` the file
    ``<pairs_path>/<code>.txt`` is opened, the language code is printed, and a
    list with the first five raw lines (fewer if the file is shorter) is
    yielded.

    Yields:
        list[str]: up to five lines, newline characters included.
    """
    for lang in pairs_list:
        full_path = os.path.join(pairs_path, lang + '.txt')
        with open(full_path, 'r') as f:
            first5 = []
            # Iterate instead of calling next(f): a StopIteration escaping
            # from inside a generator is turned into a RuntimeError, which
            # made any file shorter than five lines crash the preview.
            for line in f:
                first5.append(line)
                if len(first5) == 5:
                    break
        print(lang)
        yield first5

In [11]:
def get_data():
    """Load every language-pair file into one flat list of records.

    Each line of ``<pairs_path>/<lang>.txt`` is stripped of its trailing
    newline, split on tabs, and prefixed with the language code, giving
    tuples of the form ``(lang, input, output)``.

    Returns:
        list[tuple]: one tuple per line, in ``pairs_list`` order.
    """
    def _records(lang):
        # Stream the tab-separated fields of one language file.
        full_path = os.path.join(pairs_path, lang + '.txt')
        with open(full_path, 'r') as handle:
            for raw_line in handle:
                yield (lang, *raw_line.strip('\n').split('\t'))

    return [record for lang in pairs_list for record in _records(lang)]

In [12]:
# Load every (lang, input, output) record from the pair files into memory.
s = get_data()

In [13]:
# Fit the character encoder on all input sentences followed by all output
# sentences, so every character seen on either side gets an index.
cols = [*zip(*s)]
encoder = CharacterEncoder([*cols[1], *cols[2]])

In [14]:
# Build the language-label encoder from the list of language codes.
lang_encoder = LabelEncoder(pairs_list)

In [15]:
def encode_dataset(ds, seq_start, seq_end):
    """Encode raw ``(lang, input, output)`` records as tensors.

    Args:
        ds: iterable of records whose first three fields are the language
            code, the input sentence and the output sentence.
        seq_start: index prepended to every encoded output sentence.
        seq_end: index appended to every encoded output sentence.

    Returns:
        list[tuple]: ``(encoded_lang, encoded_input, encoded_output)`` per
        record, where ``encoded_output`` is
        ``[seq_start, *encoded chars, seq_end]``.
    """
    start = torch.LongTensor([seq_start])
    end = torch.LongTensor([seq_end])
    # Iterate over the `ds` argument; the previous version read the global
    # `s` and silently ignored whatever dataset the caller passed in.
    return [
        (
            lang_encoder.encode(record[0]),
            encoder.encode(record[1]),
            torch.cat([start, encoder.encode(record[2]), end]),
        )
        for record in ds
    ]

In [16]:
# Encode the records, using the encoder's '<s>' and '</s>' indices as the
# start and end markers of every output sequence.
# NOTE(review): this rebinds `s` from raw strings to tensors, so the cell is
# presumably not safe to run twice without re-running the loading cell first.
s = encode_dataset(s, encoder.stoi['<s>'], encoder.stoi['</s>'])

In [17]:
def dataset_iter(ds, batch_size=100, padding_idx=1):
    """Yield consecutive padded mini-batches from an encoded dataset.

    Args:
        ds: sequence of ``(lang, src, tgt)`` records; ``src`` and ``tgt`` are
            1-D tensors that may differ in length between records.
        batch_size: maximum number of records per batch; the final batch may
            be smaller.
        padding_idx: value used to right-pad ``src`` and ``tgt`` to the
            longest sequence in their batch.

    Yields:
        list[tuple]: ``(lang, padded_src, padded_tgt)`` for each record of
        the batch, in dataset order.
    """
    total = len(ds)
    pad_sequence = nn.utils.rnn.pad_sequence
    for lo in range(0, total, batch_size):
        hi = min(lo + batch_size, total)
        langs, sources, targets = zip(*ds[lo:hi])
        # Sources and targets are padded independently, each to its own
        # longest sequence within this batch.
        padded_sources = pad_sequence(sources, padding_value=padding_idx, batch_first=True)
        padded_targets = pad_sequence(targets, padding_value=padding_idx, batch_first=True)
        yield list(zip(langs, padded_sources, padded_targets))

In [18]:
# Split the encoded records into a training set and a validation set.
val_fraction = 0.1
# Shuffle with a dedicated, fixed-seed generator so the split is reproducible
# and the global `random` state is left untouched.
random.Random(0).shuffle(s)
# Compute the boundary from the front of the list. The previous negative
# index collapsed to 0 whenever len(s) * val_fraction < 1, which put every
# record in `val` and left `train` empty.
val_size = int(len(s) * val_fraction)
split_index = len(s) - val_size
val = s[split_index:]
train = s[:split_index]

#val_iter, train_iter = data.BucketIterator.splits((val, train), batch_size=100, sort_key=len)

In [19]:
class CharEncoder(nn.Module):
    """Bidirectional LSTM encoder over embedded character indices.

    Args:
        num_chars: size of the character vocabulary.
        embedding_c: dimensionality of the character embeddings.
        h1_dim: hidden size of the LSTM (per direction).
        num_layers: number of stacked LSTM layers.

    Attributes:
        hidden_flat_size: ``h1_dim * num_layers * directions``, i.e. the
            number of values in the final hidden state of one example.
    """

    def __init__(self, num_chars, embedding_c, h1_dim, num_layers):
        super().__init__()
        directions = 2  # the LSTM below is bidirectional

        # Index 0 is the embedding's padding row.
        self.char_encoder = nn.Embedding(
            num_embeddings=num_chars,
            embedding_dim=embedding_c,
            padding_idx=0,
        )
        self.rnn = nn.LSTM(
            input_size=embedding_c,
            hidden_size=h1_dim,
            num_layers=num_layers,
            bidirectional=True,
        )
        self.num_layers, self.h1_dim, self.directions = num_layers, h1_dim, directions
        self.hidden_flat_size = directions * num_layers * h1_dim

    def forward(self, x):
        """Embed ``x`` and run the LSTM; returns ``(encoded, hidden)`` as produced by ``nn.LSTM``."""
        return self.rnn(self.char_encoder(x))

In [20]:
class CharDecoder(nn.Module):
    """Single-step LSTM decoder that scores the next character.

    Args:
        num_chars: size of the character vocabulary (number of output scores).
        embedding_c: dimensionality of the character embeddings.
        h1_dim: hidden size of the LSTM.
        num_layers: number of stacked LSTM layers.

    Attributes:
        hidden_flat_size: ``h1_dim * num_layers``, the number of values in the
            hidden state of one example.
    """

    def __init__(self, num_chars, embedding_c, h1_dim, num_layers):
        super().__init__()
        self.h1_dim = h1_dim
        self.num_layers = num_layers
        self.hidden_flat_size = h1_dim*num_layers
        self.out = nn.Linear(h1_dim, num_chars)
        # Kept so code that reads `decoder.softmax` keeps working; it has no
        # parameters and is no longer applied inside forward().
        self.softmax = nn.Softmax(dim=1)
        self.embedding = nn.Embedding(num_chars, embedding_c, padding_idx=0)
        self.rnn = nn.LSTM(embedding_c, h1_dim, num_layers)

    def forward(self, x, hidden):
        """Advance the decoder by one time step.

        Args:
            x: 1-D tensor with one character index per batch element.
            hidden: ``(h, c)`` LSTM state from the previous step.

        Returns:
            tuple: ``(scores, hidden)`` where ``scores`` has one row of
            unnormalised scores (logits) over the vocabulary per batch element
            and ``hidden`` is the updated LSTM state.
        """
        # Add a leading time dimension of length 1 for the LSTM.
        embedded = self.embedding(x).unsqueeze(0)
        output, hidden = self.rnn(embedded, hidden)
        # Return raw logits. The training code passes these scores to
        # F.cross_entropy, which applies log-softmax itself; applying softmax
        # here as well squashed the scores twice and flattened the gradients.
        # The arg-max over a row is unchanged, so greedy decoding is unaffected.
        output = self.out(output.squeeze(0))
        return output, hidden

In [21]:
class Chars2Chars(nn.Module):
    """Character-level sequence-to-sequence model conditioned on a language id.

    The encoder's final hidden state is concatenated with a learned language
    embedding and projected by ``lang_in`` into the decoder's initial hidden
    state; the decoder then predicts the target one character at a time.

    Args:
        encoder: module returning ``(output, (h_n, c_n))`` and exposing
            ``hidden_flat_size``.
        decoder: module with signature ``decoder(tokens, (h, c))`` returning
            ``(scores, (h, c))`` and exposing ``num_layers``, ``h1_dim`` and
            ``hidden_flat_size``.
        vocab_size: number of scores the decoder produces per step.
        num_langs: number of distinct language labels.
        embedding_l: dimensionality of the language embedding.
    """

    def __init__(self, encoder, decoder, vocab_size, num_langs, embedding_l):
        super(Chars2Chars, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.vocab_size = vocab_size
        self.lang_embedding = nn.Embedding(num_langs, embedding_l)
        self.lang_in = nn.Linear(self.encoder.hidden_flat_size + embedding_l, self.decoder.hidden_flat_size)

    def _device(self):
        """Return the device the model's parameters currently live on."""
        return next(self.parameters()).device

    def _collate(self, batch, device):
        """Stack a list of ``(lang, src, tgt)`` records into time-major tensors on ``device``."""
        langs, source, target = zip(*batch)
        langs = torch.stack(langs).to(device)
        # Stacking gives (batch, seq); the LSTMs expect (seq, batch).
        source = torch.stack(source).T.to(device)
        target = torch.stack(target).T.to(device)
        return langs, source, target

    def _initial_decoder_state(self, langs, input_seqs, batch_size):
        """Encode the inputs and build the decoder's starting ``(hidden, cell)`` state."""
        _, (enc_hidden, _) = self.encoder(input_seqs)
        # An LSTM's h_n is laid out as (layers * directions, batch, hidden).
        # Bring the batch axis to the front BEFORE flattening; a plain
        # .view(batch, -1) reinterprets memory and mixes the states of
        # different examples into one row.
        enc_flat = enc_hidden.permute(1, 0, 2).reshape(batch_size, self.encoder.hidden_flat_size)
        lang_rep = self.lang_embedding(langs)
        hidden_start = self.lang_in(torch.cat([lang_rep, enc_flat], dim=1))
        # Same care in the other direction: split each example's row into
        # layers first, then move the layer axis to the front.
        hidden = hidden_start.view(batch_size, self.decoder.num_layers, self.decoder.h1_dim)
        hidden = hidden.permute(1, 0, 2).contiguous()
        cell = torch.zeros_like(hidden)
        return hidden, cell

    def forward(self, langs, input_seqs, target_seqs, batch_size, target_length):
        """Run teacher-forced decoding over a batch.

        Args:
            langs: language label per example, shape ``(batch,)``.
            input_seqs: source character indices, shape ``(src_len, batch)``.
            target_seqs: target character indices, shape
                ``(target_length, batch)``; row 0 is fed as the first input.
            batch_size: number of examples in the batch.
            target_length: number of time steps in ``target_seqs``.

        Returns:
            Tensor of shape ``(target_length, batch_size, vocab_size)`` with
            the decoder scores for steps ``1..target_length-1``; row 0 is
            left as zeros.
        """
        device = self._device()
        outputs = torch.zeros(target_length, batch_size, self.vocab_size, device=device)
        hidden = self._initial_decoder_state(langs, input_seqs, batch_size)
        output = target_seqs[0].to(device)
        for t in range(1, target_length):
            output, hidden = self.decoder(output, hidden)
            outputs[t] = output
            # Teacher forcing: the next input is the ground-truth character.
            output = target_seqs[t].to(device)
        return outputs

    def batch_train(self, optimizer, train_iter, vocab_size, grad_clip=2, padding_idx=0):
        """Train for one pass over ``train_iter``.

        Args:
            optimizer: optimizer over this model's parameters.
            train_iter: iterable of batches, each a list of
                ``(lang, src, tgt)`` tensors padded to a common length.
            vocab_size: number of classes in the decoder scores.
            grad_clip: maximum gradient norm.
            padding_idx: target index ignored by the loss.

        Prints the mean loss, its exponential (perplexity) and the elapsed
        time every 1000 batches.
        """
        self.train()
        device = self._device()
        running_loss = 0.0
        batches_seen = 0
        curr_time = time.time()
        for b, batch in enumerate(train_iter):
            langs, source, target = self._collate(batch, device)
            optimizer.zero_grad()
            output = self(langs, source, target, len(batch), target.shape[0])
            # Step 0 of the target is only ever an input, so it is excluded
            # from the loss together with its (all-zero) output row.
            loss = F.cross_entropy(output[1:].reshape(-1, vocab_size),
                                   target[1:].reshape(-1),
                                   ignore_index=padding_idx)
            loss.backward()
            # In-place variant; the un-suffixed clip_grad_norm is deprecated.
            nn.utils.clip_grad_norm_(self.parameters(), grad_clip)
            optimizer.step()
            running_loss += loss.item()
            batches_seen += 1

            if b % 1000 == 0 and b != 0:
                # Divide by the number of batches actually accumulated; the
                # first window holds 1001 batches, not 1000.
                mean_loss = running_loss / batches_seen
                print("[%d][loss:%5.2f][pp:%5.2f][time:%5.2f]" %
                      (b, mean_loss, math.exp(mean_loss), time.time() - curr_time))
                running_loss = 0.0
                batches_seen = 0
                curr_time = time.time()

    def infer_greedy(self, lang, input_seq, vocab_size, max_length = 300, start_idx = 3, padding_idx=0, end_idx=2):
        """Greedily decode one input sequence.

        Args:
            lang: language label tensor of shape ``(1,)``.
            input_seq: source character indices of shape ``(src_len, 1)``.
            vocab_size: unused; kept for interface compatibility.
            max_length: maximum number of decoding steps.
            start_idx: index fed to the decoder at the first step.
            padding_idx: unused; kept for interface compatibility.
            end_idx: index that terminates decoding once it is predicted.
                NOTE(review): the default of 2 is an assumption about the
                encoder's '</s>' index — pass ``encoder.stoi['</s>']``
                explicitly to be safe.

        Returns:
            Tensor of shape ``(max_length, vocab_size)``; row ``t`` holds the
            decoder scores of step ``t`` and rows after the end of decoding
            stay zero, so ``outputs.argmax(dim=1)`` recovers the predicted
            indices.
        """
        self.eval()
        device = self._device()
        outputs = torch.zeros(max_length, self.vocab_size, device=device)
        with torch.no_grad():
            hidden = self._initial_decoder_state(lang.to(device), input_seq.to(device), 1)
            # The decoder takes one index per batch element: shape (1,).
            token = torch.tensor([start_idx], dtype=torch.long, device=device)
            for t in range(max_length):
                scores, hidden = self.decoder(token, hidden)
                outputs[t] = scores.view(-1)
                token = scores.argmax(dim=1)
                if token.item() == end_idx:
                    break
        return outputs

    def predict(self, val_iter, vocab_size, padding_idx=0):
        """Return the mean teacher-forced loss over ``val_iter``.

        Args:
            val_iter: iterable (list or generator) of batches in the same
                format as for ``batch_train``.
            vocab_size: number of classes in the decoder scores.
            padding_idx: target index ignored by the loss.

        Returns:
            float: mean per-batch loss, or ``nan`` when ``val_iter`` yields
            no batches.
        """
        self.eval()
        device = self._device()
        total_loss = 0.0
        num_batches = 0
        # No gradients are needed for evaluation; skipping the autograd graph
        # keeps memory usage down.
        with torch.no_grad():
            for batch in val_iter:
                langs, source, target = self._collate(batch, device)
                output = self(langs, source, target, len(batch), target.shape[0])
                loss = F.cross_entropy(output[1:].reshape(-1, vocab_size),
                                       target[1:].reshape(-1),
                                       ignore_index=padding_idx)
                total_loss += loss.item()
                num_batches += 1
        # Count batches while iterating: generators have no len().
        if num_batches == 0:
            return float('nan')
        return total_loss / num_batches

In [22]:
# Build the encoder, the decoder and the full model. Both character modules
# share the vocabulary size of the character encoder.
c_encoder = CharEncoder(
    num_chars=encoder.vocab_size,
    embedding_c=300,
    h1_dim=100,
    num_layers=2,
)
c_decoder = CharDecoder(
    num_chars=encoder.vocab_size,
    embedding_c=300,
    h1_dim=100,
    num_layers=2,
)
chars2chars = Chars2Chars(
    encoder=c_encoder,
    decoder=c_decoder,
    vocab_size=encoder.vocab_size,
    num_langs=len(lang_encoder.vocab),
    embedding_l=100,
)

In [23]:
# Move the model to the GPU only when CUDA use is enabled, so this cell
# follows the notebook-wide `use_cuda` switch instead of always requiring a GPU.
if use_cuda:
    chars2chars = chars2chars.cuda()
# Adam over all model parameters.
optimizer = torch.optim.Adam(chars2chars.parameters(), lr=0.01)

In [24]:
# Train for `epochs` passes, evaluating on the validation set after each pass
# and saving a checkpoint whenever the validation loss improves.
best_val_loss = None
pad_idx = encoder.stoi['<pad>']
# The validation batches are identical every epoch, so build them once. A
# list (rather than a generator) can also be measured with len().
di_v = list(dataset_iter(val, batch_size=100, padding_idx=pad_idx))
for epoch in range(epochs):
    di_t = dataset_iter(train, batch_size=100, padding_idx=pad_idx)
    chars2chars.batch_train(optimizer, di_t, len(encoder.vocab), padding_idx=pad_idx)
    val_loss = chars2chars.predict(di_v, len(encoder.vocab), padding_idx=pad_idx)

    # Report and save under the loop variable `epoch`; the previous code used
    # an undefined `i` here.
    print("[Epoch:%d] val_loss:%5.3f | val_pp:%5.2f"
        % (epoch, val_loss, math.exp(val_loss)))

    # `is None` instead of truthiness, so a loss of exactly 0.0 is not
    # mistaken for "no best loss yet".
    if best_val_loss is None or val_loss < best_val_loss:
        print("[!] saving model...")
        os.makedirs(".save", exist_ok=True)
        torch.save(chars2chars.state_dict(), './.save/chars2chars_%d.pt' % (epoch))
        best_val_loss = val_loss
        
    



RuntimeError: CUDA out of memory. Tried to allocate 764.00 MiB (GPU 0; 5.93 GiB total capacity; 4.34 GiB already allocated; 177.31 MiB free; 587.10 MiB cached)

In [126]:
# Ask PyTorch's caching allocator to release GPU memory it holds but is not
# currently using (run after the out-of-memory failure of the training cell).
torch.cuda.empty_cache()

In [None]:

# NOTE(review): this cell uses seq2seq, EN, train_iter, val_iter, test_iter,
# epoch_num and scheduler, none of which are defined in the visible cells. It
# looks like a leftover template for the chars2chars training loop; confirm
# before running it, or remove it.
best_val_loss = None
for i in range(epoch_num):
    seq2seq.batch_train(optimizer, train_iter,len(EN.vocab), grad_clip = 2)
    val_loss = seq2seq.predict(val_iter, len(EN.vocab))
    print("[Epoch:%d] val_loss:%5.3f | val_pp:%5.2f"
          % (i, val_loss, math.exp(val_loss)))

    # Save the model if the validation loss is the best we've seen so far.
    if not best_val_loss or val_loss < best_val_loss:
        print("[!] saving model...")
        if not os.path.isdir(".save"):
            os.makedirs(".save")
        torch.save(seq2seq.state_dict(), './.save/seq2seq_%d.pt' % (i))
        best_val_loss = val_loss
    scheduler.step()
# Final evaluation on the held-out test iterator.
test_loss = seq2seq.predict(test_iter, len(EN.vocab))
print("[TEST] loss:%5.2f" % test_loss)

In [None]:
# Greedy-decode the first validation record as a smoke test.
ex = val[0]
# The model takes the language label as a length-1 tensor and the source as a
# single-column (seq_len, 1) tensor.
lang_tensor = ex[0].view((1)).cuda()
source_column = ex[1].view((-1,1)).cuda()
out = chars2chars.infer_greedy(
    lang_tensor,
    source_column,
    len(encoder.vocab),
    300,
    encoder.stoi['<s>'],
)