In [1]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from io import open
import glob
import pickle
import random
from nltk.tokenize import sent_tokenize, word_tokenize
import math

import unicodedata
import string
import bcolz

import spacy
#spacy_en = spacy.load('en')

import operator
import time

from IPython.core.debugger import Tracer

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Load and Prep Data

In [None]:
# def tokenizer(text): # create a tokenizer function
#     return [tok.text for tok in spacy_en.tokenizer(text)]

# TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True)
# LABEL = data.Field(sequential=False, use_vocab=False)

In [None]:
# TEXT.build_vocab(train, vectors="glove.6B.100d")
# train_iter, val_iter, test_iter = data.Iterator.splits(
#         (train, val, test), sort_key=lambda x: len(x.Text),
#         batch_sizes=(32, 256, 256), device=-1)
# vocab = TEXT.vocab
# self.embed = nn.Embedding(len(vocab), emb_dim)
# self.embed.weight.data.copy_(vocab.vectors)


In [4]:
def findFiles(path):
    """Return a list of the filesystem paths that match the glob pattern ``path``."""
    return list(glob.iglob(path))

def unicodeToAscii(s):
    """Fold ``s`` to a restricted ASCII alphabet.

    The string is NFD-decomposed, combining marks (category ``Mn``) are
    dropped, and every remaining character that is not an ASCII letter or one
    of `` .,;'`` is discarded.
    """
    allowed = string.ascii_letters + " .,;'"
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            # combining accent split off by the NFD decomposition
            continue
        if ch in allowed:
            kept.append(ch)
    return ''.join(kept)


def normalizeString(s):
    """Lower-case, trim and ASCII-fold ``s``, then normalise its punctuation.

    After ``unicodeToAscii`` a space is inserted in front of every ``.``, ``!``
    or ``?`` and every run of characters other than letters and ``.!?`` is
    collapsed into a single space.
    """
    cleaned = s.lower().strip()
    cleaned = unicodeToAscii(cleaned)
    # detach sentence punctuation from the preceding word
    cleaned = re.sub(r"([.!?])", r" \1", cleaned)
    return re.sub(r"[^a-zA-Z.!?]+", r" ", cleaned)

# One-hot encode a line character by character.
def lineToTensor(line):
    """Return a ``<len(line) x 1 x n_letters>`` tensor of one-hot letter vectors.

    NOTE(review): ``n_letters`` and ``letterToIndex`` are not defined in the
    visible part of this notebook -- confirm they exist before calling this.
    """
    one_hot = torch.zeros(len(line), 1, n_letters)
    for position, letter in enumerate(line):
        one_hot[position][0][letterToIndex(letter)] = 1
    return one_hot

def filterPair(p):
    """Tell whether the pair ``p`` should be kept.

    Both ``p[0]`` and ``p[1]`` must tokenise to fewer than ``MAX_LENGTH``
    tokens and ``p[1]`` must start with one of ``eng_prefixes``.

    NOTE(review): ``eng_prefixes`` is not defined in the visible part of this
    notebook -- confirm it exists before calling this.
    """
    if len(word_tokenize(p[0])) >= MAX_LENGTH:
        return False
    if len(word_tokenize(p[1])) >= MAX_LENGTH:
        return False
    return p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    """Return, in order, the elements of ``pairs`` accepted by ``filterPair``."""
    return list(filter(filterPair, pairs))


### BERT Experiments

In [35]:
# Fetch the pretrained 'bert-base-uncased' tokenizer through torch.hub from the
# huggingface/pytorch-transformers repository (downloads on first use, then cached).
tokenizer = torch.hub.load(
    'huggingface/pytorch-transformers', 
    'tokenizer', 
    'bert-base-uncased')

Using cache found in /home/ubuntu/.cache/torch/hub/huggingface_pytorch-transformers_master


HBox(children=(IntProgress(value=0, description='Downloading', max=361, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




In [46]:
# Peek at five vocabulary entries (positions 5000-5004 in the vocab's key order).
list(tokenizer.vocab.keys())[5000:5005]

['knight', 'lap', 'survey', 'ma', '##ow']

In [47]:
# Tokenise a small example: wrap the text in the [CLS] / [SEP] markers, split it
# into tokens, then map every token to its vocabulary id.
text = "this is a sentence. this is another. paradoxical"
marked_text = "[CLS] " + text + " [SEP]"
tokens = tokenizer.tokenize(marked_text)
indexed = tokenizer.convert_tokens_to_ids(tokens)

In [51]:
# Show every token next to its vocabulary id (id right-aligned, thousands-separated).
for token, token_id in zip(tokens, indexed):
    print('{:<12} {:>6,}'.format(token, token_id))

[CLS]           101
this          2,023
is            2,003
a             1,037
sentence      6,251
.             1,012
this          2,023
is            2,003
another       2,178
.             1,012
paradox      20,506
##ical        7,476
[SEP]           102


### GloVe

In [5]:
# words = []
# idx = 0
# word2idx = {}
# vectors = bcolz.carray(np.zeros(1), 
#                        rootdir=f'6B.50.dat', 
#                        mode='w')

# with open('glove.6B.50d.txt', 'rb') as f:
#     for l in f:
#         line = l.decode().split()
#         word = line[0]
#         words.append(word)
#         word2idx[word] = idx
#         idx += 1
#         vect = np.array(line[1:]).astype(np.float)
#         vectors.append(vect)
    
# vectors = bcolz.carray(
#     vectors[1:].reshape((400000, 50)), 
#     rootdir=f'6B.50.dat', mode='w')
# vectors.flush()
# pickle.dump(words, open('6B.50_words.pkl', 'wb'))
# pickle.dump(word2idx, open('6B.50_idx.pkl', 'wb'))

In [5]:
# Load the pre-processed GloVe artefacts: the vector table plus the word list
# and the word -> row-index mapping that were pickled alongside it.
vectors = bcolz.open('6B.50.dat')[:]

# SECURITY: pickle.load executes arbitrary code embedded in the file -- only
# load pickles that were produced locally / come from a trusted source.
# The context managers make sure the file handles are closed again.
with open('6B.50_words.pkl', 'rb') as words_file:
    words = pickle.load(words_file)
with open('6B.50_idx.pkl', 'rb') as idx_file:
    word2idx = pickle.load(idx_file)

# word -> embedding vector
glove = {w: vectors[word2idx[w]] for w in words}

In [6]:
# Indices fed to the decoder as start-of-sequence / appended as end-of-sequence.
# NOTE(review): Style seeds its vocabulary with the GloVe word -> index mapping,
# which (judging by its use as row index into the vector table) also uses
# indices 0 and 1, so SOS/EOS share their ids with two real words. Reserving
# dedicated rows would require shifting every index and the embedding matrix
# together -- confirm whether that is wanted.
SOS_token = 0
EOS_token = 1

class Style:
    """Vocabulary and sentence store for one style corpus.

    The vocabulary starts out as the pretrained GloVe vocabulary (module-level
    ``word2idx`` / ``words``) and grows as sentences are added.

    Attributes:
        name: identifier of the corpus (``readFile`` passes the file path).
        sentences: normalised sentences in the order they were added.
        word2index: word -> index, GloVe words first (ordered by index), then
            corpus-specific words in order of first appearance.
        word2count: word -> occurrence count (GloVe words start at 1).
        index2word: exact inverse of ``word2index``.
        n_words: next free index, i.e. the index the next new word receives.
    """

    def __init__(self, name):
        self.name = name
        self.sentences = []
        # Insertion order == index order, so iterating word2index walks the
        # embedding rows in order.
        self.word2index = { k : v for k , v in sorted(word2idx.items(), key=operator.itemgetter(1))}
        self.word2count = { word : 1 for word in words }
        # Must be the inverse of word2index; the former `i + 2` shift made
        # index2word disagree with word2index for every GloVe word.
        self.index2word = { i : word for word, i in self.word2index.items() }
        # Continue numbering right after the last pretrained index. The former
        # hard-coded 400001 skipped one index, which pushed the last new word's
        # index to len(word2index) -- out of range for an embedding of that size.
        self.n_words = max(self.index2word, default=-1) + 1

    def addSentence(self, sentence):
        """Normalise ``sentence``, store it and register each of its tokens."""
        sentence = normalizeString(sentence.strip())
        self.sentences.append(sentence)
        for word in word_tokenize(sentence):
            self.addWord(word)

    def addWord(self, word):
        """Count ``word``; assign it the next free index if it is new."""
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

def readFile(path):
    """Build a ``Style`` named after ``path`` from the lines of that file.

    Every line of the file is passed to ``Style.addSentence``.
    """
    corpus = Style(path)
    with open(path) as handle:
        for raw_line in handle:
            corpus.addSentence(raw_line)
    return corpus

In [7]:
# One Style (vocabulary + normalised sentences) per corpus file.
style0 = readFile("sas_data.0")
style1 = readFile("sas_data.1")

In [15]:
# Default `max_length` of DecoderRNN and train() (where it sizes the
# encoder_outputs buffer); also the token-count limit used by filterPair.
MAX_LENGTH = 30

In [16]:
def indexesFromSentence(style, sentence):
    """Map each token of ``sentence`` to its index in ``style.word2index``.

    Raises ``KeyError`` for a token that is not in the vocabulary.
    """
    ids = []
    for token in word_tokenize(sentence):
        ids.append(style.word2index[token])
    return ids


def tensorFromSentence(style, sentence):
    """Encode ``sentence`` as a ``(n_tokens + 1, 1)`` long tensor on ``device``.

    The token indices come from ``indexesFromSentence``; ``EOS_token`` is
    appended as the final entry.
    """
    token_ids = indexesFromSentence(style, sentence) + [EOS_token]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)



In [14]:
# Sanity check: one style0 sentence as a column of word indices ending in EOS_token.
tensorFromSentence(style0, style0.sentences[205])

tensor([[  81],
        [  54],
        [  52],
        [ 408],
        [   7],
        [2121],
        [   4],
        [1361],
        [  81],
        [ 197],
        [ 181],
        [ 236],
        [5863],
        [1006],
        [   6],
        [7314],
        [   2],
        [   1]], device='cuda:0')

In [None]:
#training_pairs = [tensorFromSentence(random.choice(pairs)) for i in range(n_iters)]

### Embeddings

In [9]:
# Build the embedding matrix so that row r holds the vector of the word whose
# vocabulary index is r. Rows are addressed by the index stored in word2index
# (not by enumeration position) and the matrix is sized by the largest index,
# so every index produced by tensorFromSentence(style0, ...) is a valid row
# pointing at the right vector.
# NOTE(review): only style0's vocabulary is covered. style1 assigns indices to
# its own corpus-specific words independently, so this matrix does not match
# style1's indices -- a merged vocabulary is needed before training on style1.
matrix_len = max(style0.word2index.values()) + 1

weight_matrix = np.zeros((matrix_len, 50))  # 50 = dimensionality of the loaded GloVe vectors
words_found = 0  # number of vocabulary words that have a GloVe vector

for word, row in style0.word2index.items():
    vector = glove.get(word)
    if vector is None:
        # word unknown to GloVe: start from a random vector
        vector = np.random.normal(scale=0.6, size=(50, ))
    else:
        words_found += 1
    weight_matrix[row] = vector
        


In [10]:
# Torch tensor wrapping the numpy embedding matrix. EncoderRNN reads
# weight_matrix directly; weight_tensor is only referenced by a
# commented-out alternative there.
weight_tensor = torch.from_numpy(weight_matrix)

### Define Models

In [11]:
# Number of rows of the embedding matrix.
matrix_len

401615

In [12]:
# Expect (matrix_len, 50).
weight_matrix.shape

(401615, 50)

In [13]:
# Must agree with weight_matrix.shape.
weight_tensor.shape

torch.Size([401615, 50])

In [22]:
class EncoderRNN(nn.Module):
    """Single-layer GRU encoder on top of frozen, pretrained word embeddings.

    The embedding table is filled from the module-level ``weight_matrix``, so
    ``(input_size, hidden_size)`` has to equal ``weight_matrix.shape``.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        pretrained = torch.from_numpy(weight_matrix)
        self.embedding.weight.data.copy_(pretrained)
        # keep the pretrained vectors fixed during training
        self.embedding.weight.requires_grad_(False)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Advance the GRU by one token; returns ``(output, hidden)``."""
        step_input = self.embedding(input).view(1, 1, -1)
        step_output, next_hidden = self.gru(step_input, hidden)
        return step_output, next_hidden

    def initHidden(self):
        """All-zero initial hidden state of shape ``(1, 1, hidden_size)``."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [25]:


class DecoderRNN(nn.Module):
    """Single-layer GRU decoder that emits log-probabilities over the vocabulary.

    ``forward`` accepts ``encoder_outputs`` for interface compatibility but does
    not use it (no attention is applied).
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one step; returns ``(log_probs, hidden)``.

        ``log_probs`` is the log-softmax (over dim 1) of the projected GRU output.
        """
        step_embedding = self.dropout(self.embedding(input).view(1, 1, -1))
        gru_in = F.relu(step_embedding[0].unsqueeze(0))
        gru_out, hidden = self.gru(gru_in, hidden)
        log_probs = F.log_softmax(self.out(gru_out[0]), dim=1)
        return log_probs, hidden

    def initHidden(self):
        """All-zero initial hidden state of shape ``(1, 1, hidden_size)``."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

### Training

In [30]:
# Probability that a training step feeds the ground-truth token (rather than
# the decoder's own prediction) as the next decoder input.
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single (input, target) sequence pair.

    Args:
        input_tensor: token indices of the source sequence; iterated along
            dim 0, one token per encoder step.
        target_tensor: token indices of the target sequence, iterated along
            dim 0. Both ``(T, 1)`` and ``(T,)`` layouts are accepted.
        encoder, decoder: the models to train.
        encoder_optimizer, decoder_optimizer: their optimisers.
        criterion: loss applied to ``(decoder_output, target_token)``.
        max_length: number of rows of the ``encoder_outputs`` buffer.

    Returns:
        The summed loss divided by the target length (a Python float).
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    # Errors raised by the encoder now propagate to the caller. They used to be
    # swallowed by a blanket `except` that opened the IPython debugger and then
    # carried on with a stale hidden state.
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        # Inputs longer than the buffer keep updating the hidden state but are
        # not recorded past max_length (avoids an IndexError on the buffer).
        if ei < max_length:
            encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(
            decoder_input, decoder_hidden, encoder_outputs)

        # Score this step against the di-th target token only. decoder_output
        # is (1, vocab), so the target must have shape (1,).
        step_target = target_tensor[di].reshape(1)
        loss += criterion(decoder_output, step_target)

        if use_teacher_forcing:
            # Teacher forcing: feed the current ground-truth token as next input
            decoder_input = target_tensor[di]
        else:
            # Without teacher forcing: use the decoder's own prediction
            _, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [19]:
def asMinutes(s):
    """Format a duration of ``s`` seconds as ``'<minutes>m <seconds>s'``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return ``'<elapsed> (- <remaining>)'`` for a job started at ``since``.

    ``percent`` is the completed fraction (must be non-zero); the remaining
    time is extrapolated linearly from the elapsed time.
    """
    elapsed = time.time() - since
    projected_total = elapsed / (percent)
    remaining = projected_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [27]:
def trainIters(encoder, decoder, n_iters, style_class, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train ``encoder``/``decoder`` for ``n_iters`` steps on ``style_class``.

    Step ``k`` uses sentence ``k - 1`` of ``style_class.sentences``; when
    ``n_iters`` exceeds the corpus size the sentences are cycled through again
    instead of raising ``IndexError``. Each sentence is converted to a tensor
    when its step runs, rather than materialising ``n_iters`` tensors on the
    device before training starts.

    Args:
        encoder, decoder: models to optimise (each gets its own Adam optimiser).
        n_iters: number of training steps.
        style_class: ``Style`` providing the sentences and the vocabulary.
        print_every: print the running average loss every this many steps.
        plot_every: record the running average loss every this many steps.
        learning_rate: learning rate of both optimisers.

    Raises:
        ValueError: if ``n_iters`` is positive but there are no sentences.
    """
    sentences = style_class.sentences
    if n_iters > 0 and not sentences:
        raise ValueError("style_class has no sentences to train on")

    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for step in range(1, n_iters + 1):
        sentence = sentences[(step - 1) % len(sentences)]
        training_sent = tensorFromSentence(style_class, sentence)
        # NOTE(review): training_sent is a single (length, 1) sentence tensor,
        # not an (input, target) pair, so [0] / [1] select its first and second
        # token (and [1] raises IndexError for a sentence that is only EOS).
        # If the whole sentence is meant to be both input and target, pass
        # training_sent for both -- train() must then score one target token
        # per decoding step. TODO confirm the intended objective.
        input_tensor = training_sent[0]
        target_tensor = training_sent[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, step / n_iters),
                                         step, step / n_iters * 100, print_loss_avg))

        if step % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    # NOTE(review): showPlot is not defined in the visible part of this
    # notebook -- confirm it exists, otherwise this raises NameError only
    # after the whole training run has finished.
    showPlot(plot_losses)

In [31]:
# hidden_size = 256
# hidden_size has to equal the GloVe dimensionality (50): EncoderRNN copies
# weight_matrix into an nn.Embedding of shape (input_size, hidden_size).
hidden_size = 50
encoder1 = EncoderRNN(matrix_len, hidden_size).to(device)
# NOTE(review): despite its name, attn_decoder1 is a plain DecoderRNN without attention.
attn_decoder1 = DecoderRNN(hidden_size, matrix_len, dropout_p=0.1).to(device)

# NOTE(review): matrix_len / weight_matrix are derived from style0.word2index,
# but training runs on style1, whose corpus-specific words were indexed
# independently -- verify that every style1 index is a valid, matching row.
trainIters(encoder1, attn_decoder1, len(style1.sentences)-1, style1, print_every=100)

0m 5s (- 6m 5s) (100 1%) 9.2465
0m 8s (- 4m 57s) (200 2%) 6.8242
0m 11s (- 4m 32s) (300 4%) 5.8105


KeyboardInterrupt: 

### Evaluation

In [165]:
# NOTE(review): leftover cell -- `input_lang` and `pair` are not defined in the
# visible notebook (hence the NameError below). tensorFromSentence expects a
# Style and a sentence string, e.g. (style1, style1.sentences[0]).
tensorFromSentence(input_lang, pair[0])

NameError: name 'pair' is not defined