In [1]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from io import open
import glob

import unicodedata
import string

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
# Compute device shared by the tensors and models below: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Load and Prep Data

In [13]:
def findFiles(path):
    """Return the list of filesystem paths that match the glob pattern ``path``."""
    matches = glob.glob(path)
    return matches

def unicodeToAscii(s):
    """Reduce ``s`` to a restricted plain-ASCII string.

    The text is decomposed with Unicode NFD normalization, combining marks
    (category ``Mn``) are dropped, and every remaining character that is not
    an ASCII letter, a space, or one of ``. , ; '`` is discarded.
    """
    allowed = string.ascii_letters + " .,;'"
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            # Combining accent split off by the NFD decomposition.
            continue
        if ch in allowed:
            kept.append(ch)
    return ''.join(kept)


def normalizeString(s):
    """Lowercase and clean a sentence so it can be split on single spaces.

    Steps: lowercase and trim, remove accents, put a space in front of each
    ``.``, ``!`` or ``?`` so punctuation becomes its own token, collapse every
    other run of non-letter characters into one space, and trim the result.

    Args:
        s: raw sentence text.

    Returns:
        The normalized sentence, with no leading or trailing whitespace.
    """
    s = s.lower().strip()
    # Accents are folded inline instead of through unicodeToAscii: that
    # helper's character whitelist discards '!' and '?' outright, so the
    # punctuation handling below would never see them.
    s = ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    # The substitutions can leave a space at either end (e.g. a trailing
    # number becomes " "), which would turn into an empty-string token when
    # the sentence is later split on ' '.
    return s.strip()

def lineToTensor(line):
    """Encode ``line`` as a <line_length x 1 x n_letters> tensor of one-hot letter vectors.

    Relies on ``n_letters`` and ``letterToIndex`` being defined at module
    level; neither is defined in the visible part of this notebook.
    """
    one_hot = torch.zeros(len(line), 1, n_letters)
    for position, letter in enumerate(line):
        one_hot[position, 0, letterToIndex(letter)] = 1
    return one_hot

def filterPair(p):
    """Tell whether the sentence pair ``p`` should be kept.

    A pair passes when both sentences have fewer than ``MAX_LENGTH``
    space-separated words and the second one starts with one of
    ``eng_prefixes``. Both names are read from module scope.
    """
    if not len(p[0].split(' ')) < MAX_LENGTH:
        return False
    if not len(p[1].split(' ')) < MAX_LENGTH:
        return False
    return p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    """Return the pairs accepted by ``filterPair``, in their original order."""
    kept = []
    for candidate in pairs:
        if filterPair(candidate):
            kept.append(candidate)
    return kept


### BERT Experiments

In [35]:
# Load the pretrained 'bert-base-uncased' tokenizer via torch.hub from the
# huggingface/pytorch-transformers repository (needs network access unless cached).
tokenizer = torch.hub.load(
    'huggingface/pytorch-transformers', 
    'tokenizer', 
    'bert-base-uncased')

Using cache found in /home/ubuntu/.cache/torch/hub/huggingface_pytorch-transformers_master


HBox(children=(IntProgress(value=0, description='Downloading', max=361, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




In [46]:
# Peek at five entries of the tokenizer vocabulary (positions 5000-5004).
list(tokenizer.vocab.keys())[5000:5005]

['knight', 'lap', 'survey', 'ma', '##ow']

In [47]:
# Tokenize a sample text wrapped in [CLS] / [SEP] markers, then map every
# resulting token to its vocabulary id.
text = "this is a sentence. this is another. paradoxical"
marked_text = "[CLS] " + text + " [SEP]"
tokens = tokenizer.tokenize(marked_text)
indexed = tokenizer.convert_tokens_to_ids(tokens)

In [51]:
# Show each token next to its vocabulary id in two aligned columns.
for token, token_id in zip(tokens, indexed):
    print('{:<12} {:>6,}'.format(token, token_id))

[CLS]           101
this          2,023
is            2,003
a             1,037
sentence      6,251
.             1,012
this          2,023
is            2,003
another       2,178
.             1,012
paradox      20,506
##ical        7,476
[SEP]           102


In [19]:
# Reserved vocabulary ids for the start-of-sentence and end-of-sentence markers.
# Style.index2word pre-populates exactly these two slots.
SOS_token = 0
EOS_token = 1


class Style:
    """Vocabulary and sentence store for one text style.

    Attributes:
        name: label supplied at construction.
        word2index: maps a word to its integer id.
        word2count: maps a word to how many times it has been added.
        index2word: maps an integer id back to its word; ids 0 and 1 are
            pre-filled with "SOS" and "EOS".
        n_words: number of vocabulary entries, the two reserved ones included.
        sentences: normalized sentences, in insertion order.
    """

    def __init__(self, name):
        """Create an empty vocabulary labelled ``name``."""
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        # Ids 0 and 1 are already taken by SOS and EOS.
        self.n_words = 2
        self.sentences = []

    def addSentence(self, sentence):
        """Normalize ``sentence``, store it, and register each of its words."""
        cleaned = normalizeString(sentence.strip())
        self.sentences.append(cleaned)
        for token in cleaned.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count one occurrence of ``word``, assigning the next free id if it is new."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

def readFile(path):
    """Build a Style from a text file, treating every line as one sentence.

    Args:
        path: location of the text file; it is also used as the Style's name.

    Returns:
        A Style containing each line of the file (normalized by
        ``Style.addSentence``) and the vocabulary collected from those lines.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    style = Style(path)

    # Decode explicitly as UTF-8 so accented text is read identically on every
    # platform instead of depending on the locale's default encoding.
    with open(path, encoding='utf-8') as f:
        for line in f:
            style.addSentence(line)

    return style

In [23]:
# Load the two corpora, one Style (sentences + vocabulary) per file.
style0 = readFile("sas_data.0")
style1 = readFile("sas_data.1")

In [None]:


# FIXME(review): this cell looks like the pasted tail of a `prepareData`
# function whose `def` line is missing. As written, the top-level `return`
# below is a SyntaxError, and `pairs`, `input_lang`, `output_lang`,
# `prepareData` and `random` are not defined or imported in the visible part
# of this notebook. Restore the full function or remove the cell.
#pairs = filterPairs(pairs)
for pair in pairs:
    input_lang.addSentence(pair[0])
    output_lang.addSentence(pair[1])
print("Counted words:")
print(input_lang.name, input_lang.n_words)
print(output_lang.name, output_lang.n_words)
return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
print(random.choice(pairs))

In [26]:
def indexesFromSentence(style, sentence):
    """Translate the space-separated words of ``sentence`` into vocabulary ids.

    Raises ``KeyError`` when a word is missing from ``style.word2index``.
    """
    indexes = []
    for word in sentence.split(' '):
        indexes.append(style.word2index[word])
    return indexes


def tensorFromSentence(style, sentence):
    """Encode ``sentence`` as an (n_words + 1, 1) long tensor of vocabulary ids.

    The ids come from ``indexesFromSentence``; ``EOS_token`` is appended as the
    final entry and the tensor is created on the module-level ``device``.
    """
    token_ids = indexesFromSentence(style, sentence) + [EOS_token]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)


In [31]:
# Inspect one normalized sentence from the first corpus.
style0.sentences[205]

'you would also need a scale to tell you how much each sample range in grams .'

In [30]:
# Encode that sentence as a column tensor of word ids ending with EOS_token.
tensorFromSentence(style0, style0.sentences[205])

tensor([[ 56],
        [ 20],
        [ 77],
        [ 21],
        [ 41],
        [286],
        [ 23],
        [159],
        [ 56],
        [ 71],
        [ 72],
        [ 67],
        [ 91],
        [461],
        [ 12],
        [454],
        [ 14],
        [  1]], device='cuda:0')

In [None]:
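# TODO: stale leftover. tensorFromSentence takes (style, sentence), and `random`,
# `pairs` and `n_iters` are not defined in the visible cells of this notebook.
#training_pairs = [tensorFromSentence(random.choice(pairs)) for i in range(n_iters)]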

### Embeddings

Using cache found in /home/ubuntu/.cache/torch/hub/huggingface_pytorch-transformers_master


HBox(children=(IntProgress(value=0, description='Downloading', max=361, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=213450, style=ProgressStyle(description_wid…




### Define Models

In [None]:
class EncoderRNN(nn.Module):
    """GRU encoder that embeds one token index per call and advances the hidden state."""

    def __init__(self, input_size, hidden_size):
        """
        Args:
            input_size: number of rows in the embedding table (vocabulary size).
            hidden_size: width of both the embedding vectors and the GRU state.
        """
        super().__init__()
        self.hidden_size = hidden_size

        # The embedding width equals the GRU width, so embeddings feed the GRU directly.
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Run a single encoder step.

        The embedding of ``input`` is reshaped to (1, 1, -1) before the GRU, so
        one call handles one token with batch size 1.

        Returns:
            The ``(output, hidden)`` pair produced by the GRU.
        """
        step_input = self.embedding(input).view(1, 1, -1)
        gru_output, next_hidden = self.gru(step_input, hidden)
        return gru_output, next_hidden

    def initHidden(self):
        """Return an all-zero (1, 1, hidden_size) hidden state on the module-level ``device``."""
        shape = (1, 1, self.hidden_size)
        return torch.zeros(shape, device=device)

In [None]:
class AttnDecoderRNN(nn.Module):
    """GRU decoder that attends over a fixed-size window of encoder outputs.

    Each call decodes one token (batch size 1). Attention weights are computed
    from the current input embedding and hidden state, and span ``max_length``
    encoder positions.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        """
        Args:
            hidden_size: width of the embeddings and of the GRU state.
            output_size: number of output classes (target vocabulary size).
            dropout_p: dropout probability applied to the input embedding.
            max_length: number of encoder positions the attention layer scores.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(output_size, hidden_size)
        # Both linear layers take a concatenation of two hidden_size-wide vectors.
        self.attn = nn.Linear(2 * hidden_size, max_length)
        self.attn_combine = nn.Linear(2 * hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one step.

        Args:
            input: index of the previous output token.
            hidden: current GRU hidden state.
            encoder_outputs: encoder outputs to attend over; the matrix product
                below expects one row per attention position, i.e.
                (max_length, hidden_size).

        Returns:
            ``(log_probs, hidden, attn_weights)``: log-softmax scores over the
            output classes, the updated hidden state, and the attention weights.
        """
        embedded = self.dropout(self.embedding(input).view(1, 1, -1))

        # Score every encoder position from the input embedding and hidden state.
        attn_features = torch.cat((embedded[0], hidden[0]), 1)
        attn_weights = F.softmax(self.attn(attn_features), dim=1)

        # Attention-weighted sum of the encoder outputs.
        context = torch.bmm(attn_weights.unsqueeze(0),
                            encoder_outputs.unsqueeze(0))

        # Merge embedding and context back down to hidden_size for the GRU.
        combined = torch.cat((embedded[0], context[0]), 1)
        gru_input = F.relu(self.attn_combine(combined).unsqueeze(0))
        gru_output, hidden = self.gru(gru_input, hidden)

        log_probs = F.log_softmax(self.out(gru_output[0]), dim=1)
        return log_probs, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero (1, 1, hidden_size) hidden state on the module-level ``device``."""
        shape = (1, 1, self.hidden_size)
        return torch.zeros(shape, device=device)

### Training

In [None]:
# Build the encoder / attention-decoder pair and launch training.
# NOTE(review): `input_lang`, `output_lang` and `trainIters` are not defined in
# the visible cells of this notebook. Presumably the vocabularies should come
# from `style0` / `style1`; confirm before running.
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 75000, print_every=5000)

### Evaluation