In [1]:
from getInputs import WordIndex
from formatScript import makeDialogue
import numpy as np

In [2]:
import os
script_dir = './scripts/'

In [3]:
script_list = [script_dir + name for name in os.listdir(script_dir)]

In [4]:
dialogues = []
unread = []
for script in script_list:
    try:
        with open(script) as f:
            dialogues.extend(makeDialogue(f.read()))
    except:
        unread.append(script)

In [5]:
len(dialogues)

201250

In [6]:
len(unread)

33

In [7]:
len(script_list)

342

In [8]:
max([len(dialogue[0]) for dialogue in dialogues])

1180

In [9]:
#let's filter longer lines for now and also those weirdly short ones that went under the radar
def lenBetween(lines, minimum, maximum):
    len_first = len(lines[0])
    len_second = len(lines[1])
    return (len_first >= minimum) and (len_first <= maximum) and (len_second >= minimum) and (len_second <= maximum)

dialogues = [dialogue for dialogue in dialogues if lenBetween(dialogue, 3, 15)]

In [10]:
len(dialogues)

107259

In [11]:
#need to export this too
def indexEmbed(dialogues):
    language = WordIndex()

    for dialogue in dialogues:
        for line in dialogue:
            language.fillCounts(line)

    return language.getIndicies()

In [12]:
language = indexEmbed(dialogues)

In [420]:
#maybe refactor this there has to be a simple way to reference random points off list even though it's tuples
def chooseDialogues(dialogues, batch_size):
    lines = len(dialogues)
    chosen = [0 for i in range(batch_size)]
    for i in range(batch_size):
        chosen[i] = dialogues[np.random.randint(0, lines)]
    return chosen

# Building an embedding from scratch

In [358]:
import torch
from torch import nn
from torch.optim import Adam
from itertools import zip_longest

In [14]:
def padLines(lines, token = 0):
    return list(zip_longest(*lines, fillvalue=token))

def prepareBatch(dialogues, token = 0):
    inputs = [[language[word] for word in dialogue[0]] for dialogue in dialogues]
    inputs.sort(key=lambda x: len(x), reverse=True)
    lengths = torch.tensor([len(sentence) for sentence in inputs])
    inputs = padLines(inputs)
    inputs = torch.tensor(inputs)
    outputs = [[language[word] for word in dialogue[1]] for dialogue in dialogues]
    outputs = padLines(outputs)
    outputs = torch.tensor(outputs)
    mask = [[bool(word != token) for word in row] for row in outputs]
    mask = torch.ByteTensor(mask)
    return inputs, lengths, outputs, mask

In [15]:
list(padLines([dialogue[0] for dialogue in dialogues[:5]]))

[('BOS', 'BOS', 'BOS', 'BOS', 'BOS'),
 ('could', 'is', 'diego', 'diego', 'diego'),
 ('it', 'that', 'EOS', 'arent', 'take'),
 ('be', 'the', 0, 'you', 'fernando'),
 ('so', 'man', 0, 'going', 'to'),
 ('EOS', 'i', 0, 'to', 'the'),
 (0, 'knew', 0, 'kiss', 'dining'),
 (0, 'treasurer', 0, 'your', 'hall'),
 (0, 'sanchez', 0, 'brother', 'he'),
 (0, 'EOS', 0, 'EOS', 'must'),
 (0, 0, 0, 0, 'be'),
 (0, 0, 0, 0, 'hungry'),
 (0, 0, 0, 0, 'EOS')]

Each tuple should be a step across every single line in the batch, so zip_longest fits the bill

In [16]:
prepareBatch(dialogues[:5])

(tensor([[ 1,  1,  1,  1,  1],
         [18, 10, 18,  3, 18],
         [25, 11, 19,  4,  2],
         [26, 12, 20,  5,  0],
         [22, 13, 21,  6,  0],
         [12, 14, 22,  2,  0],
         [27, 15, 23,  0,  0],
         [28, 16,  8,  0,  0],
         [29, 17, 24,  0,  0],
         [30,  2,  2,  0,  0],
         [ 5,  0,  0,  0,  0],
         [31,  0,  0,  0,  0],
         [ 2,  0,  0,  0,  0]]),
 tensor([13, 10, 10,  6,  3]),
 tensor([[ 1,  1,  1,  1,  1],
         [ 7,  7, 18, 18,  7],
         [ 8,  8, 19, 25, 32],
         [ 9,  9, 20, 26,  2],
         [ 2,  2, 21, 22,  0],
         [ 0,  0, 22, 12,  0],
         [ 0,  0, 23, 27,  0],
         [ 0,  0,  8, 28,  0],
         [ 0,  0, 24, 29,  0],
         [ 0,  0,  2, 30,  0],
         [ 0,  0,  0,  5,  0],
         [ 0,  0,  0, 31,  0],
         [ 0,  0,  0,  2,  0]]),
 tensor([[1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1],
         [1, 1, 1, 1, 0],
         [0, 0, 1, 1, 0],
   

In [357]:
class Encoder(nn.Module):
    #as always, need to define forward pass of nn's in torch
    def __init__(self, layers, nodes, embedding, dropout=0):
        super(Encoder, self).__init__()
        self.layers = layers
        self.nodes = nodes
        self.embedding = embedding
        self.gru = nn.GRU(nodes, nodes, layers, dropout=dropout, bidirectional=True)
        
    def forward(self, inputs, lengths, h_in=None):
        embedded = self.embedding(inputs)
        transformed = nn.utils.rnn.pack_padded_sequence(embedded, lengths)
        output, h_out = self.gru(transformed, h_in)
        output, _ = nn.utils.rnn.pad_packed_sequence(output)
        output = output[:,:,:self.nodes] + output[:,:,self.nodes:]
        return output, h_out
        

In [319]:
#defined for a single step
class Decoder(nn.Module):
    def __init__(self, layers, nodes, out_size, embedding, dropout=0):
        super(Decoder, self).__init__()
        self.layers = layers
        self.nodes = nodes
        self.embedding = embedding
        self.gru = nn.GRU(nodes, nodes, layers, dropout=dropout)
        self.out = nn.Linear(nodes, out_size)
        
    def forward(self, inputs, h_in):
        embedded = self.embedding(inputs)
        output, h_out = self.gru(embedded, h_in)
        output = self.out(output)
        output = torch.nn.functional.softmax(output, dim = 2)
        return output, h_out
        

In [415]:
nodes = 100
layers = 2
batch_size = 5
num_words = len(language)
embedding = nn.Embedding(num_words, nodes)
encoder = Encoder(layers, nodes, embedding)
decoder = Decoder(layers, nodes, num_words, embedding)

In [416]:
learn_rate = 0.0001
e_optimizer = Adam(encoder.parameters(), lr=learn_rate)
d_optimizer = Adam(decoder.parameters(), lr=10*learn_rate)

In [321]:
in_batch, in_lens, out_batch, out_mask = prepareBatch(dialogues[:batch_size])
enc_outs, enc_hidden = encoder.forward(in_batch, in_lens)

In [322]:
dec_ins = torch.ones(1, batch_size, dtype=torch.long)

In [323]:
dec_ins

tensor([[1, 1, 1, 1, 1]])

In [324]:
dec_outs, dec_hidden = decoder.forward(dec_ins, enc_hidden[:layers])

In [330]:
a = dec_outs[0]

In [338]:
a.shape

torch.Size([5, 28617])

In [337]:
b.view(-1,1).shape

torch.Size([5, 1])

In [339]:
b = out_batch[1].view(-1,1)

In [341]:
c = torch.gather(a, 1, b)

In [342]:
c

tensor([[0.0000],
        [0.0000],
        [0.0000],
        [0.0000],
        [0.0000]], grad_fn=<GatherBackward>)

In [344]:
torch.log(c).mean()

tensor(-10.2772, grad_fn=<MeanBackward1>)

In [283]:
out_batch.shape

torch.Size([13, 5])

In [419]:
#also returns total number of items loss is computed over, used to find average loss
def maskedLoss(result, target, mask):
    n_terms = mask.sum().item()
    probs = torch.gather(result, 1, target.view(-1,1))#gets probabilities for classes it should've predicted
    loss = -torch.log(probs).masked_select(mask).mean()
    return loss

In [406]:
def trainIter(encoder, decoder, e_optimizer, d_optimizer, dialogues, grad_max=100, print_now=False):
    e_optimizer.zero_grad()
    d_optimizer.zero_grad()
    loss = 0
    in_batch, in_lens, out_batch, out_mask = prepareBatch(dialogues)
    enc_outs, enc_hidden = encoder.forward(in_batch, in_lens)
    dec_ins = torch.ones(1, batch_size, dtype=torch.long)
    dec_h_ins = enc_hidden[:layers]
    for target, mask in zip(out_batch[1:,:], out_mask[1:, :]):
        dec_outs, dec_hidden = decoder.forward(dec_ins, dec_h_ins)
        loss += maskedLoss(dec_outs[0], target, mask)
    loss.backward()
    nn.utils.clip_grad_norm_(encoder.parameters(), grad_max)
    nn.utils.clip_grad_norm_(decoder.parameters(), grad_max)
    e_optimizer.step()
    d_optimizer.step()
    if print_now:
        print(loss)

In [260]:
#this one actually goes through an entire batch to decode a sentence
#grabs most likely
class GreedySearch(nn.Module):
    def __init__(self, encoder, decoder):
        super(GreedySearch, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        
    def forward(self, batch):
        in_batch, in_lens, out_batch, out_mask = prepareBatch(batch)
        enc_outs, enc_hidden = self.encoder(in_batch, in_lens)
        dec_ins = torch.ones(1, in_batch.shape[1], dtype=torch.long)
        dec_hidden = enc_hidden[:self.encoder.layers]
        words = torch.ones(out_batch.shape[1]).long().unsqueeze(0)
        scores = torch.ones(out_batch.shape[1]).unsqueeze(0)
        for _ in range(in_batch.shape[0]-1):
            dec_outs, dec_hidden = self.decoder.forward(dec_ins, dec_hidden[:self.decoder.layers])
            score, dec_ins = torch.max(dec_outs, dim=2)
            words = torch.cat((words, dec_ins))
            scores = torch.cat((scores,score))
        return words, scores, out_mask

In [423]:
for i in range(100):
    chosen = chooseDialogues(dialogues, batch_size)
    trainIter(encoder, decoder, e_optimizer, d_optimizer, chosen, print_now=(i%10==0))

tensor(74.0871, grad_fn=<ThAddBackward>)
tensor(80.9185, grad_fn=<ThAddBackward>)
tensor(58.6604, grad_fn=<ThAddBackward>)
tensor(56.4268, grad_fn=<ThAddBackward>)
tensor(63.4824, grad_fn=<ThAddBackward>)
tensor(68.6804, grad_fn=<ThAddBackward>)
tensor(72.5847, grad_fn=<ThAddBackward>)
tensor(60.0998, grad_fn=<ThAddBackward>)
tensor(57.7197, grad_fn=<ThAddBackward>)
tensor(64.1294, grad_fn=<ThAddBackward>)
