In [74]:
from Vocabulary import vocabulary
from preprocessing import normalizeString, unicodetoascii
import itertools
import torch
import random
from model import *
from NLLLoss import maskNLLLoss

In [3]:
PAD = 0  # padding index: binarymatrix() writes 0 in the mask wherever a token equals this
SOS = 1  # presumably the start-of-sentence index (the decoder demo seeds its first input with 1) - TODO confirm
EOS = 2  # end-of-sentence index: indexfromSentance() appends it to every encoded sentence

In [4]:
datafile = "data/formatted_movie_lines.txt"

In [5]:
# Read the corpus inside a context manager so the file handle is closed as soon
# as the text is in memory instead of being left open for the kernel's lifetime.
with open(datafile, encoding="utf-8") as corpus_file:
    lines = corpus_file.read().strip().split("\n\n")
# Each record is tab-separated; every field is passed through normalizeString.
pairs = [[normalizeString(s) for s in pair.split("\t")] for pair in lines]

In [6]:
len(pairs)

221282

In [7]:
corpus = vocabulary("Cornell Movie Dialogues")

In [8]:
# A bit more cleaning: remove any sentence pairs that are too long.
def filterpair(p, max_length=10):
    """Tell whether both sentences of a pair are short enough to keep.

    p: indexable pair of strings; p[0] and p[1] are tokenised with str.split().
    max_length: largest token count allowed for either sentence (inclusive).

    Returns True only when neither sentence has more than max_length tokens.
    """
    # Reject on the first sentence before the second one is even looked at.
    if len(p[0].split()) > max_length:
        return False
    return len(p[1].split()) <= max_length

pairs = [pair for pair in pairs if filterpair(pair)]

In [9]:
len(pairs)

75026

In [10]:
pairs[:10]

[['that s because it s such a nice one .', 'forget french .'],
 ['there .', 'where ?'],
 ['you have my word . as a gentleman', 'you re sweet .'],
 ['hi .', 'looks like things worked out tonight huh ?'],
 ['you know chastity ?', 'i believe we share an art instructor'],
 ['have fun tonight ?', 'tons'],
 ['well no . . .', 'then that s all you had to say .'],
 ['then that s all you had to say .', 'but'],
 ['but', 'you always been this selfish ?'],
 ['do you listen to this crap ?', 'what crap ?']]

In [11]:
def trimRareWords(vocab, pairs, min_count = 3):
    """Keep only the pairs whose every word is present in the trimmed vocabulary.

    vocab: object exposing trim(min_count=...) and a word2index mapping.
    pairs: sequence of pairs; pair[0] is the input sentence, pair[1] the reply.
    min_count: forwarded unchanged to vocab.trim().

    Returns a new list holding the surviving pairs (same objects, same order)
    and prints how many were kept.
    """
    # trim() runs before any membership check below.
    vocab.trim(min_count=min_count)

    def _all_known(sentence):
        # Split on single spaces so the tokens are exactly the ones looked up.
        return all(token in vocab.word2index for token in sentence.split(" "))

    surviving = []
    for pair in pairs:
        source_text, reply_text = pair[0], pair[1]
        if _all_known(source_text) and _all_known(reply_text):
            surviving.append(pair)

    print(f"After trimming kept {len(surviving)} out of {len(pairs)}")

    return surviving

In [12]:
# Feed both sides of every pair (input first, then reply) to the vocabulary.
for pair in pairs:
    for position in (0, 1):
        corpus.addSentance(pair[position])

print(corpus.num_words)

20093


In [13]:
cleaned_pairs = trimRareWords(corpus, pairs)

After trimming kept 62810 out of 75026


In [14]:
cleaned_pairs[:10]

[['that s because it s such a nice one .', 'forget french .'],
 ['there .', 'where ?'],
 ['you have my word . as a gentleman', 'you re sweet .'],
 ['hi .', 'looks like things worked out tonight huh ?'],
 ['have fun tonight ?', 'tons'],
 ['well no . . .', 'then that s all you had to say .'],
 ['then that s all you had to say .', 'but'],
 ['but', 'you always been this selfish ?'],
 ['do you listen to this crap ?', 'what crap ?'],
 ['what good stuff ?', 'the real you .']]

In [27]:
def indexfromSentance(vocab:vocabulary, sentance:str):
    """Encode a sentence as a list of vocabulary indices terminated by EOS.

    vocab: vocabulary whose word2index maps each word to its index.
    sentance: text to encode; it is split on single spaces.

    Returns the list of indices followed by the EOS index. A word missing from
    vocab.word2index raises KeyError.
    """
    encoded = []
    for token in sentance.split(" "):
        encoded.append(vocab.word2index[token])
    encoded.append(EOS)
    return encoded

In [15]:
corpus.indexfromSentance(cleaned_pairs[1][0])

[14, 11, 2]

In [31]:
# Encode the input side of the first ten cleaned pairs as index lists.
inputs = [indexfromSentance(corpus, pair[0]) for pair in cleaned_pairs[:10]]

In [16]:
def zeropading(l, fillvalue = 0):
    """Transpose a batch of sequences, padding the short ones with fillvalue.

    l: iterable of sequences (one per batch item).
    fillvalue: value inserted where a sequence has run out of elements.

    Returns a list of tuples; tuple k holds element k of every sequence, so the
    result has as many tuples as the longest sequence has elements.
    """
    columns = itertools.zip_longest(*l, fillvalue=fillvalue)
    return [step for step in columns]

In [17]:
def binarymatrix(l, value=0):
    """Build a 0/1 mask marking which entries of a padded batch are real tokens.

    l: iterable of sequences, e.g. the tuples returned by zeropading().
    value: the padding index; entries equal to it are masked out. The default
        of 0 matches the notebook's PAD constant, so existing calls that omit
        it behave as before. Previously this argument was accepted but ignored.

    Returns a list of lists with the same layout as l: 0 where the entry equals
    value, 1 everywhere else.
    """
    return [[0 if token == value else 1 for token in seq] for seq in l]

In [35]:
l = zeropading(inputs)

In [37]:
inputs

[[3, 4, 5, 6, 4, 7, 8, 9, 10, 11, 2],
 [14, 11, 2],
 [17, 18, 19, 20, 11, 21, 8, 22, 2],
 [25, 11, 2],
 [18, 40, 31, 16, 2],
 [42, 43, 11, 11, 11, 2],
 [44, 3, 4, 45, 17, 46, 47, 48, 11, 2],
 [49, 2],
 [54, 17, 55, 47, 52, 56, 16, 2],
 [57, 58, 59, 16, 2]]

In [36]:
l

[(3, 14, 17, 25, 18, 42, 44, 49, 54, 57),
 (4, 11, 18, 11, 40, 43, 3, 2, 17, 58),
 (5, 2, 19, 2, 31, 11, 4, 0, 55, 59),
 (6, 0, 20, 0, 16, 11, 45, 0, 47, 16),
 (4, 0, 11, 0, 2, 11, 17, 0, 52, 2),
 (7, 0, 21, 0, 0, 2, 46, 0, 56, 0),
 (8, 0, 8, 0, 0, 0, 47, 0, 16, 0),
 (9, 0, 22, 0, 0, 0, 48, 0, 2, 0),
 (10, 0, 2, 0, 0, 0, 11, 0, 0, 0),
 (11, 0, 0, 0, 0, 0, 2, 0, 0, 0),
 (2, 0, 0, 0, 0, 0, 0, 0, 0, 0)]

In [38]:
binary = binarymatrix(l)

In [39]:
binary

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 0, 1, 1],
 [1, 0, 1, 0, 1, 1, 1, 0, 1, 1],
 [1, 0, 1, 0, 1, 1, 1, 0, 1, 1],
 [1, 0, 1, 0, 0, 1, 1, 0, 1, 0],
 [1, 0, 1, 0, 0, 0, 1, 0, 1, 0],
 [1, 0, 1, 0, 0, 0, 1, 0, 1, 0],
 [1, 0, 1, 0, 0, 0, 1, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 1, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

In [21]:
# Returns the padded input sequence tensor as well as a tensor holding the length of each sequence in the batch
def inputVar(l:list, vocab:vocabulary):
    """Encode a batch of input sentences into a padded index tensor.

    l: list of sentences.
    vocab: vocabulary whose indexfromSentance() encodes one sentence.

    Returns (padvar, lengths): padvar is a LongTensor built from the
    zeropading() output, so each row is one time step across the batch;
    lengths is a tensor with the encoded length of every sentence.
    """
    encoded = []
    seq_lengths = []
    for sentance in l:
        ids = vocab.indexfromSentance(sentance)
        encoded.append(ids)
        seq_lengths.append(len(ids))
    lengths = torch.tensor(seq_lengths)
    padvar = torch.LongTensor(zeropading(encoded))
    return padvar, lengths

In [22]:
# Returns the padded target sequence tensor, the padding mask and the max target length
def outputVar(l:list, vocab:vocabulary):
    """Encode a batch of target sentences into a padded tensor plus its mask.

    l: list of sentences.
    vocab: vocabulary whose indexfromSentance() encodes one sentence.

    Returns (padvar, mask, max_target_len): padvar is a LongTensor built from
    the zeropading() output, mask is a ByteTensor from binarymatrix() with the
    same layout, and max_target_len is the longest encoded length in the batch.
    """
    encoded = [vocab.indexfromSentance(sentance) for sentance in l]
    max_target_len = max(map(len, encoded))
    padded = zeropading(encoded)
    padvar = torch.LongTensor(padded)
    mask = torch.ByteTensor(binarymatrix(padded))
    return padvar, mask, max_target_len

In [23]:
#Prepares the data for training for a given batch of pairs
def batch2traindata(vocab, pair_batch):
    """Turn a batch of (input, reply) sentence pairs into training tensors.

    vocab: vocabulary passed through to inputVar() and outputVar().
    pair_batch: iterable of pairs; pair[0] is the input, pair[1] the reply.
        It is not modified.

    Returns (inp, lengths, output, mask, max_target_len) as produced by
    inputVar() for the inputs and outputVar() for the replies.
    """
    # Order by input length, longest first (presumably so the encoder can pack
    # the padded batch - TODO confirm against the Encoder). sorted() builds a
    # new list, so the caller's batch keeps its original order.
    ordered = sorted(pair_batch, key=lambda pair: len(pair[0].split(" ")), reverse=True)
    input_batch = [pair[0] for pair in ordered]
    output_batch = [pair[1] for pair in ordered]
    inp, lengths = inputVar(input_batch, vocab)
    output, mask, max_target_len = outputVar(output_batch, vocab)
    return inp, lengths, output, mask, max_target_len

In [47]:
#Validation of preprocessing steps
batch_size = 5
input_seq, lengths, target_seq, target_mask, max_target_length = batch2traindata(corpus, [random.choice(cleaned_pairs) for _ in range(batch_size)])

In [48]:
print(input_seq)

tensor([[ 153,  113,   34,  550, 1483],
        [  34,   34,  108, 6394,   16],
        [ 101,   67,  285,   16,    2],
        [ 102,  882,    6,    2,    0],
        [ 307, 1114,  158,    0,    0],
        [  82,  225,   11,    0,    0],
        [  60,   16,    2,    0,    0],
        [ 246,    2,    0,    0,    0],
        [  11,    0,    0,    0,    0],
        [   2,    0,    0,    0,    0]])


In [49]:
print(lengths)

tensor([10,  8,  7,  4,  3])


In [50]:
print(target_seq)

tensor([[ 860,  344,  327,   66,  266],
        [8058, 1184, 1544, 3183,   32],
        [ 640,   11, 2398,   67,   11],
        [  11,   17,   27, 6263,  143],
        [   2,  543,   93,   73,   10],
        [   0,  183,   11,    2,   16],
        [   0,  522,    2,    0,    2],
        [   0,   47,    0,    0,    0],
        [   0, 1704,    0,    0,    0],
        [   0,   11,    0,    0,    0],
        [   0,    2,    0,    0,    0]])


In [51]:
print(target_mask)

tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [0, 1, 1, 1, 1],
        [0, 1, 1, 0, 1],
        [0, 1, 0, 0, 0],
        [0, 1, 0, 0, 0],
        [0, 1, 0, 0, 0],
        [0, 1, 0, 0, 0]], dtype=torch.uint8)


In [52]:
print(max_target_length)

11


In [54]:
gru = torch.nn.GRU(input_size = 5, hidden_size = 3, num_layers=1)

In [63]:
x = torch.randn(3, 5, 5)

In [64]:
x

tensor([[[ 0.7606, -0.6435, -0.1769, -1.2964, -0.9328],
         [-0.5316, -0.5640,  0.0882,  1.3835,  0.3539],
         [ 1.8324, -0.2292, -0.8020, -1.4747, -0.0541],
         [-0.6323, -0.7169, -0.2268,  1.9690,  0.1143],
         [ 0.6190,  0.8411, -2.2479,  0.2791, -0.6485]],

        [[-0.5636, -0.2744,  1.0614,  1.6762, -0.2682],
         [-1.8209,  0.5697, -0.6093, -1.2058,  0.9373],
         [-0.3847, -0.2412, -0.3489,  0.3419, -1.9268],
         [ 0.0416,  0.3064, -0.1255, -0.9199, -0.6506],
         [-0.2896, -1.4179,  1.7530,  0.1146,  0.2073]],

        [[-1.7934,  1.5147,  1.4379,  0.9670,  1.0614],
         [-0.4330,  0.3590, -0.3347,  0.6064,  0.1310],
         [ 0.1455, -0.1465,  0.2352, -0.0569,  0.2094],
         [-0.5671, -0.6823,  0.7690, -1.1584,  0.2873],
         [ 1.7295, -1.0829, -0.5514,  2.2988, -0.1104]]])

In [70]:
a = torch.rand(6, 7) #(batch_size, seq_len or max words per bach)
lengths = [7,7,6,5,4,2]

In [71]:
lengths

[7, 7, 6, 5, 4, 2]

In [79]:
a

tensor([[0.6346, 0.4711, 0.9906, 0.7849, 0.7462, 0.5860, 0.5907],
        [0.8719, 0.9433, 0.5214, 0.8487, 0.1704, 0.8424, 0.5195],
        [0.4170, 0.1870, 0.0618, 0.3893, 0.1823, 0.8014, 0.0000],
        [0.3333, 0.0549, 0.3940, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3459, 0.1796, 0.0064, 0.5855, 0.0000, 0.0000, 0.0000],
        [0.3830, 0.3656, 0.8027, 0.0000, 0.0000, 0.0000, 0.0000]])

In [80]:
targets = torch.nn.utils.rnn.pack_padded_sequence(a, lengths, batch_first=True)

In [85]:
targets[0]

tensor([0.6346, 0.8719, 0.4170, 0.3333, 0.3459, 0.3830, 0.4711, 0.9433, 0.1870,
        0.0549, 0.1796, 0.3656, 0.9906, 0.5214, 0.0618, 0.3940, 0.0064, 0.7849,
        0.8487, 0.3893, 0.0000, 0.5855, 0.7462, 0.1704, 0.1823, 0.0000, 0.5860,
        0.8424, 0.8014, 0.5907, 0.5195])

In [86]:
targets[1]

tensor([6, 6, 5, 5, 4, 3, 2])

In [89]:
# targets is the packed sequence built above: targets[0] is the flat data and
# targets[1] the number of entries belonging to each time step, so slicing the
# data by those counts prints one time step per line.
cutoff = 0
for i in targets[1]:
    print(targets[0][cutoff:cutoff+i])
    cutoff += i

tensor([0.6346, 0.8719, 0.4170, 0.3333, 0.3459, 0.3830])
tensor([0.4711, 0.9433, 0.1870, 0.0549, 0.1796, 0.3656])
tensor([0.9906, 0.5214, 0.0618, 0.3940, 0.0064])
tensor([0.7849, 0.8487, 0.3893, 0.0000, 0.5855])
tensor([0.7462, 0.1704, 0.1823, 0.0000])
tensor([0.5860, 0.8424, 0.8014])
tensor([0.5907, 0.5195])


In [90]:
b = torch.rand(5,5) #(batch_size, seq_len)

In [92]:
#so each value in any row is basically the encoded word
b

tensor([[0.5633, 0.5129, 0.8349, 0.5787, 0.7113],
        [0.7678, 0.4710, 0.0760, 0.6818, 0.1525],
        [0.0555, 0.3180, 0.1772, 0.6100, 0.9287],
        [0.7592, 0.5092, 0.7764, 0.2982, 0.7805],
        [0.8202, 0.4984, 0.6043, 0.1431, 0.4397]])

In [93]:
x = torch.nn.functional.softmax(b, dim=1)

In [94]:
x

tensor([[0.1839, 0.1749, 0.2413, 0.1867, 0.2132],
        [0.2701, 0.2008, 0.1352, 0.2479, 0.1460],
        [0.1322, 0.1719, 0.1493, 0.2301, 0.3165],
        [0.2248, 0.1751, 0.2287, 0.1418, 0.2296],
        [0.2687, 0.1947, 0.2165, 0.1365, 0.1836]])

In [95]:
sum(x[0])

tensor(1.)

In [96]:
# Custom Negative Log Likelihood Loss
def maskNLLLoss(decoder_out, target, mask, device):
    """Masked negative log-likelihood for a single decoder time step.

    Note: this definition replaces the maskNLLLoss imported from NLLLoss at the
    top of the notebook.

    decoder_out: (batch_size, vocab_size) tensor of probabilities; the log is
        taken here, so it must not already be log-probabilities.
    target: tensor with one target index per batch item.
    mask: boolean tensor with one entry per batch item; True marks a real
        token, False marks padding.
    device: device the resulting loss is moved to.

    Returns (loss, n): loss is the mean of -log p(target) over the unmasked
    items only, n is the number of unmasked items as a Python int.
    """
    ntotal = mask.sum()
    # gather() needs an index of the same rank as its input: (batch_size, 1).
    picked = torch.gather(decoder_out, 1, target.view(-1, 1))
    # Flatten back to (batch_size,) before masking. Left as (batch_size, 1), the
    # values broadcast against the (batch_size,) mask to (batch_size, batch_size)
    # and every row - padded or not - ends up in the mean.
    cross_entropy = -torch.log(picked.squeeze(1))
    loss = cross_entropy.masked_select(mask.reshape(-1)).mean()
    loss = loss.to(device)
    return loss, ntotal.item()

### Demo Training

In [62]:
batch_size = 5
batches = batch2traindata(corpus, [random.choice(cleaned_pairs) for _ in range(batch_size)])
input_array, lengths, target_array, target_mask, max_target_len = batches
print(f"Input Array Shape: {input_array.shape}")
print(f"Lengths Shape: {lengths.shape}")
print(f"Target Array Shape: {target_array.shape}")
print(f"Mask Shape: {target_mask.shape}")
print(f"Max Target Length: {max_target_len}")

Input Array Shape: torch.Size([11, 5])
Lengths Shape: torch.Size([5])
Target Array Shape: torch.Size([11, 5])
Mask Shape: torch.Size([11, 5])
Max Target Length: 11


In [82]:
# Defining training parameters
hidden_size = 500
enocder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
attn_model = 'dot'
embedding = torch.nn.Embedding(corpus.num_words, hidden_size)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [84]:
encoder = Encoder(hidden_size, embedding, enocder_n_layers, dropout)
decoder = Decoder(attn_model, embedding, hidden_size, corpus.num_words, decoder_n_layers, dropout)
encoder.to(device)
decoder.to(device)

Decoder(
  (embedding): Embedding(9030, 500)
  (embedding_dropout): Dropout(p=0.1, inplace=False)
  (gru): GRU(500, 500, num_layers=2, dropout=0.1)
  (concat): Linear(in_features=1000, out_features=500, bias=True)
  (attention): Attention()
  (out): Linear(in_features=500, out_features=9030, bias=True)
)

In [85]:
# Put both networks in training mode (enables dropout).
encoder.train()
decoder.train()
# One Adam optimizer per network, both with learning rate 1e-4.
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=0.0001)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.0001)
# Start from clean gradients.
encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()
# Move the batch to the training device; lengths is kept on the CPU
# (presumably because the encoder packs the sequences - TODO confirm).
input_array = input_array.to(device)
lengths = lengths.to("cpu")
# Fixed typo: this previously read `targer_array`, which raised NameError.
target_array = target_array.to(device)
target_mask = target_mask.to(device)

In [86]:
encoder_outputs, encoder_hidden = encoder(input_array, lengths)

In [87]:
print(f"Encoder Output Shape: {encoder_outputs.shape}")
print(f"Encoder Hidden Shapes: {encoder_hidden.shape}")

Encoder Output Shape: torch.Size([11, 5, 500])
Encoder Hidden Shapes: torch.Size([4, 5, 500])


In [94]:
# Every sequence in the batch starts decoding from the SOS index; the named
# constant replaces the bare literal 1. Resulting shape: (1, batch_size).
decoder_input = torch.LongTensor([[SOS for _ in range(batch_size)]])
decoder_input = decoder_input.to(device)
print(decoder_input.shape)

torch.Size([1, 5])


In [95]:
# Set the initial decoder hidden state to the encoder's final hidden state, sliced so it matches the decoder's number of layers
decoder_hidden = encoder_hidden[:decoder.n_layers]
print(decoder_hidden.shape)

torch.Size([2, 5, 500])


In [96]:
loss = 0
loss_array = []
n_totals = 0

In [99]:
#Assuming we are using teacher forcing
for t in range(max_target_len):
    # One decoder step for the whole batch.
    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
    print(f"Decoder Output Shape: {decoder_output.shape}")
    print(f"Decoder Hidden Shape: {decoder_hidden.shape}")
    #Teacher Forcing: New Input is current target
    decoder_input = target_array[t].view(1,-1)
    print(f"The target variable at the current timestep before reshaping:", target_array[t])
    print(f"The Shape of the target variable before reshaping {target_array.shape}")
    print(f"Target variable shape after reshaping {decoder_input.shape}")
    #Calculate and Accumulate loss
    print(f"Mask at the current timestep {target_mask[t]}")
    print(f"Mask at the current timestep shape {target_mask[t].shape}")
    mask_loss, total_items = maskNLLLoss(decoder_output, target_array[t], target_mask[t].bool(), device)
    print(f"Mask Loss {mask_loss}")
    print(f"Total Items: {total_items}")
    # `loss` keeps the graph for backprop; `loss_array` keeps plain floats,
    # weighted by token count, for the running average that is reported.
    loss += mask_loss
    loss_array.append(mask_loss.item() * total_items)
    print(f"Loss Array {loss_array}")
    n_totals += total_items
    print(f"Total Items {n_totals}")
    returned_loss = sum(loss_array) / n_totals
    print(f"Returned Loss {returned_loss}")
    print("----------------------End of Timestep--------------------------------------")

# Backpropagate once through the loss accumulated over all time steps, then
# update both networks. Previously step() was called on every time step with no
# backward() anywhere, so no gradients existed and no parameter ever changed.
loss.backward()
encoder_optimizer.step()
decoder_optimizer.step()

Decoder Output Shape: torch.Size([5, 9030])
Decoder Hidden Shape: torch.Size([2, 5, 500])
The target variable at the current timestep before reshaping: tensor([443,  50,  43,   3,  57], device='cuda:0')
The Shape of the target variable before reshaping torch.Size([11, 5])
Target variable shape after reshaping torch.Size([1, 5])
Mask at the current timestep tensor([1, 1, 1, 1, 1], device='cuda:0', dtype=torch.uint8)
Mask at the current timestep shape torch.Size([5])
Mask Loss 9.112746238708496
Total Items: 5
Loss Array [45.56373119354248]
Total Items 5
Returned Loss 9.112746238708496
----------------------End of Timestep--------------------------------------
Decoder Output Shape: torch.Size([5, 9030])
Decoder Hidden Shape: torch.Size([2, 5, 500])
The target variable at the current timestep before reshaping: tensor([ 93, 425,   3, 363,  54], device='cuda:0')
The Shape of the target variable before reshaping torch.Size([11, 5])
Target variable shape after reshaping torch.Size([1, 5])
Mask

In [101]:
len(cleaned_pairs)

62810

In [102]:
from dataset import ChatDataset
chatdata = ChatDataset(cleaned_pairs, corpus)

In [103]:
chatdata[0]

['that s because it s such a nice one .', 'forget french .']

In [109]:
from torch.utils.data import DataLoader

In [110]:
def collate_fn(batch):
    """Collate callback for the DataLoader: convert a list of pairs to tensors.

    Delegates to batch2traindata() with the notebook-level `corpus` vocabulary
    and returns its result unchanged.
    """
    prepared = batch2traindata(corpus, batch)
    return prepared

In [111]:
dataloader = DataLoader(chatdata, batch_size=32, shuffle=True, collate_fn=collate_fn)

In [117]:
for batch in dataloader:
    input_array, lengths, target_array, target_mask, max_target_len = batch
    print(f"Input Array Shape: {input_array.shape}")
    print(f"Lengths Shape: {lengths.shape}")
    print(f"Target Array Shape: {target_array.shape}")
    print(f"Mask Shape: {target_mask.shape}")
    print(f"Max Target Length: {max_target_len}")
    print(input_array)
    print("---------------------ENDOFBATCH------------------------------")
    break

Input Array Shape: torch.Size([11, 32])
Lengths Shape: torch.Size([32])
Target Array Shape: torch.Size([11, 32])
Mask Shape: torch.Size([11, 32])
Max Target Length: 11
tensor([[  83,   87,   34,   15,  104,  393, 1748, 3885,   72,  174,   49,   17,
          438,   34,   14,   28,   57,  281,   57,    6,  459,   85,   15,    8,
           34,   43,  174,  344, 8301,  930,   57, 8974],
        [1127,    4,  303,  141,  477, 8073,   11,   34,  276,    8,   17, 1170,
           96,  125,    4,  111,   82,  733,  111,    4,   17,    8,  111,  184,
          101, 5276,  416,  974,   11,   73,   16, 8975],
        [  11,   52,   47,   52,  958,   11,   11,  116,  102,  188,  101,  117,
          846,  509,  361,  709, 6110,    4,   17,  524, 1375, 1273,   60,   63,
          102,   63, 3224,  425,    2,    2,    2,    2],
        [  11,  167, 1308, 5596, 1148,   11,   11,   17,  183,  183,  102,  522,
            4,  198,    8,   63,    4,  361,  306,   14,   93,  253,  685,  903,
          

In [125]:
a, b = decoder_output.topk(1)

In [123]:
b.shape

torch.Size([5, 1])

In [122]:
a

tensor([[0.0001],
        [0.0002],
        [0.0001],
        [0.0001],
        [0.0001]], device='cuda:0', grad_fn=<TopkBackward0>)

In [124]:
decoder_output

tensor([[1.1553e-04, 1.1294e-04, 1.0690e-04,  ..., 1.1395e-04, 1.0120e-04,
         1.1549e-04],
        [1.0311e-04, 1.0790e-04, 1.0737e-04,  ..., 1.1787e-04, 9.2372e-05,
         1.1191e-04],
        [1.0052e-04, 9.9224e-05, 1.0639e-04,  ..., 1.0647e-04, 1.0601e-04,
         1.0701e-04],
        [1.1131e-04, 1.1383e-04, 1.0399e-04,  ..., 1.1313e-04, 9.5457e-05,
         1.0915e-04],
        [1.1124e-04, 1.1287e-04, 1.1108e-04,  ..., 1.0921e-04, 9.8871e-05,
         1.1760e-04]], device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [127]:
len(dataloader)

1963