# Character Level Model
We didn't see much success with our previous models. We topped out at about 40% accuracy even having duplicates in the dataset.

My theory is that the model struggles because the input and output spaces are so large: ~77k input tokens ~60k output tokens. This means the network must be quite large which makes it difficult to train.

Next, I'd like to recondiser this problem as a character level problem. Given the answer,clue pair `neural,kind of network` we can consider the problem on a character by character basis:

In [122]:
answer = 'neural'
clue = 'kind of network'

print('||||| ', end='')

for c in clue:
    print(f'{c} --> ', end='')

print('||||| ', end='')

for c in answer:
    print(f'{c} --> ', end='')

print('||||| ')

||||| k --> i --> n --> d -->   --> o --> f -->   --> n --> e --> t --> w --> o --> r --> k --> ||||| n --> e --> u --> r --> a --> l --> ||||| 


Now, let's replace those `|||||` separators with things that are meaningful. We'll use `<BOC>`, `<EOC>`, and `<EOA>` tokens.

In [123]:
tokens = ['<BOC>']
tokens.extend(list(answer))
tokens.append('<EOC>')
tokens.extend(list(answer))
tokens.append('<EOA>')

for i, token in enumerate(tokens):
    print(f'{token}', end='')
    if i < len(tokens) - 1:
        print(' --> ', end='')

<BOC> --> n --> e --> u --> r --> a --> l --> <EOC> --> n --> e --> u --> r --> a --> l --> <EOA>

So that looks good, but let's think about the `<EOC>` token - what if we replaced that with a special token indicating the _length of the expected answer_. This serves as a useful indicator to the network as to what should be generated next.

In [124]:
tokens = ['<BOC>']
tokens.extend(list(clue))
tokens.append('<' + str(len(answer)) + '>')
tokens.extend(list(answer))
tokens.append('<EOA>')

for i, token in enumerate(tokens):
    print(f'{token}', end='')
    if i < len(tokens) - 1:
        print(' --> ', end='')

<BOC> --> k --> i --> n --> d -->   --> o --> f -->   --> n --> e --> t --> w --> o --> r --> k --> <6> --> n --> e --> u --> r --> a --> l --> <EOA>

## Working with our Dataset
Let's load up a few data rows and see how this would work

In [125]:
import pandas as pd

df = pd.read_csv("cleaned_data/dupes_10_or_less_tokens.csv", keep_default_na=False)
df.head(5)

Unnamed: 0,answer,clue
0,pat,"action done while saying ""good dog"""
1,rascals,mischief-makers
2,pen,it might click for a writer
3,sep,fall mo.
4,eco,kind to mother nature


In [126]:
def build_tokens(row):
    clue = row['clue']
    answer = row['answer']
    tokens = ['<BOC>']
    tokens.extend(list(clue))
    tokens.append('<' + str(len(answer)) + '>')
    tokens.extend(list(answer))
    tokens.append('<EOA>')
    return tokens

token_batch = []
for i in range(5):
    tokens = build_tokens(df.iloc[i])
    token_batch.append(tokens)
    print(tokens)

['<BOC>', 'a', 'c', 't', 'i', 'o', 'n', ' ', 'd', 'o', 'n', 'e', ' ', 'w', 'h', 'i', 'l', 'e', ' ', 's', 'a', 'y', 'i', 'n', 'g', ' ', '"', 'g', 'o', 'o', 'd', ' ', 'd', 'o', 'g', '"', '<3>', 'p', 'a', 't', '<EOA>']
['<BOC>', 'm', 'i', 's', 'c', 'h', 'i', 'e', 'f', '-', 'm', 'a', 'k', 'e', 'r', 's', '<7>', 'r', 'a', 's', 'c', 'a', 'l', 's', '<EOA>']
['<BOC>', 'i', 't', ' ', 'm', 'i', 'g', 'h', 't', ' ', 'c', 'l', 'i', 'c', 'k', ' ', 'f', 'o', 'r', ' ', 'a', ' ', 'w', 'r', 'i', 't', 'e', 'r', '<3>', 'p', 'e', 'n', '<EOA>']
['<BOC>', 'f', 'a', 'l', 'l', ' ', 'm', 'o', '.', '<3>', 's', 'e', 'p', '<EOA>']
['<BOC>', 'k', 'i', 'n', 'd', ' ', 't', 'o', ' ', 'm', 'o', 't', 'h', 'e', 'r', ' ', 'n', 'a', 't', 'u', 'r', 'e', '<3>', 'e', 'c', 'o', '<EOA>']


That looks good, let's try this out with the first few rows in our dataset.

In [127]:
rows = df.iloc[:100]
all_chars = ''.join(rows['answer']) + ''.join(rows['clue'])

vocab = sorted(list(set(all_chars)))
vocab += ['<BOC>', '<EOA>']

max_answer_length = rows['answer'].str.len().max()
for i in range(1, max_answer_length + 1):
    vocab.append(f'<{i}>')

vocab_stoi = {s:i+1 for i,s in enumerate(vocab)}
vocab_itos = {i:s for s,i in vocab_stoi.items()}
vocab_size = len(vocab)
vocab_size

53

Now that we have our vocabulary built let's build a model for it.

In [128]:
import torch
import torch.nn as nn

class TextGenerator(nn.Module):
    def __init__(self, vocab_size, hidden_size, n_layers=1):
        super(TextGenerator, self).__init__()

        # identiy matrix for generating one-hot vectors
        self.ident = torch.eye(vocab_size)

        # recurrent neural network
        self.rnn = nn.GRU(vocab_size, hidden_size, n_layers, batch_first=True)

        # a fully-connect layer that outputs a distribution over
        # the next token, given the RNN output
        self.decoder = nn.Linear(hidden_size, vocab_size)
    
    def forward(self, inp, hidden=None):
        inp = self.ident[inp]                  # generate one-hot vectors of input
        output, hidden = self.rnn(inp, hidden) # get the next output and hidden state
        output = self.decoder(output)          # predict distribution over next tokens
        return output, hidden

model = TextGenerator(vocab_size, 64)

Let's see what our network will produce given the start token.

In [129]:
criterion = nn.CrossEntropyLoss()
bos_input = torch.Tensor([vocab_stoi['<BOC>']]).long().unsqueeze(0)
output, hidden = model(bos_input, hidden=None)
output # distribution over the first token

tensor([[[-0.0178,  0.0924, -0.0770,  0.0569, -0.0954,  0.1026, -0.0615,
          -0.1311,  0.0406,  0.0392,  0.0010, -0.0470, -0.0253,  0.0990,
           0.1132,  0.0917, -0.0628, -0.1088, -0.0408,  0.1090, -0.0776,
          -0.1293,  0.0132, -0.0651, -0.0441, -0.1235,  0.0489, -0.0926,
          -0.0686, -0.1014, -0.0057, -0.1262,  0.1100,  0.0821, -0.0654,
          -0.1416, -0.1254, -0.0809,  0.0261, -0.0597, -0.0665, -0.1357,
           0.1129, -0.0992,  0.0552, -0.0168, -0.0897,  0.0574, -0.0602,
           0.0288, -0.0957, -0.0481,  0.1097]]], grad_fn=<ViewBackward0>)

In [130]:
first_clue = rows['clue'][0]
first_answer = rows['answer'][0]
target = torch.Tensor([vocab_stoi[first_clue[0]]]).long().unsqueeze(0)
criterion(output.reshape(-1, vocab_size), # reshape to 2D tensor
          target.reshape(-1))             # reshape to 1D tensor

tensor(3.9763, grad_fn=<NllLossBackward0>)

Instead of passing this token back to the NN to genereate the next one, we actually pass in the ground truth data and see the results.

In [131]:
# Use teacher-forcing: we pass in the ground truth `target`,
# rather than using the NN predicted distribution
output, hidden = model(target, hidden)
output # distribution over the second token

tensor([[[-0.0246,  0.1438, -0.0671,  0.0665, -0.0980,  0.1539, -0.0488,
          -0.1359,  0.0687, -0.0104, -0.0483, -0.0631, -0.0049,  0.0842,
           0.1212,  0.1134, -0.0409, -0.0921, -0.0227,  0.1279, -0.0951,
          -0.1102,  0.0191, -0.0683, -0.0436, -0.1478,  0.0586, -0.1148,
          -0.0461, -0.0801, -0.0176, -0.1271,  0.0776,  0.0634, -0.1016,
          -0.1497, -0.1664, -0.0726,  0.0216, -0.0529, -0.0896, -0.1100,
           0.1092, -0.0889,  0.0584, -0.0679, -0.0795,  0.0676, -0.0585,
           0.0418, -0.0949, -0.0583,  0.1216]]], grad_fn=<ViewBackward0>)

In [132]:
target = torch.Tensor([vocab_stoi[first_clue[1]]]).long().unsqueeze(0)
criterion(output.reshape(-1, vocab_size), # reshape to 2D tensor
          target.reshape(-1))             # reshape to 1D tensor

tensor(3.8305, grad_fn=<NllLossBackward0>)

In [133]:
for i in range(2, len(first_clue)):
    output, hidden = model(target, hidden)
    target = torch.Tensor([vocab_stoi[first_clue[i]]]).long().unsqueeze(0)
    loss = criterion(output.reshape(-1, vocab_size), # reshape to 2D tensor
                     target.reshape(-1))             # reshape to 1D tensor
    print(i, output, loss)

2 tensor([[[-0.0317,  0.1444, -0.0016,  0.0600, -0.1215,  0.1242, -0.0587,
          -0.1374,  0.1119,  0.0735,  0.0022, -0.0196, -0.0028,  0.0944,
           0.1236,  0.1139, -0.1002, -0.0901, -0.0004,  0.1442, -0.1161,
          -0.0799,  0.0437, -0.1013, -0.0683, -0.1529,  0.0656, -0.1302,
          -0.0156, -0.0974,  0.0040, -0.1278,  0.0999,  0.0033, -0.0997,
          -0.1592, -0.1424, -0.0563,  0.0772, -0.0236, -0.0481, -0.1121,
           0.1024, -0.0921,  0.0446,  0.0118, -0.1160,  0.0841, -0.1158,
           0.0573, -0.1103, -0.0902,  0.0991]]], grad_fn=<ViewBackward0>) tensor(4.0847, grad_fn=<NllLossBackward0>)
3 tensor([[[-0.0092,  0.1451,  0.0026,  0.0823, -0.1214,  0.1331, -0.0897,
          -0.1330,  0.0821,  0.0672, -0.0024, -0.0094, -0.0151,  0.0892,
           0.1090,  0.1147, -0.1087, -0.0733, -0.0005,  0.1319, -0.1366,
          -0.1210,  0.0094, -0.1123, -0.0509, -0.1379,  0.0388, -0.0994,
           0.0027, -0.1276, -0.0286, -0.1184,  0.1325,  0.0168, -0.0963,
   

Now, we'd expected the answer length token.

In [134]:
answer_token = f'<{len(first_answer)}>'
answer_token

'<3>'

In [135]:
output, hidden = model(target, hidden)
target = torch.Tensor([vocab_stoi[answer_token]]).long().unsqueeze(0)
loss = criterion(output.reshape(-1, vocab_size), # reshape to 2D tensor
                 target.reshape(-1))             # reshape to 1D tensor
print(i, output, loss)

34 tensor([[[-0.0425,  0.1387, -0.0277,  0.0906, -0.1266,  0.1276, -0.1000,
          -0.1590,  0.0964,  0.1226, -0.0134, -0.0744, -0.0434,  0.0870,
           0.1391,  0.0849, -0.1287, -0.1005,  0.0016,  0.1386, -0.0804,
          -0.1239,  0.0313, -0.0982, -0.0442, -0.1599,  0.0415, -0.0826,
           0.0145, -0.1008, -0.0824, -0.1046,  0.1095, -0.0065, -0.1026,
          -0.1756, -0.0787, -0.0439,  0.1088, -0.0291, -0.0454, -0.1067,
           0.0831, -0.1080,  0.0511, -0.0123, -0.1071,  0.1047, -0.0739,
           0.0446, -0.1206, -0.0734,  0.1319]]], grad_fn=<ViewBackward0>) tensor(3.8722, grad_fn=<NllLossBackward0>)


Then, on to the answer

In [136]:
for i in range(len(first_answer)):
    output, hidden = model(target, hidden)
    target = torch.Tensor([vocab_stoi[first_answer[i]]]).long().unsqueeze(0)
    loss = criterion(output.reshape(-1, vocab_size), # reshape to 2D tensor
                     target.reshape(-1))             # reshape to 1D tensor
    print(i, output, loss)

0 tensor([[[-0.0544,  0.1547, -0.0541,  0.1441, -0.1214,  0.1471, -0.1003,
          -0.1201,  0.0877,  0.0842, -0.0173, -0.0832, -0.0372,  0.0544,
           0.1741,  0.0905, -0.1312, -0.0980, -0.0154,  0.1541, -0.0847,
          -0.1468,  0.0150, -0.0603, -0.0431, -0.1722,  0.0678, -0.0733,
          -0.0132, -0.1018, -0.0878, -0.1071,  0.1158, -0.0248, -0.0991,
          -0.1741, -0.1290, -0.0874,  0.0828,  0.0061, -0.0631, -0.0697,
           0.1006, -0.0706,  0.0327, -0.0043, -0.0982,  0.1172, -0.0567,
           0.0412, -0.1139, -0.0592,  0.1274]]], grad_fn=<ViewBackward0>) tensor(4.0298, grad_fn=<NllLossBackward0>)
1 tensor([[[-1.7521e-02,  1.7617e-01, -2.4879e-02,  1.2930e-01, -9.3419e-02,
           1.5921e-01, -1.1288e-01, -8.4005e-02,  1.1554e-01,  5.4016e-02,
          -3.3791e-03, -6.7495e-02, -9.4511e-03,  4.5579e-02,  1.6342e-01,
           6.2786e-02, -1.6795e-01, -8.7427e-02, -1.0543e-04,  1.5991e-01,
          -1.2854e-01, -1.0165e-01,  5.7618e-03, -6.0862e-02, -7.899

Finally, we'd expect the `<EOA>` token.

In [137]:
output, hidden = model(target, hidden)
target = torch.Tensor([vocab_stoi["<EOA>"]]).long().unsqueeze(0)
loss = criterion(output.reshape(-1, vocab_size), # reshape to 2D tensor
                 target.reshape(-1))             # reshape to 1D tensor
print(i, output, loss)

2 tensor([[[-0.0013,  0.1725, -0.0017,  0.0967, -0.1000,  0.1498, -0.0957,
          -0.1276,  0.0858,  0.0337, -0.0220, -0.0270, -0.0096,  0.0711,
           0.1131,  0.0948, -0.1101, -0.0705, -0.0088,  0.1455, -0.1491,
          -0.1231, -0.0018, -0.0962, -0.0506, -0.1395,  0.0421, -0.0946,
          -0.0020, -0.1256, -0.0534, -0.1088,  0.1242,  0.0180, -0.1075,
          -0.1695, -0.1415, -0.0977,  0.0819, -0.0364, -0.0537, -0.0866,
           0.0773, -0.0780,  0.0668, -0.0122, -0.1290,  0.1002, -0.0497,
           0.0686, -0.0962, -0.1009,  0.1423]]], grad_fn=<ViewBackward0>) tensor(3.9923, grad_fn=<NllLossBackward0>)


We don't actually need to loop like this though, we can just pass in the sequence:

In [138]:
indices = [vocab_stoi[ch] for ch in token_batch[0]]
tensor = torch.Tensor(indices).long().unsqueeze(0)

print(tensor.shape)

output, hidden = model(tensor[:,:-1]) # <EOS> is never an input token
target = tensor[:,1:]                 # <BOS> is never a target token
loss = criterion(output.reshape(-1, vocab_size), # reshape to 2D tensor
                 target.reshape(-1))             # reshape to 1D tensor

torch.Size([1, 41])


In [139]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
for it in range(500):
    optimizer.zero_grad()
    output, _ = model(tensor[:,:-1])
    loss = criterion(output.reshape(-1, vocab_size),
                 target.reshape(-1))
    loss.backward()
    optimizer.step()

    if (it+1) % 100 == 0:
        print("[Iter %d] Loss %f" % (it+1, float(loss)))

[Iter 100] Loss 2.502622
[Iter 200] Loss 0.904687
[Iter 300] Loss 0.151702
[Iter 400] Loss 0.040548
[Iter 500] Loss 0.019633


## Full dataset

In [140]:
def build_tokens(row):
    clue = row['clue']
    answer = row['answer']
    tokens = ['<BOC>']
    tokens.extend(list(clue))
    tokens.append('<EOC>')
    tokens.append('<' + str(len(answer)) + '>')
    tokens.extend(list(answer))
    tokens.append('<EOA>')
    return tokens

token_batch = []
for i in range(5000):
    tokens = build_tokens(df.iloc[i])
    token_batch.append(tokens)

rows = df.iloc[:5000]
all_chars = ''.join(rows['answer']) + ''.join(rows['clue'])

vocab = sorted(list(set(all_chars)))
vocab += ['<BOC>', '<EOC>', '<EOA>']

max_answer_length = rows['answer'].str.len().max()
for i in range(1, max_answer_length + 1):
    vocab.append(f'<{i}>')

vocab_stoi = {s:i for i,s in enumerate(vocab)}
vocab_itos = {i:s for s,i in vocab_stoi.items()}
vocab_size = len(vocab)
vocab_size


76

In [141]:
def train(model, data, batch_size=1, num_epochs=1, lr=0.001, print_every=100):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    it = 0
    
    for e in range(num_epochs):
        # get training set
        avg_loss = 0

        for token in data:
            indicies = torch.tensor(list(map(lambda x: vocab_stoi[x], token)))
            target = indicies[1:]
            inp = indicies[:-1]
            # target = token[:, 1:] # skip the first character as that's not a target
            # inp = token[:, :-1] # skil the last value as it's never an input
            # cleanup
            optimizer.zero_grad()
            # forward pass
            output, _ = model(inp)
            loss = criterion(output.reshape(-1, vocab_size), target.reshape(-1))
            # backward pass
            loss.backward()
            optimizer.step()

            avg_loss += loss
            it += 1 # increment iteration count
            if it % print_every == 0:
                print("[Iter %d] Loss %f" % (it+1, float(avg_loss/print_every)))
                print(token)
                #print(indicies.shape)
                #print(output.shape)
                predicted_indicies = output.argmax(1)
                #print(predicted_indicies.shape)
                print(list(map(lambda x: vocab_itos[x.item()], predicted_indicies)))
                # get answer length, would be nice to have this beforehand
                #print("    " + sample_sequence(model, 140, 0.8))
                avg_loss = 0

model = TextGenerator(vocab_size, 128)
train(model, token_batch, batch_size=1, num_epochs=1, lr=0.003, print_every=1000)

## Open Questions
* Should we use separate vocabs for answers??

## References
* https://www.cs.toronto.edu/~lczhang/360/lec/w08/gen.html