In [None]:
import random, os, sys
from collections import deque
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [None]:
def randomize_cases(sentence):
    """Return a copy of `sentence` with the case of randomly chosen characters flipped.

    A number of draws between half the length and length-minus-two is made;
    each draw picks a random position (duplicates are ignored) and every
    distinct position picked has its case toggled once. Characters that have
    no case (spaces, punctuation) come out unchanged.

    Args:
        sentence: The string to perturb. Empty and very short strings are
            accepted; an empty string is returned unchanged.

    Returns:
        A new string with the selected characters upper/lower-cased.
    """
    sent_len = len(sentence)
    if sent_len == 0:
        return sentence
    # For fewer than 3 characters `sent_len - 2` drops below the lower bound
    # and random.randint would raise, so clamp the upper bound.
    low = sent_len // 2
    high = max(low, sent_len - 2)
    num_changes = random.randint(low, high)
    change_indices = set()
    for _ in range(num_changes):
        change_indices.add(random.randint(0, sent_len - 1))
    # Work on a list of characters instead of re-slicing the string per change.
    chars = list(sentence)
    for i in change_indices:
        letter = chars[i]
        chars[i] = letter.lower() if letter.isupper() else letter.upper()
    return ''.join(chars)

In [None]:
def make_batches(data, batch_len):
    """Split `data` into consecutive windows, each paired with the item that follows it.

    Windows start at 0, batch_len, 2*batch_len, ... A window is only emitted
    when the element immediately after it exists, so a trailing window with
    no successor is dropped.

    Args:
        data: A sliceable, indexable sequence (e.g. a string or list).
        batch_len: Length of each window; expected to be a positive integer.

    Returns:
        A tuple `(batches, ys)` of two equally long lists: `batches[k]` is the
        k-th window and `ys[k]` is `data[start + batch_len]`, the element
        right after that window.
    """
    # Stop early enough that data[i + batch_len] is always a valid index.
    starts = range(0, len(data) - batch_len, batch_len)
    batches = [data[i:i + batch_len] for i in starts]
    ys = [data[i + batch_len] for i in starts]
    return batches, ys

In [None]:
def get_random_batch(data, batch_len):
    """Pick a random window of `batch_len` items from `data` plus the item that follows it.

    Args:
        data: A sliceable, indexable sequence (e.g. the training string).
        batch_len: Number of items in the window.

    Returns:
        A tuple `(batch, y)` where `batch` is `data[start:start + batch_len]`
        and `y` is `data[start + batch_len]`.

    Raises:
        ValueError: If `data` has fewer than `batch_len + 1` items, so no
            window with a following item exists.
    """
    # random.randint is inclusive on both ends; the last usable start is the
    # one whose target is the final element of `data`.
    last_start = len(data) - batch_len - 1
    if last_start < 0:
        raise ValueError("data must contain at least batch_len + 1 items")
    start_index = random.randint(0, last_start)
    batch = data[start_index:start_index + batch_len]
    y = data[start_index + batch_len]
    return batch, y

In [None]:
def vectorize_batch(data, vocab):
    """One-hot encode every item of `data`.

    Args:
        data: Iterable with a length (e.g. a string) whose items are keys of `vocab`.
        vocab: Mapping from item to its column index.

    Returns:
        A float array of shape `(len(data), len(vocab))` where row `k` is all
        zeros except for a 1 in column `vocab[data[k]]`.
    """
    one_hot = np.zeros((len(data), len(vocab)))
    # Each `row` is a view into `one_hot`, so writing to it fills the matrix.
    for row, symbol in zip(one_hot, data):
        row[vocab[symbol]] = 1
    return one_hot

In [None]:
def vectorize_item(item, vocab):
    """One-hot encode a single item.

    Args:
        item: A key of `vocab`.
        vocab: Mapping from item to its index.

    Returns:
        A 1-D float array of length `len(vocab)` with a 1 at `vocab[item]`
        and zeros elsewhere.
    """
    position = vocab[item]
    encoded = np.zeros(len(vocab))
    encoded[position] = 1
    return encoded

In [None]:
def unvectorize_item(one_hot, vocab_inv):
    """Map a score/one-hot vector back to its vocabulary item.

    Args:
        one_hot: Array-like of scores; the position of the largest value wins.
        vocab_inv: Mapping from index to item.

    Returns:
        `vocab_inv` looked up at the argmax of `one_hot`.
    """
    return vocab_inv[np.argmax(one_hot)]

In [None]:
def unvectorize_batch(batch, vocab_inv):
    """Decode a sequence of score/one-hot vectors into vocabulary items.

    Args:
        batch: Iterable of array-likes, one vector per item.
        vocab_inv: Mapping from index to item.

    Returns:
        A list holding, for each vector, the item at its argmax position.
    """
    return [vocab_inv[np.argmax(vector)] for vector in batch]

In [None]:
class simple_rnn(nn.Module):
    """Stacked GRU followed by a linear layer and a softmax over the outputs.

    The module processes ONE sequence per call: `forward` expects an input of
    shape `(1, seq_len, input_len)` and returns one probability row per time
    step.

    Args:
        input_len: Size of each input vector (the one-hot width).
        rnn_size: Number of hidden units in each GRU layer.
        num_layers: Number of stacked GRU layers.
        output_len: Size of each output vector.
    """

    def __init__(self, input_len, rnn_size, num_layers, output_len):
        super(simple_rnn, self).__init__()
        # batch_first=True: inputs arrive as (batch=1, seq_len, features).
        # With the default (seq_len first) layout the same tensor would be
        # read as seq_len independent sequences of length 1, so the GRU would
        # never carry state from one character to the next.
        self.gru = nn.GRU(input_len, rnn_size, num_layers, batch_first=True)
        self.out = nn.Linear(rnn_size, output_len)

    def forward(self, x):
        """Run the network over one sequence.

        Args:
            x: Float tensor of shape `(1, seq_len, input_len)`.

        Returns:
            Tensor of shape `(seq_len, output_len)`; row `t` is the softmax
            output after the first `t + 1` inputs, so `[-1]` is the prediction
            made after seeing the whole sequence.
        """
        x1, hidden = self.gru(x)
        # Drop the leading batch dimension; this reshape only succeeds when
        # that dimension has size 1.
        x_flat = x1.reshape(x1.size()[1:])
        x2 = self.out(x_flat)
        x3 = F.softmax(x2, dim=1)
        return x3

In [None]:
# Prepare training data
sample_sentence = "All work and no play makes Jack a dull boy. "
create_randomized_training_set = True
train_file = "train.txt"
num_sentences = 100
# The corpus always starts with the untouched sentence; when enabled,
# `num_sentences` case-randomized copies of it are appended.
pieces = [sample_sentence]
if create_randomized_training_set:
    pieces.extend(randomize_cases(sample_sentence) for _ in range(num_sentences))
train = "".join(pieces)
# Keep a copy of the generated corpus on disk.
with open(train_file, "w") as out_file:
    out_file.write(train)
print(len(train))

In [None]:
# Unique characters of the training text, in order of first appearance.
# dict.fromkeys de-duplicates in one pass and, unlike the previous
# range(len(train)-1) loop, also looks at the final character.
v = list(dict.fromkeys(train))
# vocab: character -> index; vocab_inv: index -> character.
vocab = {letter: index for index, letter in enumerate(v)}
vocab_inv = {index: letter for index, letter in enumerate(v)}
print(len(vocab))

In [None]:
# Draw one example window of 25 characters and the character that follows it.
batch, y = get_random_batch(train, 25)
print(batch)
print('y: "' + y + '"')

In [None]:
# One-hot encode the sample batch and show its first five rows.
vecs = vectorize_batch(batch, vocab)
print(vecs[:5])

In [None]:
# Round-trip check: decoding the first ten one-hot rows should reproduce the
# first ten characters of the sample batch. The loop variable is named `row`
# so it does not overwrite the vocabulary list `v` built in an earlier cell.
test_unvec = "".join(unvectorize_item(row, vocab_inv) for row in vecs[0:10])
print(test_unvec)

In [None]:
# Input and output vectors are both one-hot rows, so they share one width.
input_len = output_len = len(vecs[0])
# Number of cells in each rnn layer
rnn_size = 100
# Number of layers
num_layers = 3
# Build the network with float32 parameters and its Adam (AMSGrad) optimizer.
model = simple_rnn(input_len, rnn_size, num_layers, output_len).float()
optimizer = optim.Adam(model.parameters(), lr=0.0004, amsgrad=True)

In [None]:
# Sanity check: run the untrained model once on the sample batch and compute
# the loss the same way the training loop does.
tvecs = torch.FloatTensor(vecs).unsqueeze(0)
print(tvecs.shape)
# `y` is the character that FOLLOWS the batch, so it has to be compared with
# the model's last output row (as the training loop does), not the first.
target = model(tvecs)[-1]
print(target)
tnp = target.detach().numpy()
print(unvectorize_item(tnp, vocab_inv))
print(y)
y_vec = torch.FloatTensor(vectorize_item(y, vocab))
print(y_vec)
# Mean squared error between the one-hot target and the predicted distribution.
loss = (y_vec - target).pow(2).mean()
print(loss)

In [None]:
# Training
# Maximum number of iterations to run
num_iters = 100000
# Number of characters to feed into rnn each iteration
batch_len = 25
# If we exceed this, we finish training
target_accuracy = 0.95
# Number of most recent iterations the running statistics are computed over
window = 1000
# Sliding windows of per-iteration correctness (0/1) and loss values;
# maxlen makes the deque discard the oldest entry automatically.
guesses = deque(maxlen=window)
losses = deque(maxlen=window)
# Fraction of correct guesses within the current window
acc = 0.0
loss_fn = nn.MSELoss()
for iteration in range(num_iters):
    batch, y = get_random_batch(train, batch_len)
    vecs = vectorize_batch(batch, vocab)
    tvecs = torch.FloatTensor(vecs).unsqueeze(0)
    target = model(tvecs)
    # The last output row is the prediction for the character after the batch.
    t_vec = target[-1].double()
    y_vec = torch.tensor(vectorize_item(y, vocab), requires_grad=False, dtype=torch.float64)
    tnp = target[-1].detach().numpy()
    tch = unvectorize_item(tnp, vocab_inv)
    loss = loss_fn(t_vec, y_vec)
    guesses.append(1 if tch == y else 0)
    losses.append(float(loss))
    if iteration % 1000 == 0:
        correct = sum(guesses)
        acc = correct / len(guesses)
        av_loss = np.mean(losses)
        msg = "Iter: " + str(iteration) + " Loss: " + "%.4f" % float(av_loss)
        msg += " Correct: " + str(correct) + "/" + str(len(guesses))
        msg += " Input: \"" + batch + "\""
        msg += " Output: \"" + tch + "\" Expected: \"" + y + "\""
        print(msg)
        # Stop early only once a full window has been observed; otherwise a
        # single lucky guess at iteration 0 would end training immediately.
        if len(guesses) == window and acc > target_accuracy:
            break
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [None]:
# Inference
output = ""
# random.sample() rejects dict views (TypeError on Python 3.11+), so pick
# from a list of the vocabulary's characters instead.
start_char = random.choice(list(vocab))
print("Starting character: \"" + start_char + "\"")
output += start_char
infer_len = 100
# No gradients are needed while generating text.
with torch.no_grad():
    for n in range(infer_len):
        # Feed the most recent characters as one-hot rows, the same input
        # format used during training, rather than feeding the previous
        # softmax output back in.
        context = output[-batch_len:]
        t_sv = torch.FloatTensor(vectorize_batch(context, vocab)).unsqueeze(0)
        next_vec = model(t_sv)
        # The last output row is the prediction for the next character;
        # decode it greedily (argmax).
        nv = next_vec[-1].numpy()
        tnv = unvectorize_item(nv, vocab_inv)
        output += tnv
print("\"" + output + "\"")