In [1]:
# Import Clean Data
#! git clone https://github.com/someshsingh22/Sherlocked

In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from collections import Counter
import os
import re
from argparse import Namespace
import time

In [3]:
flags = Namespace(
    train_file="../input/cano.txt",
    seq_size=64,
    batch_size=256,
    embedding_size=1024,
    lstm_size=1024,
    gradients_norm=5,
    initial_words=['I', 'am'],
    predict_top_k=5,
)

In [4]:
def get_data_from_file(train_file, batch_size, seq_size):
    with open(train_file, 'r') as f:
        text = f.read()
    text = text.split()

    word_counts = Counter(text)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    int_to_vocab = {k: w for k, w in enumerate(sorted_vocab)}
    vocab_to_int = {w: k for k, w in int_to_vocab.items()}
    n_vocab = len(int_to_vocab)

    print('Vocabulary size', n_vocab)

    int_text = [vocab_to_int[w] for w in text]
    num_batches = int(len(int_text) / (seq_size * batch_size))
    in_text = int_text[:num_batches * batch_size * seq_size]
    out_text = np.zeros_like(in_text)
    out_text[:-1] = in_text[1:]
    out_text[-1] = in_text[0]
    in_text = np.reshape(in_text, (batch_size, -1))
    out_text = np.reshape(out_text, (batch_size, -1))
    return int_to_vocab, vocab_to_int, n_vocab, in_text, out_text

In [5]:
def get_batches(in_text, out_text, batch_size, seq_size):
    num_batches = np.prod(in_text.shape) // (seq_size * batch_size)
    for i in range(0, num_batches * seq_size, seq_size):
        yield in_text[:, i:i+seq_size], out_text[:, i:i+seq_size]

In [6]:
class RNNModule(nn.Module):
    def __init__(self, n_vocab, seq_size, embedding_size, lstm_size):
        super(RNNModule, self).__init__()
        self.seq_size = seq_size
        self.lstm_size = lstm_size
        self.embedding = nn.Embedding(n_vocab, embedding_size)
        self.lstm = nn.LSTM(embedding_size,lstm_size,batch_first=True,num_layers=2,dropout=0.2)
        self.dense = nn.Linear(lstm_size, n_vocab)
    
    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.dense(output)

        return logits, state
    
    def zero_state(self, batch_size):
        return (torch.zeros(2, batch_size, self.lstm_size),
                torch.zeros(2, batch_size, self.lstm_size))

In [7]:
def get_loss_and_train_op(net, lr=0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)

    return criterion, optimizer

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
int_to_vocab, vocab_to_int, n_vocab, in_text, out_text = get_data_from_file(flags.train_file, flags.batch_size, flags.seq_size)
net = RNNModule(n_vocab, flags.seq_size,flags.embedding_size, flags.lstm_size)
net = net.to(device)
criterion, optimizer = get_loss_and_train_op(net, 0.01)
iteration = 0

Vocabulary size 21252


In [9]:
def predict(device, net, words, n_vocab, vocab_to_int, int_to_vocab, top_k=5):
    net.eval()

    state_h, state_c = net.zero_state(1)
    state_h = state_h.to(device)
    state_c = state_c.to(device)
    for w in words:
        ix = torch.tensor([[vocab_to_int[w]]]).to(device)
        output, (state_h, state_c) = net(ix, (state_h, state_c))
    
    _, top_ix = torch.topk(output[0], k=top_k)
    choices = top_ix.tolist()
    choice = np.random.choice(choices[0])

    words.append(int_to_vocab[choice])
    for _ in range(40):
        ix = torch.tensor([[choice]]).to(device)
        output, (state_h, state_c) = net(ix, (state_h, state_c))

        _, top_ix = torch.topk(output[0], k=top_k)
        choices = top_ix.tolist()
        choice = np.random.choice(choices[0])
        words.append(int_to_vocab[choice])

    return ' '.join(words).encode('utf-8')

In [10]:
call=0
epochs=150
for e in range(epochs):
    call+=1
    start=time.time()
    batches = get_batches(in_text, out_text, flags.batch_size, flags.seq_size)
    state_h, state_c = net.zero_state(flags.batch_size)

    # Transfer data to GPU
    state_h = state_h.to(device)
    state_c = state_c.to(device)
    for x, y in batches:
        iteration += 1

        # Tell it we are in training mode
        net.train()

        # Reset all gradients
        optimizer.zero_grad()

        # Transfer data to GPU
        x = torch.tensor(x).to(device)
        y = torch.tensor(y).to(device)

        logits, (state_h, state_c) = net(x, (state_h, state_c))
        loss = criterion(logits.transpose(1, 2), y)

        state_h = state_h.detach()
        state_c = state_c.detach()

        loss_value = loss.item()

        # Update the network's parameters
        optimizer.step()
        loss.backward()

        _ = torch.nn.utils.clip_grad_norm_(net.parameters(), flags.gradients_norm)
        optimizer.step()
    if(call%10==0):
        x=predict(device, net, "Wait for me".split(), n_vocab, vocab_to_int, int_to_vocab, top_k=5)
        y=predict(device, net, "Sherlock rubbed his".split(), n_vocab, vocab_to_int, int_to_vocab, top_k=5)
        print("-----\n",x,"\n-----")
        print("-----\n",y,"\n-----")
    print('Epoch: {}/{}'.format(e, epochs),'Loss: {}'.format(loss_value),'Time Taken : {}'.format(time.time()-start))
    for g in optimizer.param_groups:
        g['lr'] *= 0.99
    if loss_value<=0.75:
        break

Epoch: 0/150 Loss: 21.151498794555664 Time Taken : 30.79806923866272
Epoch: 1/150 Loss: 15.387161254882812 Time Taken : 30.704171180725098
Epoch: 2/150 Loss: 14.619292259216309 Time Taken : 30.700597524642944
Epoch: 3/150 Loss: 14.61365795135498 Time Taken : 30.705626010894775
Epoch: 4/150 Loss: 13.542423248291016 Time Taken : 30.701051950454712
Epoch: 5/150 Loss: 14.439783096313477 Time Taken : 30.690541744232178
Epoch: 6/150 Loss: 13.303353309631348 Time Taken : 30.694021224975586
Epoch: 7/150 Loss: 12.390857696533203 Time Taken : 30.70057964324951
Epoch: 8/150 Loss: 11.845221519470215 Time Taken : 30.6942081451416
-----
 b'Wait for me I I but but I and was was was but but was but was his I and his but and and and his but his and but and was but was was was I was his and but I his was' 
-----
-----
 b'Sherlock rubbed his . his his was his I I I and was but was and I and and I but but and his but was and his I but his was his but his his and but but I was was was and' 
-----
Epoch: 9/

In [11]:
flags = Namespace(
    train_file="../input/cano.txt",
    seq_size=64,
    batch_size=256,
    embedding_size=512,
    lstm_size=512,
    gradients_norm=5,
    initial_words=['I', 'am'],
    predict_top_k=5,
)

In [12]:
def get_data_from_file(train_file, batch_size, seq_size):
    with open(train_file, 'r') as f:
        text = f.read()
    text = text.split()

    word_counts = Counter(text)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    int_to_vocab = {k: w for k, w in enumerate(sorted_vocab)}
    vocab_to_int = {w: k for k, w in int_to_vocab.items()}
    n_vocab = len(int_to_vocab)

    print('Vocabulary size', n_vocab)

    int_text = [vocab_to_int[w] for w in text]
    num_batches = int(len(int_text) / (seq_size * batch_size))
    in_text = int_text[:num_batches * batch_size * seq_size]
    out_text = np.zeros_like(in_text)
    out_text[:-1] = in_text[1:]
    out_text[-1] = in_text[0]
    in_text = np.reshape(in_text, (batch_size, -1))
    out_text = np.reshape(out_text, (batch_size, -1))
    return int_to_vocab, vocab_to_int, n_vocab, in_text, out_text

In [13]:
def get_batches(in_text, out_text, batch_size, seq_size):
    num_batches = np.prod(in_text.shape) // (seq_size * batch_size)
    for i in range(0, num_batches * seq_size, seq_size):
        yield in_text[:, i:i+seq_size], out_text[:, i:i+seq_size]

In [14]:
class RNNModule(nn.Module):
    def __init__(self, n_vocab, seq_size, embedding_size, lstm_size):
        super(RNNModule, self).__init__()
        self.seq_size = seq_size
        self.lstm_size = lstm_size
        self.embedding = nn.Embedding(n_vocab, embedding_size)
        self.lstm = nn.LSTM(embedding_size,lstm_size,batch_first=True,num_layers=2,dropout=0.2)
        self.dense = nn.Linear(lstm_size, n_vocab)
    
    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.dense(output)

        return logits, state
    
    def zero_state(self, batch_size):
        return (torch.zeros(2, batch_size, self.lstm_size),
                torch.zeros(2, batch_size, self.lstm_size))

In [15]:
def get_loss_and_train_op(net, lr=0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)

    return criterion, optimizer

In [16]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
int_to_vocab, vocab_to_int, n_vocab, in_text, out_text = get_data_from_file(flags.train_file, flags.batch_size, flags.seq_size)
net = RNNModule(n_vocab, flags.seq_size,flags.embedding_size, flags.lstm_size)
net = net.to(device)
criterion, optimizer = get_loss_and_train_op(net, 0.01)
iteration = 0

Vocabulary size 21252


In [17]:
call=0
epochs=150
for e in range(epochs):
    call+=1
    start=time.time()
    batches = get_batches(in_text, out_text, flags.batch_size, flags.seq_size)
    state_h, state_c = net.zero_state(flags.batch_size)

    # Transfer data to GPU
    state_h = state_h.to(device)
    state_c = state_c.to(device)
    for x, y in batches:
        iteration += 1

        # Tell it we are in training mode
        net.train()

        # Reset all gradients
        optimizer.zero_grad()

        # Transfer data to GPU
        x = torch.tensor(x).to(device)
        y = torch.tensor(y).to(device)

        logits, (state_h, state_c) = net(x, (state_h, state_c))
        loss = criterion(logits.transpose(1, 2), y)

        state_h = state_h.detach()
        state_c = state_c.detach()

        loss_value = loss.item()

        # Update the network's parameters
        optimizer.step()
        loss.backward()

        _ = torch.nn.utils.clip_grad_norm_(net.parameters(), flags.gradients_norm)
        optimizer.step()
    if(call%10==0):
        x=predict(device, net, "Wait for me".split(), n_vocab, vocab_to_int, int_to_vocab, top_k=5)
        y=predict(device, net, "Sherlock rubbed his".split(), n_vocab, vocab_to_int, int_to_vocab, top_k=5)
        print("-----\n",x,"\n-----")
        print("-----\n",y,"\n-----")
    print('Epoch: {}/{}'.format(e, epochs),'Loss: {}'.format(loss_value),'Time Taken : {}'.format(time.time()-start))
    for g in optimizer.param_groups:
        g['lr'] *= 0.99
    if loss_value<=0.75:
        break

Epoch: 0/150 Loss: 7.83593225479126 Time Taken : 17.747049570083618
Epoch: 1/150 Loss: 7.407349109649658 Time Taken : 17.769440412521362
Epoch: 2/150 Loss: 7.672333240509033 Time Taken : 17.766834259033203
Epoch: 3/150 Loss: 7.494349002838135 Time Taken : 17.767624139785767
Epoch: 4/150 Loss: 7.570348739624023 Time Taken : 17.767653703689575
Epoch: 5/150 Loss: 7.573446750640869 Time Taken : 17.766326665878296
Epoch: 6/150 Loss: 7.751138210296631 Time Taken : 17.764432907104492
Epoch: 7/150 Loss: 7.373069763183594 Time Taken : 17.77657389640808
Epoch: 8/150 Loss: 7.513638973236084 Time Taken : 17.771028757095337
-----
 b'Wait for me in the the of the and you the and in you and in the the in you and of you in and you of the the you you in and the you in the of in in the the you of' 
-----
-----
 b'Sherlock rubbed his the in of you in in you and you you of the and in the you you of of of and the of in you and in the you and you you the in in in the and and in and' 
-----
Epoch: 9/150 Loss