<a href="https://colab.research.google.com/github/someshsingh22/Sherlocked/blob/master/Sherlocked.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
! git clone https://github.com/someshsingh22/Sherlocked

Cloning into 'Sherlocked'...
remote: Enumerating objects: 51, done.[K
remote: Counting objects: 100% (51/51), done.[K
remote: Compressing objects: 100% (46/46), done.[K
remote: Total 51 (delta 13), reused 39 (delta 1), pack-reused 0[K
Unpacking objects: 100% (51/51), done.


In [0]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from collections import Counter
import os
import re
from argparse import Namespace
import time
from tqdm import tqdm

In [0]:
data_dir="./Sherlocked/Dataset/Clean/{}.txt"
flags = Namespace(
    train_file=data_dir.format("cano"),
    seq_size=64,
    batch_size=256,
    embedding_size=256,
    lstm_size=1024,
    gradients_norm=5,
    initial_words=['I', 'am'],
    predict_top_k=5,
    checkpoint_path='./checkpoint',)

In [0]:
def get_data_from_file(train_file, batch_size, seq_size):
    with open(train_file, 'r') as f:
        text = f.read()
    text = text.split()

    word_counts = Counter(text)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    int_to_vocab = {k: w for k, w in enumerate(sorted_vocab)}
    vocab_to_int = {w: k for k, w in int_to_vocab.items()}
    n_vocab = len(int_to_vocab)

    print('Vocabulary size', n_vocab)

    int_text = [vocab_to_int[w] for w in text]
    num_batches = int(len(int_text) / (seq_size * batch_size))
    in_text = int_text[:num_batches * batch_size * seq_size]
    out_text = np.zeros_like(in_text)
    out_text[:-1] = in_text[1:]
    out_text[-1] = in_text[0]
    in_text = np.reshape(in_text, (batch_size, -1))
    out_text = np.reshape(out_text, (batch_size, -1))
    return int_to_vocab, vocab_to_int, n_vocab, in_text, out_text

In [0]:
def get_batches(in_text, out_text, batch_size, seq_size):
    num_batches = np.prod(in_text.shape) // (seq_size * batch_size)
    for i in range(0, num_batches * seq_size, seq_size):
        yield in_text[:, i:i+seq_size], out_text[:, i:i+seq_size]

In [0]:
class RNNModule(nn.Module):
    def __init__(self, n_vocab, seq_size, embedding_size, lstm_size, lstm_layers=2, is_bidirectional=False):
        super(RNNModule, self).__init__()
        self.seq_size = seq_size
        self.lstm_size = lstm_size
        self.embedding = nn.Embedding(n_vocab, embedding_size)
        self.lstm_layers=lstm_layers
        self.is_bidirectional=is_bidirectional
        self.lstm = nn.LSTM(embedding_size,lstm_size,batch_first=True,num_layers=lstm_layers,dropout=0.1,bidirectional=is_bidirectional)
        self.dense = nn.Linear(lstm_size, n_vocab)
    
    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.dense(output)

        return logits, state
    
    def zero_state(self, batch_size):
        return (torch.zeros(lstm_layers*(2 if is_bidirectional else 1), batch_size, self.lstm_size),
                torch.zeros(self.lstm_layers*(2 if is_bidirectional else 1), batch_size, self.lstm_size))

In [0]:
def get_loss_and_train_op(net, lr=0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    return criterion, optimizer

In [0]:
def lr_update(optimizer,decay):
  

SyntaxError: ignored

In [0]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
int_to_vocab, vocab_to_int, n_vocab, in_text, out_text = get_data_from_file(flags.train_file, flags.batch_size, flags.seq_size)
net = RNNModule(n_vocab, flags.seq_size,flags.embedding_size, flags.lstm_size)
net = net.to(device)
criterion, optimizer = get_loss_and_train_op(net, 0.01)

Vocabulary size 21252


In [0]:
in_text.shape

(256, 2944)

In [0]:
iteration = 0
epochs=50
for e in range(epochs):
  start=time.time()
  batches = get_batches(in_text, out_text, flags.batch_size, flags.seq_size)
  state_h, state_c = net.zero_state(flags.batch_size)

  # Transfer data to GPU
  state_h = state_h.to(device)
  state_c = state_c.to(device)
  for x, y in batches:
    iteration += 1

    # Tell it we are in training mode
    net.train()

    # Reset all gradients
    optimizer.zero_grad()

    # Transfer data to GPU
    x = torch.tensor(x).to(device)
    y = torch.tensor(y).to(device)

    logits, (state_h, state_c) = net(x, (state_h, state_c))
    loss = criterion(logits.transpose(1, 2), y)

    state_h = state_h.detach()
    state_c = state_c.detach()

    loss_value = loss.item()

    # Update the network's parameters
    optimizer.step()
    loss.backward()

    _ = torch.nn.utils.clip_grad_norm_(net.parameters(), flags.gradients_norm)
    optimizer.step()
  print('Epoch: {}/{}'.format(e, epochs),'Loss: {}'.format(loss_value),'Time Taken : {}'.format(time.time()-start))

Epoch: 0/50 Loss: 5.400121212005615 Time Taken : 24.643052339553833
Epoch: 1/50 Loss: 5.212407112121582 Time Taken : 25.950422763824463
Epoch: 2/50 Loss: 5.067686557769775 Time Taken : 25.42293691635132
Epoch: 3/50 Loss: 4.9876298904418945 Time Taken : 25.024222373962402
Epoch: 4/50 Loss: 4.886146545410156 Time Taken : 25.23982071876526
Epoch: 5/50 Loss: 4.814850807189941 Time Taken : 25.432995557785034
Epoch: 6/50 Loss: 4.738186359405518 Time Taken : 25.31384539604187
Epoch: 7/50 Loss: 4.681518077850342 Time Taken : 25.288341522216797
Epoch: 8/50 Loss: 4.595388412475586 Time Taken : 25.334911823272705
Epoch: 9/50 Loss: 4.561583995819092 Time Taken : 25.292315006256104
Epoch: 10/50 Loss: 4.519013404846191 Time Taken : 25.317748069763184
Epoch: 11/50 Loss: 4.483057498931885 Time Taken : 25.31722855567932
Epoch: 12/50 Loss: 4.442838668823242 Time Taken : 25.300121784210205
Epoch: 13/50 Loss: 4.407785892486572 Time Taken : 25.300395250320435
Epoch: 14/50 Loss: 4.3404927253723145 Time Take

In [0]:
def predict(device, net, words, n_vocab, vocab_to_int, int_to_vocab, top_k=5):
    net.eval()

    state_h, state_c = net.zero_state(1)
    state_h = state_h.to(device)
    state_c = state_c.to(device)
    for w in words:
        ix = torch.tensor([[vocab_to_int[w]]]).to(device)
        output, (state_h, state_c) = net(ix, (state_h, state_c))
    
    _, top_ix = torch.topk(output[0], k=top_k)
    choices = top_ix.tolist()
    choice = np.random.choice(choices[0])

    words.append(int_to_vocab[choice])
    for _ in range(100):
        ix = torch.tensor([[choice]]).to(device)
        output, (state_h, state_c) = net(ix, (state_h, state_c))

        _, top_ix = torch.topk(output[0], k=top_k)
        choices = top_ix.tolist()
        choice = np.random.choice(choices[0])
        words.append(int_to_vocab[choice])

    return ' '.join(words).encode('utf-8')


In [0]:
 y=predict(device, net, "Sherlock Holmes".split(), n_vocab, vocab_to_int, int_to_vocab, top_k=2)

In [0]:
y

b'Sherlock Holmes had said to my own to do with the first thing that I had seen a more powerfully built young man , and the same necessity was growing in the room , and I was to be able for a few minutes , but he had been a very busy man . He was a very tall , dark man with the broad and ready face . He was a perfect savage , tall , strong , strong object was a familiar man , and he was a very tall and fair haired woman hobbled to the table . He was'

In [0]:
path="./Sherlocked/Dataset/Clean/{}.txt"

Data_flags = Namespace(
    data_dir=path.format("cano"),
    seq_size=64,
    batch_size=256,
)
Brain_flags=Namespace(
    num_layers=1,
    is_bidirectional=False,
    seq_size=Data_flags.seq_size,
    batch_size=Data_flags.batch_size,
    embedding_size=256,
    lstm_size=256,
    gradients_norm=5,
    dropout=0.1
)

In [0]:
# Data Class
class Data :
  def __init__(self,Data_flags):
    self.flags=Data_flags
    
    # Pre Process Data
    self.pre_process()
    
  # Prepare Data For Training
  def pre_process(self):
    start=time.time()
    # Read datat and split to words from files
    text=open(Data_flags.data_dir).read().split()

    # Create Frequency Dictionary
    word_counts = Counter(text)
    self.sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)

    # Word2Vec Mapping
    self.int_to_vocab = {k: w for k, w in enumerate(self.sorted_vocab)}
    self.vocab_to_int = {w: k for k, w in self.int_to_vocab.items()}
    self.n_vocab = len(self.int_to_vocab)

    # Create Input-Output Data
    self.int_text = [self.vocab_to_int[w] for w in text]
    num_batches = int(len(self.int_text) / (Data_flags.seq_size * Data_flags.batch_size))
    self.in_text = self.int_text[:num_batches * Data_flags.batch_size * Data_flags.seq_size]
    self.out_text = np.zeros_like(self.in_text)
    self.out_text[:-1] = self.in_text[1:]
    self.out_text[-1] = self.in_text[0]
    self.in_text = np.reshape(self.in_text, (Data_flags.batch_size, -1))
    self.out_text = np.reshape(self.out_text, (Data_flags.batch_size, -1))
    print("Data Preprocessing complete with {} words".format(self.n_vocab))

  # Create Input-Output Batch Generator 
  def get_batches(self):
    num_batches = np.prod(self.in_text.shape) // (self.flags.seq_size * self.flags.batch_size)
    for i in range(0, num_batches * self.flags.seq_size, self.flags.seq_size):
      yield self.in_text[:, i:i+self.flags.seq_size], self.out_text[:, i:i+self.flags.seq_size]

In [0]:
# Neural Network
class Network(nn.Module):
  def __init__(self,flags,data):
    super(Network, self).__init__()
    self.flags=flags

    # Embedding Layer , LSTM Stack and Output Layer
    self.embedding= nn.Embedding(data.n_vocab, 256)
    self.lstm=nn.LSTM(256,256,batch_first=True,num_layers=2,dropout=0.1)
    self.dense = nn.Linear(256, data.n_vocab)

  # Forward Pass
  def forward(self, x, prev_state):
      embed = self.embedding(x)
      output, state = self.lstm(embed, prev_state)
      logits = self.dense(output)
      return logits, state

  # ZeroState Init
  def zero_state(self):
        return (torch.zeros(self.flags.num_layers*(2 if self.flags.is_bidirectional else 1), self.flags.batch_size, self.flags.lstm_size),
                torch.zeros(self.flags.num_layers*(2 if self.flags.is_bidirectional else 1), self.flags.batch_size, self.flags.lstm_size))

# Solver / Optimizer
class Optimizer :
  def __init__(self,Network):
    self.lr=0.02
    self.Network=Network

    # Defining Loss function and Optimizer
    self.criterion = nn.CrossEntropyLoss()
    self.optimizer = torch.optim.Adam(self.Network.parameters(), lr=self.lr)

  def decay(self,loss):

    # Update with Factor or value
    def factor_update(self,factor):
      for group in self.optimizer.param_groups:
        group['lr']*=value
      self.lr*=value
    def value_update(self,value):
      for group in self.optimizer.param_groups:
        group['lr']=value
      self.lr=value

    if self.lr > 0.001:
      # Initial Fast Changes
      if loss > 10 :
        value_update(0.02)
      elif loss > 7 :
        value_update(0.01)
      #slower changes
      elif loss > 4 :
        value_update(0.005)
      #exponential drop
      else :
        factor_update(0.97)


In [289]:
d=Data(Data_flags)
n=Network(Brain_flags,d)
o=Optimizer(n)

Data Preprocessing complete with 21252 words


In [0]:
# Neural Network aka Brain
class Brain :
  def __init__(self,flags,data,Network):
    self.data=data
    self.device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.loss=0
    self.flags=flags
    self.Network=Network
    self.Optimizer=Optimizer(self.Network)
    self.Network.to(self.device)
    
  #Train Function
  def Train(self,epochs):
    self.Network.train()
    for epoch in range(epochs):
      start=time.time()
      batches=self.data.get_batches()
      state_h, state_c = self.Network.zero_state()
      
      
      # Transfer data to GPU
      state_h = state_h.to(self.device)
      state_c = state_c.to(self.device)
      
      for x, y in tqdm(batches):
        
        # Reset all gradients
        self.Optimizer.optimizer.zero_grad()

        # Transfer data to GPU
        x = torch.tensor(x).to(self.device)
        y = torch.tensor(y).to(self.device)
        
        logits, (state_h, state_c) = self.Network(x, (state_h, state_c))
        self.loss = self.Optimizer.criterion(logits.transpose(1, 2), y)

        state_h = state_h.detach()
        state_c = state_c.detach()

        self.loss = loss.item()
        
        # Update the network's parameters
        self.Optimizeroptimizer.step()
        self.loss.backward()
        _ = torch.nn.utils.clip_grad_norm_(self.Network.parameters(), self.flags.gradients_norm)
        self.Optimizer.decay(self.loss)
        
        self.Optimizer.optimizer.step()
      print('Epoch: {}/{}'.format(epoch, epochs),'Loss: {}'.format(self.loss),'Time Taken : {}'.format(time.time()-start))

In [0]:
from torchsummary import summary

In [317]:
br=Brain(Brain_flags,d,n)
br.Train(2)




0it [00:00, ?it/s][A[A[A


[A[A[A

TypeError: ignored

In [231]:
print(data)

<__main__.Data object at 0x7f65a191b9e8>
