# Importing Libraries

In [0]:
import re
import torch
import warnings
import numpy as np
from torch import nn
import torch.nn.functional as F
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import DataLoader

In [0]:
warnings.filterwarnings('ignore')

# Importing  Dataset

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [4]:
cd drive/My\ Drive/dataset

/content/drive/My Drive/dataset


In [5]:
!ls

cifar-10-python  processed_tag	  test.pickle
output.csv	 shakespeare.txt  train.pickle


In [0]:
with open('shakespeare.txt', 'r') as f:
    text = f.read()

In [7]:
print(text[:100])


                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose mi


# Pre processing text

In [0]:
class DataPreprocessing:
  
  def __init__(self,text):
    self.text = text
    
  def display_text(self):
    print(self.text)
    
  def  remove_numerals(self):
    return re.sub(r'[0-9]+','', self.text)
  
  def lower_text(self):
    return "".join(list(map(lambda x : x.lower(), self.text)))
  
  def remove_new_line(self):
    return re.sub("[\n]+"," ",self.text)
  
  def remove_multiple_spaces(self):
    return re.sub(' +', ' ',self.text)[1:]

In [0]:
processed_text = DataPreprocessing(text)

In [0]:
setattr(processed_text,'text',processed_text.remove_numerals())

In [0]:
setattr(processed_text,'text',processed_text.lower_text())

In [0]:
setattr(processed_text,'text',processed_text.remove_new_line())

In [0]:
setattr(processed_text, 'text', processed_text.remove_multiple_spaces())

# Vectorization and One hot encoding

In [0]:
class VectorizeEncode:
  
  def __init__(self,text):
    self.text = text
    self.character_dict = tuple(set(self.text))
    self.int2char = dict(enumerate(self.character_dict))
    self.char2int = {ch:idx for idx,ch in self.int2char.items()}
    self.label_encoded_value = np.array([self.char2int[char] for char in self.text])
    
  def onehot_encoding(self):
    onehot_encoder = OneHotEncoder(sparse=False)
    onehot_encoded = onehot_encoder.fit_transform(self.label_encoded_value.reshape(len(self.label_encoded_value),1))
    return onehot_encoded  

In [0]:
encoded_vector = VectorizeEncode(getattr(processed_text,'text'))

In [0]:
encoded_value = np.array(encoded_vector.onehot_encoding())

In [0]:
CHAR_LEN=10

In [0]:
train_dataset =[]
for idx in range(len(encoded_value)-CHAR_LEN):
  train_dataset.append((encoded_value[idx:idx+CHAR_LEN].flatten(), encoded_value[idx+CHAR_LEN]))

In [19]:
train_dataset[:1]

[(array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

In [0]:
BATCH_SIZE = 10

In [0]:
loader = DataLoader(train_dataset, batch_size=BATCH_SIZE)

In [22]:
# Check if GPU is available
train_on_gpu = torch.cuda.is_available()
if(train_on_gpu):
    print('Training on GPU!')
else: 
    print('No GPU available, training on CPU; consider making n_epochs very small.')

Training on GPU!


# Declaring Model

In [0]:
class RecurrentLanguageModel(nn.Module):
  
  def __init__ (self, batch, seq_len,num_layers, num_directions, hidden_size, lr=0.001, drop_prob=0.5, input_size=1):
    
    super().__init__()
    
    #lstm required dimensions
    self.lr = lr
    self.batch = batch
    self.seq_len = seq_len
    self.drop_prob = drop_prob
    self.num_layers = num_layers
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.num_directions = num_directions
    
    self.rnn = nn.RNN(input_size=(self.seq_len,self.batch,self.input_size),
                      hidden_size=(self.num_layers*self.num_directions, hidden_size),
                      batch_first=True,
                      dropout=self.drop_prob )
    