## Import libraries

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

In [2]:
# import pandas as pd
# data = pd.read_csv('book-war-and-peace.txt', header = None, sep=" ", error_bad_lines=False);

In [3]:
txt = open('book-war-and-peace.txt', "r")
# print(txt.readline())
# print(txt.read(10))

In [4]:
book_chars = []

In [5]:
for line in txt:
    for char in line:
        book_chars.append(char)

In [6]:
txt.close()

In [7]:
book_chars[:10]

['C', 'H', 'A', 'P', 'T', 'E', 'R', ' ', 'I', '\n']

In [8]:
s=set(book_chars)
print('Elements:',s)
print('Set size:', len(s))

Elements: {'/', 'w', '1', '6', '4', 'a', 'X', 'z', 'E', '\n', '"', 'R', 'h', "'", 'S', 'W', 'D', 'n', '9', 't', 'd', '.', 'f', 'ä', 'y', 'Y', 'i', 'l', '!', '7', 'Q', 'P', 'j', 'K', ':', 'v', '0', 'r', 'ê', '*', 'p', 'b', 'q', 'I', 'e', '=', 'F', 'm', 'o', '?', '-', ')', '8', 'L', 'N', '3', ';', ',', 'J', 'g', 'M', 'T', 'C', 'c', '5', 'u', 'O', 'A', 'H', 'é', 's', 'U', 'B', ' ', 'V', 'k', '2', 'x', 'à', 'Z', '(', 'G'}
Set size: 82


In [9]:
book_dict = {}

In [10]:
i = 0
for char in s:
    book_dict[char] = i
    i += 1

In [11]:
len(book_chars)

3202303

In [12]:
book_dict["a"]

5

## One-hot encoding

In [13]:
book_chars = np.array(book_chars)
dict_size = len(s)
seq_len = 1
batch_size = 25

In [14]:
def one_hot_encode(sequence, dict_size, 
                   seq_len, batch_size,
                      num_batches, book_dict):
    # Creating a multi-dimensional array of zeros with the 
    # desired output shape
    features = np.zeros((batch_size, seq_len, dict_size), 
                        dtype=np.float32)

    # Replacing the 0 at the relevant character index 
    # with a 1 to represent that character
    for i in range(batch_size):
        for u in range(seq_len):
            char_ix = (num_batches - 1) * batch_size + i
            features[i, u, book_dict[sequence[char_ix]]] = 1
    return features

In [35]:
def target_encode(sequence, batch_size, char_dict, num_batches):
    batch_list = []
    for x in range(batch_size):
        char_idx = (num_batches - 1) * batch_size + x
        char_val = char_dict[sequence[char_idx]]
        batch_list.append(char_val)
    return batch_list

In [33]:
features = one_hot_encode(book_chars, dict_size, seq_len, batch_size, 2, book_dict)
print(features[-1])

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


## Test Train Split using 70-30

In [17]:
train_char_len = (int) (0.7 * len(book_chars))
test_char_len = len(book_chars) - train_char_len
print(len(book_chars), train_char_len, test_char_len)

3202303 2241612 960691


In [18]:
train_batches = (int) (train_char_len / batch_size)
test_batches = (int) (test_char_len / batch_size)
print(train_batches, test_batches,test_batches + train_batches )

89664 38427 128091


In [19]:
X_train = []
for i in range(train_batches):
    minibatch = one_hot_encode(book_chars, dict_size, seq_len, batch_size, i, book_dict)
    X_train.append(torch.Tensor(minibatch))

In [127]:
Y_train = []
for i in range(train_batches):
    Y_train.append(torch.LongTensor(target_encode(book_chars[1:], batch_size, book_dict, i)))

In [128]:
print(len(X_train))
print(X_train[-1][0])
print(len(Y_train))
print(Y_train[0])

89664
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]])
89664
tensor([26, 63, 12, 73,  1, 44, 73,  5, 37, 44, 73, 17, 48, 19,  9, 63, 48, 17,
        70, 63, 26, 48, 65, 70, 21])


In [23]:
X_test = []
for i in range(train_batches, train_batches + test_batches):
    minibatch = one_hot_encode(book_chars, dict_size, seq_len, batch_size, i, book_dict)
    X_test.append(torch.Tensor(minibatch))

In [129]:
Y_test = []
for i in range(train_batches, train_batches + test_batches):
    Y_test.append(torch.LongTensor(target_encode(book_chars[1:], batch_size, book_dict, i)))

In [130]:
print(len(X_test))
print(X_test[-1][0])
print(len(Y_test))
print(Y_test[1])

38427
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
38427
tensor([12, 44, 73, 22, 65, 37, 17, 26, 19, 65, 37, 44, 73,  1,  5, 70,  9, 41,
        44, 26, 17, 59, 73, 63,  5])


In [46]:
def convert_book():
    char_vals = []
    for c in book_chars:
        char_vals.append(book_dict[c])
    return char_vals

In [48]:
%time vals = convert_book()
print(len(vals))

CPU times: user 2.66 s, sys: 62.5 ms, total: 2.72 s
Wall time: 3.48 s
3202303


## Model definition

In [119]:
class RNN(nn.Module):
    
    def __init__(self, input_dim, hidden_dim, output_dim, no_layers = 1):
        super(RNN, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.no_layers = no_layers
        
        self.rnn_layer = nn.RNN(self.input_dim, 
                        self.hidden_dim, self.no_layers,
                                batch_first = True)
        self.linear_out = nn.Linear(self.hidden_dim, self.output_dim)
        self.softmax = nn.LogSoftmax(dim = 1)
        
    def forward(self, x):
        batch_size = x.size(0)
        hidden = torch.zeros(self.no_layers, batch_size, self.hidden_dim).requires_grad_()
        out, hidden = self.rnn_layer(x, hidden.detach())
        out = out.view(-1, self.hidden_dim)
        out = self.linear_out(out)
        out = self.softmax(out)
        return out, hidden

In [120]:
input_size = len(s)
hidden_size = 128
output_size = len(s)
rnn = RNN(input_size, hidden_size, output_size)

In [121]:
epochs = 100
lr = 1e-3
# Loss, Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr = lr)

## Training Run

In [132]:
out, hidden = rnn(X_train[0])
loss = criterion(output, Y_train[0])
# print(out[:2])
print(out.size())
print(loss)

torch.Size([25, 82])
tensor(4.4124, grad_fn=<NllLossBackward>)


In [133]:
Y_train[:2]

[tensor([26, 63, 12, 73,  1, 44, 73,  5, 37, 44, 73, 17, 48, 19,  9, 63, 48, 17,
         70, 63, 26, 48, 65, 70, 21]),
 tensor([68, 67, 31, 61,  8, 11, 73, 43,  9,  9, 10, 15, 44, 27, 27, 57, 73, 31,
         37, 26, 17, 63, 44, 57, 73])]

In [None]:
for epoch in range(1, epochs+1):
    for x, target in zip(X_train, Y_train):
    
        optimizer.zero_grad()

        output, hidden = rnn(x)
        
        loss = criterion(output, target)
        loss.backward() 
        optimizer.step() 

        if epoch%5 == 0:
            print('Epoch: {}/{}.............'.format(epoch, epochs), end=' ')
            print("Loss: {:.4f}".format(loss.item()))