## Import libraries

In [33]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

In [6]:
# import pandas as pd
# data = pd.read_csv('book-war-and-peace.txt', header = None, sep=" ", error_bad_lines=False);

In [7]:
txt = open('book-war-and-peace.txt', "r")
# print(txt.readline())
# print(txt.read(10))

In [8]:
book_chars = []

In [9]:
for line in txt:
    for char in line:
        book_chars.append(char)

In [10]:
txt.close()

In [11]:
book_chars[:10]

['C', 'H', 'A', 'P', 'T', 'E', 'R', ' ', 'I', '\n']

In [12]:
s=set(book_chars)
print('Elements:',s)
print('Set size:', len(s))

Elements: {'l', 'i', 'C', 'j', 'a', 'E', 'G', 'I', '!', 'J', '5', '0', 'é', '*', 'g', '2', ')', 'q', 'e', 'n', ':', 'U', 'L', ';', 'F', '4', 'A', 'Z', 'm', '.', '9', 'X', '8', '?', '=', 'P', 'N', 'ä', 'V', 'R', '1', '-', 'ê', '3', '\n', 'r', 'v', 'Y', 'c', 'K', '/', 'p', 's', 't', '7', 'k', 'H', 'u', 'Q', 'f', '"', 'B', '(', 'y', 'x', 'd', 'O', '6', 'W', 'D', 'z', 'h', ',', 'b', 'w', 'o', 'T', 'à', "'", 'S', 'M', ' '}
Set size: 82


In [18]:
book_dict = {}

In [19]:
i = 0
for char in s:
    book_dict[char] = i
    i += 1

In [25]:
len(book_chars)

3202303

In [23]:
book_dict["a"]

4

## One-hot encoding

In [36]:
book_chars = np.array(book_chars)
dict_size = len(s)
seq_len = 1
batch_size = 25

In [57]:
def one_hot_encode(sequence, dict_size, 
                   seq_len, batch_size,
                      num_batches, book_dict):
    # Creating a multi-dimensional array of zeros with the 
    # desired output shape
    features = np.zeros((batch_size, seq_len, dict_size), 
                        dtype=np.float32)

    # Replacing the 0 at the relevant character index 
    # with a 1 to represent that character
    for i in range(batch_size):
        for u in range(seq_len):
            char_ix = (num_batches - 1) * batch_size + i
            features[i, u, book_dict[sequence[char_ix]]] = 1
    return features

In [64]:
features = one_hot_encode(book_chars, dict_size, seq_len, batch_size, 2, book_dict)
print(features[-1])

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


## Test Train Split using 70-30

In [144]:
total_batches = (int) (len(book_chars) / batch_size)
train_batches = (int) (0.7 * total_batches)
test_batches = total_batches - train_batches
print(total_batches, train_batches, test_batches)

128092 89664 38428


In [145]:
train_set = []
for i in range(train_batches):
    minibatch = one_hot_encode(book_chars, dict_size, seq_len, batch_size, i, book_dict)
    train_set.append(torch.Tensor(minibatch))

In [146]:
print(len(train_set))
print(train_set[-1][0])

89664
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])


In [147]:
test_set = []
for i in range(train_batches, train_batches + test_batches):
    minibatch = one_hot_encode(book_chars, dict_size, seq_len, batch_size, i, book_dict)
    test_set.append(torch.Tensor(minibatch))

In [148]:
print(len(test_set))
print(test_set[-1][0])

38428
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])


In [149]:
y_train = book_chars[ batch_size : train_batches*batch_size: 26]
print(y_train[10:20])
print(len(y_train))

['h' 'n' 'i' 'u' 'r' 'I' 'u' 'l' 'l' 'w']
86215


In [150]:
y_train_encoding = one_hot_encode(y_train, dict_size, seq_len, len(y_train), 1, book_dict)
print(len(y_train_encoding))

86215


In [152]:
y_test = book_chars[ train_batches*batch_size + batch_size : : 26]
print(y_test[:5])
print(len(y_test))

['a' ' ' 'r' 'h' 'o']
36950


In [142]:
y_test_encoding = one_hot_encode(y_test, dict_size, seq_len, len(y_test), 1, book_dict)
print(len(y_test_encoding))

36950


In [108]:
# features_train = torch.Tensor(train_set)
# features_test = torch.Tensor(test_set)

In [109]:
print(type(features_train))

<class 'torch.Tensor'>


## Model definition

In [153]:
class RNN(nn.Module):
    
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers = 1):
        super(RNN, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        
        self.rnn_layer = nn.RNN(self.input_dim, 
                        self.hidden_dim, 
                        num_layers)
        self.connected = nn.Linear(self.hidden_dim, self.output_dim)
        
    def forward(self, x):
        batch_size = x.size(0)
        hidden = 
        c_n , h_n = self.rnn.layer(x, h_last)
        c_n = self.connected(c_n)
        return h_n

SyntaxError: invalid syntax (<ipython-input-153-c5e979e9e191>, line 16)

In [102]:
input_size = len(s)
hidden_size = 12
output_size = len(s)
rnn = RNN(input_size, hidden_size, output_size)

In [103]:
epochs = 100
lr = 1e-3

# Loss, Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr = lr)

## Training Run

In [143]:
for epoch in range(1, epochs+1):
    for input, target in zip():
    
        optimizer.zero_grad()

        # train_batch.to(device)

        output, hidden = rnn(train_batch)
        
        loss = criterion(output, )
        loss.backward() 
        optimizer.step() 

        if epoch%10 == 0:
            print('Epoch: {}/{}.............'.format(epoch, n_epochs), end=' ')
            print("Loss: {:.4f}".format(loss.item()))

TypeError: forward() takes 1 positional argument but 2 were given