# RNN from scratch with PyTorch
A RNN ist just a normal NN.
It's very easy to implement in PyTorch due to its dynamic nature.

We'll build a very simple character based language model.

Taken from http://www.fast.ai/

## Init and helpers

In [1]:
from pathlib import Path
import numpy as np

## Data

In [2]:
NIETSCHE_PATH = Path("../data/raw/nietzsche.txt")
if NIETSCHE_PATH.is_file():
    print("I already have the data.")
else:
    !wget -o ../data/raw/nietzsche.txt https://s3.amazonaws.com/text-datasets/nietzsche.txt
        
with NIETSCHE_PATH.open() as f:
    data = f.read()

I already have the data.


A tweet of Nietzsche:

In [3]:
print(data[:140])

PREFACE


SUPPOSING that Truth is a woman--what then? Is there not ground
for suspecting that all philosophers, in so far as they have been



We need to know the alphabet and we add a padding value "\0" to the alphabet.

In [4]:
alphabet = ["\0", *sorted(list(set(data)))]
n_alphabet = len(alphabet)
n_alphabet

85

In [5]:
char2index = {c: i for i, c in enumerate(alphabet)}
index2char = {i: c for i, c in enumerate(alphabet)}

Convert the data into a list of integers

In [6]:
index = [char2index[c] for c in data]

In [7]:
print(index[:25])
print("".join(index2char[i] for i in index[:25]))

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1, 43, 45, 40, 40, 39, 43, 33, 38, 31, 2, 73, 61, 54, 73, 2]
PREFACE


SUPPOSING that 


In [8]:
index[0: 3]

[40, 42, 29]

In [9]:
X, y = [], []
for i in range(len(index) - 4):
    X.append(index[i : i + 3])
    y.append(index[i + 3])
    
X = np.stack(X)
y = np.stack(y)

In [10]:
X.shape, y.shape

((600889, 3), (600889,))

In [11]:
X[0], y[0]

(array([40, 42, 29]), 30)

In [12]:
type(y)

numpy.ndarray

In [13]:
import torch
from torch.utils.data import DataLoader, Dataset, TensorDataset


train_ds = TensorDataset(torch.from_numpy(X), torch.from_numpy(y))
train_dl = DataLoader(train_ds, batch_size=500)

# The model

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [15]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [16]:
class CharModel(nn.Module):
    def __init__(self, n_vocab, n_embedding, n_hidden):
        super().__init__()
        self.emb = nn.Embedding(n_vocab, n_embedding)
        self.lin_in = nn.Linear(n_embedding, n_hidden)
        
        self.lin_hidden = nn.Linear(n_hidden, n_hidden)
        self.lin_out = nn.Linear(n_hidden, n_vocab)
        
    def forward(self, X):
        c1, c2, c3 = X[:, 0], X[:, 1], X[:, 2]
        
        in1 = F.relu(self.lin_in(self.emb(c1)))
        h = F.tanh(self.lin_hidden(in1))
                   
        in2 = F.relu(self.lin_in(self.emb(c2)))
        h = F.tanh(self.lin_hidden(h + in2))
        
        in3 = F.relu(self.lin_in(self.emb(c3)))
        h = F.tanh(self.lin_hidden(h + in3))
        
        return F.log_softmax(self.lin_out(h), dim=-1)

In [17]:
n_embedding = 40
n_hidden = 256

model = CharModel(n_alphabet, n_embedding=40, n_hidden=128)
model = model.to(device)

In [18]:
optimizer = optim.Adam(model.parameters(), 0.001)
#criterion = nn.CrossEntropyLoss()
criterion = F.nll_loss

In [19]:
def fit(model, n_epoch=2):
    optimizer = optim.Adam(model.parameters(), 0.001)
    
    for epoch in range(n_epoch):
        print(f"Epoch {epoch}:")
        running_loss, correct = 0.0, 0

        model.train()
        for X, y in train_dl:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()

            y_ = model(X)
            loss = criterion(y_, y)

            loss.backward()
            optimizer.step()

            _, y_label_ = torch.max(y_, 1)
            correct += (y_label_ == y).sum().item()
            running_loss += loss.item() * X.shape[0]

        print(f"  Train Loss: {running_loss / len(train_dl.dataset):0.4f}")
        print(f"  Train Acc:  {correct / len(train_dl.dataset):0.2f}")

In [20]:
fit(model, 2)

Epoch 0:




  Train Loss: 2.2342
  Train Acc:  0.37
Epoch 1:
  Train Loss: 1.9212
  Train Acc:  0.44


In [21]:
def predict(word):
    word_idx = [char2index[c] for c in word]
    word_idx
    with torch.no_grad():
        X = torch.tensor(word_idx).unsqueeze(0).to(device)
        model.eval()
        y_ = model(X).cpu()
    pred = index2char[torch.argmax(y_).item()]
    print(f"{word} --> '{pred}'")

In [22]:
predict("the")

the --> ' '


In [23]:
predict("wom")

wom --> 'e'


In [24]:
predict("man")

man --> ' '


In [25]:
predict("hum")

hum --> 'a'


In [26]:
class CharModel(nn.Module):
    def __init__(self, n_vocab, n_embedding, n_hidden):
        super().__init__()
        self.emb = nn.Embedding(n_vocab, n_embedding)
        self.lin_in = nn.Linear(n_embedding, n_hidden)
        self.lin_hidden = nn.Linear(n_hidden, n_hidden)
        self.lin_out = nn.Linear(n_hidden, n_vocab)
        
    def forward(self, X):
        c1, c2, c3 = X[:, 0], X[:, 1], X[:, 2]
        
        in1 = F.relu(self.lin_in(self.emb(c1)))       
        in2 = F.relu(self.lin_in(self.emb(c2)))
        in3 = F.relu(self.lin_in(self.emb(c3)))

        h = F.tanh(self.lin_hidden(in1))
        h = F.tanh(self.lin_hidden(h + in2))
        h = F.tanh(self.lin_hidden(h + in3))
        
        return F.log_softmax(self.lin_out(h), dim=-1)

In [27]:
model = CharModel(n_alphabet, n_embedding=n_embedding, n_hidden=128).to(device)
fit(model)

print()
predict("the")
predict("wom")
predict("man")
predict("hum")

Epoch 0:
  Train Loss: 2.2401
  Train Acc:  0.37
Epoch 1:
  Train Loss: 1.9226
  Train Acc:  0.44

the --> ' '
wom --> 'e'
man --> ' '
hum --> 'a'


In [28]:
class CharModel(nn.Module):
    def __init__(self, n_vocab, n_embedding, n_hidden):
        super().__init__()
        self.emb = nn.Embedding(n_vocab, n_embedding)
        self.lin_in = nn.Linear(n_embedding, n_hidden)
        self.lin_hidden = nn.Linear(n_hidden, n_hidden)
        self.lin_out = nn.Linear(n_hidden, n_vocab)
        
        self.n_hidden = n_hidden
        
    def forward(self, X):
        c1, c2, c3 = X[:, 0], X[:, 1], X[:, 2]
        
        in1 = F.relu(self.lin_in(self.emb(c1)))       
        in2 = F.relu(self.lin_in(self.emb(c2)))
        in3 = F.relu(self.lin_in(self.emb(c3)))
        
        h = torch.zeros(X.shape[0], n_hidden, requires_grad=True).to(device)
        h = F.tanh(self.lin_hidden(h + in1))
        h = F.tanh(self.lin_hidden(h + in2))
        h = F.tanh(self.lin_hidden(h + in3))
        
        return F.log_softmax(self.lin_out(h), dim=-1)

In [29]:
model = CharModel(n_alphabet, n_embedding=n_embedding, n_hidden=n_hidden).to(device)
fit(model)

print()
predict("the")
predict("wom")
predict("man")
predict("hum")

Epoch 0:
  Train Loss: 2.0848
  Train Acc:  0.40
Epoch 1:
  Train Loss: 1.7971
  Train Acc:  0.47

the --> ' '
wom --> 'a'
man --> ' '
hum --> 'a'


In [30]:
class CharModel(nn.Module):
    def __init__(self, n_vocab, n_embedding, n_hidden):
        super().__init__()
        self.emb = nn.Embedding(n_vocab, n_embedding)
        self.lin_in = nn.Linear(n_embedding, n_hidden)
        self.lin_hidden = nn.Linear(n_hidden, n_hidden)
        self.lin_out = nn.Linear(n_hidden, n_vocab)
        
        self.n_hidden = n_hidden
        
    def forward(self, X):
        h = torch.zeros(X.shape[0], n_hidden, requires_grad=True).to(device)
        for i in range(X.shape[1]):
            c = X[:, i]
            in_ = F.relu(self.lin_in(self.emb(c)))
            h = F.tanh(self.lin_hidden(h + in_))

        return F.log_softmax(self.lin_out(h), dim=-1)

In [31]:
model = CharModel(n_alphabet, n_embedding=n_embedding, n_hidden=n_hidden).to(device)
fit(model)

print()
predict("the")
predict("wom")
predict("man")
predict("hum")

Epoch 0:
  Train Loss: 2.0915
  Train Acc:  0.40
Epoch 1:
  Train Loss: 1.8016
  Train Acc:  0.47

the --> ' '
wom --> 'a'
man --> ' '
hum --> 'a'


In [32]:
predict("the huma")

the huma --> 'n'


In [33]:
predict("those ")

those  --> 't'


In [34]:
predict("those o")

those o --> 'f'


In [35]:
predict("those of ")

those of  --> 'h'


In [36]:
predict("those of u")

those of u --> 'n'


You can use `nn.Sequential` to make it a bit more readable.

In [37]:
class CharModel(nn.Module):
    def __init__(self, n_vocab, n_embedding, n_hidden):
        super().__init__()
        self.i2e = nn.Sequential(
            nn.Embedding(n_vocab, n_embedding),
            nn.Linear(n_embedding, n_hidden),
            nn.ReLU(),
        )
        self.h2h = nn.Sequential(
            nn.Linear(n_hidden, n_hidden),
            nn.Tanh(),
        )
        self.h2out = nn.Linear(n_hidden, n_vocab)
        
        self.n_hidden = n_hidden
        
    def forward(self, X):
        h = torch.zeros(X.shape[0], n_hidden, requires_grad=True).to(device)
        for i in range(X.shape[1]):
            c = X[:, i]
            h = self.h2h(h + self.i2e(c))

        return F.log_softmax(self.h2out(h), dim=-1)

In [38]:
model = CharModel(n_alphabet, n_embedding=n_embedding, n_hidden=n_hidden).to(device)
fit(model)

print()
predict("the")
predict("wom")
predict("man")
predict("hum")

Epoch 0:
  Train Loss: 2.0905
  Train Acc:  0.40
Epoch 1:
  Train Loss: 1.7963
  Train Acc:  0.47

the --> ' '
wom --> 'a'
man --> ' '
hum --> 'a'
