Chapter 15

In [1]:
import torch
import torch.nn as nn

torch.manual_seed(1)
rnn_layer = nn.RNN(input_size=5, hidden_size=2,
                   num_layers=1, batch_first=True)
w_xh = rnn_layer.weight_ih_l0
w_hh = rnn_layer.weight_hh_l0
b_xh = rnn_layer.bias_ih_l0
b_hh = rnn_layer.bias_hh_l0

print('W_xh shape:', w_xh)
print()
print('W_xh shape:', w_xh.shape)
print('W_hh shape:', w_hh.shape)
print('b_xh shape:', b_xh.shape)
print('b_hh shape:', b_hh.shape)

W_xh shape: Parameter containing:
tensor([[ 0.3643, -0.3121, -0.1371,  0.3319, -0.6657],
        [ 0.4241, -0.1455,  0.3597,  0.0983, -0.0866]], requires_grad=True)

W_xh shape: torch.Size([2, 5])
W_hh shape: torch.Size([2, 2])
b_xh shape: torch.Size([2])
b_hh shape: torch.Size([2])


In [2]:
x_seq = torch.tensor([[1.0]*5, [2.0]*5, [3.0]*5]).float()

## output of the simple RNN:
output, hn = rnn_layer(torch.reshape(x_seq, (1, 3, 5)))

## manually computing the output:
out_man = []
for t in range(3):
    xt = torch.reshape(x_seq[t], (1, 5))
    print(f'Time step {t} =>')
    print('    Input           :', xt.numpy())
    
    ht = torch.matmul(xt, torch.transpose(w_xh, 0, 1)) + b_hh
    print('   Hidden           :', ht.detach().numpy())
    if t > 0:
         prev_h = out_man[t-1]
    else:
         prev_h = torch.zeros((ht.shape))
    ot = ht + torch.matmul(prev_h, torch.transpose(w_hh, 0, 1)) \
            + b_hh
    ot = torch.tanh(ot)
    out_man.append(ot)
    print('    Output (manual) :', ot.detach().numpy())
    print('    RNN output      :', output[:, t].detach().numpy())
    print()

Time step 0 =>
    Input           : [[1. 1. 1. 1. 1.]]
   Hidden           : [[-0.3161478   0.64722455]]
    Output (manual) : [[-0.21046415  0.56788784]]
    RNN output      : [[-0.3519801   0.52525216]]

Time step 1 =>
    Input           : [[2. 2. 2. 2. 2.]]
   Hidden           : [[-0.73478645  1.2972739 ]]
    Output (manual) : [[-0.5741978  0.7945334]]
    RNN output      : [[-0.68424344  0.76074266]]

Time step 2 =>
    Input           : [[3. 3. 3. 3. 3.]]
   Hidden           : [[-1.153425   1.9473232]]
    Output (manual) : [[-0.8130059   0.91817397]]
    RNN output      : [[-0.8649416   0.90466356]]



#Project one – predicting the sentiment of IMDb movie reviews

In [3]:
from torchtext.datasets import IMDB
from collections.abc import Iterable

train_dataset = IMDB(split='train')
test_dataset = IMDB(split='test')

In [5]:
## Step 1: create the datasets

In [4]:
from torch.utils.data.dataset import random_split

torch.manual_seed(1)
train_dataset, valid_dataset = random_split(
    list(train_dataset), [20000, 5000])

In [7]:
## Step 2: find unique tokens (words)
import re 
from collections import Counter, OrderedDict

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall(
        '(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower()
    )
    text = re.sub('[\W]+', ' ', text.lower()) + \
        ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    
    return tokenized


token_counts = Counter()
for label, line in train_dataset:
    tokens = tokenizer(line)
    token_counts.update(tokens)
print('Vocab-size:', len(token_counts))

Vocab-size: 69023


In [27]:
# token_counts

In [8]:
## step 3: encoding each uniqe token into integers

In [14]:
from torchtext.vocab import vocab

sorted_by_freq_tuples = sorted(
    token_counts.items(), key=lambda x: x[1], reverse=True)

ordered_dict = OrderedDict(sorted_by_freq_tuples)
vocab = vocab(ordered_dict)
vocab.insert_token('<pad>', 0)
vocab.insert_token('<unk>', 1)
vocab.set_default_index(1)

In [28]:
ordered_dict

OrderedDict([('the', 267877),
             ('and', 130797),
             ('a', 130057),
             ('of', 116119),
             ('to', 107513),
             ('is', 85847),
             ('it', 76964),
             ('in', 74646),
             ('i', 69938),
             ('this', 60714),
             ('that', 58340),
             ('s', 50537),
             ('was', 38365),
             ('as', 37322),
             ('for', 35262),
             ('with', 35163),
             ('movie', 35149),
             ('but', 34034),
             ('film', 32139),
             ('t', 27384),
             ('on', 27246),
             ('you', 27236),
             ('not', 24329),
             ('he', 24075),
             ('are', 23613),
             ('his', 23434),
             ('have', 21941),
             ('one', 21383),
             ('be', 21260),
             ('all', 19102),
             ('at', 18751),
             ('they', 18203),
             ('by', 17895),
             ('an', 17204),
             ('who', 

In [18]:
print([vocab[token] for token in ['this', 'is', 'an', 'example']])

[11, 7, 35, 457]
