# Homework 4

Name: Stanly Gomes

Student ID: 801118166

GitHub Repository: https://github.com/stanlygomes/RealTimeML

In [2]:
from tqdm import tqdm
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, random_split, Subset

import numpy as np
import pandas as pd
import cv2
from matplotlib import pyplot as plt
from ptflops import get_model_complexity_info

import torchvision
from torchvision import transforms, datasets
from torchmetrics.classification import MulticlassAccuracy, BinaryAccuracy
from torchmetrics import ConfusionMatrix
from datetime import datetime

## Import and preprocess TimeMachine.txt Dataset

In [28]:
import urllib.request
import re
import collections

url = 'https://www.gutenberg.org/files/35/old/35.txt'
start_string = 'The Time Machine, by H. G. Wells [1898]\r\n\r\n'

with urllib.request.urlopen(url) as response:
    text = response.read().decode('utf-8')

# Find the index of the start string
start_idx = text.find(start_string)
if start_idx == -1:
    raise ValueError('Start string not found')

# Extract the text after the start string
text = text[start_idx+len(start_string):]

# Remove any remaining metadata at the end of the file
end_string = '*** END OF THIS PROJECT GUTENBERG EBOOK THE TIME MACHINE ***'
end_idx = text.find(end_string)
if end_idx != -1:
    text = text[:end_idx]
    
# Saving unprocessed dataset
if not os.path.exists('timemachine.txt'):
    with open('timemachine.txt', 'w') as f:
        f.write(text)
else:
    print('File exists already!')

# Preprocess the text
lines = [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in text.split('\n')]

print(f'# text lines: {len(lines)}')
print(lines[0])
print(lines[10])

File exists already!
# text lines: 3229

lights in the lilies of silver caught the bubbles that flashed and


In [29]:
def tokenize(lines, token='word'):
    if token == 'word':
        return [line.split() for line in lines]
    elif token == 'char':
        return [list(line) for line in lines]
    else:
        print('Error: Unkown token type:' + token)

tokens = tokenize(lines)
for i in range(11):
    print(tokens[i])

[]
[]
[]
['i']
[]
[]
['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him']
['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and']
['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']
['fire', 'burned', 'brightly', 'and', 'the', 'soft', 'radiance', 'of', 'the', 'incandescent']
['lights', 'in', 'the', 'lilies', 'of', 'silver', 'caught', 'the', 'bubbles', 'that', 'flashed', 'and']


In [30]:
def count_corpus(tokens):
    # Here `tokens` is a 1D list or 2D list
    if len(tokens) == 0 or isinstance(tokens[0], list):
        # Flatten a list of token lists into a list of tokens
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)

In [31]:
# Implement vocabulary
class Vocab:
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        
        counter = count_corpus(tokens)
        self._token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                   reverse=True)
        self.idx_to_token = ['<unk>'] + reserved_tokens
        self.token_to_idx = {token: idx
                             for idx, token in enumerate(self.idx_to_token)}
        for token, freq in self._token_freqs:
            if freq < min_freq:
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1
                
    def __len__(self):
        return len(self.idx_to_token)
    
    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]
    
    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]
    
    def unk(self):
        return 0
    
    def token_freqs(self):
        return self._token_freqs

In [32]:
vocab = Vocab(tokens)
print(list(vocab.token_to_idx.items())[:10])

[('<unk>', 0), ('the', 1), ('i', 2), ('and', 3), ('of', 4), ('a', 5), ('to', 6), ('was', 7), ('in', 8), ('that', 9)]


In [33]:
for i in [0, 10]:
    print('words:', tokens[i])
    print('indices:', vocab[tokens[i]])

words: []
indices: []
words: ['lights', 'in', 'the', 'lilies', 'of', 'silver', 'caught', 'the', 'bubbles', 'that', 'flashed', 'and']


TypeError: 'dict' object is not callable

## Problem 1: Train RNN, LSTM, and GRU on varying hyperparameters

In [4]:
# Implement GRU architecture from lecture/D2L textbook
class GRUScratch(nn.Module):
    def __init__(self, num_inputs, hidden_states, sigma=0.01):
        super().__init__()
        
        # Draw the weights from a Gaussian distribution with standard deviation to be sigma, setting bias to 0
        initial_weight = lambda *shape: nn.Parameter(torch.randn(*shape) * sigma)
        triple = lambda: (initial_weight(num_inputs, hidden_states),
                          initial_weight(hidden_states, hidden_states),
                          nn.Parameter(torch.zeroes(hidden_states)))
        
        self.W_xz, self.W_hz, self.b_z = triple()  # Update gate
        self.W_xr, self.W_hr, self.b_r = triple()  # Reset gate
        self.W_xh, self.W_hh, self.b_h = triple()  # Candidate hidden state
    
    def forward(self, inputs, H=None):
        if H is None:
            # Initial state with shape: (batch_size, num_hiddens)
            H = torch.zeros((inputs.shape[1], self.hidden_states),
                          device=inputs.device)
        outputs = []
        for X in inputs:
            Z = torch.sigmoid(torch.matmul(X, self.W_xz) +
                            torch.matmul(H, self.W_hz) + self.b_z)
            R = torch.sigmoid(torch.matmul(X, self.W_xr) +
                            torch.matmul(H, self.W_hr) + self.b_r)
            H_tilde = torch.tanh(torch.matmul(X, self.W_xh) +
                            torch.matmul(R * H, self.W_hh) + self.b_h)
            H = Z * H + (1 - Z) * H_tilde
            outputs.append(H)
        return outputs, H