In [1]:
# Colab-only setup: mount Google Drive at /content/drive so that files stored
# under MyDrive can be read through the regular file system.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Location of the names corpus on the mounted Drive. The cells below iterate
# over this file line by line and treat every line as one name.
file_path = '/content/drive/MyDrive/ADL/HW5/names.txt'

In [3]:
import torch
import torch.nn as nn
from collections import defaultdict
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import tqdm
import math
from collections import defaultdict

In [4]:
# Placeholder marker; no cell visible in this notebook references it
# (presumably left over from the assignment template - TODO confirm and remove).
FILL_IN = "FILL_IN"

In [5]:
# Lookup tables: itos maps idx -> ch, stoi maps ch -> idx.
# NOTE: both are defaultdict(int), so indexing a missing key silently inserts
# it with value 0 (for stoi that is the index of '.'). Only index them with
# keys that are known to exist; use `in` / .get() for membership checks.
itos = defaultdict(int)
stoi = defaultdict(int)
# Number of characters used to predict the target character in the MLP Language Model
block_size = 3
# Batch size used in MLP Language Model
batch_size = 32
# Embedding dimension, per character
d_model = 10
# Hidden dimension for RNN and also MLP Language Models
d_h = 200

# '.' is the boundary token (used as both START and STOP) and always has index 0
stoi['.'] = 0
itos[0] = '.'

# Assign the next free index to every character in order of first appearance.
# The context manager closes the corpus file once the scan is finished.
idx = 1
with open(file_path, 'r') as names_file:
    for name in names_file:
        name = name.lower().strip()
        for ch in name:
            if ch not in stoi:
                stoi[ch] = idx
                itos[idx] = ch
                idx += 1

In [6]:
# Both lookup tables must describe the same set of symbols.
assert len(itos) == len(stoi)
# Size of the alphabet the models predict over.
vocab_size = len(stoi)
# This corpus is expected to yield exactly 27 distinct symbols, '.' included.
assert vocab_size == 27

In [7]:
# Display the character -> index table built in the vocabulary cell.
stoi

defaultdict(int,
            {'.': 0,
             'e': 1,
             'm': 2,
             'a': 3,
             'o': 4,
             'l': 5,
             'i': 6,
             'v': 7,
             's': 8,
             'b': 9,
             'p': 10,
             'h': 11,
             'c': 12,
             'r': 13,
             't': 14,
             'y': 15,
             'n': 16,
             'g': 17,
             'z': 18,
             'f': 19,
             'd': 20,
             'u': 21,
             'k': 22,
             'w': 23,
             'q': 24,
             'x': 25,
             'j': 26})

## BiGram Language Model
- Implement the Bigram Language Model
- Get all the relevant counts, then compute the perplexity on the training dataset
- Use the class notes

In [None]:
# Collect the counts behind the bigram estimate p(b | a) = c2[(a, b)] / c1[a]:
#   c1[a]      - how often token a appears as the first element of a bigram
#   c2[(a, b)] - how often token a is immediately followed by token b
c1 = defaultdict(int)
c2 = defaultdict(int)
# The context manager closes the corpus file once counting is done.
with open(file_path, 'r') as names_file:
    for name in names_file:
        # Lowercase and remove any whitespace at the end
        name = name.lower().strip()
        # Pad with START = '.' and STOP = '.'
        name = '.' + name + '.'
        # Transform to integer indices
        name = [stoi[ch] for ch in name]
        # Walk over adjacent pairs; the final STOP never starts a bigram,
        # so it is counted in c2 (as a successor) but not in c1.
        for prev_tok, next_tok in zip(name, name[1:]):
            c1[prev_tok] += 1
            c2[(prev_tok, next_tok)] += 1

In [None]:
# Perplexity of the bigram model on the training names:
#   2 ** ( (1 / T) * sum over all bigrams of -log2 p(next | current) )
sumneglogp = 0
T = 0
# The context manager closes the corpus file once the pass is finished.
with open(file_path, 'r') as names_file:
    for name in names_file:
        # Get rid of white space and lowercase
        name = name.lower().strip()
        # Number of predictions for this name: one per character plus the STOP token
        T += (len(name) + 1)
        # Pad with START and STOP (both are '.'), matching how the counts were collected
        name = '.' + name + '.'
        # Transform to integers
        name = [stoi[ch] for ch in name]
        # -log2 p(name) is the sum of the per-bigram -log2 terms
        for i in range(len(name) - 1):
            char1, char2 = name[i], name[i + 1]
            # .get() reads the count without inserting a zero entry into the
            # defaultdict, so evaluating never modifies the count table.
            pair_count = c2.get((char1, char2), 0)
            if pair_count == 0:
                # Unseen bigram: fall back to a tiny probability instead of log2(0)
                p = 1e-10
            else:
                p = pair_count / c1[char1]
            sumneglogp += -math.log2(p)
# Print the Perplexity (label text kept as-is so it matches the recorded output)
print('Preplexity: ', torch.pow(2, torch.tensor(sumneglogp / T )).item())

Preplexity:  11.634961128234863


In [None]:
# Sample one name from the bigram model: start at START ('.') and keep drawing
# the next character until the STOP token (index 0) comes up.
name = '.'
while True:
    # Index of the most recently generated character
    c = stoi[name[-1]]
    # Transition row p[d] = P(next = d | current = c), indexed exactly like itos.
    # .get() reads the counts without inserting zero entries into the
    # defaultdicts, so sampling leaves c1 / c2 untouched.
    row_total = c1.get(c, 0)
    p = []
    for d in range(vocab_size):
        if row_total > 0:
            p.append(c2.get((c, d), 0) / row_total)
        else:
            p.append(0.0)
    p = torch.tensor(p)
    assert len(p) == vocab_size
    # Draw the next index in proportion to p; torch.multinomial raises if the
    # whole row is zero.
    c = torch.multinomial(p, num_samples=1)
    # Index 0 is '.', which acts as STOP here
    if c.item() == 0:
        break
    else:
        name += itos[c.item()]
# Drop the leading START token before printing
print('Generated name: ' , name[1:])

Generated name:  memar


## MLP Language Model

- Implement the MLP language model from below
- Look at page 7, Equation (1)
- https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf

In [None]:
# Training pairs for the MLP model:
#   x_data[k] - indices of block_size consecutive context characters
#   y_data[k] - index of the character that follows that context
x_data = []
y_data = []
# The context manager closes the corpus file once all pairs are extracted.
with open(file_path, 'r') as names_file:
    for name in names_file:
        name = name.lower().strip()
        # Pad with block_size START tokens and 1 STOP token
        name = '.' * block_size + name + '.'
        # Slide a window of block_size characters over the padded name; the
        # character right after the window is the prediction target.
        for i in range(len(name) - block_size):
            x_data.append([stoi[ch] for ch in name[i:i + block_size]])
            y_data.append(stoi[name[i + block_size]])

In [None]:
class MLPLanguageModel(nn.Module):
    """Feed-forward character language model with a direct input-to-output path.

    The next-character logits are ``W(x) + U(tanh(H(x)))``, where ``x`` is the
    concatenation of the embeddings of the context characters. Layer sizes are
    taken from the notebook-level ``vocab_size``, ``d_model``, ``block_size``
    and ``d_h`` at construction time.
    """

    def __init__(self):
        super().__init__()
        # Width of the concatenated context embedding
        context_dim = block_size * d_model
        # One d_model-dimensional embedding per vocabulary entry
        self.e = nn.Embedding(vocab_size, d_model)
        # H: concatenated context -> hidden layer
        self.H = nn.Linear(context_dim, d_h)
        # U: hidden layer -> logits
        self.U = nn.Linear(d_h, vocab_size)
        # W: skip connection, concatenated context -> logits
        self.W = nn.Linear(context_dim, vocab_size)

    def forward(self, x):
        """Return logits over the vocabulary for each context in ``x``.

        ``x`` is expected to hold character indices with shape
        (batch_size, block_size); the result has one row of ``vocab_size``
        logits per context.
        """
        embedded = self.e(x)
        # Concatenate the per-character embeddings of each context into one row
        flat = embedded.view(embedded.size(0), -1)
        hidden = torch.tanh(self.H(flat))
        return self.W(flat) + self.U(hidden)

In [None]:
# Display the first (context, target) pair of the MLP training set.
x_data[0], y_data[0]

([0, 0, 0], 1)

In [None]:
# Wrap the (context, target) pairs in tensors and serve them as shuffled
# mini-batches of batch_size examples.
dl = DataLoader(
    dataset=TensorDataset(torch.tensor(x_data), torch.tensor(y_data)),
    batch_size=batch_size,
    shuffle=True,
)

In [None]:
# Sanity-check the shape of the first batch only. The context width is compared
# against block_size rather than a literal, so the check stays valid when the
# configuration changes.
for xb, yb in dl:
    assert xb.shape == (batch_size, block_size)
    assert yb.shape == (batch_size,)
    break

In [None]:
# Fresh MLP language model, optimised with Adam at a learning rate of 1e-3
model = MLPLanguageModel()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Running sum of batch losses and number of batches since the last report
total_loss = 0
total_ct = 0
total_epochs = 20

for _ in range(total_epochs):
    for xb, yb in dl:
        # Start every step from clean gradients
        optimizer.zero_grad()

        # Forward pass and mean cross-entropy over the batch
        logits = model(xb)
        loss = F.cross_entropy(logits, yb)

        # Backward pass
        loss.backward()

        # Rescale the gradients so their total norm is at most 0.1
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.1)

        # Parameter update
        optimizer.step()

        # Book-keeping for the running average
        total_loss += loss.item()
        total_ct += 1

        # Report the mean loss of the last 500 batches, then start a new window
        if total_ct % 500 == 0:
            print(total_loss / total_ct)
            total_loss = 0
            total_ct = 0

2.6053924565315247
2.436605429649353
2.40250626373291
2.363287213087082
2.3462562260627746
2.3277494304180144
2.338515154361725
2.342711992740631
2.311801985502243
2.292671504497528
2.2905889723300934
2.295849435567856
2.2754329426288606
2.272705602645874
2.2835639333724975
2.2766950030326845
2.2493808541297913
2.241417848110199
2.246465926885605
2.257120555639267
2.244067485332489
2.23559250831604
2.2409944989681243
2.220934106349945
2.229681006193161
2.2235280320644377
2.2382821826934816
2.214663508653641
2.22204710149765
2.2306108002662657
2.204568883895874
2.216143187522888
2.2112326803207396
2.204763936758041
2.190671452999115
2.208361694097519
2.2099015822410584
2.2041461975574492
2.192136327266693
2.1921737818717957
2.199219096660614
2.1866283791065215
2.179388942480087
2.183414498329163
2.1937762246131896
2.175004707813263
2.169992664575577
2.17849192070961
2.1871815695762633
2.1947344074249266
2.164343858718872
2.175243107557297
2.168392813920975
2.160478837251663
2.1920420763

In [None]:
with torch.no_grad():
    # Perplexity of the MLP model on the training names:
    #   2 ** ( (1 / T) * sum over all predictions of -log2 p(target | context) )
    # A tensor accumulator keeps the final torch.pow call valid even when the
    # corpus yields no names.
    sumneglogp = torch.zeros(())
    T = 0
    # The context manager closes the corpus file once the pass is finished.
    with open(file_path, 'r') as names_file:
        for name in names_file:
            name = name.lower().strip()
            # One prediction per character plus one for the STOP token
            T += (len(name) + 1)
            # Pad with block_size START tokens and one STOP token
            name = '.' * block_size + name + '.'
            # Per-name buffers. Deliberately NOT called x_data / y_data: those
            # names hold the full training set, and overwriting them here would
            # corrupt a later re-run of the DataLoader cell.
            contexts = []
            targets = []
            # -log p(...abc.) = -log p(a | ...) - log p(b | ..a) - log p(c | .ab) - ...
            for i in range(len(name) - block_size):
                contexts.append([stoi[ch] for ch in name[i:i + block_size]])
                targets.append(stoi[name[i + block_size]])

            # One row of logits per prediction in this name
            logits = model(torch.tensor(contexts))

            # reduction="sum" gives the total negative log-likelihood (in nats)
            loss = F.cross_entropy(logits, torch.tensor(targets), reduction='sum')

            # Change to log base 2
            loss *= math.log2(math.e)

            sumneglogp += loss

    # Label text kept as-is so it matches the recorded output
    print('Preplexity: ', torch.pow(2, sumneglogp / T).item())

Preplexity:  7.962662220001221


In [None]:
# Sample one name from the MLP model: start from block_size START tokens and
# keep drawing the next character until the STOP token (index 0) comes up.
name = '.' * block_size
# Sampling needs no gradients; no_grad avoids building an autograd graph on
# every forward pass.
with torch.no_grad():
    while True:
        # Indices of the last block_size characters form the context
        context = [stoi[ch] for ch in name[-block_size:]]
        # Distribution over the next character, shape (1, vocab_size)
        p = F.softmax(model(torch.tensor([context])), dim=1)
        # Randomly sample the next character index from p
        next_idx = torch.multinomial(p, num_samples=1).item()
        if next_idx == 0:
            break
        name += itos[next_idx]
# Drop the START padding before printing
print('Generated name: ' , name[block_size:])

Generated name:  jabrinn


## RNN Language Model
- For each name, run an RNN character by character
- Use the recursion h = Tanh()(Wh @ h + Wx @ x + bh + bx) and y = Softmax()(Wy @ h + by)
- Do not use the RNN Cell from PyTorch, do this manually as hinted below

In [8]:
class RNNLanguageModel(nn.Module):
    """Character-level recurrent language model evaluated one step at a time.

    Each call consumes one token and the previous hidden state and applies
    ``h' = tanh(Wh(h) + Wx(e(x)))`` followed by ``z = Wy(h')``. Layer sizes are
    taken from the notebook-level ``vocab_size``, ``d_model`` and ``d_h`` at
    construction time.
    """

    def __init__(self):
        super().__init__()
        # One d_model-dimensional embedding per vocabulary entry
        self.e = nn.Embedding(vocab_size, d_model)
        # Hidden -> hidden transition
        self.Wh = nn.Linear(d_h, d_h)
        # Token embedding -> hidden
        self.Wx = nn.Linear(d_model, d_h)
        # Hidden -> logits over the vocabulary
        self.Wy = nn.Linear(d_h, vocab_size)

    def forward(self, x, h):
        """Advance the recurrence by one step.

        ``x`` holds the index of the current token and ``h`` the previous hidden
        state. Returns ``(z, h_next)``: the logits used to predict the following
        token, and the updated hidden state to feed into the next step.
        """
        # The token embedding is the input feature vector of the recurrence
        token_emb = self.e(x)
        # Combine the input and the previous state into the new hidden state
        h_next = torch.tanh(self.Wx(token_emb) + self.Wh(h))
        return self.Wy(h_next), h_next

In [9]:
# Fresh RNN language model, optimised with Adam at a learning rate of 1e-4.
# From here on `model` and `optimizer` refer to the RNN, not the MLP.
model = RNNLanguageModel()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.0001)

In [10]:
# Running sum of per-name losses and number of names since the last report
total_loss = 0
total_ct = 0
total_epochs = 10

for _ in range(total_epochs):
    # Re-read the corpus every epoch; the context manager closes the handle
    # instead of leaving one open file behind per epoch.
    with open(file_path, 'r') as names_file:
        for name in names_file:
            name = name.lower().strip()
            # Add the START and STOP token (both '.')
            name = '.' + name + '.'
            # Inputs are all tokens but the last; targets are the same sequence
            # shifted by one. Local names are used so the MLP training set held
            # in x_data / y_data is not overwritten.
            inputs = [stoi[ch] for ch in name[:-1]]
            targets = [stoi[ch] for ch in name[1:]]
            logits = []
            # Every name starts from an all-zero hidden state
            h = torch.zeros(1, d_h)
            # Zero the grad
            optimizer.zero_grad()

            # Feed the tokens one by one, threading h through the steps and
            # collecting the logits of every step
            for x in inputs:
                x = torch.tensor(x).view(1)
                z, h = model(x, h)
                logits.append(z)

            # Stack the per-step logits into one (len(inputs), vocab_size) tensor
            logits = torch.cat(logits)

            # Mean cross-entropy over the steps of this name
            loss = F.cross_entropy(logits, torch.tensor(targets))

            # Backpropagate through all time steps
            loss.backward()

            # Clip the gradients at max norm 0.1
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)

            # Do a gradient update
            optimizer.step()

            # One "batch" here is one name
            total_loss += loss.item()
            total_ct += 1

            # Report the mean loss of the last 100 names, then start a new window
            if total_ct % 100 == 0:
                print(total_loss / total_ct)
                total_loss = 0
                total_ct = 0


3.1895894956588746
2.8844231510162355
2.638272862434387
2.5558280181884765
2.5070971298217772
2.496905882358551
2.342495483160019
2.326489473581314
2.3527750527858733
2.414100458621979
2.3644703817367554
2.3383925557136536
2.3404877650737763
2.359119316339493
2.311307729482651
2.2736580955982206
2.3249522376060487
2.3156863403320314
2.3341918444633483
2.2605009019374847
2.3849704205989837
2.3202267694473266
2.3042042875289916
2.3782431757450104
2.330101571083069
2.292976791858673
2.282235687971115
2.2708072197437286
2.289008790254593
2.2592110979557036
2.2520893132686615
2.292061687707901
2.240769283771515
2.248006786108017
2.2614398682117463
2.2260755062103272
2.313931061029434
2.312238084077835
2.2515999126434325
2.2480646646022797
2.2825918185710905
2.3468603897094726
2.240620311498642
2.2877870321273805
2.2458071315288546
2.24307461977005
2.265151627063751
2.226843634843826
2.2388923370838167
2.2466877126693725
2.2605516648292543
2.2968906223773957
2.253459953069687
2.3157957971096

In [11]:
with torch.no_grad():
    # Perplexity of the RNN model on the training names:
    #   2 ** ( (1 / T) * sum over all predictions of -log2 p(target | prefix) )
    # A tensor accumulator keeps the final torch.pow call valid even when the
    # corpus yields no names.
    sumneglogp = torch.zeros(())
    T = 0
    # The context manager closes the corpus file once the pass is finished.
    with open(file_path, 'r') as names_file:
        for name in names_file:
            name = name.lower().strip()
            # One prediction per character plus one for the STOP token
            T += (len(name) + 1)
            name = '.' + name + '.'
            # Inputs: every token except the last. Targets: every token except
            # the first. Local names are used so x_data / y_data from the MLP
            # section are left untouched.
            inputs = [stoi[ch] for ch in name[:-1]]
            targets = [stoi[ch] for ch in name[1:]]
            # Logits per token prediction
            logits = []
            # Every name starts from an all-zero hidden state, as in training
            h = torch.zeros(1, d_h)
            # Feed the tokens one by one, threading h through the steps
            for x in inputs:
                x = torch.tensor(x).view(1)
                z, h = model(x, h)
                logits.append(z)

            # Stack the per-step logits into one tensor
            logits = torch.cat(logits)

            # Total negative log-likelihood of this name (in nats)
            loss = F.cross_entropy(logits, torch.tensor(targets), reduction='sum')

            # Change to log base 2: log2(x) = ln(x) * log2(e)
            loss *= math.log2(math.e)

            sumneglogp += loss

    # sumneglogp is the summed -log2 p over all names; dividing by the number
    # of predictions T gives the average bits per prediction.
    # Label text kept as-is so it matches the recorded output.
    print('Preplexity: ', torch.pow(2, sumneglogp / T).item())

Preplexity:  8.773097038269043


In [13]:
# Sample one name from the RNN model: start at START ('.') with an all-zero
# hidden state and keep drawing the next character until STOP (index 0) comes up.
name = '.'
h = torch.zeros(1, d_h)
# Sampling needs no gradients. Without no_grad the autograd graph would keep
# growing through h with every generated character.
with torch.no_grad():
    while True:
        # Index of the most recent character as a 1-element tensor
        c = torch.tensor([stoi[name[-1]]])
        # One recurrence step: logits for the next character and the new state
        logits, h = model(c, h)
        # Distribution over the next character, shape (1, vocab_size)
        p = F.softmax(logits, dim=1)
        # Sample from p
        next_idx = torch.multinomial(p, num_samples=1).item()
        # If we generate '.', stop
        if next_idx == 0:
            break
        name += itos[next_idx]
# Drop the leading START token before printing
print('Generated name: ' , name[1:])

Generated name:  yonger
