In [1]:
import numpy as np
import torch

In [2]:
# Read names from file
with open("names.txt", "r") as f:
    names = f.readlines()

In [3]:
names = [name.strip() for name in names]
print(names[:3])
print(len(names))

['emma', 'olivia', 'ava']
32033


In [4]:
letter_set = sorted(list({l for name in names for l in name}))
letter_set.insert(0, '.')
len(letter_set)

27

In [5]:
stoi = {letter: pos for pos, letter in enumerate(letter_set)}
itos = {pos: letter for letter, pos in stoi.items()}

In [6]:
pairs = [(a, b) for a in letter_set  for b in letter_set if b!= '.']
pairs.insert(0, ('.', '.'))

In [7]:
ptoi = {pair: pos for pos, pair in enumerate(pairs)}
itop = {pos: pair for pair, pos in ptoi.items()}

In [8]:
len(pairs)

703

### E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?

In [9]:
# Create a 27 by 27 by 27 matrix to count all the number of occurences of trigrams
lookup_table = torch.ones((703, 27), dtype=torch.int32)

In [10]:
# Fill the lookup table with the counts of each trigram
for name in names:
    name = ['.', '.'] + list(name) + ['.']
    for char1, char2, char3 in zip(name, name[1:], name[2:]):
        p1, p2 = ptoi[(char1, char2)],  stoi[char3]
        lookup_table[p1, p2] += 1

In [11]:
def get_occurences(char1, char2, char3):
    return lookup_table[ptoi[(char1, char2)], stoi[char3]]

In [12]:
get_occurences('.', '.', 'a')

tensor(4411, dtype=torch.int32)

In [13]:
# Normalize the count table 
lookup_table = torch.div(lookup_table, torch.sum(lookup_table, dim=1, keepdims=True))
torch.sum(lookup_table)

tensor(703.0001)

In [14]:
# Get some new predictions using the counts table
gen = torch.Generator().manual_seed(2147483647)
output_count = []
for _ in range(10):
    prev = ('.', '.')
    out = []
    while True:
        idx = torch.multinomial(lookup_table[ptoi[prev]], num_samples=1, replacement=True, generator=gen).item()
        if idx==0:
            break
        out.append(itos[idx])
        prev = (prev[1], itos[idx])
    output_count.append("".join(out))
    print("", "".join(out))

 junide
 jakasid
 prelay
 adin
 kairritoper
 sathen
 sameia
 yanileniassibduinrwin
 lessiyanayla
 te


In [17]:
# Calculate the loss (negative log likelihood loss) and compare it to the loss of the bigram model (previously done)
log_likelihood = 0.0 
num_samples = 0
for name in names:
    name = ['.', '.'] + list(name) + ['.']
    for char1, char2, char3 in zip(name, name[1:], name[2:]):
        prob = lookup_table[ptoi[(char1, char2)], stoi[char3]]
        log_likelihood += torch.log(prob)
        num_samples += 1
nll = -log_likelihood
print(f"{log_likelihood=}")
print(f"nll_loss: {nll/num_samples:.4f}")
print(f"bigram_loss: 2.4544")

log_likelihood=tensor(-504653.)
nll_loss: 2.2120
bigram_loss: 2.4544


In [78]:
# Create trigram samples to train a gradient based model
xs, ys = [], []
for name in names:
    name = ['.', '.'] + list(name) + ['.']
    for char1, char2, char3 in zip(name, name[1:], name[2:]):
        xs.append(ptoi[(char1, char2)])
        ys.append(stoi[char3])
        
xs = torch.tensor(xs)
ys = torch.tensor(ys)
print(f"Number of samples: {xs.nelement()}")

Number of samples: 228146


In [79]:
# Change the inputs into one hot vectors and initialize the weights
x_oh = torch.nn.functional.one_hot(xs, num_classes=703).float()
# Reshape the input matrix from [_, 2, 27] to [_, 54] to make the multiplication easier
# x_oh = x_oh.view(x_oh.shape[0], x_oh.shape[1] * x_oh.shape[2])
W = torch.randn((703, 27), requires_grad=True)
print(f"Shape of encoded inputs: {x_oh.shape}")
print(f"Shape of weights matrix: {W.shape}")
print(f"Shape of labels vector: {ys.shape}")

Shape of encoded inputs: torch.Size([228146, 703])
Shape of weights matrix: torch.Size([703, 27])
Shape of labels vector: torch.Size([228146])


In [75]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
W = W.to(device)

In [164]:
# Training
for _ in range(1000):
    logits = torch.matmul(x_oh.to(device), W.to(device))
    counts = logits.exp() # Exponentiate to get the counts 
    probs = torch.div(counts, torch.sum(counts, dim=1, keepdims=True)) # Normalize the counts
    loss = -probs[torch.arange(x_oh.shape[0]), ys].log().mean() #+ 0.01 * (W ** 2).mean() # Calculate the nll loss
    print(f"Loss: {loss.item()}")
    W.grad = None # clear the gradients
    loss.backward()
#     W.retain_grad()
    W.data += -40 * W.grad # update the weights

Loss: 2.198673725128174
Loss: 2.198671579360962
Loss: 2.198669672012329
Loss: 2.198667526245117
Loss: 2.1986653804779053
Loss: 2.1986632347106934
Loss: 2.1986608505249023
Loss: 2.1986589431762695
Loss: 2.1986567974090576
Loss: 2.1986546516418457
Loss: 2.198652505874634
Loss: 2.198650360107422
Loss: 2.198648452758789
Loss: 2.198646068572998
Loss: 2.1986441612243652
Loss: 2.1986420154571533
Loss: 2.1986398696899414
Loss: 2.1986377239227295
Loss: 2.1986355781555176
Loss: 2.1986334323883057
Loss: 2.1986312866210938
Loss: 2.198629379272461
Loss: 2.19862699508667
Loss: 2.198624849319458
Loss: 2.198622703552246
Loss: 2.1986207962036133
Loss: 2.1986186504364014
Loss: 2.1986162662506104
Loss: 2.1986143589019775
Loss: 2.1986122131347656
Loss: 2.1986100673675537
Loss: 2.198608160018921
Loss: 2.19860577583313
Loss: 2.198603630065918
Loss: 2.198601722717285
Loss: 2.1985995769500732
Loss: 2.1985974311828613
Loss: 2.1985952854156494
Loss: 2.1985929012298584
Loss: 2.1985909938812256
Loss: 2.1985888481

Loss: 2.1980013847351074
Loss: 2.1979994773864746
Loss: 2.197997570037842
Loss: 2.197995901107788
Loss: 2.1979939937591553
Loss: 2.1979920864105225
Loss: 2.1979899406433105
Loss: 2.1979880332946777
Loss: 2.197986125946045
Loss: 2.197984218597412
Loss: 2.1979823112487793
Loss: 2.1979806423187256
Loss: 2.1979787349700928
Loss: 2.197976589202881
Loss: 2.197974920272827
Loss: 2.1979727745056152
Loss: 2.1979711055755615
Loss: 2.1979691982269287
Loss: 2.197967290878296
Loss: 2.197965383529663
Loss: 2.1979634761810303
Loss: 2.1979618072509766
Loss: 2.1979598999023438
Loss: 2.197957992553711
Loss: 2.197956085205078
Loss: 2.1979541778564453
Loss: 2.1979522705078125
Loss: 2.1979503631591797
Loss: 2.197948455810547
Loss: 2.197946548461914
Loss: 2.1979446411132812
Loss: 2.1979427337646484
Loss: 2.1979410648345947
Loss: 2.197939157485962
Loss: 2.197937250137329
Loss: 2.1979353427886963
Loss: 2.1979334354400635
Loss: 2.1979315280914307
Loss: 2.197929859161377
Loss: 2.197927951812744
Loss: 2.19792604

Loss: 2.197397470474243
Loss: 2.1973958015441895
Loss: 2.1973941326141357
Loss: 2.197392225265503
Loss: 2.197390556335449
Loss: 2.1973891258239746
Loss: 2.197387456893921
Loss: 2.197385549545288
Loss: 2.1973841190338135
Loss: 2.1973822116851807
Loss: 2.197380781173706
Loss: 2.1973788738250732
Loss: 2.1973772048950195
Loss: 2.197375535964966
Loss: 2.197373867034912
Loss: 2.1973719596862793
Loss: 2.1973702907562256
Loss: 2.197368621826172
Loss: 2.197366714477539
Loss: 2.1973652839660645
Loss: 2.1973636150360107
Loss: 2.197361946105957
Loss: 2.197360038757324
Loss: 2.1973583698272705
Loss: 2.197356700897217
Loss: 2.197355031967163
Loss: 2.1973533630371094
Loss: 2.1973516941070557
Loss: 2.197350025177002
Loss: 2.1973483562469482
Loss: 2.1973464488983154
Loss: 2.197345018386841
Loss: 2.197343349456787
Loss: 2.1973416805267334
Loss: 2.1973400115966797
Loss: 2.197338342666626
Loss: 2.197336435317993
Loss: 2.1973350048065186
Loss: 2.1973330974578857
Loss: 2.197331428527832
Loss: 2.197329759597

In [165]:
# Comparing the loss of bigram model and trigram model
print("Loss on bigram (gradient based): 2.4804")
print(f"Loss on trigram: {loss:.4f}")

Loss on bigram (gradient based): 2.4804
Loss on trigram: 2.1969


In [166]:
W.shape

torch.Size([703, 27])

In [167]:
# Generate some names and compare with the names from counting method
gen = torch.Generator(device=device).manual_seed(2147483647)
output_gradient = []
for _ in range(10):
    out = []
    char1 = '.'
    char2 = '.'
    
    while True:
        x_enc = torch.nn.functional.one_hot(torch.tensor([ptoi[(char1, char2)]]), num_classes=703).float().to(device)
        
        logits = torch.matmul(x_enc.to(device), W.to(device))
        counts = logits.exp()
        probs = torch.div(counts, torch.sum(counts, dim=1, keepdims=True))
        
        idx = torch.multinomial(probs, num_samples=1, replacement=True, generator=gen).item()
        if idx == 0:
            break
        char1 = char2
        char2 = itos[idx]
        out.append(itos[idx])
    output_gradient.append("".join(out))

In [168]:
print("{:<25}\t{}".format("Counting based: ", "Gradient based: "))
print("----------------------------------------------------")
for out1, out2 in zip(output_count, output_gradient):
    print("{:<25}\t{:<20}".format(out1, out2))


Counting based:          	Gradient based: 
----------------------------------------------------
junide                   	khen                
jakasid                  	ephyling            
prelay                   	yulagolbiahen       
adin                     	ramson              
kairritoper              	macxonnan           
sathen                   	rine                
sameia                   	delenlian           
yanileniassibduinrwin    	ermarishan          
lessiyanayla             	any                 
te                       	aleedon             


### E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

In [169]:
x_oh.shape, ys.shape

(torch.Size([228146, 703]), torch.Size([228146]))

In [170]:
# Split the dataset into train, validation and test sets
train_idx, val_idx, test_idx = torch.utils.data.random_split(range(x_oh.shape[0]), [0.8, 0.1, 0.1])

xs_train, ys_train = x_oh[train_idx], ys[train_idx]
xs_val, ys_val = x_oh[val_idx], ys[val_idx]
xs_test, ys_test = x_oh[test_idx], ys[test_idx]

print(xs_train.shape, ys_train.shape)
print(xs_val.shape, ys_val.shape)
print(xs_test.shape, ys_test.shape)

torch.Size([182517, 703]) torch.Size([182517])
torch.Size([22815, 703]) torch.Size([22815])
torch.Size([22814, 703]) torch.Size([22814])


In [180]:
# Initialize new weights
W = torch.randn((703, 27), requires_grad=True) # weights matrix
R = 0.01 # regulatization loss

In [181]:
def forward_pass(inputs, weights):
    """Perform a matrix multiplication between the input and weights tensor and return probabilities after exponentiating and normalizing"""
    logits = torch.matmul(inputs.to(device), weights.to(device))
    counts = logits.exp()
    probs = torch.div(counts, torch.sum(counts, dim=1, keepdims=True))
    return probs

def nll_loss(probs, n_inputs, labels):
    """Calculate the negative log likelihood loss and apply model smoothing with regularization"""
    return -probs[torch.arange(n_inputs), labels].log().mean() + R * (W**2).mean()

In [183]:
# Train the model
for i in range(1001):
    probs = forward_pass(xs_train, W)
    loss = nll_loss(probs, xs_train.shape[0], ys_train)
    if i % 50 == 0:
        val_probs = forward_pass(xs_val, W)
        val_loss = nll_loss(val_probs, xs_val.shape[0], ys_val)
        print(f"Training Loss: {loss.item():.4f} \t Val Loss: {val_loss.item():.4f}")
    W.grad = None
    loss.backward()
    W.data += -30 * W.grad

Training Loss: 2.4279 	 Val Loss: 2.4301
Training Loss: 2.3998 	 Val Loss: 2.4028
Training Loss: 2.3780 	 Val Loss: 2.3820
Training Loss: 2.3607 	 Val Loss: 2.3656
Training Loss: 2.3465 	 Val Loss: 2.3523
Training Loss: 2.3347 	 Val Loss: 2.3414
Training Loss: 2.3247 	 Val Loss: 2.3321
Training Loss: 2.3162 	 Val Loss: 2.3243
Training Loss: 2.3088 	 Val Loss: 2.3175
Training Loss: 2.3023 	 Val Loss: 2.3116
Training Loss: 2.2966 	 Val Loss: 2.3064


In [184]:
# Calculate the loss on the test set
test_loss = nll_loss(forward_pass(xs_test, W), len(xs_test), ys_test)
print(f"Test loss: {test_loss:.4f}")

Test loss: 2.3098


### E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?
