In [1]:
import numpy as np
import torch

In [2]:
# Read names from file
with open("names.txt", "r") as f:
    names = f.readlines()

In [3]:
names = [name.strip() for name in names]
print(names[:3])
print(len(names))

['emma', 'olivia', 'ava']
32033


In [4]:
letter_set = sorted(list({l for name in names for l in name}))
letter_set.insert(0, '.')
len(letter_set)

27

In [5]:
stoi = {letter: pos for pos, letter in enumerate(letter_set)}
itos = {pos: letter for letter, pos in stoi.items()}

### E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?

In [6]:
# Create a 27 by 27 by 27 matrix to count all the number of occurences of trigrams
lookup_table = torch.ones((27, 27, 27), dtype=torch.int32)

In [7]:
# Fill the lookup table with the counts of each trigram
for name in names:
    name = ['.', '.'] + list(name) + ['.']
    for char1, char2, char3 in zip(name, name[1:], name[2:]):
        p1, p2, p3 = stoi[char1], stoi[char2], stoi[char3]
        lookup_table[p1, p2, p3] += 1

In [8]:
def get_occurences(char1, char2, char3):
    return lookup_table[stoi[char1], stoi[char2], stoi[char3]]

In [9]:
get_occurences('.', '.', 'a')

tensor(4411, dtype=torch.int32)

In [10]:
# Normalize the count table 
lookup_table = torch.div(lookup_table, torch.sum(lookup_table, dim=2, keepdims=True))
torch.sum(lookup_table)

tensor(729.0001)

In [11]:
# Get some new predictions using the counts table
gen = torch.Generator().manual_seed(2147483647)
output_count = []
for _ in range(10):
    idx1, idx2 = 0, 0
    out = []
    while True:
        idx3 = torch.multinomial(lookup_table[idx1, idx2], num_samples=1, replacement=True, generator=gen).item()
        if idx3==0:
            break
        out.append(itos[idx3])
        idx1 = idx2
        idx2 = idx3
    output_count.append("".join(out))
    print("", "".join(out))

 junide
 jakasid
 prelay
 adin
 kairritoper
 sathen
 sameia
 yanileniassibduinrwin
 lessiyanayla
 te


In [14]:
# Calculate the loss (negative log likelihood loss) and compare it to the loss of the bigram model (previously done)
log_likelihood = 0.0 
num_samples = 0
for name in names:
    name = ['.', '.'] + list(name) + ['.']
    for char1, char2, char3 in zip(name, name[1:], name[2:]):
        prob = lookup_table[stoi[char1], stoi[char2], stoi[char3]]
        log_likelihood += torch.log(prob)
        num_samples += 1
nll = -log_likelihood
print(f"{log_likelihood=}")
print(f"nll_loss: {nll/num_samples:.4f}")
print(f"bigram_loss: 2.4544")

log_likelihood=tensor(-504653.)
nll_loss: 2.2120
bigram_loss: 2.4544


In [15]:
# Create trigram samples to train a gradient based model
xs, ys = [], []
for name in names:
    name = ['.', '.'] + list(name) + ['.']
    for char1, char2, char3 in zip(name, name[1:], name[2:]):
        xs.append([stoi[char1], stoi[char2]])
        ys.append(stoi[char3])
        
xs = torch.tensor(xs)
ys = torch.tensor(ys)
print(f"Number of samples: {xs.nelement()}")

Number of samples: 456292


In [17]:
# Change the inputs into one hot vectors and initialize the weights
x_oh = torch.nn.functional.one_hot(xs, num_classes=27).float()
# Reshape the input matrix from [_, 2, 27] to [_, 54] to make the multiplication easier
x_oh = x_oh.view(x_oh.shape[0], x_oh.shape[1] * x_oh.shape[2])
W = torch.randn((54, 27), requires_grad=True)
print(f"Shape of encoded inputs: {x_oh.shape}")
print(f"Shape of weights matrix: {W.shape}")
print(f"Shape of labels vector: {ys.shape}")

Shape of encoded inputs: torch.Size([228146, 54])
Shape of weights matrix: torch.Size([54, 27])
Shape of labels vector: torch.Size([228146])


In [23]:
# Training
for _ in range(700):
    logits = torch.matmul(x_oh, W)
    counts = logits.exp() # Exponentiate to get the counts 
    probs = torch.div(counts, torch.sum(counts, dim=1, keepdims=True)) # Normalize the counts
    loss = -probs[torch.arange(xs.shape[0]), ys].log().mean() + 0.02 * (W ** 2).mean() # Calculate the nll loss
    print(f"Loss: {loss.item()}")
    W.grad = None # clear the gradients
    loss.backward() 
    W.data += -30 * W.grad # update the weights

Loss: 4.331145763397217
Loss: 3.732630729675293
Loss: 3.456670045852661
Loss: 3.2706995010375977
Loss: 3.1345431804656982
Loss: 3.0353121757507324
Loss: 2.957371234893799
Loss: 2.8958206176757812
Loss: 2.8451552391052246
Loss: 2.802884578704834
Loss: 2.7669413089752197
Loss: 2.7360732555389404
Loss: 2.709300994873047
Loss: 2.685901403427124
Loss: 2.665297746658325
Loss: 2.647034168243408
Loss: 2.630741834640503
Loss: 2.6161224842071533
Loss: 2.6029324531555176
Loss: 2.5909717082977295
Loss: 2.5800743103027344
Loss: 2.570103645324707
Loss: 2.5609445571899414
Loss: 2.5524988174438477
Loss: 2.5446856021881104
Loss: 2.5374348163604736
Loss: 2.530686378479004
Loss: 2.5243892669677734
Loss: 2.5184993743896484
Loss: 2.51297664642334
Loss: 2.507789134979248
Loss: 2.50290584564209
Loss: 2.4983017444610596
Loss: 2.4939522743225098
Loss: 2.489838123321533
Loss: 2.485940456390381
Loss: 2.4822423458099365
Loss: 2.478729248046875
Loss: 2.4753878116607666
Loss: 2.4722061157226562
Loss: 2.469172954559

Loss: 2.367455005645752
Loss: 2.3674259185791016
Loss: 2.3673973083496094
Loss: 2.367368698120117
Loss: 2.367340087890625
Loss: 2.36731219291687
Loss: 2.367283821105957
Loss: 2.3672561645507812
Loss: 2.3672285079956055
Loss: 2.367201328277588
Loss: 2.3671741485595703
Loss: 2.367147207260132
Loss: 2.3671202659606934
Loss: 2.367093563079834
Loss: 2.3670668601989746
Loss: 2.3670408725738525
Loss: 2.3670146465301514
Loss: 2.3669888973236084
Loss: 2.3669629096984863
Loss: 2.3669376373291016
Loss: 2.3669121265411377
Loss: 2.366886615753174
Loss: 2.3668620586395264
Loss: 2.3668367862701416
Loss: 2.366812229156494
Loss: 2.366787910461426
Loss: 2.3667635917663574
Loss: 2.366739511489868
Loss: 2.366715431213379
Loss: 2.3666913509368896
Loss: 2.3666677474975586
Loss: 2.3666443824768066
Loss: 2.3666210174560547
Loss: 2.3665976524353027
Loss: 2.36657452583313
Loss: 2.366551637649536
Loss: 2.3665289878845215
Loss: 2.366506576538086
Loss: 2.3664839267730713
Loss: 2.366461753845215
Loss: 2.36643958091

Loss: 2.3633270263671875
Loss: 2.3633222579956055
Loss: 2.3633182048797607
Loss: 2.3633134365081787
Loss: 2.363309144973755
Loss: 2.363304615020752
Loss: 2.363300085067749
Loss: 2.363295793533325
Loss: 2.3632912635803223
Loss: 2.3632869720458984
Loss: 2.3632824420928955
Loss: 2.363278388977051
Loss: 2.363274097442627
Loss: 2.363269567489624
Loss: 2.3632652759552
Loss: 2.3632612228393555
Loss: 2.3632571697235107
Loss: 2.363253116607666
Loss: 2.363248586654663
Loss: 2.3632445335388184
Loss: 2.3632402420043945
Loss: 2.3632359504699707
Loss: 2.363231897354126
Loss: 2.3632280826568604
Loss: 2.3632237911224365
Loss: 2.363219976425171
Loss: 2.363215684890747
Loss: 2.3632113933563232
Loss: 2.3632078170776367
Loss: 2.363203525543213


In [24]:
# Comparing the loss of bigram model and trigram model
print("Loss on bigram (gradient based): 2.4804")
print(f"Loss on trigram: {loss:.4f}")

Loss on bigram (gradient based): 2.4804
Loss on trigram: 2.3632


In [25]:
# Generate some names and compare with the names from counting method
gen = torch.Generator().manual_seed(2147483647)
output_gradient = []
for _ in range(10):
    out = []
    idx1 = 0
    idx2 = 0
    
    while True:
        x_enc_1 = torch.nn.functional.one_hot(torch.tensor([idx1]), num_classes=27).float()
        x_enc_2 = torch.nn.functional.one_hot(torch.tensor([idx2]), num_classes=27).float()
        
        logits = torch.matmul(torch.hstack((x_enc_1, x_enc_2)), W)
        counts = logits.exp()
        probs = torch.div(counts, torch.sum(counts, dim=1, keepdims=True))
        
        idx3 = torch.multinomial(probs, num_samples=1, replacement=True, generator=gen).item()
        if idx3 == 0:
            break
        idx1 = idx2
        idx2 = idx3
        out.append(itos[idx3])
    output_gradient.append("".join(out))

In [26]:
print("{:<25}\t{}".format("Counting based: ", "Gradient based: "))
print("----------------------------------------------------")
for out1, out2 in zip(output_count, output_gradient):
    print("{:<25}\t{:<20}".format(out1, out2))


Counting based:          	Gradient based: 
----------------------------------------------------
junide                   	juwide              
jakasid                  	janasad             
prelay                   	pariay              
adin                     	ainn                
kairritoper              	koi                 
sathen                   	ritoleras           
sameia                   	tee                 
yanileniassibduinrwin    	kalania             
lessiyanayla             	yanileniassibdainrwi
te                       	ta                  


### E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

In [28]:
x_oh.shape, ys.shape

(torch.Size([228146, 54]), torch.Size([228146]))

In [67]:
# Split the dataset into train, validation and test sets
train_idx, val_idx, test_idx = torch.utils.data.random_split(range(x_oh.shape[0]), [0.8, 0.1, 0.1])

xs_train, ys_train = x_oh[train_idx], ys[train_idx]
xs_val, ys_val = x_oh[val_idx], ys[val_idx]
xs_test, ys_test = x_oh[test_idx], ys[test_idx]

print(xs_train.shape, ys_train.shape)
print(xs_val.shape, ys_val.shape)
print(xs_test.shape, ys_test.shape)

torch.Size([182517, 54]) torch.Size([182517])
torch.Size([22815, 54]) torch.Size([22815])
torch.Size([22814, 54]) torch.Size([22814])


In [72]:
# Initialize new weights
W = torch.randn((54, 27), requires_grad=True) # weights matrix
R = 0.01 # regulatization loss

In [73]:
def forward_pass(inputs, weights):
    """Perform a matrix multiplication between the input and weights tensor and return probabilities after exponentiating and normalizing"""
    logits = torch.matmul(inputs, weights)
    counts = logits.exp()
    probs = torch.div(counts, torch.sum(counts, dim=1, keepdims=True))
    return probs

def nll_loss(probs, n_inputs, labels):
    """Calculate the negative log likelihood loss and apply model smoothing with regularization"""
    return -probs[torch.arange(n_inputs), labels].log().mean() + R * (W**2).mean()

In [74]:
# Train the model
for i in range(500):
    probs = forward_pass(xs_train, W)
    loss = nll_loss(probs, xs_train.shape[0], ys_train)
    if i % 50 == 0 or i == 999:
        val_probs = forward_pass(xs_val, W)
        val_loss = nll_loss(val_probs, xs_val.shape[0], ys_val)
        print(f"Training Loss: {loss.item():.4f} \t Val Loss: {val_loss.item():.4f}")
    W.grad = None
    loss.backward()
    W.data += -30 * W.grad

Training Loss: 4.3237 	 Val Loss: 4.3352
Training Loss: 2.4400 	 Val Loss: 2.4290
Training Loss: 2.3918 	 Val Loss: 2.3834
Training Loss: 2.3763 	 Val Loss: 2.3689
Training Loss: 2.3687 	 Val Loss: 2.3619
Training Loss: 2.3643 	 Val Loss: 2.3578
Training Loss: 2.3614 	 Val Loss: 2.3551
Training Loss: 2.3594 	 Val Loss: 2.3533
Training Loss: 2.3580 	 Val Loss: 2.3520
Training Loss: 2.3569 	 Val Loss: 2.3510


In [77]:
# Calculate the loss on the test set
test_loss = nll_loss(forward_pass(xs_test, W), len(xs_test), ys_test)
print(f"Test loss: {test_loss:.4f}")

Test loss: 2.3576


### E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?
