In [1]:
import numpy as np
import torch
import torch.nn.functional as F

In [2]:
# Load the raw dataset: every line of names.txt becomes one list entry
# (line endings are still attached at this point).
with open("names.txt", "r") as names_file:
    names = list(names_file)

In [3]:
# Remove leading/trailing whitespace (including the newline) from each entry,
# then show the first three names and the dataset size.
names = list(map(str.strip, names))
print(names[:3])
print(len(names))

['emma', 'olivia', 'ava']
32033


In [4]:
# Alphabet: every distinct character occurring in the names, sorted, with '.'
# placed at index 0 (it is used as the padding/terminator symbol below).
letter_set = ['.'] + sorted(set("".join(names)))
len(letter_set)

27

In [5]:
# Character -> integer id (position in letter_set) and the inverse mapping.
stoi = dict(zip(letter_set, range(len(letter_set))))
itos = dict(zip(stoi.values(), stoi.keys()))

In [6]:
# All two-character contexts. A '.' is only allowed in the second position for
# the initial ('.', '.') context, which is placed first in the list.
pairs = [('.', '.')] + [
    (first, second)
    for first in letter_set
    for second in letter_set
    if second != '.'
]

In [7]:
# Context pair -> integer id (position in pairs) and the inverse mapping.
ptoi = dict(zip(pairs, range(len(pairs))))
itop = dict(zip(ptoi.values(), ptoi.keys()))

In [8]:
# Number of two-character contexts in `pairs`.
len(pairs)

703

### E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?

In [18]:
# Trigram count table: one row per two-character context in `pairs`, one column
# per candidate next character in `letter_set`. Every cell starts at 1 rather
# than 0, so no trigram has a zero count when the table is normalised later.
lookup_table = torch.ones((len(pairs), len(letter_set)), dtype=torch.int32)

In [19]:
# Accumulate trigram counts. Each name is padded with two leading '.' and one
# trailing '.'; every window of three consecutive characters increments the
# cell addressed by (context pair, next character).
for name in names:
    chars = ['.', '.'] + list(name) + ['.']
    for pos in range(len(chars) - 2):
        row = ptoi[(chars[pos], chars[pos + 1])]
        col = stoi[chars[pos + 2]]
        lookup_table[row, col] += 1

In [20]:
def get_occurences(char1, char2, char3):
    """Return the `lookup_table` entry for the trigram (char1, char2, char3).

    The row is looked up through `ptoi` using the (char1, char2) pair and the
    column through `stoi` using char3.
    """
    row = ptoi[(char1, char2)]
    col = stoi[char3]
    return lookup_table[row, col]

In [21]:
# Spot check: table entry for 'a' following the start context ('.', '.').
get_occurences('.', '.', 'a')

tensor(4411, dtype=torch.int32)

In [22]:
# Convert counts to probabilities: divide each row by its own total so that
# every row sums to 1. The final expression shows the sum over the whole table.
row_totals = lookup_table.sum(dim=1, keepdim=True)
lookup_table = lookup_table / row_totals
lookup_table.sum()

tensor(703.0001)

In [23]:
# Sample ten names from the normalised trigram table. Each name starts from the
# ('.', '.') context and ends as soon as index 0 is drawn.
gen = torch.Generator().manual_seed(2147483647)
output_count = []
for _ in range(10):
    context = ('.', '.')
    letters = []
    while True:
        row = lookup_table[ptoi[context]]
        idx = torch.multinomial(row, num_samples=1, replacement=True, generator=gen).item()
        if idx == 0:
            break
        letter = itos[idx]
        letters.append(letter)
        # Slide the context window: keep the last character, add the new one.
        context = (context[1], letter)
    sampled = "".join(letters)
    output_count.append(sampled)
    print("", sampled)

 junide
 jakasid
 prelay
 adin
 kairritoper
 sathen
 sameia
 yanileniassibduinrwin
 lessiyanayla
 te


In [24]:
# Average negative log likelihood of the dataset under the trigram table,
# printed next to a hard-coded bigram loss for comparison.
log_likelihood = 0.0
num_samples = 0
for name in names:
    padded = ['.', '.'] + list(name) + ['.']
    for pos in range(len(padded) - 2):
        context = (padded[pos], padded[pos + 1])
        target = padded[pos + 2]
        log_likelihood += lookup_table[ptoi[context], stoi[target]].log()
        num_samples += 1
nll = -log_likelihood
print(f"{log_likelihood=}")
print(f"nll_loss: {nll/num_samples:.4f}")
print("bigram_loss: 2.4544")

log_likelihood=tensor(-504653.)
nll_loss: 2.2120
bigram_loss: 2.4544


In [25]:
# Build the training examples for the gradient-based model: the input is the id
# of a two-character context, the label is the id of the character that follows.
context_ids, target_ids = [], []
for name in names:
    padded = ['.', '.'] + list(name) + ['.']
    for pos in range(2, len(padded)):
        context_ids.append(ptoi[(padded[pos - 2], padded[pos - 1])])
        target_ids.append(stoi[padded[pos]])

xs = torch.tensor(context_ids)
ys = torch.tensor(target_ids)
print(f"Number of samples: {xs.nelement()}")

Number of samples: 228146


In [26]:
# One-hot encode the context ids (703 classes) and draw the initial weights
# from a standard normal distribution; W is the only trainable tensor.
x_oh = F.one_hot(xs, num_classes=703).float()
W = torch.randn(703, 27, requires_grad=True)
print(f"Shape of encoded inputs: {x_oh.shape}")
print(f"Shape of weights matrix: {W.shape}")
print(f"Shape of labels vector: {ys.shape}")

Shape of encoded inputs: torch.Size([228146, 703])
Shape of weights matrix: torch.Size([703, 27])
Shape of labels vector: torch.Size([228146])


In [27]:
# Use the GPU when one is available, otherwise stay on the CPU.
# W itself is not moved here; the cells below call .to(device) where needed.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [28]:
# Training: full-batch gradient descent on the one-hot encoded trigram data.
# The inputs and the row index never change, so they are prepared once instead
# of being re-created (and re-transferred to `device`) on every iteration.
x_oh_dev = x_oh.to(device)
row_ids = torch.arange(x_oh.shape[0])
for i in range(2001):
    logits = torch.matmul(x_oh_dev, W.to(device))
    counts = logits.exp() # Exponentiate to get the counts
    probs = torch.div(counts, torch.sum(counts, dim=1, keepdims=True)) # Normalize the counts
    # Mean negative log likelihood of the correct next character, plus a small
    # penalty on the mean squared weight.
    loss = -probs[row_ids, ys].log().mean() + 0.001 * (W ** 2).mean()
    if i % 20 == 0:
        print(f"Loss: {loss.item()}")
    W.grad = None # clear the gradients
    loss.backward()
    # Update the weights outside of autograd tracking (step size 40).
    with torch.no_grad():
        W -= 40 * W.grad

Loss: 3.7144980430603027
Loss: 3.0218570232391357
Loss: 2.807922124862671
Loss: 2.6918089389801025
Loss: 2.6155264377593994
Loss: 2.560713052749634
Loss: 2.5190727710723877
Loss: 2.4862024784088135
Loss: 2.459519147872925
Loss: 2.437392234802246
Loss: 2.418728828430176
Loss: 2.4027678966522217
Loss: 2.38895845413208
Loss: 2.376892566680908
Loss: 2.3662590980529785
Loss: 2.356815814971924
Loss: 2.348371982574463
Loss: 2.3407750129699707
Loss: 2.3339014053344727
Loss: 2.3276500701904297
Loss: 2.3219377994537354
Loss: 2.3166956901550293
Loss: 2.3118667602539062
Loss: 2.307401657104492
Loss: 2.303260326385498
Loss: 2.2994070053100586
Loss: 2.2958123683929443
Loss: 2.2924511432647705
Loss: 2.289299964904785
Loss: 2.2863399982452393
Loss: 2.2835543155670166
Loss: 2.280927896499634
Loss: 2.278447389602661
Loss: 2.2761011123657227
Loss: 2.273878574371338
Loss: 2.2717700004577637
Loss: 2.2697675228118896
Loss: 2.2678627967834473
Loss: 2.266049385070801
Loss: 2.2643210887908936
Loss: 2.262671709

In [29]:
# Comparing the loss of bigram model and trigram model.
# The bigram figure is a hard-coded literal; the trigram figure is the current
# value of the notebook-level `loss` variable, formatted to four decimals.
print("Loss on bigram (gradient based): 2.4804")
print(f"Loss on trigram: {loss:.4f}")

Loss on bigram (gradient based): 2.4804
Loss on trigram: 2.2218


In [30]:
# Shape of the weight matrix W.
W.shape

torch.Size([703, 27])

In [31]:
# Generate some names with the trained weights and store them so they can be
# compared with the names from the counting method.
gen = torch.Generator(device=device).manual_seed(2147483647)
output_gradient = []
# Sampling needs no gradients, so autograd tracking is switched off, and the
# weights are moved to `device` once rather than on every generated character.
with torch.no_grad():
    W_dev = W.to(device)
    for _ in range(10):
        out = []
        char1 = '.'
        char2 = '.'

        while True:
            # One-hot row selecting the current two-character context.
            x_enc = torch.nn.functional.one_hot(torch.tensor([ptoi[(char1, char2)]]), num_classes=703).float().to(device)

            logits = torch.matmul(x_enc, W_dev)
            counts = logits.exp()
            probs = torch.div(counts, torch.sum(counts, dim=1, keepdims=True))

            idx = torch.multinomial(probs, num_samples=1, replacement=True, generator=gen).item()
            if idx == 0:
                # Index 0 ends the name.
                break
            char1 = char2
            char2 = itos[idx]
            out.append(itos[idx])
        output_gradient.append("".join(out))

In [32]:
# Side-by-side table: one row per pair of sampled names
# (counting-based sample on the left, gradient-based sample on the right).
print("{:<25}\t{}".format("Counting based: ", "Gradient based: "))
print("----------------------------------------------------")
for count_name, grad_name in zip(output_count, output_gradient):
    row = "{:<25}\t{:<20}".format(count_name, grad_name)
    print(row)


Counting based:          	Gradient based: 
----------------------------------------------------
junide                   	khya                
jakasid                  	epwktonn            
prelay                   	yulagolbiahen       
adin                     	ramsiyamicxonnan    
kairritoper              	rine                
sathen                   	delenlian           
sameia                   	ermarishan          
yanileniassibduinrwin    	any                 
lessiyanayla             	aleedon             
te                       	lyashily            


### E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

In [33]:
# Shapes of the one-hot inputs and of the label vector before splitting.
x_oh.shape, ys.shape

(torch.Size([228146, 703]), torch.Size([228146]))

In [34]:
# Split the dataset into train (80%), validation (10%) and test (10%) sets.
# A dedicated, seeded generator keeps the partition identical across runs, so
# the losses reported for each split can be reproduced.
split_gen = torch.Generator().manual_seed(2147483647)
train_idx, val_idx, test_idx = torch.utils.data.random_split(
    range(x_oh.shape[0]), [0.8, 0.1, 0.1], generator=split_gen
)

# random_split returns Subset objects. Index the tensors with explicit index
# tensors built from Subset.indices instead of with the Subset objects.
train_rows = torch.tensor(train_idx.indices)
val_rows = torch.tensor(val_idx.indices)
test_rows = torch.tensor(test_idx.indices)

xs_train, ys_train = x_oh[train_rows], ys[train_rows]
xs_val, ys_val = x_oh[val_rows], ys[val_rows]
xs_test, ys_test = x_oh[test_rows], ys[test_rows]

print(xs_train.shape, ys_train.shape)
print(xs_val.shape, ys_val.shape)
print(xs_test.shape, ys_test.shape)

torch.Size([182517, 703]) torch.Size([182517])
torch.Size([22815, 703]) torch.Size([22815])
torch.Size([22814, 703]) torch.Size([22814])


In [35]:
# Fresh weights for the train/dev/test experiment, drawn from a standard normal.
W = torch.randn(703, 27, requires_grad=True)
# Regularization strength constant.
# NOTE(review): no visible cell reads R - confirm whether it should be passed as lbd.
R = 1e-3

In [36]:
def forward_pass(inputs, weights):
    """Return per-row probabilities for a batch of inputs.

    Both tensors are moved to the notebook-level ``device`` and multiplied to
    obtain logits; a softmax over dim 1 then turns each row into a
    probability distribution.

    Args:
        inputs: 2-D tensor that is matrix-multiplied with ``weights``
            (the callers in this notebook pass one-hot rows).
        weights: 2-D tensor whose first dimension matches the second
            dimension of ``inputs``.

    Returns:
        Tensor with one row per input row; each row sums to 1.
    """
    logits = torch.matmul(inputs.to(device), weights.to(device))
    # torch.softmax stays finite for large logits, whereas the manual
    # exp() / sum() form overflows to inf / inf = nan.
    return torch.softmax(logits, dim=1)

def nll_loss(probs, n_inputs, labels, lbd=0.01, weights=None):
    """Mean negative log likelihood of the correct labels plus a weight penalty.

    Args:
        probs: 2-D tensor of per-row class probabilities.
        n_inputs: number of rows of ``probs`` to score (rows 0..n_inputs-1).
        labels: integer tensor giving the target column for each scored row.
        lbd: multiplier of the penalty term ``mean(weights ** 2)``.
        weights: tensor the penalty is computed on. When omitted, the
            notebook-level ``W`` is used, which matches the earlier behaviour.

    Returns:
        Scalar tensor ``-mean(log(probs[i, labels[i]])) + lbd * mean(weights ** 2)``.
    """
    if weights is None:
        # Backward-compatible default: penalise the notebook-level weights.
        weights = W
    picked = probs[torch.arange(n_inputs), labels]
    return -picked.log().mean() + lbd * (weights ** 2).mean()

In [37]:
# Train the model with full-batch gradient descent on the training split and
# report the validation loss every 50 steps.
# NOTE(review): nll_loss is called without lbd, so its default (0.01) is the
# regularization strength in effect; the R defined earlier is not used here.
for i in range(1001):
    probs = forward_pass(xs_train, W)
    loss = nll_loss(probs, xs_train.shape[0], ys_train)
    if i % 50 == 0:
        # Evaluation only: no autograd graph is needed for the validation pass.
        with torch.no_grad():
            val_probs = forward_pass(xs_val, W)
            val_loss = nll_loss(val_probs, xs_val.shape[0], ys_val)
        print(f"Training Loss: {loss.item():.4f} \t Val Loss: {val_loss.item():.4f}")
    W.grad = None
    loss.backward()
    # Update the weights outside of autograd tracking (step size 30).
    with torch.no_grad():
        W -= 30 * W.grad

Training Loss: 3.7686 	 Val Loss: 3.7647
Training Loss: 2.8360 	 Val Loss: 2.8389
Training Loss: 2.6344 	 Val Loss: 2.6421
Training Loss: 2.5342 	 Val Loss: 2.5453
Training Loss: 2.4736 	 Val Loss: 2.4873
Training Loss: 2.4323 	 Val Loss: 2.4481
Training Loss: 2.4022 	 Val Loss: 2.4195
Training Loss: 2.3791 	 Val Loss: 2.3977
Training Loss: 2.3609 	 Val Loss: 2.3806
Training Loss: 2.3462 	 Val Loss: 2.3666
Training Loss: 2.3339 	 Val Loss: 2.3551
Training Loss: 2.3236 	 Val Loss: 2.3454
Training Loss: 2.3148 	 Val Loss: 2.3371
Training Loss: 2.3072 	 Val Loss: 2.3300
Training Loss: 2.3005 	 Val Loss: 2.3238
Training Loss: 2.2946 	 Val Loss: 2.3184
Training Loss: 2.2894 	 Val Loss: 2.3136
Training Loss: 2.2848 	 Val Loss: 2.3093
Training Loss: 2.2806 	 Val Loss: 2.3055
Training Loss: 2.2768 	 Val Loss: 2.3021
Training Loss: 2.2734 	 Val Loss: 2.2990


In [38]:
# Loss on the held-out test split. nll_loss is called with its default lbd,
# so the reported number includes the weight penalty term.
test_probs = forward_pass(xs_test, W)
test_loss = nll_loss(test_probs, len(xs_test), ys_test)
print(f"Test loss: {test_loss:.4f}")

Test loss: 2.2926


<p> The loss is slightly higher on the validation and test sets than on the training set. That is expected and acceptable as long as the gap stays small. </p>

### E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?


In [None]:
import pandas as pd

# Validation loss for a range of regularization strengths.
# NOTE(review): W is not retrained inside this loop, so only the penalty term
# lbd * mean(W**2) changes between rows; the data term is identical for all.
with torch.no_grad():  # evaluation only
    # The forward pass and both loss terms do not depend on lbd: compute once.
    val_probs = forward_pass(xs_val, W)
    val_nll = -val_probs[torch.arange(xs_val.shape[0]), ys_val].log().mean()
    weight_penalty = (W**2).mean()

    records = []
    for lbd in [0.1, 0.007, 0.004, 0.001, 0.0007, 0.0004, 0.0001, 0.00007, 0.00004, 0.00001, 0.0]:
        val_loss = val_nll + lbd * weight_penalty
        records.append({"lbd": lbd, "val_loss": val_loss.item()})

# Build the frame in one go; DataFrame.append was removed in pandas 2.0.
df = pd.DataFrame(records, columns=["lbd", "val_loss"])

In [40]:
# Display the table of lbd values and their validation losses.
df

Unnamed: 0,lbd,val_loss
0,0.1,2.404053
1,0.007,2.295467
2,0.004,2.291964
3,0.001,2.288461
4,0.0007,2.288111
5,0.0004,2.287761
6,0.0001,2.287411
7,7e-05,2.287376
8,4e-05,2.287341
9,1e-05,2.287306


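<p> The validation loss is lowest without any regularization. Note, however, that W is not retrained for each value of lbd here, so a smaller lbd simply removes more of the penalty term from the reported loss. </p>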

In [41]:
# Test-set loss with lbd = 0.0, i.e. the plain negative log likelihood
# without any weight penalty added.
test_probs = forward_pass(xs_test, W)
test_loss = nll_loss(test_probs, len(xs_test), ys_test, 0.0)
print(f"Test loss: {test_loss:.4f}")

Test loss: 2.2810


### E04: we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?

<p> The vector xs contains the inputs without the one-hot encoding. </p>

In [42]:
# 80/20 train/validation split of the index-encoded inputs.
# A dedicated, seeded generator keeps the partition identical across runs.
split_gen = torch.Generator().manual_seed(2147483647)
train_idx, val_idx = torch.utils.data.random_split(
    range(xs.shape[0]), [0.8, 0.2], generator=split_gen
)

# random_split returns Subset objects. Index the tensors with explicit index
# tensors built from Subset.indices instead of with the Subset objects.
train_rows = torch.tensor(train_idx.indices)
val_rows = torch.tensor(val_idx.indices)

xs_train, ys_train = xs[train_rows], ys[train_rows]
xs_val, ys_val = xs[val_rows], ys[val_rows]

In [43]:
def forward_pass(inputs, weights):
    """Return per-row probabilities by indexing rows of ``weights``.

    Selecting row ``inputs[i]`` of ``weights`` gives the logits for example
    ``i``, so no one-hot encoding or matrix multiplication is needed. A softmax
    over dim 1 turns each row of logits into a probability distribution.

    Args:
        inputs: 1-D integer tensor of row indices into ``weights``.
        weights: 2-D tensor of logits, one row per possible input index.

    Returns:
        Tensor of shape (len(inputs), weights.shape[1]); each row sums to 1.
    """
    logits = weights[inputs]
    # torch.softmax stays finite for large logits, whereas the manual
    # exp() / sum() form overflows to inf / inf = nan.
    return torch.softmax(logits, dim=1)

def nll_loss(probs, n_inputs, labels, lbd=0.01, weights=None):
    """Mean negative log likelihood of the correct labels plus a weight penalty.

    Args:
        probs: 2-D tensor of per-row class probabilities.
        n_inputs: number of rows of ``probs`` to score (rows 0..n_inputs-1).
        labels: integer tensor giving the target column for each scored row.
        lbd: multiplier of the penalty term ``mean(weights ** 2)``.
        weights: tensor the penalty is computed on. When omitted, the
            notebook-level ``W`` is used, which matches the earlier behaviour.

    Returns:
        Scalar tensor ``-mean(log(probs[i, labels[i]])) + lbd * mean(weights ** 2)``.
    """
    if weights is None:
        # Backward-compatible default: penalise the notebook-level weights.
        weights = W
    picked = probs[torch.arange(n_inputs), labels]
    return -picked.log().mean() + lbd * (weights ** 2).mean()

In [44]:
# Full-batch gradient descent using the index-based forward pass, with the
# regularization strength set to 0.0. W is not re-initialised in this cell, so
# training continues from whatever values W currently holds.
for i in range(1001):
    probs = forward_pass(xs_train, W)
    loss = nll_loss(probs, xs_train.shape[0], ys_train, 0.0)
    if i % 50 == 0:
        # Evaluation only: no autograd graph is needed for the validation pass.
        with torch.no_grad():
            val_probs = forward_pass(xs_val, W)
            val_loss = nll_loss(val_probs, xs_val.shape[0], ys_val, 0.0)
        print(f"Training Loss: {loss.item():.4f} \t Val Loss: {val_loss.item():.4f}")
    W.grad = None
    loss.backward()
    # Update the weights outside of autograd tracking (step size 30).
    with torch.no_grad():
        W -= 30 * W.grad

Training Loss: 2.2645 	 Val Loss: 2.2730
Training Loss: 2.2600 	 Val Loss: 2.2726
Training Loss: 2.2564 	 Val Loss: 2.2714
Training Loss: 2.2533 	 Val Loss: 2.2701
Training Loss: 2.2504 	 Val Loss: 2.2687
Training Loss: 2.2478 	 Val Loss: 2.2673
Training Loss: 2.2455 	 Val Loss: 2.2660
Training Loss: 2.2433 	 Val Loss: 2.2648
Training Loss: 2.2412 	 Val Loss: 2.2636
Training Loss: 2.2393 	 Val Loss: 2.2624
Training Loss: 2.2376 	 Val Loss: 2.2614
Training Loss: 2.2359 	 Val Loss: 2.2604
Training Loss: 2.2343 	 Val Loss: 2.2594
Training Loss: 2.2329 	 Val Loss: 2.2585
Training Loss: 2.2315 	 Val Loss: 2.2577
Training Loss: 2.2302 	 Val Loss: 2.2568
Training Loss: 2.2290 	 Val Loss: 2.2561
Training Loss: 2.2278 	 Val Loss: 2.2553
Training Loss: 2.2267 	 Val Loss: 2.2546
Training Loss: 2.2256 	 Val Loss: 2.2540
Training Loss: 2.2246 	 Val Loss: 2.2533


### E05: look up and use F.cross_entropy instead. You should achieve the same result. Can you think of why we'd prefer to use F.cross_entropy instead?

In [46]:
# Fresh weights for the cross-entropy experiment, drawn uniformly from [0, 1).
W = torch.rand(703, 27, requires_grad=True)

In [49]:
# Full-batch gradient descent with F.cross_entropy, which takes the logits
# directly (no explicit softmax / log). The validation loss is reported every
# 50 steps.
for i in range(2000):
    logits = W[xs_train]
    loss = F.cross_entropy(logits, ys_train, label_smoothing=0.001)
    if i % 50 == 0:
        # Evaluation only: no autograd graph is needed for the validation pass.
        with torch.no_grad():
            val_logits = W[xs_val]
            val_loss = F.cross_entropy(val_logits, ys_val, label_smoothing=0.001)
        print(f"Training Loss: {loss.item():.4f} \t Val Loss: {val_loss.item():.4f}")
    W.grad = None
    loss.backward()
    # Update the weights outside of autograd tracking (step size 30).
    with torch.no_grad():
        W -= 30 * W.grad

Training Loss: 3.3491 	 Val Loss: 3.3458
Training Loss: 2.6668 	 Val Loss: 2.6717
Training Loss: 2.5195 	 Val Loss: 2.5287
Training Loss: 2.4467 	 Val Loss: 2.4595
Training Loss: 2.4021 	 Val Loss: 2.4175
Training Loss: 2.3715 	 Val Loss: 2.3889
Training Loss: 2.3490 	 Val Loss: 2.3680
Training Loss: 2.3316 	 Val Loss: 2.3520
Training Loss: 2.3178 	 Val Loss: 2.3393
Training Loss: 2.3065 	 Val Loss: 2.3290
Training Loss: 2.2971 	 Val Loss: 2.3205
Training Loss: 2.2891 	 Val Loss: 2.3133
Training Loss: 2.2822 	 Val Loss: 2.3071
Training Loss: 2.2763 	 Val Loss: 2.3018
Training Loss: 2.2710 	 Val Loss: 2.2972
Training Loss: 2.2664 	 Val Loss: 2.2931
Training Loss: 2.2623 	 Val Loss: 2.2895
Training Loss: 2.2586 	 Val Loss: 2.2863
Training Loss: 2.2553 	 Val Loss: 2.2834
Training Loss: 2.2523 	 Val Loss: 2.2807
Training Loss: 2.2495 	 Val Loss: 2.2784
Training Loss: 2.2470 	 Val Loss: 2.2762
Training Loss: 2.2447 	 Val Loss: 2.2742
Training Loss: 2.2425 	 Val Loss: 2.2724
Training Loss: 2

### E06: meta-exercise! Think of a fun/interesting exercise and complete it.


<p> Train on the whole dataset using the cross-entropy loss and check the predictions. </p>

In [51]:
# Shapes of the index-encoded inputs and of the label vector.
xs.shape, ys.shape

(torch.Size([228146]), torch.Size([228146]))

In [53]:
# Fresh weights for training on the whole dataset, drawn uniformly from [0, 1).
W = torch.rand(703, 27, requires_grad=True)

In [54]:
# Full-batch gradient descent on every sample (no held-out split), printing the
# loss every 100 steps.
for i in range(3001):
    logits = W[xs]
    loss = F.cross_entropy(logits, ys, label_smoothing=0.001)
    if not i % 100:
        print(f"Loss: {loss}")
    W.grad = None
    loss.backward()
    W.data -= 30 * W.grad

Loss: 3.352999448776245
Loss: 2.521942615509033
Loss: 2.4047069549560547
Loss: 2.3517673015594482
Loss: 2.320843458175659
Loss: 2.3003499507904053
Loss: 2.285701274871826
Loss: 2.2746834754943848
Loss: 2.2660791873931885
Loss: 2.2591638565063477
Loss: 2.2534759044647217
Loss: 2.248708963394165
Loss: 2.2446517944335938
Loss: 2.2411534786224365
Loss: 2.238102912902832
Loss: 2.2354178428649902
Loss: 2.233035087585449
Loss: 2.2309043407440186
Loss: 2.2289879322052
Loss: 2.2272531986236572
Loss: 2.225675582885742
Loss: 2.224234104156494
Loss: 2.2229108810424805
Loss: 2.2216923236846924
Loss: 2.2205650806427
Loss: 2.219519853591919
Loss: 2.2185471057891846
Loss: 2.217639923095703
Loss: 2.2167911529541016
Loss: 2.2159953117370605
Loss: 2.215247631072998


In [75]:
# The loss above is close to the loss of the counting method, so the sampled
# names are expected to resemble the ones generated from the count table.
gen = torch.Generator().manual_seed(2147483647)
for _ in range(10):
    out = []
    context = ('.', '.')

    while True:
        # The row of W for the current context holds the next-character logits.
        probs = F.softmax(W[ptoi[context]], dim=0)
        idx = torch.multinomial(probs, num_samples=1, replacement=True, generator=gen).item()
        if idx == 0:
            break
        out.append(itos[idx])
        # Slide the context window: keep the last character, add the new one.
        context = (context[1], itos[idx])

    print("".join(out))

junide
jakasid
prelay
adin
kairritoper
sathen
sameia
yanileniassibiainewin
lessiyanayla
te
