In [64]:
import os

In [65]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [66]:
BASE_PATH = "/content/drive/MyDrive/makemore_course/"

In [67]:
NAMES = os.path.join(BASE_PATH, 'names.txt')

In [68]:
names = []
for f in open(NAMES, "r"):
  names.append(f[:-1])

**E02**: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

In [69]:
import torch
import torch.nn.functional as F
import itertools
import random
import time

In [70]:
alphabet = list(map(chr, range(97, 123)))
alphabet[:5]

['a', 'b', 'c', 'd', 'e']

In [71]:
comb2 = list(map(lambda x : x[0]+x[1], itertools.product(alphabet,repeat = 2))) 
# we are missing the start end token ".", let's add it.
# we can have it either in front or in the back of any letter.
s = list(map(lambda x: x[0]+x[1], zip(["." for _ in range(26)],alphabet)))
e = list(map(lambda x: x[0]+x[1], zip(alphabet, ["." for _ in range(26)])))
# add to our list of possible combinations:
comb2 += s
comb2 += e

In [72]:
# So what we have now is the possible first part of our trigram,
# which consist of 2 chars
# We need a way of encoding it.
chars_to_idx = {chars:idx for idx, chars in enumerate(comb2)}
# Also we want a mapping back:
idx_to_chars = {idx:chars for chars, idx in chars_to_idx.items()}

In [73]:
chars_to_idx[".e"]

680

In [74]:
# Also we want a mapping back:
idx_to_chars = {idx:chars for chars, idx in chars_to_idx.items()}
# analog for the second part of the trigram.
char_to_idx = {char:idx for idx, char in enumerate(['.'] + alphabet)}
idx_to_char = {idx:char for char, idx in char_to_idx.items()}

In [75]:
def create_dataset_trigram(words):
  x_s = []
  y_s = []
  for word in words:
    for x,y in zip(map(lambda x: x[0]+x[1], 
                      (zip("." + word, word[:]))),[x for x in word[1:] + "."]):
      x_i = chars_to_idx[x]
      y_i = char_to_idx[y]
      x_s.append(x_i)
      y_s.append(y_i)
  return torch.tensor(x_s), torch.tensor(y_s)

In [76]:
def create_dataset_bigram(words):
  # create the training set of bigrams (x,y)
  xs, ys = [], []

  for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
      ix1 = char_to_idx[ch1]
      ix2 = char_to_idx[ch2]
      xs.append(ix1)
      ys.append(ix2)
      
  xs = torch.tensor(xs)
  ys = torch.tensor(ys)
  return xs, ys

In [77]:
def init_weight_trigram():
  return torch.randn((728,27), requires_grad=True)

In [78]:
def init_weight_bigram():
  return torch.randn((27,27), requires_grad=True)

In [79]:
def encode_bigram(xs):
  xenc = F.one_hot(xs, num_classes=27).float()
  return xenc

In [80]:
def encode_trigram(xs):
  xenc = F.one_hot(xs, num_classes=728).float()
  return xenc

In [81]:
# create datasets
from sklearn.model_selection import train_test_split
#------------bigram--------------------
xsb, ysb = create_dataset_bigram(names)
xencb = encode_bigram(xsb)
xb_, xbtst, yb_, ybtst = train_test_split(xencb, ysb, test_size=0.1)
xbtrn, xbdev, ybtrn, ybdev = train_test_split(xb_, yb_, test_size=0.1/0.9)
#------------trigram--------------------
xst, yst = create_dataset_trigram(names)
xenct = encode_trigram(xst)
xt_, xttst, yt_, yttst = train_test_split(xenct, yst, test_size=0.1)
xttrn, xtdev, yttrn, ytdev = train_test_split(xt_, yt_, test_size=0.1/0.9)

Steps for training sumarized:

In [82]:
# calc log counts:
def calc_log_counts(xenc, W):
  return xenc@W

In [83]:
# calc counts:
def calc_counts(logits):
  return logits.exp()

In [84]:
# calc probs:
def calc_probs(counts):
  return counts/counts.sum(1, keepdims=True)

In [85]:
# calculate the loss:
# In we are interested that the joined probs are close to 1
# take the log of joints probs as log(a*b) = log(a)+log(b)
# in each step we are interested in the log(prob) given the labels.
# As log(1) = 0 our goal and bad loss is high positive number multiply by -1

def calc_log_prob(probs, y):
  n = len(y)
  log_prob = -probs[torch.arange(n),y].log().mean()
  return log_prob

In [86]:
# A step towards the right solution ;-):
def step(xenc, ys, W, lr, reg):
  # zero out the gradient:
  W.grad = None
  logits = calc_log_counts(xenc, W)
  counts = calc_counts(logits)
  probs = calc_probs(counts)
  loss = calc_log_prob(probs, ys)
  # add regularization!
  loss += reg*(W**2).mean()
  # accumalate gradients
  loss.backward()
  # update!
  with torch.no_grad():
    W -= lr * W.grad
  return loss, W

In [87]:
Wb = init_weight_bigram()
Wt = init_weight_trigram()

In [88]:
# run a whole training loop:
for k in range(300):
  lossb, Wb = step(xbtrn, ybtrn, Wb, 50, 0.01)
  losst, Wt = step(xttrn, yttrn, Wt, 500, 0.01)
  if k%5 == 0:
    print(f"epoch = {k}")
    print(f"current loss = {lossb.item()}")
    print(f"current loss = {losst.item()}")

epoch = 0
current loss = 3.6050350666046143
current loss = 3.819342613220215
epoch = 5
current loss = 2.8077731132507324
current loss = 2.5973057746887207
epoch = 10
current loss = 2.668156147003174
current loss = 2.4158191680908203
epoch = 15
current loss = 2.6057846546173096
current loss = 2.326489210128784
epoch = 20
current loss = 2.571425676345825
current loss = 2.2787113189697266
epoch = 25
current loss = 2.55021333694458
current loss = 2.244295120239258
epoch = 30
current loss = 2.5359885692596436
current loss = 2.2206623554229736
epoch = 35
current loss = 2.525876045227051
current loss = 2.2119650840759277
epoch = 40
current loss = 2.518364667892456
current loss = 2.215937376022339
epoch = 45
current loss = 2.512594699859619
current loss = 2.186171770095825
epoch = 50
current loss = 2.508045196533203
current loss = 2.193129777908325
epoch = 55
current loss = 2.504382371902466
current loss = 2.166161060333252
epoch = 60
current loss = 2.5013837814331055
current loss = 2.15848302

In [89]:
logits = calc_log_counts(xbdev, Wb)
counts = calc_counts(logits)
probs = calc_probs(counts)
loss = calc_log_prob(probs, ybdev)
print(f"Bigram loss on dev set   = {loss.item()}")
logits = calc_log_counts(xbtst, Wb)
counts = calc_counts(logits)
probs = calc_probs(counts)
loss = calc_log_prob(probs, ybtst)
print(f"Bigram loss on test set  = {loss.item()}")
logits = calc_log_counts(xtdev, Wt)
counts = calc_counts(logits)
probs = calc_probs(counts)
loss = calc_log_prob(probs, ytdev)
print(f"Trigram loss on dev set  = {loss.item()}")
logits = calc_log_counts(xttst, Wt)
counts = calc_counts(logits)
probs = calc_probs(counts)
loss = calc_log_prob(probs, yttst)
print(f"Trigram loss on test set = {loss.item()}")

Bigram loss on dev set   = 2.468491315841675
Bigram loss on test set  = 2.462188720703125
Trigram loss on dev set  = 2.1004040241241455
Trigram loss on test set = 2.119518756866455


**E03**: use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?

In [90]:
regs = torch.logspace(start=-7, end=-4, steps=20)
regs

tensor([1.0000e-07, 1.4384e-07, 2.0691e-07, 2.9764e-07, 4.2813e-07, 6.1585e-07,
        8.8587e-07, 1.2743e-06, 1.8330e-06, 2.6367e-06, 3.7927e-06, 5.4556e-06,
        7.8476e-06, 1.1288e-05, 1.6238e-05, 2.3357e-05, 3.3598e-05, 4.8329e-05,
        6.9519e-05, 1.0000e-04])

In [91]:
start = time.time()
for reg in regs:
  Wb = init_weight_bigram()
  Wt = init_weight_trigram()
  # run a whole training loop:
  for k in range(50):
    lossb, Wb = step(xbdev, ybdev, Wb, 50, reg)
    losst, Wt = step(xtdev, ytdev, Wt, 500, reg)
    if k == 49:
      print("-"*60)
      print(f"reg = {reg}")
      print(f"final loss bigram  = {lossb.item()}")
      print(f"final loss trigram = {losst.item()}")
      print("-"*20 + " EVAL ON TRAIN DATA " + "-"*20)
      logits = calc_log_counts(xbtrn, Wb)
      counts = calc_counts(logits)
      probs = calc_probs(counts)
      loss = calc_log_prob(probs, ybtrn)
      print(f"bigram loss on the train set = {loss}")
      logits = calc_log_counts(xttrn, Wt)
      counts = calc_counts(logits)
      probs = calc_probs(counts)
      loss = calc_log_prob(probs, yttrn)
      print(f"trigram loss on the train set = {loss}")
end = time.time()
eval_time1 = end-start

------------------------------------------------------------
reg = 1.0000000116860974e-07
final loss bigram  = 2.501140594482422
final loss trigram = 2.056353807449341
-------------------- EVAL ON TRAIN DATA --------------------
bigram loss on the train set = 2.510770082473755
trigram loss on the train set = 2.2604286670684814
------------------------------------------------------------
reg = 1.4384498570052529e-07
final loss bigram  = 2.4968645572662354
final loss trigram = 2.068697214126587
-------------------- EVAL ON TRAIN DATA --------------------
bigram loss on the train set = 2.5062930583953857
trigram loss on the train set = 2.259241819381714
------------------------------------------------------------
reg = 2.069138105298407e-07
final loss bigram  = 2.496697187423706
final loss trigram = 2.059469223022461
-------------------- EVAL ON TRAIN DATA --------------------
bigram loss on the train set = 2.50626277923584
trigram loss on the train set = 2.268860101699829
---------------

It seems that for the bigram model we can get better model with less data. 


In [92]:
"""BEST SETTINGS (POSSIBLE IMPROVEMENTS BY ADJUSTING LR AND TRAINING STEPS)
------------------------------------------------------------
reg = 4.281332337541244e-07
final loss bigram  = 2.4863979816436768
final loss trigram = 2.0799720287323
-------------------- EVAL ON TRAIN DATA --------------------
bigram loss on the train set = 2.500825881958008
trigram loss on the train set = 2.2533843517303467
------------------------------------------------------------
------------------------------------------------------------
reg = 1.6237767340498976e-05
final loss bigram  = 2.4881796836853027
final loss trigram = 2.0798087120056152
-------------------- EVAL ON TRAIN DATA --------------------
bigram loss on the train set = 2.504011631011963
trigram loss on the train set = 2.2543632984161377
------------------------------------------------------------
"""

'BEST SETTINGS (POSSIBLE IMPROVEMENTS BY ADJUSTING LR AND TRAINING STEPS)\n------------------------------------------------------------\nreg = 4.281332337541244e-07\nfinal loss bigram  = 2.4863979816436768\nfinal loss trigram = 2.0799720287323\n-------------------- EVAL ON TRAIN DATA --------------------\nbigram loss on the train set = 2.500825881958008\ntrigram loss on the train set = 2.2533843517303467\n------------------------------------------------------------\n------------------------------------------------------------\nreg = 1.6237767340498976e-05\nfinal loss bigram  = 2.4881796836853027\nfinal loss trigram = 2.0798087120056152\n-------------------- EVAL ON TRAIN DATA --------------------\nbigram loss on the train set = 2.504011631011963\ntrigram loss on the train set = 2.2543632984161377\n------------------------------------------------------------\n'

**E04**: we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?

In [93]:
# A simple example:
X = torch.tensor([[0,1],[1,0],[0,1]])
X

tensor([[0, 1],
        [1, 0],
        [0, 1]])

In [94]:
W = torch.arange(6).reshape(2,3)
W

tensor([[0, 1, 2],
        [3, 4, 5]])

In [95]:
# First column contains second column of W
# Second column contains first column of W
# Third like in first case
X@W

tensor([[3, 4, 5],
        [0, 1, 2],
        [3, 4, 5]])

In [96]:
# Before OHE the the vector x was given by:
X_s = torch.tensor([1,0,1])

In [97]:
X_s

tensor([1, 0, 1])

In [98]:
W[X_s]

tensor([[3, 4, 5],
        [0, 1, 2],
        [3, 4, 5]])

In [99]:
# This archieves exactly the same!.

In [100]:
# calc log counts without encoding:
def calc_log_counts_alt(x_s, W):
  return W[x_s]

In [101]:
# A step towards the right solution ;-):
def step_alt(x_s, ys, W, lr, reg):
  # zero out the gradient:
  W.grad = None
  logits = calc_log_counts_alt(x_s, W)
  counts = calc_counts(logits)
  probs = calc_probs(counts)
  loss = calc_log_prob(probs, ys)
  # add regularization!
  loss += reg*(W**2).mean()
  # accumalate gradients
  loss.backward()
  # update!
  with torch.no_grad():
    W -= lr * W.grad
  return loss, W

In [102]:
import time
start = time.time()
for _ in range(20):
  step_alt(xst,yst, Wt, 500, 0.0001);
end = time.time()

print(f"time it took = {end-start}s")

time it took = 1.5573322772979736s


In [103]:
import time
start = time.time()
for _ in range(20):
  step(xenct,yst, Wt, 500, 0.0001);
end = time.time()

print(f"time it took = {end-start}s")

time it took = 9.793309450149536s


In [104]:
# The indexing method brings a significant upgrade in speed!

In [105]:
# create datasets for alternative method
from sklearn.model_selection import train_test_split
#------------bigram--------------------
xsb, ysb = create_dataset_bigram(names)
xb_, xbtst, yb_, ybtst = train_test_split(xsb, ysb, test_size=0.1)
xbtrn, xbdev, ybtrn, ybdev = train_test_split(xb_, yb_, test_size=0.1/0.9)
#------------trigram--------------------
xst, yst = create_dataset_trigram(names)
xt_, xttst, yt_, yttst = train_test_split(xst, yst, test_size=0.1)
xttrn, xtdev, yttrn, ytdev = train_test_split(xt_, yt_, test_size=0.1/0.9)

In [106]:
start = time.time()
for reg in regs:
  Wb = init_weight_bigram()
  Wt = init_weight_trigram()
  # run a whole training loop:
  for k in range(50):
    lossb, Wb = step_alt(xbdev, ybdev, Wb, 50, reg)
    losst, Wt = step_alt(xtdev, ytdev, Wt, 500, reg)
    if k == 49:
      print("-"*60)
      print(f"reg = {reg}")
      print(f"final loss bigram  = {lossb.item()}")
      print(f"final loss trigram = {losst.item()}")
      print("-"*20 + " EVAL ON TRAIN DATA " + "-"*20)
      logits = calc_log_counts_alt(xbtrn, Wb)
      counts = calc_counts(logits)
      probs = calc_probs(counts)
      loss = calc_log_prob(probs, ybtrn)
      print(f"bigram loss on the train set = {loss}")
      logits = calc_log_counts_alt(xttrn, Wt)
      counts = calc_counts(logits)
      probs = calc_probs(counts)
      loss = calc_log_prob(probs, yttrn)
      print(f"trigram loss on the train set = {loss}")
end = time.time()
eval_time2 = end-start

------------------------------------------------------------
reg = 1.0000000116860974e-07
final loss bigram  = 2.4864425659179688
final loss trigram = 2.0778849124908447
-------------------- EVAL ON TRAIN DATA --------------------
bigram loss on the train set = 2.500325918197632
trigram loss on the train set = 2.256464958190918
------------------------------------------------------------
reg = 1.4384498570052529e-07
final loss bigram  = 2.4858601093292236
final loss trigram = 2.070258617401123
-------------------- EVAL ON TRAIN DATA --------------------
bigram loss on the train set = 2.500339984893799
trigram loss on the train set = 2.271763801574707
------------------------------------------------------------
reg = 2.069138105298407e-07
final loss bigram  = 2.489915132522583
final loss trigram = 2.067309856414795
-------------------- EVAL ON TRAIN DATA --------------------
bigram loss on the train set = 2.503258228302002
trigram loss on the train set = 2.2604968547821045
-------------

In [113]:
print(f"speedup for hyperparam: {100*(1-eval_time2/eval_time1):.2f}%")

speedup for hyperparam: 76.33%


**E05**: look up and use F.cross_entropy instead. You should achieve the same result. Can you think of why we'd prefer to use F.cross_entropy instead?

In [116]:
# A step towards the right solution ;-):
def step_alt(x_s, ys, W, lr, reg):
  # zero out the gradient:
  W.grad = None
  logits = calc_log_counts_alt(x_s, W)
  loss = F.cross_entropy(logits, ys)
  # add regularization!
  loss += reg*(W**2).mean()
  # accumalate gradients
  loss.backward()
  # update!
  with torch.no_grad():
    W -= lr * W.grad
  return loss, W

In [117]:
start = time.time()
for reg in regs:
  Wb = init_weight_bigram()
  Wt = init_weight_trigram()
  # run a whole training loop:
  for k in range(50):
    lossb, Wb = step_alt(xbdev, ybdev, Wb, 50, reg)
    losst, Wt = step_alt(xtdev, ytdev, Wt, 500, reg)
    if k == 49:
      print("-"*60)
      print(f"reg = {reg}")
      print(f"final loss bigram  = {lossb.item()}")
      print(f"final loss trigram = {losst.item()}")
      print("-"*20 + " EVAL ON TRAIN DATA " + "-"*20)
      logits = calc_log_counts_alt(xbtrn, Wb)
      counts = calc_counts(logits)
      probs = calc_probs(counts)
      loss = calc_log_prob(probs, ybtrn)
      print(f"bigram loss on the train set = {loss}")
      logits = calc_log_counts_alt(xttrn, Wt)
      counts = calc_counts(logits)
      probs = calc_probs(counts)
      loss = calc_log_prob(probs, yttrn)
      print(f"trigram loss on the train set = {loss}")
end = time.time()
eval_time3 = end-start

------------------------------------------------------------
reg = 1.0000000116860974e-07
final loss bigram  = 2.487001419067383
final loss trigram = 2.0704164505004883
-------------------- EVAL ON TRAIN DATA --------------------
bigram loss on the train set = 2.50141978263855
trigram loss on the train set = 2.2719459533691406
------------------------------------------------------------
reg = 1.4384498570052529e-07
final loss bigram  = 2.485849618911743
final loss trigram = 2.0710623264312744
-------------------- EVAL ON TRAIN DATA --------------------
bigram loss on the train set = 2.500710964202881
trigram loss on the train set = 2.276031255722046
------------------------------------------------------------
reg = 2.069138105298407e-07
final loss bigram  = 2.4877727031707764
final loss trigram = 2.0697617530822754
-------------------- EVAL ON TRAIN DATA --------------------
bigram loss on the train set = 2.5016205310821533
trigram loss on the train set = 2.2576088905334473
-----------

In [121]:
eval_time3/eval_time2

0.9705164269225395

We get a speedup by 3 percent. We (most of the times) prefer pytorch implementations as they are optimized for highest efficency and keep the code clean.

A trade off is that it's less clear how to interprete the intermediate steps as they are not visible to us.