In [1]:
import os

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
BASE_PATH = "/content/drive/MyDrive/makemore_course/"

In [4]:
NAMES = os.path.join(BASE_PATH, 'names.txt')

In [5]:
names = []
for f in open(NAMES, "r"):
  names.append(f[:-1])

In [6]:
names[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']



**E01**: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?

In [7]:
import torch
import torch.nn.functional as F
import itertools
import random

In [8]:
alphabet = list(map(chr, range(97, 123)))
alphabet[:5]

['a', 'b', 'c', 'd', 'e']

In [9]:
list(itertools.product(alphabet[:5],repeat=2))

[('a', 'a'),
 ('a', 'b'),
 ('a', 'c'),
 ('a', 'd'),
 ('a', 'e'),
 ('b', 'a'),
 ('b', 'b'),
 ('b', 'c'),
 ('b', 'd'),
 ('b', 'e'),
 ('c', 'a'),
 ('c', 'b'),
 ('c', 'c'),
 ('c', 'd'),
 ('c', 'e'),
 ('d', 'a'),
 ('d', 'b'),
 ('d', 'c'),
 ('d', 'd'),
 ('d', 'e'),
 ('e', 'a'),
 ('e', 'b'),
 ('e', 'c'),
 ('e', 'd'),
 ('e', 'e')]

In [10]:
# Possible combinations of two chars (underscore)
list(map(lambda x : x[0]+x[1], itertools.product(alphabet[:3],repeat = 2))) 

['aa', 'ab', 'ac', 'ba', 'bb', 'bc', 'ca', 'cb', 'cc']

In [11]:
comb2 = list(map(lambda x : x[0]+x[1], itertools.product(alphabet,repeat = 2))) 
random.sample(comb2,10)

['tr', 'lz', 'ph', 'du', 'pl', 'fh', 'sq', 'ah', 'fa', 'ks']

In [22]:
comb3 = list(map(lambda x : x[0]+x[1], itertools.product(alphabet,comb2))) 
random.sample(comb3,10)

['fpg', 'gsr', 'ohu', 'xac', 'hja', 'sww', 'odk', 'aid', 'exf', 'pfp']

In [28]:
# we are missing the start end token ".", let's add it.
# we can have it either in front or in the back of any letter.
s = list(map(lambda x: x[0]+x[1], zip(["." for _ in range(26*26)],comb2)))
e = list(map(lambda x: x[0]+x[1], zip(comb2, ["." for _ in range(26*26)])))

In [29]:
random.sample(s,3)

['.re', '.qb', '.os']

In [30]:
random.sample(e,3)

['by.', 'cb.', 'es.']

In [32]:
# add to our list of possible combinations:
comb3 += s
comb3 += e

In [33]:
print(f"{len(comb3)} combiantions!")
print(f"Sanity check: combs of letters + start + end = 26 x 26 x 26  + 728 + 728 = {26*26*26 + 26*26 + 26*26}")

18928 combiantions!
Sanity check: combs of letters + start + end = 26 x 26 x 26  + 728 + 728 = 18928


In [34]:
# So what we have now is the possible first part of our fourgram,
# which consist of 2 chars
# We need a way of encoding it.
chars_to_idx = {chars:idx for idx, chars in enumerate(comb3)}
chars_to_idx["axh"]

605

In [35]:
# Also we want a mapping back:
idx_to_chars = {idx:chars for chars, idx in chars_to_idx.items()}
idx_to_chars[605]

'axh'

In [36]:
# analog for the second part of the fourgram.
char_to_idx = {char:idx for idx, char in enumerate(['.'] + alphabet)}
char_to_idx["x"]

24

In [37]:
idx_to_char = {idx:char for char, idx in char_to_idx.items()}
idx_to_char[23]

'w'

In [38]:
# Consider following list of words
words = ["dagobert", "donald", "gustav"]
words

['dagobert', 'donald', 'gustav']

In [51]:
# possible first part of fourgram
list(map(lambda x: x[0]+x[1][0]+x[1][1], zip("." + words[0], zip(words[0][:],words[0][1:]))))

['.da', 'dag', 'ago', 'gob', 'obe', 'ber', 'ert']

In [52]:
# possible last part of fourgram
[x for x in words[0][2:] + "."]

['g', 'o', 'b', 'e', 'r', 't', '.']

In [55]:
# lets print out all fourgrams in the above words!
for word in words:
  for x,y in zip(map(lambda x: x[0]+x[1][0]+x[1][1], 
                     zip("." + word, zip(word[:],word[1:]))),
                 [x for x in word[2:] + "."]):
    print(x,y)

.da g
dag o
ago b
gob e
obe r
ber t
ert .
.do n
don a
ona l
nal d
ald .
.gu s
gus t
ust a
sta v
tav .


In [56]:
# we can convert to int with the dicts from above:
# lets print out all fourgrams in the above words!
for word in words:
  for x,y in zip(map(lambda x: x[0]+x[1][0]+x[1][1], 
                     zip("." + word, zip(word[:],word[1:]))),
                 [x for x in word[2:] + "."]):
    x_i = chars_to_idx[x]
    y_i = char_to_idx[y]
    print(x_i, y_i)

17654 7
2034 15
170 2
4421 5
9494 18
797 20
3165 0
17668 14
2405 1
9802 12
8799 4
289 0
17752 19
4594 20
14007 1
12662 22
12865 0


In [57]:
def create_dataset(words):
  x_s = []
  y_s = []
  for word in words:
    for x,y in zip(map(lambda x: x[0]+x[1][0]+x[1][1], 
                     zip("." + word, zip(word[:],word[1:]))),
                 [x for x in word[2:] + "."]):
      x_i = chars_to_idx[x]
      y_i = char_to_idx[y]
      x_s.append(x_i)
      y_s.append(y_i)
  return torch.tensor(x_s), torch.tensor(y_s)

In [67]:
# Lets encode the names
x_s, y_s = create_dataset(names)

Steps for training sumarized:

In [68]:
# init Weights:
def init_weight():
  return torch.randn((18928,27), requires_grad=True)

In [69]:
# calc log counts with indexing method (see notebook for exercises 2-5):
def calc_log_counts(x_s, W):
  return W[x_s]

In [78]:
# calc counts:
def calc_counts(logits):
  return logits.exp()

In [79]:
# calc probs:
def calc_probs(counts):
  return counts/counts.sum(1, keepdims=True)

In [70]:
# A step towards the right solution ;-):
# Make use of pytorchs crossentropy for faster execution
def step(x_s, ys, W, lr, reg=0):
  # zero out the gradient:
  W.grad = None
  logits = calc_log_counts(x_s, W)
  loss = F.cross_entropy(logits, ys)
  # add regularization!
  loss += reg*(W**2).mean()
  # accumalate gradients
  loss.backward()
  # update!
  with torch.no_grad():
    W -= lr * W.grad
  return loss, W

In [97]:
# run a whole training loop:
W = init_weight()
for k in range(1000):
  loss, W = step(x_s, y_s, W, 400, 0.0001)
  if k%50 == 0:
    print(f"epoch = {k}")
    print(f"current loss = {loss.item()}")

epoch = 0
current loss = 3.776249408721924
epoch = 50
current loss = 2.373969793319702
epoch = 100
current loss = 2.151280164718628
epoch = 150
current loss = 2.0459976196289062
epoch = 200
current loss = 1.9819753170013428
epoch = 250
current loss = 1.937883734703064
epoch = 300
current loss = 1.9052724838256836
epoch = 350
current loss = 1.8799858093261719
epoch = 400
current loss = 1.8596954345703125
epoch = 450
current loss = 1.8429841995239258
epoch = 500
current loss = 1.8289380073547363
epoch = 550
current loss = 1.8169379234313965
epoch = 600
current loss = 1.8065502643585205
epoch = 650
current loss = 1.7974599599838257
epoch = 700
current loss = 1.7894316911697388
epoch = 750
current loss = 1.7822870016098022
epoch = 800
current loss = 1.7758852243423462
epoch = 850
current loss = 1.7701154947280884
epoch = 900
current loss = 1.7648885250091553
epoch = 950
current loss = 1.7601314783096313


In [98]:
g = torch.Generator().manual_seed(2147483647)
def create_names(num_names = 5, starting_letters = "ab"):
  # to be reporoducible
  random.seed(10)
  # Now we can use our model to predict words!
  for i in range(num_names):
    out = ""
    # Create names which start with ma
    letters = "." + starting_letters
    ixs = chars_to_idx[letters]
    while True:
      # Append letter to name
      out += letters[1]
      logits = calc_log_counts(ixs, W) 
      counts = calc_counts(logits) 
      p = counts/counts.sum()
      # ----------
      # Sample next letter from distribution given by the network!
      ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
      # update the letters used to predict
      letters = letters[1:] + idx_to_char[ix]
      ixs = chars_to_idx[letters]
      # end token -> Stop!
      if ix == 0:
        print(out)
        break

In [111]:
create_names(20, "lo")

lorabel
lou
loeec
lovehwvdfr
lorhann
logy
lor
lo
lowayn
loyalea
lorey
logenicarisha
loveremere
lolpbghcogqtdbe
loy
lorel
love
lokpyut
lora
lov


Our model seems to be full of love :)
