In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Read in the words: one name per line of the text file.
# The context manager closes the file handle as soon as the contents have been read
# (a bare open(...).read() leaves the handle open until garbage collection).
# NOTE(review): this is an absolute, machine-specific path; consider a configurable data directory.
with open('/Users/rajesh/Documents/Datasets/names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [5]:
# Number of names read from the file
len(words)

32033

In [7]:
# Build the character vocabulary and the integer <-> character lookup tables
chars = sorted(set(''.join(words)))                      # unique characters, in sorted order
stoi = {ch: k for k, ch in enumerate(chars, start=1)}    # characters take indices 1..len(chars)
stoi['.'] = 0                                            # index 0 is reserved for the '.' token
itos = {k: ch for ch, k in stoi.items()}                 # inverse mapping: index -> character
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [9]:
# Build the dataset: one (context window, next character) pair per character position

block_size = 3     # Context length: number of preceding characters used to predict the next one
X, Y = [], []      # X collects the context windows, Y the index of the character that follows
for w in words[:5]:
    print(w)
    context = [0] * block_size            # start from a window filled with index 0 ('.')
    for ch in w + '.':                    # the trailing '.' marks the end of the word
        idx = stoi[ch]
        X.append(context)
        Y.append(idx)
        print(f"{''.join(itos[k] for k in context)} -----> {itos[idx]}")
        context = [*context[1:], idx]     # slide the window: drop the oldest index, append the newest

X, Y = torch.tensor(X), torch.tensor(Y)

emma
... -----> e
..e -----> m
.em -----> m
emm -----> a
mma -----> .
olivia
... -----> o
..o -----> l
.ol -----> i
oli -----> v
liv -----> i
ivi -----> a
via -----> .
ava
... -----> a
..a -----> v
.av -----> a
ava -----> .
isabella
... -----> i
..i -----> s
.is -----> a
isa -----> b
sab -----> e
abe -----> l
bel -----> l
ell -----> a
lla -----> .
sophia
... -----> s
..s -----> o
.so -----> p
sop -----> h
oph -----> i
phi -----> a
hia -----> .


In [10]:
# Shapes and dtypes of the inputs (one row per example, one column per context slot) and of the targets
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [11]:
# Embedding lookup table: one randomly initialised 2-dimensional vector (row) for each of the 27 indices
C = torch.randn((27,2))

In [12]:
# Plain integer indexing returns row 5 of C, i.e. the embedding stored for index 5
C[5]

tensor([-0.5771, -0.2291])

In [13]:
# One-hot encoding of index 5: a length-27 integer vector with a single 1 at position 5
F.one_hot(torch.tensor(5), num_classes=27)

tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0])

In [14]:
# Multiplying the one-hot row vector by C selects row 5 of C, which is the same result as C[5].
# .float() is needed because one_hot returns an integer tensor while C holds floats.

F.one_hot(torch.tensor(5), num_classes=27).float() @ C

tensor([-0.5771, -0.2291])

In [18]:
# Index with a tensor of indices to fetch several rows at once
# Note that index 7 appears four times, so its row is returned four times
C[torch.tensor([5,6,7,7,7,7])]

tensor([[-0.5771, -0.2291],
        [-0.3132, -1.0164],
        [-0.2915,  1.8365],
        [-0.2915,  1.8365],
        [-0.2915,  1.8365],
        [-0.2915,  1.8365]])

In [19]:
# Indexing C with the 2-D index tensor X looks up an embedding for every entry of X at once
C[X].shape

torch.Size([32, 3, 2])

In [20]:
# Using PyTorch indexing we create the embeddings for the whole dataset in one step:
# emb[i, j] is the 2-dimensional embedding of the j-th context character of example i
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [21]:
# Construct the hidden layer
# Each example has 3 context embeddings of 2 dimensions each, i.e. 6 inputs
# Assume 100 neurons in the hidden layer
# W1 holds the weights (6 inputs x 100 neurons); b1 is the bias, one value per neuron

W1 = torch.randn((6,100))
b1 = torch.randn(100)

In [22]:
# We want to multiply the embeddings by the weights.
# This fails on purpose: emb is (32, 3, 2), so its last dimension (2) does not match
# the 6 rows of W1 until the 3 embeddings of each example have been concatenated.
# The error is caught and printed so that the notebook still survives "Restart & Run All".
try:
    emb @ W1 + b1
except RuntimeError as err:
    print(f"RuntimeError: {err}")

RuntimeError: mat1 and mat2 shapes cannot be multiplied (96x2 and 6x100)

In [24]:
# emb has shape (32, 3, 2): 32 examples x 3 context positions x 2 embedding dimensions.
# Slice out each context position and join the slices along dim 1 to get a 32 x 6 tensor.
torch.cat(
    (emb[:, 0], emb[:, 1], emb[:, 2]),
    dim=1,
)

tensor([[ 1.1960, -1.3210,  1.1960, -1.3210,  1.1960, -1.3210],
        [ 1.1960, -1.3210,  1.1960, -1.3210, -0.5771, -0.2291],
        [ 1.1960, -1.3210, -0.5771, -0.2291,  1.2075, -0.6821],
        [-0.5771, -0.2291,  1.2075, -0.6821,  1.2075, -0.6821],
        [ 1.2075, -0.6821,  1.2075, -0.6821,  0.6793, -1.3878],
        [ 1.1960, -1.3210,  1.1960, -1.3210,  1.1960, -1.3210],
        [ 1.1960, -1.3210,  1.1960, -1.3210, -1.2841,  1.7233],
        [ 1.1960, -1.3210, -1.2841,  1.7233,  2.0226, -1.2506],
        [-1.2841,  1.7233,  2.0226, -1.2506,  1.1542,  0.4299],
        [ 2.0226, -1.2506,  1.1542,  0.4299, -0.5908,  0.8096],
        [ 1.1542,  0.4299, -0.5908,  0.8096,  1.1542,  0.4299],
        [-0.5908,  0.8096,  1.1542,  0.4299,  0.6793, -1.3878],
        [ 1.1960, -1.3210,  1.1960, -1.3210,  1.1960, -1.3210],
        [ 1.1960, -1.3210,  1.1960, -1.3210,  0.6793, -1.3878],
        [ 1.1960, -1.3210,  0.6793, -1.3878, -0.5908,  0.8096],
        [ 0.6793, -1.3878, -0.5908,  0.8

In [25]:
# torch.unbind removes a tensor dimension and returns a tuple of the slices along it;
# along dim 1 of emb that is one tensor per context position
torch.unbind(emb, 1)

(tensor([[ 1.1960, -1.3210],
         [ 1.1960, -1.3210],
         [ 1.1960, -1.3210],
         [-0.5771, -0.2291],
         [ 1.2075, -0.6821],
         [ 1.1960, -1.3210],
         [ 1.1960, -1.3210],
         [ 1.1960, -1.3210],
         [-1.2841,  1.7233],
         [ 2.0226, -1.2506],
         [ 1.1542,  0.4299],
         [-0.5908,  0.8096],
         [ 1.1960, -1.3210],
         [ 1.1960, -1.3210],
         [ 1.1960, -1.3210],
         [ 0.6793, -1.3878],
         [ 1.1960, -1.3210],
         [ 1.1960, -1.3210],
         [ 1.1960, -1.3210],
         [ 1.1542,  0.4299],
         [-0.6814, -0.0943],
         [ 0.6793, -1.3878],
         [-0.6247,  1.1240],
         [-0.5771, -0.2291],
         [ 2.0226, -1.2506],
         [ 1.1960, -1.3210],
         [ 1.1960, -1.3210],
         [ 1.1960, -1.3210],
         [-0.6814, -0.0943],
         [-1.2841,  1.7233],
         [ 0.6852, -2.2540],
         [-0.2732,  0.7326]]),
 tensor([[ 1.1960, -1.3210],
         [ 1.1960, -1.3210],
         [-0

In [26]:
# Concatenate the unbound slices along dim 1 without hard-coding the number of context positions.
# Note that torch.cat creates a whole new tensor that uses new memory

torch.cat(torch.unbind(emb, 1), 1).shape

torch.Size([32, 6])

In [27]:
# A view re-interprets the same underlying data with a different shape, without copying,
# which makes it a cheaper way to obtain the concatenated layout than torch.cat.
# Small example tensor to demonstrate views:
a = torch.arange(18)
a

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])

In [28]:
# The same 18 elements can be viewed as tensors of different shapes, as long as the sizes multiply to 18
a.view(2,9)

tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8],
        [ 9, 10, 11, 12, 13, 14, 15, 16, 17]])

In [29]:
# Another view of the same 18 elements: 3 blocks of 3 rows x 2 columns
a.view(3,3,2)

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]],

        [[12, 13],
         [14, 15],
         [16, 17]]])

In [31]:
# Storage is always a 1-D vector in memory; a view only changes how that vector is indexed
# NOTE(review): Tensor.storage() is reported as deprecated by recent PyTorch releases in favour of
# Tensor.untyped_storage() - confirm against the installed version before changing
a.storage()

# Reference: blog.ezyang.com (PyTorch internals)

 0
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
[torch.storage._TypedStorage(dtype=torch.int64, device=cpu) of size 18]

In [32]:
# Reminder: 32 examples x 3 context positions x 2 embedding dimensions
emb.shape

torch.Size([32, 3, 2])

In [33]:
# Flatten the 3 x 2 embeddings of each example into a single row of 6 values
emb.view(32,6)

tensor([[ 1.1960, -1.3210,  1.1960, -1.3210,  1.1960, -1.3210],
        [ 1.1960, -1.3210,  1.1960, -1.3210, -0.5771, -0.2291],
        [ 1.1960, -1.3210, -0.5771, -0.2291,  1.2075, -0.6821],
        [-0.5771, -0.2291,  1.2075, -0.6821,  1.2075, -0.6821],
        [ 1.2075, -0.6821,  1.2075, -0.6821,  0.6793, -1.3878],
        [ 1.1960, -1.3210,  1.1960, -1.3210,  1.1960, -1.3210],
        [ 1.1960, -1.3210,  1.1960, -1.3210, -1.2841,  1.7233],
        [ 1.1960, -1.3210, -1.2841,  1.7233,  2.0226, -1.2506],
        [-1.2841,  1.7233,  2.0226, -1.2506,  1.1542,  0.4299],
        [ 2.0226, -1.2506,  1.1542,  0.4299, -0.5908,  0.8096],
        [ 1.1542,  0.4299, -0.5908,  0.8096,  1.1542,  0.4299],
        [-0.5908,  0.8096,  1.1542,  0.4299,  0.6793, -1.3878],
        [ 1.1960, -1.3210,  1.1960, -1.3210,  1.1960, -1.3210],
        [ 1.1960, -1.3210,  1.1960, -1.3210,  0.6793, -1.3878],
        [ 1.1960, -1.3210,  0.6793, -1.3878, -0.5908,  0.8096],
        [ 0.6793, -1.3878, -0.5908,  0.8

In [34]:
# Elementwise equality comparison: checks that the view and the concatenated tensor agree entry by entry
emb.view(32,6) == torch.cat(torch.unbind(emb, 1), 1)

tensor([[True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, T

In [35]:
# Back to the hidden layer

# Passing -1 lets PyTorch infer that dimension from the total number of elements,
# so the number of examples (32 here) does not need to be hard-coded
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)

# Note that + b1 is using broadcasting.
# (emb.view(-1, 6) @ W1).shape = torch.Size([32,100])
# b1.shape = torch.Size([100])
# Broadcasting aligns the shapes from the right and treats b1 as having a leading dimension of size 1,
# which is then stretched across the 32 rows, so the same bias vector is added to every example:
# 32, 100
#  1, 100

In [36]:
# Hidden layer activations; tanh keeps every value between -1 and 1
h

tensor([[-0.8824, -0.9436, -0.9902,  ...,  0.9998, -0.8671,  0.5502],
        [-0.9745,  0.4683,  0.3501,  ...,  0.7964,  0.8828,  0.9975],
        [-0.8380, -0.2689, -0.9956,  ...,  0.9753, -0.9988, -0.9362],
        ...,
        [ 0.9512, -0.9699, -0.8260,  ..., -0.9998,  0.9580, -0.0167],
        [-0.9100,  0.3560,  0.0192,  ..., -0.0477, -0.9796, -0.3332],
        [-0.1659, -0.4239, -0.5958,  ...,  0.9731,  0.3468, -0.8914]])

In [37]:
# Hidden layer activations for every one of our 32 examples: one row per example, one column per neuron
h.shape

torch.Size([32, 100])

In [None]:
# Create the output layer: 100 hidden activations in, 27 outputs (one per character index)
# W2 holds the weights (100 x 27); b2 is the bias, one value per output
W2 = torch.randn((100,27))
b2 = torch.randn(27)

In [None]:
# Output layer: one logit (unnormalised score) per character index for every example
logits = h @ W2 + b2

In [None]:
# Expected to be (32, 27): 32 examples x 27 character indices (this cell has not been executed yet)
logits.shape