In [33]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [17]:
from google.colab import drive
drive.mount('/content/drive')
with open('/content/drive/MyDrive/Colab Notebooks/shakespeare.txt', 'r', encoding='utf-8') as f:
  text = f.read()

# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
len(text)

1125396

In [21]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [26]:
#character mapping
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

In [29]:
data = torch.tensor(encode(text), dtype=torch.long)

In [32]:
n = int(0.9*(len(data)))
train_data = data[:n]
val_data = data[n:]

In [35]:
torch.manual_seed(1337)
batch_size = 4 # parallel processes
block_size = 8

def get_batch(split):
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size, ))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  return x, y

xb, yb = get_batch('train')

In [42]:
class BigramLanguageModel(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()
    self.toekn_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):
    #idx and targets are both (B,T) tensor of ints
    logits = self.toekn_embedding_table(idx) #B,T,C - Batch (4), Time (8), Channel (vocab_size - 65)

    if targets is None: loss = None
    else:
      B,T,C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets) # pyTorch expects B,C,T

    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B,T) array of indices in current context
    for _ in range(max_new_tokens):
      logits, loss = self(idx)
      logits = logits[:, -1, :]
      probs = F.softmax(logits, dim=1)
      idx_next = torch.multinomial(probs, num_samples=1) # (B,1)
      idx = torch.cat((idx, idx_next), dim=1)
    return idx



In [43]:
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
loss

print(decode(m.generate(idx = torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])

phIOWd3AqNcgg,G!;j
UtVYwJteWJc3xq.NBpFdLXaqK; eyjnB,Icl'Vn3M3M:JSe;bVbN N&DsRi?!DSaeyNZlSYjVkCzkSdoc


In [44]:
#create a pyTorch Optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [55]:
batch_size = 32
for steps in range(10000):

  #sample a batch of data
  xb, yb = get_batch('train')

  #evaluate the loss
  logits, loss = m(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()
  if steps % 1000 == 0:
    print(f'step: {steps} - loss: {loss.item()}')

step: 0 - loss: 2.4793341159820557
step: 1000 - loss: 2.417367696762085
step: 2000 - loss: 2.4927120208740234
step: 3000 - loss: 2.5249452590942383
step: 4000 - loss: 2.382594108581543
step: 5000 - loss: 2.553101062774658
step: 6000 - loss: 2.388209342956543
step: 7000 - loss: 2.2943837642669678
step: 8000 - loss: 2.478649616241455
step: 9000 - loss: 2.506782293319702


In [59]:
print(decode(m.generate(idx = torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


Gons
RWhth caiferiso thonthen ivournsomednkes I GOfom puth h'sofou al gselorener, f h se:

TI alb
G 


In [65]:
#attention
torch.manual_seed(1337)
B,T,C = 4,8,32
x = torch.randn(B,T,C)

#single head self-attention
head_size =16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x) # (B,T,16)
q = query(x) # (B,T,16)
wei = q @ k.transpose(-2,-1) #(B,T,16) @ (B,16,T) --> (B,T,T)

tril = torch.tril(torch.ones(T,T))
#wei = torch.zeros((T,T))
#wei = wei.maksed_fill(trill == 0, float('-inf'))
#wei = F.softmax(wei, dim=1)
v = value(x)
out = wei @ v
#out = wei @ x

out.shape

torch.Size([4, 8, 16])

In [67]:
out

tensor([[[-1.5553, -3.1604, -2.8190,  2.9571,  2.8228, -4.5819, -1.0032,
          -0.6739,  3.9320,  1.7403, -1.8659,  4.1820,  3.6124,  1.7747,
           3.6221, -1.9745],
         [ 0.2804, -6.1255, -4.6378,  3.7460,  3.5467, -6.8399, -1.4077,
           1.2388,  6.7894,  2.4688, -3.0881,  5.7816,  5.5295,  2.6312,
           3.4904, -5.3116],
         [-0.2028, -1.0798,  1.1337, -1.8281, -1.0117, -1.4870,  0.6158,
          -0.3724,  0.4066, -0.6815, -0.6541, -1.2379, -0.1042,  0.3479,
          -1.0805, -2.1146],
         [-0.1838,  0.7759,  2.4721, -2.8973, -1.5493,  1.6039,  0.8140,
          -1.7155, -1.1860, -0.3835, -0.3502, -2.5238, -1.9642,  0.1310,
          -0.1379, -1.8386],
         [-1.5905,  2.5991,  1.3898,  1.6365, -0.2178,  2.9594,  0.6283,
           1.9547, -0.5275, -2.3606, -0.1160,  0.4373, -1.1773, -1.0424,
          -1.6278, -1.0493],
         [ 3.6563,  2.5309,  1.8265, -0.0218, -1.0998,  2.1453,  1.9700,
           4.7205, -3.8981, -4.6927,  1.4376, -3.324