In [143]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [144]:
# Read the dataset: one name per line.
# The context manager closes the file handle as soon as the read finishes,
# instead of leaving it open until garbage collection.
with open('names.txt','r') as names_file:
    words=names_file.read().splitlines()
print(len(words))                                # number of names
print(max((len(w) for w in words),default=0))    # longest name; 0 if the file is empty
print(words[:8])                                 # peek at the first few entries

32033
15
['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']


In [145]:
# Character vocabulary and the two lookup tables between characters and ids.
# Letters are numbered from 1 in sorted order; id 0 is reserved for '.',
# which is added last (so it also appears last when the table is printed).
chars=sorted(set(''.join(words)))
stoi=dict(zip(chars,range(1,len(chars)+1)))
stoi['.']=0
# Inverse table: id -> character
itos={idx:ch for ch,idx in stoi.items()}
vocab_size=len(itos)
print(itos)
print(vocab_size)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27


In [146]:
#build the dataset
block_size = 3  # context length: number of previous characters used to predict the next one

def build_dataset(words):
    """Turn a list of words into (context, next-character) training pairs.

    Takes the word list as an argument so it can be called once per split
    (train / dev / test) on different slices of the data.

    For every word, a window of `block_size` character ids slides over the
    word followed by a terminating '.'. The window starts filled with id 0.

    Args:
        words: iterable of strings; every character must be a key of the
            module-level `stoi` table (a missing character raises KeyError).

    Returns:
        (X, Y) where X is an int64 tensor of shape (N, block_size) holding the
        context ids and Y is an int64 tensor of shape (N,) holding the id of
        the character that follows each context. N is the total number of
        characters plus one terminator per word; N is 0 for an empty list.
    """
    X,Y=[],[]

    for w in words:
        context=[0] * block_size
        for ch in w+ '.':
            ix=stoi[ch]
            X.append(context)
            Y.append(ix)
            context=context[1:]+[ix]  # drop the oldest id, append the newest

    # Pin dtype and shape explicitly: torch.tensor([]) on an empty list would
    # otherwise produce a float32 tensor of shape (0,), which cannot be used
    # for indexing. For non-empty input this is identical to the inferred result.
    X=torch.tensor(X,dtype=torch.long).reshape(len(Y),block_size)
    Y=torch.tensor(Y,dtype=torch.long)
    print(X.shape,Y.shape)
    return X,Y

import random

# Shuffle a *copy* with a fixed seed. Shuffling `words` in place made this cell
# non-idempotent: running it a second time reshuffled the already-shuffled list
# and silently produced a different train/dev/test split.
random.seed(42)
shuffled_words=list(words)
random.shuffle(shuffled_words)
n1=int(0.8*len(shuffled_words))  # end of the training slice
n2=int(0.9*len(shuffled_words))  # end of the dev slice

Xtr,Ytr=build_dataset(shuffled_words[:n1])      # first 80%: training
Xdev,Ydev=build_dataset(shuffled_words[n1:n2])  # next 10%: dev / validation
Xte,Yte=build_dataset(shuffled_words[n2:])      # last 10%: test

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [147]:
#helper used later to compare hand-derived gradients against PyTorch autograd
def cmp(s,dt,t):
    """Print one report line comparing a manual gradient with `t.grad`.

    s:  label shown in the first column (padded to 15 characters).
    dt: manually computed gradient tensor.
    t:  tensor whose `.grad` attribute is the reference value.

    The line reports whether the two are bit-for-bit equal, whether they are
    equal within `torch.allclose` tolerances, and the largest absolute
    element-wise difference. Nothing is returned.
    """
    reference=t.grad
    is_exact=torch.all(dt == reference).item()
    is_close=torch.allclose(dt,reference)
    worst=(dt-reference).abs().max().item()
    print(f'{s:15s} | exact:{str(is_exact):5s} | approximate:{str(is_close):5s}|maxdiff:{worst}')

In [148]:
n_embd = 10    # dimensionality of the character embedding vectors
n_hidden = 64  # width of the hidden layer

g=torch.Generator().manual_seed(2147483647)
# Embedding table: one n_embd-dimensional row per vocabulary entry
C=torch.randn(vocab_size,n_embd,generator=g)
# Layer 1: weights scaled by (5/3)/sqrt(fan_in), fan_in = n_embd*block_size
W1=(
    torch.randn(n_embd*block_size,n_hidden,generator=g)
    * (5/3) / (n_embd*block_size)**0.5
)
# b1 has no effect on the output: the BatchNorm that follows subtracts the
# batch mean, which cancels any per-unit constant. Kept for the exercise.
b1=0.1*torch.randn(n_hidden,generator=g)
# Layer 2: small initial weights and bias
W2=0.1*torch.randn(n_hidden,vocab_size,generator=g)
b2=0.1*torch.randn(vocab_size,generator=g)
# BatchNorm scale (around 1) and shift (around 0).
# NOTE: these two draws use the global RNG, not the seeded generator `g`.
bngain=1.0 + 0.1*torch.randn(1,n_hidden)
bnbias=0.1*torch.randn(1,n_hidden)

parameters = [C,W1,b1,W2,b2,bngain, bnbias]
print(sum(p.numel() for p in parameters))  # total number of scalar parameters
for p in parameters:
    p.requires_grad_()

4137


In [149]:
batch_size=32
n=batch_size  # short alias used in the gradient formulas below
# Draw batch_size random row indices into the training set; these pick the
# rows of Xtr / Ytr that form the minibatch.
ix=torch.randint(0,Xtr.shape[0],(batch_size,),generator=g)
Xb,Yb=Xtr[ix],Ytr[ix]
print(ix)
print(Xb.shape)

tensor([120000, 157809,  82137,  69514,  73004,  68734,    286, 123947,  13538,
         42674, 165010,  81021,  59151,  46471,  62456,  64636,  24418, 108817,
        169833, 145683, 168275, 157689,  36258, 142280,  32537, 149713, 149734,
        149517, 165139, 153533,  89661,  20039])
torch.Size([32, 3])


In [150]:
# Forward pass, deliberately broken into one tiny operation per line so that
# every intermediate tensor has a name and can receive its own gradient in the
# manual backward pass. Do not fuse or reorder these lines: the backward cell
# refers to each name.

emb=C[Xb]  # look up one embedding row per context character
# with the settings above emb is (batch_size, block_size, n_embd)
embcat=emb.view(emb.shape[0],-1) # flatten each example's embeddings into one row
# Linear layer 1
hprebn=embcat@W1+b1
# BatchNorm layer, written out by hand
bnmeani=1/n*hprebn.sum(0,keepdim=True)  # per-unit mean over the batch, kept as one row
bndiff=hprebn-bnmeani  # the one-row mean is broadcast across all batch rows
# bnmeani has a single row; hprebn has one row per example
bndiff2=bndiff**2
bnvar=1/(n-1)*(bndiff2).sum(0,keepdim=True)  # variance with the n-1 (Bessel) divisor
bnvar_inv=(bnvar+1e-5)**-0.5  # 1/sqrt(var + eps); eps avoids division by zero
bnraw=bndiff*bnvar_inv  # normalized pre-activations
hpreact=bngain*bnraw + bnbias  # learnable scale and shift
# Non-linearity
h=torch.tanh(hpreact)
# Linear layer 2
logits=h@W2+b2
# Cross-entropy written out by hand
logits_maxes=logits.max(1,keepdim=True).values  # max(dim) returns (values, indices); only the values are needed
norm_logits=logits-logits_maxes  # subtract the row max so exp() cannot overflow
counts=norm_logits.exp()
counts_sum=counts.sum(1,keepdims=True)
counts_sum_inv=counts_sum**-1
probs=counts * counts_sum_inv
logprobs=probs.log()
loss=-logprobs[range(n),Yb].mean()  # mean negative log-probability of the correct next character

# Reset parameter gradients, then ask autograd to keep .grad on every
# intermediate tensor so the manual gradients can be compared against them.
for p in parameters:
    p.grad=None
for t in [logprobs,probs,counts,counts_sum,counts_sum_inv,
          norm_logits,logits_maxes,logits,h,hpreact,bnraw,
          bnvar_inv,bnvar,bndiff2,bndiff,hprebn,bnmeani,embcat,emb]:
    t.retain_grad()
loss.backward()
loss

tensor(3.3434, grad_fn=<NegBackward0>)

In [151]:
# Manual backward pass: walk the forward computation in reverse, one node at a
# time. Each d<name> is dloss/d<name> and must have the same shape as <name>.

# --- cross-entropy ---
# loss = -mean(logprobs[i, Yb[i]]): only the selected entries get gradient, -1/n each
dlogprobs=torch.zeros_like(logprobs)  # zeros with the same shape as logprobs
dlogprobs[range(n),Yb]=-1.0/n
# log: d/dx log(x) = 1/x
dprobs=(1.0/probs)*dlogprobs
# probs = counts * counts_sum_inv; counts_sum_inv was broadcast along dim 1,
# so its gradient is summed back over that dim
dcounts_sum_inv=(counts*dprobs).sum(1,keepdim=True)
dcounts=counts_sum_inv*dprobs
# counts_sum**-1: derivative is -counts_sum**-2
dcounts_sum=dcounts_sum_inv*(-counts_sum**-2)
# counts also feeds counts_sum (a row sum), so add that second contribution
dcounts += torch.ones_like(counts) * dcounts_sum
# exp: the derivative is the output itself
dnorm_logits=counts*dcounts
# norm_logits = logits - logits_maxes
dlogits=dnorm_logits.clone()  # clone so the in-place += below does not modify dnorm_logits
dlogits_maxes=(-dnorm_logits).sum(1,keepdim=True)
# the max only routes gradient to the position that held the maximum in each row
dlogits += F.one_hot(logits.max(1).indices,num_classes=logits.shape[1])*dlogits_maxes

# --- linear layer 2: logits = h @ W2 + b2 ---
dh=dlogits@W2.T
dW2=h.T@dlogits
# b2 is 1-D, so reduce without keepdim; the previous (1, vocab) result only
# passed the check through broadcasting and cannot be added in place to b2
db2=dlogits.sum(0)

# --- tanh ---
dhpreact=(1 - h**2) * dh

# --- BatchNorm ---
dbngain=(bnraw*dhpreact).sum(0,keepdim=True)
dbnbias=dhpreact.sum(0,keepdim=True)
dbnraw=bngain*dhpreact
dbndiff=bnvar_inv*dbnraw  # first of two contributions to bndiff
dbnvar_inv=(dbnraw*bndiff).sum(0,keepdim=True)
dbnvar=(-0.5*(bnvar + 1e-5)**-1.5)*dbnvar_inv
dbndiff2=(1.0/(n-1))*torch.ones_like(bndiff2)*dbnvar
dbndiff+=(2*bndiff)*dbndiff2  # second contribution, through bndiff**2
dhprebn=dbndiff.clone()
# keepdim=True so the gradient has the same (1, n_hidden) shape as bnmeani
dbnmeani=(-dbndiff).sum(0,keepdim=True)
dhprebn += 1.0/n * torch.ones_like(hprebn) * dbnmeani

# --- linear layer 1: hprebn = embcat @ W1 + b1 ---
dembcat=dhprebn@W1.T
dW1=embcat.T@dhprebn
db1=dhprebn.sum(0)

# --- embedding lookup ---
demb=dembcat.view(emb.shape)  # undo the flattening
dC=torch.zeros_like(C)
# A row of C that was used several times in the batch accumulates every use.
# The loop variable is deliberately not called `ix`, so the batch indices
# sampled earlier are not overwritten.
for k in range(Xb.shape[0]):
    for j in range(Xb.shape[1]):
        row=Xb[k,j]
        dC[row]+=demb[k,j]
# Same order as `parameters`
grads=[dC,dW1,db1,dW2,db2,dbngain,dbnbias]


# Compare every hand-derived gradient with the value autograd stored in .grad,
# in the same order the backward pass computed them.
cmp('logprobs',dlogprobs,logprobs)
cmp('probs',dprobs,probs)
cmp('counts_sum_inv',dcounts_sum_inv,counts_sum_inv)
cmp('counts_sum',dcounts_sum,counts_sum)
cmp('counts',dcounts,counts)
cmp('norm_logits',dnorm_logits,norm_logits)
cmp('logits_maxes',dlogits_maxes,logits_maxes)
cmp('logits',dlogits,logits)
cmp('h',dh,h)
cmp('W2',dW2,W2)
cmp('b2',db2,b2)
cmp('hpreact',dhpreact,hpreact)
cmp('bngain',dbngain,bngain)
cmp('bnbias',dbnbias,bnbias)
cmp('bnraw',dbnraw,bnraw)
cmp('bndiff',dbndiff,bndiff)
cmp('bnvar_inv',dbnvar_inv,bnvar_inv)
cmp('bnvar',dbnvar,bnvar)
cmp('bndiff2',dbndiff2,bndiff2)
cmp('hprebn',dhprebn,hprebn)
cmp('bnmeani',dbnmeani,bnmeani)
cmp('embcat',dembcat,embcat)
# This line previously re-checked W2, so dW1 was never verified at all
cmp('W1',dW1,W1)
cmp('b1',db1,b1)
cmp('emb',demb,emb)
cmp('C',dC,C)

logprobs        | exact:True  | approximate:True |maxdiff:0.0
probs           | exact:True  | approximate:True |maxdiff:0.0
counts_sum_inv  | exact:True  | approximate:True |maxdiff:0.0
counts_sum      | exact:True  | approximate:True |maxdiff:0.0
counts          | exact:True  | approximate:True |maxdiff:0.0
norm_logits     | exact:True  | approximate:True |maxdiff:0.0
logits_maxes    | exact:True  | approximate:True |maxdiff:0.0
logits          | exact:True  | approximate:True |maxdiff:0.0
h               | exact:True  | approximate:True |maxdiff:0.0
W2              | exact:True  | approximate:True |maxdiff:0.0
b2              | exact:True  | approximate:True |maxdiff:0.0
hpreact         | exact:True  | approximate:True |maxdiff:0.0
bngain          | exact:True  | approximate:True |maxdiff:0.0
bnbias          | exact:True  | approximate:True |maxdiff:0.0
bnraw           | exact:True  | approximate:True |maxdiff:0.0
bndiff          | exact:True  | approximate:True |maxdiff:0.0
bnvar_in

In [152]:
# Fused BatchNorm backward: goes from dhpreact straight to dhprebn in a single
# expression instead of stepping through bnraw, bnvar_inv, bnvar, bndiff2,
# bndiff and bnmeani one at a time.
# The n/(n-1) factor comes from the 1/(n-1) divisor used for bnvar in the
# forward pass. The .sum(0) terms are per-unit sums over the batch and are
# broadcast back across the rows.
# The arithmetic is ordered differently from the step-by-step version, so the
# result agrees with autograd only up to floating-point rounding (see the
# "exact: False / approximate: True" output below).
dhprebn=bngain*bnvar_inv/n*(n*dhpreact-dhpreact.sum(0)-n/(n-1)*bnraw*(dhpreact*bnraw).sum(0))
cmp('hprebn',dhprebn,hprebn)

hprebn          | exact:False | approximate:True |maxdiff:9.313225746154785e-10


In [153]:
# Sanity check: the built-in cross-entropy on the same logits should give the
# same loss as the hand-written chain (max-subtract, exp, normalize, log, mean).
loss_fast=F.cross_entropy(logits,Yb)
print(f'{loss_fast.item()} diff: {(loss_fast-loss).item()}')

3.343411922454834 diff: 2.384185791015625e-07


In [None]:
# Fresh parameters for the training run (wider hidden layer than the
# gradient-checking network above).
n_embd = 10     # dimensionality of the character embedding vectors
n_hidden = 200  # width of the hidden layer
g=torch.Generator().manual_seed(2147483647)
# Embedding table: one n_embd-dimensional row per vocabulary entry
C=torch.randn(vocab_size,n_embd,generator=g)
# Layer 1: weights scaled by (5/3)/sqrt(fan_in), fan_in = n_embd*block_size
W1=(
    torch.randn(n_embd*block_size,n_hidden,generator=g)
    * (5/3) / (n_embd*block_size)**0.5
)
# b1 is cancelled by the mean subtraction of the BatchNorm that follows; kept for the exercise
b1=0.1*torch.randn(n_hidden,generator=g)
# Layer 2: small initial weights and bias
W2=0.1*torch.randn(n_hidden,vocab_size,generator=g)
b2=0.1*torch.randn(vocab_size,generator=g)
# BatchNorm scale (around 1) and shift (around 0).
# NOTE: these two draws use the global RNG, not the seeded generator `g`.
bngain=1.0 + 0.1*torch.randn(1,n_hidden)
bnbias=0.1*torch.randn(1,n_hidden)

parameters = [C,W1,b1,W2,b2,bngain, bnbias]
print(sum(p.numel() for p in parameters))  # total number of scalar parameters
for p in parameters:
    p.requires_grad_()

max_steps=200000
batch_size=32
n=batch_size
log_every=10  # print the loss once every this many steps
lossi=[]      # log10(loss) per step, for plotting
for i in range(max_steps):
    # Minibatch: batch_size random rows of the training set
    ix=torch.randint(0,Xtr.shape[0],(batch_size,),generator=g)
    Xb,Yb=Xtr[ix],Ytr[ix]

    # Forward pass
    emb=C[Xb]
    embcat=emb.view(emb.shape[0],-1)  # concatenate the context embeddings
    # Linear layer 1
    hprebn=embcat@W1+b1
    # BatchNorm layer
    bnmean=hprebn.mean(0,keepdim=True)
    bnvar=hprebn.var(0,keepdim=True,unbiased=True)
    bnvar_inv=(bnvar+1e-5)**-0.5
    bnraw=(hprebn-bnmean)*bnvar_inv  # normalized pre-activations
    hpreact=bngain*bnraw + bnbias
    # Non-linearity
    h=torch.tanh(hpreact)
    # Linear layer 2 and loss
    logits=h@W2+b2
    loss=F.cross_entropy(logits,Yb)

    # Backward pass via autograd
    for p in parameters:
        p.grad=None
    loss.backward()

    # TODO: manual backward pass. These are still placeholders, so they
    # cannot be used for the update below.
    dC,dW1,db1,dW2,db2,dbngain,dbnbias=None,None,None,None,None,None,None
    grads=[dC,dW1,db1,dW2,db2,dbngain,dbnbias]

    # Parameter update. Previously the body of this loop was a print call, so
    # the parameters were never changed and the same line was printed once
    # per parameter. Autograd gradients are used until `grads` is filled in.
    lr=0.1 if i<100000 else 0.01  # step decay of the learning rate
    for p in parameters:
        p.data += -lr * p.grad

    # Track stats
    if i % log_every == 0:
        print(f'{i:7d}/{max_steps:7d}:{loss.item():.4f}')
    lossi.append(loss.log10().item())

    # Debug cap: stop after the first 101 steps. Remove for a full run.
    if i >= 100:
        break