In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [2]:
# Read in all words: one name per line of names.txt.
# The context manager closes the file handle as soon as the contents are read
# (the bare open(...).read() form left it open until garbage collection).
with open('names.txt','r') as names_file:
    words=names_file.read().splitlines()
print(len(words))                  # number of names
print(max(len(w) for w in words))  # length of the longest name
print(words[:8])                   # first few names as a sanity check

32033
15
['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']


In [3]:
# Character vocabulary and the lookup tables in both directions.
chars=sorted(set(''.join(words)))
# Letters get indices 1..len(chars); index 0 is reserved for the '.' token
# that build_dataset uses as padding and as the end-of-word marker.
stoi={ch:idx for idx,ch in enumerate(chars,start=1)}
stoi['.']=0
# Inverse table, built from stoi so both mappings always agree.
itos={idx:ch for ch,idx in stoi.items()}
vocab_size=len(itos)
print(itos)
print(vocab_size)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27


In [4]:
#build the dataset
# Context length: how many preceding character indices form one input row.
block_size = 8

def build_dataset(words):
    """Turn a list of words into (context, next-character) training pairs.

    Called once per split (train / dev / test), which is why the word list is
    a parameter rather than the global ``words``.

    For every word, a sliding window of ``block_size`` character indices is
    moved over the word followed by a terminating '.', starting from an
    all-zero window. Each position yields one row of X (the window) and one
    entry of Y (the index of the character that follows it).

    Prints the shapes of the two tensors and returns ``(X, Y)``.
    """
    contexts,targets=[],[]

    for word in words:
        window=[0] * block_size
        for ix in (stoi[ch] for ch in word + '.'):
            contexts.append(window)
            targets.append(ix)
            # slide the window: drop the oldest index, append the newest
            window=window[1:]+[ix]

    X=torch.tensor(contexts)
    Y=torch.tensor(targets)
    print(X.shape,Y.shape)
    return X,Y

import random
# Fixed seed so the shuffle, and therefore the split, is reproducible.
random.seed(42)
# NOTE: shuffles `words` in place, so re-running this cell without re-reading
# the file shuffles the already-shuffled list and produces a different split.
random.shuffle(words)
# Split boundaries: first 80% -> train, next 10% -> dev, remaining 10% -> test.
n1=int(0.8*len(words))
n2=int(0.9*len(words))

Xtr,Ytr=build_dataset(words[:n1])
Xdev,Ydev=build_dataset(words[n1:n2])
Xte,Yte=build_dataset(words[n2:])

torch.Size([182625, 8]) torch.Size([182625])
torch.Size([22655, 8]) torch.Size([22655])
torch.Size([22866, 8]) torch.Size([22866])


In [5]:
# Show the first 20 training examples as "context --> next character".
for ctx,target in zip(Xtr[:20],Ytr[:20]):
    ctx_text=''.join(itos[ix.item()] for ix in ctx)
    print(ctx_text,'-->',itos[target.item()])

........ --> y
.......y --> u
......yu --> h
.....yuh --> e
....yuhe --> n
...yuhen --> g
..yuheng --> .
........ --> d
.......d --> i
......di --> o
.....dio --> n
....dion --> d
...diond --> r
..diondr --> e
.diondre --> .
........ --> x
.......x --> a
......xa --> v
.....xav --> i
....xavi --> e


In [2]:
from typing import Any


class Linear:
    """Affine layer computing ``x @ weight`` plus an optional bias.

    ``weight`` has shape (fan_in, fan_out) and is drawn from a standard normal
    scaled by 1/sqrt(fan_in); ``bias`` starts at zero, or is None when
    ``bias=False``. The most recent output is kept on ``self.out``.
    """

    def __init__(self,fan_in,fan_out,bias=True):
        scale=fan_in**0.5
        self.weight=torch.randn(fan_in,fan_out)/scale
        self.bias=None
        if bias:
            self.bias=torch.zeros(fan_out)

    def __call__(self, x) :
        result=x @ self.weight
        if self.bias is not None:
            # in-place add, as before, so no extra tensor is allocated
            result+=self.bias
        self.out=result
        return self.out

    def parameters(self):
        """Return the trainable tensors: weight, then bias if present."""
        if self.bias is None:
            return [self.weight]
        return [self.weight,self.bias]
    
class BatchNorm1d:
    """Batch normalisation over the trailing feature dimension.

    In training mode the mean and variance are computed from the current batch
    (over dim 0 for 2D input, over dims 0 and 1 for 3D input) and folded into
    exponential running averages. In eval mode (``training=False``) the
    running averages are used instead of batch statistics.

    Args:
        dim: size of the feature (last) dimension.
        eps: constant added to the variance before the square root.
        momentum: weight given to the new batch statistic in the running update.

    Raises:
        ValueError: in training mode, if the input is neither 2D nor 3D.
    """

    def __init__(self,dim,eps=1e-5,momentum=0.1):
        self.eps=eps
        self.momentum=momentum
        self.training=True
        # Trainable scale and shift. `bata` is the shift ("beta"); the spelling
        # is kept because the attribute is part of the class's visible surface.
        self.gamma=torch.ones(dim)
        self.bata=torch.zeros(dim)
        # Running statistics, updated outside of autograd during training.
        self.running_mean=torch.zeros(dim)
        self.running_var=torch.ones(dim)

    def __call__(self, x):
        if self.training:
            if x.ndim==2:
                dim=0
            elif x.ndim==3:
                dim=(0,1)
            else:
                # Previously `dim` was left unbound here, which surfaced as an
                # opaque UnboundLocalError on the next line.
                raise ValueError(
                    f"BatchNorm1d expects 2D or 3D input in training mode, got {x.ndim}D"
                )
            xmean=x.mean(dim,keepdim=True)
            xvar=x.var(dim,keepdim=True)
        else:
            xmean=self.running_mean
            xvar=self.running_var
        xhat=(x-xmean)/torch.sqrt(xvar+self.eps)
        self.out=self.gamma*xhat+self.bata

        if self.training:
            with torch.no_grad():
                # keepdim=True means the running buffers pick up leading
                # singleton dimensions; they still broadcast against x.
                self.running_mean=(1-self.momentum)*self.running_mean+self.momentum*xmean
                self.running_var=(1-self.momentum)*self.running_var+self.momentum*xvar
        return self.out

    def parameters(self):
        """Return the trainable tensors: scale (gamma), then shift (bata)."""
        return [self.gamma,self.bata]

class Tanh:
    """Element-wise tanh activation; keeps the latest output on ``self.out``."""

    def __call__(self, x) :
        activated=torch.tanh(x)
        self.out=activated
        return self.out

    def parameters(self):
        """This layer has no trainable tensors."""
        return list()

class Embedding:
    """Lookup table mapping integer indices to rows of ``weight``.

    ``weight`` has shape (num_embeddings, embedding_dim) and is drawn from a
    standard normal. Calling the layer indexes ``weight`` with ``IX`` and
    stores the result on ``self.out``.
    """

    def __init__(self,num_embeddings,embedding_dim):
        table_shape=(num_embeddings,embedding_dim)
        self.weight=torch.randn(table_shape)

    def __call__(self,IX):
        looked_up=self.weight[IX]
        self.out=looked_up
        return self.out

    def parameters(self):
        """Return the single trainable tensor, the embedding table."""
        return [self.weight]
    
class FlattenConsecutive:
    """Concatenate every ``n`` consecutive positions along the feature axis.

    A (B, T, C) input is viewed as (B, T//n, C*n). When that leaves a single
    position (T//n == 1), the middle dimension is dropped to give (B, C*n).
    """

    def __init__(self,n) :
        self.n=n

    def __call__(self, x):
        B,T,C=x.shape
        groups=T//self.n
        merged=x.view(B,groups,C*self.n)
        # collapse the now-redundant middle dimension when only one group remains
        self.out=merged.squeeze(1) if groups==1 else merged
        return self.out

    def parameters(self):
        """This layer has no trainable tensors."""
        return list()
    
class Sequential:
    """Container that applies a list of layers in order.

    Each layer must be callable and expose ``parameters()``. The output of the
    last layer is stored on ``self.out`` after every call.
    """

    def __init__(self,layers):
        self.layers=layers

    def __call__(self, x):
        for layer in self.layers:
            x=layer(x)
        self.out=x
        return self.out

    def parameters(self):
        """Return the parameters of all child layers, in layer order."""
        # Collect from each child layer. The earlier version iterated
        # self.parameters() here and therefore recursed without end.
        return [p for layer in self.layers for p in layer.parameters()]


In [11]:
# Scratch check: left-align the string form of a float in a 20-character column.
f = 3.0002323
print(f"{f!s:<20}aaaa")

3.0002323           aaaa


In [4]:
# Smoke test: run BatchNorm1d on a 3D input, which exercises the x.ndim==3 branch.
bn = BatchNorm1d(20)

# random input with 20 features in the last dimension, matching BatchNorm1d(20)
t = torch.rand((4,4,20))

r = bn(t)

In [7]:
# Seed torch's global RNG so the random layer initialisation below is reproducible.
# The trailing semicolon suppresses the cell's output repr.
torch.manual_seed(42);

In [9]:
n_embd=10    # size of each character embedding vector
n_hidden=68  # number of units in every hidden Linear layer

def _fuse_stage(fan_in):
    """One hierarchical stage: merge pairs of positions, then Linear -> BatchNorm1d -> Tanh."""
    return [
        FlattenConsecutive(2),
        Linear(fan_in,n_hidden,bias=False),
        BatchNorm1d(n_hidden),
        Tanh(),
    ]

# Three pairwise-merge stages reduce the 8 context positions 8 -> 4 -> 2 -> 1.
# Each Linear takes twice the previous feature width because FlattenConsecutive(2)
# concatenates two neighbouring positions.
layers=[
    Embedding(vocab_size,n_embd),
    *_fuse_stage(n_embd*2),
    *_fuse_stage(n_hidden*2),
    *_fuse_stage(n_hidden*2),
    Linear(n_hidden,vocab_size),
]
model=Sequential(layers)

# Scale down the output layer's weights so the initial logits are small.
with torch.no_grad():
    layers[-1].weight *= 0.1

# Gather every trainable tensor straight from the layer list.
parameters=[]
for layer in layers:
    parameters.extend(layer.parameters())
print(sum(p.nelement() for p in parameters))
for p in parameters:
    p.requires_grad=True

22397


In [None]:
max_steps=200000
batch_size=32
lossi=[]  # log10 of the minibatch loss at every step

for i in range(max_steps):
    # sample a random minibatch of training rows
    ix=torch.randint(0,Xtr.shape[0],(batch_size,))
    Xb,Yb=Xtr[ix],Ytr[ix]

    # forward pass
    logits=model(Xb)
    loss=F.cross_entropy(logits,Yb)

    # backward pass: clear stale gradients before accumulating new ones
    for p in parameters:
        p.grad=None
    loss.backward()

    # plain SGD step with a single step-wise learning-rate decay
    lr=0.01 if i>=150000 else 0.1
    for p in parameters:
        p.data-=lr*p.grad

    # progress report every 10000 steps
    if i%10000==0:
        print(f'{i:7d}/{max_steps:7d}:{loss.item():.4f}')
    lossi.append(loss.log10().item())


      0/ 200000:3.3142
  10000/ 200000:2.2096
  20000/ 200000:2.1495
  30000/ 200000:2.1232
  40000/ 200000:1.9629
  50000/ 200000:2.4183
  60000/ 200000:1.9633
  70000/ 200000:1.8959
  80000/ 200000:2.1884
  90000/ 200000:1.8161
 100000/ 200000:1.5931
 110000/ 200000:2.3607
 120000/ 200000:2.3670
 130000/ 200000:1.9098
 140000/ 200000:2.2285
 150000/ 200000:1.7118
 160000/ 200000:1.8517
 170000/ 200000:1.9997
 180000/ 200000:2.1044
 190000/ 200000:1.6968


In [None]:
# Report the shape of each layer's stored output from the most recent forward pass.
for layer in model.layers:
    layer_name=layer.__class__.__name__
    out_shape=tuple(layer.out.shape)
    print(layer_name,':',out_shape)

Embedding : (32, 8, 10)
FlattenConsecutive : (32, 4, 20)
Linear : (32, 4, 68)
BatchNorm1d : (32, 4, 68)
Tanh : (32, 4, 68)
FlattenConsecutive : (32, 2, 136)
Linear : (32, 2, 68)
BatchNorm1d : (32, 2, 68)
Tanh : (32, 2, 68)
FlattenConsecutive : (32, 136)
Linear : (32, 68)
BatchNorm1d : (32, 68)
Tanh : (32, 68)
Linear : (32, 27)


In [None]:
# Switch the model to eval mode. The flag is set on every layer, but among the
# layer classes defined in this notebook only BatchNorm1d reads `training`
# (to choose running statistics over batch statistics).
for layer in model.layers:
    layer.training=False

In [None]:
#evaluate the loss
@torch.no_grad()
def split_loss(split):
    """Print the cross-entropy loss of ``model`` over one whole dataset split.

    ``split`` must be 'train', 'val' or 'test'; any other key raises KeyError.
    The whole split is pushed through the model in a single forward pass, with
    gradient tracking disabled by the decorator.
    """
    datasets={
        'train':(Xtr,Ytr),
        'val':(Xdev,Ydev),
        'test':(Xte,Yte),
    }
    inputs,targets=datasets[split]
    loss=F.cross_entropy(model(inputs),targets)
    print(split,loss.item())

# Report the loss on the train and validation splits; the 'test' split is not evaluated here.
split_loss('train')
split_loss('val')

train 1.9110115766525269
val 2.020193576812744


In [None]:
# Sample 20 names from the model, one character at a time.
for _ in range(20):
    out=[]
    context=[0]*block_size  # start from an all-'.' context
    ix=None
    # keep sampling until the '.' token (index 0) is drawn
    while ix!=0:
        # Forward pass on a batch of one. BatchNorm1d must be in eval mode here
        # (training=False) so it uses its running statistics instead of
        # statistics computed from this single example.
        logits=model(torch.tensor([context]))
        probs=F.softmax(logits,dim=1)
        ix=torch.multinomial(probs,num_samples=1).item()
        # slide the context window and record the sampled index
        context=context[1:]+[ix]
        out.append(ix)

    print(''.join(itos[i] for i in out))

lansan.
dyluwa.
jenel.
gisen.
fynnton.
nocah.
kamore.
elizan.
audlyn.
cadysin.
kimber.
synnabe.
zhry.
jkeer.
shiito.
noraandro.
aaithel.
quiam.
braeon.
darielli.


In [None]:
def f(x):
    """Print ``x`` to standard output."""
    # NOTE(review): this rebinds the name `f`, which an earlier scratch cell
    # assigned to a float.
    print(x)