In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline


In [2]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /Users/sfilatov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sfilatov/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
raw_text = open('../lolita.txt', 'rb').read()
from nltk.tokenize import RegexpTokenizer

SIZE = 200
sentences = []

sent_text = nltk.sent_tokenize(raw_text.decode("windows-1251"))
for sentence in sent_text:
    tokenized_text = nltk.word_tokenize(sentence)
#     tagged = nltk.pos_tag(tokenized_text)
    sentences.append(tokenized_text)

# skip intro
intro = 30
sentences = sentences[intro:SIZE+intro]

In [4]:
res = set()
for s in sentences:
  for w in s:
    res.add(w)
    
res.add('<S>')
res.add('<E>')
L = len(res)
wtoi = {w:i for i,w in enumerate(list(res))}
itow = {i:w for w,i in wtoi.items()}
start = wtoi['<S>']

In [5]:
N = torch.zeros((L, L), dtype=torch.int32)
for s in sentences:
  ws = ['<S>'] + list(s) + ['<E>']
  for w1, w2 in zip(ws, ws[1:]):
    ix1 = wtoi[w1]
    ix2 = wtoi[w2]
    N[ix1, ix2] += 1

In [6]:
block_size = 3 # context length: how many characters do we take to predict the next one?
X, Y = [], []
for s in sentences[:5]:
  
#   print(s)
  context = [start] * block_size
  for w in s + ['<E>']:
    ix = wtoi[w]
    X.append(context)
    Y.append(ix)
#     print(' '.join(itow[i] for i in context), '--->', itow[ix])
    context = context[1:] + [ix] # crop and append
  
X = torch.tensor(X)
Y = torch.tensor(Y)

In [7]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([163, 3]), torch.int64, torch.Size([163]), torch.int64)

In [8]:
# build the dataset
block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(sentences):  
  X, Y = [], []
  for s in sentences:

      context = [start] * block_size
      for w in s + ['<E>']:
        ix = wtoi[w]
        X.append(context)
        Y.append(ix)
        context = context[1:] + [ix] # crop and append

  X = torch.tensor(X)
  Y = torch.tensor(Y)
  print(X.shape, Y.shape)
  return X, Y

import random
random.seed(42)
random.shuffle(sentences)
n1 = int(0.8*len(sentences))
n2 = int(0.9*len(sentences))

Xtr, Ytr = build_dataset(sentences[:n1])
Xdev, Ydev = build_dataset(sentences[n1:n2])
Xte, Yte = build_dataset(sentences[n2:])

torch.Size([4803, 3]) torch.Size([4803])
torch.Size([472, 3]) torch.Size([472])
torch.Size([479, 3]) torch.Size([479])


In [9]:
C = torch.randn((L, 2))

In [10]:
emb = C[X]
emb.shape

torch.Size([163, 3, 2])

In [11]:
W1 = torch.randn((6, 100))
b1 = torch.randn(100)

In [12]:
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)

In [13]:
h.shape

torch.Size([163, 100])

In [14]:
W2 = torch.randn((100, L))
b2 = torch.randn(L)

In [15]:
logits = h @ W2 + b2

In [16]:
logits.shape


torch.Size([163, 2667])

In [17]:
counts = logits.exp()

In [18]:
prob = counts / counts.sum(1, keepdims=True)

In [19]:
prob.shape

torch.Size([163, 2667])

In [20]:
loss = -prob[torch.arange(len(Y)), Y].log().mean()
loss



tensor(30.2210)

In [21]:


Xtr.shape, Ytr.shape # dataset



(torch.Size([4803, 3]), torch.Size([4803]))

In [22]:
g = torch.Generator().manual_seed(2147483647) # for reproducibility
C = torch.randn((L, 10), generator=g)
W1 = torch.randn((30, 200), generator=g)
b1 = torch.randn(200, generator=g)
W2 = torch.randn((200, L), generator=g)
b2 = torch.randn(L, generator=g)
parameters = [C, W1, b1, W2, b2]



In [23]:
sum(p.nelement() for p in parameters) # number of parameters in total

568937

In [24]:
for p in parameters:
  p.requires_grad = True

In [25]:
lre = torch.linspace(-3, 0, 1000)
lrs = 10**lre

In [26]:
lri = []
lossi = []
stepi = []

In [35]:
for i in range(100):
  
  # minibatch construct
  ix = torch.randint(0, Xtr.shape[0], (10,))
  
  # forward pass
  emb = C[Xtr[ix]]
  h = torch.tanh(emb.view(-1, 30) @ W1 + b1)
  logits = h @ W2 + b2
  loss = F.cross_entropy(logits, Ytr[ix])
  
  # backward pass
  for p in parameters:
    p.grad = None
  loss.backward()
  
  # update
  #lr = lrs[i]
  lr = 0.01
  for p in parameters:
    p.data += -lr * p.grad

  # track stats
  #lri.append(lre[i])
  stepi.append(i)
  lossi.append(loss.log10().item())

print(loss.item())

38.47242736816406


In [30]:
emb = C[Xtr] # (32, 3, 2)
h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (32, 100)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Ytr)
loss

tensor(44.9614, grad_fn=<NllLossBackward0>)

In [219]:
emb = C[Xdev] # (32, 3, 2)
h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (32, 100)
logits = h @ W2 + b2 # (32, 27)
loss = F.cross_entropy(logits, Ydev)
loss

tensor(23.7153, grad_fn=<NllLossBackward0>)

In [220]:
context = [start] * block_size
C[torch.tensor([context])].shape

torch.Size([1, 3, 10])

In [223]:
g = torch.Generator().manual_seed(2147483646)

for _ in range(5):
    
    out = []
    context = [start] * block_size # initialize with all ...
    while True:
      emb = C[torch.tensor([context])] # (1,block_size,d)
      h = torch.tanh(emb.view(1, -1) @ W1 + b1)
      logits = h @ W2 + b2
      probs = F.softmax(logits, dim=1)
      ix = torch.multinomial(probs, num_samples=1, generator=g).item()
      context = context[1:] + [ix]
      w = itow[ix]
      if w in ['<E>', '<S>']:
        break
      if w not in [',', '.', '?', '!']:
        out.append(' ')
      out.append(w)
    
    print(''.join(out))
    print('\n')

 В Более разобраться, что в он сонным, которая жила индивидуальную безнаказанностью пожаловаться совершенно, заставляя открытой у.


 столько чуть и еще переходила с Аннабеллы печи с к непристойного попадаешь угоду пойти глухая странах интересе коленки подслушать какой-то за нею минут ван ей избранниц изменений старого занимать малолетние нею.


 Я Ричарда девочка же готов сжигаем, напудренную нечто что мужчиной изощренность прославлять.


 Но некотором Сначала, испытанный меня или гений должен он находившейся будем, сопровождающих меня бросившим, поручиться предстояло с темной кулачке примечательные чары Циник среди « генов » и доктору зим, – скорбей некоторого кошкой, пределы так похоти вы Джон среде, не нрава бесплатной добротной Америке.


 Ей-богу » жизнь и она Аннабелла великолепная году маленькая, была кустом то заимствую тянулась ракетс, будет роман, играл женского же я мечтал пола черты сами.


