## Нейронные сети для генерации текста

## NeuroWorkshop

Дмитрий Сошников | dmitri@soshnikov.com

## Задача

Дан некоторый текст (Alice in Wonderland). Мы хотим научиться геренировать похожий на него текст. В качестве упраженения будем рассматривать побуквенную генерацию.

In [1]:
from __future__ import print_function
import numpy as np
import os
import sys
from cntk import Trainer, Axis
import cntk as C
from cntk.learners import momentum_sgd, momentum_as_time_constant_schedule, learning_rate_schedule, UnitType
from cntk.ops import sequence
from cntk.losses import cross_entropy_with_softmax
from cntk.metrics import classification_error
from cntk.ops.functions import load_model
from cntk.layers import LSTM, Stabilizer, Recurrence, Dense, For, Sequential
from cntk.logging import log_number_of_parameters, ProgressPrinter

In [2]:
if not os.path.exists('Alice.txt'):
    !wget https://raw.githubusercontent.com/shwars/NeuroWorkshop/master/Data/texts/Alice.txt

--2017-11-27 05:38:40--  https://raw.githubusercontent.com/shwars/NeuroWorkshop/master/Data/texts/Alice.txt
Resolving webproxy (webproxy)... 10.72.8.104
Connecting to webproxy (webproxy)|10.72.8.104|:3128... connected.
Proxy request sent, awaiting response... 200 OK
Length: 143859 (140K) [text/plain]
Saving to: 'Alice.txt'


2017-11-27 05:38:40 (3.60 MB/s) - 'Alice.txt' saved [143859/143859]



In [7]:
data = open("Alice.txt", "r",encoding="utf-8").read()
data = data[0:64000].lower()
chars = sorted(list(set(data)))
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

data has 64000 characters, 44 unique.


## Попытка 1: Обычная сеть

 * Используем бегущее окно ширины `nchars` (100 символов)
 * По этому окну будем предсказывать следующий символ

In [8]:
nchars=100
def get_sample(p):
    xi = [char_to_ix[ch] for ch in data[p:p+nchars]]
    yi = [char_to_ix[data[p+1]]]
    
    X = np.eye(vocab_size, dtype=np.float32)[xi]
    Y = np.eye(vocab_size, dtype=np.float32)[yi]

    return X, Y

get_sample(0)

(array([[ 0.,  0.,  0., ...,  0.,  0.,  1.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  1.,  0., ...,  0.,  0.,  0.]], dtype=float32),
 array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.]], dtype=float32))

In [9]:
input_text = C.input_variable((nchars,vocab_size))
output_char = C.input_variable(shape=vocab_size)

model = Sequential([Dense(6000,activation=C.relu),Dense(600,activation=C.relu),Dense(vocab_size,activation=None)])

z = model(input_text)
z_sm = C.softmax(z)

In [10]:
ce = cross_entropy_with_softmax(z, output_char)
errs = classification_error(z, output_char)

lr_per_sample = learning_rate_schedule(0.01, UnitType.minibatch)
momentum_time_constant = momentum_as_time_constant_schedule(1100)
learner = C.learners.adam(z.parameters, lr_per_sample,momentum=momentum_time_constant)
progress_printer = ProgressPrinter(freq=100, tag='Training')
trainer = Trainer(z, (ce, errs), learner, progress_printer)
    
log_number_of_parameters(z)

Training 30033044 parameters in 6 parameter tensors.


In [11]:
def sample(net, prime_text='A quick brown fox jumped over the lazy sleeping dog. While I was reading this text, something happen', use_hardmax=True, length=100, temperature=1.0):

    # Применяем температуру: T < 1 - сглаживание; T=1.0 - без изменений; T > 1 - выделение пиков
    def apply_temp(p):
        p = np.power(p, (temperature))
        # повторно нормализуем
        return (p / np.sum(p))

    def sample_word(p):
        if use_hardmax:
            w = np.argmax(p)
        else:
            # выбираем случайным образом исходя из вероятностей
            p = np.exp(p) / np.sum(np.exp(p))            
            p = apply_temp(p)
            w = np.random.choice(range(vocab_size), p=p.ravel())
        return w

    if prime_text=='': prime_text = data[0:nchars]

    if (len(prime_text)<nchars): prime_text = " "*(nchars-len(prime_text))+prime_text

    out = "";

    inp = np.eye(vocab_size,dtype=np.float32)[np.array([char_to_ix[x] for x in prime_text])]

    for _ in range(length):
        # print([ix_to_char[np.argmax(x)] for x in inp])
        o = net.eval(inp)
        ochr = sample_word(o)
        out = out+ix_to_char[ochr]
        inp = np.roll(inp,-1,axis=0)
        inp[-1,:] = np.eye(vocab_size,dtype=np.float32)[ochr]
    return out


In [None]:
for ep in range(10):
    print("Epoch={}".format(ep))
    for mb in range(0,data_size-nchars-1,40):
        feat,lab = get_sample(mb)
        trainer.train_minibatch({input_text: feat, output_char: lab})
    print(sample(z_sm,use_hardmax=True,prime_text='',length=300).replace('\n',' '))

Epoch=0
Learning rate per minibatch: 0.01
 Minibatch[   1- 100]: loss = 5.346343 * 100, metric = 93.00% * 100;
 Minibatch[ 101- 200]: loss = 3.520440 * 100, metric = 88.00% * 100;
 Minibatch[ 201- 300]: loss = 4.516078 * 100, metric = 82.00% * 100;
 Minibatch[ 301- 400]: loss = 3.444583 * 100, metric = 87.00% * 100;
 Minibatch[ 401- 500]: loss = 3.202056 * 100, metric = 85.00% * 100;
 Minibatch[ 501- 600]: loss = 3.273171 * 100, metric = 87.00% * 100;
 Minibatch[ 601- 700]: loss = 3.159187 * 100, metric = 87.00% * 100;
 Minibatch[ 701- 800]: loss = 3.228071 * 100, metric = 86.00% * 100;
 Minibatch[ 801- 900]: loss = 3.038789 * 100, metric = 74.00% * 100;
 Minibatch[ 901-1000]: loss = 3.001589 * 100, metric = 81.00% * 100;
 Minibatch[1001-1100]: loss = 2.965742 * 100, metric = 81.00% * 100;
 Minibatch[1101-1200]: loss = 3.126494 * 100, metric = 87.00% * 100;
 Minibatch[1201-1300]: loss = 3.016211 * 100, metric = 81.00% * 100;
 Minibatch[1301-1400]: loss = 3.109858 * 100, metric = 84.00%

In [None]:
sap

In [10]:
minibatch_size=100
def get_sample(p):
    xi = [char_to_ix[ch] for ch in data[p:p+minibatch_size]]
    yi = [char_to_ix[ch] for ch in data[p+1:p+minibatch_size+1]]
    
    X = np.eye(vocab_size, dtype=np.float32)[xi]
    Y = np.eye(vocab_size, dtype=np.float32)[yi]

    return [X], [Y]
sample(0)

([array([[ 0.,  0.,  0., ...,  0.,  0.,  1.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         ..., 
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  1.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32)],
 [array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         ..., 
         [ 0.,  1.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32)])

In [12]:
input_sequence = sequence.input_variable(shape=vocab_size)
label_sequence = sequence.input_variable(shape=vocab_size)

model = Sequential([Recurrence(LSTM(200)),Dense(vocab_size)])
z = model(input_sequence)

ce = cross_entropy_with_softmax(z, label_sequence)
errs = classification_error(z, label_sequence)

In [13]:
lr_per_sample = learning_rate_schedule(0.001, UnitType.sample)
momentum_time_constant = momentum_as_time_constant_schedule(1100)
clipping_threshold_per_sample = 5.0
gradient_clipping_with_truncation = True
learner = momentum_sgd(z.parameters, lr_per_sample, momentum_time_constant,
                    gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
                    gradient_clipping_with_truncation=gradient_clipping_with_truncation)
progress_printer = ProgressPrinter(freq=100, tag='Training')
trainer = Trainer(z, (ce, errs), learner, progress_printer)
    
log_number_of_parameters(z)

Training 247887 parameters in 5 parameter tensors.


In [15]:
for ep in range(3):
    print("Epoch={}".format(ep))
    m = [True]
    for mb in range(0,data_size-minibatch_size-1,minibatch_size//2):
        feat,lab = get_sample(mb)
        trainer.train_minibatch({input_sequence: feat, label_sequence: lab})
        m=[False]

Epoch=0
Learning rate per sample: 0.001
Momentum per sample: 0.9990913221888589
 Minibatch[   1- 100]: loss = 4.183050 * 10000, metric = 84.12% * 10000;
 Minibatch[ 101- 200]: loss = 3.380252 * 10000, metric = 81.08% * 10000;
 Minibatch[ 201- 300]: loss = 3.195379 * 10000, metric = 80.73% * 10000;
 Minibatch[ 301- 400]: loss = 3.136448 * 10000, metric = 81.35% * 10000;
 Minibatch[ 401- 500]: loss = 3.189575 * 10000, metric = 83.01% * 10000;
 Minibatch[ 501- 600]: loss = 3.095708 * 10000, metric = 78.97% * 10000;
 Minibatch[ 601- 700]: loss = 3.178728 * 10000, metric = 82.77% * 10000;
 Minibatch[ 701- 800]: loss = 3.139375 * 10000, metric = 82.76% * 10000;
 Minibatch[ 801- 900]: loss = 3.138056 * 10000, metric = 83.01% * 10000;
 Minibatch[ 901-1000]: loss = 3.179253 * 10000, metric = 83.11% * 10000;
 Minibatch[1001-1100]: loss = 3.088069 * 10000, metric = 81.03% * 10000;
 Minibatch[1101-1200]: loss = 3.079706 * 10000, metric = 82.81% * 10000;
 Minibatch[1201-1300]: loss = 3.075625 * 100

In [16]:
def sample(net, prime_text='', use_hardmax=True, length=100, temperature=1.0):

    # Применяем температуру: T < 1 - сглаживание; T=1.0 - без изменений; T > 1 - выделение пиков
    def apply_temp(p):
        p = np.power(p, (temperature))
        # повторно нормализуем
        return (p / np.sum(p))

    def sample_word(p):
        if use_hardmax:
            w = np.argmax(p, axis=2)[0,0]
        else:
            # выбираем случайным образом исходя из вероятностей
            p = np.exp(p) / np.sum(np.exp(p))            
            p = apply_temp(p)
            w = np.random.choice(range(vocab_size), p=p.ravel())
        return w

    plen = 1
    prime = -1

    # инициализируем sequence начальной строкой или случайными значениями
    x = np.zeros((1, vocab_size), dtype=np.float32)    
    if prime_text != '':
        plen = len(prime_text)
        prime = char_to_ix[prime_text[0]]
    else:
        prime = np.random.choice(range(vocab_size))
    x[0, prime] = 1
    arguments = ([x], [True])

    # переменная для хранения результата
    output = []
    output.append(prime)
    
    # обрабатываем начальную строку
    for i in range(plen):            
        p = net.eval(arguments)        
        x = np.zeros((1, vocab_size), dtype=np.float32)
        if i < plen-1:
            idx = char_to_ix[prime_text[i+1]]
        else:
            idx = sample_word(p)

        output.append(idx)
        x[0, idx] = 1            
        arguments = ([x], [False])
    
    # обрабатываем дальнейший текст
    for i in range(length-plen):
        p = net.eval(arguments)
        idx = sample_word(p)
        output.append(idx)
        x = np.zeros((1, vocab_size), dtype=np.float32)
        x[0, idx] = 1
        arguments = ([x], [False])

    # преобразуем к строке и возвращаем
    return ''.join([ix_to_char[c] for c in output])


In [18]:
sample(z,'Hello',False)

'Helloc; Dfatice, thes if chey Tuturojroq ce wonitr so aroling the GwasgroLgcor cnit to PYo preang pat'