# Генерация скороговорок с помощью марковских цепей и LSTM

Что нужно сделать:
1. Выкачать скороговорки с сайта  (588 штук)
2. Генерация с помощью марковских цепей
3. Генерация с помощью LSTM

### Выкачиваем скороговорки

In [38]:
from re import sub
from html.parser import HTMLParser
from urllib.request import urlopen
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter

In [45]:
class TwisterParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.twisters = []
        self.toggle = False

    def handle_starttag(self, tag, attrs):
        if tag == 'p' and dict(attrs).get('class', None) == 'TXT':
            self.toggle = True
        else:
            self.toggle = False

    def handle_endtag(self, tag):
        self.toggle = False

    def handle_data(self, data):
        if self.toggle:
            self.twisters.append(data)
            self.toggle = False


def fetch_twisters():
    url = 'http://www.uebersetzung.at/twister/en.htm'
    parser = TwisterParser()
    parser.feed(urlopen(url).read().decode())
    return [sub(r'[".,!?]','',t) for t in parser.twisters]

In [48]:
list_of_twisters = fetch_twisters()

In [49]:
#создаем txt-файл со скороговорками
with open('twisters.txt','w', encoding = "utf-8") as l:
    for i in list_of_twisters:
            l.write(i + '\n')

In [50]:
list_of_twisters

['Peter Piper picked a peck of pickled peppers',
 'I saw Susie sitting in a shoe shine shop',
 'How many boards',
 'How can a clam cram in a clean cream can',
 "Send toast to ten tense stout saints' ten tall tents",
 'Denise sees the fleece',
 'Coy knows pseudonoise codes',
 'Sheena leads Sheila needs',
 'The thirty-three thieves thought that they thrilled the throne throughout Thursday',
 'Something in a thirty-acre thermal thicket of thorns and thistles thumped and thundered threatening the three-D thoughts of Matthew the thug - although theatrically it was only the thirteen-thousand thistles and thorns through the underneath of his thigh that the thirty year old thug thought of that morning',
 'Can you can a can as a canner can can a can',
 "Seth at Sainsbury's sells thick socks",
 'You cuss I cuss we all cuss for asparagus',
 'Roberta ran rings around the Roman ruins',
 'Clean clams crammed in clean cans',
 'Six sick hicks nick six slick bricks with picks and sticks',
 "I wish to w

### Марковская модель

In [71]:
import numpy as np

twister = open('twisters.txt', encoding='utf8').read()
corpus = twister.split()
#формируем пары
def pairs(corpus):
    for i in range(len(corpus)-1):
        yield (corpus[i], corpus[i+1])
        
pairs = pairs(corpus)

word_dict = {}

for word_1, word_2 in pairs:
    if word_1 in word_dict.keys():
        word_dict[word_1].append(word_2)
    else:
        word_dict[word_1] = [word_2]
        
#случайно выбираем первое слово
first_word = np.random.choice(corpus)
# но с большой буквы
while first_word.islower():
    first_word = np.random.choice(corpus)

chain = [first_word]
# количество слов в скороговорке
n_words = 8

for i in range(n_words):
    chain.append(np.random.choice(word_dict[chain[-1]]))

' '.join(chain)

'Sheep Sheets Cheap Sheep Association The greedy Greek grapes'

Как мы видим, здесь результат довольно бессмысленный, но для тренировки произношения звуков сойдет.

### LSTM

In [10]:
import sys
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [11]:
# загружаем файл и переводим в нижний регистр
filename = "twisters.txt"
raw_text = open(filename).read()
raw_text = raw_text.lower()

In [12]:
# создаем сопоставление уникальных символов целым числам и обратное сопоставление, чтобы потом можно было генерировать
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))

In [13]:
# посмотрим какие данные мы имеем
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  34323
Total Vocab:  40


In [14]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 100
dataX = []
dataY = []
for i in range(0, n_chars - seq_length, 1):
    seq_in = raw_text[i:i + seq_length]
    seq_out = raw_text[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  34223


In [15]:
# нужно трансформировать данные в форму [samples, time steps, features], которую ожидает увидеть LSTM
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
# нормализуем
X = X / float(n_vocab)
# one hot encode the output variable
y = np_utils.to_categorical(dataY)

In [33]:
# определяем LSTM модель (2 слоя)
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# можно было сделать с 1 слоем
#model = Sequential()
#model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
#model.add(Dropout(0.2))
#model.add(Dense(y.shape[1], activation='softmax'))
#model.compile(loss='categorical_crossentropy', optimizer='adam')

In [21]:
# определяем чек-поинты, чтобы потом выбрать лучшую эпоху с наименьшей потерей
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
# обучаем модель
model.fit(X, y, epochs=10, batch_size=128, callbacks=callbacks_list)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x25d4321c9b0>

In [114]:
# выбираем из 10 сохраненых вариантов взвешивания тот, в котором наименьшая потеря (2.3321)
filename = "weights-improvement-10-2.3321.hdf5"
#filename = "weights-improvement-09-2.3979.hdf5"

model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')
# случайным образом выбираем seed
start = numpy.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

Seed:
" r ruth's writhings than roth's wrath
the third time the three three-toed tree toads tried tying thei "


In [115]:
# генерируем по-символьно
for i in range(60):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = int_to_char[index]
    seq_in = [int_to_char[value] for value in pattern]
    sys.stdout.write(result)
    pattern.append(index)
    pattern = pattern[1:len(pattern)]
print("\nDone.")

t the siiek she thett the shitt she shitt she shitt she shit
Done.


А вот здесь все получилось хуже.