In [13]:
import numpy as np
import keras
from keras import layers
import re
from keras.backend import dot

# Обработка данных

In [14]:
def preprocess(line: str) -> str:
    words = list(filter(len, line.strip().lower().split()))
    for i, word in enumerate(words):
        word = re.sub('[?!]', '.', word)
        words[i] = re.sub('[^a-z.]', '', word)
    return ' '.join(words)

def read_preprocess(filename: str) -> list[str]:
    text = ''
    for line in open(filename):
        text += preprocess(line) + '.'

    sentences = map(str.strip, text.split('.'))
    sentences = filter(len, sentences)
    sentences = list(map(lambda s: s + '.', sentences))
    return sentences


data = read_preprocess('/Users/alv.popov/study/ml/data/lab7/book.txt')
data[:3]

['to sherlock holmes she is always the woman.',
 'i have seldom heard him mention her under any other name.',
 'in his eyes she eclipses and predominates the whole of her sex.']

In [15]:
max_len = max(map(len, data))
max_len

547

In [16]:
all_symbols = set()
for line in data:
    all_symbols.update(line)

all_symbols = list(all_symbols)
len(all_symbols)

28

In [17]:
sym2one_code = dict()
for i, s in enumerate(all_symbols):
    cur = np.zeros(len(all_symbols), dtype='int')
    cur[i] = 1
    sym2one_code[s] = cur


def encode_text(text: str) -> np.ndarray:
    res = []
    for sym in text:
        res.append(sym2one_code[sym])
    return np.array(res)


data_one_hot = list(map(encode_text, data))
len(data[0]), data_one_hot[0].shape, data[0], data_one_hot[0]

(43,
 (43, 28),
 'to sherlock holmes she is always the woman.',
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 1],
        [0, 0, 0, ..., 0, 0, 0]]))

In [18]:
def xy_split(sentences):
    x, y = [], []
    l = 10
    for sentence in sentences:
        if len(sentence) <= l:  # не обрабатываем маленькие последовательности
            continue

        for i in range(len(sentence) - l):
            x.append(sentence[i:i+l])
            y.append(sentence[i+l])
    return np.array(x), np.array(y)

x, y = xy_split(data_one_hot)
print(x.shape, y.shape)

(464340, 10, 28) (464340, 28)


In [19]:
lstm_model = keras.Sequential([
    keras.Input((None, x.shape[-1])),
    keras.layers.LSTM(128),
    keras.layers.Dropout(0.3),
    layers.Dense(len(all_symbols), activation='softmax'),
])

lstm_model.compile(loss='categorical_crossentropy', optimizer='adam')

lstm_model.fit(x, y, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1287c2b80>

In [20]:
def finish_sentence(model, prefix: str) -> str:
    result_string = prefix
    encoded_suffix = encode_text(preprocess(prefix)).tolist()

    for _ in range(max_len):
        prediction = model.predict(np.array([encoded_suffix]))
        ind = np.random.choice(range(len(prediction[0])), p=prediction[0])
        new_sym = all_symbols[ind]
        result_string += new_sym
        if new_sym == '.':
            break
        encoded_suffix = encoded_suffix[1:]
        encoded_suffix.append(sym2one_code[new_sym])

    if result_string[-1] != '.':
        result_string += '.'

    return result_string

In [21]:
finish_sentence(lstm_model, 'Do you note ')



'Do you note  all the dress in end ot lock before.'

In [22]:
finish_sentence(lstm_model, 'Twice burglars in my pay ransacked her house')



'Twice burglars in my pay ransacked her house.'

In [23]:
finish_sentence(lstm_model, 'hello mister')



'hello mistertay watton you missed no of one which dlessable of happres of you.'

# Марковская цепь

In [24]:
class Markov:
    def __init__(self, texts: list[str], n: int = 10):
        edges = dict()
        for text in texts:
            for i in range(len(text) - n - 1):
                cur = text[i : i + n]
                next = text[i + 1 : i + n + 1]
                edges.setdefault(cur, dict())
                edges[cur].setdefault(next, 0)
                edges[cur][next] += 1

        for k1, v in edges.items():
            s = sum(v.values())
            for k2 in v.keys():
                edges[k1][k2] /= s

        self.p = edges
        self.n = n

    def predict(self, line: str) -> str:

        for _ in range(max_len):
            if line[-1] == '.':
                break

            suffix = line[-self.n:]
            if suffix not in self.p:
                prediction = '.'
            else:
                prediction = max(self.p[suffix], key=self.p[suffix].get)
            line += prediction[-1]

        if line[-1] != '.':
            line += '.'
        return line

model = Markov(data)

In [25]:
model.predict('Do you note ')

'Do you note the peculiar to him.'

In [26]:
model.predict('Twice burglars in my pay ransacked her house')

'Twice burglars in my pay ransacked her house is on fire her instinct is at once to the station and the more obvious facts that he had seen him get in with his head sunk upon his face which she had been a strong presumption that the corner of the bedroom window with my lens and lay down upon the stairs and in the passage and a tap at the door of the preceding night and he called next day to ask if we had got home all safe and sound.'

In [27]:
model.predict('hello mister')

'hello mister.'