# Рекуррентные нейросети

Построим простейшую нейросеть для посимвольной генерации текста

In [1]:
import pandas as pd  # для работы с данными
import time  # для оценки времени
import torch  # для написания нейросети

## Загрузка данных

Будем работать с датасетом реплик из Симпсонов. Нам нужно извлечь предобработанные тексты и закодировать их числами

In [9]:
df = pd.read_csv('simpsons_script_lines.csv',  on_bad_lines='skip')
df.head()

  df = pd.read_csv('simpsons_script_lines.csv',  on_bad_lines='skip')


Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count
0,9549,32,209,"Miss Hoover: No, actually, it was a little of ...",848000,True,464.0,3.0,Miss Hoover,Springfield Elementary School,"No, actually, it was a little of both. Sometim...",no actually it was a little of both sometimes ...,31.0
1,9550,32,210,Lisa Simpson: (NEAR TEARS) Where's Mr. Bergstrom?,856000,True,9.0,3.0,Lisa Simpson,Springfield Elementary School,Where's Mr. Bergstrom?,wheres mr bergstrom,3.0
2,9551,32,211,Miss Hoover: I don't know. Although I'd sure l...,856000,True,464.0,3.0,Miss Hoover,Springfield Elementary School,I don't know. Although I'd sure like to talk t...,i dont know although id sure like to talk to h...,22.0
3,9552,32,212,Lisa Simpson: That life is worth living.,864000,True,9.0,3.0,Lisa Simpson,Springfield Elementary School,That life is worth living.,that life is worth living,5.0
4,9553,32,213,Edna Krabappel-Flanders: The polls will be ope...,864000,True,40.0,3.0,Edna Krabappel-Flanders,Springfield Elementary School,The polls will be open from now until the end ...,the polls will be open from now until the end ...,33.0


In [73]:
df.shape

(158248, 13)

In [10]:
phrases = df['normalized_text'].tolist()  # колонка с предобработанными текстами
phrases[:10]

['no actually it was a little of both sometimes when a disease is in all the magazines and all the news shows its only natural that you think you have it',
 'wheres mr bergstrom',
 'i dont know although id sure like to talk to him he didnt touch my lesson plan what did he teach you',
 'that life is worth living',
 'the polls will be open from now until the end of recess now just in case any of you have decided to put any thought into this well have our final statements martin',
 'i dont think theres anything left to say',
 'bart',
 'victory party under the slide',
 nan,
 'mr bergstrom mr bergstrom']

In [11]:
text = [[c for c in ph] for ph in phrases if type(ph) is str]

## Создаём массив с данными

Нужно

1. Разбить данные на токены (у нас символы)
2. Закодировать числами
3. Превратить в эмбеддинги

In [115]:
CHARS = set('abcdefghijklmnopqrstuvwxyz ')  # все символы, которые мы хотим использовать для кодировки = наш словарь
INDEX_TO_CHAR = ['none'] + [w for w in CHARS]  # все неизвестные символы будут получать тег none
CHAR_TO_INDEX = {w: i for i, w in enumerate(INDEX_TO_CHAR)}  # словарь токен-индекс

In [116]:
len(INDEX_TO_CHAR)

28

In [117]:
MAX_LEN = 50  # мы хотим ограничить максимальную длину ввода
X = torch.zeros((len(text), MAX_LEN), dtype=int)  # создаём пустой вектор для текста, чтобы класть в него индексы токенов
for i in range(len(text)):  # для каждого предложения
    for j, w in enumerate(text[i]):  # для каждого токена
        if j >= MAX_LEN:
            break
        X[i, j] = CHAR_TO_INDEX.get(w, CHAR_TO_INDEX['none'])

In [118]:
X[0:5]

tensor([[12,  7,  9, 22, 11, 21,  4, 22,  1,  1, 23,  9, 25, 21,  9, 18, 22, 10,
          9, 22,  9,  1, 25, 21, 21,  1, 26,  9,  7,  2,  9,  5,  7, 21, 27,  9,
         10,  7, 14, 26, 21, 25, 14, 26, 10,  9, 18, 27, 26, 12],
        [18, 27, 26,  6, 26, 10,  9, 14,  6,  9,  5, 26,  6, 19, 10, 21,  6,  7,
         14,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [25,  9, 17,  7, 12, 21,  9, 15, 12,  7, 18,  9, 22,  1, 21, 27,  7,  4,
         19, 27,  9, 25, 17,  9, 10,  4,  6, 26,  9,  1, 25, 15, 26,  9, 21,  7,
          9, 21, 22,  1, 15,  9, 21,  7,  9, 27, 25, 14,  9, 27],
        [21, 27, 22, 21,  9,  1, 25,  2, 26,  9, 25, 10,  9, 18,  7,  6, 21, 27,
          9,  1, 25, 24, 25, 12, 19,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [21, 27, 26,  9, 13,  7,  1,  1, 10,  9, 18, 25,  1,  1,  9,  5, 26,  9,
       

## Embedding и RNN ячейки

Каждому токену мы хотим сопоставить не просто число, но вектор. Поэтому вектор текста нам нужно умножить на матрицу эмбеддингов, которая тоже будет учиться в процессе обучения нейросети. Для создания такой матрицы нам нужен слой `nn.Embedding`

In [119]:
X[0:5].shape

torch.Size([5, 50])

In [120]:
embeddings = torch.nn.Embedding(len(INDEX_TO_CHAR), 28)  # размер словаря * размер вектора для кодировки каждого слова
t = embeddings(X[0:5])
t.shape

torch.Size([5, 50, 28])

In [121]:
t.shape, X[0:5].shape

(torch.Size([5, 50, 28]), torch.Size([5, 50]))

In [122]:
rnn = torch.nn.RNN(28, 256, batch_first=True)  # на вход - размер эмбеддинга, размер скрытого состояния и порядок размерностей
o, s = rnn(t)
# вектора для слов: батч * число токенов * размер скрытого состояния
# вектор скрытого состояния: число вектров (один) * батч * размер скрытого состояния
o.shape, s.shape

(torch.Size([5, 50, 256]), torch.Size([1, 5, 256]))

Можно применять несколько рекуррентных ячеек подряд

In [123]:
o, s2 = rnn(t, s)
o.shape, s2.shape

(torch.Size([5, 50, 256]), torch.Size([1, 5, 256]))

In [124]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

## Реализация сети с RNN
3 слоя:
1. Embeding (30)
2. RNN (hidden_dim=128)
3. Полносвязный слой для предсказания буквы (28, то есть размер словаря)

In [125]:
class Network(torch.nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.embedding = torch.nn.Embedding(28, 30)
        self.rnn = torch.nn.RNN(30, 128)
       # self.out = torch.nn.Linear(128, 28)
        self.out = torch.nn.Sequential(
                torch.nn.Linear(128, 64),
                torch.nn.ReLU(),
                torch.nn.Dropout(0.5),
                torch.nn.Linear(64, 28)
                )
    def forward(self, sentences, state=None):
        x = self.embedding(sentences)
        x, s = self.rnn(x) # берём выход с последнего слоя для всех токенов, а не скрытое состояние
        return self.out(x)

In [126]:
model = Network()
model.to(device)

Network(
  (embedding): Embedding(28, 30)
  (rnn): RNN(30, 128)
  (out): Sequential(
    (0): Linear(in_features=128, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=64, out_features=28, bias=True)
  )
)

In [127]:
criterion = torch.nn.CrossEntropyLoss()  # типичный лосс многоклассовой классификации
optimizer = torch.optim.Adam(model.parameters(), lr=.001)

Обучение:

In [128]:
for ep in range(20):
    start = time.time()
    train_loss = 0.
    train_passed = 0

    for i in range(int(len(X) / 100)):
        # берём батч в 100 элементов
        batch = X[i * 100:(i + 1) * 100]
        X_batch = batch[:, :-1]
        Y_batch = batch[:, 1:].flatten()
        X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)
        optimizer.zero_grad()
        answers = model.forward(X_batch)
        answers = answers.view(-1, len(INDEX_TO_CHAR))
        loss = criterion(answers, Y_batch)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        train_passed += 1

    print("Epoch {}. Time: {:.3f}, Train loss: {:.3f}".format(ep, time.time() - start, train_loss / train_passed))

Epoch 0. Time: 4.313, Train loss: 1.790
Epoch 1. Time: 4.307, Train loss: 1.722
Epoch 2. Time: 4.095, Train loss: 1.716
Epoch 3. Time: 4.231, Train loss: 1.711
Epoch 4. Time: 4.212, Train loss: 1.712
Epoch 5. Time: 4.225, Train loss: 1.707
Epoch 6. Time: 4.214, Train loss: 1.708
Epoch 7. Time: 4.337, Train loss: 1.705
Epoch 8. Time: 4.292, Train loss: 1.705
Epoch 9. Time: 4.238, Train loss: 1.714
Epoch 10. Time: 4.239, Train loss: 1.709
Epoch 11. Time: 4.374, Train loss: 1.708
Epoch 12. Time: 4.248, Train loss: 1.709
Epoch 13. Time: 4.258, Train loss: 1.708
Epoch 14. Time: 4.286, Train loss: 1.707
Epoch 15. Time: 4.422, Train loss: 1.708
Epoch 16. Time: 4.197, Train loss: 1.715
Epoch 17. Time: 4.305, Train loss: 1.710
Epoch 18. Time: 4.390, Train loss: 1.707
Epoch 19. Time: 4.302, Train loss: 1.709


## Генерация


- Сначала отправлем в модель буквы из предложения (прогревая состояние)
- Затем берём самую вероятную букву и добавляем её в предложение
- Повторяем пока не получим none (0)

In [129]:
CHAR_TO_INDEX['none']

0

In [130]:
def generate_sentence(word):
    sentence = list(word.lower())
    sentence = [CHAR_TO_INDEX.get(s, 0) for s in sentence]
    answers = model.forward(torch.tensor(sentence).to(device))
    probas, indices = answers.topk(1)
    return ''.join([INDEX_TO_CHAR[ind.item()] for ind in indices.flatten()])

In [149]:
generate_sentence('homer')

'e  le'