In [None]:
# Environment setup: install the pinned library versions this notebook was written against.
!pip3 -qq install torch==0.4.1
!pip -qq install torchtext==0.3.1
!pip -qq install spacy==2.0.16
!pip install -qq gensim==3.6.0
# English spaCy model; it is loaded further down via spacy.load('en').
!python -m spacy download en
# Download and unpack the two data archives from Google Drive.
# NOTE(review): presumably squad.zip provides train.json / test.json and opensubs.zip
# provides train.txt / valid.txt / test.txt (the files read later) — confirm archive contents.
!wget -O squad.zip -qq --no-check-certificate "https://drive.google.com/uc?export=download&id=1h8dplcVzRkbrSYaTAbXYEAjcbApMxYQL"
!unzip squad.zip
!wget -O opensubs.zip -qq --no-check-certificate "https://drive.google.com/uc?export=download&id=1x1mNHweP95IeGFbDJPAI7zffgxrbqb7b"
!unzip opensubs.zip

In [None]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


import random

# Pick the compute device once. The notebook creates tensors through the
# FloatTensor / LongTensor aliases, so importing them from torch.cuda when a GPU
# is present makes those tensors live on the GPU.
if torch.cuda.is_available():
    from torch.cuda import FloatTensor, LongTensor
    DEVICE = torch.device('cuda')
else:
    from torch import FloatTensor, LongTensor
    DEVICE = torch.device('cpu')

# Seed every random number generator the notebook draws from, not only NumPy:
# batch shuffling uses np.random, answer sampling uses the stdlib `random`
# module, and model weight initialisation uses torch's generator.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)

# General Conversation

Today we look at how a chatbot (a "chit-chat" model) is built.

<center>
<img src="https://meduza.io/image/attachments/images/002/547/612/large/RLnxN4VdUmWFcBp8GjxUmA.jpg" width="20%">
</center>

We have already discussed Seq2Seq models, which can be used to build chatbots. They have a drawback, however: there is a high probability of generating something ungrammatical. Well, like those pies.

That is why people almost always go the other way: instead of generating a reply, they rank candidate replies. You prepare a large base of answers in advance and, each time, simply choose the one that best fits the context.

## DSSM

To do this, use DSSM (Deep Structured Semantic Models):

<center>
<img src="https://qph.fs.quoracdn.net/main-qimg-b90431ff9b4c60c5d69069d7bc048ff0" width="20%">
</center>
    
*From [What are Siamese neural networks, what applications are they good for, and why?](https://www.quora.com/What-are-Siamese-neural-networks-what-applications-are-they-good-for-and-why)*

This network (usually) consists of a pair of towers: the left one encodes the request, the right one encodes the answer. The task is to learn to measure the similarity between a request and a response.

To train it, one gathers a large corpus of request-response pairs (a request can be either a single question or a context — the last few questions/answers).

The answer vectors are pre-computed; each new request is encoded with the left (request) tower, and the nearest answer is found among the pre-computed vectors.

## Data

To start, we will use the [Stanford Question Answering Dataset (SQuAD)](https://rajpurkar.github.io/SQuAD-explorer/). The original task there is to find the answer to a question in a text. We will instead simply choose, among the sentences of the text, the ones that are closest to the question.

*This part of the notebook is heavily based on the [YSDA NLP course notebook](https://github.com/yandexdataschool/nlp_course/blob/master/week10_dialogue/seminar.ipynb).*

In [None]:
import pandas as pd

# Load the train and test splits from the JSON files in the working directory.
train_data, test_data = (pd.read_json(path) for path in ('train.json', 'test.json'))

In [None]:
# Show one training example: the question, then every candidate option with a
# tick next to the options listed in `correct_indices`.
row = train_data.iloc[40]
print('QUESTION:', row.question, '\n')
for index, candidate in enumerate(row.options):
    mark = '[v]' if index in row.correct_indices else '[ ]'
    print(mark, candidate)

Tokenize the sentences:

In [None]:
import spacy

# Bind the loaded pipeline to its own name. Rebinding `spacy` itself (as this cell
# used to do) replaces the module with the pipeline object, so a second run of the
# cell fails on `spacy.load`.
nlp = spacy.load('en')


def tokenize(text):
    """Lowercase and tokenize one raw string with the spaCy tokenizer.

    Input that is not a string (i.e. a sentence that has already been tokenized)
    is returned unchanged, which keeps this cell safe to re-run.
    """
    if not isinstance(text, str):
        return text
    return [tok.text.lower() for tok in nlp.tokenizer(text)]


def tokenize_options(options):
    """Tokenize every candidate sentence of one question."""
    return [tokenize(text) for text in options]


# Same preprocessing for both splits: questions become token lists, options
# become lists of token lists.
for frame in (train_data, test_data):
    frame['question'] = frame['question'].apply(tokenize)
    frame['options'] = frame['options'].apply(tokenize_options)

We do not have enough data to train everything from scratch, so we will use pretrained embeddings right away:

In [None]:
import gensim.downloader as api

# Pretrained GloVe vectors fetched through gensim's downloader API
# (100-dimensional, judging by the model name — build_word_embeddings reads the
# actual width from `w2v_model.vectors.shape[1]`).
w2v_model = api.load('glove-wiki-gigaword-100')

**Task** Build a matrix of pretrained embeddings for the most frequent words in the dataset.

In [None]:
from collections import Counter


# Builds the vocabulary and the matching matrix of pretrained vectors.
#   data      -- frame whose `question` column holds token lists and whose `options`
#                column holds lists of token lists
#   w2v_model -- pretrained vectors; the visible code only reads
#                `w2v_model.vectors.shape[1]` (the embedding width)
#   min_freq  -- not used by the visible code; presumably the frequency threshold the
#                exercise placeholder is expected to apply -- TODO confirm
# Returns (word2ind, embeddings): the token -> row-index mapping and a numpy array
# stacking the embedding rows.
def build_word_embeddings(data, w2v_model, min_freq=5):
    # Token frequencies over all questions and all candidate options.
    words = Counter()
    
    for text in data.question:
        for word in text:
            words[word] += 1
            
    for options in data.options:
        for text in options:
            for word in text:
                words[word] += 1
                
    # Indices 0 and 1 are reserved for the special tokens.
    word2ind = {
        '<pad>': 0,
        '<unk>': 1
    }
    
    # Both special tokens start as all-zero vectors of the pretrained width.
    embeddings = [
        np.zeros(w2v_model.vectors.shape[1]),
        np.zeros(w2v_model.vectors.shape[1])
    ]
    
    # Exercise placeholder: extend `word2ind` and `embeddings` here.
    <build embeddings>

    return word2ind, np.array(embeddings)

In [None]:
# Build the vocabulary and embedding matrix from the training split only
# (min_freq=8 overrides the function default of 5).
word2ind, embeddings = build_word_embeddings(train_data, w2v_model, min_freq=8)
print('Vocab size =', len(word2ind))

We will use the following class to generate batches:

In [None]:
import random
import math


def to_matrix(lines, word2ind):
    """Pack tokenized sentences into one padded matrix of vocabulary indices.

    Args:
        lines: sequence of token sequences, one per sentence.
        word2ind: mapping token -> integer index. The padding and unknown-token
            indices are taken from its '<pad>' / '<unk>' entries and fall back
            to 0 / 1 when those entries are absent.

    Returns:
        LongTensor of shape (len(lines), length of the longest sentence).
        Shorter sentences are right-padded with the padding index; tokens that
        are missing from `word2ind` are mapped to the unknown-token index.
        An empty `lines` yields an empty (0, 0) tensor.
    """
    pad_index = word2ind.get('<pad>', 0)
    unk_index = word2ind.get('<unk>', 1)

    # default=0 keeps max() from raising on an empty batch.
    max_sent_len = max((len(line) for line in lines), default=0)

    # Build the matrix as int64 from the start instead of creating a float64
    # array and relying on the tensor constructor to cast it.
    matrix = np.full((len(lines), max_sent_len), pad_index, dtype=np.int64)

    for batch_ind, line in enumerate(lines):
        matrix[batch_ind, :len(line)] = [word2ind.get(word, unk_index) for word in line]

    return LongTensor(matrix)


class BatchIterator():
    """Iterates over a frame of questions in mini-batches of index matrices.

    Every batch is a dict with three entries produced by `to_matrix`:
    'questions', 'correct_answers' and 'wrong_answers'. For each row one correct
    and one wrong option are drawn at random (via `random.choice`) from the
    indices listed in the row's `correct_indices` / `wrong_indices`.

    Args:
        data: frame with `question`, `options`, `correct_indices` and
            `wrong_indices` columns.
        batch_size: maximum number of rows per batch.
        word2ind: token -> index mapping forwarded to `to_matrix`.
        shuffle: when true, rows are visited in a fresh random order
            (np.random) on every pass.
    """

    def __init__(self, data, batch_size, word2ind, shuffle=True):
        self._data = data
        self._num_samples = len(data)
        self._batch_size = batch_size
        self._word2ind = word2ind
        self._shuffle = shuffle
        self._batches_count = int(math.ceil(len(data) / batch_size))

    def __len__(self):
        """Number of batches in one pass (the last batch may be smaller)."""
        return self._batches_count

    def __iter__(self):
        """Start a new pass over the data."""
        return self._iterate_batches()

    def _iterate_batches(self):
        """Yield the batch dicts for one pass."""
        indices = np.arange(self._num_samples)
        if self._shuffle:
            np.random.shuffle(indices)

        for start in range(0, self._num_samples, self._batch_size):
            # Slicing past the end is clipped, so the last batch is just shorter.
            batch = self._data.iloc[indices[start:start + self._batch_size]]

            # Plain Python lists are used on purpose: the token lists have
            # different lengths, and wrapping such ragged data in np.array() is
            # an error on current NumPy releases. `to_matrix` only needs len()
            # and iteration.
            questions = batch['question'].tolist()
            options = batch['options'].tolist()

            # All correct answers are sampled first, then all wrong ones.
            correct_answers = [
                row_options[random.choice(candidates)]
                for row_options, candidates in zip(options, batch['correct_indices'])
            ]
            wrong_answers = [
                row_options[random.choice(candidates)]
                for row_options, candidates in zip(options, batch['wrong_indices'])
            ]

            yield {
                'questions': to_matrix(questions, self._word2ind),
                'correct_answers': to_matrix(correct_answers, self._word2ind),
                'wrong_answers': to_matrix(wrong_answers, self._word2ind)
            }

In [None]:
# Batch iterators over the two splits; both keep the default shuffle=True.
train_iter = BatchIterator(train_data, 64, word2ind)
test_iter = BatchIterator(test_data, 128, word2ind)

It simply samples sequences from the questions and from the correct and incorrect answers to them:

In [None]:
# Pull a single batch and display it: a dict with 'questions', 'correct_answers'
# and 'wrong_answers' index matrices.
batch = next(iter(train_iter))

batch

## Model

**Task** Implement an encoder model for texts — the towers of the DSSM model.

*It doesn't have to be a complex model: a convolutional model will do, and it will train much faster.*

In [None]:
# Text encoder used as one tower of the DSSM model (exercise template).
#   embeddings -- pretrained embedding matrix; later cells pass a FloatTensor
#   hidden_dim, output_dim -- layer sizes for the model the exercise should build;
#                             how they are used is up to the placeholder body
class Encoder(nn.Module):
    def __init__(self, embeddings, hidden_dim=128, output_dim=128):
        super().__init__()
        
        # Exercise placeholder: create the layers here.
        <build some model>
        
    # `inputs` is presumably a LongTensor of token indices as produced by
    # `to_matrix` -- TODO confirm once the body is written.
    def forward(self, inputs):
        <apply it>

### Triplet Loss

We do not just want to teach the encoder to build sentence embeddings. We want to pull the vectors of correct answers towards the question and push the wrong ones away. One way to do this is the *Triplet Loss*:

$$ L = \frac{1}{N} \sum_{q, a^+, a^-} \max\left(0, \; \delta - sim[V_q(q), V_a(a^+)] + sim[V_q(q), V_a(a^-)]\right),$$

where
* $sim[a, b]$ is a similarity function (for example, dot product or cosine similarity);
* $\delta$ is a model hyperparameter. If $sim[a, b]$ is linear in $b$, then all $\delta > 0$ are equivalent.

<center>
<img src="https://raw.githubusercontent.com/yandexdataschool/nlp_course/master/resources/margin.png" width="20%">
</center>

**Assignment** Implement the triplet loss, as well as the recall computation — the share of cases where the correct answer is closer to the question than the wrong one.

In [None]:
# Two-tower DSSM wrapper (exercise template): stores one encoder for questions
# and one for answers. The method bodies are `<...>` placeholders, so the
# comments below only describe how ModelTrainer uses each method.
class DSSM(nn.Module):
    def __init__(self, question_encoder, answer_encoder):
        super().__init__()
        self.question_encoder = question_encoder
        self.answer_encoder = answer_encoder
        
    # ModelTrainer.on_batch unpacks the return value into three objects:
    # question, correct-answer and wrong-answer embeddings.
    def forward(self, questions, correct_answers, wrong_answers):
        <perform forward pass>

    # ModelTrainer calls .item() and .backward() on the result, so it must be a
    # scalar tensor that is part of the autograd graph.
    def calc_triplet_loss(self, question_embeddings, correct_answer_embeddings, wrong_answer_embeddings, delta=1.0):
        """Returns the triplet loss based on the equation above"""
        <do it>
        
    # ModelTrainer adds the result to a running counter and divides it by the
    # batch size to report Recall@1.
    def calc_recall_at_1(self, question_embeddings, correct_answer_embeddings, wrong_answer_embeddings):
        """Returns the number of cases when the correct answer were more similar than incorrect one"""
        <and it>
        
    @staticmethod
    def similarity(question_embeddings, answer_embeddings):
        """Returns sim[a, b]"""
        <and it too>

In [None]:
class ModelTrainer():
    """Runs a DSSM-style model over batches and tracks loss and Recall@1.

    The model must be callable on (questions, correct_answers, wrong_answers),
    return three embedding tensors, and provide `calc_triplet_loss` and
    `calc_recall_at_1` taking those three tensors.

    Args:
        model: the model to train / evaluate.
        optimizer: optimizer over the model's parameters.
        clip_grad_norm: maximum gradient norm applied before every optimizer
            step; pass None to disable clipping.
    """

    def __init__(self, model, optimizer, clip_grad_norm=1.):
        self._model = model
        self._optimizer = optimizer
        self._clip_grad_norm = clip_grad_norm

    def on_epoch_begin(self, is_train, name, batches_count):
        """
        Initializes metrics and switches the model to train / eval mode
        """
        self._epoch_loss = 0
        self._correct_count, self._total_count = 0, 0
        self._is_train = is_train
        # `name` may be None (the default in do_epoch); the '{:>5s}' format
        # used for progress messages cannot format None.
        self._name = '' if name is None else name
        self._batches_count = batches_count

        self._model.train(is_train)

    def on_epoch_end(self):
        """
        Outputs final metrics: mean batch loss and overall Recall@1
        """
        # max(..., 1) keeps an epoch over an empty iterator from dividing by zero.
        return self._format_progress(
            self._epoch_loss / max(self._batches_count, 1),
            self._correct_count / max(self._total_count, 1)
        )

    def on_batch(self, batch):
        """
        Performs forward and (if is_train) backward pass with optimization, updates metrics

        Returns the progress message for this batch.
        """

        question_embs, correct_answer_embs, wrong_answer_embs = self._model(
            batch['questions'], batch['correct_answers'], batch['wrong_answers']
        )
        loss = self._model.calc_triplet_loss(question_embs, correct_answer_embs, wrong_answer_embs)
        # calc_recall_at_1 may return a 0-dim tensor; convert it to a Python
        # float so the running counter never accumulates tensors and the
        # division below is always a true division.
        correct_count = float(
            self._model.calc_recall_at_1(question_embs, correct_answer_embs, wrong_answer_embs)
        )
        total_count = len(batch['questions'])
        loss_value = loss.item()

        self._correct_count += correct_count
        self._total_count += total_count
        self._epoch_loss += loss_value

        if self._is_train:
            self._optimizer.zero_grad()
            loss.backward()
            if self._clip_grad_norm is not None:
                nn.utils.clip_grad_norm_(self._model.parameters(), self._clip_grad_norm)
            self._optimizer.step()

        return self._format_progress(loss_value, correct_count / max(total_count, 1))

    def _format_progress(self, loss, recall):
        """Formats one progress line shared by batch and epoch reports."""
        return '{:>5s} Loss = {:.5f}, Recall@1 = {:.2%}'.format(self._name, loss, recall)

In [None]:
import math
from tqdm import tqdm
# Empties the `locks` list on tqdm's shared lock object.
# NOTE(review): presumably a workaround for a progress-bar locking problem in the
# notebook environment — confirm it is still needed with the installed tqdm.
tqdm.get_lock().locks = []


def do_epoch(trainer, data_iter, is_train, name=None):
    """Run one full pass over `data_iter`, reporting progress on a tqdm bar.

    Gradient tracking is enabled only when `is_train` is true. Each batch is
    handed to `trainer.on_batch`, whose message becomes the bar description;
    after the last batch the bar shows the summary from `trainer.on_epoch_end`.
    """
    batches_count = len(data_iter)
    trainer.on_epoch_begin(is_train, name, batches_count=batches_count)

    with torch.autograd.set_grad_enabled(is_train), tqdm(total=batches_count) as bar:
        for batch in data_iter:
            description = trainer.on_batch(batch)

            bar.update()
            bar.set_description(description)

        # Replace the last per-batch message with the epoch-level summary.
        bar.set_description(trainer.on_epoch_end())
        bar.refresh()

            
def fit(trainer, train_iter, epochs_count=1, val_iter=None):
    """Train for `epochs_count` epochs, validating after each one if `val_iter` is given.

    Every epoch is delegated to `do_epoch`; the progress name carries an
    '[epoch / total] ' prefix followed by 'Train:' or '  Val:'.
    """
    for epoch_number in range(1, epochs_count + 1):
        name_prefix = '[{} / {}] '.format(epoch_number, epochs_count)
        do_epoch(trainer, train_iter, is_train=True, name=name_prefix + 'Train:')

        if val_iter is None:
            continue
        do_epoch(trainer, val_iter, is_train=False, name=name_prefix + '  Val:')

Finally, let's train the model:

In [None]:
# Turn the numpy embedding matrix into a float tensor (on the GPU when the
# FloatTensor alias comes from torch.cuda); both towers are built from it.
embeddings = FloatTensor(embeddings)

# Separate encoders for questions and answers.
question_tower = Encoder(embeddings)
answer_tower = Encoder(embeddings)
model = DSSM(question_tower, answer_tower).to(DEVICE)

# Adam with its default hyperparameters.
optimizer = optim.Adam(model.parameters())
trainer = ModelTrainer(model, optimizer)

# The test split doubles as the validation set during training.
fit(trainer, train_iter, epochs_count=30, val_iter=test_iter)

### Prediction accuracy

Let us evaluate how well the model predicts the correct answer.

**Task** For each question, find the index of the answer chosen by the network:

In [None]:
# One predicted option index per test question, in `test_data` row order
# (to be filled in by the exercise placeholder below).
predictions = []
<collect prediction indices>
    
# A prediction counts as correct when it is one of the row's `correct_indices`.
accuracy = np.mean([
    answer in correct_ind
    for answer, correct_ind in zip(predictions, test_data['correct_indices'].values)
])
print("Accuracy: %0.5f" % accuracy)

In [None]:
def draw_results(question, possible_answers, predicted_index, correct_indices):
    """Print a question, its candidate answers and whether the prediction was right.

    `question` and every entry of `possible_answers` are token sequences and are
    joined with spaces. The predicted candidate is marked with '[*]'; the verdict
    line also shows the reference `correct_indices`.
    """
    print("Q:", ' '.join(question), end='\n\n')

    for index, answer in enumerate(possible_answers):
        marker = '[*]' if index == predicted_index else '[ ]'
        print("#%i: %s %s" % (index, marker, ' '.join(answer)))

    verdict = "CORRECT" if predicted_index in correct_indices else "INCORRECT"
    print("\nVerdict:", verdict, "(ref: %s)" % correct_indices, end='\n' * 3)

In [None]:
# Spot-check a handful of test questions against the collected predictions.
for i in [1, 100, 1000, 2000, 3000, 4000, 5000]:
    sample = test_data.iloc[i]
    draw_results(sample.question, sample.options, predictions[i], sample.correct_indices)

## Hard-negatives mining

In fact, in most cases we do not have negative examples.

For example, given a base of dialogues, where would we take negative examples for the answers from?

To get them, use *hard-negatives mining*: take as the negative example the closest of the wrong answers in the batch:
$$a^-_{hard} = \underset{a^-}{\arg\max} \; sim[V_q(q), V_a(a^-)]$$

"Wrong" in this case means everything except the correct answer :)

It is implemented roughly like this:
* A batch consists of correct question-answer pairs.
* Embeddings are computed for all questions and all answers.
* The positive examples are already there — what remains is to find, for each question, the most similar answer among those that were intended for other questions.

**Assignment** Update `DSSM` to do hard-negatives mining inside it.

*It may be necessary to normalize the vectors using `F.normalize` before calculating `similarity`.*

In [None]:
# Hard-negative variant of the two-tower DSSM wrapper (exercise template). This
# definition replaces the earlier `DSSM` class for every cell that runs after it.
class DSSM(nn.Module):
    def __init__(self, question_encoder, answer_encoder):
        super().__init__()
        self.question_encoder = question_encoder
        self.answer_encoder = answer_encoder
        
    # ModelTrainer.on_batch still unpacks the return value into three objects.
    def forward(self, questions, correct_answers, wrong_answers):
        """Ignore wrong_answers, they are here just for compatibility sake"""
        <perform forward pass>

    # NOTE(review): ModelTrainer.on_batch calls
    # calc_triplet_loss(question_embs, correct_answer_embs, wrong_answer_embs)
    # with three positional arguments; with this signature the third one is bound
    # to `delta`. Reconcile the signature with the trainer when implementing.
    def calc_triplet_loss(self, question_embeddings, answer_embeddings, delta=1.0):
        """Returns the triplet loss based on the equation above"""
        <calc triple loss with hard-negatives>
        
    def calc_recall_at_1(self, question_embeddings, correct_answer_embeddings, wrong_answer_embeddings):
        """Returns the number of cases when the correct answer were more similar than incorrect one"""
        <calc recall>
        
    # Presumably the place to apply F.normalize, as the exercise text suggests —
    # TODO confirm when implementing.
    @staticmethod
    def similarity(question_embeddings, answer_embeddings):
        <calc it>

In [None]:
# Rebuild the model with the hard-negative DSSM defined above; `embeddings` is the
# tensor created in the earlier training cell, and fresh encoders are created
# for both towers.
model = DSSM(
    question_encoder=Encoder(embeddings),
    answer_encoder=Encoder(embeddings)
).to(DEVICE)

# New optimizer and trainer for the new parameters.
optimizer = optim.Adam(model.parameters())

trainer = ModelTrainer(model, optimizer)

# Same schedule as before: 30 epochs, test split used for validation.
fit(trainer, train_iter, epochs_count=30, val_iter=test_iter)

**Assignment** There is also a variant with semi-hard negatives: the negative is the best (most similar) example among those whose similarity to the question is lower than the similarity between the question and the positive example. Try to implement it.

# Chatbot

To implement a chatbot, you need a decent corpus of dialogues. For example, OpenSubtitles.

In [None]:
# Peek at the first lines of the dialogue corpus (read_dataset below expects
# one tab-separated query/response pair per line).
!head train.txt

Well, it looks more or less decent.

Let's read the dataset.

In [None]:
from nltk import wordpunct_tokenize

def read_dataset(path):
    """Read tab-separated (query, response) pairs and tokenize both sides.

    Args:
        path: text file with one `query<TAB>response` pair per line.

    Returns:
        List of (query_tokens, response_tokens) tuples, tokenized with
        `wordpunct_tokenize`. Blank lines are skipped.

    Raises:
        ValueError: if a non-blank line does not consist of exactly two
            tab-separated fields; the message names the file and line number.
    """
    data = []
    # Explicit encoding: without it open() uses the platform default, which is
    # not UTF-8 everywhere.
    with open(path, encoding='utf-8') as f:
        for line_number, line in enumerate(tqdm(f), start=1):
            line = line.strip()
            if not line:
                # e.g. an empty line at the end of the file
                continue

            parts = line.split('\t')
            if len(parts) != 2:
                raise ValueError(
                    '{}:{}: expected "query<TAB>response", got {} field(s)'.format(
                        path, line_number, len(parts)
                    )
                )

            query, response = parts
            data.append((
                wordpunct_tokenize(query.strip()),
                wordpunct_tokenize(response.strip())
            ))
    return data

# NOTE: from here on `train_data` / `test_data` are lists of
# (query_tokens, response_tokens) tuples and no longer the SQuAD frames loaded
# earlier — cells above this point must not be re-run after this one without
# reloading the data.
train_data = read_dataset('train.txt')
val_data = read_dataset('valid.txt')
test_data = read_dataset('test.txt')

In [None]:
from torchtext.data import Field, Example, Dataset, BucketIterator

# Separate fields, and therefore separate vocabularies, for queries and responses.
# NOTE(review): lower=True presumably lowercases the tokens — see the torchtext Field docs.
query_field = Field(lower=True)
response_field = Field(lower=True)

# Field order matches the (query, response) tuples returned by read_dataset.
fields = [('query', query_field), ('response', response_field)]

train_dataset = Dataset([Example.fromlist(example, fields) for example in train_data], fields)
val_dataset = Dataset([Example.fromlist(example, fields) for example in val_data], fields)
test_dataset = Dataset([Example.fromlist(example, fields) for example in test_data], fields)

# Vocabularies are built from the training split only.
query_field.build_vocab(train_dataset, min_freq=5)
response_field.build_vocab(train_dataset, min_freq=5)

print('Query vocab size =', len(query_field.vocab))
print('Response vocab size =', len(response_field.vocab))

# These names replace the SQuAD `train_iter` / `test_iter` defined earlier.
# NOTE(review): shuffle=True is passed for all three splits, validation and test
# included — confirm that is intended.
train_iter, val_iter, test_iter = BucketIterator.splits(
    datasets=(train_dataset, val_dataset, test_dataset), batch_sizes=(512, 1024, 1024), 
    shuffle=True, device=DEVICE, sort=False
)

**Task** Implement a chatbot by analogy with what we have already written.

# References
Learning Deep Structured Semantic Models for Web Search using Clickthrough Data, 2013 [[pdf]](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/cikm2013_DSSM_fullversion.pdf)  
Deep Learning and Continuous Representations for Natural Language Processing, Microsoft tutorial [[pdf]](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/NAACL-HLT-2015_tutorial.pdf)

[Neural conversational models: как научить нейронную сеть светской беседе](https://habr.com/company/yandex/blog/333912/)  
[Искусственный интеллект в поиске. Как Яндекс научился применять нейронные сети, чтобы искать по смыслу, а не по словам](https://habr.com/company/yandex/blog/314222/)  
[Triplet loss, Olivier Moindrot](https://omoindrot.github.io/triplet-loss)