In [1]:
!wget -q https://www.dropbox.com/s/43l702z5a5i2w8j/gazeta_train.txt
!wget -q https://www.dropbox.com/s/k2egt3sug0hb185/gazeta_val.txt
!wget -q https://www.dropbox.com/s/3gki5n5djs9w0v6/gazeta_test.txt

In [1]:
!pip install --upgrade tokenizers razdel torch networkx pymorphy2 nltk rouge==0.3.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Посмотрим на то, как устроен датасет

In [1]:
import json
import random
from tqdm import tqdm

def read_gazeta_records(file_name):
    records = []
    with open(file_name, "r") as r:
        for line in r:
            records.append(json.loads(line))

    random.shuffle(records)

    return records

In [2]:
train_records = read_gazeta_records("gazeta_train.txt")
val_records = read_gazeta_records("gazeta_val.txt")
test_records = read_gazeta_records("gazeta_test.txt")

In [3]:
train_records[20]

{'url': 'https://www.gazeta.ru/tech/2019/03/25_a_12263077.shtml',
 'text': 'На входе в новый магазин Tele2 посетитель замечает, что привычную надпись «салон связи» заменяет вывеска «Другие правила», а описание режима работы – призыв «Общайтесь». Компания заявляет, что каждый элемент новой розницы прошел через фильтр диджитализации и концепцию «других правил». Цифровой подход воплотился в сервисе, ассортименте и коммуникациях. Аудитория оператора становится все более цифровой, поэтому запуск формата – логичный ответ на тренд диждитализации. «По итогам прошлого года потребление интернет-трафика на одного пользователя Tele2 выросло почти в полтора раза. Клиенты приобрели в два раза больше смартфонов, чем годом ранее. Аудитория нашего мобильного приложения также удвоилась. Вывод очевиден – наши клиенты уходят в цифру», – заключил Игорь Майстренко , директор по продажам и развитию массового сегмента Tele2. Витрины нового магазина оформлены в непривычном для салона связи дизайне. Оператор ор

In [4]:
len(train_records)

52400

## Lead-3

Бейзлайн - первые 3 предложения текста в качестве summary.


В качестве метрик здесь и далее используем BLEU и ROUGE.

In [5]:
from nltk.translate.bleu_score import corpus_bleu
from rouge import Rouge

def calc_scores(references, predictions, metric="all"):
    print("Tar:", references[0])
    print("Pred:", predictions[0])

    if metric in ("bleu", "all"):
        print("BLEU: ", corpus_bleu([[r.split()] for r in references], [p.split() for p in predictions]))
    if metric in ("rouge", "all"):
        rouge = Rouge()
        scores = rouge.get_scores(predictions, references, avg=True)
        print("ROUGE: ", scores)

In [6]:
import razdel

def calc_lead_n_score(records, n=1, lower=True, nrecords=1000):
    references = []
    predictions = []

    for record in records[:nrecords]:
        summary = record["summary"]
        summary = summary if not lower else summary.lower()
        references.append(summary)

        text = record["text"]
        sentences = [sentence.text.lower() for sentence in razdel.sentenize(text)]
        prediction = " ".join(sentences[:n])
        predictions.append(prediction)

    calc_scores(references, predictions)

calc_lead_n_score(test_records, n=3)

Tar: хорею гентингтона, тяжелое наследственное заболевание нервной системы, можно вылечить, считает международная группа исследователей. им удалось снизить число тринуклеотидных повторов, которое приводит к появлению симптомов. также новый способ лечения будет эффективен и против других заболеваний похожей природы.
Pred: международная команда исследователей под руководством специалистов из канадской детской больницы sickkids нашла способ вылечить хорею гентингтона или, как минимум, замедлить ее развитие, а также облегчить симптомы других заболеваний сходной природы. работа была опубликована в журнале nature genetics. хорея гентингтона — генетическое заболевание нервной системы, которое возникает из-за мутации гена htt.
BLEU:  0.07892949436329745
ROUGE:  {'rouge-1': {'f': 0.26346609918664804, 'p': 0.24937931490857518, 'r': 0.29722136891010914}, 'rouge-2': {'f': 0.11164812927783638, 'p': 0.10454227138078512, 'r': 0.12930484255529515}, 'rouge-l': {'f': 0.22648967863851122, 'p': 0.22688581

##TextRank

TextRank - unsupervised метод для составления кратких выжимок из текста. 

In [None]:
from itertools import combinations
import networkx as nx
import pymorphy2
import numpy as np

def unique_words_similarity(words1, words2):
    '''
    Функция подсчёта близости предложений на основе пересечения слов
    ''' 
    words1 = set(words1)
    words2 = set(words2)
  
    if not len(words1) or not len(words2):
        return 0.0

    return len(words1 & words2) / np.log10(len(words1) * len(words2))

def gen_text_rank_summary(text, calc_similarity=unique_words_similarity, summary_part=0.1, lower=True, morph=None):
    '''
    Составление summary с помощью TextRank
    '''
    # Разбиваем текст на предложения
    sentences = [sentence.text for sentence in razdel.sentenize(text)]
    n_sentences = len(sentences)

    # Токенизируем предложения
    sentences_words = [[token.text.lower() if lower else token.text for token in razdel.tokenize(sentence)] for sentence in sentences]

    # При необходимости лемматизируем слова
    if morph is not None:
        sentences_words = [[morph.parse(word)[0].normal_form for word in words] for words in sentences_words]

    # Для каждой пары предложений считаем близость
    pairs = combinations(range(n_sentences), 2)
    scores = [(i, j, calc_similarity(sentences_words[i], sentences_words[j])) for i, j in pairs]

    # Строим граф с рёбрами, равными близости между предложениями
    g = nx.Graph()
    g.add_weighted_edges_from(scores)

    # Считаем PageRank
    pr = nx.pagerank(g)
    result = [(i, pr[i], s) for i, s in enumerate(sentences) if i in pr]
    result.sort(key=lambda x: x[1], reverse=True)

    # Выбираем топ предложений
    n_summary_sentences = max(int(n_sentences * summary_part), 1)
    result = result[:n_summary_sentences]

    # Восстанавливаем оригинальный их порядок
    result.sort(key=lambda x: x[0])

    # Восстанавливаем текст выжимки
    predicted_summary = " ".join([sentence for i, proba, sentence in result])
    predicted_summary = predicted_summary.lower() if lower else predicted_summary
    return predicted_summary

def calc_text_rank_score(records, calc_similarity=unique_words_similarity, summary_part=0.1, lower=True, nrows=1000, morph=None):
    references = []
    predictions = []

    for record in records[:nrows]:
        summary = record["summary"]
        summary = summary if not lower else summary.lower()
        references.append(summary)

        text = record["text"]
        predicted_summary = gen_text_rank_summary(text, calc_similarity, summary_part, lower, morph=morph)
        text = text if not lower else text.lower()
        predictions.append(predicted_summary)

    calc_scores(references, predictions)

morph = pymorphy2.MorphAnalyzer()
calc_text_rank_score(test_records)
calc_text_rank_score(test_records, morph=morph)

Tar: в воронежской области врач-рентгенолог оказалась не в состоянии помочь ребенку, поскольку была сильно пьяна. на видеозаписи, сделанной в медучреждении, видно, что женщина не может подняться с кровати и не реагирует на возмущенные крики пациентов в свой адрес. позднее она пояснила, что употребила алкоголь из-за сложных семейных обстоятельств.
Pred: как рассказал отец ребенка в соцсетях, он привел сына на рентген, поскольку опасался, что у того сломана рука. в ее объяснительной сказано, что у нее сложились вот такие семейные обстоятельства», – подчеркнул представитель црб. житель приморья также выложил в соцсети видео и заявил, что если бы врач находился на работе в трезвом состоянии, возможно, он смог бы оказать пострадавшему квалифицированную помощь. некоторые пользователи узнали медика и сообщили в комментариях, что у него давно наблюдаются проблемы с алкоголем.
BLEU:  0.021070976947306745
ROUGE:  {'rouge-1': {'f': 0.15962779073032757, 'p': 0.13369862066804186, 'r': 0.21490563304

### **Задание 1**

Сделайте TextRank с другой мерой близости предложений: по FastText, ELMo или BERT эмбеддингам

In [12]:
from google.colab import drive
drive.mount('/content/gdrive')

!unzip '/content/gdrive/My Drive/Colab Notebooks/213.zip'

Mounted at /content/gdrive
Archive:  /content/gdrive/My Drive/Colab Notebooks/213.zip
  inflating: meta.json               
  inflating: model.model             
  inflating: model.model.vectors_ngrams.npy  
  inflating: model.model.vectors.npy  
  inflating: model.model.vectors_vocab.npy  
  inflating: README                  


In [None]:
import gensim
import numpy as np

model_path = 'model.model'

model = gensim.models.fasttext.FastTextKeyedVectors.load(model_path)
model.adjust_vectors()

In [None]:
from scipy import spatial
from numpy import dot
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity

def fast_text_sentences_similarity(words1, words2):
    '''
    Функция подсчёта близости предложений на основе пересечения слов
    ''' 
    words1 = set(words1)
    words2 = set(words2)
    embeddings1 = np.array([model.get_vector(word.lower()) if word.lower() in model.key_to_index else np.zeros((model.vector_size,))
                        for word in words1])
    
    embeddings2 = np.array([model.get_vector(word.lower()) if word.lower() in model.key_to_index else np.zeros((model.vector_size,))
                    for word in words2])
    
    sent1 = np.mean(embeddings1, axis=0)
    sent2 = np.mean(embeddings2, axis=0)

    return 1 - spatial.distance.cosine(sent1, sent2)

In [None]:
morph = pymorphy2.MorphAnalyzer()
calc_text_rank_score(test_records, calc_similarity=fast_text_sentences_similarity)
calc_text_rank_score(test_records, calc_similarity=fast_text_sentences_similarity, morph=morph)

  dist = 1.0 - uv / np.sqrt(uu * vv)


Tar: светлана лобода отпраздновала свой 37-й день рождения в компании лидера немецкой рок-группы rammstein тилля линдеманна. певица поделилась с подписчиками фотографией с камерного застолья в свою честь, на котором помимо предполагаемого возлюбленного присутствовали друзья и команда знаменитости.
Pred: в самом начале беседы собчак заверила лободу, что не станет расспрашивать ее о знаменитом рокере, поскольку ей известно, что та подобные вопросы не любит и не отвечает на них. следом же журналистка напомнила об интервью певицы в шоу рэпера басты, когда та обмолвилась о том, что не может публично говорить о своей личной жизни из-за наличия некоего контракта. дальнейшие попытки обсудить волнующую всю страну тему лобода, не без труда маскируя раздражение, отклонила смягчающей фразой о том, что собчак ей очень симпатична, но она «ничего не получит в ответ».
BLEU:  0.016490425580728312
ROUGE:  {'rouge-1': {'f': 0.14890736534842813, 'p': 0.12012349893901877, 'r': 0.20965926075082728}, 'rouge-

## Oracle summary

Для сведения задачи к extractive summarization мы должны выбрать те предложения из оригинального текста, которые наиболее похожи на наше целевое summary по нашим метрикам.

In [7]:
import copy

def build_oracle_summary_greedy(text, gold_summary, calc_score, lower=True, max_sentences=30, max_summary_sentences=5):
    '''
    Жадное построение oracle summary
    '''
    gold_summary = gold_summary.lower() if lower else gold_summary
    # Делим текст на предложения
    sentences = [sentence.text.lower() if lower else sentence.text for sentence in razdel.sentenize(text)][:max_sentences]
    n_sentences = len(sentences)
    oracle_summary_sentences = set()
    score = -1.0
    summaries = []
    for _ in range(min(max_summary_sentences, n_sentences)):
        for i in range(n_sentences):
            if i in oracle_summary_sentences:
                continue
            current_summary_sentences = copy.copy(oracle_summary_sentences)
            # Добавляем какое-то предложения к уже существующему summary
            current_summary_sentences.add(i)
            current_summary = " ".join([sentences[index] for index in sorted(list(current_summary_sentences))])
            # Считаем метрики
            current_score = calc_score(current_summary, gold_summary)
            summaries.append((current_score, current_summary_sentences))
        # Если получилось улучшить метрики с добавлением какого-либо предложения, то пробуем добавить ещё
        # Иначе на этом заканчиваем
        best_summary_score, best_summary_sentences = max(summaries)
        if best_summary_score <= score:
            break
        oracle_summary_sentences = best_summary_sentences
        score = best_summary_score
    oracle_summary = " ".join([sentences[index] for index in sorted(list(oracle_summary_sentences))])
    return oracle_summary, oracle_summary_sentences

def calc_single_score(pred_summary, gold_summary, rouge):
    return rouge.get_scores([pred_summary], [gold_summary], avg=True)['rouge-2']['f']

In [8]:
def calc_oracle_score(records, nrows=100, lower=True):
    references = []
    predictions = []
    rouge = Rouge()
  
    for record in tqdm(records[:nrows]):
        summary = record["summary"]
        summary = summary if not lower else summary.lower()
        references.append(summary)

        text = record["text"]
        predicted_summary, _ = build_oracle_summary_greedy(text, summary, calc_score=lambda x, y: calc_single_score(x, y, rouge))
        predictions.append(predicted_summary)

    calc_scores(references, predictions)


calc_oracle_score(test_records)

100%|██████████| 100/100 [00:20<00:00,  4.88it/s]

Tar: хорею гентингтона, тяжелое наследственное заболевание нервной системы, можно вылечить, считает международная группа исследователей. им удалось снизить число тринуклеотидных повторов, которое приводит к появлению симптомов. также новый способ лечения будет эффективен и против других заболеваний похожей природы.
Pred: хорея гентингтона — генетическое заболевание нервной системы, которое возникает из-за мутации гена htt. чем больше количество тринуклеотидных повторов, тем раньше проявляются симптомы. выраженность данных симптомов носит прогрессирующий характер и со временем приводит к инвалидности.
BLEU:  0.14839518337610041
ROUGE:  {'rouge-1': {'f': 0.37838584385776053, 'p': 0.40855677080118313, 'r': 0.3826106742462572}, 'rouge-2': {'f': 0.21369184587032666, 'p': 0.24043335812746858, 'r': 0.21216642563063615}, 'rouge-l': {'f': 0.3337345310991497, 'p': 0.3835169293425399, 'r': 0.35807248520225543}}





## Extractive RNN

Теперь пробуем предсказать oracle summary

### BPE
Для начала сделаем BPE токенизацию

In [9]:
from tokenizers import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer(lowercase=True)

tokenizer.train_from_iterator([record['text'] for record in train_records[:1000]], vocab_size=8192, show_progress=True)

### Кэш oracle summary
Закэшируем oracle summary, чтобы не пересчитывать их каждый раз

In [None]:
from rouge import Rouge
import razdel

from tqdm import tqdm

def add_oracle_summary_to_records(records, max_sentences=30, lower=True, nrows=1000):
    rouge = Rouge()
    for record in tqdm(records[:nrows]):
        text = record["text"]
        summary = record["summary"]
        summary = summary.lower() if lower else summary
        sentences = [sentence.text.lower() if lower else sentence.text for sentence in razdel.sentenize(text)][:max_sentences]
        oracle_summary, sentences_indicies = build_oracle_summary_greedy(text, summary, calc_score=lambda x, y: calc_single_score(x, y, rouge),
                                                                         lower=lower)
        record["sentences"] = sentences
        record["oracle_sentences"] = list(sentences_indicies)
        record["oracle_summary"] = oracle_summary
    return records[:nrows]

ext_train_records = add_oracle_summary_to_records(train_records, nrows=16384)
ext_val_records = add_oracle_summary_to_records(val_records, nrows=1024)
ext_test_records = add_oracle_summary_to_records(test_records, nrows=1024)

100%|██████████| 16384/16384 [40:12<00:00,  6.79it/s]
100%|██████████| 1024/1024 [03:05<00:00,  5.52it/s]
100%|██████████| 1024/1024 [02:55<00:00,  5.84it/s]


In [None]:
import json

def write_gazeta_records(records, file_name):
    with open(file_name, "w") as w:
        for record in records:
            record["oracle_sentences"] = list(record["oracle_sentences"])
            w.write(json.dumps(record, ensure_ascii=False).strip() + "\n")

write_gazeta_records(ext_train_records, "gazeta_train_with_oracle.txt")
write_gazeta_records(ext_val_records, "gazeta_val_with_oracle.txt")
write_gazeta_records(ext_test_records, "gazeta_test_with_oracle.txt")

In [10]:
ext_train_records = read_gazeta_records("gazeta_train_with_oracle.txt")
ext_val_records = read_gazeta_records("gazeta_val_with_oracle.txt")
ext_test_records = read_gazeta_records("gazeta_test_with_oracle.txt")

### Составление батчей

In [None]:
import random
import math
import razdel
import torch
import numpy as np
from rouge import Rouge


class BatchIterator():
    def __init__(self, records, batch_size, tokenizer, max_sentences=30, max_sentence_length=50, device=torch.device('cuda')):
        self.records = records
        self.num_samples = len(records)
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.batches_count = int(math.ceil(self.num_samples / batch_size))
        self.rouge = Rouge()
        self.max_sentences = max_sentences
        self.max_sentence_length = max_sentence_length
        self.device = device #torch.device('cuda')
        
    def __len__(self):
        return self.batches_count
    
    def __iter__(self):
        indices = np.arange(self.num_samples)
        np.random.shuffle(indices)

        for start in range(0, self.num_samples, self.batch_size):
            end = min(start + self.batch_size, self.num_samples)
            batch_indices = indices[start:end]
            batch_inputs = []
            batch_outputs = []
            max_sentence_length = 0
            max_sentences = 0
            batch_records = []
            for data_ind in batch_indices:
                record = self.records[data_ind]
                batch_records.append(record)
                text = record["text"]
                summary = record["summary"]
                summary = summary.lower()

                if "sentences" not in record:
                    sentences = [sentence.text.lower() for sentence in razdel.sentenize(text)][:self.max_sentences]
                else:
                    sentences = record["sentences"]
                max_sentences = max(len(sentences), max_sentences)

                if "oracle_sentences" not in record:
                    calc_score = lambda x, y: calc_single_score(x, y, self.rouge)
                    sentences_indicies = build_oracle_summary_greedy(text, summary, calc_score=calc_score, max_sentences=self.max_sentences)[1]
                else:
                    sentences_indicies = record["oracle_sentences"]

                inputs = [tokenizer.encode(sentence).ids[:self.max_sentence_length] for sentence in sentences]
                max_sentence_length = max(max_sentence_length, max([len(tokens) for tokens in inputs]))
                outputs = [int(i in sentences_indicies) for i in range(len(sentences))]
                batch_inputs.append(inputs)
                batch_outputs.append(outputs)
            tensor_inputs = torch.zeros((self.batch_size, max_sentences, max_sentence_length), dtype=torch.long, device=self.device)
            tensor_outputs = torch.zeros((self.batch_size, max_sentences), dtype=torch.float32, device=self.device)
            for i, inputs in enumerate(batch_inputs):
                for j, sentence_tokens in enumerate(inputs):
                    tensor_inputs[i][j][:len(sentence_tokens)] = torch.LongTensor(sentence_tokens)
            for i, outputs in enumerate(batch_outputs):
                tensor_outputs[i][:len(outputs)] = torch.LongTensor(outputs)

            yield {
                'inputs': tensor_inputs,
                'outputs': tensor_outputs,
                'records': batch_records
            }

In [None]:
train_iterator = BatchIterator(ext_train_records, 32, tokenizer)

### Обучение

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import time

def train_model(model, train_records, val_records, tokenizer, batch_size=32,
                epochs_count=10, loss_every_nsteps=16, lr=0.001, device_name="cuda"):
    params_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("Trainable params: {}".format(params_count))
    device = torch.device(device_name)
    model = model.to(device)
    total_loss = 0
    start_time = time.time()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_function = nn.BCEWithLogitsLoss().to(device)
    for epoch in range(epochs_count):
        for step, batch in enumerate(BatchIterator(train_records, batch_size, tokenizer, device=device)):
            model.train()
            logits = model(batch["inputs"])
            loss = loss_function(logits, batch["outputs"])
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()
            if step % loss_every_nsteps == 0 and step != 0:
                val_total_loss = 0
                val_batch_count = 0
                model.eval()
                for _, val_batch in enumerate(BatchIterator(val_records, batch_size, tokenizer, device=device)):
                    logits = model(val_batch["inputs"])
                    val_total_loss += loss_function(logits, batch["outputs"])
                    val_batch_count += 1
                avg_val_loss = val_total_loss/val_batch_count
                print(f"Epoch = {epoch}, Avg Train Loss = {total_loss / loss_every_nsteps:.4f}, Avg val loss = {avg_val_loss:.4f}, Time = {time.time() - start_time:.2f}s")
                total_loss = 0
                start_time = time.time()
        total_loss = 0
        start_time = time.time()

In [None]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.nn.utils.rnn import pack_padded_sequence as pack
from torch.nn.utils.rnn import pad_packed_sequence as unpack

class SentenceEncoderRNN(nn.Module):
    def __init__(self, input_size, embedding_dim, hidden_size, n_layers=3, dropout=0.3):
        super(SentenceEncoderRNN, self).__init__()

        self.embedding_dim = embedding_dim
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding_layer = nn.Embedding(input_size, embedding_dim)
        self.rnn_layer = nn.LSTM(embedding_dim, hidden_size, n_layers, dropout=dropout, bidirectional=True, batch_first=True)
        self.dropout_layer = nn.Dropout(dropout)

    def forward(self, inputs, hidden=None):
        embedded = self.embedding_layer(inputs)
        outputs, _ = self.rnn_layer(embedded, hidden)
        sentences_embeddings = torch.mean(outputs, 1)
        return sentences_embeddings

class SentenceTaggerRNN(nn.Module):
    def __init__(self,
                 vocabulary_size,
                 token_embedding_dim=256,
                 sentence_encoder_hidden_size=256,
                 hidden_size=256,
                 sentence_encoder_n_layers=2,
                 sentence_encoder_dropout=0.3,
                 n_layers=1,
                 dropout=0.3):
        super(SentenceTaggerRNN, self).__init__()

        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout
        
        self.sentence_encoder = SentenceEncoderRNN(vocabulary_size, token_embedding_dim,
                                                   sentence_encoder_hidden_size // 2, sentence_encoder_n_layers, 
                                                   sentence_encoder_dropout)
        self.rnn_layer = nn.LSTM(sentence_encoder_hidden_size, hidden_size, n_layers, dropout=dropout,
                           bidirectional=True, batch_first=True)
        self.dropout_layer = nn.Dropout(dropout)
        self.content_linear_layer = nn.Linear(hidden_size * 2, 1)
        self.document_linear_layer = nn.Linear(hidden_size * 2, hidden_size * 2)
        self.salience_linear_layer = nn.Linear(hidden_size * 2, hidden_size * 2)
        

    def forward(self, inputs, hidden=None):
        batch_size = inputs.size(0)
        sentences_count = inputs.size(1)
        tokens_count = inputs.size(2)
        inputs = inputs.reshape(-1, tokens_count)
        
        embedded_sentences = self.sentence_encoder(inputs)
        embedded_sentences = embedded_sentences.reshape(batch_size, sentences_count, -1)
        
        outputs, _ = self.rnn_layer(embedded_sentences, hidden)
        outputs = self.dropout_layer(outputs)
        document_embedding = self.document_linear_layer(outputs.mean(dim=1)).tanh()
        
        content = self.content_linear_layer(outputs).squeeze(2)
        salience = torch.bmm(outputs, self.salience_linear_layer(document_embedding).unsqueeze(2)).squeeze(2)
        
        return content + salience

model = SentenceTaggerRNN(tokenizer.get_vocab_size())
train_model(model, ext_train_records, ext_val_records, tokenizer, device_name="cuda", batch_size=32)



Trainable params: 4466177
Epoch = 0, Avg Train Loss = 0.3345, Avg val loss = 0.2425, Time = 13.44s
Epoch = 0, Avg Train Loss = 0.2461, Avg val loss = 0.2337, Time = 8.15s
Epoch = 0, Avg Train Loss = 0.2366, Avg val loss = 0.2534, Time = 8.86s
Epoch = 0, Avg Train Loss = 0.2388, Avg val loss = 0.2448, Time = 9.49s
Epoch = 0, Avg Train Loss = 0.2341, Avg val loss = 0.2348, Time = 7.90s
Epoch = 0, Avg Train Loss = 0.2356, Avg val loss = 0.2531, Time = 9.03s
Epoch = 0, Avg Train Loss = 0.2299, Avg val loss = 0.2389, Time = 9.07s
Epoch = 0, Avg Train Loss = 0.2350, Avg val loss = 0.2393, Time = 8.38s
Epoch = 0, Avg Train Loss = 0.2281, Avg val loss = 0.2239, Time = 9.48s
Epoch = 0, Avg Train Loss = 0.2281, Avg val loss = 0.2471, Time = 7.85s
Epoch = 0, Avg Train Loss = 0.2269, Avg val loss = 0.2541, Time = 9.13s
Epoch = 0, Avg Train Loss = 0.2361, Avg val loss = 0.2834, Time = 10.73s
Epoch = 0, Avg Train Loss = 0.2368, Avg val loss = 0.2764, Time = 7.97s
Epoch = 0, Avg Train Loss = 0.2243, 

KeyboardInterrupt: ignored

In [None]:
device = torch.device("cuda")

references = []
predictions = []
for step, batch in enumerate(BatchIterator(ext_test_records, 32, tokenizer, device=device)):
    logits = model(batch["inputs"])
    records = batch["records"]
    for record, record_logits in zip(records, logits):
        sentences = record["sentences"]
        predicted_summary = []
        for i, logit in enumerate(record_logits):
            if logit > 0.0:
                predicted_summary.append(sentences[i])
        if not predicted_summary:
            predicted_summary.append(sentences[torch.max(record_logits, dim=0)[1].item()])
        predicted_summary = " ".join(predicted_summary)
        references.append(record["summary"].lower())
        predictions.append(predicted_summary)

calc_scores(references, predictions)

Tar: наличие оргазма — не признак хорошего секса, сообщают американские исследователи. он может наступить и при нежеланном контакте, неся лишь болезненность и смятение.
Pred: наличие оргазма — не признак хорошего секса, заявляют ученые из мичиганского университета на основании результатов своего исследования.
BLEU:  0.04461730041418153
ROUGE:  {'rouge-1': {'f': 0.23533772085273544, 'p': 0.36965082381956776, 'r': 0.1836141877424964}, 'rouge-2': {'f': 0.11232423476358996, 'p': 0.18141057010646255, 'r': 0.08733431960907764}, 'rouge-l': {'f': 0.17614494478587653, 'p': 0.3314016191131096, 'r': 0.16360844212318426}}


### **Задание 2**
Доделайте модель в соответствии с https://arxiv.org/pdf/1611.04230.pdf

In [11]:
import random
import math
import razdel
import torch
import numpy as np
from rouge import Rouge


class BatchIterator():
    def __init__(self, records, batch_size, tokenizer, max_sentences=30, max_sentence_length=50, device=torch.device('cuda')):
        self.records = records
        self.num_samples = len(records)
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.batches_count = int(math.ceil(self.num_samples / batch_size))
        self.rouge = Rouge()
        self.max_sentences = max_sentences
        self.max_sentence_length = max_sentence_length
        self.device = device #torch.device('cuda')
        
    def __len__(self):
        return self.batches_count
    
    def __iter__(self):
        indices = np.arange(self.num_samples)
        np.random.shuffle(indices)

        for start in range(0, self.num_samples, self.batch_size):
            end = min(start + self.batch_size, self.num_samples)
            batch_indices = indices[start:end]
            batch_inputs = []
            doc_lens = []
            batch_outputs = []
            max_sentence_length = 0
            max_sentences = 0
            batch_records = []
            for data_ind in batch_indices:
                record = self.records[data_ind]
                batch_records.append(record)
                text = record["text"]
                summary = record["summary"]
                summary = summary.lower()

                if "sentences" not in record:
                    sentences = [sentence.text.lower() for sentence in razdel.sentenize(text)][:self.max_sentences]
                else:
                    sentences = record["sentences"]

                doc_lens.append(len(sentences))
                
                max_sentences = max(len(sentences), max_sentences)

                if "oracle_sentences" not in record:
                    calc_score = lambda x, y: calc_single_score(x, y, self.rouge)
                    sentences_indicies = build_oracle_summary_greedy(text, summary, calc_score=calc_score, max_sentences=self.max_sentences)[1]
                else:
                    sentences_indicies = record["oracle_sentences"]

                inputs = [tokenizer.encode(sentence).ids[:self.max_sentence_length] for sentence in sentences]
                max_sentence_length = max(max_sentence_length, max([len(tokens) for tokens in inputs]))
                outputs = [int(i in sentences_indicies) for i in range(len(sentences))]
                batch_inputs.append(inputs)
                batch_outputs.append(outputs)
            tensor_inputs = torch.zeros((self.batch_size, max_sentences, max_sentence_length), dtype=torch.long, device=self.device)
            tensor_outputs = torch.zeros((self.batch_size, max_sentences), dtype=torch.float32, device=self.device)
            for i, inputs in enumerate(batch_inputs):
                for j, sentence_tokens in enumerate(inputs):
                    tensor_inputs[i][j][:len(sentence_tokens)] = torch.LongTensor(sentence_tokens)
            for i, outputs in enumerate(batch_outputs):
                tensor_outputs[i][:len(outputs)] = torch.LongTensor(outputs)

            yield {
                'inputs': tensor_inputs,
                'outputs': tensor_outputs,
                'records': batch_records,
                'doc_lens': doc_lens
            }

In [12]:
train_iterator = BatchIterator(ext_train_records, 32, tokenizer)

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
import time

def train_model(model, train_records, val_records, tokenizer, batch_size=32,
                epochs_count=10, loss_every_nsteps=16, lr=0.001, device_name="cuda"):
    params_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("Trainable params: {}".format(params_count))
    device = torch.device(device_name)
    model = model.to(device)
    total_loss = 0
    start_time = time.time()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_function = nn.BCELoss().to(device)
    for epoch in range(epochs_count):
        for step, batch in enumerate(BatchIterator(train_records, batch_size, tokenizer, device=device)):
            model.train()
            logits = model(batch["inputs"], batch["doc_lens"], device)
            loss = loss_function(logits, batch["outputs"])
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()
            if step % loss_every_nsteps == 0 and step != 0:
                val_total_loss = 0
                val_batch_count = 0
                model.eval()
                for _, val_batch in enumerate(BatchIterator(val_records, batch_size, tokenizer, device=device)):
                    logits = model(val_batch["inputs"], batch["doc_lens"], device)
                    val_total_loss += loss_function(logits, batch["outputs"])
                    val_batch_count += 1
                avg_val_loss = val_total_loss/val_batch_count
                print(f"Epoch = {epoch}, Avg Train Loss = {total_loss / loss_every_nsteps:.4f}, Avg val loss = {avg_val_loss:.4f}, Time = {time.time() - start_time:.2f}s")
                total_loss = 0
                start_time = time.time()
        total_loss = 0
        start_time = time.time()

In [14]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.nn.utils.rnn import pack_padded_sequence as pack
from torch.nn.utils.rnn import pad_packed_sequence as unpack

class SentenceEncoderRNN(nn.Module):
    def __init__(self, input_size, embedding_dim, hidden_size, n_layers=3, dropout=0.3):
        super(SentenceEncoderRNN, self).__init__()

        self.embedding_dim = embedding_dim
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding_layer = nn.Embedding(input_size, embedding_dim)
        self.rnn_layer = nn.LSTM(embedding_dim, hidden_size, n_layers, dropout=dropout, bidirectional=True, batch_first=True)
        self.dropout_layer = nn.Dropout(dropout)

    def forward(self, inputs, hidden=None):
        embedded = self.embedding_layer(inputs)
        outputs, _ = self.rnn_layer(embedded, hidden)
        sentences_embeddings = torch.mean(outputs, 1)
        return sentences_embeddings

class SentenceTaggerRNN(nn.Module):
    def __init__(self,
                 vocabulary_size,
                 token_embedding_dim=256,
                 seg_num=3,
                 pos_num=30,
                 pos_dim=50,
                 sentence_encoder_hidden_size=256,
                 hidden_size=256,
                 sentence_encoder_n_layers=2,
                 sentence_encoder_dropout=0.3,
                 n_layers=1,
                 dropout=0.3):
        super(SentenceTaggerRNN, self).__init__()

        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout

        self.abs_pos_embed = nn.Embedding(pos_num, pos_dim)  # absolute postion
        self.rel_pos_embed = nn.Embedding(seg_num, pos_dim)  # relative position
        
        self.sentence_encoder = SentenceEncoderRNN(vocabulary_size, token_embedding_dim,
                                                   sentence_encoder_hidden_size // 2, sentence_encoder_n_layers, 
                                                   sentence_encoder_dropout)
        self.rnn_layer = nn.LSTM(sentence_encoder_hidden_size, hidden_size, n_layers, dropout=dropout,
                           bidirectional=True, batch_first=True)
        self.dropout_layer = nn.Dropout(dropout)
        self.document_linear_layer = nn.Linear(hidden_size * 2, hidden_size * 2)

        self.content = nn.Linear(2*hidden_size, 1, bias=False)
        self.salience = nn.Bilinear(2*hidden_size, 2*hidden_size, 1, bias=False)
        self.novelty = nn.Bilinear(2*hidden_size, 2*hidden_size, 1, bias=False)
        self.abs_pos = nn.Linear(pos_dim, 1, bias=False)
        self.rel_pos = nn.Linear(pos_dim, 1, bias=False)
        self.bias = nn.Parameter(torch.FloatTensor(1).uniform_(-0.1, 0.1))
        

    def forward(self, inputs, doc_lens, device, hidden=None):
        batch_size = inputs.size(0)
        sentences_count = inputs.size(1)
        tokens_count = inputs.size(2)
        inputs = inputs.reshape(-1, tokens_count)
        
        embedded_sentences = self.sentence_encoder(inputs)
        embedded_sentences = embedded_sentences.reshape(batch_size, sentences_count, -1)
        
        outputs, _ = self.rnn_layer(embedded_sentences, hidden)
        outputs = self.dropout_layer(outputs)
        document_embedding = self.document_linear_layer(outputs.mean(dim=1)).tanh()

        probs = []

        for index in range(batch_size):
          valid_hidden = outputs[index, :, :]
          doc = document_embedding[index, :].view(1, -1)
          s = torch.zeros([1, 2*self.hidden_size], device=device, requires_grad=True)
          sent_probs = []

          for position, h in enumerate(valid_hidden):
              h = h.view(1, -1)

              abs_index = torch.tensor([[position]], dtype=torch.long).to(device) 
              abs_features = self.abs_pos_embed(abs_index).squeeze(0)

              rel_index = round((position + 1) * 9.0 / sentences_count)
              rel_index = torch.tensor([[rel_index]], dtype=torch.long).to(device)
              rel_features = self.rel_pos_embed(rel_index).squeeze(0)

              content = self.content(h)
              salience = self.salience(h, doc)
              novelty = -1 * self.novelty(h, torch.tanh(s))
              abs_p = self.abs_pos(abs_features)
              rel_p = self.rel_pos(rel_features)

              prob = torch.sigmoid(content + salience + novelty + abs_p + rel_p + self.bias)
              s = s + torch.mm(prob, h)
              sent_probs.append(prob)
        
          probs.append(torch.cat(sent_probs).squeeze())
        
        return torch.cat(probs, dim=0).reshape(batch_size, sentences_count)

model = SentenceTaggerRNN(tokenizer.get_vocab_size())
train_model(model, ext_train_records, ext_val_records, tokenizer, device_name="cuda", batch_size=32)



Trainable params: 4729559


RuntimeError: ignored

In [17]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

from torch.nn.utils.rnn import pack_padded_sequence as pack
from torch.nn.utils.rnn import pad_packed_sequence as unpack

class SentenceEncoderRNN(nn.Module):
    def __init__(self, input_size, embedding_dim, hidden_size, n_layers=3, dropout=0.3):
        super(SentenceEncoderRNN, self).__init__()

        self.embedding_dim = embedding_dim
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding_layer = nn.Embedding(input_size, embedding_dim)
        self.rnn_layer = nn.LSTM(embedding_dim, hidden_size, n_layers, dropout=dropout, bidirectional=True, batch_first=True)
        self.dropout_layer = nn.Dropout(dropout)

    def forward(self, inputs, hidden=None):
        embedded = self.embedding_layer(inputs)
        outputs, _ = self.rnn_layer(embedded, hidden)
        sentences_embeddings = torch.mean(outputs, 1)
        return sentences_embeddings

class SentenceTaggerRNN(nn.Module):
    def __init__(self,
                 vocabulary_size,
                 token_embedding_dim=256,
                 seg_num=10,
                 pos_num=100,
                 pos_dim=50,
                 sentence_encoder_hidden_size=256,
                 hidden_size=256,
                 sentence_encoder_n_layers=2,
                 sentence_encoder_dropout=0.3,
                 n_layers=1,
                 dropout=0.3):
        super(SentenceTaggerRNN, self).__init__()

        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout

        self.abs_pos_embed = nn.Embedding(pos_num, pos_dim)  # absolute postion
        self.rel_pos_embed = nn.Embedding(seg_num, pos_dim)  # relative position
        
        self.sentence_encoder = SentenceEncoderRNN(vocabulary_size, token_embedding_dim,
                                                   sentence_encoder_hidden_size // 2, sentence_encoder_n_layers, 
                                                   sentence_encoder_dropout)
        self.rnn_layer = nn.LSTM(sentence_encoder_hidden_size, hidden_size, n_layers, dropout=dropout,
                           bidirectional=True, batch_first=True)
        self.dropout_layer = nn.Dropout(dropout)
        self.document_linear_layer = nn.Linear(hidden_size * 2, hidden_size * 2)

        self.content = nn.Linear(2*hidden_size, 1, bias=False)
        self.salience = nn.Bilinear(2*hidden_size, 2*hidden_size, 1, bias=False)
        self.novelty = nn.Bilinear(2*hidden_size, 2*hidden_size, 1, bias=False)
        self.abs_pos = nn.Linear(pos_dim, 1, bias=False)
        self.rel_pos = nn.Linear(pos_dim, 1, bias=False)
        self.bias = nn.Parameter(torch.FloatTensor(1).uniform_(-0.1, 0.1))
        

    def forward(self, inputs, doc_lens, device, hidden=None):
        batch_size = inputs.size(0)
        sentences_count = inputs.size(1)
        tokens_count = inputs.size(2)
        inputs = inputs.reshape(-1, tokens_count)
        
        embedded_sentences = self.sentence_encoder(inputs)
        embedded_sentences = embedded_sentences.reshape(batch_size, sentences_count, -1)
        
        outputs, _ = self.rnn_layer(embedded_sentences, hidden)
        outputs = self.dropout_layer(outputs)
        document_embedding = self.document_linear_layer(outputs.mean(dim=1)).tanh()

        probs = []

        for index in range(batch_size):
          valid_hidden = outputs[index, :, :]
          doc = document_embedding[index, :].view(1, -1)
          s = Variable(torch.zeros(1, 2*self.hidden_size)).to(device)
          sent_probs = []

          for position, h in enumerate(valid_hidden):
              h = h.view(1, -1)

              abs_index = Variable(torch.LongTensor([[position]])).to(device)
              abs_features = self.abs_pos_embed(abs_index).squeeze(0)

              rel_index =int(round((position + 1) * 9.0 / sentences_count))
              rel_index = Variable(torch.LongTensor([[rel_index]])).to(device)

              rel_features = self.rel_pos_embed(rel_index).squeeze(0)

              content = self.content(h)
              salience = self.salience(h, doc)
              novelty = -1 * self.novelty(h, torch.tanh(s))
              abs_p = self.abs_pos(abs_features)
              rel_p = self.rel_pos(rel_features)

              prob = torch.sigmoid(content + salience + novelty + abs_p + rel_p + self.bias)
              s = s + torch.mm(prob, h)
              sent_probs.append(prob)
        
          probs.append(torch.cat(sent_probs).squeeze())
        
        return torch.cat(probs, dim=0).reshape(batch_size, sentences_count)

model = SentenceTaggerRNN(tokenizer.get_vocab_size())
train_model(model, ext_train_records, ext_val_records, tokenizer, device_name="cuda", batch_size=32)



Trainable params: 4733409
Epoch = 0, Avg Train Loss = 0.2949, Avg val loss = 0.2196, Time = 64.95s
Epoch = 0, Avg Train Loss = 0.2438, Avg val loss = 0.2167, Time = 63.10s
Epoch = 0, Avg Train Loss = 0.2371, Avg val loss = 0.2008, Time = 62.93s
Epoch = 0, Avg Train Loss = 0.2330, Avg val loss = 0.2627, Time = 63.70s
Epoch = 0, Avg Train Loss = 0.2274, Avg val loss = 0.2168, Time = 63.66s
Epoch = 0, Avg Train Loss = 0.2265, Avg val loss = 0.2063, Time = 62.42s
Epoch = 0, Avg Train Loss = 0.2273, Avg val loss = 0.2296, Time = 64.00s
Epoch = 0, Avg Train Loss = 0.2218, Avg val loss = 0.2250, Time = 63.86s
Epoch = 0, Avg Train Loss = 0.2353, Avg val loss = 0.2559, Time = 62.69s
Epoch = 0, Avg Train Loss = 0.2194, Avg val loss = 0.2039, Time = 64.04s
Epoch = 0, Avg Train Loss = 0.2304, Avg val loss = 0.2125, Time = 63.29s
Epoch = 0, Avg Train Loss = 0.2204, Avg val loss = 0.2686, Time = 64.01s
Epoch = 0, Avg Train Loss = 0.2243, Avg val loss = 0.2001, Time = 64.56s
Epoch = 0, Avg Train Loss

KeyboardInterrupt: ignored

In [23]:
device = torch.device("cuda")

references = []
predictions = []
for step, batch in enumerate(BatchIterator(ext_test_records, 32, tokenizer, device=device)):
    logits = model(batch["inputs"], batch["doc_lens"], device)
    records = batch["records"]
    for record, record_logits in zip(records, logits):
        sentences = record["sentences"]
        predicted_summary = []
        for i, logit in enumerate(record_logits):
            if (logit > 0.0 and i < len(sentences)):
                predicted_summary.append(sentences[i])
        if not predicted_summary:
            predicted_summary.append(sentences[torch.max(record_logits, dim=0)[1].item()])
        predicted_summary = " ".join(predicted_summary)
        references.append(record["summary"].lower())
        predictions.append(predicted_summary)

calc_scores(references, predictions)

Tar: по указу президента россии владимира путина, в ск, мвд, мчс и фсин был произведен ряд кадровых перестановок. ряд сотрудников этих силовых структур покинули свои должности, а некоторые заняли новые посты. незадолго до этого стало известно, что глава государства отправил в отставку директора фсин геннадия корниенко.
Pred: президент россии владимир путин произвел ряд перестановок в следственном комитете (ск), мвд , мчс и федеральной службе исполнения наказаний ( фсин ) россии. соответствующий указ был опубликован 3 октября на официальном интернет-портале правовой информации. глава государства освободил от своих должностей начальника управления фсин по владимирской области андрея винограда, начальника гу мчс по республике коми александра князева , руководителя су ск рф по калининградской области виктора леденева, заместителя начальника гу мвд по новосибирской области, начальника главного следственного управления андрея неупокоева и главного инспектора мвд рф эдуарда соболя. также в от