# Scraping

In [None]:
# Выкачиваем и парсим новости

from newspaper import Article
from tqdm.notebook import tqdm
import pandas as pd
import os

def parse_urls(urls):
    """Download and parse every URL with newspaper and collect the results.

    Args:
        urls: iterable of article URLs.

    Returns:
        pandas.DataFrame with columns ``Title``, ``Text``, ``Tags`` and ``Date``,
        one row per article that was downloaded and parsed successfully.
        URLs that raise ``ArticleException`` are reported on stdout and skipped.
    """
    # Imported locally: the notebook only imports `Article`, so the previous
    # `newspaper.ArticleException` reference raised NameError on the first
    # failed download instead of skipping the URL.
    from newspaper.article import ArticleException

    data = []
    for url in tqdm(urls):
        try:
            article = Article(url)
            article.download()
            article.parse()
        except ArticleException:
            print('Could not download {}'.format(url))
        else:
            data.append((article.title, article.text, article.tags, article.publish_date))

    return pd.DataFrame(data, columns=['Title', 'Text', 'Tags', 'Date'])

## Reuters

In [None]:
# Набираем урлы нужных новостей

import requests
import re

def get_reuters_urls(url_template, num_pages, html_regex, timeout=30):
    """Collect the unique article URLs linked from paginated Reuters listing pages.

    Args:
        url_template: listing URL with one ``{}`` placeholder for the page number.
        num_pages: number of listing pages to fetch; pages are numbered from 1.
        html_regex: regular expression whose single capture group is the
            site-relative article path found in the page HTML.
        timeout: seconds to wait for each HTTP response. Without a timeout a
            stalled connection would block the whole crawl indefinitely.

    Returns:
        Sorted list of unique absolute article URLs (sorted so that repeated
        runs produce the same order).
    """
    prefix = "http://www.reuters.com"
    pattern = re.compile(html_regex)
    article_paths = set()
    for page_ind in range(1, num_pages + 1):
        url = url_template.format(str(page_ind))
        html = requests.get(url, timeout=timeout).text
        article_paths.update(pattern.findall(html))
    return sorted(prefix + path for path in article_paths)

In [None]:
# Мировые новости

# Collect article links from the Reuters archive listing, then download and parse them.
# The href pattern stops at the first closing quote ([^"]*): the previous greedy `.*`
# could run on to a later `">` on the same line and capture markup as part of the path.
reuters_urls = get_reuters_urls(
    'https://www.reuters.com/news/archive/?view=page&page={}&pageSize=10',
    350,
    r'<a href="(/article/[^"]*)">')
reuters_data = parse_urls(reuters_urls)
reuters_data.head()

In [None]:
# Persist the scraped Reuters articles (without the index column) under drive_root.
reuters_data.to_csv(
    os.path.join(drive_root, 'covid_world_reuters.csv'),
    index=False,
)

## Интерфакс

In [None]:
# Набираем урлы

def get_interfax_urls(url_template, num_articles, start_id=704968):
    """Build Interfax article URLs by counting article ids downwards.

    Args:
        url_template: URL with one ``{}`` placeholder for the numeric article id.
        num_articles: how many consecutive ids to generate.
        start_id: id of the first (highest) article; each following URL uses
            the previous id minus one. Defaults to the id the notebook was
            originally run with.

    Returns:
        List of ``num_articles`` URLs, from ``start_id`` downwards.
    """
    return [
        url_template.format(str(news_id))
        for news_id in range(start_id, start_id - num_articles, -1)
    ]

In [None]:
# Generate the Interfax URLs by id and scrape them.
# The '/russia/' path segment is nominal: the ids actually run through all categories in a row.
interfax_urls = get_interfax_urls(
    url_template='https://www.interfax.ru/russia/{}',
    num_articles=8000,
)
interfax_data = parse_urls(urls=interfax_urls)
interfax_data.head()

In [None]:
# Persist the scraped Interfax articles (without the index column) under drive_root.
interfax_data.to_csv(
    os.path.join(drive_root, 'covid_world_interfax.csv'),
    index=False,
)

# Cleaning and segmenting

In [None]:
import pandas as pd
import os

# Reload the scraped articles from the CSV files written by the scraping section.
# The file names now match what that section saves ('covid_world_reuters.csv' and
# 'covid_world_interfax.csv'); the previous '..._target.csv' / '..._source.csv'
# names are not produced by any cell in this notebook.
# NOTE(review): drive_root is assumed to be defined by an earlier setup cell.
reuters_data = pd.read_csv(os.path.join(drive_root, 'covid_world_reuters.csv'))
interfax_data = pd.read_csv(os.path.join(drive_root, 'covid_world_interfax.csv'))

In [None]:
# Number of articles per outlet, one count per line (Reuters first, then Interfax).
print(len(reuters_data), len(interfax_data), sep='\n')

4399
7857


In [None]:
# На выходе должны получиться мапы из id предложения в предложение.

from deeppavlov.models.tokenizers.ru_sent_tokenizer import RuSentTokenizer
from mosestokenizer import MosesSentenceSplitter
import re
from tqdm.notebook import tqdm

def get_interfax_sents(data, id_prefix, min_sent_len=30):
    """Split Interfax articles into sentences and index them by a generated id.

    Args:
        data: DataFrame with ``Text`` and ``Title`` columns.
        id_prefix: string prepended to the running sentence counter to form ids.
        min_sent_len: body sentences must be longer than this many characters
            to be kept. Titles are not length-filtered.

    Returns:
        dict mapping ``id_prefix + str(n)`` to a sentence: all kept body
        sentences first, followed by the article titles.
    """
    sents = {}
    sent_id = 0
    tokenize = RuSentTokenizer()
    for text in tqdm(data['Text'].tolist()):
        # Empty cells come back from read_csv as NaN (a float); re.sub would raise on them.
        if not isinstance(text, str):
            continue
        # Drop the dateline that ends with "INTERFAX.RU -" and flatten paragraph breaks.
        text = re.sub(r'(.*INTERFAX\.RU -)', '', text)
        text = re.sub('(\n\n)', ' ', text)
        # The tokenizer takes a batch of texts and yields one list of sentences per
        # text. Iterate over every sentence: the previous `sent[0]` kept only the
        # first sentence of each article and raised IndexError on an empty list.
        for text_sents in tokenize([text]):
            for sent in text_sents:
                if len(sent) > min_sent_len:
                    sents[id_prefix + str(sent_id)] = sent
                    sent_id += 1
    # Add the titles as separate entries.
    for title in data['Title'].tolist():
        if not isinstance(title, str):
            continue  # missing title (NaN)
        sents[id_prefix + str(sent_id)] = title
        sent_id += 1
    return sents


def get_reuters_sents(data, id_prefix, min_sent_len=30):
    """Split Reuters articles into sentences and index them by a generated id.

    Args:
        data: DataFrame with ``Text`` and ``Title`` columns.
        id_prefix: string prepended to the running sentence counter to form ids.
        min_sent_len: body sentences must be longer than this many characters
            (after quote stripping) to be kept. Titles are not length-filtered.

    Returns:
        dict mapping ``id_prefix + str(n)`` to a sentence: all kept body
        sentences first, followed by the article titles.
    """
    opening_quotes = ('"', '“')
    # The original only checked '"' and the *opening* curly quote at the end of a
    # sentence; the closing curly quote is what actually terminates a quotation.
    closing_quotes = ('"', '”', '“')

    sents = {}
    sent_id = 0
    # Reuters text is English, so use the English splitter rules rather than 'ru'.
    with MosesSentenceSplitter('en') as splitsents:
        for text in tqdm(data['Text'].tolist()):
            # Empty cells come back from read_csv as NaN (a float); re.sub would raise on them.
            if not isinstance(text, str):
                continue
            # Drop the "(Reuters) - " dateline, photo captions and paragraph breaks.
            text = re.sub(r'(.*\(Reuters\) - )', '', text)
            text = re.sub('(FILE PHOTO:.*\n\n)', '', text)
            text = re.sub('(\n\n)', ' ', text)
            # NOTE(review): the splitter is understood to reject blank lines - confirm.
            if not text.strip():
                continue
            for sent in splitsents([text]):
                # startswith/endswith are safe on empty strings, unlike sent[0] / sent[-1].
                if sent.startswith(opening_quotes):
                    sent = sent[1:]
                if sent.endswith(closing_quotes):
                    sent = sent[:-1]
                if len(sent) > min_sent_len:
                    sents[id_prefix + str(sent_id)] = sent
                    sent_id += 1
    # Add the titles as separate entries.
    for title in data['Title'].tolist():
        if not isinstance(title, str):
            continue  # missing title (NaN)
        sents[id_prefix + str(sent_id)] = title
        sent_id += 1
    return sents

In [None]:
# Build the id -> sentence map for the Russian (Interfax) side and report its size.
interfax_sents = get_interfax_sents(
    data=interfax_data,
    id_prefix='ru-covid-interfax-',
)
print(len(interfax_sents))

In [None]:
# Build the id -> sentence map for the English (Reuters) side and report its size.
reuters_sents = get_reuters_sents(
    data=reuters_data,
    id_prefix='en-covid-reuters-',
)
print(len(reuters_sents))

In [None]:
import pickle

def save_data(data, name):
    """Pickle ``data`` into the file ``<name>.pkl`` using the highest pickle protocol.

    Args:
        data: any picklable object.
        name: target path without the ``.pkl`` extension.
    """
    path = f'{name}.pkl'
    with open(path, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Interfax (ru) sentences are stored under the 'source' stem, Reuters (en) under 'target'.
save_data(data=interfax_sents, name=os.path.join(drive_root, 'covid_world_source'))
save_data(data=reuters_sents, name=os.path.join(drive_root, 'covid_world_target'))

# Немного статистики

In [None]:
import pickle
import os

def load_data(name):
    """Unpickle and return the object stored in ``<name>.pkl``.

    Args:
        name: source path without the ``.pkl`` extension.

    Returns:
        The unpickled object. Only use on files this notebook produced:
        unpickling untrusted data can execute arbitrary code.
    """
    path = f'{name}.pkl'
    with open(path, 'rb') as handle:
        obj = pickle.load(handle)
    return obj

def save_data(data, name):
    """Pickle ``data`` into the file ``<name>.pkl`` using the highest pickle protocol.

    Same helper as in the cleaning section; it is defined again here so that
    this section can be run on its own after a kernel restart.

    Args:
        data: any picklable object.
        name: target path without the ``.pkl`` extension.
    """
    path = f'{name}.pkl'
    with open(path, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

# The file stems are crossed relative to the variable names: the 'target' pickle
# holds the Reuters (en) sentences, which this section treats as the source side,
# and the 'source' pickle holds the Interfax (ru) sentences, treated as the target.
id2source = load_data(name=os.path.join(drive_root, 'covid_world_target'))
id2target = load_data(name=os.path.join(drive_root, 'covid_world_source'))

In [None]:
import pandas as pd

# NOTE: the frame names are the reverse of their contents. `ru_df` is built from
# id2source (the English Reuters sentences) and `en_df` from id2target (the Russian
# Interfax sentences); the CSV file names, however, match the language they contain.
ru_df = pd.DataFrame(list(id2source.items()), columns=['id', 'sentence'])
en_df = pd.DataFrame(list(id2target.items()), columns=['id', 'sentence'])
ru_df.to_csv(os.path.join(drive_root, 'covid_en.csv'), index=False)
en_df.to_csv(os.path.join(drive_root, 'covid_ru.csv'), index=False)
en_df.head()

Unnamed: 0,id,sentence
0,ru-covid-interfax-0,Мантуров посоветовал пить травяные чаи в предд...
1,ru-covid-interfax-1,Умер композитор Александр Вустин В Москве на ...
2,ru-covid-interfax-2,В Новосибирск из Бангкока самолетом вывезли 18...
3,ru-covid-interfax-3,"""Индекс самоизоляции"" показал, что большинство..."
4,ru-covid-interfax-4,Эпидемиолог объяснил рост числа выявленных в Р...


In [None]:
import nltk
import pandas as pd
import os
from tqdm.notebook import tqdm
from collections import Counter

# Fetch the NLTK 'punkt' resource up front; presumably needed by nltk.word_tokenize below.
nltk.download('punkt')

def sentence_stats(id2sent):
    """Print size and vocabulary statistics for one side of the corpus.

    Prints the number of sentences, the total and unique token counts (tokens
    come from ``nltk.word_tokenize``) and the 20 most frequent tokens.

    Args:
        id2sent: dict mapping sentence id to sentence text.

    Returns:
        Number of sentences in ``id2sent``.
    """
    n_sentences = len(id2sent)
    print('\nSentences: {}'.format(n_sentences))

    # Count tokens sentence by sentence instead of materialising one long token list.
    counter = Counter()
    for sentence in tqdm(id2sent.values()):
        counter.update(nltk.word_tokenize(sentence))

    print('Words: {}'.format(sum(counter.values())))
    print('Unique words: {}'.format(len(counter)))
    print('Most common words: {}'.format(counter.most_common(20)))

    return n_sentences


def stats(id2source, id2target):
    """Print ``sentence_stats`` for the source side and then for the target side.

    Args:
        id2source: dict of sentence id -> sentence, reported as "source (en)".
        id2target: dict of sentence id -> sentence, reported as "target (ru)".
    """
    sections = (
        ('Processing source (en)...', id2source),
        ('Processing target (ru)...', id2target),
    )
    for banner, id2sent in sections:
        print(banner)
        sentence_stats(id2sent)
        print('\n')

In [None]:
# Report corpus statistics for both languages.
stats(id2source=id2source, id2target=id2target)

In [None]:
# Re-expose the loaded maps under their outlet names (Interfax = target, Reuters = source).
interfax_sents, reuters_sents = id2target, id2source

## Поиск предложений с помощью LASER



In [None]:
import faiss
from collections import defaultdict
import torch
# Fail fast if PyTorch cannot see a CUDA device.
assert torch.cuda.is_available()

from knn_cuda import KNN
from laser_wrapper.laser import Laser
import numpy as np

# Fail fast if faiss reports no GPUs; knn() below moves its index onto all GPUs.
assert faiss.get_num_gpus() > 0

def get_emb(model, batch):
    """Embed one batch by calling ``model`` on it and return whatever the model returns."""
    embeddings = model(batch)
    return embeddings

def knn(x, y, k, mem=5*1024*1024*1024):
  """Find, for every row of ``x``, the ``k`` rows of ``y`` with the largest inner product.

  Both matrices are processed in row chunks; a fresh faiss inner-product index
  is built on the GPUs for every chunk of ``y`` and the per-chunk results are
  merged afterwards.

  Args:
    x: 2-D query matrix.
    y: 2-D matrix to search in; must have the same number of columns as ``x``.
    k: number of neighbours to keep per query row.
    mem: budget used to derive the chunk size (``mem // (dim * 4)`` rows;
      presumably 4 bytes per float32 component).

  Returns:
    ``(sim, ind)``: float32 similarities and int64 row indices into ``y``,
    both of shape ``(x.shape[0], k)``, ordered by decreasing similarity.
  """
  dim = x.shape[1]
  batch_size = mem // (dim*4)
  n_queries, n_base = x.shape[0], y.shape[0]
  sim = np.zeros((n_queries, k), dtype=np.float32)
  ind = np.zeros((n_queries, k), dtype=np.int64)
  for xfrom in range(0, n_queries, batch_size):
    xto = min(xfrom + batch_size, n_queries)
    chunk_sims, chunk_inds = [], []
    for yfrom in range(0, n_base, batch_size):
      yto = min(yfrom + batch_size, n_base)
      print('{}-{}  ->  {}-{}'.format(xfrom, xto, yfrom, yto))
      idx = faiss.index_cpu_to_all_gpus(faiss.IndexFlatIP(dim))
      idx.add(y[yfrom:yto])
      bsim, bind = idx.search(x[xfrom:xto], min(k, yto-yfrom))
      chunk_sims.append(bsim)
      # Search results are local to this chunk of y; shift them to global row ids.
      chunk_inds.append(bind + yfrom)
      del idx
    merged_sims = np.concatenate(chunk_sims, axis=1)
    merged_inds = np.concatenate(chunk_inds, axis=1)
    # Column order that sorts every row's merged candidates by decreasing similarity.
    order = np.argsort(-merged_sims, axis=1)
    rows = np.arange(xto - xfrom)
    for j in range(k):
      cols = order[:, j]
      sim[xfrom:xto, j] = merged_sims[rows, cols]
      ind[xfrom:xto, j] = merged_inds[rows, cols]
  return sim, ind


def get_embeddings(model, id2sent, batch_size=512):
    """Embed every sentence of ``id2sent`` and L2-normalise the result.

    Args:
        model: callable passed to ``get_emb`` together with each batch of sentences.
        id2sent: dict mapping sentence id to sentence text.
        batch_size: number of sentences embedded per model call.

    Returns:
        numpy array with one row per sentence, in ``id2sent`` iteration order,
        normalised in place by ``faiss.normalize_L2``.
    """
    sent_list = list(id2sent.values())
    ids = list(id2sent.keys())
    vectors = []
    with torch.no_grad():
        for start in tqdm(range(0, len(sent_list), batch_size)):
            # Slicing past the end is clamped, so the last batch may be shorter.
            batch_list = sent_list[start:start + batch_size]
            vectors.extend(get_emb(model, batch_list))

    assert len(vectors) == len(ids) == len(sent_list)
    vectors = np.array(vectors)
    faiss.normalize_L2(vectors)
    return vectors


def score_pair(x, y, fwd_mean, bwd_mean, margin, dist='cosine'):
  """Score one candidate pair with a margin function.

  Args:
    x, y: the two vectors being compared.
    fwd_mean, bwd_mean: neighbourhood mean similarities for ``x`` and ``y``;
      their average is the second argument handed to ``margin``.
    margin: callable ``margin(similarity, mean)`` returning the score.
    dist: ``'cosine'`` uses the dot product ``x.dot(y)``; any other value
      uses ``1 / (1 + squared L2 distance)``.

  Returns:
    Whatever ``margin`` returns for the computed similarity.
  """
  if dist == 'cosine':
    similarity = x.dot(y)
  else:
    squared_l2 = ((x - y) ** 2).sum()
    similarity = 1 / (1 + squared_l2)
  return margin(similarity, (fwd_mean + bwd_mean) / 2)


def score_candidates(x, y, candidate_inds, fwd_mean, bwd_mean, margin, dist='cosine'):
  """Score every (query, candidate) pair listed in ``candidate_inds``.

  Args:
    x: query vectors, one row per query.
    y: vectors the candidate indices point into.
    candidate_inds: 2-D integer array; ``candidate_inds[i, j]`` is the row of
      ``y`` that is the j-th candidate for query ``i``.
    fwd_mean: per-query mean similarity, indexed like ``x``.
    bwd_mean: per-candidate mean similarity, indexed like ``y``.
    margin, dist: forwarded to ``score_pair``.

  Returns:
    Array of scores with the same shape as ``candidate_inds``.
  """
  scores = np.zeros(candidate_inds.shape)
  for (i, j), k in np.ndenumerate(candidate_inds):
    scores[i, j] = score_pair(x[i], y[k], fwd_mean[i], bwd_mean[k], margin, dist)
  return scores


def shift_embeddings(x, y):
  """Translate each embedding set by the difference of the two column-wise means.

  Args:
    x, y: 2-D arrays with the same number of columns.

  Returns:
    ``(x2y, y2x)``: ``x`` with the mean difference ``mean(x) - mean(y)``
    subtracted, and ``y`` with the same difference added.
  """
  print(' - shift embeddings')
  mean_gap = x.mean(axis=0) - y.mean(axis=0)
  return x - mean_gap, y + mean_gap


def _load_or_compute_vectors(model, id2sent, side, save_prefix, do_load, do_save, batch_size):
    """Return the embeddings of ``id2sent``, using the pickle cache under ``drive_root`` when allowed."""
    cache_stem = os.path.join(drive_root, save_prefix + side + '_vectors')
    if do_load and os.path.exists(cache_stem + '.pkl'):
        print('Loading {} embeddings...'.format(side))
        vectors = load_data(cache_stem)
    else:
        print('Computing {} embeddings...'.format(side))
        vectors = get_embeddings(model, id2sent, batch_size=batch_size)
        if do_save:
            save_data(vectors, cache_stem)
    print(vectors.shape)
    # Also catches a cached file that was built from a different number of sentences.
    assert len(id2sent) == len(vectors)
    return vectors


def get_candidates(model, sources, targets, return_all=False, do_save=True, save_prefix='wiki_',
                   n_candidates=10, batch_size=512, margin='ratio', threshold=0.5, retrieval='max', use_shift=True, do_load=True):
    """Mine parallel sentence pairs between ``sources`` and ``targets``.

    Embeds both sides (or loads cached embeddings), finds nearest neighbours in
    both directions, scores the candidates with a margin function and selects
    pairs according to ``retrieval``.

    Args:
        model: embedding model passed on to ``get_embeddings``.
        sources: dict of source sentence id -> sentence.
        targets: dict of target sentence id -> sentence.
        return_all: also return the full forward candidate lists and scores.
        do_save: pickle freshly computed embeddings under ``drive_root``.
        save_prefix: file-name prefix of the embedding cache files.
        n_candidates: neighbours retrieved per sentence (clamped to the size of
            the searched side).
        batch_size: embedding batch size, forwarded to ``get_embeddings``.
        margin: ``'absolute'`` (similarity only), ``'distance'``
            (similarity - mean) or anything else for the ratio similarity / mean.
        threshold: currently unused; the ``'max'`` retrieval applies a fixed
            minimum score of 1.115 instead.
        retrieval: ``'intersection'`` keeps pairs that are each other's best
            match; ``'max'`` greedily keeps the best-scoring pairs whose source
            and target have not been used yet. Any other value yields no pairs.
        use_shift: query with mean-shifted embeddings (see ``shift_embeddings``).
        do_load: reuse cached embeddings when the cache file exists.

    Returns:
        ``(predicted, distances)`` where ``predicted`` is a list of
        ``(source_id, target_id)`` tuples and ``distances`` the matching scores.
        With ``return_all=True`` two defaultdicts are appended: source id ->
        list of candidate target ids, and source id -> list of forward scores.
    """
    # batch_size is now actually forwarded; previously the parameter was ignored.
    source_vectors = _load_or_compute_vectors(model, sources, 'source', save_prefix, do_load, do_save, batch_size)
    target_vectors = _load_or_compute_vectors(model, targets, 'target', save_prefix, do_load, do_save, batch_size)
    n_sources, n_targets = source_vectors.shape[0], target_vectors.shape[0]

    # Queries are the shifted embeddings when use_shift is set; the searched
    # matrices are always the unshifted ones.
    if use_shift:
        fwd_queries, bwd_queries = shift_embeddings(source_vectors, target_vectors)
    else:
        fwd_queries, bwd_queries = source_vectors, target_vectors

    print('Computing distances...')
    x2y_sim, x2y_ind = knn(fwd_queries, target_vectors, min(n_targets, n_candidates))
    x2y_mean = x2y_sim.mean(axis=1)

    print('Computing reverse distances...')
    # k is clamped in every case; the non-shift path used to pass n_candidates
    # unclamped, which fails in knn() when there are fewer sources than candidates.
    y2x_sim, y2x_ind = knn(bwd_queries, source_vectors, min(n_sources, n_candidates))
    y2x_mean = y2x_sim.mean(axis=1)

    if margin == 'absolute':
        margin = lambda a, b: a
    elif margin == 'distance':
        margin = lambda a, b: a - b
    else:  # margin == 'ratio':
        margin = lambda a, b: a / b

    print('Scoring candidates...')
    fwd_scores = score_candidates(fwd_queries, target_vectors, x2y_ind, x2y_mean, y2x_mean, margin)
    bwd_scores = score_candidates(bwd_queries, source_vectors, y2x_ind, y2x_mean, x2y_mean, margin)

    # Best-scoring candidate per source sentence and per target sentence.
    fwd_best = x2y_ind[np.arange(n_sources), fwd_scores.argmax(axis=1)]
    bwd_best = y2x_ind[np.arange(n_targets), bwd_scores.argmax(axis=1)]

    print('Retrieving results...')
    source_keys = list(sources.keys())
    target_keys = list(targets.keys())
    predicted = []
    distances = []
    if retrieval == 'intersection':
        for i, j in enumerate(fwd_best):
            if bwd_best[j] == i:
                predicted.append((source_keys[i], target_keys[j]))
                distances.append(fwd_scores[i].max())

    if retrieval == 'max':
        # NOTE(review): fixed cut-off; the `threshold` argument is not consulted.
        min_score = 1.115
        # Forward best pairs followed by backward best pairs, as (source, target) rows.
        indices = np.stack((np.concatenate((np.arange(n_sources), bwd_best)),
                            np.concatenate((fwd_best, np.arange(n_targets)))), axis=1)
        scores = np.concatenate((fwd_scores.max(axis=1), bwd_scores.max(axis=1)))
        seen_src, seen_trg = set(), set()
        for i in np.argsort(-scores):
            src_ind, trg_ind = indices[i]
            if src_ind not in seen_src and trg_ind not in seen_trg:
                # Both sides are marked as used even if the pair falls below the cut-off.
                seen_src.add(src_ind)
                seen_trg.add(trg_ind)
                if scores[i] > min_score:
                    predicted.append((source_keys[src_ind], target_keys[trg_ind]))
                    distances.append(scores[i])

    id2candidates_full = defaultdict(list)
    id2distances_full = defaultdict(list)
    for i, candidate_row in enumerate(x2y_ind):
        for jj, j in enumerate(candidate_row):
            id2candidates_full[source_keys[i]].append(target_keys[j])
            id2distances_full[source_keys[i]].append(fwd_scores[i][jj])

    if return_all:
        return predicted, distances, id2candidates_full, id2distances_full
    return predicted, distances

In [None]:
from tqdm.notebook import tqdm

# Load the pretrained LASER encoder on the GPU and mine sentence pairs.
# `id2dist` receives the list of scores aligned with `predicted`.
laser = Laser(
    'LASER/models/bilstm.93langs.2018-12-26.pt',
    'LASER/models/93langs.fcodes',
    use_gpu=True,
)
predicted, id2dist, id2candidates_full, id2distances_full = get_candidates(
    laser, id2source, id2target, return_all=True)

In [None]:
# Print every mined pair: source sentence, then its match, then a blank line.
for d, (source, target) in zip(id2dist, predicted):
    print(f'{id2source[source]}', f'{id2target[target]}\n', sep='\n')

In [None]:
import pandas as pd

# Materialise the mined pairs as a two-column frame (en = source, ru = target) and save it.
pairs_df = pd.DataFrame(
    [(id2source[s], id2target[t]) for s, t in predicted],
    columns=['en', 'ru'],
)
pairs_df.to_csv(os.path.join(drive_root, 'covid_en_ru.csv'), index=False)
pairs_df.head()

Unnamed: 0,en,ru
0,Luxembourg becomes first country to make publi...,Люксембург стал первой страной в мире с беспла...
1,Putin said his talks with Erdogan has lasted m...,Переговоры Путина и Эрдогана продлились почти ...
2,Related Coverage Armenia and Russia to restric...,РФ и Армения договорились ограничить на две не...
3,Putin extends Russia's coronavirus non-working...,Путин объявил о продлении нерабочей недели в Р...
4,I like to look at involuntary part-time employ...,В РЖД перешли на неполный рабочий день
