In [None]:
import pickle
import os

map_file_prefix = 'ttw'

def load_map(name):
    """Unpickle and return the object stored in ``<map_file_prefix>_<name>.pkl``.

    ``map_file_prefix`` is read from the notebook's global scope at call time.
    pickle can execute arbitrary code while loading, so only point this at
    files produced by a trusted step.
    """
    pickle_path = os.path.join('{}_{}.pkl'.format(map_file_prefix, name))
    with open(pickle_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [None]:
# Load the two sentence maps and the gold pair list through load_map.
source_sents, target_sents, gold = (
    load_map(map_name) for map_name in ('source', 'target', 'gold')
)

In [None]:
# Remove sentences that are too short

def clean_sents(sents, min_len=30):
    """Return a new dict with only the entries whose value has ``len() >= min_len``.

    The input mapping is left untouched and the surviving entries keep their
    original iteration order.
    """
    return {
        key: text
        for key, text in sents.items()
        if len(text) >= min_len
    }


def clean_gold(source_sents, target_sents, gold):
    """Keep only the gold pairs whose two ids are both still present.

    ``gold`` is iterated as (source_id, target_id) pairs; a pair survives when
    its source id is in ``source_sents`` and its target id is in
    ``target_sents``. Returns a new list of tuples in the original order.
    """
    return [
        (src_id, tgt_id)
        for src_id, tgt_id in gold
        if src_id in source_sents and tgt_id in target_sents
    ]

In [None]:
# Filter both sentence maps first, then drop gold pairs that lost either side.
source_sents, target_sents = clean_sents(source_sents), clean_sents(target_sents)
gold = clean_gold(source_sents, target_sents, gold)

In [None]:
# Split into train and test

import random

# Fixed seed so that the shuffle below, and the random assignment of sentences
# done by split_train_test later in this cell, give the same split on every
# Restart & Run All.
random.seed(42)

print('--- gold ---')
random.shuffle(gold)
train_ratio = 0.7
train_size = int(train_ratio * len(gold))
train_gold, test_gold = gold[:train_size], gold[train_size:]
print(len(train_gold))
print(len(test_gold))

# Ids that occur in at least one gold pair of each split. The same id can end
# up in both the train and the test set when it takes part in several pairs.
source_in_train_gold = {source_id for source_id, _ in train_gold}
target_in_train_gold = {target_id for _, target_id in train_gold}

source_in_test_gold = {source_id for source_id, _ in test_gold}
target_in_test_gold = {target_id for _, target_id in test_gold}

def split_train_test(source_sents, source_in_train_gold, source_in_test_gold, ratio=None):
    """Split a sentence map into train and test maps consistent with the gold split.

    Args:
        source_sents: dict {sentence_id: sentence}.
        source_in_train_gold: ids that occur in train gold pairs.
        source_in_test_gold: ids that occur in test gold pairs.
        ratio: probability of sending a sentence without any gold pair to
            train. Defaults to the notebook-level ``train_ratio``.

    Returns:
        (train_sents, test_sents) dicts. An id present in both gold sets is
        copied into both results.

    Prints the sizes of the input, train and test maps.
    """
    if ratio is None:
        ratio = train_ratio  # notebook-level setting shared with the gold split

    train_source_sents = {}
    test_source_sents = {}
    for source_id, source_sent in source_sents.items():
        in_train = source_id in source_in_train_gold
        in_test = source_id in source_in_test_gold
        # If the id is in both gold splits it is added to both train and test.
        if in_train:
            train_source_sents[source_id] = source_sent
        if in_test:
            test_source_sents[source_id] = source_sent
        # Otherwise it is assigned at random.
        if not in_train and not in_test:
            # random.random() is uniform on [0, 1), so this is true with
            # probability `ratio`. The previous random.randint(0, 1) only
            # returns 0 or 1, which made this a 50/50 split for any ratio
            # strictly between 0 and 1.
            if random.random() < ratio:
                train_source_sents[source_id] = source_sent
            else:
                test_source_sents[source_id] = source_sent

    print(len(source_sents))
    print(len(train_source_sents))
    print(len(test_source_sents))

    return train_source_sents, test_source_sents

# Split the source-side sentences using the ids seen in train / test gold pairs.
print('--- source ---')
train_source_sents, test_source_sents = split_train_test(source_sents, source_in_train_gold, source_in_test_gold)
# Same split for the target-side sentences.
print('--- target ---')
train_target_sents, test_target_sents = split_train_test(target_sents, target_in_train_gold, target_in_test_gold)


# Sanity check: every gold pair must be present in the sentence maps of its split

def check_sents(source_sents, target_sents, gold):
    """Assert that both ids of every gold pair exist in the given sentence maps.

    Raises AssertionError at the first pair whose source id is missing from
    ``source_sents`` or whose target id is missing from ``target_sents``.
    """
    for pair in gold:
        src_id, tgt_id = pair
        assert src_id in source_sents
        assert tgt_id in target_sents

# Each split must contain both sentences of every gold pair assigned to it.
check_sents(train_source_sents, train_target_sents, train_gold)
check_sents(test_source_sents, test_target_sents, test_gold)

--- gold ---
42646
18278
--- source ---
826906
425467
402091
--- target ---
345927
185276
160789


In [None]:
import pickle

def save_data(data, name):
    """Pickle ``data`` into ``<name>.pkl`` with the highest available pickle protocol."""
    target_path = '{}.pkl'.format(name)
    with open(target_path, 'wb') as sink:
        pickle.dump(data, sink, protocol=pickle.HIGHEST_PROTOCOL)


# Persist the train / test split, one pickle file per object.
save_data(train_source_sents, 'ttw_train_source_sents')
save_data(test_source_sents, 'ttw_test_source_sents')
save_data(train_target_sents, 'ttw_train_target_sents')
save_data(test_target_sents, 'ttw_test_target_sents')
# The train pairs used to be written to 'ttw_test_gold' and were then
# overwritten by the next line, so train_gold was never actually saved.
save_data(train_gold, 'ttw_train_gold')
save_data(test_gold, 'ttw_test_gold')

In [None]:
from tqdm.notebook import tqdm

# Append sentence pairs from the `data` frame to source_sents / target_sents /
# gold under 'wikimatrix_' ids.
# NOTE(review): `data` is not defined in any earlier cell of this notebook; it
# is read here as a frame with 'en' and 'ru' columns — confirm where it is loaded.
# NOTE(review): this cell mutates source_sents / target_sents / gold in place;
# `count` restarts at 0, so running the cell twice appends the same gold pairs again.
min_len = 30
count = 0
prefix = 'wikimatrix_'
total = 200000
for _, row in tqdm(data.head(total).iterrows(), total=total):
    # Only the English side is length-checked, and with a strict '>'
    # (clean_sents uses '>=').
    if len(row['en']) > min_len:
        source_id = prefix + 'en_' + str(count)
        target_id = prefix + 'ru_' + str(count)
        source_sents[source_id] = str(row['en'])
        target_sents[target_id] = str(row['ru'])
        gold.append((source_id, target_id))
        count += 1
# Number of pairs that were added.
print(count)

In [None]:
# Sizes of the two sentence maps and of the gold list, one per line.
print(len(source_sents), len(target_sents), len(gold), sep='\n')

## LASER candidates

In [None]:
import torch
assert torch.cuda.is_available()

from knn_cuda import KNN
from laser_wrapper.laser import Laser
from tqdm.notebook import tqdm
import numpy as np

In [None]:
def get_laser_candidates(sources, targets, n_candidates=10, batch_size=1024):
    """Find candidate source sentences for every target sentence with LASER + KNN.

    Args:
        sources: dict {source_id: sentence}; the set that is searched.
        targets: dict {target_id: sentence}; the queries.
        n_candidates: number of neighbours kept per target (the ``k`` given to KNN).
        batch_size: number of sentences embedded per LASER call; also the
            number of target vectors sent to KNN at once.

    Returns:
        (id2candidates, id2distances), both keyed by target id:
        ``id2candidates[target_id]`` is a list of source ids and
        ``id2distances[target_id]`` holds the distances KNN returned for them,
        in the same order.

    Needs a CUDA device and the LASER model files under ``LASER/models/``.
    """
    knn = KNN(k=n_candidates, transpose_mode=True)
    laser = Laser('LASER/models/bilstm.93langs.2018-12-26.pt', 'LASER/models/93langs.fcodes', use_gpu=True)

    source_ids = list(sources.keys())
    source_list = list(sources.values())
    target_ids = list(targets.keys())
    target_list = list(targets.values())

    print('Computing source vectors...')
    source_vectors = []
    for start in tqdm(range(0, len(source_list), batch_size)):
        # Slicing clamps at the end of the list, so the last batch may be shorter.
        source_vectors.append(laser(source_list[start:start + batch_size]))

    # One matrix with a row per source sentence, in source_ids order.
    all_source_vectors = np.concatenate(source_vectors, axis=0)

    print('Computing target vectors...')
    target_vectors = []
    target_batches_ids = []
    for start in tqdm(range(0, len(target_list), batch_size)):
        end = start + batch_size
        target_vectors.append(laser(target_list[start:end]))
        target_batches_ids.append(target_ids[start:end])

    id2candidates = {}  # dict {target_id: list of source_ids}
    id2distances = {}   # dict {target_id: list of distances, aligned with id2candidates}

    print('Computing distances...')
    # The source matrix is the same for every target batch, so it is moved to
    # the GPU once instead of once per batch.
    source_tensor = torch.from_numpy(all_source_vectors).cuda()
    for batch_target_vectors, batch_target_ids in tqdm(zip(target_vectors, target_batches_ids), total=len(target_vectors)):

        dist, knn_ind = knn(source_tensor, torch.from_numpy(batch_target_vectors).cuda())

        # Target ids are dict keys (unique) and each one belongs to exactly one
        # batch, so every id is seen once here and no merging is needed.
        for target_index, target_id in enumerate(batch_target_ids):
            id2candidates[target_id] = [source_ids[ind] for ind in knn_ind[target_index]]
            id2distances[target_id] = [d.item() for d in dist[target_index]]

    return id2candidates, id2distances

In [None]:
%%time

# Forward direction on the test split: candidates maps each test target id to
# source ids taken from the test source sentences.
candidates, id2dist = get_laser_candidates(test_source_sents, test_target_sents)

In [None]:
%%time

# Reverse direction: candidates_reverse maps each source id to target ids.
# NOTE(review): this call runs over the full source_sents / target_sents, while
# the forward call uses only the test split — confirm this is intended.
candidates_reverse, id2dist_reverse = get_laser_candidates(target_sents, source_sents)

In [None]:
import pickle

# NOTE(review): same definition as the save_data in the earlier "save split"
# cell; kept here so this cell also works on its own.
def save_data(data, name):
    """Pickle ``data`` into ``<name>.pkl`` with the highest available pickle protocol."""
    target_path = '{}.pkl'.format(name)
    with open(target_path, 'wb') as sink:
        pickle.dump(data, sink, protocol=pickle.HIGHEST_PROTOCOL)

# Cache the KNN results of both directions so that they can be reloaded
# without recomputing the embeddings.
save_data(candidates, 'ttw_candidates')
save_data(id2dist, 'ttw_id2dist')
save_data(candidates_reverse, 'ttw_candidates_reverse')
save_data(id2dist_reverse, 'ttw_id2dist_reverse')

In [None]:
def load_data(name):
    """Unpickle and return the object stored in ``<name>.pkl``.

    pickle can execute arbitrary code while loading, so only use this on
    files written by a trusted step.
    """
    source_path = '{}.pkl'.format(name)
    with open(source_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

# Reload the cached KNN results of both directions.
candidates, id2dist, candidates_reverse, id2dist_reverse = (
    load_data(stem)
    for stem in (
        'ttw_candidates',
        'ttw_id2dist',
        'ttw_candidates_reverse',
        'ttw_id2dist_reverse',
    )
)

In [None]:
# Eyeball the first 20 targets: the target sentence followed by each candidate
# source sentence with its distance.
for target_id, cand_ids in list(candidates.items())[:20]:
    print(target_sents[target_id])
    for i, cand_id in enumerate(cand_ids):
        # id2dist[target_id][i] is the distance of the i-th candidate.
        print('(dist={:.2f}) {}'.format(id2dist[target_id][i], source_sents[cand_id]))
    print('\n')

In [None]:
# Same preview for the reverse direction. Here the loop variable `target_id`
# is a key of candidates_reverse, i.e. it indexes source_sents, and the
# candidates index target_sents.
for target_id, cand_ids in list(candidates_reverse.items())[:20]:
    print(source_sents[target_id])
    for i, cand_id in enumerate(cand_ids):
        print('(dist={:.2f}) {}'.format(id2dist_reverse[target_id][i], target_sents[cand_id]))
    print('\n')

## Generate training data for classifier

In [None]:
# Returns a sample that a classifier can be trained on.
# It is a rough form of hard negative mining.
# Balanced samples were obtained with [0.3, 0.4].
def get_train_data(gold, candidates, id2dist, low_threshold=0.4, high_threshold=0.45):
    """Build positive and negative (source_id, target_id) pairs for a classifier.

    Args:
        gold: list of (source_id, target_id) gold pairs.
        candidates: dict {target_id: [source_id, ...]}.
        id2dist: dict {target_id: [distance, ...]}, aligned with ``candidates``.
        low_threshold: exclusive lower bound on the distance of a negative.
        high_threshold: exclusive upper bound on the distance of a negative.

    Returns:
        (positives, negatives) lists of (source_id, target_id) tuples. No
        sentence id appears in more than one returned pair.
    """
    used_sents = set()  # a sentence id may be used in at most one pair overall
    gold_pairs = set()  # set copy of gold for O(1) membership tests below

    # Positive examples come from gold.
    positives = []
    for source, target in gold:
        gold_pairs.add((source, target))
        if source not in used_sents and target not in used_sents:
            used_sents.update((source, target))
            positives.append((source, target))

    # Negative examples: candidates whose distance lies strictly between the
    # two thresholds and which are not gold pairs.
    negatives = []
    for target, sources in tqdm(candidates.items()):
        if target in used_sents:
            continue
        for i, source in enumerate(sources):
            if source not in used_sents and \
                (source, target) not in gold_pairs and \
                low_threshold < id2dist[target][i] < high_threshold:
                negatives.append((source, target))
                used_sents.update((source, target))
                break  # at most one negative per target
    return positives, negatives

In [None]:
# Build the classifier pairs with the default distance thresholds.
positives, negatives = get_train_data(gold, candidates, id2dist)

In [None]:
# Class balance: number of positive and of negative pairs.
len(positives), len(negatives)

(26392, 40255)

In [None]:
import random
import pandas as pd

# Turn the id pairs into (source text, target text, label) rows.
sent_dataset = []
skipped_positives = 0
for source, target in positives:
    try:
        sent_dataset.append((source_sents[source], target_sents[target], 1))
    except KeyError:
        # Best effort: a positive pair whose sentence text is missing is left
        # out, but counted so that the loss is visible instead of silent.
        skipped_positives += 1
for source, target in negatives:
    sent_dataset.append((source_sents[source], target_sents[target], 0))
if skipped_positives:
    print('Skipped {} positive pairs with missing sentence text'.format(skipped_positives))

random.shuffle(sent_dataset)
data = pd.DataFrame(sent_dataset, columns=['source', 'target', 'label'])
data.to_csv('data_ru-en.csv', index=False)
data.head(20)

Unnamed: 0,source,target,label
0,Too long a holiday makes one reluctant to star...,После долгих праздников не хочется выходить на...,1
1,He claimed that he had returned the book to th...,"Он сказал мне, что пошёл в библиотеку, а сам б...",0
2,"And I asked the question, who was getting that...","Я задал вопрос, кто получил 3,5 миллиарда долл...",1
3,"She sent us a telegram, informing that she wou...","И были даже опросы, которые говорили нам, что ...",0
4,Ten years ago the ward office gave us ballpoin...,"10 лет назад, на день совершеннолетия в админи...",0
5,And I think this is what we've done with clima...,"Тем не менее, нам удалось прийти к соглашению ...",0
6,Now we're going to have the real radical exper...,А теперь мы проведем действительно радикальный...,1
7,"To become an astronomer, you have to study the...","Чтобы быть астрономом, нужно учиться, а чтобы ...",0
8,You don't seem to be as careless as Tom seems ...,"Вы должны знать, что Том не такой милый и безо...",0
9,You must pull yourself together and face up to...,"Люби себя, чихай на всех — и в жизни ждёт тебя...",0


## Данные для triplet loss (hard negative)

In [None]:
def _mine_triplets(anchor_positive_pairs, candidates, id2dist, used_sents, low_threshold, high_threshold):
    """Pick at most one negative per (anchor, positive) pair; updates ``used_sents`` in place."""
    triplets = []
    for anchor, positive in anchor_positive_pairs:
        assert anchor in candidates
        if anchor in used_sents or positive in used_sents:
            continue
        for i, negative in enumerate(candidates[anchor]):
            if negative == positive or negative in used_sents:
                continue
            if low_threshold < id2dist[anchor][i] < high_threshold:
                triplets.append((anchor, positive, negative))
                used_sents.update((anchor, positive, negative))
                break  # the anchor is used now, so no further triplet is possible
    return triplets


# Returns a sample that a triplet network can be trained on.
# Takes a gold example and the nearest negative example to it.
def get_triplet_train_data(gold, candidates, id2dist, low_threshold=0.35, high_threshold=0.65,
                           max_triplet_count=4, reverse_candidates=None, reverse_id2dist=None):
    """Build (anchor, positive, negative) id triplets in both directions.

    Args:
        gold: list of (source_id, target_id) gold pairs.
        candidates: dict {target_id: [source_id, ...]} for target anchors.
        id2dist: dict {target_id: [distance, ...]}, aligned with ``candidates``.
        low_threshold: exclusive lower bound on the distance of a negative.
        high_threshold: exclusive upper bound on the distance of a negative.
        max_triplet_count: kept for interface compatibility. It has never had
            an effect: the anchor is marked as used with its first triplet, so
            at most one triplet per gold pair is produced.
        reverse_candidates: dict {source_id: [target_id, ...]} for source
            anchors. Defaults to the notebook global ``candidates_reverse``.
        reverse_id2dist: distances aligned with ``reverse_candidates``.
            Defaults to the notebook global ``id2dist_reverse``.

    Returns:
        (triplets_ru2en, triplets_en2ru): lists of id triplets in gold order.
        The first has target-side anchors, the second source-side anchors.
        No sentence id is used in more than one triplet across both lists.

    Raises:
        AssertionError: if a gold anchor has no entry in its candidate dict.
    """
    used_sents = set()  # shared by both directions so triplets never overlap

    # Anchor is the target sentence, positive is its gold source sentence.
    triplets_ru2en = _mine_triplets(
        ((target, source) for source, target in gold),
        candidates, id2dist, used_sents, low_threshold, high_threshold)

    # The other way round: the anchor is the source sentence.
    if reverse_candidates is None:
        reverse_candidates = candidates_reverse
    if reverse_id2dist is None:
        reverse_id2dist = id2dist_reverse
    triplets_en2ru = _mine_triplets(
        gold, reverse_candidates, reverse_id2dist, used_sents, low_threshold, high_threshold)

    # Every triplet has a distinct anchor, so there are no duplicates to remove
    # and the deterministic gold order can be kept.
    return triplets_ru2en, triplets_en2ru

In [None]:
# The reverse direction inside the function reads the notebook globals
# candidates_reverse / id2dist_reverse, so they must be defined before this call.
triplets_ru2en, triplets_en2ru = get_triplet_train_data(gold, candidates, id2dist)

In [None]:
# Number of triplets per direction.
len(triplets_ru2en), len(triplets_en2ru)

(41422, 5174)

In [None]:
# Preview: each triplet is (anchor, positive, negative). The loop names
# `source` / `target` hold the positive and the negative id, both looked up in
# source_sents for this direction.
for anchor, source, target in triplets_ru2en[:20]:
    try:
        print(target_sents[anchor])
        print(source_sents[source])
        print(source_sents[target])
        print('\n')
    except KeyError:
        # A triplet with a missing sentence is skipped silently (possibly after
        # part of it was already printed).
        pass

Ты не думал проверить уровень масла?
Have you thought of checking the oil level?
Don't forget to check the oil level.


Другое объяснение – тяга к подобному, иначе говоря, «рыбак рыбака видит издалека». В данном случае, двух людей привязывает именно схожесть собственных размеров.
Another possibility, very obvious, is homophily, or, birds of a feather flock together; here, I form my tie to you because you and I share a similar body size.
My way of looking at things is not at all similar to other people; for a long time, people have criticized my point of view.


Большой адронный коллайдер — это крупнейший в мире ускоритель заряженных частиц.
The Large Hadron Collider is the world's largest particle accelerator.
Hydrogen is the most abundant element in the universe.


Я не могу выговорить имя этой девчонки!
I can't pronounce this girl's name!
I don't remember that guy's name.


А когда моя история иссякала, я представлял, что Вселенная этого дровосека это один атом в топоре другого дрово

In [None]:
# Preview of the other direction: each triplet is (anchor, positive, negative).
# The anchor is looked up in source_sents; the loop names `source` / `target`
# hold the positive and the negative id, both looked up in target_sents.
for anchor, source, target in triplets_en2ru[:20]:
    try:
        print(source_sents[anchor])
        print(target_sents[source])
        print(target_sents[target])
        print('\n')
    except KeyError:
        # A triplet with a missing sentence is skipped silently (possibly after
        # part of it was already printed).
        pass

Tom and Mary weren't invited to John's party.
Тома и Мэри не пригласили на вечеринку к Джону.
Том и Мэри не пригласили меня на свою свадьбу.


It'll be a free download -- thank you, Craig Mundie -- and it'll be available at the website WorldWideTelescope.org, which is something new.
Он будет бесплатен для скачивания. Спасибо Крейг Мунди. И его можно будет скачать на сайте WorldwideTelescope.org, недавно созданном.
Джеймс Суровики: Фуф... Оба ролика были размещены на Waveofdestruction.org.


I haven't been there since October.
Я не был там с октября месяца.
Этого не случалось с прошлого сентября.


Tom is suffering from a nervous disorder.
Том страдает нервным расстройством.
Том впал в депрессию от переутомления.


But, every moment of human history, from the Stone Age to the Information Age, from Sumer and Babylon to the iPod and celebrity gossip, they've all been carried out -- every book that you've read, every poem, every laugh, every tear -- they've all happened here.
Но каждое мгн

In [None]:
import random
import pandas as pd

# Turn the id triplets of both directions into (anchor, positive, negative) text rows.
sent_dataset = []
skipped_triplets = 0
for anchor, source, target in triplets_ru2en:
    try:
        sent_dataset.append((target_sents[anchor], source_sents[source], source_sents[target]))
    except KeyError:
        # Best effort: a triplet with a missing sentence is left out, but
        # counted so that the loss is visible instead of silent.
        skipped_triplets += 1
for anchor, source, target in triplets_en2ru:
    try:
        sent_dataset.append((source_sents[anchor], target_sents[source], target_sents[target]))
    except KeyError:
        skipped_triplets += 1
if skipped_triplets:
    print('Skipped {} triplets with missing sentence text'.format(skipped_triplets))

random.shuffle(sent_dataset)
data = pd.DataFrame(sent_dataset, columns=['anchor', 'positive', 'negative'])
data = data.dropna()  # missing values do occur
data.to_csv('triplet_en-ru_clean.csv', index=False)
data.head(20)

Unnamed: 0,anchor,positive,negative
0,Что ты думаешь о стихотворении Тома?,What do you think of Tom's poem?,What did you really think of Tom's singing?
1,"Ты думаешь, он всё ещё читает мои сообщения?",Do you think he still reads my messages?,Do you think he still loves my letters?
2,"ГБ: Думаю, в каждой религии, каждой вере - и я...","GB: I think every religion, every faith, and I...",RB: I don't actually think that the stereotype...
3,"Как антрополог, я знаю, что именно ходьба сдел...","You know, as an anthropologist, walking is wha...",Making mistakes is what makes us human.
4,"Вот, например. Этого молодого человека зовут Д...","This, for example, this gentleman is called Jo...",He has a son whose name is John.
5,Каждый год из миллионов абалон составляются по...,"Now, millions of abalone every year make this ...",Across the landscapes of Earth were dotted the...
6,"Значит, здравый смысл и добрые намерения вступ...","So my common sense, my good intentions, were i...","(Laughter) So my common sense, my good intenti..."
7,"And in fact, let's take that one step further.","На самом деле, давайте сделаем тот самый шаг в...",Давай сделаем в отношениях шаг вперёд.
8,"Вы можете разбить материал, например, на корот...","You can break up the material, for example, in...",So you could have something that climbs along ...
9,"Мой близкий друг Дэвид понял, о чем я думаю. О...","And my close friend David, he saw the way I wa...",I went there without knowing him. He was 35. I...
