In [None]:
import os
import xml.etree.ElementTree as ET

## Parallel data

In [None]:
# Language codes of the two sides of the parallel corpora; they are used
# below to build the per-language file names (e.g. 'en.xml', 'en-ru.xml').
lang_source = 'en'
lang_target = 'ru'
# Directory holding one sub-folder per parallel corpus.
raw_root = 'parallel'

### TED + Tatoeba

In [None]:
def get_sents(raw_xml, id_prefix):
    """Collect the ``<s>`` elements of an XML file into an id -> text dict.

    Args:
        raw_xml: Path (or file object) of the XML document to parse.
        id_prefix: String prepended to every sentence id as ``<prefix>_<id>``.

    Returns:
        dict mapping the prefixed ``id`` attribute of each ``<s>`` element
        (at any depth) to that element's ``.text``; the value is ``None``
        for an element without leading text. When an id occurs more than
        once, the last occurrence wins.

    Raises:
        KeyError: if an ``<s>`` element has no ``id`` attribute.
    """
    document_root = ET.parse(raw_xml).getroot()
    return {
        '{}_{}'.format(id_prefix, element.attrib['id']): element.text
        for element in document_root.iter('s')
    }

def clear_sents(sents):
    """Return a new dict holding only the sentences considered usable.

    A sentence is kept when its text is not ``None``, does not contain
    ``'http'``, is longer than 5 characters, is not purely numeric and
    contains at least one space.

    Args:
        sents: dict mapping sentence id to sentence text (or ``None``).

    Returns:
        A new dict with the surviving ``id -> text`` entries, in the
        original iteration order. ``sents`` itself is left untouched.
    """
    def is_usable(text):
        # Checks run in this order so that ``None`` never reaches the
        # string operations below.
        if text is None:
            return False
        if 'http' in text:
            return False
        if len(text) <= 5:
            return False
        if text.isnumeric():
            return False
        return text.count(' ') > 0

    return {sent_id: text for sent_id, text in sents.items() if is_usable(text)}

def parse_parallel_corpus(corpus_name):
    """Load a parallel corpus that ships as a single edition.

    Reads three files from ``<raw_root>/<corpus_name lower-cased>/``:
    ``<lang_source>.xml`` and ``<lang_target>.xml`` (the sentences) and
    ``<lang_source>-<lang_target>.xml`` (the alignment ``<link>`` elements).

    Args:
        corpus_name: Name of the corpus; its lower-cased form is used both
            as the folder name and as the sentence-id prefix.

    Returns:
        Tuple ``(source_sents, target_sents, pairs)``: the two filtered
        ``id -> text`` dicts and a list of ``(source_id, target_id)``
        tuples for every link whose two ids survived the filtering.
    """
    prefix = corpus_name.lower()
    corpus_root = os.path.join(raw_root, prefix)

    def corpus_file(stem):
        # Every file of the corpus is '<stem>.xml' inside the corpus folder.
        return os.path.join(corpus_root, '{}.xml'.format(stem))

    source_sents = clear_sents(get_sents(corpus_file(lang_source), prefix))
    target_sents = clear_sents(get_sents(corpus_file(lang_target), prefix))

    index_path = corpus_file('{}-{}'.format(lang_source, lang_target))
    pairs = []
    for link in ET.parse(index_path).getroot().iter('link'):
        # 'xtargets' is split on ';' into the source-side and target-side id.
        raw_source_id, raw_target_id = link.attrib['xtargets'].split(';')
        source_id = '{}_{}'.format(prefix, raw_source_id)
        target_id = '{}_{}'.format(prefix, raw_target_id)
        # Keep only links whose both sentences passed clear_sents.
        if source_id in source_sents and target_id in target_sents:
            pairs.append((source_id, target_id))

    return source_sents, target_sents, pairs

In [None]:
# Load the two single-edition corpora; each call returns the filtered
# source sentences, the filtered target sentences and the aligned id pairs.
ted_source, ted_target, ted_pairs = parse_parallel_corpus('TED2013')
tatoeba_source, tatoeba_target, tatoeba_pairs = parse_parallel_corpus('Tatoeba')

### WMT

In [None]:
def parse_parallel_multiple_editions(corpus_name, edition_name):
    """Load one edition of a parallel corpus that bundles several editions.

    Reads from ``<raw_root>/<corpus_name lower-cased>/`` the sentence files
    ``<edition_name>-<lang_source>.xml`` and
    ``<edition_name>-<lang_target>.xml`` plus the shared alignment file
    ``<lang_source>-<lang_target>.xml``. Only ``<linkGrp>`` elements whose
    ``fromDoc`` attribute contains ``edition_name`` are used.

    Args:
        corpus_name: Name of the corpus (lower-cased to get the folder).
        edition_name: Name of the edition; its lower-cased form is the
            sentence-id prefix.

    Returns:
        Tuple ``(source_sents, target_sents, pairs)``: the two filtered
        ``id -> text`` dicts and a list of ``(source_id, target_id)``
        tuples for every link whose two ids survived the filtering.
    """
    prefix = edition_name.lower()
    corpus_root = os.path.join(raw_root, corpus_name.lower())

    source_path = os.path.join(
        corpus_root, '{}-{}.xml'.format(edition_name, lang_source))
    target_path = os.path.join(
        corpus_root, '{}-{}.xml'.format(edition_name, lang_target))
    index_path = os.path.join(
        corpus_root, '{}-{}.xml'.format(lang_source, lang_target))

    source_sents = clear_sents(get_sents(source_path, prefix))
    target_sents = clear_sents(get_sents(target_path, prefix))

    pairs = []
    for link_group in ET.parse(index_path).getroot().iter('linkGrp'):
        # The alignment file covers every edition; skip the other ones.
        if edition_name in link_group.attrib['fromDoc']:
            for link in link_group.iter('link'):
                raw_source_id, raw_target_id = link.attrib['xtargets'].split(';')
                source_id = '{}_{}'.format(prefix, raw_source_id)
                target_id = '{}_{}'.format(prefix, raw_target_id)
                # Keep only links whose both sentences passed clear_sents.
                if source_id in source_sents and target_id in target_sents:
                    pairs.append((source_id, target_id))

    return source_sents, target_sents, pairs

In [None]:
# Accumulate the newstest2015 .. newstest2019 editions into one WMT corpus.
wmt_source, wmt_target, wmt_pairs = {}, {}, []
for edition_year in range(2015, 2020):
    edition_name = 'newstest{}'.format(edition_year)
    source, target, pairs = parse_parallel_multiple_editions('wmt', edition_name)
    wmt_source.update(source)
    wmt_target.update(target)
    wmt_pairs.extend(pairs)

### All parallel data combined

In [None]:
# Merge the per-corpus dicts; on a key clash the later corpus wins.
parallel_source = {}
for corpus_sents in (ted_source, tatoeba_source, wmt_source):
    parallel_source.update(corpus_sents)

parallel_target = {}
for corpus_sents in (ted_target, tatoeba_target, wmt_target):
    parallel_target.update(corpus_sents)

# Gold alignment: the id pairs of all corpora, in corpus order.
gold_pairs = ted_pairs + tatoeba_pairs + wmt_pairs

print(len(gold_pairs))

## Adding monolingual data

In [None]:
# Directory holding one sub-folder per monolingual corpus.
monolingual_data = 'monolingual'

In [None]:
def parse_monolingual(corpus_name, lang):
    """Load the filtered sentences of one language of a monolingual corpus.

    Reads ``<monolingual_data>/<corpus_name lower-cased>/<lang>.xml``.

    Args:
        corpus_name: Name of the corpus; its lower-cased form is used both
            as the folder name and as the sentence-id prefix.
        lang: Language code selecting the file to read.

    Returns:
        dict mapping prefixed sentence id to text, after ``clear_sents``.
    """
    prefix = corpus_name.lower()
    xml_path = os.path.join(monolingual_data, prefix, '{lang}.xml'.format(lang=lang))
    return clear_sents(get_sents(xml_path, prefix))

In [None]:
# Source-language side of the TedHrEn corpus, used as monolingual data.
tedhren_monolingual_source = parse_monolingual('TedHrEn', lang_source)
# Number of monolingual sentences before de-duplication.
len(tedhren_monolingual_source)

In [None]:
def remove_duplicates(mono, parallel):
    """Drop from ``mono`` every sentence whose text also occurs in ``parallel``.

    Args:
        mono: dict mapping sentence id to text (monolingual data).
        parallel: dict mapping sentence id to text (parallel data).

    Returns:
        A new dict with the entries of ``mono`` whose text is not among the
        values of ``parallel``. Neither argument is modified.
    """
    parallel_texts = set(parallel.values())
    return {
        sent_id: text
        for sent_id, text in mono.items()
        if text not in parallel_texts
    }

In [None]:
# Remove monolingual sentences whose text already occurs on the source
# side of the parallel data.
tedhren_monolingual_source = remove_duplicates(tedhren_monolingual_source, parallel_source)
# Number of monolingual sentences left after de-duplication.
len(tedhren_monolingual_source)

In [None]:
# Reduce gold to get more monolingual data

import random

# Shuffle with a dedicated, seeded generator so that Restart & Run All
# produces the same gold / monolingual split every time (the module-level
# random.shuffle gave a different split on every run).
random.Random(42).shuffle(gold_pairs)
result_size = len(gold_pairs) // 2

# The second half of the shuffled pairs is broken up: for each pair one side
# is deleted (alternating), so the other side is left without its partner.
error_count = 0
for i, (source_id, target_id) in enumerate(gold_pairs[result_size:]):
    try:
        if i % 2 == 0:
            # Keep target sentence, remove source
            del parallel_source[source_id]
        else:
            # Keep source sentence, remove target
            del parallel_target[target_id]
    except KeyError:
        # The id is no longer in the dict -- presumably it was already
        # removed for an earlier pair sharing it. Counted, not fatal.
        error_count += 1

# Only the first half stays in the gold alignment.
gold_pairs = gold_pairs[:result_size]
print(len(gold_pairs))
print(len(parallel_source))
print(len(parallel_target))
print(error_count)

In [None]:
# Fix gold by removing missing ids and identical pairs

cleaned_gold_pairs = []
for source_id, target_id in gold_pairs:
    # Skip pairs that lost one of their sentences in an earlier step.
    if source_id in parallel_source and target_id in parallel_target:
        if parallel_source[source_id] != parallel_target[target_id]:
            cleaned_gold_pairs.append((source_id, target_id))
        else:
            # Same text on both sides: drop the pair and both sentences.
            del parallel_source[source_id]
            del parallel_target[target_id]

# The deletions above can orphan a pair that was accepted earlier in the
# loop when an id takes part in several pairs; re-check every accepted pair
# now that all removals are done, so gold never references a missing id.
cleaned_gold_pairs = [
    (source_id, target_id)
    for source_id, target_id in cleaned_gold_pairs
    if source_id in parallel_source and target_id in parallel_target
]
gold_pairs = cleaned_gold_pairs
len(cleaned_gold_pairs)

In [None]:
# Sanity check

# Print the first ten gold pairs as "<id>: <text>" for the source and the
# target sentence, separated by an empty line, for visual inspection.
for source_id, target_id in gold_pairs[:10]:
    print('{}: {}'.format(source_id, parallel_source[source_id]))
    print('{}: {}'.format(target_id, parallel_target[target_id]))
    print()

ted2013_45302: In fact, it's not even really sequenced that much.
ted2013_45302: Его генетические последовательности выделены не полностью.

ted2013_77290: You have time to -- you have time to alert one house. What do you do?
ted2013_77290: И у Вас есть время только -- ... у Вас есть время оповестить только один из домов. Что Вы сделаете?

ted2013_57527: And I think the imagination is the only limit of what you can think of when this kind of technology merges with real life.
ted2013_57527: Думаю, что единственным ограничителем станет ваша фантазия когда эта технология проникнет в нашу жизнь.

tatoeba_4495275: Why are you sad?
tatoeba_4743980: Ты чего грустный?

tatoeba_284753: He didn't dare say anything.
tatoeba_3676761: Он не осмелился ничего сказать.

tatoeba_2094165: I don't think she is right.
tatoeba_5607112: Не думаю, что она права.

tatoeba_2255418: You've convinced me.
tatoeba_2694787: Ты меня убедил.

tatoeba_7185599: When was the first time that Tom kissed you?
tatoeba_69343

### Pickling

In [None]:
import pickle
import os

def save_data(data, name, pkl_root='ttw_corpus'):
    """Pickle ``data`` to ``<pkl_root>/<name>.pkl``.

    Args:
        data: Any picklable object.
        name: File name without the ``.pkl`` extension.
        pkl_root: Output directory; created if it does not exist yet.

    Raises:
        OSError: if the directory or the file cannot be created.
        pickle.PicklingError: if ``data`` cannot be pickled.
    """
    # Make the function self-sufficient instead of relying on a prior mkdir.
    os.makedirs(pkl_root, exist_ok=True)
    # Format only the file name: formatting the joined path would also
    # interpret any '{' / '}' contained in pkl_root.
    pkl_path = os.path.join(pkl_root, '{}.pkl'.format(name))
    with open(pkl_path, 'wb') as f:
        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Create the output directory from Python: unlike the `!mkdir` shell escape
# this behaves the same on every platform and does not complain when the
# directory already exists (e.g. when the cell is run a second time).
os.makedirs('ttw_corpus', exist_ok=True)

# The final source side is the remaining parallel source sentences plus the
# monolingual ones. Note: this extends parallel_source in place, and
# `source` / `target` are aliases of the dicts, not copies.
parallel_source.update(tedhren_monolingual_source)
source = parallel_source
target = parallel_target

save_data(source, 'ttw_source')
save_data(target, 'ttw_target')
save_data(gold_pairs, 'ttw_gold')

A subdirectory or file ttw_corpus already exists.
