In [1]:
import pandas as pd
import os 
import random
from tqdm import tqdm
from nltk.tokenize import sent_tokenize, word_tokenize
from difflib import SequenceMatcher
import re
import pickle
import numpy as np
from transformers import MarianMTModel, MarianTokenizer
import time

tqdm.pandas()

  from pandas import Panel


In [2]:
# Regexes to find Tom and Mary in several languages.
# Outer keys are the placeholder names; inner keys are the three-letter language codes that
# DataSource.iso_map also uses. Each value is a regular-expression fragment listing the spellings
# or inflected forms to look for; DataSource.name_sentences wraps the fragment in \b word boundaries.
tom_mary = {
    'Tom':   {
        'eng': 'Tom',
        'spa': 'Tom(ás)?',
        'rus': 'Том(а|ом|у)?',
        'ita': 'Tom(maso)?',
        'fra': '(Tom|Thomas)',
        'hat': 'Tom',
        'por': 'Tom',
        'deu': 'Tom(s)?',
        'nld': '(Tom(s)?|Thomas)',
        'dan': 'Tom(s)?',
        'nob': 'Tom(s|i)?',
        'swe': 'Tom(s|i)?',
        'fin': "Tom(m|')?(i(n|lla|a|lle|sta|lta|in|sta|ssa|kin|i|l|kaan|st|tta)?|kaan)?"
    },
    'Mary': {
        'eng': 'Mar(y|ia|ie)',
        'spa': 'Mar(y|ía|ia)',
        'fra': 'Mar(y|ie|ia)',
        'rus': '(Мэри|Мари(я|и|у|ей)|Маш(у|а|ей|и))',
        'ita': 'Mar(ie|ia|y|i)',
        'hat': 'Mary',
        'por': 'Mar(y|ia)',
        'deu': 'Mar(y|ia|i)(s)?',
        'nld': 'Mar(y|ia|ie|yam)',
        'dan': 'Mar(y|ia|ie|i)(s)?',
        'nob': 'Mar(y|ia|ie|i)(s)?',
        'swe': 'Mar(y|ia|ie|i)(s)?',
        'fin': 'Mar(i|y)(n|a|lle|sta|a|lla|lta|ä|ja|in|stä|yn|llä|aa|en)?'
    }
}

# Common names for substitution.
# Structure is names[language][gender] -> list of candidate names, which is how
# DataSource.random_name indexes it. Only 'eng' is provided here.
names = {
    'eng': {
        'male':  [
            'Omar', 'Peter', 'Santiago', 'Daniel', 'William', 'Luis', 'James', 'John', 'Robert', 'Gabriel',
            'Oliver', 'Jonas', 'Charlie', 'Jack', 'Leonardo', 'David', 'Alexander', 'Sergei', 'Abraham', 'Tatsuki'
        ],
        'female': [
            'Anna', 'Emily', 'Natalia', 'Salma', 'Valentina', 'Olivia', 'Amelia', 'Viktoria', 'Anastasia',
            'Maryam', 'Sakura', 'Charlotte', 'Sarah', 'Ashley', 'Samantha', 'Laura', 'Latifa', 'Carlota', 'Eva',
            'Olga'
        ]
    }
}

In [3]:
class DataSource:
    """Sentences and translation links loaded from a Tatoeba export, limited to known languages.

    Attributes set by ``__init__``:
        name_patterns: ``{name: {language: regex fragment}}`` used to find a name in a sentence.
        new_names: ``{language: {gender: [names]}}`` pool used by ``random_name``.
        data_path: directory that contains ``sentences.csv`` and ``links.csv``.
        languages: the three-letter codes that are keys of ``iso_map``.
        sentences: DataFrame indexed by ``tatoeba_id`` with columns
            ``tatoeba_id``, ``language`` and ``sentence``.
        translations: DataFrame with columns ``source``, ``target``, ``source_language`` and
            ``target_language``; only links whose both ends are in ``sentences`` are kept.
    """

    # Three-letter language code (as found in sentences.csv) -> two-letter code used for the MT models.
    iso_map = {
        "eng": "en",
        "rus": "ru",
        "ita": "it",
        "deu": "de",
        "fra": "fr",
        "por": "pt",
        "spa": "es",
        "nld": "nl",
        "fin": "fi",
        "dan": "da",
        "swe": "sv",
        "nob": "no",
        "hat": "ht"
    }
    
    
    def __init__(self, *, data_path: str, name_patterns : dict, new_names: dict, seed: int = 42):
        """Load both CSV files from ``data_path`` and seed the global ``random`` module with ``seed``."""
        self.name_patterns = name_patterns
        self.new_names = new_names
        self.data_path = data_path
        self.languages = list(self.iso_map.keys())
        self.sentences = self._load_sentences()
        self.translations = self._load_translations()
        # Seeds the module-level generator, so every later random.* call in the process is affected.
        random.seed(seed)
        
        
    def _load_sentences(self):
        """Read ``sentences.csv`` (tab separated, no header) and keep only the supported languages."""
        path = os.path.join(self.data_path, 'sentences.csv')
        sentences = pd.read_csv(path, sep='\t', names=['tatoeba_id', 'language', 'sentence'])
        sentences = sentences[sentences['language'].isin(self.languages)]
        return sentences.set_index('tatoeba_id', drop=False).sort_index()
    
    
    def _load_translations(self):
        """Read ``links.csv`` and annotate each link with the language of both of its sentences.

        Links pointing to a sentence that is not in ``self.sentences`` get NaN from the join and
        are removed by ``dropna``.
        """
        path = os.path.join(self.data_path, 'links.csv')
        translations = pd.read_csv(path, sep='\t', names = ['source', 'target'])
        # Attach the language of the target sentence.
        translations = translations.set_index('target', drop=False).join(self.sentences).dropna()[[
            'source', 'target', 'language'
        ]].rename(columns = {'language': 'target_language'})

        # Attach the language of the source sentence.
        translations = translations.set_index('source', drop=False).join(self.sentences).dropna()[[
            'source', 'target', 'language', 'target_language'
        ]].rename(columns = {'language': 'source_language'})

        return translations
    
    
    def random_name(self, gender: str, lang: str):
        """Return one name drawn at random from ``new_names[lang][gender]``."""
        # Kept as random.sample (not random.choice) so the sequence drawn for a given seed is unchanged.
        return random.sample(self.new_names[lang][gender], 1)[0]
    
    
    def name_sentences(self, name: str, lang: str):
        """Return the rows of ``self.sentences`` in ``lang`` that contain ``name`` as a whole word.

        Rows whose sentence is not a string (e.g. NaN produced while parsing the CSV) never match.
        """
        pattern = self.name_patterns[name][lang]
        lang_sentences = self.sentences.loc[self.sentences['language'] == lang]
        # The fragment is grouped so the word boundaries apply to every alternative in it.
        name_regex = re.compile(r'\b(?:' + pattern + r')\b')
        # A plain regex search per row avoids both the pandas "match groups" warning raised by
        # str.contains and the NaN entries it would put in the mask.
        mask = lang_sentences['sentence'].map(
            lambda text: isinstance(text, str) and name_regex.search(text) is not None
        ).astype(bool)
        return lang_sentences[mask]
    
    
    def name_pattern(self, name: str, lang: str):
        """Return the regex fragment registered for ``name`` in ``lang``."""
        return self.name_patterns[name][lang]
    
    
    def iso(self, lang: str):
        """Translate a three-letter language code into its two-letter counterpart."""
        return self.iso_map[lang]

    
    def find_translations(self, source_ids, tgt_lang):
        """Map each id in ``source_ids`` to the text of one of its translations in ``tgt_lang``.

        The first matching link (in the order of ``self.translations``) is used. Ids with no
        translation in ``tgt_lang`` map to ``None``.

        Returns:
            dict ``{source_id: sentence text or None}`` with one entry per distinct source id.
        """
        source_ids = list(source_ids)
        links = self.translations[
            self.translations['source'].isin(source_ids)
            & (self.translations['target_language'] == tgt_lang)
        ]
        # Keep only the first link of every source sentence.
        first_target = links.drop_duplicates(subset='source').set_index('source')['target'].to_dict()
        mapping = {source_id: first_target.get(source_id) for source_id in source_ids}

        keys = [target for target in mapping.values() if target is not None]
        targets = self.sentences.loc[keys, 'sentence'].to_dict() if keys else {}
        return {k: targets[v] if v is not None else None for (k, v) in mapping.items()}

In [4]:
class NameGenerator: 
    """Assigns one replacement name to every sentence reachable through translation links.

    ``replacement_map`` maps a Tatoeba sentence id to the name chosen for it; sentences linked as
    translations of each other receive the same name when one of them was already assigned.
    ``takeback_ids`` collects the ids passed to ``takeback`` plus the ids reachable from them.
    """

    def __init__(self, *, data: DataSource, name: str, gender: str, init_lang: str, max_depth: int = 15):
        self.name = name
        self.gender = gender
        self.init_lang = init_lang
        self.max_depth = max_depth
        self.data = data
        self.replacement_map = {}
        self.takeback_ids = []
        
        
    def generate(self):
        """Fill the map starting from every ``init_lang`` sentence with the name; return its size."""
        self._fill_replacement_map()
        return len(self.replacement_map)
    
    
    def _pickle_path(self, data_path):
        # File name is derived from the lower-cased original name, e.g. 'tom_replacements.pkl'.
        return os.path.join(data_path, self.name.lower() + '_replacements.pkl')


    def save(self, data_path):
        """Pickle ``replacement_map`` into ``data_path``."""
        with open(self._pickle_path(data_path), 'wb') as handle:
            pickle.dump(self.replacement_map, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    
    def load(self, data_path):
        """Replace ``replacement_map`` with the pickle stored in ``data_path``."""
        # NOTE(review): pickle.load executes arbitrary code; only load files this notebook produced.
        with open(self._pickle_path(data_path), 'rb') as handle:
            self.replacement_map = pickle.load(handle)
    
    
    def get_name_replacement(self, tatoeba_id):
        """Return the name assigned to ``tatoeba_id``, assigning one first when it is missing."""
        missing = tatoeba_id not in self.replacement_map
        if missing:
            self._fill_for_ids([tatoeba_id])
        return self.replacement_map[tatoeba_id]
    
    
    def _links_from(self, tatoeba_id):
        # All translation links whose source is the given sentence.
        links = self.data.translations
        return links[links['source'] == tatoeba_id]


    def takeback(self, tatoeba_id, depth=0):
        """Record ``tatoeba_id`` and, recursively, its translations in ``takeback_ids``.

        Recursion stops once ``depth`` exceeds ``max_depth`` or an id was already recorded.
        """
        if depth > self.max_depth or tatoeba_id in self.takeback_ids:
            return
        self.takeback_ids.append(tatoeba_id)
        self._takeback_translations(self._links_from(tatoeba_id), depth)
    
    
    def _takeback_translations(self, translations, depth):
        """Apply ``takeback`` one level deeper to every target of ``translations``."""
        next_depth = depth + 1
        for target_id in translations['target']:
            self.takeback(target_id, next_depth)
        

    def _fill_replacement_map(self):
        """Seed the map from the ``init_lang`` sentences that contain the original name."""
        seed_rows = self.data.name_sentences(self.name, self.init_lang)
        self._fill_for_ids(seed_rows['tatoeba_id'])
    
    
    def _fill_for_ids(self, sentence_ids, depth=0):
        """Assign a name to each unseen id, then descend into its translations."""
        if depth > self.max_depth:
            return
        for sentence_id in sentence_ids:
            if sentence_id in self.replacement_map:
                continue
            links = self._links_from(sentence_id)
            self.replacement_map[sentence_id] = self._get_name(links)
            # Running counter, overwritten in place thanks to the carriage return.
            print(len(self.replacement_map), end='\r')
            self._fill_translations(links, depth)
            
    
    def _fill_translations(self, translations, depth):
        """Visit the targets of ``translations`` language by language, one level deeper."""
        for lang_code in self.data.languages:
            in_language = translations['target_language'] == lang_code
            self._fill_for_ids(translations[in_language]['target'], depth + 1)
    
    
    def _get_name(self, translations):
        """Reuse the name of an already-mapped translation, else draw a fresh random one."""
        already_named = [
            target_id for target_id in translations['target'] if target_id in self.replacement_map
        ]
        if already_named:
            return self.replacement_map[already_named[0]]
        return self.data.random_name(self.gender, self.init_lang)

In [5]:
class Translator: 
    """Thin wrapper around the Helsinki-NLP MarianMT models with a one-shot translation cache.

    ``models`` and ``cache`` are class attributes, so they are shared by every ``Translator``
    instance. ``cache`` is keyed by ``(src_lang, tgt_lang, text)`` so that the same text requested
    for a different language pair is never served a translation made for another pair.
    """
    models = {}
    cache = {}

    # Language codes covered by the two multilingual ROMANCE <-> English models.
    _ROMANCE_LANGS = (
        "fr", "es", "it", "pt", "ro", "ca", "gl", "la", "wa", "fur", "oc", "sc", "an", "frp",
        "lad", "vec", "co", "lld", "lij", "lmo", "nap", "rm", "scn", "mwl"
    )
    # Characters the models sometimes emit at the very start of a translation.
    _LEADING_JUNK = re.compile(r'^(▸|▪|::|\-|□|–|"|\*|♪)\s*')
    # Marker used by translate_fast to pack several sentences into one model input.
    _FAST_SEPARATOR = ' @123. '

    def translate(self, text: str, src_lang: str, tgt_lang: str):
        """Translate one sentence, consuming a cached result when there is one.

        A cache hit removes the entry (the cache is meant for a single later use), so asking for
        the same text twice translates it again the second time.
        """
        cached = self.cache.pop((src_lang, tgt_lang, text), None)
        if cached is not None:
            return cached
        return self.translate_batch([text], src_lang, tgt_lang)[0]


    def translate_batch(self, texts: list, src_lang: str, tgt_lang: str):
        """Translate ``texts`` one model input per sentence; returns a list of the same length."""
        if len(texts) == 0:
            return []
        return self._run_model(texts, src_lang, tgt_lang)

    
    def translate_fast(self, texts: list, src_lang: str, tgt_lang: str):
        """Translate ``texts`` packing up to two sentences per model input.

        The packed sentences are separated with a marker and split again afterwards. The model may
        alter or drop the marker, so the result can have a different length than ``texts``; callers
        must check the length (``cache_translations`` does).
        """
        if len(texts) == 0:
            return []

        packed = [
            self._FAST_SEPARATOR.join(group)
            for group in np.array_split(texts, len(texts)//2 + 1)
        ]
        translations = self._run_model(packed, src_lang, tgt_lang)

        pieces = (piece.strip() for t in translations for piece in re.split(r'@123\.?', t))
        return [piece for piece in pieces if piece != '']
 

    def cache_translations(self, texts: list, src_lang: str, tgt_lang: str, fast: bool):
        """Translate ``texts`` and store every result in the cache for a later ``translate`` call.

        With ``fast`` the packed strategy is tried first; if it does not return exactly one
        translation per text, the regular batch translation is used instead.
        """
        if fast:
            translations = self.translate_fast(texts, src_lang, tgt_lang)
            if len(texts) != len(translations):
                print('Failed fast translation')
                translations = self.translate_batch(texts, src_lang, tgt_lang)
        else:
            translations = self.translate_batch(texts, src_lang, tgt_lang)
            
        for text, translation in zip(texts, translations):
            self.cache[(src_lang, tgt_lang, text)] = translation
    
    
    def _run_model(self, texts, src_lang, tgt_lang):
        """Run the model on non-empty ``texts`` and clean up the decoded output."""
        model, tokenizer = self._load_model(src_lang, tgt_lang)

        # The target language is specified as a special token within the source string
        batch = [f'>>{tgt_lang}<< {text}' for text in texts]

        if hasattr(tokenizer, 'prepare_translation_batch'):
            encoded = tokenizer.prepare_translation_batch(batch)
        else:
            # Newer transformers releases removed prepare_translation_batch; calling the
            # tokenizer directly is the documented replacement.
            encoded = tokenizer(batch, return_tensors='pt', padding=True, truncation=True)

        generated = model.generate(**encoded)
        decoded = [tokenizer.decode(t, skip_special_tokens=True) for t in generated]

        # Resulting translations sometimes contain weird characters at the start, remove them
        decoded = [self._LEADING_JUNK.sub('', t) for t in decoded]

        # Drop a final period the model added when the source text did not end with one.
        return [
            translation if text.endswith('.') else re.sub(r'\.$', '', translation)
            for text, translation in zip(texts, decoded)
        ]


    def _get_model_name(self, src_lang, tgt_lang):
        """Return the Hugging Face model id for the language pair."""
        # There are only two models from romance languages to English and back
        romance_langs = self._ROMANCE_LANGS
        src_lang = 'ROMANCE' if src_lang in romance_langs and tgt_lang == 'en' else src_lang
        tgt_lang = 'ROMANCE' if tgt_lang in romance_langs and src_lang == 'en' else tgt_lang
        tgt_lang = 'NORWAY' if tgt_lang == 'no' else tgt_lang
        return 'Helsinki-NLP/opus-mt-{0}-{1}'.format(src_lang, tgt_lang)


    def _load_model(self, src_lang, tgt_lang):
        """Return ``(model, tokenizer)`` for the pair, loading them on first use."""
        model_name = self._get_model_name(src_lang, tgt_lang)
        if model_name not in self.models:
            print('Loading ' + model_name)
            model = MarianMTModel.from_pretrained(model_name)
            tokenizer = MarianTokenizer.from_pretrained(model_name)
            # Save the loaded model in 'models' to speed up its later use
            self.models[model_name] = (model, tokenizer)
        return self.models[model_name]

In [None]:
# Scratch check of Translator.translate_fast on 150 randomly sampled English "Tom" sentences:
# the printed length shows how many translations came back for the 150 inputs.
# NOTE(review): `data` is only assigned further down (the `data = DataSource(...)` cell), so this
# cell fails on a fresh kernel when the notebook is run top to bottom.
sentences = data.name_sentences('Tom', 'eng').sample(150)['sentence'].to_list()
translator = Translator()
translations = translator.translate_fast(sentences, 'en', 'fr')
print(len(translations))
translations

In [6]:
class NameReplacer:
    """Replaces the placeholder name (e.g. Tom) in a sentence with a new name.

    Languages in ``basic_langs`` are handled with a direct regex substitution. For the remaining
    (inflected) languages the replacement is inferred by machine-translating a reference sentence
    in ``init_lang`` twice, once with the old and once with the new name, and diffing the results.
    """
    basic_langs = ['eng', 'spa', 'ita', 'hat', 'por']
    
    def __init__(self, *, name: str, translator: Translator, data: DataSource, init_lang: str):
        assert init_lang in self.basic_langs
        self.translator = translator
        self.data = data
        self.init_lang = init_lang
        self.name = name
    
    
    def replace_name(self, *, new_name: str, lang: str, sentence: str, ref_sentence: str = None):
        """Return ``sentence`` with the name replaced, or ``None`` when no safe replacement exists.

        ``ref_sentence`` (the sentence in ``init_lang``) is required for non-basic languages.
        """
        if lang in self.basic_langs:
            return self._replace_basic(new_name, sentence, lang)
        else:
            return self._replace_inflected(new_name, sentence, ref_sentence, lang)
    
    
    def is_basic(self, lang):
        """Tell whether ``lang`` can be handled by plain substitution."""
        return lang in self.basic_langs
    

    def _replace_basic(self, new_name: str, sentence: str, lang: str):
        """Substitute every whole-word occurrence of the name pattern with ``new_name``."""
        old_name = self.data.name_pattern(self.name, lang)
        # Word boundaries keep words that merely start with the name ("Tomorrow") intact, matching
        # how DataSource.name_sentences selects sentences. The callable stops ``new_name`` from
        # being interpreted as a replacement template.
        return re.sub(r'\b(?:' + old_name + r')\b', lambda _match: new_name, sentence)


    def _replace_inflected(self, new_name: str, sentence: str, ref_sentence: str, lang: str):
        """Infer the inflected form of ``new_name`` from two machine translations and apply it.

        Returns ``None`` when the two translations differ in zero or several places, when the
        differing span does not contain the old name, or when that span cannot be found in
        ``sentence``.
        """
        assert ref_sentence is not None
         
        src_iso = self.data.iso(self.init_lang)
        tgt_iso = self.data.iso(lang)

        # Translate the original sentence (with the old name) into the target language
        ref_translated = self.translator.translate(ref_sentence, src_iso, tgt_iso)
        # Replace the name in the original sentence with the new name
        ref_replaced = self.replace_name(
            new_name=new_name, lang=self.init_lang, sentence=ref_sentence
        )
        # Translate the previous sentence (with the new name) into the target language 
        ref_replaced_translated = self.translator.translate(ref_replaced, src_iso, tgt_iso)
        
        # Get the difference between both translations, hoping that the thing that changed was the name or some
        # surrounding text, in order to find the most likely inflection of the replaced name in the target language
        repls = self._find_replacements(ref_translated, ref_replaced_translated)

        # Don't replace if it finds more than one replacement, because the ambiguity might cause serious mistakes.
        if len(repls) != 1:
            return None

        old_span, new_span = repls[0]
        old_name = self.data.name_pattern(self.name, lang)
        # If the differing span does not contain the name, give up on this sentence.
        if re.search(old_name, old_span) is None:
            return None

        # Turn the old span into a regex: the name becomes its pattern (to match all variations)
        # and everything around it is escaped so punctuation is matched literally.
        parts = []
        cursor = 0
        for match in re.finditer(old_name, old_span):
            parts.append(re.escape(old_span[cursor:match.start()]))
            parts.append('(?:' + old_name + ')')
            cursor = match.end()
        parts.append(re.escape(old_span[cursor:]))

        replaced, count = re.subn(''.join(parts), lambda _match: new_span, sentence)
        # Nothing was substituted: report the failure instead of returning the sentence unchanged.
        return replaced if count else None

    
    def _find_replacements(self, base, replaced): 
        """Return ``(old text, new text)`` for every token span that differs between both strings."""
        base = word_tokenize(base)
        replaced = word_tokenize(replaced)
        s = SequenceMatcher(None, base, replaced)
        return [
            (' '.join(base[i1:i2]), ' '.join(replaced[j1:j2]))
            for tag, i1, i2, j1, j2 in s.get_opcodes()
            if tag == 'replace'
        ]

In [7]:
class BulkReplacer:
    """Replaces the generator's name in every sentence of a language, chunk by chunk."""

    def __init__(self, *, generator: NameGenerator, data: DataSource):
        self.data = data
        self.generator = generator
        self.init_lang = generator.init_lang
        self.translator = Translator()
        self.replacer = NameReplacer(
            name=generator.name, init_lang=self.init_lang, translator=self.translator, data=data
        )
        
    
    def replace_lang_names(self, lang, fast=False):
        """Return the ``lang`` sentences containing the name plus a ``sentence_new`` column.

        ``sentence_new`` holds the sentence with the name replaced, or the original sentence when
        no replacement could be made. ``fast`` is forwarded to ``Translator.cache_translations``.
        """
        lang_sentences = self.data.name_sentences(self.generator.name, lang)
        n_chunks = len(lang_sentences)//150 + 1
        chunks = []
        # Split row positions rather than the frame itself: same chunk sizes, but every chunk is a
        # regular frame that can be extended without touching ``lang_sentences``.
        for positions in tqdm(np.array_split(np.arange(len(lang_sentences)), n_chunks)):
            chunk = lang_sentences.iloc[positions]
            refs = {}
            if not self.replacer.is_basic(lang):
                refs = self._prepare_refs(chunk, lang, fast)
            new_sentences = [self._replace_row(row, lang, refs) for _, row in chunk.iterrows()]
            chunks.append(chunk.assign(sentence_new=new_sentences))
        return pd.concat(chunks)
        
    
    def _prepare_refs(self, chunk, lang, fast):
        """Find ``init_lang`` references for the chunk and pre-translate them in batches.

        Both the reference and the reference with the new name are translated into ``lang`` and
        cached, so the per-row work in ``NameReplacer`` hits the cache.

        Returns:
            dict ``{tatoeba_id: reference sentence or None}``.
        """
        src_iso = self.data.iso(self.init_lang)
        tgt_iso = self.data.iso(lang)

        tatoeba_ids = chunk['tatoeba_id'].to_list()
        refs = self.data.find_translations(tatoeba_ids, self.init_lang)

        texts = [ref for ref in refs.values() if ref is not None]
        if texts:
            self.translator.cache_translations(texts, src_iso, tgt_iso, fast)
        
        replaced_refs = []
        for tatoeba_id in tatoeba_ids:
            ref = refs[tatoeba_id]
            if ref is not None:
                new_name = self.generator.get_name_replacement(tatoeba_id)
                replaced_refs.append(
                    self.replacer.replace_name(new_name=new_name, lang=self.init_lang, sentence=ref)
                )
        
        if replaced_refs:
            self.translator.cache_translations(replaced_refs, src_iso, tgt_iso, fast)
        return refs
    
    
    def _replace_row(self, row, lang, refs):
        """Return the replaced sentence for one row, or the original one when replacing fails.

        ``refs`` maps sentence ids to reference sentences in ``init_lang``. When a non-basic
        language has no reference, the sentence is machine-translated into ``init_lang`` and that
        translation is used as the reference. A failed replacement is reported to the generator
        through ``takeback``.
        """
        tatoeba_id = row['tatoeba_id']
        new_name = self.generator.get_name_replacement(tatoeba_id)
        ref = refs.get(tatoeba_id)
        if ref is None and not self.replacer.is_basic(lang):
            ref = self.translator.translate(
                row['sentence'], self.data.iso(lang), self.data.iso(self.init_lang)
            )
        
        new_sentence = self.replacer.replace_name(
            new_name=new_name, lang=lang, sentence=row['sentence'], ref_sentence=ref
        )
        
        if new_sentence is None:
            new_sentence = row['sentence']
            self.generator.takeback(tatoeba_id)
        return new_sentence

In [8]:
# Load the Tatoeba sentences and links (sentences.csv / links.csv under data_path) once; the
# name patterns and replacement names are the tables defined near the top of the notebook.
data = DataSource(data_path='../data/sources/tatoeba', name_patterns=tom_mary, new_names=names)

In [9]:
# Name generator for 'Tom': replacements are drawn from the male English names.
generator = NameGenerator(name='Tom', gender='male', init_lang='eng', data=data)
#generator.generate()

# Reuse the replacement map pickled by an earlier run (tom_replacements.pkl) instead of
# regenerating it. The commented-out lines are the calls that build and save that file.
generator.load('../data/objects')

#generator.generate()
#generator.save('../data/objects')

# BulkReplacer creates its own Translator and NameReplacer from the generator's settings.
bulk = BulkReplacer(generator=generator, data=data)

In [None]:
# Replace 'Tom' in every Russian sentence and time the run (elapsed seconds are the cell output).
# The recorded progress bar below shows 880 chunks at roughly 70-150 s each, so this is a
# many-hours cell.
start = time.time()
rus = bulk.replace_lang_names('rus')
end = time.time()
end-start

  return func(self, *args, **kwargs)
  0%|          | 0/880 [00:00<?, ?it/s]

Loading Helsinki-NLP/opus-mt-en-ru




Loading Helsinki-NLP/opus-mt-ru-en


  0%|          | 2/880 [03:50<26:48:55, 109.95s/it]

818442

  0%|          | 3/880 [05:07<24:21:29, 99.99s/it] 

818447

  0%|          | 4/880 [06:19<22:18:04, 91.65s/it]

818457

  1%|          | 5/880 [07:16<19:46:07, 81.33s/it]

818459

  1%|          | 6/880 [08:32<19:18:52, 79.56s/it]

818462

  1%|          | 7/880 [09:29<17:39:47, 72.84s/it]

818464

  1%|          | 8/880 [10:23<16:16:06, 67.16s/it]

818476

  1%|          | 9/880 [12:04<18:40:25, 77.18s/it]

818500

  1%|          | 10/880 [13:45<20:23:59, 84.41s/it]

818522

  1%|▏         | 11/880 [15:20<21:07:54, 87.54s/it]

818542

  1%|▏         | 12/880 [17:11<22:49:26, 94.66s/it]

818578

  1%|▏         | 13/880 [18:46<22:49:39, 94.79s/it]

818592

  2%|▏         | 14/880 [19:53<20:45:42, 86.31s/it]

818615

  2%|▏         | 15/880 [21:17<20:37:30, 85.84s/it]

818646

  2%|▏         | 16/880 [22:44<20:38:47, 86.03s/it]

818686

  2%|▏         | 17/880 [24:23<21:35:53, 90.10s/it]

818715

  2%|▏         | 18/880 [26:08<22:35:40, 94.36s/it]

818727

  2%|▏         | 19/880 [27:37<22:12:51, 92.88s/it]

818743

  2%|▏         | 21/880 [30:07<19:51:33, 83.23s/it]

818746

  2%|▎         | 22/880 [31:16<18:53:07, 79.24s/it]

818747818748818749818750818751818752818753818754818755

  3%|▎         | 23/880 [32:47<19:41:23, 82.71s/it]

818763

  3%|▎         | 24/880 [34:29<21:00:27, 88.35s/it]

818767

  3%|▎         | 25/880 [37:23<27:04:46, 114.02s/it]

818768818769

  3%|▎         | 27/880 [40:03<22:47:15, 96.17s/it] 

818773

  3%|▎         | 28/880 [41:16<21:06:04, 89.16s/it]

818790

  3%|▎         | 29/880 [42:55<21:46:05, 92.09s/it]

818806

  3%|▎         | 30/880 [44:41<22:43:40, 96.26s/it]

818812

  4%|▎         | 31/880 [46:40<24:18:15, 103.06s/it]

818851

  4%|▎         | 32/880 [48:13<23:35:19, 100.14s/it]

818871

  4%|▍         | 33/880 [49:48<23:12:52, 98.67s/it] 

818875

  4%|▍         | 34/880 [51:27<23:11:12, 98.67s/it]

818882

  4%|▍         | 35/880 [52:31<20:45:39, 88.45s/it]

818887

  4%|▍         | 36/880 [53:48<19:54:52, 84.94s/it]

818889

  4%|▍         | 37/880 [55:06<19:22:55, 82.77s/it]

818894

  4%|▍         | 38/880 [56:33<19:40:24, 84.11s/it]

818897

  4%|▍         | 39/880 [57:35<18:05:52, 77.47s/it]

818976

  5%|▍         | 40/880 [59:27<20:28:27, 87.75s/it]

818998

  5%|▍         | 41/880 [1:01:12<21:40:00, 92.97s/it]

819006

  5%|▍         | 42/880 [1:02:51<22:05:59, 94.94s/it]

819014

  5%|▍         | 43/880 [1:04:12<21:06:06, 90.76s/it]

819049

  5%|▌         | 44/880 [1:05:32<20:19:16, 87.51s/it]

819059

  5%|▌         | 45/880 [1:07:21<21:46:19, 93.87s/it]

819078

  5%|▌         | 46/880 [1:08:58<21:55:40, 94.65s/it]

819087

  5%|▌         | 47/880 [1:10:41<22:30:41, 97.29s/it]

819091

  5%|▌         | 48/880 [1:11:47<20:17:55, 87.83s/it]

819094

  6%|▌         | 49/880 [1:13:07<19:44:47, 85.55s/it]

819104

  6%|▌         | 50/880 [1:14:34<19:48:32, 85.92s/it]

819128

  6%|▌         | 51/880 [1:16:01<19:52:40, 86.32s/it]

819140

  6%|▌         | 52/880 [1:17:35<20:23:13, 88.64s/it]

819158

  6%|▌         | 53/880 [1:19:44<23:06:44, 100.61s/it]

819167

  6%|▌         | 54/880 [1:21:03<21:37:11, 94.23s/it] 

819178

  6%|▋         | 55/880 [1:22:18<20:17:51, 88.57s/it]

819192

  6%|▋         | 56/880 [1:23:50<20:30:12, 89.58s/it]

819211

  6%|▋         | 57/880 [1:25:39<21:47:56, 95.35s/it]

819215

  7%|▋         | 58/880 [1:27:12<21:35:21, 94.55s/it]

819235

  7%|▋         | 59/880 [1:28:47<21:37:48, 94.85s/it]

819236

  7%|▋         | 60/880 [1:31:20<25:35:10, 112.33s/it]

819272

  7%|▋         | 61/880 [1:33:11<25:24:14, 111.67s/it]

819279

  7%|▋         | 62/880 [1:34:44<24:07:25, 106.17s/it]

819287

  7%|▋         | 63/880 [1:36:49<25:21:32, 111.74s/it]

819288819289

  7%|▋         | 64/880 [1:37:52<22:00:23, 97.09s/it] 

819292

  7%|▋         | 65/880 [1:39:11<20:45:47, 91.71s/it]

819299

  8%|▊         | 66/880 [1:41:02<22:05:33, 97.71s/it]

819300

  8%|▊         | 67/880 [1:42:19<20:36:52, 91.28s/it]

819311

  8%|▊         | 68/880 [1:43:29<19:10:04, 84.98s/it]

819312819313

  8%|▊         | 69/880 [1:44:59<19:28:31, 86.45s/it]

819315

  8%|▊         | 70/880 [1:46:35<20:06:15, 89.35s/it]

819346

  8%|▊         | 71/880 [1:47:58<19:40:08, 87.53s/it]

819347819348

  8%|▊         | 72/880 [1:49:23<19:28:38, 86.78s/it]

819369

  8%|▊         | 73/880 [1:50:33<18:19:49, 81.77s/it]

819381

  8%|▊         | 74/880 [1:51:49<17:55:12, 80.04s/it]

819415

  9%|▊         | 75/880 [1:53:23<18:50:07, 84.23s/it]

819416

  9%|▊         | 76/880 [1:54:16<16:43:33, 74.89s/it]

819422

  9%|▉         | 77/880 [1:55:35<16:58:23, 76.09s/it]

819425

  9%|▉         | 78/880 [1:56:40<16:09:37, 72.54s/it]

819429

  9%|▉         | 79/880 [1:58:33<18:52:58, 84.87s/it]

819442

  9%|▉         | 80/880 [2:00:38<21:31:15, 96.84s/it]

819471

  9%|▉         | 82/880 [2:04:08<22:30:02, 101.51s/it]

819485

  9%|▉         | 83/880 [2:05:29<21:06:16, 95.33s/it] 

819503

 10%|▉         | 84/880 [2:07:03<20:57:10, 94.76s/it]

819512

 10%|▉         | 85/880 [2:08:37<20:52:59, 94.57s/it]

819522

 10%|▉         | 86/880 [2:10:18<21:17:21, 96.53s/it]

819564

 10%|▉         | 87/880 [2:11:52<21:08:34, 95.98s/it]

819583

 10%|█         | 88/880 [2:13:28<21:04:56, 95.83s/it]

819591

 10%|█         | 89/880 [2:14:47<19:57:05, 90.80s/it]

819615

 10%|█         | 90/880 [2:16:54<22:18:23, 101.65s/it]

819632

 10%|█         | 91/880 [2:18:37<22:23:11, 102.14s/it]

819677

 10%|█         | 92/880 [2:20:57<24:47:46, 113.28s/it]

819683

 11%|█         | 93/880 [2:22:00<21:28:43, 98.25s/it] 

819699

 11%|█         | 94/880 [2:23:39<21:31:21, 98.58s/it]

819709

 11%|█         | 95/880 [2:25:05<20:38:21, 94.65s/it]

819728

 11%|█         | 96/880 [2:27:19<23:10:46, 106.44s/it]

819764

 11%|█         | 97/880 [2:29:29<24:42:56, 113.63s/it]

819797

 11%|█         | 98/880 [2:31:14<24:07:15, 111.04s/it]

819807

 11%|█▏        | 99/880 [2:32:33<22:00:47, 101.47s/it]

819818

 11%|█▏        | 100/880 [2:34:17<22:08:33, 102.20s/it]

819833

 11%|█▏        | 101/880 [2:35:44<21:09:36, 97.79s/it] 

819850

 12%|█▏        | 102/880 [2:37:16<20:41:47, 95.77s/it]

819851

 12%|█▏        | 103/880 [2:38:22<18:46:02, 86.95s/it]

819856

 12%|█▏        | 104/880 [2:39:45<18:30:48, 85.89s/it]

819868

 12%|█▏        | 105/880 [2:41:40<20:22:56, 94.68s/it]

819875

 12%|█▏        | 106/880 [2:43:23<20:53:31, 97.17s/it]

819880

 12%|█▏        | 107/880 [2:44:50<20:10:37, 93.97s/it]

819889

 12%|█▎        | 110/880 [2:48:22<16:14:19, 75.92s/it]

819898

 13%|█▎        | 111/880 [2:49:33<15:53:55, 74.43s/it]

819906

 13%|█▎        | 112/880 [2:50:27<14:36:56, 68.51s/it]

819923

 13%|█▎        | 113/880 [2:51:45<15:10:58, 71.26s/it]

819928

 13%|█▎        | 114/880 [2:53:00<15:23:34, 72.34s/it]

819933

 13%|█▎        | 115/880 [2:54:50<17:44:58, 83.53s/it]

819947

 13%|█▎        | 116/880 [2:56:12<17:37:58, 83.09s/it]

819958

 13%|█▎        | 117/880 [2:57:40<17:56:17, 84.64s/it]

819988

 13%|█▎        | 118/880 [2:59:47<20:37:40, 97.45s/it]

820001

 14%|█▎        | 119/880 [3:01:13<19:50:55, 93.90s/it]

820009

 14%|█▎        | 120/880 [3:02:36<19:09:45, 90.77s/it]

820032

 14%|█▍        | 121/880 [3:04:27<20:23:15, 96.70s/it]

820041

 14%|█▍        | 122/880 [3:05:47<19:18:04, 91.67s/it]

820092

 14%|█▍        | 123/880 [3:07:32<20:09:23, 95.86s/it]

820108

 14%|█▍        | 124/880 [3:08:54<19:13:12, 91.52s/it]

820129

 14%|█▍        | 125/880 [3:10:34<19:44:03, 94.10s/it]

820134

 14%|█▍        | 126/880 [3:11:47<18:23:46, 87.83s/it]

820138

 14%|█▍        | 127/880 [3:13:08<17:55:35, 85.70s/it]

820144

 15%|█▍        | 128/880 [3:14:25<17:22:31, 83.18s/it]

820152

 15%|█▍        | 129/880 [3:16:06<18:27:51, 88.51s/it]

820155

 15%|█▍        | 130/880 [3:17:13<17:06:14, 82.10s/it]

820161

 15%|█▍        | 131/880 [3:18:22<16:14:59, 78.10s/it]

820167

 15%|█▌        | 132/880 [3:19:33<15:47:49, 76.03s/it]

820184

 15%|█▌        | 133/880 [3:20:56<16:12:57, 78.15s/it]

820187

 15%|█▌        | 134/880 [3:22:28<17:00:27, 82.07s/it]

820188

 15%|█▌        | 135/880 [3:23:25<15:25:38, 74.55s/it]

820201

 15%|█▌        | 136/880 [3:24:55<16:23:51, 79.34s/it]

820207

 16%|█▌        | 137/880 [3:25:56<15:13:07, 73.74s/it]

820213

 16%|█▌        | 138/880 [3:27:17<15:38:51, 75.92s/it]

820224

 16%|█▌        | 139/880 [3:28:25<15:09:53, 73.68s/it]

820232

 16%|█▌        | 140/880 [3:29:41<15:18:06, 74.44s/it]

820240

 16%|█▌        | 141/880 [3:30:57<15:22:41, 74.91s/it]

820247

 16%|█▌        | 142/880 [3:32:11<15:16:29, 74.51s/it]

820248820249

 16%|█▋        | 143/880 [3:33:21<14:57:48, 73.09s/it]

820259

 16%|█▋        | 144/880 [3:34:44<15:33:22, 76.09s/it]

820262

 16%|█▋        | 145/880 [3:35:58<15:22:56, 75.34s/it]

820263

 17%|█▋        | 146/880 [3:37:04<14:48:45, 72.65s/it]

820277

 17%|█▋        | 147/880 [3:38:19<14:55:42, 73.32s/it]

820280

 17%|█▋        | 148/880 [3:39:24<14:25:35, 70.95s/it]

820285

 17%|█▋        | 149/880 [3:40:22<13:35:14, 66.91s/it]

820290

 17%|█▋        | 150/880 [3:41:37<14:03:42, 69.35s/it]

820304

 17%|█▋        | 151/880 [3:43:05<15:09:51, 74.89s/it]

820317

 17%|█▋        | 152/880 [3:44:27<15:37:15, 77.25s/it]

820326

 17%|█▋        | 153/880 [3:45:47<15:44:20, 77.94s/it]

820338

 18%|█▊        | 154/880 [3:47:07<15:50:32, 78.56s/it]

820363

 18%|█▊        | 155/880 [3:48:37<16:31:52, 82.09s/it]

820383

 18%|█▊        | 156/880 [3:50:06<16:53:49, 84.02s/it]

820394

 18%|█▊        | 157/880 [3:51:30<16:52:49, 84.05s/it]

820401

 18%|█▊        | 158/880 [3:52:57<17:03:23, 85.05s/it]

820412

 18%|█▊        | 159/880 [3:54:50<18:42:02, 93.37s/it]

820420

 18%|█▊        | 160/880 [3:56:04<17:32:30, 87.71s/it]

820424

 18%|█▊        | 161/880 [3:57:06<15:58:36, 80.00s/it]

820425

 18%|█▊        | 162/880 [3:58:07<14:46:45, 74.10s/it]

820428

 19%|█▊        | 163/880 [3:59:05<13:49:42, 69.43s/it]

820433

 19%|█▊        | 164/880 [4:00:07<13:22:01, 67.21s/it]

820448

 19%|█▉        | 165/880 [4:01:34<14:31:43, 73.15s/it]

820453

 19%|█▉        | 166/880 [4:02:50<14:39:08, 73.88s/it]

820468

 19%|█▉        | 167/880 [4:05:13<18:44:45, 94.65s/it]

820479

 19%|█▉        | 168/880 [4:07:13<20:11:42, 102.11s/it]

820483

 19%|█▉        | 169/880 [4:08:07<17:21:37, 87.90s/it] 

820487

 19%|█▉        | 170/880 [4:09:11<15:54:01, 80.62s/it]

820501

 19%|█▉        | 171/880 [4:10:53<17:08:44, 87.06s/it]

820504

 20%|█▉        | 172/880 [4:12:14<16:44:32, 85.13s/it]

820514

 20%|█▉        | 173/880 [4:13:38<16:40:59, 84.95s/it]

820611

 20%|█▉        | 174/880 [4:16:05<20:16:34, 103.39s/it]

820646

 20%|█▉        | 175/880 [4:17:24<18:49:59, 96.17s/it] 

820770

 20%|██        | 176/880 [4:20:05<22:35:18, 115.51s/it]

820801

 20%|██        | 177/880 [4:21:23<20:23:39, 104.44s/it]

820864

 20%|██        | 178/880 [4:23:21<21:07:57, 108.37s/it]

820868

 20%|██        | 179/880 [4:24:36<19:09:15, 98.37s/it] 

820915

 20%|██        | 180/880 [4:26:29<20:00:18, 102.88s/it]

820964

 21%|██        | 181/880 [4:28:12<19:57:29, 102.79s/it]

820973

 21%|██        | 182/880 [4:29:24<18:08:39, 93.58s/it] 

820974

 21%|██        | 183/880 [4:30:38<16:59:54, 87.80s/it]

820981

 21%|██        | 184/880 [4:31:46<15:47:58, 81.72s/it]

820990

 21%|██        | 185/880 [4:32:48<14:38:24, 75.83s/it]

820992

 21%|██        | 186/880 [4:33:53<13:59:54, 72.61s/it]

821005

 21%|██▏       | 187/880 [4:35:08<14:07:57, 73.42s/it]

821057

 21%|██▏       | 188/880 [4:37:05<16:36:07, 86.37s/it]

821064

 21%|██▏       | 189/880 [4:38:11<15:25:40, 80.38s/it]

821069

 22%|██▏       | 190/880 [4:39:07<13:58:30, 72.91s/it]

821081

 22%|██▏       | 191/880 [4:40:21<14:00:46, 73.22s/it]

821114

 22%|██▏       | 192/880 [4:41:42<14:27:23, 75.64s/it]

821162

 22%|██▏       | 193/880 [4:43:20<15:41:51, 82.26s/it]

821187

 22%|██▏       | 194/880 [4:44:51<16:12:39, 85.07s/it]

821193

 22%|██▏       | 195/880 [4:46:11<15:52:09, 83.40s/it]

821289

 22%|██▏       | 196/880 [4:48:28<18:54:19, 99.50s/it]

821337

 22%|██▏       | 197/880 [4:50:10<19:01:33, 100.28s/it]

821393

 22%|██▎       | 198/880 [4:52:08<19:59:04, 105.49s/it]

821424

 23%|██▎       | 199/880 [4:53:33<18:49:05, 99.48s/it] 

821446

 23%|██▎       | 200/880 [4:55:19<19:09:44, 101.45s/it]

821482

 23%|██▎       | 201/880 [4:56:46<18:17:59, 97.02s/it] 

821488

 23%|██▎       | 202/880 [4:57:49<16:22:24, 86.94s/it]

821517

 23%|██▎       | 203/880 [4:59:22<16:41:13, 88.73s/it]

821524

 23%|██▎       | 204/880 [5:00:36<15:50:39, 84.38s/it]

821607

 23%|██▎       | 205/880 [5:02:41<18:06:58, 96.62s/it]

821616

 23%|██▎       | 206/880 [5:03:46<16:18:40, 87.12s/it]

821626

 24%|██▎       | 207/880 [5:04:53<15:06:26, 80.81s/it]

821642

 24%|██▎       | 208/880 [5:06:12<15:00:28, 80.40s/it]

821669

 24%|██▍       | 209/880 [5:07:25<14:34:48, 78.22s/it]

821705

 24%|██▍       | 210/880 [5:09:05<15:47:28, 84.85s/it]

821758

 24%|██▍       | 211/880 [5:10:48<16:44:08, 90.06s/it]

821777

 24%|██▍       | 212/880 [5:12:05<16:00:23, 86.26s/it]

821779

 24%|██▍       | 213/880 [5:13:02<14:22:21, 77.57s/it]

821800

 24%|██▍       | 214/880 [5:14:13<13:56:51, 75.39s/it]

821804

 24%|██▍       | 215/880 [5:15:10<12:55:10, 69.94s/it]

821821

 25%|██▍       | 216/880 [5:16:30<13:28:07, 73.02s/it]

821847

 25%|██▍       | 217/880 [5:18:07<14:45:18, 80.12s/it]

821857

 25%|██▍       | 218/880 [5:19:25<14:37:16, 79.51s/it]

821875

 25%|██▍       | 219/880 [5:21:05<15:45:15, 85.80s/it]

821903

 25%|██▌       | 220/880 [5:22:28<15:32:45, 84.80s/it]

821912

 25%|██▌       | 221/880 [5:23:49<15:18:18, 83.61s/it]

821937

 25%|██▌       | 222/880 [5:25:15<15:25:32, 84.40s/it]

821972

 25%|██▌       | 223/880 [5:26:55<16:14:17, 88.98s/it]

822016

 25%|██▌       | 224/880 [5:28:32<16:41:34, 91.61s/it]

822070

 26%|██▌       | 225/880 [5:30:26<17:52:34, 98.25s/it]

822108

 26%|██▌       | 226/880 [5:31:58<17:30:49, 96.41s/it]

822125

 26%|██▌       | 227/880 [5:33:08<16:02:59, 88.48s/it]

822147

 26%|██▌       | 228/880 [5:34:36<15:59:51, 88.33s/it]

822161

 26%|██▌       | 229/880 [5:36:02<15:51:35, 87.70s/it]

822199

 26%|██▌       | 230/880 [5:37:57<17:18:08, 95.83s/it]

822214

 26%|██▋       | 231/880 [5:39:37<17:29:10, 97.00s/it]

822250

 26%|██▋       | 232/880 [5:41:13<17:25:55, 96.84s/it]

822266

 26%|██▋       | 233/880 [5:43:35<19:50:43, 110.42s/it]

822289

 27%|██▋       | 234/880 [5:45:01<18:27:36, 102.87s/it]

822319

 27%|██▋       | 235/880 [5:46:31<17:45:12, 99.09s/it] 

822329

 27%|██▋       | 236/880 [5:51:00<26:50:10, 150.02s/it]

822347

 27%|██▋       | 237/880 [5:53:35<27:03:05, 151.46s/it]

822371

 27%|██▋       | 238/880 [5:55:06<23:46:50, 133.35s/it]

822399

 27%|██▋       | 239/880 [5:56:32<21:12:49, 119.14s/it]

822430

 27%|██▋       | 240/880 [5:57:56<19:18:39, 108.62s/it]

822457

 27%|██▋       | 241/880 [5:59:34<18:43:43, 105.51s/it]

822506

 28%|██▊       | 242/880 [6:01:32<19:21:10, 109.20s/it]

822515

 28%|██▊       | 243/880 [6:03:18<19:10:08, 108.33s/it]

822535

 28%|██▊       | 244/880 [6:04:56<18:35:49, 105.27s/it]

822570

 28%|██▊       | 245/880 [6:06:53<19:10:52, 108.74s/it]

822591

 28%|██▊       | 246/880 [6:08:09<17:25:13, 98.92s/it] 

822632

 28%|██▊       | 247/880 [6:10:00<18:01:37, 102.52s/it]

822640

 28%|██▊       | 248/880 [6:11:19<16:44:38, 95.38s/it] 

822662

 28%|██▊       | 249/880 [6:12:45<16:13:43, 92.59s/it]

822684

 28%|██▊       | 250/880 [6:14:18<16:15:30, 92.91s/it]

822699

 29%|██▊       | 251/880 [6:15:28<15:00:28, 85.90s/it]

822702

 29%|██▊       | 252/880 [6:16:34<13:57:53, 80.05s/it]

822718

 29%|██▉       | 253/880 [6:18:03<14:24:45, 82.75s/it]

822748

 29%|██▉       | 254/880 [6:19:33<14:45:44, 84.90s/it]

822754

 29%|██▉       | 255/880 [6:20:37<13:38:19, 78.56s/it]

822765

 29%|██▉       | 256/880 [6:21:41<12:50:34, 74.09s/it]

822768

 29%|██▉       | 257/880 [6:23:12<13:43:12, 79.28s/it]

822801

 29%|██▉       | 258/880 [6:25:16<15:59:25, 92.55s/it]

822808

 29%|██▉       | 259/880 [6:26:33<15:09:33, 87.88s/it]

822817

 30%|██▉       | 260/880 [6:27:46<14:22:32, 83.47s/it]

822851

 30%|██▉       | 262/880 [6:30:01<12:36:34, 73.45s/it]

822854

 30%|██▉       | 263/880 [6:30:56<11:40:00, 68.07s/it]

822855

 30%|███       | 264/880 [6:31:55<11:11:49, 65.44s/it]

822879

 30%|███       | 265/880 [6:33:20<12:10:29, 71.27s/it]

822882

 30%|███       | 266/880 [6:35:00<13:36:08, 79.75s/it]

822886

 30%|███       | 267/880 [6:36:05<12:50:25, 75.41s/it]

822891

 30%|███       | 268/880 [6:37:54<14:30:17, 85.32s/it]

822898

 31%|███       | 269/880 [6:38:56<13:19:22, 78.50s/it]

822901

 31%|███       | 270/880 [6:40:04<12:46:38, 75.41s/it]

822907

 31%|███       | 271/880 [6:41:13<12:26:04, 73.50s/it]

822931

 31%|███       | 272/880 [6:42:43<13:12:52, 78.24s/it]

822980

 31%|███       | 273/880 [6:44:38<15:05:01, 89.46s/it]

822987

 31%|███▏      | 276/880 [6:48:03<12:20:34, 73.57s/it]

822993

 31%|███▏      | 277/880 [6:49:21<12:34:25, 75.07s/it]

823043

 32%|███▏      | 278/880 [6:51:11<14:16:05, 85.33s/it]

823085

 32%|███▏      | 279/880 [6:52:48<14:50:11, 88.87s/it]

823089

 32%|███▏      | 280/880 [6:53:46<13:17:22, 79.74s/it]

823095

 32%|███▏      | 281/880 [6:54:51<12:30:03, 75.13s/it]

823107

 32%|███▏      | 282/880 [6:56:00<12:10:41, 73.31s/it]

823114

 32%|███▏      | 283/880 [6:57:03<11:38:14, 70.18s/it]

823118

 32%|███▏      | 284/880 [6:58:10<11:30:01, 69.46s/it]

823124

 32%|███▏      | 285/880 [6:59:18<11:24:08, 68.99s/it]

823127

 32%|███▎      | 286/880 [7:00:21<11:04:54, 67.16s/it]

823137

 33%|███▎      | 287/880 [7:01:26<10:57:42, 66.55s/it]

823162

 33%|███▎      | 288/880 [7:03:09<12:43:36, 77.39s/it]

823187

 33%|███▎      | 289/880 [7:04:25<12:39:31, 77.11s/it]

823213

 33%|███▎      | 290/880 [7:05:44<12:42:36, 77.55s/it]

823225

 33%|███▎      | 291/880 [7:07:16<13:23:50, 81.88s/it]

823227

 33%|███▎      | 293/880 [7:09:47<12:46:49, 78.38s/it]

823241

 33%|███▎      | 294/880 [7:11:16<13:16:54, 81.59s/it]

823251

 34%|███▎      | 295/880 [7:12:52<13:56:26, 85.79s/it]

823283

 34%|███▎      | 296/880 [7:14:22<14:06:32, 86.97s/it]

823302

 34%|███▍      | 297/880 [7:15:46<13:56:58, 86.14s/it]

823323

 34%|███▍      | 298/880 [7:17:15<14:04:01, 87.01s/it]

823346

 34%|███▍      | 299/880 [7:18:50<14:24:55, 89.32s/it]

823347

 34%|███▍      | 300/880 [7:19:58<13:23:25, 83.11s/it]

823352

 34%|███▍      | 301/880 [7:21:19<13:15:22, 82.42s/it]

823357

 34%|███▍      | 302/880 [7:22:48<13:33:18, 84.43s/it]

823370

 34%|███▍      | 303/880 [7:24:12<13:29:40, 84.19s/it]

823379

 35%|███▍      | 304/880 [7:25:27<13:02:30, 81.51s/it]

823380823381

 35%|███▍      | 305/880 [7:26:37<12:27:40, 78.02s/it]

823387

 35%|███▍      | 306/880 [7:28:04<12:52:39, 80.77s/it]

823388

 35%|███▍      | 307/880 [7:29:25<12:51:01, 80.74s/it]

823396

 35%|███▌      | 308/880 [7:30:40<12:32:36, 78.94s/it]

823426

 35%|███▌      | 309/880 [7:32:12<13:09:33, 82.97s/it]

823466

 35%|███▌      | 310/880 [7:33:40<13:23:02, 84.53s/it]

823509

 35%|███▌      | 311/880 [7:35:06<13:26:01, 84.99s/it]

823523

 35%|███▌      | 312/880 [7:36:37<13:40:50, 86.71s/it]

823537

 36%|███▌      | 313/880 [7:37:44<12:44:35, 80.91s/it]

823559

 36%|███▌      | 314/880 [7:39:12<13:03:20, 83.04s/it]

823580

 36%|███▌      | 315/880 [7:40:43<13:22:56, 85.27s/it]

823594

 36%|███▌      | 316/880 [7:42:26<14:12:06, 90.65s/it]

823605

 36%|███▌      | 317/880 [7:43:54<14:03:59, 89.95s/it]

823652

 36%|███▌      | 318/880 [7:46:08<16:05:32, 103.08s/it]

823667

 36%|███▋      | 319/880 [7:47:49<15:58:45, 102.54s/it]

823696

 36%|███▋      | 320/880 [7:49:29<15:47:40, 101.54s/it]

823712

 36%|███▋      | 321/880 [7:51:22<16:20:30, 105.24s/it]

823722

 37%|███▋      | 322/880 [7:52:32<14:40:13, 94.65s/it] 

823738

 37%|███▋      | 323/880 [7:53:55<14:04:53, 91.01s/it]

823757

 37%|███▋      | 324/880 [7:55:33<14:24:23, 93.28s/it]

823791

 37%|███▋      | 325/880 [7:57:19<14:55:48, 96.84s/it]

823794

 37%|███▋      | 326/880 [7:58:26<13:33:35, 88.11s/it]

823800

 37%|███▋      | 327/880 [7:59:49<13:16:12, 86.39s/it]

823826

 37%|███▋      | 328/880 [8:01:28<13:49:35, 90.17s/it]

823850

 37%|███▋      | 329/880 [8:02:54<13:38:30, 89.13s/it]

823859

 38%|███▊      | 330/880 [8:04:27<13:47:55, 90.32s/it]

823861

 38%|███▊      | 331/880 [8:05:57<13:44:54, 90.15s/it]

823888

 38%|███▊      | 332/880 [8:07:26<13:39:25, 89.72s/it]

823904

 38%|███▊      | 333/880 [8:08:38<12:50:46, 84.55s/it]

823914

 38%|███▊      | 334/880 [8:09:48<12:07:27, 79.94s/it]

823921

 38%|███▊      | 335/880 [8:10:35<10:37:09, 70.15s/it]

823931

 38%|███▊      | 336/880 [8:11:45<10:35:17, 70.07s/it]

823967

 38%|███▊      | 337/880 [8:13:22<11:48:24, 78.28s/it]

824008

 38%|███▊      | 338/880 [8:14:58<12:34:44, 83.55s/it]

824026

 39%|███▊      | 339/880 [8:16:09<11:59:56, 79.84s/it]

824035

 39%|███▊      | 340/880 [8:17:23<11:42:58, 78.11s/it]

824064

 39%|███▉      | 341/880 [8:18:45<11:50:40, 79.11s/it]

824081

 39%|███▉      | 342/880 [8:20:01<11:42:13, 78.32s/it]

824098

 39%|███▉      | 343/880 [8:21:21<11:44:19, 78.69s/it]

824102

 39%|███▉      | 344/880 [8:22:21<10:52:41, 73.06s/it]

824139

 39%|███▉      | 345/880 [8:23:45<11:21:45, 76.46s/it]

824168

 39%|███▉      | 346/880 [8:25:04<11:26:41, 77.16s/it]

824197

 39%|███▉      | 347/880 [8:26:53<12:51:18, 86.83s/it]

824210

 40%|███▉      | 348/880 [8:28:23<12:57:16, 87.66s/it]

824215

 40%|███▉      | 349/880 [8:29:26<11:50:01, 80.23s/it]

824238

 40%|███▉      | 350/880 [8:30:54<12:09:00, 82.53s/it]

824246

 40%|███▉      | 351/880 [8:32:08<11:46:40, 80.15s/it]

824254

 40%|████      | 352/880 [8:33:22<11:27:30, 78.13s/it]

824257

 40%|████      | 353/880 [8:34:33<11:07:48, 76.03s/it]

824294

 40%|████      | 354/880 [8:35:58<11:29:43, 78.68s/it]

824307

 40%|████      | 355/880 [8:37:08<11:06:31, 76.17s/it]

824313

 40%|████      | 356/880 [8:38:17<10:46:25, 74.02s/it]

824343

 41%|████      | 358/880 [8:40:47<10:33:42, 72.84s/it]

824344

 41%|████      | 359/880 [8:41:42<9:46:20, 67.53s/it] 

824360

 41%|████      | 360/880 [8:43:06<10:28:20, 72.50s/it]

824369

 41%|████      | 361/880 [8:44:20<10:30:18, 72.87s/it]

824376

 41%|████      | 361/880 [8:45:11<12:35:03, 87.29s/it]


In [21]:
# Preview the first rows (at most 50) of the frame.
# NOTE(review): despite the variable name `rus`, every row displayed in this
# cell's output is tagged 'por' — confirm the name is a leftover from an
# earlier run and consider renaming it where it is defined.
rus.head(50)

Unnamed: 0_level_0,tatoeba_id,language,sentence,sentence_new
tatoeba_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
341066,341066,por,Tom está tocando violino agora.,Luis está tocando violino agora.
341105,341105,por,Tom lidera o time de futebol.,Gabriel lidera o time de futebol.
376504,376504,por,Tom saiu com a namorada na noite do sábado.,Jonas saiu com a namorada na noite do sábado.
377704,377704,por,De manhã Tom disse que a irmã dele ainda está ...,De manhã Santiago disse que a irmã dele ainda ...
377798,377798,por,Ela casou-se com Tom mês passado.,Ela casou-se com Peter mês passado.
384783,384783,por,Tom é um amigo meu.,Jonas é um amigo meu.
384821,384821,por,"Oi, Tom. Bom dia.","Oi, Leonardo. Bom dia."
390826,390826,por,Tom fala mais devagar do que Bill.,Sergei fala mais devagar do que Bill.
393426,393426,por,Eu pedi a Tom para abrir a janela.,Eu pedi a Abraham para abrir a janela.
401058,401058,por,"Tom, o que você gostaria de comer no jantar?","Alexander, o que você gostaria de comer no jan..."


In [22]:
# Save the generator; '../data/objects' is presumably the directory its state
# is written to — confirm against the implementation of save().
generator.save('../data/objects')

In [23]:
# Number of entries currently held in generator.takeback_ids.
len(generator.takeback_ids)

16558

In [24]:
# Export the frame to 'por_tom.csv', a path relative to the current working directory.
# NOTE(review): the frame is called `rus` but the file name says 'por' —
# confirm which language this frame really holds.
rus.to_csv('por_tom.csv')

In [66]:
# Spot-check: for a few source sentence ids, show the text of the first
# English translation on record (None when there is no English translation).
source_ids = [78, 79, 80]

# Keep only the translation links that start at one of the ids and end in English.
ids = data.translations.loc[
    lambda links: links['source'].isin(source_ids) & (links['target_language'] == 'eng')
]

# One list of English target ids per source id.
groups = ids.groupby(['source'])['target'].apply(list)

keys = []
mapping = {}
for source_id in source_ids:
    if source_id in groups.index:
        # Only the first English translation of each source is used.
        key = groups.loc[source_id][0]
        keys.append(key)
        mapping[source_id] = key
    else:
        mapping[source_id] = None

# Resolve the selected target ids to their sentence text.
targets = data.sentences.loc[keys]['sentence'].to_dict()
{
    source: (None if target is None else targets[target])
    for source, target in mapping.items()
}

{78: 'I have to go to sleep.', 79: None, 80: 'What is it?'}

In [140]:
# Spot-check the replacement name stored under key 37052
# (presumably a sentence id — verify against how replacement_map is filled).
generator.replacement_map[37052]

'Jonas'

In [None]:
# Try the name replacer on a Spanish sentence, without a reference sentence.
# NOTE(review): this cell has no execution count, so no result was recorded for it.
replacer.replace_name(new_name='Mario', lang='spa', sentence='Tom es un cabrón')

In [89]:
# Try the name replacer on a French sentence, this time also passing an
# English ref_sentence. The recorded result elides "que" before the new,
# vowel-initial name ("qu'Alejandro").
replacer.replace_name(
    new_name='Alejandro', 
    lang='fra',
    sentence='Il m\'a dit que Tom est gros', 
    ref_sentence='He told me that Tom is fat'
)

Il m'a dit que Tom était gros Il m'a dit qu'Alejandro était gros [('que Tom', "qu'Alejandro")]


"Il m'a dit qu'Alejandro est gros"

In [76]:
import collections
import stanza
import torch

from nltk.tokenize import word_tokenize

# Language codes for which count_people() runs a stanza pipeline and keeps only
# the entities it tags as persons. Every other language falls back to plain
# word tokenisation.
ner_languages = ['spa', 'rus', 'deu', 'fra', 'nld']

# Cache of stanza pipelines keyed by language code, so each one is built once.
stanza_models = {}

def count_people(sentences, lang, regex, sample = 500, random_state = None):
    """Rank the strings matching ``regex`` found in a random sample of sentences.

    Rows of ``sentences`` whose ``language`` equals ``lang`` are sampled. For a
    language listed in ``ner_languages`` each sentence goes through a stanza
    pipeline and only entities of type ``'PER'`` are candidates; for any other
    language every token returned by ``word_tokenize`` is a candidate.

    Args:
        sentences: DataFrame with at least ``language`` and ``sentence`` columns.
        lang: language code to select, as stored in the ``language`` column.
        regex: pattern (string or compiled) applied with ``match``, so it is
            anchored at the start of a candidate but not at its end.
        sample: maximum number of sentences to inspect. When fewer sentences
            exist for ``lang`` all of them are used instead of raising.
        random_state: forwarded to ``DataFrame.sample`` so the draw can be
            reproduced; ``None`` keeps it random.

    Returns:
        List of the distinct matching strings, most frequent first.
    """
    torch.cuda.empty_cache()
    nlp = None
    if lang in ner_languages:
        if lang not in stanza_models:
            # NOTE(review): `iso_map` has to be resolvable as a global here; the
            # visible part of the notebook only shows it as a DataSource class
            # attribute — confirm a module-level alias exists.
            stanza_models[lang] = stanza.Pipeline(iso_map[lang], dir='../data/models/stanza')
        nlp = stanza_models[lang]

    lang_sentences = sentences.loc[sentences['language'] == lang]
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (without replacement), so cap the request at what is available.
    lang_sentences = lang_sentences.sample(
        min(sample, len(lang_sentences)), random_state=random_state
    )

    pattern = re.compile(regex)
    people = collections.Counter()

    for sentence in lang_sentences['sentence']:
        if nlp is not None:
            doc = nlp(sentence)
            candidates = [entity.text for entity in doc.entities if entity.type == 'PER']
        else:
            candidates = word_tokenize(sentence)

        people.update(candidate for candidate in candidates if pattern.match(candidate))

    # most_common() orders by descending count and keeps first-seen order on ties.
    return [name for name, _ in people.most_common()]

In [81]:
# Rank, over a 30,000-sentence English sample, the strings that start with "Tom" or "Mar".
people = count_people(sentences, lang='eng', regex='(Tom|Mar)', sample=30000)

In [82]:
# Inspect the ranking. The pattern is a prefix match, so ordinary words and
# place names (e.g. 'Tomorrow', 'March', 'Marseille') appear alongside real
# name variants and have to be filtered by hand.
people

['Tom',
 'Mary',
 'Maria',
 'Mars',
 'Marie',
 'Tomorrow',
 'Marika',
 'Marine',
 'March',
 'Marilla',
 'Martin',
 'Markku',
 'Tombaugh',
 'Marius',
 'Maritsch',
 'Marseille',
 'Tomo',
 'Martian',
 'Marry',
 'Marcy',
 'Marita',
 'Mariner',
 'Marines',
 'Tomatoes',
 'Maraghna',
 'Tomsk',
 'Marathi',
 'Marek',
 'Marcus',
 'Mark']