In [5]:
import pandas as pd
import os 
import random
from tqdm import tqdm
from nltk.tokenize import sent_tokenize, word_tokenize
from difflib import SequenceMatcher
import re
import pickle
import numpy as np
from transformers import MarianMTModel, MarianTokenizer
import time

# Enable tqdm's pandas integration (adds `progress_apply` and friends to pandas objects).
tqdm.pandas()

In [6]:
# Regex fragments that match the placeholder names "Tom" and "Mary" (including localized or
# inflected variants) in several languages. Outer keys are the canonical names; inner keys are
# the three-letter language codes that are also the keys of `DataSource.iso_map`.
# The fragments carry no word-boundary anchors themselves: `DataSource.name_sentences` adds
# `\b` on both sides before searching.
tom_mary = {
    'Tom':   {
        'eng': 'Tom',
        'spa': 'Tom(ás)?',
        'rus': 'Том(а|ом|у)?',
        'ita': 'Tom(maso)?',
        'fra': '(Tom|Thomas)',
        'hat': 'Tom',
        'por': 'Tom',
        'deu': 'Tom(s)?',
        'nld': '(Tom(s)?|Thomas)',
        'dan': 'Tom(s)?',
        'nob': 'Tom(s|i)?',
        'swe': 'Tom(s|i)?',
        'fin': "Tom(m|')?(i(n|lla|a|lle|sta|lta|in|sta|ssa|kin|i|l|kaan|st|tta)?|kaan)?"
    },
    'Mary': {
        'eng': 'Mar(y|ia|ie)',
        'spa': 'Mar(y|ía|ia)',
        'fra': 'Mar(y|ie|ia)',
        'rus': '(Мэри|Мари(я|и|у|ей)|Маш(у|а|ей|и))',
        'ita': 'Mar(ie|ia|y|i)',
        'hat': 'Mary',
        'por': 'Mar(y|ia)',
        'deu': 'Mar(y|ia|i)(s)?',
        'nld': 'Mar(y|ia|ie|yam)',
        'dan': 'Mar(y|ia|ie|i)(s)?',
        'nob': 'Mar(y|ia|ie|i)(s)?',
        'swe': 'Mar(y|ia|ie|i)(s)?',
        'fin': 'Mar(i|y)(n|a|lle|sta|a|lla|lta|ä|ja|in|stä|yn|llä|aa|en)?'
    }
}

# Pool of replacement first names, keyed by language code and then by gender
# ('male' / 'female'). `DataSource.random_name` draws one entry from the selected list.
# Only an 'eng' pool is defined here.
names = {
    'eng': {
        'male':  [
            'Omar', 'Peter', 'Santiago', 'Daniel', 'William', 'Luis', 'James', 'John', 'Robert', 'Gabriel',
            'Oliver', 'Jonas', 'Charlie', 'Jack', 'Leonardo', 'David', 'Alexander', 'Sergei', 'Abraham', 'Tatsuki'
        ],
        'female': [
            'Anna', 'Emily', 'Natalia', 'Salma', 'Valentina', 'Olivia', 'Amelia', 'Viktoria', 'Anastasia',
            'Maryam', 'Sakura', 'Charlotte', 'Sarah', 'Ashley', 'Samantha', 'Laura', 'Latifa', 'Carlota', 'Eva',
            'Olga'
        ]
    }
}

In [7]:
class DataSource:
    """Tatoeba sentences and translation links, plus name regexes and replacement-name pools.

    On construction the class eagerly reads ``sentences.csv`` and ``links.csv`` from
    ``data_path`` and keeps only the languages listed in ``iso_map``.

    Attributes:
        name_patterns: ``{name: {lang: regex fragment}}`` used to find a name in a sentence.
        new_names: ``{lang: {gender: [names]}}`` pools used by ``random_name``.
        languages: the three-letter language codes that are kept (keys of ``iso_map``).
        sentences: DataFrame with columns ``tatoeba_id``, ``language``, ``sentence``,
            indexed by ``tatoeba_id``.
        translations: DataFrame with columns ``source``, ``target``, ``source_language``,
            ``target_language``; one row per link whose two ends are both in ``sentences``.
    """

    # Three-letter language codes (as they appear in sentences.csv) mapped to the
    # two-letter codes returned by `iso`.
    iso_map = {
        "eng": "en",
        "rus": "ru",
        "ita": "it",
        "deu": "de",
        "fra": "fr",
        "por": "pt",
        "spa": "es",
        "nld": "nl",
        "fin": "fi",
        "dan": "da",
        "swe": "sv",
        "nob": "no",
        "hat": "ht"
    }

    def __init__(self, *, data_path: str, name_patterns : dict, new_names: dict, seed: int = 42):
        """Load the data files and seed Python's global ``random`` module.

        Args:
            data_path: directory containing ``sentences.csv`` and ``links.csv``.
            name_patterns: ``{name: {lang: regex fragment}}``.
            new_names: ``{lang: {gender: [names]}}``.
            seed: seed passed to ``random.seed`` (affects the whole process, not only this object).
        """
        self.name_patterns = name_patterns
        self.new_names = new_names
        self.data_path = data_path
        self.languages = list(self.iso_map.keys())
        self.sentences = self._load_sentences()
        self.translations = self._load_translations()
        random.seed(seed)

    def _load_sentences(self):
        """Read the tab-separated sentences file and keep only the supported languages."""
        path = os.path.join(self.data_path, 'sentences.csv')
        sentences = pd.read_csv(path, sep='\t', names=['tatoeba_id', 'language', 'sentence'])
        sentences = sentences[sentences['language'].isin(self.languages)]
        # The id stays available both as index (for .loc lookups) and as a regular column.
        return sentences.set_index('tatoeba_id', drop=False).sort_index()

    def _load_translations(self):
        """Read the tab-separated links file and annotate both ends with their language.

        Links whose source or target is not among the loaded sentences are dropped
        (the joins leave NaN for them, which ``dropna`` removes).
        """
        path = os.path.join(self.data_path, 'links.csv')
        translations = pd.read_csv(path, sep='\t', names = ['source', 'target'])
        # First join: language of the target sentence.
        translations = translations.set_index('target', drop=False).join(self.sentences).dropna()[[
            'source', 'target', 'language'
        ]].rename(columns = {'language': 'target_language'})

        # Second join: language of the source sentence.
        translations = translations.set_index('source', drop=False).join(self.sentences).dropna()[[
            'source', 'target', 'language', 'target_language'
        ]].rename(columns = {'language': 'source_language'})

        return translations

    def random_name(self, gender: str, lang: str):
        """Return one name drawn at random from ``new_names[lang][gender]``."""
        return random.sample(self.new_names[lang][gender], 1)[0]

    def name_sentences(self, name: str, lang: str):
        """Return the rows of ``sentences`` in ``lang`` that contain ``name`` as a whole word.

        The language-specific regex fragment is wrapped in a non-capturing group before the
        word boundaries are added, so a fragment with a top-level alternation is still anchored
        on both sides. Rows whose sentence is missing (NaN) are treated as non-matching.
        """
        pattern = self.name_patterns[name][lang]
        lang_sentences = self.sentences.loc[self.sentences['language'] == lang]
        name_regex = r'\b(?:' + pattern + r')\b'
        matches = lang_sentences['sentence'].str.contains(name_regex, regex=True, na=False)
        return lang_sentences[matches]

    def name_pattern(self, name: str, lang: str):
        """Return the raw (un-anchored) regex fragment for ``name`` in ``lang``."""
        return self.name_patterns[name][lang]

    def iso(self, lang: str):
        """Return the two-letter code for a three-letter language code."""
        return self.iso_map[lang]

    def find_translations(self, source_ids, tgt_lang):
        """Map each source sentence id to the text of one of its translations in ``tgt_lang``.

        Args:
            source_ids: iterable of tatoeba ids (iterated more than once, so not a generator).
            tgt_lang: three-letter code of the language the translation must be in.

        Returns:
            ``{source_id: sentence text or None}``. When a source has several translations in
            ``tgt_lang`` the first one in ``translations`` order is used; ``None`` means that
            no translation in that language is linked.
        """
        links = self.translations[
            self.translations['source'].isin(source_ids)
            & (self.translations['target_language'] == tgt_lang)
        ]
        # Remember only the first target seen for every source.
        first_target = {}
        for source_id, target_id in zip(links['source'], links['target']):
            first_target.setdefault(source_id, target_id)

        mapping = {source_id: first_target.get(source_id) for source_id in source_ids}
        keys = [target_id for target_id in mapping.values() if target_id is not None]
        targets = self.sentences.loc[keys]['sentence'].to_dict() if keys else {}
        return {k: targets[v] if v is not None else None for (k, v) in mapping.items()}

In [8]:
class NameGenerator:
    """Assigns a replacement name to every sentence that is reachable from a seed set.

    ``replacement_map`` maps a tatoeba sentence id to the replacement name chosen for it.
    A sentence inherits the name of any already-mapped translation; otherwise a fresh random
    name is drawn. The map is grown by following translation links, at most ``max_depth``
    levels away from the sentence the walk started at.
    """

    def __init__(self, *, data: DataSource, name: str, gender: str, init_lang: str, max_depth: int = 15):
        self.data = data
        self.name = name
        self.gender = gender
        self.init_lang = init_lang
        self.max_depth = max_depth
        # tatoeba id -> replacement name
        self.replacement_map = {}
        # ids collected by `takeback`
        self.takeback_ids = []

    def generate(self):
        """Fill the map starting from every ``init_lang`` sentence with the name; return its size."""
        self._fill_replacement_map()
        return len(self.replacement_map)

    def _pickle_path(self, data_path):
        # File name is derived from the lower-cased name, e.g. 'tom_replacements.pkl'.
        return os.path.join(data_path, self.name.lower() + '_replacements.pkl')

    def save(self, data_path):
        """Pickle ``replacement_map`` into ``data_path``."""
        with open(self._pickle_path(data_path), 'wb') as handle:
            pickle.dump(self.replacement_map, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def load(self, data_path):
        """Replace ``replacement_map`` with the pickled map found in ``data_path``."""
        # NOTE: unpickling executes arbitrary code; only load files produced by `save`.
        with open(self._pickle_path(data_path), 'rb') as handle:
            self.replacement_map = pickle.load(handle)

    def get_name_replacement(self, tatoeba_id):
        """Return the replacement name for ``tatoeba_id``, assigning one on demand."""
        already_known = tatoeba_id in self.replacement_map
        if not already_known:
            self._fill_for_ids([tatoeba_id])
        return self.replacement_map[tatoeba_id]

    def takeback(self, tatoeba_id, depth=0):
        """Record ``tatoeba_id`` and, recursively, its translations in ``takeback_ids``."""
        if depth > self.max_depth or tatoeba_id in self.takeback_ids:
            return
        self.takeback_ids.append(tatoeba_id)
        links = self.data.translations
        self._takeback_translations(links[links['source'] == tatoeba_id], depth)

    def _takeback_translations(self, translations, depth):
        # Walk one level further for every linked target.
        for target_id in translations['target']:
            self.takeback(target_id, depth + 1)

    def _fill_replacement_map(self):
        # Seeds are all init_lang sentences that contain the tracked name.
        seeds = self.data.name_sentences(self.name, self.init_lang)
        self._fill_for_ids(seeds['tatoeba_id'])

    def _fill_for_ids(self, sentence_ids, depth=0):
        """Assign names to the unmapped ids in ``sentence_ids`` and then to their translations."""
        if depth > self.max_depth:
            return
        links = self.data.translations
        for tatoeba_id in sentence_ids:
            if tatoeba_id in self.replacement_map:
                continue
            translations = links[links['source'] == tatoeba_id]
            self.replacement_map[tatoeba_id] = self._get_name(translations)
            # Lightweight progress indicator: current map size, overwritten in place.
            print(len(self.replacement_map), end='\r')
            self._fill_translations(translations, depth)

    def _fill_translations(self, translations, depth):
        # Targets are visited language by language, in the order of `data.languages`.
        for language in self.data.languages:
            in_language = translations['target_language'] == language
            self._fill_for_ids(translations.loc[in_language, 'target'], depth + 1)

    def _get_name(self, translations):
        """Reuse the name of the first already-mapped translation, else draw a random one."""
        for target_id in translations['target']:
            if target_id in self.replacement_map:
                return self.replacement_map[target_id]
        return self.data.random_name(self.gender, self.init_lang)

In [9]:
class Translator:
    """Thin wrapper around the Helsinki-NLP MarianMT models with a one-shot translation cache.

    Both ``models`` and ``cache`` are class attributes and therefore shared by all instances.
    """

    # Loaded (model, tokenizer) pairs keyed by model name.
    models = {}
    # Pre-computed translations keyed by (text, src_lang, tgt_lang). An entry is removed the
    # first time `translate` serves it.
    cache = {}

    def translate(self, text: str, src_lang: str, tgt_lang: str):
        """Translate a single text, serving (and consuming) a cached result when available."""
        # The language pair is part of the key so that the same text cached for one target
        # language is never returned for another.
        cached = self.cache.pop((text, src_lang, tgt_lang), None)
        if cached is not None:
            return cached
        translations = self.translate_batch([text], src_lang, tgt_lang)
        return translations[0]

    def translate_batch(self, texts: list, src_lang: str, tgt_lang: str):
        """Translate ``texts`` in one model call and return the cleaned-up translations.

        An empty ``texts`` returns ``[]`` without loading a model.
        """
        if not texts:
            return []

        model, tokenizer = self._load_model(src_lang, tgt_lang)

        # The target language is specified as a special token within the source string
        batch = [f'>>{tgt_lang}<< {text}' for text in texts]

        if hasattr(tokenizer, 'prepare_translation_batch'):
            encoded = tokenizer.prepare_translation_batch(batch)
        else:
            # Newer transformers releases dropped prepare_translation_batch; calling the
            # tokenizer directly is the documented replacement.
            encoded = tokenizer(batch, return_tensors='pt', padding=True)
        translated = model.generate(**encoded)
        translations = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

        # Resulting translations sometimes contain weird characters at the start, remove them
        translations = [re.sub(r'^(▸|▪|::|\-|□|–|"|\*|♪)\s*', '', t) for t in translations]

        # Drop a trailing period the model added when the source text did not end with one.
        return [
            translation if text.endswith('.') else re.sub(r'\.$', '', translation)
            for text, translation in zip(texts, translations)
        ]

    def cache_translations(self, texts: list, src_lang: str, tgt_lang: str):
        """Translate ``texts`` as one batch and store the results for later ``translate`` calls."""
        translations = self.translate_batch(texts, src_lang, tgt_lang)

        for text, translation in zip(texts, translations):
            self.cache[(text, src_lang, tgt_lang)] = translation

    def _get_model_name(self, src_lang, tgt_lang):
        """Build the Helsinki-NLP model name for a pair of two-letter language codes."""
        # There are only two models from romance languages to English and back
        romance_langs = ["fr", "es", "it", "pt", "ro", "ca", "gl", "la", "wa", "fur", "oc", "sc", "an", "frp", "lad", "vec", "co", "lld", "lij", "lmo", "nap", "rm", "scn", "mwl"]
        src_lang = 'ROMANCE' if src_lang in romance_langs and tgt_lang == 'en' else src_lang
        tgt_lang = 'ROMANCE' if tgt_lang in romance_langs and src_lang == 'en' else tgt_lang
        # Norwegian as a target is addressed through the 'NORWAY' name.
        tgt_lang = 'NORWAY' if tgt_lang == 'no' else tgt_lang
        return 'Helsinki-NLP/opus-mt-{0}-{1}'.format(src_lang, tgt_lang)

    def _load_model(self, src_lang, tgt_lang):
        """Return the (model, tokenizer) pair for the language pair, loading it on first use."""
        model_name = self._get_model_name(src_lang, tgt_lang)
        if model_name not in self.models:
            print('Loading ' + model_name)
            model = MarianMTModel.from_pretrained(model_name)
            tokenizer = MarianTokenizer.from_pretrained(model_name)
            # Save the loaded model in 'models' to speed up its later use
            self.models[model_name] = (model, tokenizer)
        else:
            model, tokenizer = self.models[model_name]
        return model, tokenizer

In [11]:
class NameReplacer:
    """Replaces the tracked name (e.g. "Tom") inside a sentence with a new name.

    For the languages in ``basic_langs`` the name is substituted directly with a regex.
    For every other language the replacement text is derived by translating a reference
    sentence twice (with the old and with the new name) and diffing the two translations.
    """

    # Languages handled by direct regex substitution.
    basic_langs = ['eng', 'spa', 'ita', 'hat', 'por']

    def __init__(self, *, name: str, translator: "Translator", data: "DataSource", init_lang: str):
        """
        Args:
            name: key of the tracked name in the data source's patterns (e.g. 'Tom').
            translator: object providing ``translate(text, src_lang, tgt_lang)``.
            data: object providing ``name_pattern(name, lang)`` and ``iso(lang)``.
            init_lang: language of the reference sentences; must be one of ``basic_langs``.
        """
        assert init_lang in self.basic_langs
        self.translator = translator
        self.data = data
        self.init_lang = init_lang
        self.name = name

    def replace_name(self, *, new_name: str, lang: str, sentence: str, ref_sentence: str = None):
        """Return ``sentence`` with the tracked name replaced by ``new_name``.

        ``ref_sentence`` (a version of the sentence in ``init_lang``) is required for languages
        outside ``basic_langs``. For those languages ``None`` is returned when no unambiguous
        replacement could be derived.
        """
        if lang in self.basic_langs:
            return self._replace_basic(new_name, sentence, lang)
        else:
            return self._replace_inflected(new_name, sentence, ref_sentence, lang)

    def is_basic(self, lang):
        """Tell whether ``lang`` is handled by direct substitution."""
        return lang in self.basic_langs

    def _name_regex(self, lang: str):
        # Whole-word version of the language's name pattern, matching how sentences are
        # selected in the first place (\b on both sides).
        return r'\b(?:' + self.data.name_pattern(self.name, lang) + r')\b'

    def _replace_basic(self, new_name: str, sentence: str, lang: str):
        """Substitute every whole-word occurrence of the name with ``new_name``."""
        # A function replacement inserts new_name literally (no backslash/group expansion).
        return re.sub(self._name_regex(lang), lambda _match: new_name, sentence)

    def _replace_inflected(self, new_name: str, sentence: str, ref_sentence: str, lang: str):
        """Derive the inflected replacement through translation and apply it to ``sentence``.

        Returns ``None`` when the two translations differ in zero or several places, or when
        the differing span of the first translation does not contain the tracked name.
        """
        assert ref_sentence is not None

        # Translate the original sentence (with the old name) into the target language
        ref_translated = self.translator.translate(
            ref_sentence, self.data.iso(self.init_lang), self.data.iso(lang)
        )
        # Replace the name in the original sentence with the new name
        ref_replaced = self.replace_name(
            new_name=new_name, lang=self.init_lang, sentence=ref_sentence
        )
        # Translate the previous sentence (with the new name) into the target language
        ref_replaced_translated = self.translator.translate(
            ref_replaced, self.data.iso(self.init_lang), self.data.iso(lang)
        )

        # Get the difference between both translations, hoping that the thing that changed was the name or some
        # surrounding text, in order to find the most likely inflection of the replaced name in the target language
        repls = self._find_replacements(ref_translated, ref_replaced_translated)

        # Don't replace unless there is exactly one replacement, because the ambiguity might cause serious mistakes.
        if len(repls) != 1:
            return None

        changed, substitute = repls[0]
        old_name = self._name_regex(lang)
        # If the changed span does not contain the name, give up (the caller gets None)
        if re.search(old_name, changed) is None:
            return None

        # Build the search pattern: the concrete name form is swapped for the name regex so
        # that all variations match, while the surrounding text is escaped and matched literally.
        parts = []
        cursor = 0
        for match in re.finditer(old_name, changed):
            parts.append(re.escape(changed[cursor:match.start()]))
            parts.append(old_name)
            cursor = match.end()
        parts.append(re.escape(changed[cursor:]))
        name_pattern = ''.join(parts)
        return re.sub(name_pattern, lambda _match: substitute, sentence)

    def _find_replacements(self, base, replaced):
        """Return ``(old_text, new_text)`` pairs for every token span that differs.

        Both strings are tokenized with ``word_tokenize``; only 'replace' opcodes are reported
        and the tokens of each span are joined with single spaces.
        """
        base = word_tokenize(base)
        replaced = word_tokenize(replaced)
        matcher = SequenceMatcher(None, base, replaced)
        return [
            (' '.join(base[i1:i2]), ' '.join(replaced[j1:j2]))
            for tag, i1, i2, j1, j2 in matcher.get_opcodes()
            if tag == 'replace'
        ]

In [12]:
class BulkReplacer:
    """Runs the name replacement over every sentence of a language, chunk by chunk."""

    # Approximate number of sentences processed (and translated) per chunk.
    chunk_size = 150

    def __init__(self, *, generator: "NameGenerator", data: "DataSource"):
        """
        Args:
            generator: provides the tracked name, ``init_lang``, the per-sentence replacement
                names and the ``takeback`` bookkeeping.
            data: provides sentence lookup, translation lookup and language-code mapping.
        """
        self.data = data
        self.generator = generator
        self.init_lang = generator.init_lang
        self.translator = Translator()
        self.replacer = NameReplacer(
            name=generator.name, init_lang=self.init_lang, translator=self.translator, data=data
        )

    def replace_lang_names(self, lang):
        """Return the sentences of ``lang`` that contain the name, with a ``sentence_new`` column.

        ``sentence_new`` holds the sentence with the name replaced, or the unchanged sentence
        when no replacement could be made.
        """
        lang_sentences = self.data.name_sentences(self.generator.name, lang)
        n_chunks = len(lang_sentences) // self.chunk_size + 1
        chunks = []
        # Split row positions rather than the frame itself; every chunk is then an
        # independent copy that can safely receive a new column.
        for positions in tqdm(np.array_split(np.arange(len(lang_sentences)), n_chunks)):
            chunk = lang_sentences.iloc[positions].copy()
            refs = {}
            if not self.replacer.is_basic(lang):
                refs = self._prepare_refs(chunk, lang)
            chunk['sentence_new'] = [
                self._replace_row(row, lang, refs) for _, row in chunk.iterrows()
            ]
            chunks.append(chunk)
        return pd.concat(chunks)

    def _prepare_refs(self, chunk, lang):
        """Look up reference sentences for a chunk and pre-translate them in two batches.

        Both the references and their name-replaced versions are pushed into the translator
        cache, so the later per-row ``translate`` calls do not each trigger a model run.

        Returns:
            ``{tatoeba_id: reference sentence in init_lang or None}``.
        """
        tatoeba_ids = chunk['tatoeba_id'].to_list()
        refs = self.data.find_translations(tatoeba_ids, self.init_lang)
        src_iso = self.data.iso(self.init_lang)
        tgt_iso = self.data.iso(lang)

        texts = [ref for ref in refs.values() if ref is not None]
        if texts:
            self.translator.cache_translations(texts, src_iso, tgt_iso)

        replaced_refs = []
        for tatoeba_id in tatoeba_ids:
            ref = refs.get(tatoeba_id)
            if ref is not None:
                new_name = self.generator.get_name_replacement(tatoeba_id)
                replaced_refs.append(
                    self.replacer.replace_name(new_name=new_name, lang=self.init_lang, sentence=ref)
                )

        if replaced_refs:
            self.translator.cache_translations(replaced_refs, src_iso, tgt_iso)
        return refs

    def _replace_row(self, row, lang, refs):
        """Replace the name in one row; on failure keep the sentence and register a takeback."""
        new_name = self.generator.get_name_replacement(row['tatoeba_id'])
        ref = refs.get(row['tatoeba_id'])
        if ref is None and not self.replacer.is_basic(lang):
            # No linked reference sentence: fall back to a machine translation into init_lang.
            ref = self.translator.translate(
                row['sentence'], self.data.iso(lang), self.data.iso(self.init_lang)
            )

        new_sentence = self.replacer.replace_name(
            new_name=new_name, lang=lang, sentence=row['sentence'], ref_sentence=ref
        )

        if new_sentence is None:
            new_sentence = row['sentence']
            self.generator.takeback(row['tatoeba_id'])
        return new_sentence

In [13]:
# Load the Tatoeba dump and wire in the name regexes and replacement-name pools defined above.
# The constructor reads sentences.csv and links.csv from data_path right away.
data = DataSource(data_path='../data/sources/tatoeba', name_patterns=tom_mary, new_names=names)

In [14]:
# Generator for the male placeholder name "Tom", seeded from the English sentences.
generator = NameGenerator(name='Tom', gender='male', init_lang='eng', data=data)

# Reuse the replacement map pickled by an earlier run. To rebuild it from scratch instead,
# call generator.generate() followed by generator.save('../data/objects').
generator.load('../data/objects')

bulk = BulkReplacer(generator=generator, data=data)

In [None]:
# Time one replacement pass over a single language; the last expression shows the seconds elapsed.
# NOTE(review): the result is bound to `rus` although the language passed is 'eng' — confirm
# which language this cell is meant to process.
start = time.time()
rus = bulk.replace_lang_names('eng')
end = time.time()
end-start

In [22]:
# Persist the replacement map again: get_name_replacement adds entries on demand, so the map
# may have grown during the replacement pass.
generator.save('../data/objects')

In [24]:
# NOTE(review): the file is named 'por_tom.csv' while the frame is called `rus` and the visible
# cell that creates it passes lang='eng' — confirm the file name matches the data written.
rus.to_csv('por_tom.csv')

In [76]:
import collections
import stanza
import torch

from nltk.tokenize import word_tokenize

# Languages for which `count_people` runs a stanza pipeline and keeps only entities typed 'PER';
# for any other language it falls back to plain word tokens.
ner_languages = ['spa', 'rus', 'deu', 'fra', 'nld']

# Loaded stanza pipelines keyed by language code, so each pipeline is built at most once.
stanza_models = {}

def count_people(sentences, lang, regex, sample = 500, random_state = None):
    """List the person-like tokens matching ``regex`` in a sample of sentences, most frequent first.

    Args:
        sentences: DataFrame with at least the columns ``language`` and ``sentence``.
        lang: three-letter language code selecting the rows to look at.
        regex: pattern applied with ``re.match`` (i.e. anchored at the start) to every candidate.
        sample: maximum number of sentences to sample; capped at the number available.
        random_state: optional seed forwarded to ``DataFrame.sample`` for reproducible runs.

    Returns:
        The distinct matching strings, sorted by how often they occurred (descending).
    """
    # NOTE(review): presumably frees GPU memory left over from earlier runs — confirm it is needed.
    torch.cuda.empty_cache()
    nlp = None
    if lang in ner_languages:
        if lang not in stanza_models:
            # stanza takes the two-letter code; the mapping lives on DataSource.
            stanza_models[lang] = stanza.Pipeline(DataSource.iso_map[lang], dir='../data/models/stanza')
        nlp = stanza_models[lang]

    lang_sentences = sentences.loc[sentences['language'] == lang]
    # Never ask for more rows than exist; DataFrame.sample would raise otherwise.
    lang_sentences = lang_sentences.sample(min(sample, len(lang_sentences)), random_state=random_state)

    people = collections.Counter()

    for index, row in lang_sentences.iterrows():
        if nlp is not None:
            doc = nlp(row['sentence'])
            persons = [entity.text for entity in doc.entities if entity.type == 'PER']
        else:
            # No NER for this language: every token is a candidate and the regex does the filtering.
            persons = word_tokenize(row['sentence'])

        for person in persons:
            if re.match(regex, person):
                people[person] += 1

    return sorted(people, key=people.get, reverse=True)

In [81]:
# Distinct English tokens starting with "Tom" or "Mar", most frequent first. The sentences come
# from the loaded DataSource so the cell also works after Restart & Run All.
people = count_people(data.sentences, 'eng', '(Tom|Mar)', 30000)