In [1]:
from googletrans import Translator
translator = Translator()
import pandas as pd
import numpy as np
import os
import re
import sys
import datetime

sys.path.append('../python_scripts/')
from bulkTranslate import *

In [2]:
def split_dest(dest_str : str) -> list:
    """Split translated chunk by line and clean numbers and blank spaces"""
    dest_list = re.split('\n', dest_str)
    nonum = [re.sub(r'[0-9,.;:]', '', t) for t in dest_list]
    dest_clean = [e.strip() for e in nonum]    
    return dest_clean

def bulk_translate(src : pd.Series, dest_lang : str) -> pd.Series:
    """Send single src_str for bulk translation request to server,
    split retrieved dest_str into its entries using RegEx.
    Returns dest : final formatted Series in dest_lang language"""

    dest_str = translator.translate(src.to_string(), src=src.name, dest=dest_lang).text

    dest_clean = split_dest(dest_str)

    dest = pd.Series(dest_clean, name = dest_lang)
    
    print('Attempted translation of %i entries. Check DB for mistranslations.' %len(dest))
    return dest

In [3]:
def test_long_sentence(src):
    """Search entries with more than 3 words and drop them"""
    for i,w in enumerate(src):
        if ' ' in w:
            sentence = w
            if len(re.findall(' ', sentence)) >= 3:
                print("Entry with more than 3 words detected: ", sentence)
                src.pop(i)
    src.reset_index(drop=True, inplace=True)
    return src

def test_split_dest(dest_str):
    dest_list = re.split('\n', dest_str)
    dest_clean = split_dest(dest_str)
    assert len(dest_clean) == len(dest_list), "Incorrect split and number cleaning of dest_str"

def test_bulk_translate(src : pd.Series, dest_lang : str):
    dest = bulk_translate(src, dest_lang)
    assert len(src) == len(dest), 'bulk_translate error: len(dest) does not match len(src)'

In [4]:
class Translation:

    def __init__(self, cadera_path : str, dest : str = None, src : str = None, color : str = 'blue') -> pd.Series :
        """ Import source CADERA file and keep only column specified as 'color' """
        self.dfSrc = pd.read_csv(cadera_path, index_col=0)[color].dropna()
        self.src : str = src
        self.dest : str = dest
        self.translator = Translator()

    def detect_src(self, N : int = 0):
        """Auto-detect languages in wordset given and arrange them by occurrences
        Parameters:
        N : int = 0
            Number of words to sample from CADERA series. If left to 0, the whole Series is taken"""

        def compute_highest_scoring_language(dictNorm : dict, dictWeights : dict) -> dict:
            """Return highest scoring weighted language over the sample.
            The probability for each language results from summing over all occurrences the product of two factors:
                - Occurrences_language / sum(occurrences_languages)
                - confidence_occurrence """
            ret = dict()
            normFactor = sum(dictNorm.values())
            for key, language in dictWeights.items():
                ret[key] = language*dictNorm.get(key, 1) / normFactor
            return ret

        if N != 0:
            sample = self.dfSrc.sample(N)
        else:
            sample = self.dfSrc

        lang = [translator.detect(w).lang for w in sample]
        conf = [translator.detect(w).confidence for w in sample]
        dfLang = pd.DataFrame({'lang':lang, 'conf':conf})

        dictNorm = dfLang.lang.value_counts().to_dict()  # Dictionary with number of occurences per language
        normFactor = sum(dictNorm.values())              # Total number of words in sample

        a = dfLang.groupby('lang').sum().to_dict()['conf']
        dictWeights = {k: v / normFactor for k, v in a.items()}   # Weights per language, averaged by occurrences

        ret = compute_highest_scoring_language(dictNorm, dictWeights)
        maximum = max(ret, key=ret.get)
        return maximum, ret#[maximum]

In [25]:
cder_path = '/Users/pabloherrero/Documents/ManHatTan/CADERAs/Die_Verwandlung_updated.cder'
cadera = pd.read_csv(cder_path, index_col=0)['blue']
cadera.to_string()

'0                                             Ungeziefer\n1                                          Versteifungen\n2                                                 Umfang\n3                                             flimmerten\n4                                              versehen,\n5                                           Fensterblech\n6                                        undurchführbar,\n7                                             schaukelte\n8                                                 Jucken\n9                                                   Pult\n10                                                Zeiger\n11                                          Donnerwetter\n12                                              Rückgrat\n13                                              Einwände\n14                                         arbeitsscheue\n15                                                derart\n16                                           ausführlich\n17           

In [27]:
translator.detect(cadera.to_string()).lang, translator.detect(cadera.to_string()).confidence

('de', 1.0)

## Testing the package 

In [40]:
from googletrans import LANGCODES
langKeys = list(LANGUAGES.keys())
if 'ax' not in langKeys:
    print('Invalid lang')
print(langKeys)

Invalid lang
['af', 'sq', 'am', 'ar', 'hy', 'az', 'eu', 'be', 'bn', 'bs', 'bg', 'ca', 'ceb', 'ny', 'zh-cn', 'zh-tw', 'co', 'hr', 'cs', 'da', 'nl', 'en', 'eo', 'et', 'tl', 'fi', 'fr', 'fy', 'gl', 'ka', 'de', 'el', 'gu', 'ht', 'ha', 'haw', 'iw', 'hi', 'hmn', 'hu', 'is', 'ig', 'id', 'ga', 'it', 'ja', 'jw', 'kn', 'kk', 'km', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mi', 'mr', 'mn', 'my', 'ne', 'no', 'ps', 'fa', 'pl', 'pt', 'pa', 'ro', 'ru', 'sm', 'gd', 'sr', 'st', 'sn', 'sd', 'si', 'sk', 'sl', 'so', 'es', 'su', 'sw', 'sv', 'tg', 'ta', 'te', 'th', 'tr', 'uk', 'ur', 'uz', 'vi', 'cy', 'xh', 'yi', 'yo', 'zu', 'fil', 'he']


In [2]:
translator.detect('ciao').lang, translator.detect('ciao').confidence

('en', 0.70463574)

#####  Google's translate API might be blocking the IP, tested with

In [21]:
translator.translate('ciao', src='it', dest='en').text, translator.translate('ciao', src='it', dest='en').pronunciation

('Hello', 'Hello')

In [23]:
translator.translate('ciao', src='it', dest='en').

'ciao'

In [22]:
translator.translate('ciao', src='it', dest='en').extra_data

{'translation': [['Hello', 'ciao', None, None, 1]],
 'all-translations': [['interjection',
   ['Hello!',
    'Hi!',
    'Bye!',
    'Bye-Bye!',
    'Hallo!',
    'So long!',
    'Cheerio!',
    'Hullo!'],
   [['Hello!',
     ['Ciao!', 'Salve!', 'Pronto!', 'Pronto?', 'Piacere!'],
     None,
     0.5028316],
    ['Hi!', ['Ciao!', 'Salve!', 'Piacere!'], None, 0.22313017],
    ['Bye!', ['Arrivederci!', 'Ciao!', 'Addio!']],
    ['Bye-Bye!', ['Arrivederci!', 'Arrivederla!', 'Ciao!', 'Addio!']],
    ['Hallo!', ['Ciao!', 'Salve!', 'Pronto!']],
    ['So long!', ['Ci vediamo!', 'Ciao!']],
    ['Cheerio!', ['Cincin!', 'Ciao!']],
    ['Hullo!', ['Pronto!', 'Ciao!']]],
   'Ciao!',
   9]],
 'original-language': 'it',
 'possible-translations': [['ciao',
   None,
   [['Hello', 1000, True, False], ['Hi', 1000, True, False]],
   [[0, 4]],
   'ciao',
   0,
   0]],
 'confidence': 0.9629139,
 'possible-mistakes': None,
 'language': [['it'], None, [0.9629139], ['it']],
 'synonyms': None,
 'definitions': Non

# Apply on data [tests] 

In [27]:
cadera_path = '/Users/pabloherrero/Documents/ManHatTan/CADERAs/Il castello dei destini incrociati - Notizbuch.cder'
df = pd.read_csv(cadera_path, index_col=0)
src = df.blue.dropna()
src.name = 'it'

## Request whole document translation instead of placing word-by-word requests

Avoid excess of request firewall

In [28]:
src.to_string()

'0            rimpicciolita\n1              compiutezza\n2                    rozzo\n3            mi resi conto\n4               cruciverba\n5                  anziché\n6                 riscosse\n7               l’incastro\n8                   tenuta\n9                      giù\n10                  matto?\n11                coricavo\n12                fumetti:\n13          bruciacchiato:\n14          sopravvissuti,\n15               esaurito.\n16                  Perciò\n17             schiacciato\n18                    fede\n19                    anzi\n20                  pecora\n21             manganello,\n22                 balilla\n23               nuociuto:\n24                 disagio\n25                 altrui,\n26                  preti»\n27                 lontano\n28             giovinezza,\n29               ostaggio,\n30                  scelta\n31                slancio,\n32                  galera\n33                   buia,\n34                  fiato;\n35                 

### Remove non-alphanumeric symbols

In [29]:
src_str = re.sub('[\W_](?<![\n\s])', '', src.to_string())

In [30]:
src_list = re.split(pattern = '\n\d+\s+', string = src_str)
src_list[0] = re.sub(pattern='\d\s+', repl='', string= src_list[0])
src = pd.Series(src_list, name = 'it')

In [31]:
dest_str = translator.translate(src_str, src='it', dest='es').text
dest_str

'0 encogido\n1 finalización\n2 áspera\n3 me di cuenta\n4 crucigramas\n5 en vez\n6 cargada\n7 lincastro\n8 celebrada\nun 9\n10 loco\n11 fueron a la cama\n12 cómics\n13 chamuscados\n14 sobrevivientes\n15 agotado\n16 Por tanto,\n17 aplastados\n18 fe\n19 de hecho\n20 ovejas\n21 porra\n22 de la tabla\n23 perjudicados\n24 incómoda\notras 25\n26 sacerdotes\n27 de distancia\n28 jóvenes\n29 rehenes\n30 elección\n31 de impulso\n32 cárcel\n33 oscuro\n34 respiración\n35 Lagio\n36 paso\n37 Inn\n38 bancos\n39 magnífica\n40 cabeceo\n41 jengibre\n42 rebanada\n43 estaban pagando\n44 vajilla\n45 asumen\n46 labios\nGypsy 47\n48 brillado\n49 Sin embargo,\n50 similitud\n51 saccingeva\n52 rubia\n53 ricos\n54 luto\n55 suntuosa saffrettò\n56 dudas\n57 estirada\n58 ladrón\n59 despojados\n60 chica\n61 leñador\n62 Lasso\n63 bocados inactivó\n64 disfrutan\n65 tierna\n66 Dovizioso\n67 boda\n68 monello\n69 traqueteo\n70 cuchillas\n71 de horror\n72 bruja'

## Split whole list by mathing pattern '\n\d+'

In [32]:
dest_list = re.split(pattern = '\n\d+', string = dest_str)
dest_list[0] = dest_list[0][2:]
dest = pd.Series(dest_list, name = 'es')
dest

0                encogido
1            finalización
2                  áspera
3            me di cuenta
4             crucigramas
5                  en vez
6                 cargada
7               lincastro
8         celebrada\nun 9
9                    loco
10       fueron a la cama
11                 cómics
12            chamuscados
13         sobrevivientes
14                agotado
15             Por tanto,
16             aplastados
17                     fe
18               de hecho
19                 ovejas
20                  porra
21            de la tabla
22           perjudicados
23     incómoda\notras 25
24             sacerdotes
25           de distancia
26                jóvenes
27                rehenes
28               elección
29             de impulso
             ...         
40               rebanada
41        estaban pagando
42                vajilla
43                 asumen
44       labios\nGypsy 47
45               brillado
46           Sin embargo,
47          

In [33]:
dest[44]

' labios\nGypsy 47'

##### Some retrieved translation do not follow this pattern: loop over them

In [34]:
for i, w in enumerate(dest_list):
    if('\n' in w):
        subsplit = re.split(pattern = '\n', string = w)
        subsplit[1] = re.split('\w\d', subsplit[1])[0]
        dest_list[i] = subsplit[0]
        dest_list.insert(i+1, subsplit[1])
        print(subsplit[0], subsplit[1], i)
dest = pd.Series(dest_list, name = 'es')
dest

 celebrada un 9 8
 incómoda otras  24
 labios Gypsy  46


0                encogido
1            finalización
2                  áspera
3            me di cuenta
4             crucigramas
5                  en vez
6                 cargada
7               lincastro
8               celebrada
9                    un 9
10                   loco
11       fueron a la cama
12                 cómics
13            chamuscados
14         sobrevivientes
15                agotado
16             Por tanto,
17             aplastados
18                     fe
19               de hecho
20                 ovejas
21                  porra
22            de la tabla
23           perjudicados
24               incómoda
25                 otras 
26             sacerdotes
27           de distancia
28                jóvenes
29                rehenes
             ...         
43        estaban pagando
44                vajilla
45                 asumen
46                 labios
47                 Gypsy 
48               brillado
49           Sin embargo,
50          

## Append both columns in dictionary df

In [250]:
today = datetime.datetime.today()

dicdf = pd.DataFrame([src, dest]).T

dicdf.name = os.path.splitext(os.path.basename(cadera_path))[0]
dicdf['creation_time'] = today
dicdf

Unnamed: 0,it,es,creation_time
0,rimpicciolita,encogido,2019-12-30 18:52:19.333246
1,compiutezza,finalización,2019-12-30 18:52:19.333246
2,rozzo,áspera,2019-12-30 18:52:19.333246
3,mi resi conto,me di cuenta,2019-12-30 18:52:19.333246
4,cruciverba,crucigramas,2019-12-30 18:52:19.333246
5,anziché,en vez,2019-12-30 18:52:19.333246
6,riscosse,cargada,2019-12-30 18:52:19.333246
7,lincastro,lincastro,2019-12-30 18:52:19.333246
8,tenuta,celebrada,2019-12-30 18:52:19.333246
9,giù,un 9,2019-12-30 18:52:19.333246


## Write file with translated df

In [251]:
# def write_cadera(krahtos : str, cadera : pd.DataFrame):
"""Convert CADERA (full) basename into '.got' extension and flush dictdf df"""
pathname = os.path.splitext(os.path.abspath(cadera_path))[0]
path, filename = os.path.split(pathname)
dirPath, _ = os.path.split(path)
fpath = os.path.join(dirPath, 'GOTAs', filename+'.got')
dicdf.to_csv(fpath)
print('Created GOTA file %s' %fpath)

Created GOTA file /Users/pabloherrero/Documents/ManHatTan/GOTAs/Il castello dei destini incrociati - Notizbuch.got


## Detect translation failures

In [12]:
translator.detect(en[1]).lang, translator.detect(en[1]).confidence

('en', 1.0)

In [13]:
[translator.detect(w).lang for w in es], [translator.detect(w).confidence for w in es], 

(['de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'en',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'dazh-CN',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'nl',
  'de',
  'nofi',
  'en',
  'de',
  'de',
  'af',
  'de',
  'de',
  'de',
  'de',
  'de'],
 [1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  0.66926074,
  0.69462645,
  1.0,
  0.963038,
  1.0,
  0.940574,
  1.0,
  1.0,
  0.9830669,
  1.0,
  0.921875,
  1.0,
  1.0,
  0.5,
  0.87118214,
  1.0,
  1.0,
  0.98828125,
  1.0,
  1.0,
  0.33590734,
  1.0,
  0.53125,
  0.73307294,
  0.88235295,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  0.42490163,
  1.0])

##### In this case both translations were detected as failed, though only the first one, 'dauntlessly' was
Thus, by checking with detect we're introducing an additional complication...

We could relax this by trusting that the translation is reliable as long as it is in a different language than the original

In [212]:
fails = np.where(translator.detect(w).lang == 'en' for w in es)[0]
# src = [w.src for w in trans]
fails

array([0])

### In this case, let's see what the details say

In [150]:
trans[fails[0]].extra_data

{'translation': [['dauntlessly', 'dauntlessly', None, None, 0]],
 'all-translations': None,
 'original-language': 'en',
 'possible-translations': [['dauntlessly',
   None,
   [['dauntlessly', 998, True, False],
    ['sin desánimo', 1, True, False],
    ['impávido', 0, True, False],
    ['denodadamente', 0, True, False],
    ['intrépidamente', 0, True, False]],
   [[0, 11]],
   'dauntlessly',
   0,
   1]],
 'confidence': 1.0,
 'possible-mistakes': None,
 'language': [['en'], None, [1.0], ['en']],
 'synonyms': [['adverbio',
   [[['intrepidly', 'fearlessly'], '']],
   'dauntlessly']],
 'definitions': None,
 'examples': None,
 'see-also': None}

In [145]:
trans[fails[0]].extra_data['synonyms']

[['adverbio', [[['intrepidly', 'fearlessly'], '']], 'dauntlessly']]

### Get next option from possible translations

In [166]:
from copy import deepcopy

def flatten_str_list(nested_list):
    """Flatten an arbitrarily nested list, without recursion (to avoid
    stack overflows). Returns a new list, the original list is unchanged.
    >> list(flatten_list([1, 2, 3, [4], [], [[[[[[[[[5]]]]]]]]]]))
    [1, 2, 3, 4, 5]
    >> list(flatten_list([[1, 2], 3]))
    [1, 2, 3]
    """
    nested_list = deepcopy(nested_list)
    
    while nested_list:
        sublist = nested_list.pop(0)

        if isinstance(sublist, list):
            nested_list = sublist + nested_list
        else:
            if type(sublist) == str:
                yield sublist

In [169]:
possibleTrans = trans[fails[0]].extra_data['possible-translations']
possibleTrans = [w for w in flatten_list(possibleTrans)]
possibleTrans

['dauntlessly',
 'dauntlessly',
 'sin desánimo',
 'impávido',
 'denodadamente',
 'intrépidamente',
 'dauntlessly']

In [175]:
possibleTrans[np.where([translator.detect(w).lang != 'en' for w in possibleTrans])[0][0]]

'sin desánimo'

## Auto-detect likely overall source language

##### We will use the translated [es] Series for testing since it may contain translation failures

In [416]:
lang = [translator.detect(w).lang for w in es]
conf = [translator.detect(w).confidence for w in es]

##### Arrange detected languages and confidence level in "unbinned" dataframe

In [417]:
dfLang = pd.DataFrame({'lang':lang, 'conf':conf})
dfLang.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,64,65,66,67,68,69,70,71,72,73
lang,es,es,laes,es,it,es,es,es,ptes,ites,...,espt,es,es,es,es,sv,es,gles,pt,ptes
conf,1,1,0.511719,1,0.609375,1,0.61344,1,0.509728,0.488281,...,0.621094,1,1,0.640625,1,0.356434,1,0.589844,0.75,0.509728


##### Group by languages, count occurrences and sum confidence (probabilities)

In [427]:
dictNorm = dfLang.lang.value_counts().to_dict()
normFactor = sum(dictNorm.values())

dictNorm

{'es': 37,
 'en': 13,
 'pt': 3,
 'it': 2,
 'ro': 2,
 'ites': 2,
 'ptes': 2,
 'laes': 1,
 'lv': 1,
 'esla': 1,
 'id': 1,
 'esgl': 1,
 'espt': 1,
 'sv': 1,
 'gles': 1,
 'sq': 1,
 'esit': 1,
 'tl': 1,
 'ceb': 1,
 'no': 1}

In [432]:
a = dfLang.groupby('lang').sum().to_dict()['conf']
dictWeights = {k: v / normFactor for k, v in a.items()}
dictWeights

{'ceb': 0.013513513513513514,
 'en': 0.16151121472972974,
 'es': 0.4588738939189189,
 'esgl': 0.008039908108108108,
 'esit': 0.007414029189189189,
 'esla': 0.0071790540540540545,
 'espt': 0.008393158783783784,
 'gles': 0.007970861486486486,
 'id': 0.004592483108108108,
 'it': 0.011949667567567566,
 'ites': 0.013196790540540541,
 'laes': 0.006915118243243243,
 'lv': 0.010522378378378378,
 'no': 0.003000803243243243,
 'pt': 0.037162162162162164,
 'ptes': 0.013776423243243244,
 'ro': 0.010700120945945947,
 'sq': 0.01322135945945946,
 'sv': 0.004816678783783784,
 'tl': 0.00558629554054054}

##### The probability for each language results from summing over all occurrences the product of two factors:
                - Occurrences_language / sum(occurrences_languages)
                - confidence_occurrence """

The interpretation is the following: the more times a language appears as detected, the more likely the whole set was in that language. This probability is weighted by the confidence each detection carries with it.

```Ret``` is a dictionary with each detected language and its final weighted probability 

Since, for computing purposes, we are summing over the probabilities

In [415]:
ret = dict()
normFactor = sum(dictNorm.values())
for key, language in dictWeights.items():
    ret[key] = language*dictNorm.get(key, 1) / normFactor
    
maximum = max(ret, key=ret.get)
ret

{'en': 0.8674614738037154,
 'es': 0.00017079306624132108,
 'hu': 8.761982923625447e-05,
 'la': 0.00018765246762994934}

##### Return the language with maximum probability

In [357]:
maximum, ret[maximum]

('es', 0.84891670375)

# Gather methods

```RASHIB :
        Raw Array of Sentences Highlighted In Book
CADERA :
        Color-Arranged Dataframe Extracted from RAshib
GOTA:
        GOogle-Translated Array ```

In [46]:
class Translation:
    from googletrans import Translator
    def __init__(self, cadera : str, dest : str, src : str, color : str = 'blue') -> pd.Series :
        """ Import source CADERA file and keep only column specified as 'color' """
        self.dfSrc = pd.read_csv(cadera, index_col=0)[color].dropna()
        self.src : str = src
        self.dest : str = dest
        self.translator = Translator()
        
    def detect_src(self, N : int = 0):
        """Auto-detect languages in wordset given and arrange them by occurrences
        Parameters:
        N : int = 0
            Number of words to sample from CADERA series. If left to 0, the whole Series is taken"""
        
        def compute_highest_scoring_language(dictNorm : dict, dictWeights : dict) -> dict:
            """Return highest scoring weighted language over the sample.
            The probability for each language results from summing over all occurrences the product of two factors:
                - Occurrences_language / sum(occurrences_languages)
                - confidence_occurrence """
            ret = dict()
            normFactor = sum(dictNorm.values())
            for key, language in dictWeights.items():
                ret[key] = language*dictNorm.get(key, 1) / normFactor
            return ret
        
        if N != 0:
            sample = self.dfSrc.sample(N)
        else:
            sample = self.dfSrc
            
        lang = [translator.detect(w).lang for w in sample]
        conf = [translator.detect(w).confidence for w in sample]
        dfLang = pd.DataFrame({'lang':lang, 'conf':conf})
        
        dictNorm = dfLang.lang.value_counts().to_dict()  # Dictionary with number of occurences per language
        normFactor = sum(dictNorm.values())              # Total number of words in sample
        
        a = dfLang.groupby('lang').sum().to_dict()['conf']
        dictWeights = {k: v / normFactor for k, v in a.items()}   # Weights per language, averaged by occurrences
        
        ret = compute_highest_scoring_language(dictNorm, dictWeights)
        maximum = max(ret, key=ret.get)
        self.src = maximum
        return maximum, ret[maximum]

    
    def main_translate(self) -> list:
        """Translate dfSrc from src to dest language
        Returns list of translate objects"""
        trans = [self.translator.translate(w, src=self.src, dest=self.dest) for w in self.dfSrc]
        return trans
    
    def arrange_dicDf(self) -> pd.DataFrame:
        trans = self.main_translate()
        dfDest = pd.Series([w.text for w in trans])
        dicDf = pd.DataFrame({self.src : self.dfSrc,  self.dest: dfDest})
        dicDf.names = [self.src, self.dest]
        return dicDf
    
    def detect_fails(self, meth : str = 'dest'):
        """Fails are interpreted as words that remained untranslated (if using default method 'src')
        or simply if the detected language doesn't correspond to 'dest'  """
        
        trans = self.main_translate()
        if meth == 'src':
            fails = np.where(translator.detect(w).lang == self.src for w in trans)[0]
        elif meth == 'dest':
            fails = np.where(translator.detect(w).lang != self.dest for w in trans)[0]
        return fails
    
    def fix_fails(self, meth : str = 'src'):
        """For failed translations, look into extra_data attribute 'possible-translation', flatten the list 
        and take the first one in the correct language"""
        from copy import deepcopy
        def flatten_str_list(nested_list):
            """Flatten an arbitrarily nested list, without recursion (to avoid
            stack overflows). Returns a new list, the original list is unchanged.
            >> list(flatten_list([1, 2, 3, [4], [], [[[[[[[[[5]]]]]]]]]]))
            [1, 2, 3, 4, 5]
            >> list(flatten_list([[1, 2], 3]))
            [1, 2, 3]
            Note: additionally, only return type 'str' elements
            """
            nested_list = deepcopy(nested_list)

            while nested_list:
                sublist = nested_list.pop(0)

                if isinstance(sublist, list):
                    nested_list = sublist + nested_list
                else:
                    if type(sublist) == str:
                        yield sublist
        
        trans = self.main_translate()
        fails = self.detect_fails(meth)
        fixes = []
        for f in fails:
            messy_list = trans[f].extra_data['possible-translations']
            possibleTrans = [w for w in flatten_str_list(messy_list)]
            fixes.append(  possibleTrans[ np.where( [translator.detect(w).lang == self.dest for w in possibleTrans] ) [0][0]  ]  )
        return fixes
    x

##### Outlook
 - Check repeated words in first import
 - Fix failures using extra_data

In [43]:
tr = Translation(cadera_path, dest='en', src='de')
tr.detect_src(N=4)
tr.src

'de'

In [44]:
tr.arrange_dicDf()



Unnamed: 0,de,en
0,schweigend stand siddhartha im senkrechten son...,silently stood siddhartha in the vertical sunb...
1,nicht einen augenblick habe ich an dir gezweif...,not an instant I doubted you. I do not have in...
2,"beraubt hat mich der buddha, dachte siddhartha...","robbed me of buddha, siddhartha thought, he ro..."
3,daß siddhartha mir so fremd und unbekannt gebl...,siddhartha that has stayed with me so strange ...
4,"""wenn einer eine schrift liest, deren sinn er ...","""If a writer reads one whose sense he wants to..."
5,aber nie hatte er dies selbst wirklich gefunde...,but he did this himself really never found bec...
6,"struppiger bettler,","shaggy beggar,"
7,"griffel,","stylus,"
8,laub,leaves
9,biegsam,flexible


In [48]:
tr.detect_fails()
#tr.fix_fails()

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

# Debug for  'Die Verwandlung' 

In [10]:
cadera_path = '/Users/pabloherrero/Documents/ManHatTan/CADERAs/Die_Verwandlung.cder'
df = pd.read_csv(cadera_path, index_col=0)
src = df.blue.dropna()
src_lang = 'de'
dest_lang = 'en'

In [105]:
src = load_blue_words(cadera_path, src_lang)
src = test_long_sentence(src)

dest = bulk_translate(src, dest_lang)


Entry with more than 3 words detected:  Und trotz dieses Zustandes hatte er keine Scheu, ein Stück auf dem makellosen Fußboden des Wohnzimmers vorzurücken.
Attempted translation of 104 entries. Check DB for mistranslations.


In [109]:

dicDf = make_dicdf(src, dest, cadera_path)
dicDf
write_gota(cadera_path, dicDf)

Created GOTA file /Users/pabloherrero/Documents/ManHatTan/GOTAs/Die_Verwandlung.cder


# Debug for  'Siddhartha' 

In [15]:
cadera_path = '/Users/pabloherrero/Documents/ManHatTan/CADERAs/Notizen aus _Siddhartha_ eine indische Dichtung (German Edition)_.cder'
df = pd.read_csv(cadera_path, index_col=0)
src = df.blue.dropna()
src_lang = 'de'
dest_lang = 'en'

In [17]:
src = load_blue_words(cadera_path, src_lang)
src = format_src(src)
src = test_long_sentence(src)
src

Entry with more than 3 words detected:  schweigend stand siddhartha im senkrechten son
Entry with more than 3 words detected:  nicht einen augenblick habe ich an dir gezweif
Entry with more than 3 words detected:  beraubt hat mich der buddha dachte siddhartha
Entry with more than 3 words detected:  daß siddhartha mir so fremd und unbekannt gebl
Entry with more than 3 words detected:  wenn einer eine schrift liest deren sinn er 
Entry with more than 3 words detected:  aber nie hatte er dies selbst wirklich gefunde


0               struppiger bettler
1                          griffel
2                             laub
3                          biegsam
4                    müdgewordenen
5                      wohlergehen
6                     dienerschaft
7                          predigt
8                       allmählich
9                          töpfers
10                      schläferte
11                           stets
12                       beneidete
13           beizulegen vermochten
14                           bange
15                   kindertorheit
16                        schleier
17                          säumen
18                            ekel
19                  begehrlichkeit
20                     törichteste
21                            tand
22                      schnödeste
23                           sitte
24                             wut
25                           frech
26    verspielen und verschleudern
27                         elenden
28                  

In [4]:
dest = bulk_translate(src, dest_lang)

Attempted translation of 33 entries. Check DB for mistranslations.


In [5]:
src = pd.Series([re.sub(r'[,.;:"]', '', t) for t in src])

In [18]:
dicDf = make_dicdf(src, dest, cadera_path)
dicDf

Unnamed: 0,de,en,creation_time
0,struppiger bettler,shaggy beggar,1586607187
1,griffel,stylus,1586607187
2,laub,leaves,1586607187
3,biegsam,flexible,1586607187
4,müdgewordenen,müdgewordenen,1586607187
5,wohlergehen,wohlergehen,1586607187
6,dienerschaft,diener economy,1586607187
7,predigt,preaches,1586607187
8,allmählich,gradually,1586607187
9,töpfers,potter,1586607187


In [15]:
write_gota(cadera_path, dicDf)

Created GOTA file /Users/pabloherrero/Documents/ManHatTan/GOTAs/Siddhartha__eine_indische_Dichtung_(German_Edition)_.got


## Test replication of reference file

In [8]:
def clean_filename(filename : str)->str:
    """Remove automatically added "Notes from..." string from filename"""
    filename = filename.replace(' ', '_')
    filename = filename.replace('_-_Bloc-notes', '')
    filename = filename.replace('_-_Notizbuch', '')

    filename = filename.replace('Notes_from__', '')
    filename = filename.replace('Notizen_aus__', '')

    return filename

In [19]:
cadera_path = clean_filename(cadera_path)
cadera_path

'/Users/pabloherrero/Documents/ManHatTan/CADERAs/Siddhartha__eine_indische_Dichtung_(German_Edition)_.cder'

In [20]:
write_gota(cadera_path, dicDf)

Created GOTA file /Users/pabloherrero/Documents/ManHatTan/GOTAs/Siddhartha__eine_indische_Dichtung_(German_Edition)_.got


In [30]:
def test_output_bulkTranslate(gotscript_path : str):
    """Test whether bulkTranslate produces the same absolute GOTA as in reference file (Siddhartha)
        and indicate what words differ"""

    gotref_path = '/Users/pabloherrero/Documents/ManHatTan/GOTAs/Siddhartha__eine_indische_Dichtung_(German_Edition)_.got'
    gotref = pd.read_csv(gotref_path)
    gotscript = pd.read_csv(gotscript_path)

    compare_df = (gotscript == gotref)
    compare_df.drop('creation_time', axis=1, inplace=True)  # creation_time is not to be compared
    fails = np.where(compare_df == False)
    print('Found (%i,%i) row,cols replication fails at init_gotstick'%(len(fails[0]), len(fails[1]) ))
    print('On reference file:')
    print(gotref.iloc[fails[0], fails[1]])
    print('On processed file:')
    print(gotscript.iloc[fails[0], fails[1]])

In [29]:
gotscript_path = '/Users/pabloherrero/Documents/ManHatTan/GOTAs/Notizen aus _Siddhartha_ eine indische Dichtung (German Edition)_.got'
gotref_path = '/Users/pabloherrero/Documents/ManHatTan/GOTAs/Siddhartha__eine_indische_Dichtung_(German_Edition)_.got'

gotref = pd.read_csv(gotref_path, index_col=0)
gotscript = pd.read_csv(gotscript_path, index_col=0)
compare_df = (gotscript == gotref)

compare_df

Unnamed: 0,de,en
0,True,True
1,True,True
2,True,True
3,True,True
4,True,True
5,True,True
6,True,True
7,True,True
8,True,True
9,True,True


In [39]:
gotscript_path = '/Users/pabloherrero/Documents/ManHatTan/GOTAs/Notizen aus _Siddhartha_ eine indische Dichtung (German Edition)_.got'
test_output_bulkTranslate(gotscript_path)

ValueError: Can only compare identically-labeled DataFrame objects

# Debug for  'Io Uccido' 

In [13]:
cadera_path = '/Users/pabloherrero/Documents/ManHatTan/CADERAs/Io_Uccido.cder'
df = pd.read_csv(cadera_path, index_col=0)
src = df.blue.dropna()
src_lang = 'it'
dest_lang = 'es'

In [18]:
src = load_blue_words(cadera_path, src_lang)
src = format_src(src)
# src = test_long_sentence(src)
src

0                                              fatica
1                                                noia
2                                             capelli
3                                         saracinesca
4                                           allenando
5                                         grattacieli
6                                               brano
7                                           scommesse
8                                             autista
9                                             buttare
10                                      Lo tirò fuori
11                                            fruscio
12                                              palle
13                                              mossa
14                                             attimo
15                                           pezzente
16                                           sbrigati
17                                              mentì
18                          

In [19]:
dest = bulk_translate(src, dest_lang)

Attempted translation of 127 entries. Check DB for mistranslations.


In [5]:
src = pd.Series([re.sub(r'[,.;:"]', '', t) for t in src])

In [20]:
dicDf = make_dicdf(src, dest, cadera_path)
dicDf

Unnamed: 0,it,es,creation_time
0,fatica,fatiga,1605001651
1,noia,aburrimiento,1605001651
2,capelli,del cabello,1605001651
3,saracinesca,puerta,1605001651
4,allenando,de formación,1605001651
5,grattacieli,rascacielos,1605001651
6,brano,canción,1605001651
7,scommesse,apuestas,1605001651
8,autista,controlador,1605001651
9,buttare,tiro,1605001651


In [21]:
write_gota(cadera_path, dicDf)

Created GOTA file /Users/pabloherrero/Documents/ManHatTan/GOTAs/Io_Uccido.got


'/Users/pabloherrero/Documents/ManHatTan/GOTAs/Io_Uccido.got'