# Sandbox notebook for the (new) async version of googletrans

In [159]:
from googletrans import Translator
translator = Translator()
import asyncio
import pandas as pd
import random
import os
import re
import datetime

## Basic translation and language detection

In [8]:
async def translate_text(text, dest_language='en'):
    """Translate ``text`` into ``dest_language`` using a short-lived Translator.

    Returns the ``.text`` attribute of the translation result. If the request
    (or reading the result) raises, the error is printed and ``None`` is
    returned instead of propagating the exception.
    """
    outcome = None
    async with Translator() as client:
        try:
            response = await client.translate(text, dest=dest_language)
            outcome = response.text
        except Exception as err:
            # Best-effort helper: report the problem and fall through to None.
            print(f"Error during translation: {err}")
    return outcome

async def detect_language(text):
    """Detect the language of ``text`` using a short-lived Translator.

    Returns the ``.lang`` attribute of the detection result. If the request
    (or reading the result) raises, the error is printed and ``None`` is
    returned instead of propagating the exception.
    """
    language = None
    async with Translator() as client:
        try:
            detection = await client.detect(text)
            language = detection.lang
        except Exception as err:
            # Best-effort helper: report the problem and fall through to None.
            print(f"Error during language detection: {err}")
    return language

In [11]:
await translate_text("שלום עולם", dest_language='en')

'Peace of the world'

In [16]:
await detect_language("שלום עולם")

'iw'

## Same for lists

In [None]:
async def detect_language_list(texts):
    """Run language detection on a list of texts in a single call.

    Returns whatever ``translator.detect`` yields for ``texts`` (callers in
    this notebook read ``.lang`` and ``.confidence`` from each element).
    If the request raises, the error is printed and ``None`` is returned.
    """
    detections = None
    async with Translator() as client:
        try:
            detections = await client.detect(texts)
        except Exception as err:
            # Best-effort helper: report the problem and fall through to None.
            print(f"Error during language detection: {err}")
    return detections

In [21]:
# Load a CADERA file as CSV and take its 'blue' column as the word list.
# NOTE: machine-specific absolute path; adjust before running elsewhere.
path_cder = '/Users/pabloherrero/Documents/ManHatTan/mht/data/processed/CADERAs/Die_Verwandlung.cder'
cder = pd.read_csv(path_cder)
# No dropna() here, so any missing values in the column stay in the list.
word_list = list(cder['blue'].values)
# Peek at the first three entries.
word_list[:3]

['Ungeziefer', 'Versteifungen', 'Umfang']

In [23]:
len(word_list)

115

# Dev: detect most likely language from a random sample of 10 entries

In [None]:
# CAUTION: running this on a large list can be slow and may hit API limits.
# result = await detect_language_list(word_list)

In [33]:
# `result` must come from `await detect_language_list(word_list)`, which is
# commented out in the previous cell; run it once before this cell.
# Split each detection into its language code and its confidence.
detected_languages = [res.lang for res in result]
detected_confidence = [res.confidence for res in result]
# One row per detected word.
detection_stats = pd.DataFrame({
    'detected_language': detected_languages,
    'confidence': detected_confidence
})
# How many words were attributed to each language, most frequent first.
detection_stats['detected_language'].value_counts().sort_values(ascending=False)

detected_language
de    100
en      8
lb      1
hu      1
nl      1
sv      1
fr      1
sk      1
la      1
Name: count, dtype: int64

In [35]:
# Mean confidence per detected language, highest first.
detection_stats.groupby('detected_language').mean().sort_values(by='confidence', ascending=False)

Unnamed: 0_level_0,confidence
detected_language,Unnamed: 1_level_1
la,1.0
de,0.936481
en,0.722688
sv,0.599222
fr,0.570312
nl,0.502095
lb,0.458477
sk,0.281745
hu,0.176923


In [41]:
# Summed confidence per detected language, highest first. Unlike the mean,
# the sum also rewards languages that were detected for many words.
detection_stats.groupby('detected_language').sum().sort_values(by='confidence', ascending=False)

Unnamed: 0_level_0,confidence
detected_language,Unnamed: 1_level_1
de,93.648051
en,5.7815
la,1.0
sv,0.599222
fr,0.570312
nl,0.502095
lb,0.458477
sk,0.281745
hu,0.176923


In [61]:
# Draw 10 random rows from the full detection table ...
stat_sample = detection_stats.sample(10)
# ... and sum their confidences per language, highest first.
grouped_confidences = stat_sample.groupby('detected_language').sum().sort_values(by='confidence', ascending=False)

In [89]:
# Keep only the row(s) whose summed confidence equals the maximum.
max_confidence = grouped_confidences[grouped_confidences['confidence'] == grouped_confidences['confidence'].max()]
# Language code (index label) and summed confidence of the first such row.
predicted_lang, summed_conf = max_confidence.index[0], max_confidence['confidence'].values[0]
# Threshold on the summed confidence of the 10-row sample, not on a per-word average.
min_confidence = 4.0
summed_conf > min_confidence

True

### Write function

In [117]:
async def find_language(word_list: list, min_confidence=4.0):
    """Guess the language of a word list from a random sample of 10 words.

    Ten words are drawn at random and passed to ``detect_language_list``.
    The confidences reported for each detected language are summed, and the
    language with the largest sum is returned if that sum reaches
    ``min_confidence``.

    Parameters:
    word_list : list
        Words to sample from, e.g. the 'blue' column of a CADERA converted to
        a list. Must contain at least 10 entries.
    min_confidence : float
        Threshold on the *summed* confidence of the 10 sampled words (not a
        per-word average). Default is 4.0.
    Returns:
    str or None
        Detected language code if the summed confidence reaches the threshold.
        None if the list is too short, if detection failed, or if the summed
        confidence is below the threshold.
    Usage:
    >>> cder = pd.read_csv('path_to_cadera.cder')
    >>> detected_lang = await find_language(cder['blue'].dropna().to_list(), min_confidence=4.0)
    >>> print(detected_lang)  # Outputs the detected language code or None if confidence is low
    """
    sample_size = 10

    if not word_list or len(word_list) < sample_size:
        print("The word list too short. Cannot detect language confidently.")
        return None

    sample_list = random.sample(word_list, sample_size)
    result = await detect_language_list(sample_list)

    # detect_language_list signals a failed request by returning None;
    # iterating over that would raise a TypeError.
    if not result:
        print("Language detection failed; no results to evaluate.")
        return None

    detection_stats = pd.DataFrame({
        'detected_language': [res.lang for res in result],
        'confidence': [res.confidence for res in result],
    })

    # Sum confidences per language and pick the language with the largest sum.
    summed_by_lang = detection_stats.groupby('detected_language')['confidence'].sum()
    predicted_lang = summed_by_lang.idxmax()
    summed_conf = summed_by_lang.max()

    # Messages report the per-word average (sum divided by the sample size).
    if summed_conf < min_confidence:
        print(f"Predicted language '{predicted_lang}' with confidence {summed_conf/sample_size} is below the minimum threshold of {min_confidence/sample_size}.")
        return None

    print(f"Predicted language: {predicted_lang} with confidence: {summed_conf/sample_size}")
    return predicted_lang

In [113]:
# Same 'blue' column as word_list, but with missing values dropped first.
wordll_list = cder['blue'].dropna().to_list()
len(wordll_list)

115

In [123]:
# src_lang is reused by later cells as the source language for translation.
# It is None when find_language cannot settle on a language.
src_lang = await find_language(wordll_list, min_confidence=4.0)

Predicted language: de with confidence: 0.69423519


In [114]:
# `random` is also imported in the first cell of the notebook.
import random
# Show what a 10-word sample of the list looks like (differs on every run).
random.sample(wordll_list, 10)

['verraten.',
 'Heidengeld',
 'bisweilen',
 'wehren',
 'üppigen',
 'Pult',
 'Munterkeit',
 'Plafond',
 'Klinke,',
 'erstarrte']

# Debug bulkTranslate

In [None]:
def format_src(src_list: list) -> list:
    """Strip punctuation, symbols and underscores from every string in a list.

    Word characters (letters, including accented ones, and digits) and
    whitespace are kept; everything else, plus the underscore, is removed.

        Parameters
        src_list : list
            Strings to be cleaned
        Returns
        src_formatted : list
            New list with the cleaned strings, in the same order
    """
    # [^\w\s] matches anything that is neither a word character nor whitespace.
    # "_" counts as a word character, so it has to be listed separately.
    # This single pattern also covers , . ; : " so no second pass is needed.
    symbols = re.compile(r'[^\w\s]|_')

    return [symbols.sub('', w) for w in src_list]

async def bulk_translate(src_list: list, src_lang: str, dest_lang: str = None):
    """Translate a list of strings in one request using googletrans.

    Parameters:
    src_list : list
        Strings to translate.
    src_lang : str
        Language code of the input (e.g., 'en', 'de', 'es').
    dest_lang : str, optional
        Language code to translate into. Falls back to 'en' when empty or None.
    Returns:
    dest_list
        Whatever ``translator.translate`` returns for the list. Later cells
        read ``.text`` from each element to get the translated string.
    Usage:
    >>> dest_list = await bulk_translate(['Hello', 'World'], 'en', 'es')
    >>> [d.text for d in dest_list]
    """
    async with Translator() as client:
        print(f'Starting translation, src = {src_list}, dest_lang = {dest_lang}')

        if dest_lang:
            target = dest_lang
        else:
            print('No destination language specified. Using English as default.')
            target = 'en'

        translations = await client.translate(text=src_list, src=src_lang, dest=target)
        print('Translation finished')

    return translations

def make_gota_df(src_list: list, dest_list: list, cadera_path: str,
                 src_lang=None, dest_lang=None) -> pd.DataFrame:
    """Assemble src and dest lists into gota_df (GOogle Translation Archive)
    and append the creation time.

    Parameters:
    src_list : list
        Source strings, in the order they were sent for translation.
    dest_list : list
        Translation results aligned with ``src_list``; ``.text`` is read from
        each element.
    cadera_path : str
        Path of the CADERA file; its base name without extension is stored as
        ``gota_df.name``.
    src_lang, dest_lang : str, optional
        Column labels for the source and translated words. When omitted they
        are taken from ``.src`` / ``.dest`` of the first element of
        ``dest_list``.
    Returns:
    gota_df : pd.DataFrame
        Columns ``[src_lang, dest_lang, 'creation_time']``. A source word that
        occurs more than once yields a single row holding its last translation.
    Raises:
    ValueError
        If the two lists differ in length, or if the language labels are
        omitted and ``dest_list`` is empty.
    """
    # zip() would silently truncate and misalign words and translations.
    if len(src_list) != len(dest_list):
        raise ValueError('make_gota_df error: len(dest) does not match len(src)')

    # Take the labels from the translation results instead of relying on
    # variables that happen to exist in the notebook namespace.
    if src_lang is None or dest_lang is None:
        if not dest_list:
            raise ValueError('make_gota_df error: src_lang/dest_lang must be given when dest_list is empty')
        first = dest_list[0]
        if src_lang is None:
            src_lang = first.src
        if dest_lang is None:
            dest_lang = first.dest

    # Going through a dict collapses duplicated source words (last one wins).
    pairs = {s: d.text for s, d in zip(src_list, dest_list)}
    gota_df = pd.DataFrame(list(pairs.items()), columns=[src_lang, dest_lang])

    # Plain Python attribute on the frame; pandas does not carry it over to copies.
    gota_df.name = os.path.splitext(os.path.basename(cadera_path))[0]

    # Creation time as integer seconds since the epoch, same value on every row.
    gota_df['creation_time'] = int(datetime.datetime.now().timestamp())
    return gota_df

In [170]:
# Clean the raw word list with the shared helper instead of repeating its
# regexes inline (the inline copy used non-raw patterns with invalid escapes).
src_str = format_src(word_list)
src_str

['Ungeziefer',
 'Versteifungen',
 'Umfang',
 'flimmerten',
 'versehen',
 'Fensterblech',
 'undurchführbar',
 'schaukelte',
 'Jucken',
 'Pult',
 'Zeiger',
 'Donnerwetter',
 'Rückgrat',
 'Einwände',
 'arbeitsscheue',
 'derart',
 'ausführlich',
 'Faust',
 'Einbildung',
 'Vorbote',
 'tüchtigen',
 'Willkür',
 'Zuversicht',
 'Munterkeit',
 'Krach',
 'endgültig',
 'erstarrte',
 'Versäumnis',
 'Lumpen',
 'Angelegenheit',
 'verständigen',
 'gnädige',
 'schluchzen',
 'Ungewißheit',
 'Rändern',
 'verständigten',
 'Zuversicht',
 'Kiefer',
 'Aufmunterung',
 'Klinke',
 'plump',
 'feindseligem',
 'überreicher',
 'Heidengeld',
 'beirren',
 'wehren',
 'gefährdet',
 'gefaßt',
 'Zischlaute',
 'Abenddämmerung',
 'ohnmachtähnlichen',
 'Narbe',
 'hinken',
 'Napf',
 'heiklen',
 'Unannehmlichkeiten',
 'nachdrücklich',
 'verzehrte',
 'tüchtig',
 'verraten',
 'allmählich',
 'scheute',
 'zitterte',
 'Vernunftgründen',
 'billigte',
 'Plafond',
 'Aufenthalt',
 'ererbten',
 'Einwirkungen',
 'entbehren',
 'Trotz',
 

In [153]:
# Translate the word list into English; src_lang comes from the earlier
# find_language cell (the call is kept below, commented out, for reference).
dest_lang = 'en'
# src_lang = await find_language(wordll_list, min_confidence=4.0)
# NOTE(review): this sends wordll_list, which still carries punctuation; the
# cleaned src_str built in the preceding cell is not used. Confirm this is intended.
dest_list = await bulk_translate(wordll_list , src_lang, dest_lang)

Starting translation, src = ['Ungeziefer', 'Versteifungen', 'Umfang', 'flimmerten', 'versehen,', 'Fensterblech', 'undurchführbar,', 'schaukelte', 'Jucken', 'Pult', 'Zeiger', 'Donnerwetter', 'Rückgrat', 'Einwände', 'arbeitsscheue', 'derart', 'ausführlich', 'Faust.', 'Einbildung', 'Vorbote', 'tüchtigen', 'Willkür', 'Zuversicht', 'Munterkeit', 'Krach,', 'endgültig', 'erstarrte', 'Versäumnis', 'Lumpen,', 'Angelegenheit', 'verständigen:', 'gnädige', 'schluchzen.', 'Ungewißheit,', 'Rändern', 'verständigten', 'Zuversicht', 'Kiefer', 'Aufmunterung;', 'Klinke,', 'plump', 'feindseligem', 'überreicher', 'Heidengeld', 'beirren', 'wehren', 'gefährdet', 'gefaßt', 'Zischlaute', 'Abenddämmerung', 'ohnmachtähnlichen', 'Narbe,', 'hinken.', 'Napf', 'heiklen', 'Unannehmlichkeiten', 'nachdrücklich', 'verzehrte', 'tüchtig', 'verraten.', 'allmählich', 'scheute', 'zitterte', 'Vernunftgründen', 'billigte.', 'Plafond', 'Aufenthalt', 'ererbten', 'Einwirkungen', 'entbehren;', 'Trotz', 'Entschlusse', 'unweigerlich

In [155]:
dest_list[0].text, dest_list[0].src, dest_list[0].pronunciation,# dest_list[0].extra_data, dest_list[0].origin

('vermin', 'de', None)

In [156]:
# Pull the translated strings out of the result objects.
dest_text_list = [d.text for d in dest_list]
assert len(dest_text_list) == len(wordll_list), 'bulk_translate error: len(dest) does not match len(src)'

# Map each source word to its translation; a repeated source word keeps
# only its last translation, so the frame can have fewer rows than the list.
dest_dict = dict(zip(wordll_list, dest_text_list))
gota_df = pd.DataFrame(dest_dict.items(), columns=[src_lang, dest_lang])
gota_df

Unnamed: 0,de,en
0,Ungeziefer,vermin
1,Versteifungen,Stiffeners
2,Umfang,Scope
3,flimmerten,flicker
4,"versehen,","provided,"
...,...,...
109,behaglich.,cozy.
110,krepiert;,crepo;
111,mürrisch,grumpy
112,bisweilen,sometimes


In [142]:
make_gota_df(wordll_list, dest_list, path_cder)

Unnamed: 0,de,en,creation_time
0,Ungeziefer,vermin,1751101789
1,Versteifungen,Stiffeners,1751101789
2,Umfang,Scope,1751101789
3,flimmerten,flicker,1751101789
4,"versehen,","provided,",1751101789
...,...,...,...
109,behaglich.,cozy.,1751101789
110,krepiert;,crepo;,1751101789
111,mürrisch,grumpy,1751101789
112,bisweilen,sometimes,1751101789
