In [15]:
import pandas as pd
import numpy as np
import os
import re
import sys
import datetime
import apertium

sys.path.append('../python_scripts/')
sys.path.append('../ML')
from duolingo_hlr import *
from bulkTranslate import *
from init_lipstick import *

In [212]:
def set_lip(gota : pd.DataFrame, flag_lexeme = False):
    """Provisional simple initialization of lipstick from GOTA.

        Parameters:
        ------
        gota : table whose first column holds the words in the learning language,
            whose second column holds the words in the user (reference) language,
            and which has a `creation_time` column. The NAMES of the first two
            columns are stored as the language codes.
        flag_lexeme : if True, every learning-language word is passed to
            `apertium.tag` and the first two '/'-separated fields of the first
            returned unit are stored as its lexeme string. If False (default),
            the same fixed placeholder tag is stored for every row.

        Returns:
        ------
        DataFrame with one row per row of `gota` (same index) and these columns:

        p_recall : truth recall probability = history_correct/history_seen (initialized to 0)
        timestamp : last time practice timestamp (taken from gota.creation_time)
        delta : timedelta w.r.t. most unpracticed word (with minimum timestamp)
        user_id : user name (currently the fixed value 'pablo')
        learning_language: target language
        ui_language: user reference language
        word_ll: word in target language, lexeme in the future?
        word_ul: word in reference language
        lexeme_string: lexeme tag with grammatical/syntactical information
        history_seen: times the word has been practiced from initialization
        history_correct: times the translation has been correctly recalled from initialization
        session_seen: practice times in last session (not implemented)
        session_correct: correctly recalled times in last session (not implemented)

        Additional attrs:
        ------
        p_pred: predicted probability from hlr model (not in initialization)
    """
    lear_lang = gota.columns[0]
    ui_lang = gota.columns[1]
    timest = gota['creation_time']

    # Integer zeros built on gota's own index: every later column assignment is
    # index-aligned, so a fresh RangeIndex here would turn the timestamps and
    # words into NaN whenever gota is not indexed 0..n-1. The dtype is fixed so
    # the counters never inherit the dtype of the timestamp column.
    zeros = pd.Series(np.zeros(len(gota), dtype='int64'), index=gota.index)

    lipstick = pd.DataFrame({'p_recall': zeros})
    lipstick['timestamp'] = timest
    lipstick['delta'] = timest - timest.min()
    lipstick['user_id'] = 'pablo'  # Will be customizable later
    lipstick['learning_language'] = lear_lang
    lipstick['ui_language'] = ui_lang
    lipstick['word_ll'] = gota[lear_lang]
    lipstick['word_ul'] = gota[ui_lang]

    if flag_lexeme:
        lexeme = []
        for wd in lipstick['word_ll']:
            tag_fields = str(apertium.tag(lear_lang, wd)[0]).split('/')
            # Keep the first two fields; slicing (instead of indexing [1]) does
            # not raise when the tagger output contains no '/'.
            # NOTE(review): presumably these are surface form and first analysis - confirm.
            lexeme.append('/'.join(tag_fields[:2]))
    else:
        # Placeholder tag used for every row while tagging is switched off.
        lexeme = 'lernt/lernen<vblex><pri><p3><sg>'

    lipstick['lexeme_string'] = lexeme
    # All practice counters start at zero.
    for counter in ('history_seen', 'history_correct', 'session_seen', 'session_correct'):
        lipstick[counter] = zeros

    return lipstick

def make_lang_dic(languages):
    """Return a dict mapping short language codes to the given language names.

    Each name is lower-cased and looked up in googletrans' LANGCODES table
    (lower-case language name -> short code); the result is the inverse
    mapping, keyed by code and keeping the name exactly as it was passed in.
    Ex: {'en': 'English',
         'eu': 'Basque'}

    Names that googletrans does not know, and non-string entries (e.g. NaN
    coming from a column with missing values), are skipped.
    """
    from googletrans import LANGCODES as dictTrans

    langs = {}
    for la in languages:
        if not isinstance(la, str):
            continue
        code = dictTrans.get(la.lower())
        if code is None:
            # Unknown language: skip it. Falling through here would either
            # reuse the code of the previous language (overwriting its entry
            # with the wrong name) or fail because no code was ever set.
            continue
        langs[code] = la
    return langs

def gost2gota(gost: pd.DataFrame, langs: dict, ll: str, ul: str):
    """Adapt GOST to GOTA format for LIPSTICK processing
    Parameters:
        gost: saved translations with columns 'source_lang', 'target_lang',
            'source_word' and 'translation'
        langs: dict mapping short language codes to the language names used
            in the 'source_lang' / 'target_lang' columns of gost
        ll: learning language in short format
        ul: user language in short format ('en', 'de', 'es'...)
    Returns:
        DataFrame with columns [ll, ul, 'creation_time'] and a fresh 0..n-1
        index. Rows translated FROM the learning language come first,
        followed by rows translated INTO it (with the two words swapped so
        the learning-language word always ends up in column ll).
        'creation_time' is the current time as an integer Unix timestamp.
    Raises:
        KeyError: if ll is not a key of langs.
    """
    ll_name = langs[ll]

    # Search for coincidences with the learning language (ll)
    # NOTE(review): rows are selected by the learning language only; the other
    # language of each row is not checked against ul.
    forward = gost[gost['source_lang'] == ll_name]
    # Search as well for reverse translation: coincidences in the translation
    reverse = gost[gost['target_lang'] == ll_name]

    # pd.concat replaces Series.append, which no longer exists in pandas >= 2.0.
    ll_words = pd.concat([forward['source_word'], reverse['translation']], ignore_index=True)
    ul_words = pd.concat([forward['translation'], reverse['source_word']], ignore_index=True)
    gota = pd.DataFrame({ll: ll_words, ul: ul_words})

    # Add creation timestamp
    gota['creation_time'] = int(datetime.datetime.now().timestamp())
    return gota

# Process GOogle Saved Translations (GOST)

## Import GOST

In [90]:
# Location of the Google Translate "saved translations" CSV export.
# The GOST_PATH environment variable overrides the location so the notebook can
# run on another machine; without it the original local path is used.
gost_path = os.environ.get(
    'GOST_PATH',
    '/Users/pabloherrero/Documents/ManHatTan/CADERAs/GTranslate_saved_translations.csv',
)
# Column names are supplied explicitly through `names`.
gost = pd.read_csv(gost_path, names=['source_lang', 'target_lang', 'source_word', 'translation'])

In [95]:
# Display the loaded saved-translations table.
gost

Unnamed: 0,source_lang,target_lang,source_word,translation
0,English,Spanish,idle,inactivo
1,English,Spanish,dash,guión
2,English,Spanish,leverage,apalancamiento
3,English,Spanish,savvy,comprensión
4,English,Spanish,whelp,cachorro
5,English,Spanish,emasculate,castrar
6,English,Spanish,seasoning,condimento
7,English,Spanish,bugger,sodomita
8,English,Spanish,gallows,horca
9,English,Spanish,nigh,cerca


## Get the short language codes for the languages in GOST and build the code → name dict

In [166]:
# Distinct language names found in the 'source_lang' column of GOST.
languages = pd.unique(gost['source_lang'])

In [174]:
# googletrans' table from lower-case language name to short code.
# NOTE(review): `googletrans` is not imported explicitly in the visible import
# cell; presumably it reaches the namespace through one of the star imports - confirm.
dictLang = googletrans.LANGCODES

In [181]:
# Sanity check: keys are lower-case language names, values are short codes.
dictLang['english']

'en'

In [171]:
# First-pass heuristic for short codes: two special cases, otherwise the first
# two letters of the name in lower case. The rendered result contains 'ba' and
# 'cz', which differ from the 'eu' / 'cs' codes that make_lang_dic returns.
langs = []
for la in languages:
    lang = {'Spanish': 'es', 'German': 'de'}.get(la, la[:2].lower())
    langs.append(lang)
langs

['en', 'de', 'es', 'it', 'fr', 'ba', 'cz']

In [217]:
# Keep the result: gost2gota looks languages up as langs[<code>], so `langs`
# must be this code -> name dict and not the list built by the manual loop.
langs = make_lang_dic(languages)
langs

{'en': 'English',
 'de': 'German',
 'es': 'Spanish',
 'it': 'Italian',
 'fr': 'French',
 'eu': 'Basque',
 'cs': 'Czech'}

## Transform to GOTA

In [159]:
# Step-by-step prototype of gost2gota for Czech/English.
# Entries translated FROM Czech: source word is Czech, translation is English.
targetEntries = gost[gost['source_lang'] == 'Czech']

newGota = pd.DataFrame({'cz':[], 'en': []})
newGota['cz'] = targetEntries['source_word']
newGota['en'] = targetEntries['translation']
newGota.reset_index(drop=True, inplace=True)

# Entries translated INTO Czech: the two words are swapped so that the Czech
# word always lands in the 'cz' column.
invGota = pd.DataFrame({'cz':[], 'en': []})
targetEntries = gost[gost['target_lang'] == 'Czech']
# pd.concat replaces Series.append, which no longer exists in pandas >= 2.0.
invGota['en'] = pd.concat([newGota['en'], targetEntries['source_word']], ignore_index=True)
invGota['cz'] = pd.concat([newGota['cz'], targetEntries['translation']], ignore_index=True)

today = int(datetime.datetime.timestamp(datetime.datetime.today())) # Correct in init_lipstick.py
invGota['creation_time'] = today
invGota

Unnamed: 0,cz,en,creation_time
0,slovo,word,1588437047
1,dlouhy,long,1588437047
2,jmeno,name,1588437047
3,ted,now,1588437047
4,ted',now',1588437047
5,tady,here,1588437047
6,Prosím,Please,1588437047
7,nádhera,splendor,1588437047
8,Proměny,Transformations,1588437047
9,brzy se uvidíme,see you soon,1588437047


In [196]:
# Build the Czech/English GOTA table with the reusable helper.
# gost2gota indexes `langs` by language code (langs['cs']), so `langs` must be
# the code -> name dict returned by make_lang_dic, not the list from the manual loop.
gosta = gost2gota(gost, langs, 'cs', 'en')
gosta

Unnamed: 0,cs,en,creation_time
0,slovo,word,1588438799
1,dlouhy,long,1588438799
2,jmeno,name,1588438799
3,ted,now,1588438799
4,ted',now',1588438799
5,tady,here,1588438799
6,Prosím,Please,1588438799
7,nádhera,splendor,1588438799
8,Proměny,Transformations,1588438799
9,brzy se uvidíme,see you soon,1588438799


In [211]:
# Initialize the lipstick table from the GOTA frame; with flag_lexeme=False
# every row gets the fixed placeholder lexeme tag instead of an apertium tag.
set_lip(gosta, flag_lexeme=False)

Unnamed: 0,p_recall,timestamp,delta,user_id,learning_language,ui_language,word_ll,word_ul,lexeme_string,history_seen,history_correct,session_seen,session_correct
0,0,1588438799,0,pablo,cs,en,slovo,word,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0
1,0,1588438799,0,pablo,cs,en,dlouhy,long,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0
2,0,1588438799,0,pablo,cs,en,jmeno,name,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0
3,0,1588438799,0,pablo,cs,en,ted,now,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0
4,0,1588438799,0,pablo,cs,en,ted',now',lernt/lernen<vblex><pri><p3><sg>,0,0,0,0
5,0,1588438799,0,pablo,cs,en,tady,here,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0
6,0,1588438799,0,pablo,cs,en,Prosím,Please,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0
7,0,1588438799,0,pablo,cs,en,nádhera,splendor,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0
8,0,1588438799,0,pablo,cs,en,Proměny,Transformations,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0
9,0,1588438799,0,pablo,cs,en,brzy se uvidíme,see you soon,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0
