In [55]:
import pandas as pd
import numpy as np
import os
import re
import sys
import datetime
import apertium
from googletrans import Translator, LANGCODES
from hebrew import Hebrew

sys.path.append('../scripts/python_scripts/')
sys.path.append('../scripts/ML_duolingo')
from duolingo_hlr import *
from bulkTranslate import *
from init_lipstick import *

## Functions

In [56]:
def set_lip(gota : pd.DataFrame, flag_lexeme = False):
    """Provisional simple initialization of lipstick from GOTA.

        Parameters:
        ------
        gota : table whose first column is named after the learning language
            and holds its words, whose second column is named after the user
            (reference) language and holds its words, and which has a
            'creation_time' column.
        flag_lexeme : if True, tag every learning-language word with
            apertium.tag and store the first two '/'-separated fields of the
            first tag; if False, every row gets the same placeholder lexeme.

        Returns a new DataFrame with a fresh 0..n-1 index; `gota` itself is
        not modified.

        Attrs (columns of the returned table):
        ------
        p_recall : truth recall probability = history_correct/history_seen (initialised to 0)
        timestamp : copied from gota.creation_time
        delta : timestamp minus the minimum timestamp in the table
        user_id : user name (hard-coded to 'pablo' for now)
        learning_language: target language (name of gota's first column)
        ui_language: user reference language (name of gota's second column)
        word_ll: word in target language
        word_ul: word in reference language
        lexeme_string: lexeme tag with grammatical/syntactical information (placeholder unless flag_lexeme)
        history_seen: times the word has been practiced from initialization (initialised to 0)
        history_correct: times the translation has been correctly recalled from initialization (initialised to 0)
        session_seen: practice times in last session (not implemented, initialised to 0)
        session_correct: correctly recalled times in last session (not implemented, initialised to 0)

        Additional attrs:
        ------
        p_pred: predicted probability from hlr model (not in initialization)
    """
    # Drop the caller's row labels first. The zero-filled columns below are
    # built with a 0..n-1 index, so a filtered or re-indexed GOTA would
    # otherwise misalign on assignment and produce NaN rows.
    gota = gota.reset_index(drop=True)

    lear_lang = gota.columns[0]
    ui_lang = gota.columns[1]
    timest = gota['creation_time']

    delta = timest - np.min(timest)
    # Zeros with the same dtype as the timestamps; reused for every counter column.
    zeros = pd.Series(np.zeros_like(timest.to_numpy()), index=gota.index)
    lipstick = pd.DataFrame({'p_recall': zeros})

    lipstick['timestamp'] = timest
    lipstick['delta'] = delta
    lipstick['user_id'] = 'pablo'  # Will be customizable later
    lipstick['learning_language'] = lear_lang
    lipstick['ui_language'] = ui_lang
    lipstick['word_ll'] = gota[lear_lang]
    lipstick['word_ul'] = gota[ui_lang]

    if flag_lexeme:
        lexeme = []
        for wd in lipstick.word_ll:
            # Keep only the first two '/'-separated fields of the first tag.
            tagSplit = str(apertium.tag(lear_lang, wd)[0]).split('/')
            lexeme.append(tagSplit[0] + '/' + tagSplit[1])
    else:
        # Placeholder shared by all rows until real tagging is enabled.
        lexeme = 'lernt/lernen<vblex><pri><p3><sg>'

    lipstick['lexeme_string'] = lexeme
    lipstick['history_seen'] = zeros
    lipstick['history_correct'] = zeros
    lipstick['session_seen'] = zeros
    lipstick['session_correct'] = zeros

    return lipstick

def make_lang_dic(languages):
    """Return a dict mapping short language codes to the given language names.

    Each name in `languages` is lower-cased and looked up in googletrans'
    LANGCODES table. Names that are not in that table are skipped.

    Ex: {'en': 'English',
        'iw': 'Hebrew'}
    """
    from googletrans import LANGCODES as dictTrans
    langs = {}
    for la in languages:
        try:
            code = dictTrans[la.lower()]
        except KeyError:
            # Unknown name: skip it. Falling through here would reuse the
            # code of the previous language (or fail if there was none).
            continue
        langs[code] = la
    return langs

def gost2gota(gost: pd.DataFrame, langs: dict, ll: str, ul: str):
    """Reshape a GOST table into the GOTA layout expected by LIPSTICK.

    Parameters:
        gost: table with 'source_lang', 'source_word' and 'translation' columns
        langs: short code -> long language name, as found in 'source_lang'
        ll: learning language in short format
        ul: user language in short format ('en', 'de', 'es'...)
    Returns:
        Table with one column per language (named `ll` and `ul`) plus
        'creation_time', the current time as an integer timestamp.
    """
    # Resolve both long names up front so an unknown code fails before any work.
    long_names = [(ll, langs[ll]), (ul, langs[ul])]

    def word_in(row, long_name):
        # A row entered in `long_name` carries that word as its source word;
        # any other row carries it as the translation.
        if row["source_lang"] == long_name:
            return row["source_word"]
        return row["translation"]

    gota = pd.DataFrame({ll: [], ul: []})
    for short_code, long_name in long_names:
        gota[short_code] = gost.apply(word_in, axis=1, args=(long_name,))

    # Every row gets the same creation timestamp: now, truncated to seconds.
    now = datetime.datetime.today()
    gota['creation_time'] = int(now.timestamp())
    return gota


In [57]:
def remove_nikud(gost, col='iw'):
    """Replace every value of one column with its `Hebrew(...).no_niqqud()` form.

    Parameters:
        gost: DataFrame holding Hebrew text in column `col`.
        col: name of the column to clean. Defaults to 'iw', the Hebrew column
            used throughout this notebook.
    Returns:
        The same DataFrame object that was passed in. The column is
        overwritten in place, so the caller's frame changes as well.
    """
    gost[col] = gost[col].apply(lambda text: Hebrew(text).no_niqqud())
    return gost

## Get lang (short) from GOST and make dict

In [61]:
# NOTE(review): `languages` is not assigned in any cell above this one. The only
# visible assignment is `languages = gost['source_lang'].unique()` in the
# "Debug init_lipstick with GOST" cell further down, so on a fresh kernel this
# cell fails unless that assignment has been run first.
langs = make_lang_dic(languages)
langs

{'en': 'English', 'iw': 'Hebrew'}

# Process GOogle Saved Translations (GOST)

## Import GOST

In [58]:
# Load the saved Google translations (GOST). Column names are supplied through
# `names`; presumably the CSV has no header row of its own - TODO confirm.
gost_path = '../data/raw/googletranslate_csv/hebrew_db.csv'
gost = pd.read_csv(gost_path, names=['source_lang', 'target_lang', 'source_word', 'translation'])

In [59]:
# Display the raw GOST table.
gost

Unnamed: 0,source_lang,target_lang,source_word,translation
0,English,Hebrew,creature,יְצוּר
1,English,Hebrew,monster,מִפלֶצֶת
2,English,Hebrew,servant,מְשָׁרֵת
3,English,Hebrew,magician,קוֹסֵם
4,Hebrew,English,גיבור,a hero
...,...,...,...,...
290,Hebrew,English,ביקשת,you asked
291,Hebrew,English,להתקלח,shower
292,Hebrew,English,חָצוּף,insolent
293,Hebrew,English,מה נראה לך,What do you think


## Transform to GOTA

In [54]:
# Build the GOTA table with 'iw' as the learning language and 'en' as the user language.
gosta = gost2gota(gost, langs, 'iw', 'en')
gosta

Unnamed: 0,iw,en,creation_time
0,תַת קַרקַעִי,underground,1737562968
1,לצפות,expect,1737562968
2,לתלות,hang,1737562968
3,לבכות,cry,1737562968
4,לדחות,reject,1737562968
...,...,...,...
227,ביקשת,you asked,1737562968
228,להתקלח,shower,1737562968
229,חָצוּף,insolent,1737562968
230,מה נראה לך,What do you think,1737562968


In [55]:
# remove_nikud overwrites the 'iw' column of the frame it is given and returns
# that same frame, so gost_nonikud and gosta refer to one object after this cell.
gost_nonikud = remove_nikud(gosta)
gost_nonikud

Unnamed: 0,iw,en,creation_time
0,תת קרקעי,underground,1737562968
1,לצפות,expect,1737562968
2,לתלות,hang,1737562968
3,לבכות,cry,1737562968
4,לדחות,reject,1737562968
...,...,...,...
227,ביקשת,you asked,1737562968
228,להתקלח,shower,1737562968
229,חצוף,insolent,1737562968
230,מה נראה לך,What do you think,1737562968


## Fix Hebrew db nikud while keeping performance

In [56]:
# Keep the current row labels in an 'index' column, then key the table by the
# English word so it can be matched against the lipstick table.
# NOTE: the first line writes into the frame returned by remove_nikud(gosta),
# which is gosta itself. The cell is also not re-runnable: after the second
# line 'en' is the index, no longer a column, so a second set_index('en') fails.
gost_nonikud['index'] = gost_nonikud.index
gost_nonikud = gost_nonikud.set_index('en')
gost_nonikud

Unnamed: 0_level_0,iw,creation_time,index
en,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
underground,תת קרקעי,1737562968,0
expect,לצפות,1737562968,1
hang,לתלות,1737562968,2
cry,לבכות,1737562968,3
reject,לדחות,1737562968,4
...,...,...,...
you asked,ביקשת,1737562968,227
shower,להתקלח,1737562968,228
insolent,חצוף,1737562968,229
What do you think,מה נראה לך,1737562968,230


In [72]:
# Load a copy of the existing lipstick file.
# NOTE(review): absolute, machine-specific path; other cells use paths relative to '../data'.
lippath = '/Users/pabloherrero/Documents/ManHatTan/data/processed/LIPSTICK/hebrew_db copy.lip'
lip = pd.read_csv(lippath)
# Remember the original row position before re-keying the table.
lip['index'] = lip.index

# Key the table by the user-language word; drop=False keeps 'word_ul' as a column too.
lip = lip.set_index('word_ul', drop=False)
# lip['timestamp'] = lip['timestamp'].astype('int64')
# lip = lip.dropna()
lip

Unnamed: 0_level_0,p_recall,timestamp,delta,user_id,learning_language,ui_language,word_ll,word_ul,lexeme_string,history_seen,history_correct,session_seen,session_correct,p_pred,index
word_ul,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
subject,0.0,1736957029,0,pablo,iw,en,נושא,subject,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0,0,0
situation,0.0,1736957029,0,pablo,iw,en,המצב,situation,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0,0,1
stay,0.0,1737219013,261984,pablo,iw,en,להישאר,stay,lernt/lernen<vblex><pri><p3><sg>,1,0,0,0,0,2
hatred,0.0,1736957029,0,pablo,iw,en,שִׂנאָה,hatred,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0,0,3
Amazing,0.0,1736957029,0,pablo,iw,en,מדהים,Amazing,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
defense,1.0,1737216888,259859,pablo,iw,en,ההגנה,defense,lernt/lernen<vblex><pri><p3><sg>,1,1,0,0,0,227
roots,1.0,1737132998,175969,pablo,iw,en,שורשים,roots,lernt/lernen<vblex><pri><p3><sg>,1,1,0,0,0,228
tradition,1.0,1737217820,260791,pablo,iw,en,מָסוֹרֶת,tradition,lernt/lernen<vblex><pri><p3><sg>,1,1,0,0,0,229
expect,1.0,1737224651,267622,pablo,iw,en,לצפות,expect,lernt/lernen<vblex><pri><p3><sg>,1,1,0,0,0,230


In [59]:
# Show the rows whose word_ul index label repeats an earlier one.
# Nothing is removed here; an empty result means there are no duplicates.
lip[lip.index.duplicated()]

Unnamed: 0_level_0,p_recall,timestamp,delta,user_id,learning_language,ui_language,word_ll,word_ul,lexeme_string,history_seen,...,p_pred,mdt_history,mdt_correct,mrt_history,mrt_correct,wdt_history,wdt_correct,wrt_history,wrt_correct,index
word_ul,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


# Add index column (nid), and MDT, MRT, WDT, WRT performances

In [43]:
# Keep the first row for each English word on both sides.
# .copy() makes lipu an independent frame, so the assignments below (and the
# column additions in later cells) do not write into a slice of lip.
lipu = lip[~lip.index.duplicated()].copy()
gou = gost_nonikud[~gost_nonikud.index.duplicated()]

# Both frames are keyed by the English word, so the niqqud-free Hebrew words
# are matched on that key. Words of lipu that are absent from gou become NaN.
lipu.loc[:, 'word_ll'] = gou['iw']
# Restore the original row positions as the index.
lipu = lipu.set_index('index', drop=True)

lipu

Unnamed: 0_level_0,p_recall,timestamp,delta,user_id,learning_language,ui_language,word_ll,word_ul,lexeme_string,history_seen,history_correct,session_seen,session_correct,p_pred
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0.0,1736957029,0,pablo,iw,en,נושא,subject,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0,0
1,0.0,1736957029,0,pablo,iw,en,המצב,situation,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0,0
2,0.0,1737219013,261984,pablo,iw,en,להישאר,stay,lernt/lernen<vblex><pri><p3><sg>,1,0,0,0,0
3,0.0,1736957029,0,pablo,iw,en,שנאה,hatred,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0,0
4,0.0,1736957029,0,pablo,iw,en,מדהים,Amazing,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227,1.0,1737216888,259859,pablo,iw,en,ההגנה,defense,lernt/lernen<vblex><pri><p3><sg>,1,1,0,0,0
228,1.0,1737132998,175969,pablo,iw,en,שורשים,roots,lernt/lernen<vblex><pri><p3><sg>,1,1,0,0,0
229,1.0,1737217820,260791,pablo,iw,en,מסורת,tradition,lernt/lernen<vblex><pri><p3><sg>,1,1,0,0,0
230,1.0,1737224651,267622,pablo,iw,en,לצפות,expect,lernt/lernen<vblex><pri><p3><sg>,1,1,0,0,0


In [46]:

# Add a float 'history' and 'correct' counter for each practice mode, all starting at 0.
for _mode in ('mdt', 'mrt', 'wdt', 'wrt'):
    for _suffix in ('_history', '_correct'):
        lipu[_mode + _suffix] = 0.
lipu

Unnamed: 0_level_0,p_recall,timestamp,delta,user_id,learning_language,ui_language,word_ll,word_ul,lexeme_string,history_seen,...,session_correct,p_pred,mdt_history,mdt_correct,mrt_history,mrt_correct,wdt_history,wdt_correct,wrt_history,wrt_correct
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,1736957029,0,pablo,iw,en,נושא,subject,lernt/lernen<vblex><pri><p3><sg>,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1736957029,0,pablo,iw,en,המצב,situation,lernt/lernen<vblex><pri><p3><sg>,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1737219013,261984,pablo,iw,en,להישאר,stay,lernt/lernen<vblex><pri><p3><sg>,1,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1736957029,0,pablo,iw,en,שנאה,hatred,lernt/lernen<vblex><pri><p3><sg>,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1736957029,0,pablo,iw,en,מדהים,Amazing,lernt/lernen<vblex><pri><p3><sg>,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227,1.0,1737216888,259859,pablo,iw,en,ההגנה,defense,lernt/lernen<vblex><pri><p3><sg>,1,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
228,1.0,1737132998,175969,pablo,iw,en,שורשים,roots,lernt/lernen<vblex><pri><p3><sg>,1,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
229,1.0,1737217820,260791,pablo,iw,en,מסורת,tradition,lernt/lernen<vblex><pri><p3><sg>,1,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
230,1.0,1737224651,267622,pablo,iw,en,לצפות,expect,lernt/lernen<vblex><pri><p3><sg>,1,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [69]:

# Write the updated table over the main lipstick file; lippath is rebound to the same path.
# NOTE(review): index=False means the row labels (the 'index' values lipu was
# re-indexed by above) are not written to the file - confirm this is intended.
pathout = lippath = '/Users/pabloherrero/Documents/ManHatTan/data/processed/LIPSTICK/hebrew_db.lip'
lipu.to_csv(pathout, index=False)

# Add speed column

In [82]:
# Reload the main lipstick file and show its last rows.
lippath = '/Users/pabloherrero/Documents/ManHatTan/data/processed/LIPSTICK/hebrew_db.lip'
lip = pd.read_csv(lippath)

# lip = lip.set_index('word_ul', drop=False)
lip.tail()

Unnamed: 0,p_recall,n_id,timestamp,delta,user_id,learning_language,ui_language,word_ll,word_ul,lexeme_string,...,session_correct,p_pred,mdt_history,mdt_correct,mrt_history,mrt_correct,wdt_history,wdt_correct,wrt_history,wrt_correct
221,1.0,125,1737995298,1038269,pablo,iw,en,התחלה,beginning,lernt/lernen<vblex><pri><p3><sg>,...,0,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
222,1.0,214,1737224259,267230,pablo,iw,en,היחיד,the only,lernt/lernen<vblex><pri><p3><sg>,...,0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
223,1.0,112,1737995771,1038742,pablo,iw,en,לבכות,cry,lernt/lernen<vblex><pri><p3><sg>,...,0,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
224,1.0,210,1737650391,693362,pablo,iw,en,לסיים,to finish,lernt/lernen<vblex><pri><p3><sg>,...,0,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
225,1.0,225,1737132919,175890,pablo,iw,en,להבטיח,ensure,lernt/lernen<vblex><pri><p3><sg>,...,0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Cast the per-mode counters (<mode>_history / <mode>_correct) to 32-bit integers.
modes = ['mdt', 'mrt', 'wdt', 'wrt']
columns = []
for m in modes:
    columns.extend([m + '_history', m + '_correct'])
lip[columns] = lip[columns].astype('int32')

In [101]:
# Overwrite the model prediction column with 0 for every row.
lip['p_pred'] = 0

In [103]:
# Set the 'speed' column to 0 for every row.
lip['speed'] = 0

In [104]:
# Write the table back to the main lipstick file, without the row index.
pathout = lippath = '/Users/pabloherrero/Documents/ManHatTan/data/processed/LIPSTICK/hebrew_db.lip'
lip.to_csv(pathout, index=False)

# Add proper n_id column

In [12]:
# Reload the main lipstick file and show its first rows.
lippath = '/Users/pabloherrero/Documents/ManHatTan/data/processed/LIPSTICK/hebrew_db.lip'
lip = pd.read_csv(lippath)

# lip = lip.set_index('word_ul', drop=False)
lip.head()

Unnamed: 0,p_recall,n_id,timestamp,delta,user_id,learning_language,ui_language,word_ll,word_ul,lexeme_string,...,mdt_history,mdt_correct,mrt_history,mrt_correct,wdt_history,wdt_correct,wrt_history,wrt_correct,speed,rebag
0,1.0,68,1738413429,52,pablo,iw,en,מקרה,case,lernt/lernen<vblex><pri><p3><sg>,...,0,0,2,2,3,3,0,0,0.181,True
1,0.625,150,1738413377,0,pablo,iw,en,מברשת שנים,toothbrush,lernt/lernen<vblex><pri><p3><sg>,...,3,2,1,1,1,1,3,1,0.2779,True
2,1.0,95,1738424677,11300,pablo,iw,en,עכביש,spider,lernt/lernen<vblex><pri><p3><sg>,...,1,1,2,2,0,0,2,2,0.1468,True
3,0.833333,27,1738413434,57,pablo,iw,en,ענין,matter,lernt/lernen<vblex><pri><p3><sg>,...,1,1,2,2,2,2,1,0,0.2385,True
4,0.833333,162,1738427203,13826,pablo,iw,en,יחסית,relatively,lernt/lernen<vblex><pri><p3><sg>,...,2,2,1,1,1,1,2,1,0.2108,True


In [47]:
pathnid = '/Users/pabloherrero/Documents/ManHatTan/gui/Graphics/index_stage_0.csv'
# `lenlip` was not assigned anywhere in this notebook, so this cell raised
# NameError on a fresh kernel. It only caps how many ids are applied to lip,
# so it is taken to be the number of rows of lip.
# NOTE(review): confirm this is the intended value.
lenlip = len(lip)
# First column of the CSV as a 1-D array, cut to at most `lenlip` ids.
stage0_nids = pd.read_csv(pathnid, index_col=None).T.values[0][:lenlip]

In [49]:
import random
from random import shuffle

# Seed the generator so that Restart & Run All yields the same id order every
# time; without it each run assigns different n_id values to the same words.
random.seed(0)
# Shuffles the array in place.
shuffle(stage0_nids)

In [50]:
# Display the shuffled ids.
stage0_nids

array([303, 363, 359, 410, 123, 307, 100, 140, 137, 167,  27, 337, 438,
       434, 283, 384,  23, 386, 406,  66, 369,  96, 327, 309, 109, 338,
        56,  86, 240, 300, 108, 200, 179, 415, 287, 131,  52,  84, 138,
       293, 198, 218, 314, 377, 236, 225, 265, 353,  48, 316, 273, 228,
       379,  13, 129,  79, 331, 201, 145,  72,  77, 114, 276, 425, 261,
        21, 378, 436, 347, 355,  69, 352, 401, 318, 422,  58, 231, 441,
       427, 220, 320, 102, 206, 302, 439, 234, 133, 351, 258,  74, 431,
        46,  43,  83, 440, 339, 412, 357, 227,   1, 172, 393, 387,  29,
       243, 417, 270, 175,  16, 222, 278, 403, 299, 296, 324, 285, 442,
       396, 241, 235,  41, 381, 150, 161, 187, 211, 382, 333, 216, 374,
       170, 128, 312, 420, 142, 371, 173, 345, 158, 433, 144, 251,  60,
       383, 203, 191, 418, 111, 250, 214, 190, 246, 325, 298, 290, 245,
       313,  54, 174,  19, 328,  81, 152, 147,  98, 151, 311, 263, 446,
       249, 223, 193, 204, 163, 341, 361, 215, 120, 370, 207, 15

In [37]:
# Update a private copy of the column and assign it back. Calling .update() on
# `lip.n_id` directly modifies a temporary Series, which under pandas
# Copy-on-Write never reaches `lip`.
n_id_updated = lip['n_id'].copy()
n_id_updated.update(stage0_nids[:lenlip])
lip['n_id'] = n_id_updated

In [46]:
# Write the table back to the path it was loaded from, without the row index.
lip.to_csv(lippath, index=False)

# Debug init_lipstick with GOST

In [66]:
ll = 'iw'
ul = 'en'
gost_path = '/Users/pabloherrero/Documents/ManHatTan/data/raw/googletranslate_csv/hebrew_db.csv'

gost = pd.read_csv(gost_path, names=['source_lang', 'target_lang', 'source_word', 'translation'])
# Long language names present in the file, e.g. 'English', 'Hebrew'.
languages = gost['source_lang'].unique()
langs = make_lang_dic(languages)

# Pass the ll / ul variables defined above instead of repeating the literals,
# so changing the language pair only needs one edit.
gosta = gost2gota(gost, langs, ll, ul)

gosta = remove_nikud(gosta)

## Refactor: this is basically init_lipstick

new_lip = set_lip(gosta)
new_lip.head()

Unnamed: 0,p_recall,timestamp,delta,user_id,learning_language,ui_language,word_ll,word_ul,lexeme_string,history_seen,history_correct,session_seen,session_correct
0,0,1738927615,0,pablo,iw,en,יצור,creature,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0
1,0,1738927615,0,pablo,iw,en,מפלצת,monster,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0
2,0,1738927615,0,pablo,iw,en,משרת,servant,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0
3,0,1738927615,0,pablo,iw,en,קוסם,magician,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0
4,0,1738927615,0,pablo,iw,en,גיבור,a hero,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0


In [68]:
# The original assigned the path to a misspelled `lipath` and then read from
# `lippath`, which only worked because an earlier cell had set it to this path.
lippath = '/Users/pabloherrero/Documents/ManHatTan/data/processed/LIPSTICK/hebrew_db.lip'
current_lip = pd.read_csv(lippath)
current_lip.head()

Unnamed: 0,p_recall,n_id,timestamp,delta,user_id,learning_language,ui_language,word_ll,word_ul,lexeme_string,...,mdt_history,mdt_correct,mrt_history,mrt_correct,wdt_history,wdt_correct,wrt_history,wrt_correct,speed,rebag
0,1.0,213,1738413429,52,pablo,iw,en,מקרה,case,lernt/lernen<vblex><pri><p3><sg>,...,0,0,2,2,3,3,0,0,0.181,True
1,0.625,236,1738413377,0,pablo,iw,en,מברשת שנים,toothbrush,lernt/lernen<vblex><pri><p3><sg>,...,3,2,1,1,1,1,3,1,0.2779,True
2,1.0,63,1738424677,11300,pablo,iw,en,עכביש,spider,lernt/lernen<vblex><pri><p3><sg>,...,1,1,2,2,0,0,2,2,0.1468,True
3,0.8333,191,1738413434,57,pablo,iw,en,ענין,matter,lernt/lernen<vblex><pri><p3><sg>,...,1,1,2,2,2,2,1,0,0.2385,True
4,0.8333,100,1738427203,13826,pablo,iw,en,יחסית,relatively,lernt/lernen<vblex><pri><p3><sg>,...,2,2,1,1,1,1,2,1,0.2108,True


In [102]:
# True for each row of new_lip whose user-language word already appears in current_lip.
entry = new_lip.word_ul.isin(current_lip.word_ul)

In [103]:
# Display the membership mask.
entry

0      False
1      False
2      False
3      False
4      False
       ...  
290     True
291     True
292     True
293     True
294     True
Name: word_ul, Length: 295, dtype: bool

In [95]:
# Display the learning-language words currently in the lipstick table.
current_lip.word_ll.values

array(['מקרה', 'מברשת שנים', 'עכביש', 'ענין', 'יחסית', 'להתבלבל',
       'חומרי בניין', 'להחליט', 'מטומטם', 'להתאחד', 'נסיך', 'תפקיד',
       'עמדה', 'להצטרף', 'מזרח', 'התרעה', 'חוצפה', 'מספריים', 'כונה',
       'להגיע', 'מסך', 'ציון', 'מלגה', 'מגבת', 'להשתכר', 'אני סולח',
       'להתעמל', 'חתימה', 'מטריה', 'להתחנן', 'להקשיב', 'מעבדה', 'סכנה',
       'רמה', 'משקפי שמש', 'אני מלטף', 'שונא', 'להתקלח', 'מערב', 'משמעות',
       'שנאה', 'נרגילה', 'מודע', 'רכבת', 'מעיר', 'גלגל', 'חמוד', 'מערכת',
       'אני מטלפן', 'סל', 'קול', 'בעדיפות', 'רשות', 'מסתיים', 'מחקר',
       'טעם', 'להתנהג', 'אישי', 'לתלות', 'לזרום', 'השפעה', 'השלכות',
       'אנחנו מנצחים', 'חצוף', 'נופים', 'סמל', 'מסיים', 'נפלא', 'טיפש',
       'מרשימים', 'מבנה', 'נדיר', 'בעיקר', 'צבא', 'רטוב', 'להתלונן',
       'תגובה', 'אבקה', 'אל', 'לפחות', 'מטפל', 'הרצאה', 'השקפה', 'רחב',
       'גישה', 'ספק', 'חטוף', 'סוג', 'בודד', 'מיד', 'מעצבן', 'כישלון',
       'צות', 'חכם', 'מדהים', 'נקודת מבט', 'שיחה', 'טרי', 'הזדמנות',
       'פרס',

In [108]:
# Rows of new_lip whose user-language word is not yet in current_lip.
is_new_entry = ~new_lip.word_ul.isin(current_lip.word_ul)
# Index labels of those rows, kept under the original name.
newEntries = new_lip.index[is_new_entry].tolist()

# Select with the boolean mask. The original passed these index labels to
# .iloc, which is positional and only matched by accident for a 0..n-1 index.
current_lip = pd.concat([current_lip, new_lip[is_new_entry]])
current_lip

Unnamed: 0,p_recall,n_id,timestamp,delta,user_id,learning_language,ui_language,word_ll,word_ul,lexeme_string,...,mdt_history,mdt_correct,mrt_history,mrt_correct,wdt_history,wdt_correct,wrt_history,wrt_correct,speed,rebag
0,1.0000,213.0,1738413429,52,pablo,iw,en,מקרה,case,lernt/lernen<vblex><pri><p3><sg>,...,0.0,0.0,2.0,2.0,3.0,3.0,0.0,0.0,0.1810,True
1,0.6250,236.0,1738413377,0,pablo,iw,en,מברשת שנים,toothbrush,lernt/lernen<vblex><pri><p3><sg>,...,3.0,2.0,1.0,1.0,1.0,1.0,3.0,1.0,0.2779,True
2,1.0000,63.0,1738424677,11300,pablo,iw,en,עכביש,spider,lernt/lernen<vblex><pri><p3><sg>,...,1.0,1.0,2.0,2.0,0.0,0.0,2.0,2.0,0.1468,True
3,0.8333,191.0,1738413434,57,pablo,iw,en,ענין,matter,lernt/lernen<vblex><pri><p3><sg>,...,1.0,1.0,2.0,2.0,2.0,2.0,1.0,0.0,0.2385,True
4,0.8333,100.0,1738427203,13826,pablo,iw,en,יחסית,relatively,lernt/lernen<vblex><pri><p3><sg>,...,2.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,0.2108,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,0.0000,,1738927615,0,pablo,iw,en,מיקרופון,microphone,lernt/lernen<vblex><pri><p3><sg>,...,,,,,,,,,,
60,0.0000,,1738927615,0,pablo,iw,en,מתופף,drummer,lernt/lernen<vblex><pri><p3><sg>,...,,,,,,,,,,
61,0.0000,,1738927615,0,pablo,iw,en,להקה,band,lernt/lernen<vblex><pri><p3><sg>,...,,,,,,,,,,
62,0.0000,,1738927615,0,pablo,iw,en,כינור,violin,lernt/lernen<vblex><pri><p3><sg>,...,,,,,,,,,,


Unnamed: 0,p_recall,timestamp,delta,user_id,learning_language,ui_language,word_ll,word_ul,lexeme_string,history_seen,history_correct,session_seen,session_correct
0,0,1738927615,0,pablo,iw,en,יצור,creature,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0
1,0,1738927615,0,pablo,iw,en,מפלצת,monster,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0
2,0,1738927615,0,pablo,iw,en,משרת,servant,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0
3,0,1738927615,0,pablo,iw,en,קוסם,magician,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0
4,0,1738927615,0,pablo,iw,en,גיבור,a hero,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,0,1738927615,0,pablo,iw,en,מיקרופון,microphone,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0
60,0,1738927615,0,pablo,iw,en,מתופף,drummer,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0
61,0,1738927615,0,pablo,iw,en,להקה,band,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0
62,0,1738927615,0,pablo,iw,en,כינור,violin,lernt/lernen<vblex><pri><p3><sg>,0,0,0,0


In [110]:
def add_new_gota_terms(new_lip: pd.DataFrame, current_lip: pd.DataFrame):
    """Include updated terms from GOTA respecting the practiced ones already present.

    Parameters:
        new_lip: freshly initialised lipstick table; must have a 'word_ul' column.
        current_lip: existing lipstick table; must have a 'word_ul' column.
    Returns:
        A new DataFrame: all rows of current_lip, unchanged, followed by the
        rows of new_lip whose 'word_ul' does not occur in current_lip.
        Appended rows keep their index labels from new_lip, and columns that
        exist in only one of the tables are filled with NaN by pd.concat.
    """
    # Boolean mask instead of collected labels: the original fed index labels
    # to the positional .iloc, which breaks for any index other than 0..n-1.
    is_new = ~new_lip['word_ul'].isin(current_lip['word_ul'])
    return pd.concat([current_lip, new_lip[is_new]])
