In [1]:
from pathlib import Path
from combined import run_fa, run_match_words, combine_df
import os
from match import normalize_word
import pandas as pd
from tqdm import tqdm
from machine.tokenization import LatinWordTokenizer
from typing import Iterable, Tuple, List
import re
tqdm.pandas()
from collections import Counter

pd.set_option('display.max_rows', 500)

In [15]:
# reverse_model_df = pd.read_csv('pfs/out/hbo-MaculaHebTok_en-NIV84_combined/hbo-MaculaHebTok_en-NIV84_combined.csv')
bible_df = pd.read_parquet('../../../../new2old/data/bible_words.parquet')
# all_df = pd.read_csv('pfs/out/en-NIV84_hbo-MaculaHebTok_align/in_context.csv')

In [83]:
# ref_trans = 'NIV84'
# ref_lang = 'en'
# gold_data_col = 'NIV84'
ref_lang = 'swh'
# ref_trans = 'ONEN'
ref_trans = 'SRUV06'
gold_data_col = 'SRUV06'
ref_trans_file = f'pfs/text_data/{ref_lang}-{ref_trans}.txt'
model_df = pd.read_csv(f'pfs/out/{ref_lang}-{ref_trans}_hbo-MaculaHebTok_combined/{ref_lang}-{ref_trans}_hbo-MaculaHebTok_combined.csv')

In [3]:
vref_file = 'pfs/ref_data/vref.txt'
with open(vref_file) as f:
    vref_data = f.readlines()

In [4]:
def text_to_words(text: str) -> List[str]:
    """
    Inputs:
        text:   Normally a sentence, or Bible verse
    Outputs:
        A list of words, where the sentence has had its punctuation removed, and words splits into a list of words
    """
    # word_list = re.sub("[^\w\s]", "", text.lower()).split()
    # word_list = text.lower().replace('.', '').replace(',', '').replace('?', '').replace('!', '').split()
    word_tokenizer = LatinWordTokenizer()
    word_list = [normalize_word(word) for word in word_tokenizer.tokenize(text)]
    return word_list


def vref_to_df(file):
    with open(file, 'r') as f:
        bible_data = f.readlines()
    words = [text_to_words(line) for line in bible_data]
    normalized_words = [[normalize_word(word) if word else '' for word in word_list] if word_list else [] for word_list in words]
    df = pd.DataFrame({'text': bible_data, 'words': words, 'normalized_words': normalized_words})
    df = df[df['text'].apply(lambda x: len(x) > 2)]
    df = df[df['text'] != "b'\n'"]
    return df

In [84]:
ref_trans_df = vref_to_df(ref_trans_file)

In [6]:
gold_data = pd.read_parquet('../../../../new2old/data/gold_translation_data.parquet')

In [71]:
gold_data.columns

Index(['index', 'book', 'chapter', 'verse', 'word', 'subword', 'id', 'marble',
       'ot-nt', 'unicode', 'text', 'lemma', 'normalized', 'english', 'greek',
       'hebrew', 'mandarin', 'domain_codes', 'domains', 'sdbh_codes',
       'top_domains', 'BHS', 'BSB', 'CU2010S', 'CU2010T', 'ESVUK16', 'GNTUK',
       'HCB', 'KJV', 'LSG10', 'NBS11', 'NIV11', 'NIV84', 'NRS89', 'RA293',
       'RSO94', 'RSV52', 'RVR60', 'SRUV06', 'TBI', 'TCV95S', 'TPB08'],
      dtype='object')

In [85]:
model_df.loc[:, 'total_score'] = model_df.progress_apply(lambda row: row['FA_translation_score'] + 4 * row['avg_aligned'] + 5 * row['jac_sim'], axis=1)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4224650/4224650 [01:29<00:00, 47030.72it/s]


In [7]:
domains = ['003001001', '003001002','003001004','003001006', '003001007','003001008','003001009','003001010', '003001012' ,'003001013' , '003001014', '003001015' , '003001017']
names_df = bible_df[bible_df['domain_codes'].apply(lambda x: len(set(domains).intersection(set(x))) > 0)]
names_df.loc[:, 'english'] = names_df['english'].apply(lambda x: x.lower() if x is not None else x)
# print(names)
english_names = names_df['english'].unique()
english_names = [name.lower() for name in english_names if name is not None]
hebrew_names = names_df['text'].unique()
hebrew_names = [name.lower() for name in hebrew_names if name is not None]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  names_df.loc[:, 'english'] = names_df['english'].apply(lambda x: x.lower() if x is not None else x)


In [9]:
hebrew_matched_names = list(set(hebrew_names).intersection(model_df[model_df['jac_sim'] > 0]['target'].unique()))
english_matched_names = list(set(english_names).intersection(model_df[model_df['jac_sim'] > 0]['source'].unique()))


In [10]:
print(len(hebrew_names))
print(len(hebrew_matched_names))
print(len(english_names))
print(len(english_matched_names))

9744
9498
2649
2227


In [11]:
names = names_df[names_df['text'].apply(lambda x: x in hebrew_names)]

In [12]:
names = names[gold_data[f'{ref_lang}-{ref_trans}'].apply(lambda x: x is not None)]

  names = names[gold_data['NIV84'].apply(lambda x: x is not None)]


In [86]:
max_rows = -1
counter = Counter()
for index, row in names.iloc[:max_rows, :].iterrows():
    hebrew_source = row['text']
    hebrew_source_normalized = normalize_word(hebrew_source)    
    book = row['book']
    chapter = row['chapter']
    verse = row['verse']
    ref = f'{book} {chapter}:{verse}\n'
    vref_index = vref_data.index(ref)
    candidate_words = set(ref_trans_df.loc[vref_index, 'words'])  if vref_index in ref_trans_df.index else set([])
    candidate_words = [word.lower().split("ʼ")[0] for word in candidate_words if word is not None]

    candidate_df = model_df[model_df['normalized_target'] == hebrew_source_normalized]
    candidate_df = candidate_df[candidate_df['normalized_source'].apply(lambda x: x in list(candidate_words))]
    
    if candidate_df.shape[0] > 0:
        pred_index = candidate_df['total_score'].idxmax()
        score = candidate_df.loc[pred_index, 'total_score']
        pred = candidate_df.loc[pred_index, 'source']
    else:
        pred = None
    actual = gold_data.loc[index, gold_data_col] if index in gold_data.index else None  # .values[0]
    actual = actual.lower() if actual else ''
    if len(actual.split()) > 1:
        counter['multiple words'] += 1
    elif pred == None:
        counter['no prediction'] += 1
    else:
        correct = pred == actual or pred == actual.split("ʼ")[0] or actual == pred.split("ʼ")[0] 
        counter.update([correct]) 
    if not correct:
        print(index)
        print(f"actual: {actual}   pred: {pred}")
#         print(candidate_df)
        print(counter)

1670
actual: hawa   pred: ndiye
Counter({True: 25, 'multiple words': 6, False: 1})
1779
actual: hawa   pred: akamjua
Counter({True: 30, 'multiple words': 7, False: 2})
5118
actual: sidoni   pred: ulianza
Counter({True: 246, 'multiple words': 42, False: 3})
6177
actual: wamisri   pred: misri
Counter({True: 380, 'multiple words': 57, False: 4})
6776
actual: hamu   pred: wazuzi
Counter({True: 441, 'multiple words': 72, False: 5})
6837
actual: seboimu   pred: adma
Counter({True: 450, 'multiple words': 74, False: 6})
7211
actual: mungu   pred: ee
Counter({True: 496, 'multiple words': 81, False: 7})
7213
actual: ee bwana   pred: bwana
Counter({True: 496, 'multiple words': 82, False: 7})
7339
actual: mungu   pred: bwana
Counter({True: 503, 'multiple words': 84, False: 8})
8910
actual: sodoma   pred: wakaelekea
Counter({True: 609, 'multiple words': 94, False: 9})
10066
actual: wamoabi   pred: moabu
Counter({True: 653, 'multiple words': 98, False: 10})
10086
actual: wa waamoni   pred: waamoni
C

41278
actual: waisraeli   pred: israeli
Counter({True: 2682, 'multiple words': 632, False: 66})
41574
actual: bwana   pred: ee
Counter({True: 2695, 'multiple words': 643, False: 67})
41708
actual: mara   pred: machungu
Counter({True: 2703, 'multiple words': 648, False: 68})
46225
actual: mungu   pred: bwana
Counter({True: 2852, 'multiple words': 731, False: 69})
46247
actual: ya bwana   pred: bwana
Counter({True: 2852, 'multiple words': 732, False: 69})
54248
actual:    pred: bwana
Counter({True: 3004, 'multiple words': 899, False: 70})
59932
actual:    pred: bwana
Counter({True: 3091, 'multiple words': 979, False: 71})
79148
actual: walawi   pred: ni
Counter({True: 3554, 'multiple words': 1255, False: 72})
79743
actual:    pred: bwana
Counter({True: 3588, 'multiple words': 1285, False: 73})
80141
actual:    pred: bwana
Counter({True: 3597, 'multiple words': 1285, False: 74})
80209
actual:    pred: bwana
Counter({True: 3602, 'multiple words': 1285, False: 75})
80218
actual:    pred: ge

90702
actual: wa israeli   pred: israeli
Counter({True: 4117, 'multiple words': 1434, False: 151})
91792
actual: wamisri   pred: misri
Counter({True: 4143, 'multiple words': 1463, False: 152})
92245
actual:    pred: wakafa
Counter({True: 4178, 'multiple words': 1481, False: 153})
92258
actual: mungu   pred: bwana
Counter({True: 4179, 'multiple words': 1481, False: 154})
93136
actual:    pred: wakuu
Counter({True: 4246, 'multiple words': 1534, False: 155})
95294
actual: wakarmi   pred: karmi
Counter({True: 4369, 'multiple words': 1621, False: 156})
95436
actual: wahagi   pred: hagi
Counter({True: 4390, 'multiple words': 1631, False: 157})
95441
actual: washuni   pred: shuni
Counter({True: 4391, 'multiple words': 1631, False: 158})
95446
actual: waezboni   pred: ezboni
Counter({True: 4392, 'multiple words': 1631, False: 159})
95451
actual: waeri   pred: eri
Counter({True: 4393, 'multiple words': 1631, False: 160})
95456
actual: waarodi   pred: arodi
Counter({True: 4394, 'multiple words':

KeyboardInterrupt: 

In [81]:
index = 0
gold_data.loc[index, gold_data_col]

'Hapo mwanzo'

In [112]:
line = 100742

In [113]:
gold_data.iloc[line-8: line+8, [0, 1, 2, 3, 4, 5, 10,11, 38
                               ]]

Unnamed: 0,index,book,chapter,verse,word,subword,text,lemma,SRUV06
100734,100734,NUM,33,36,8,1,קָדֵֽשׁ,6946,ni Kadeshi
100735,100735,NUM,33,37,1,1,וַ,c,Wakasafiri
100736,100736,NUM,33,37,1,2,יִּסְע֖וּ,5265,Wakasafiri
100737,100737,NUM,33,37,2,1,מִ,m,kutoka
100738,100738,NUM,33,37,2,2,קָּדֵ֑שׁ,6946,Kadeshi
100739,100739,NUM,33,37,3,1,וַֽ,c,wakapiga kambi
100740,100740,NUM,33,37,3,2,יַּחֲנוּ֙,2583,wakapiga kambi
100741,100741,NUM,33,37,4,1,בְּ,b,wa
100742,100742,NUM,33,37,4,2,הֹ֣ר,2023,Hori
100743,100743,NUM,33,37,5,1,הָ,d,katika


In [114]:
word = bible_df.loc[line, ['text']].values[0][0]
hebrew_source_normalized = normalize_word(word)
book = bible_df.loc[line, ['book']].values[0][0]
chapter = bible_df.loc[line, ['chapter']].values[0][0]
verse = bible_df.loc[line, ['verse']].values[0][0]
ref = f'{book} {chapter}:{verse}\n'
vref_index = vref_data.index(ref)

def normalize_word(word):
    return re.sub("[^\w\s]", "", word.lower()) if word else ''

candidate_words = set(ref_trans_df.loc[vref_index, 'words']) if vref_index in ref_trans_df.index else set([])
candidate_words = [word.lower().split("ʼ")[0] for word in candidate_words if word is not None]
candidate_words

['',
 'hori',
 'wakapiga',
 'edomu',
 'kutoka',
 'wakasafiri',
 'katika',
 'ya',
 'mlima',
 'nchi',
 'kadeshi',
 'mpaka',
 'wa',
 'kambi']

In [115]:
vref_index

4798

In [116]:
candidate_df = model_df[model_df['normalized_target'] == hebrew_source_normalized]
candidate_df

Unnamed: 0.1,Unnamed: 0,source,target,co-occurrences,FA_translation_score,FA_align_count,alignment_count,FA_verse_score,avg_aligned,verse score,normalized_source,normalized_target,jac_sim,match_counts,total_score
863,863,aamkapo,הַ֥ר,2,0.0,0.0,0.0,,0.0,0.0,aamkapo,הר,0.0,0,0.0
1565,1565,abarimu,הַ֥ר,1,0.0,0.0,0.0,,0.0,0.0,abarimu,הר,0.0,0,0.0
1566,1566,abarimu,הַר,1,0.0,0.0,0.0,,0.0,0.0,abarimu,הר,0.0,0,0.0
1567,1567,abarimu,הַר֩,1,0.0,0.0,0.0,,0.0,0.0,abarimu,הר,0.0,0,0.0
1737,1737,abdoni,הַ֖ר,1,0.0,0.0,0.0,,0.0,0.0,abdoni,הר,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4219126,4219126,zote,הָ֜ר,1,0.0,0.0,0.0,,0.0,0.0,zote,הר,0.0,0,0.0
4219178,4219178,zote,הָר,1,0.0,0.0,0.0,,0.0,0.0,zote,הר,0.0,0,0.0
4219179,4219179,zote,הָר֙,1,0.0,0.0,0.0,,0.0,0.0,zote,הר,0.0,0,0.0
4219185,4219185,zote,הָרַ֖,1,0.0,0.0,0.0,,0.0,0.0,zote,הר,0.0,0,0.0


In [117]:
candidate_df[candidate_df.apply(lambda x: x['normalized_source'] in list(candidate_words), axis=1)].sort_values('total_score', ascending=False)


Unnamed: 0.1,Unnamed: 0,source,target,co-occurrences,FA_translation_score,FA_align_count,alignment_count,FA_verse_score,avg_aligned,verse score,normalized_source,normalized_target,jac_sim,match_counts,total_score
2113778,2113778,mlima,הַר֙,3,0.797552,3.0,1.0,0.311311,1.0,0.311311,mlima,הר,0.576503,211,7.680066
2113761,2113761,mlima,הַ֖ר,5,0.775821,5.0,1.0,0.398463,1.0,0.398463,mlima,הר,0.576503,211,7.658335
2113770,2113770,mlima,הַֽר,2,0.73505,2.0,1.0,0.315051,1.0,0.315051,mlima,הר,0.576503,211,7.617564
2113797,2113797,mlima,הָ֣ר,5,0.388119,5.0,1.0,0.247347,1.0,0.247347,mlima,הר,0.576503,211,7.270633
2113840,2113840,mlima,הֹ֥ר,5,0.328258,5.0,1.0,0.280039,1.0,0.280039,mlima,הר,0.576503,211,7.210772
2113790,2113790,mlima,הָ֗ר,4,0.168721,4.0,1.0,0.2449,1.0,0.2449,mlima,הר,0.576503,211,7.051234
2113765,2113765,mlima,הַ֤ר,9,0.827329,7.0,1.0,0.306713,0.777778,0.306713,mlima,הר,0.576503,211,6.820954
2113767,2113767,mlima,הַ֥ר,52,0.477814,42.0,1.0,0.308502,0.807692,0.308502,mlima,הר,0.576503,211,6.591097
2113764,2113764,mlima,הַ֣ר,58,0.416422,47.0,1.0,0.31928,0.810345,0.31928,mlima,הר,0.576503,211,6.540315
2113838,2113838,mlima,הֹ֣ר,6,0.246696,5.0,1.0,0.294278,0.833333,0.294278,mlima,הר,0.576503,211,6.462543


In [796]:
word

'לוּדִ֧ים'

In [603]:
df[df['source'] == 'shemeber']

Unnamed: 0.1,Unnamed: 0,source,target,align_count,word score,verse score,normalized_source,normalized_target,jac_sim,match_counts,total_score
80989,80989,shemeber,אַדְמָ֗ה,1,0.78108,0.330563,shemeber,אדמה,-1.0,0,-0.78108


Number of matched names:

In [633]:
vref_index

464

In [243]:
df[df['target'] == 'עֵ֖דֶן']

Unnamed: 0.1,Unnamed: 0,source,target,align_count,word score,verse score,normalized_source,normalized_target,jac_sim,match_counts,total_score
26110,26110,eden,עֵ֖דֶן,2,0.799089,0.322273,eden,עדן,0.818182,18,3.79264
