In [754]:
from pathlib import Path
from combined import run_fa, run_match_words, combine_df
import os
from match import normalize_word
import pandas as pd
from tqdm import tqdm
from machine.tokenization import LatinWordTokenizer
from typing import Iterable, Tuple, List
import re
tqdm.pandas()
from collections import Counter

pd.set_option('display.max_rows', 500)

In [755]:
model_df = pd.read_csv('pfs/out/en-NIV84_hbo-MaculaHebTok_combined/en-NIV84_hbo-MaculaHebTok_combined.csv')
# reverse_model_df = pd.read_csv('pfs/out/hbo-MaculaHebTok_en-NIV84_combined/hbo-MaculaHebTok_en-NIV84_combined.csv')
bible_df = pd.read_parquet('../../../../new2old/data/bible_words.parquet')
all_df = pd.read_csv('pfs/out/en-NIV84_hbo-MaculaHebTok_align/in_context.csv')

In [756]:
vref_file = 'pfs/ref_data/vref.txt'
with open(vref_file) as f:
    vref_data = f.readlines()

In [757]:
def text_to_words(text: str) -> List[str]:
    """
    Inputs:
        text:   Normally a sentence, or Bible verse
    Outputs:
        A list of words, where the sentence has had its punctuation removed, and words splits into a list of words
    """
    # word_list = re.sub("[^\w\s]", "", text.lower()).split()
    # word_list = text.lower().replace('.', '').replace(',', '').replace('?', '').replace('!', '').split()
    word_tokenizer = LatinWordTokenizer()
    word_list = [normalize_word(word) for word in word_tokenizer.tokenize(text)]
    return word_list


def vref_to_df(file):
    with open(file, 'r') as f:
        bible_data = f.readlines()
    words = [text_to_words(line) for line in bible_data]
    normalized_words = [[normalize_word(word) if word else '' for word in word_list] if word_list else [] for word_list in words]
    df = pd.DataFrame({'text': bible_data, 'words': words, 'normalized_words': normalized_words})
    df = df[df['text'].apply(lambda x: len(x) > 2)]
    df = df[df['text'] != "b'\n'"]
    return df

In [758]:
NIV84_df = vref_to_df('pfs/text_data/en-NIV84.txt')

In [759]:
gold_data = pd.read_parquet('../../../../new2old/data/gold_translation_data.parquet')

In [760]:
model_df.loc[:, 'total_score'] = model_df.apply(lambda row: row['word score'] + 5 * row['jac_sim'], axis=1)

In [761]:
domains = ['003001001', '003001002','003001004','003001006', '003001007','003001008','003001009','003001010', '003001012' ,'003001013' , '003001014', '003001015' , '003001017']
names_df = bible_df[bible_df['domain_codes'].apply(lambda x: len(set(domains).intersection(set(x))) > 0)]
names_df.loc[:, 'english'] = names_df['english'].apply(lambda x: x.lower() if x is not None else x)
# print(names)
english_names = names_df['english'].unique()
english_names = [name.lower() for name in english_names if name is not None]
hebrew_names = names_df['text'].unique()
hebrew_names = [name.lower() for name in hebrew_names if name is not None]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  names_df.loc[:, 'english'] = names_df['english'].apply(lambda x: x.lower() if x is not None else x)


In [762]:
hebrew_matched_names = list(set(hebrew_names).intersection(df[df['jac_sim'] > 0]['target'].unique()))
english_matched_names = list(set(english_names).intersection(df[df['jac_sim'] > 0]['source'].unique()))


In [763]:
print(len(hebrew_names))
print(len(hebrew_matched_names))
print(len(english_names))
print(len(english_matched_names))

9744
7797
2649
2155


In [764]:
names = names_df[names_df['text'].apply(lambda x: x in hebrew_names)]
# print(names_df.iloc[:, :14][names_df['english'].apply(lambda x: x in english_matched_names)])

In [765]:
names = names[gold_data['NIV84'].apply(lambda x: x is not None)]

  names = names[gold_data['NIV84'].apply(lambda x: x is not None)]


In [766]:
def get_best_match_id(word, candidate_df):
    if candidate_df[candidate_df.apply(lambda x: x['normalized_source'] in list(candidate_words), axis=1)].shape[0] > 0:
        return candidate_df[candidate_df.apply(lambda x: x['normalized_source'] in list(candidate_words), axis=1)]['total_score'].idxmax()
    else:
        return None

In [767]:
max_rows = -1
counter = Counter()
for index, row in names.iloc[:max_rows, :].iterrows():
    hebrew_source = row['text']
    hebrew_source_normalized = normalize_word(hebrew_source)    
    book = row['book']
    chapter = row['chapter']
    verse = row['verse']
#     word = row['word']
#     subword = row['subword']
    ref = f'{book} {chapter}:{verse}\n'
    vref_index = vref_data.index(ref)
#     print(ref)
#     print(vref_index)
#     candidate_words = set(gold_data[(gold_data['book'] == book) & (gold_data['chapter'] == chapter) & (gold_data['verse'] == verse)]['NIV84'])
#     candidate_words = [word.lower() for word in candidate_words if word is not None]
    candidate_words = set(NIV84_df.loc[vref_index, 'words'])  if vref_index in NIV84_df.index else set([])
    candidate_words = [word.lower().split("ʼ")[0] for word in candidate_words if word is not None]

    candidate_df = model_df[model_df['normalized_target'] == hebrew_source_normalized]
#     reverse_candidate_df = reverse_model_df[reverse_model_df['normalized_source'] == hebrew_source_normalized]
    
    pred_index = get_best_match_id(hebrew_source_normalized, candidate_df)
#     reverse_pred_index = get_best_match_id(hebrew_source_normalized, reverse_candidate_df)
#     score = -100
#     reverse_score = -100
#     pred = None
#     reverse_pred = None
    
    if pred_index:
        score = candidate_df.loc[pred_index, 'total_score']
        pred = candidate_df.loc[pred_index, 'source']
        
#     if reverse_pred_index:
#         reverse_score = reverse_candidate_df.loc[reverse_pred_index, 'total_score']
#         reverse_pred = reverse_candidate_df.loc[reverse_pred_index, 'target']
    
#     if score >= reverse_score:
#         overall_pred = pred
#     else:
#         overall_pred = reverse_pred
    

    #     actual = row['english']
    actual = gold_data.loc[index, ['NIV84']].values[0] if index in gold_data.index else None
    actual = actual.lower() if actual else ''
    if len(actual.split()) > 1:
        counter['multiple words'] += 1
    elif pred == None:
        counter['no prediction'] += 1
    else:
        correct = pred == actual or pred == actual.split("ʼ")[0] or actual == pred.split("ʼ")[0] 
        counter.update([correct]) 

    if not correct:
        print(index)
        print(f"actual: {actual}   pred: {pred}")
#         print(candidate_df)
        print(counter)

1591
actual: adam   pred: of
Counter({True: 30, False: 1})
1682
actual: adam   pred: of
Counter({True: 32, False: 2})
2231
actual: adam   pred: in
Counter({True: 91, False: 3})
2281
actual: adam   pred: man
Counter({True: 97, False: 4})
2314
actual: adam   pred: in
Counter({True: 97, False: 5})
2777
actual: shem   pred: the
Counter({True: 144, False: 6})
3519
actual: shem   pred: the
Counter({True: 171, False: 7})
4687
actual: shem   pred: the
Counter({True: 190, False: 8})
4750
actual: shem   pred: in
Counter({True: 198, False: 9})
4813
actual: shem   pred: the
Counter({True: 202, False: 10})
4828
actual: shem   pred: in
Counter({True: 204, False: 11})
4865
actual: shem   pred: the
Counter({True: 208, False: 12})
4883
actual: madai   pred: gomer
Counter({True: 213, False: 13})
4935
actual: put   pred: mizraim
Counter({True: 229, False: 14})
4995
actual: akkad   pred: erech
Counter({True: 244, False: 15})
5036
actual: ludites   pred: mizraim
Counter({True: 253, False: 16})
5039
actual:

32583
actual: puah   pred: shiphrah
Counter({True: 2454, False: 102, 'multiple words': 18})
32591
actual: hebrew   pred: childbirth
Counter({True: 2454, False: 103, 'multiple words': 18})
32743
actual: levite   pred: levi
Counter({True: 2462, False: 104, 'multiple words': 18})
32829
actual: river   pred: nile
Counter({True: 2465, False: 105, 'multiple words': 18})
32868
actual: hebrew   pred: the
Counter({True: 2465, False: 106, 'multiple words': 18})
33788
actual: hebrews   pred: lord
Counter({True: 2554, False: 107, 'multiple words': 18})
34121
actual: river   pred: nile
Counter({True: 2570, False: 108, 'multiple words': 18})
34323
actual: jethro   pred: of
Counter({True: 2581, False: 109, 'multiple words': 18})
34917
actual: israelite   pred: the
Counter({True: 2630, False: 110, 'multiple words': 18})
34945
actual: israelite   pred: the
Counter({True: 2631, False: 111, 'multiple words': 18})
35011
actual: israelite   pred: the
Counter({True: 2633, False: 112, 'multiple words': 18})


88365
actual: israelite   pred: the
Counter({True: 5313, False: 192, 'multiple words': 19})
88397
actual: israelite   pred: the
Counter({True: 5314, False: 193, 'multiple words': 19})
88457
actual: israelite   pred: the
Counter({True: 5315, False: 194, 'multiple words': 19})
88747
actual: on   pred: eliab
Counter({True: 5338, False: 195, 'multiple words': 19})
88761
actual: israelite   pred: the
Counter({True: 5341, False: 196, 'multiple words': 19})
88908
actual: israelite   pred: israel
Counter({True: 5356, False: 197, 'multiple words': 19})
89616
actual: israelite   pred: the
Counter({True: 5412, False: 198, 'multiple words': 19})
91087
actual: israelite   pred: the
Counter({True: 5503, False: 199, 'multiple words': 19})
91453
actual: israelite   pred: the
Counter({True: 5507, False: 200, 'multiple words': 19})
91792
actual: egyptians   pred: egypt
Counter({True: 5538, False: 201, 'multiple words': 19})
91942
actual: israelite   pred: the
Counter({True: 5547, False: 202, 'multiple w

102716
actual: ashtaroth   pred: reigned
Counter({True: 6592, False: 281, 'multiple words': 21})
103950
actual: moabites   pred: rephaites
Counter({True: 6665, False: 282, 'multiple words': 21})
104336
actual: moabites   pred: seir
Counter({True: 6699, False: 283, 'multiple words': 21})
104893
actual: maacathites   pred: geshurites
Counter({True: 6764, False: 284, 'multiple words': 21})
105069
actual: joshua   pred: jordan
Counter({True: 6782, False: 285, 'multiple words': 21})
105130
actual: lord   pred: sovereign
Counter({True: 6786, False: 286, 'multiple words': 21})
105132
actual: sovereign   pred: lord
Counter({True: 6786, False: 287, 'multiple words': 21})
109789
actual: lord   pred: sovereign
Counter({True: 7009, False: 288, 'multiple words': 21})
109791
actual: sovereign   pred: lord
Counter({True: 7009, False: 289, 'multiple words': 21})
110516
actual: egyptian   pred: brought
Counter({True: 7049, False: 290, 'multiple words': 21})
110574
actual: reubenite   pred: the
Counter(

137584
actual: towns   pred: jordan
Counter({True: 8875, False: 370, 'multiple words': 38})
137613
actual: asher   pred: the
Counter({True: 8876, False: 371, 'multiple words': 38})
137728
actual: asher   pred: the
Counter({True: 8897, False: 372, 'multiple words': 38})
137798
actual: asher   pred: the
Counter({True: 8906, False: 373, 'multiple words': 38})
137815
actual: zer   pred: ziddim
Counter({True: 8908, False: 374, 'multiple words': 38})
137836
actual: iron   pred: migdal
Counter({True: 8916, False: 375, 'multiple words': 38})
137931
actual: danites   pred: dan
Counter({True: 8933, False: 376, 'multiple words': 38})
138454
actual: simeon   pred: thirteen
Counter({True: 8981, False: 377, 'multiple words': 38})
138500
actual: asher   pred: the
Counter({True: 8988, False: 378, 'multiple words': 38})
138615
actual: anak   pred: pastureland
Counter({True: 9001, False: 379, 'multiple words': 38})
138706
actual: juttah   pred: shemesh
Counter({True: 9013, False: 380, 'multiple words': 

153315
actual: lord   pred: sovereign
Counter({True: 10290, False: 460, 'multiple words': 42})
153317
actual: sovereign   pred: lord
Counter({True: 10290, False: 461, 'multiple words': 42})
153457
actual: micah   pred: ephraim
Counter({True: 10299, False: 462, 'multiple words': 42})
153567
actual: micah   pred: lord
Counter({True: 10301, False: 463, 'multiple words': 42})
153794
actual: danites   pred: the
Counter({True: 10319, False: 464, 'multiple words': 42})
154090
actual: danites   pred: men
Counter({True: 10334, False: 465, 'multiple words': 42})
154598
actual: danites   pred: dan
Counter({True: 10360, False: 466, 'multiple words': 42})
154607
actual: moses   pred: of
Counter({True: 10362, False: 467, 'multiple words': 42})
154617
actual: dan   pred: the
Counter({True: 10362, False: 468, 'multiple words': 42})
155093
actual: benjamites   pred: fields
Counter({True: 10378, False: 469, 'multiple words': 42})
156338
actual: israelite   pred: israel
Counter({True: 10458, False: 470, 

180291
actual: israelite   pred: the
Counter({True: 12443, False: 546, 'multiple words': 60})
180803
actual: hebron   pred: david
Counter({True: 12486, False: 547, 'multiple words': 60})
181357
actual: ammah   pred: was
Counter({True: 12558, False: 548, 'multiple words': 64})
181361
actual: giah   pred: wasteland
Counter({True: 12558, False: 549, 'multiple words': 64})
181583
actual: sons   pred: david
Counter({True: 12583, False: 550, 'multiple words': 64})
181601
actual: widow   pred: abigail
Counter({True: 12584, False: 551, 'multiple words': 64})
181609
actual: son   pred: absalom
Counter({True: 12584, False: 552, 'multiple words': 64})
181610
actual: maacah   pred: son
Counter({True: 12584, False: 553, 'multiple words': 64})
181613
actual: king   pred: talmai
Counter({True: 12584, False: 554, 'multiple words': 64})
181614
actual: geshur   pred: king
Counter({True: 12584, False: 555, 'multiple words': 64})
181986
actual: benjamites   pred: benjamin
Counter({True: 12637, False: 556,

199570
actual: jether   pred: of
Counter({True: 14189, False: 635, 'multiple words': 69})
200723
actual: shisha   pred: ahijah
Counter({True: 14264, False: 636, 'multiple words': 69})
200750
actual: zabud   pred: nathan
Counter({True: 14272, False: 637, 'multiple words': 69})
200758
actual: ahishar   pred: charge
Counter({True: 14273, False: 638, 'multiple words': 69})
200807
actual: ben-deker   pred: ephraim
Counter({True: 14278, False: 639, 'multiple words': 69})
200821
actual: ben-hesed   pred: arubboth
Counter({True: 14280, False: 640, 'multiple words': 69})
200833
actual: ben-abinadab   pred: naphoth
Counter({True: 14282, False: 641, 'multiple words': 69})
200837
actual: taphath   pred: was
Counter({True: 14284, False: 642, 'multiple words': 69})
200857
actual: zarethan   pred: shan
Counter({True: 14289, False: 643, 'multiple words': 69})
200875
actual: ben-geber   pred: were
Counter({True: 14291, False: 644, 'multiple words': 69})
200878
actual: ramoth gilead   pred: gilead
Count

224587
actual: jerusalem   pred: megiddo
Counter({True: 16002, False: 718, 'multiple words': 91})
225199
actual: recab   pred: chariot
Counter({True: 16038, False: 719, 'multiple words': 92})
225671
actual: israelites   pred: israel
Counter({True: 16088, False: 720, 'multiple words': 92})
225684
actual: gad   pred: manasseh
Counter({True: 16090, False: 721, 'multiple words': 92})
225687
actual: reuben   pred: the
Counter({True: 16090, False: 722, 'multiple words': 92})
225690
actual: manasseh   pred: the
Counter({True: 16090, False: 723, 'multiple words': 92})
225864
actual: lord   pred: temple
Counter({True: 16112, False: 724, 'multiple words': 92})
225876
actual: lord   pred: temple
Counter({True: 16112, False: 725, 'multiple words': 92})
225991
actual: jehoiada   pred: ordered
Counter({True: 16112, False: 726, 'multiple words': 92})
226010
actual: jehoiada   pred: to
Counter({True: 16112, False: 727, 'multiple words': 92})
226036
actual: lord   pred: temple
Counter({True: 16112, Fal

237653
actual: jether   pred: of
Counter({True: 17568, False: 805, 'multiple words': 103})
237658
actual: jether   pred: of
Counter({True: 17569, False: 806, 'multiple words': 103})
237835
actual: regem   pred: jahdai
Counter({True: 17628, False: 807, 'multiple words': 103})
237860
actual: sheva   pred: madmannah
Counter({True: 17637, False: 808, 'multiple words': 103})
237862
actual: macbenah   pred: sheva
Counter({True: 17637, False: 809, 'multiple words': 103})
237865
actual: gibea   pred: macbenah
Counter({True: 17637, False: 810, 'multiple words': 103})
237910
actual: puthites   pred: ithrites
Counter({True: 17647, False: 811, 'multiple words': 103})
237946
actual: shimeathites   pred: tirathites
Counter({True: 17657, False: 812, 'multiple words': 103})
237971
actual: amnon   pred: hebron
Counter({True: 17663, False: 813, 'multiple words': 103})
238054
actual: elishua   pred: ibhar
Counter({True: 17684, False: 814, 'multiple words': 105})
238164
actual: shenazzar   pred: jekamiah


241909
actual: jahzerah   pred: son
Counter({True: 18591, False: 893, 'multiple words': 107})
241915
actual: immer   pred: of
Counter({True: 18593, False: 894, 'multiple words': 107})
241952
actual: bakbakkar   pred: heresh
Counter({True: 18599, False: 895, 'multiple words': 107})
241953
actual: heresh   pred: galal
Counter({True: 18599, False: 896, 'multiple words': 107})
241990
actual: talmon   pred: ahiman
Counter({True: 18614, False: 897, 'multiple words': 107})
242066
actual: meshelemiah   pred: gatekeeper
Counter({True: 18627, False: 898, 'multiple words': 107})
242256
actual: levite   pred: the
Counter({True: 18634, False: 899, 'multiple words': 107})
242400
actual: esh-baal   pred: eshbaal
Counter({True: 18662, False: 900, 'multiple words': 107})
242417
actual: melech   pred: of
Counter({True: 18666, False: 901, 'multiple words': 107})
242486
actual: israelites   pred: israel
Counter({True: 18690, False: 902, 'multiple words': 107})
243413
actual: tekoa   pred: ikkesh
Counter({

KeyboardInterrupt: 

In [786]:
line = 236711

In [787]:
gold_data.iloc[line-8: line+8, [0, 1, 2, 3, 4, 5, 10,11, 32]]

Unnamed: 0,index,book,chapter,verse,word,subword,text,lemma,NIV84
236703,236703,1CH,1,10,8,1,גִּבּ֖וֹר,1368,mighty warrior
236704,236704,1CH,1,10,9,1,בָּ,b,
236705,236705,1CH,1,10,9,1,,d,
236706,236706,1CH,1,10,9,2,אָֽרֶץ,776,
236707,236707,1CH,1,11,1,1,וּ,c,
236708,236708,1CH,1,11,1,2,מִצְרַ֡יִם,4714,Mizraim
236709,236709,1CH,1,11,2,1,יָלַ֞ד,3205,was the father
236710,236710,1CH,1,11,3,1,אֶת,853,
236711,236711,1CH,1,11,5,1,לוּדִ֧ים,3866,Ludites
236712,236712,1CH,1,11,6,1,וְ,c,


In [788]:
word = bible_df.loc[line, ['text']].values[0]
hebrew_source_normalized = normalize_word(word)
book = bible_df.loc[line, ['book']].values[0]
chapter = bible_df.loc[line, ['chapter']].values[0]
verse = bible_df.loc[line, ['verse']].values[0]
ref = f'{book} {chapter}:{verse}\n'
vref_index = vref_data.index(ref)

def normalize_word(word):
    return re.sub("[^\w\s]", "", word.lower()) if word else ''

candidate_words = set(NIV84_df.loc[vref_index, 'words']) if vref_index in NIV84_df.index else set([])
candidate_words = [word.lower().split("ʼ")[0] for word in candidate_words if word is not None]
candidate_words

['',
 'was',
 'ludites',
 'naphtuhites',
 'father',
 'the',
 'mizraim',
 'anamites',
 'lehabites',
 'of']

In [789]:
vref_index

10266

In [795]:
candidate_df = model_df[model_df['normalized_target'] == hebrew_source_normalized]
candidate_df

Unnamed: 0.1,Unnamed: 0,source,target,align_count,word score,normalized_source,normalized_target,jac_sim,match_counts,total_score
44101,44101,father,לוּדִ֧ים,2,0.902797,father,לודים,0.0,0,0.902797
79838,79838,lydia,לוּדִ֕ים,1,0.701462,lydia,לודים,0.2,1,1.701462
84643,84643,mizraim,לוּדִ֧ים,2,0.114203,mizraim,לודים,0.333333,2,1.780869


In [791]:
all_df[all_df['target'] == word].sort_values('word score', ascending=False)

Unnamed: 0.1,Unnamed: 0,vref,source,target,word score
125107,125107,GEN 10:13,father,לוּדִ֧ים,0.9027971
6747034,6747034,1CH 1:11,father,לוּדִ֧ים,0.9027971
125065,125065,GEN 10:13,mizraim,לוּדִ֧ים,0.1142026
6746992,6746992,1CH 1:11,mizraim,לוּדִ֧ים,0.1142026
125135,125135,GEN 10:13,the,לוּדִ֧ים,1.302155e-05
6747020,6747020,1CH 1:11,the,לוּדִ֧ים,1.302155e-05
125093,125093,GEN 10:13,the,לוּדִ֧ים,1.302155e-05
6747062,6747062,1CH 1:11,the,לוּדִ֧ים,1.302155e-05
125121,125121,GEN 10:13,of,לוּדִ֧ים,1.252464e-44
125149,125149,GEN 10:13,ludites,לוּדִ֧ים,1.252464e-44


In [792]:
candidate_df[candidate_df.apply(lambda x: x['normalized_source'] in list(candidate_words), axis=1)].sort_values('total_score', ascending=False)


Unnamed: 0.1,Unnamed: 0,source,target,align_count,word score,normalized_source,normalized_target,jac_sim,match_counts,total_score
84643,84643,mizraim,לוּדִ֧ים,2,0.114203,mizraim,לודים,0.333333,2,1.780869
44101,44101,father,לוּדִ֧ים,2,0.902797,father,לודים,0.0,0,0.902797


Total number of names:

In [796]:
word

'לוּדִ֧ים'

In [603]:
df[df['source'] == 'shemeber']

Unnamed: 0.1,Unnamed: 0,source,target,align_count,word score,verse score,normalized_source,normalized_target,jac_sim,match_counts,total_score
80989,80989,shemeber,אַדְמָ֗ה,1,0.78108,0.330563,shemeber,אדמה,-1.0,0,-0.78108


Number of matched names:

In [633]:
vref_index

464

In [243]:
df[df['target'] == 'עֵ֖דֶן']

Unnamed: 0.1,Unnamed: 0,source,target,align_count,word score,verse score,normalized_source,normalized_target,jac_sim,match_counts,total_score
26110,26110,eden,עֵ֖דֶן,2,0.799089,0.322273,eden,עדן,0.818182,18,3.79264


In [244]:
matched_names['predicted'] = matched_names['text'].progress_apply(lambda x: df.loc[df[df['target'] == x]['total_score'].idxmax()]['source'])

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32282/32282 [06:18<00:00, 85.21it/s]


In [245]:
matched_names

Unnamed: 0,book,chapter,verse,word,subword,id,marble,ot-nt,unicode,text,lemma,normalized,english,greek,predicted
774,GEN,2,4,8,1,o010020040081,00100200400026,OT,יְהוָ֥ה,יְהוָ֥ה,3068,,lord,,lord
799,GEN,2,5,15,1,o010020050151,00100200500042,OT,יְהוָ֤ה,יְהוָ֤ה,3068,,lord,,lord
827,GEN,2,7,2,1,o010020070021,00100200700006,OT,יְהוָ֨ה,יְהוָ֨ה,3068,,lord,,lord
852,GEN,2,8,2,1,o010020080021,00100200800006,OT,יְהוָ֧ה,יְהוָ֧ה,3068,,lord,κύριος,lord
856,GEN,2,8,5,2,o010020080052,00100200800014,OT,עֵ֖דֶן,עֵ֖דֶן,5731 b,,eden,εδεμ,eden
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475752,MAL,3,21,14,1,o390030210141,03900302100034,OT,יְהוָ֥ה,יְהוָ֥ה,3068,,lord,κύριος,lord
475756,MAL,3,22,3,1,o390030220031,03900302200006,OT,מֹשֶׁ֣ה,מֹשֶׁ֣ה,4872,,moses,μωυσῆ,moses
475767,MAL,3,22,11,1,o390030220111,03900302200024,OT,יִשְׂרָאֵ֔ל,יִשְׂרָאֵ֔ל,3478,,israel,ισραηλ,israel
475777,MAL,3,23,6,1,o390030230061,03900302300012,OT,אֵלִיָּ֣ה,אֵלִיָּ֣ה,452,,elijah,ηλιαν,elijah


In [246]:
def is_word_correct(row):
    word = row['english'].split("’")[0] if row['english'] is not None else ''
    return word == row['predicted']

In [247]:
matched_names[matched_names.apply(is_word_correct, axis=1)]

Unnamed: 0,book,chapter,verse,word,subword,id,marble,ot-nt,unicode,text,lemma,normalized,english,greek,predicted
774,GEN,2,4,8,1,o010020040081,00100200400026,OT,יְהוָ֥ה,יְהוָ֥ה,3068,,lord,,lord
799,GEN,2,5,15,1,o010020050151,00100200500042,OT,יְהוָ֤ה,יְהוָ֤ה,3068,,lord,,lord
827,GEN,2,7,2,1,o010020070021,00100200700006,OT,יְהוָ֨ה,יְהוָ֨ה,3068,,lord,,lord
852,GEN,2,8,2,1,o010020080021,00100200800006,OT,יְהוָ֧ה,יְהוָ֧ה,3068,,lord,κύριος,lord
856,GEN,2,8,5,2,o010020080052,00100200800014,OT,עֵ֖דֶן,עֵ֖דֶן,5731 b,,eden,εδεμ,eden
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475752,MAL,3,21,14,1,o390030210141,03900302100034,OT,יְהוָ֥ה,יְהוָ֥ה,3068,,lord,κύριος,lord
475756,MAL,3,22,3,1,o390030220031,03900302200006,OT,מֹשֶׁ֣ה,מֹשֶׁ֣ה,4872,,moses,μωυσῆ,moses
475767,MAL,3,22,11,1,o390030220111,03900302200024,OT,יִשְׂרָאֵ֔ל,יִשְׂרָאֵ֔ל,3478,,israel,ισραηλ,israel
475777,MAL,3,23,6,1,o390030230061,03900302300012,OT,אֵלִיָּ֣ה,אֵלִיָּ֣ה,452,,elijah,ηλιαν,elijah


In [248]:
matched_names[~matched_names.apply(is_word_correct, axis=1)].head(100)

Unnamed: 0,book,chapter,verse,word,subword,id,marble,ot-nt,unicode,text,lemma,normalized,english,greek,predicted
1591,GEN,3,17,1,3,o010030170013,100301700006,OT,אָדָ֣ם,אָדָ֣ם,121,,adam,αδαμ,man
1682,GEN,3,21,4,2,o010030210042,100302100012,OT,אָדָ֧ם,אָדָ֧ם,120,,adam,αδαμ,of
1809,GEN,4,2,11,2,o010040020112,100400200030,OT,קַ֕יִן,קַ֕יִן,7014 b,,cain,καιν,tiller
1845,GEN,4,4,11,1,o010040040111,100400400032,OT,הֶ֖בֶל,הֶ֖בֶל,1893,,abel,αβελ,vanity
1927,GEN,4,8,12,1,o010040080121,100400800036,OT,הֶ֥בֶל,הֶ֥בֶל,1893,,abel,αβελ,futility
2114,GEN,4,18,2,2,o010040180022,100401800008,OT,חֲנוֹךְ֙,חֲנוֹךְ֙,2585,,enoch,ενωξ,hanoch
2198,GEN,4,23,4,1,o010040230041,100402300012,OT,עָדָ֤ה,עָדָ֤ה,5711,,adah,αδα,zillah
2231,GEN,4,25,2,1,o010040250021,100402500006,OT,אָדָ֥ם,אָדָ֥ם,121,,adam,αδαμ,man
2270,GEN,4,26,9,1,o010040260091,100402600024,OT,אֱנ֑וֹשׁ,אֱנ֑וֹשׁ,583,,enosh,ενως,man
2281,GEN,5,1,4,1,o010050010041,100500100008,OT,אָדָ֑ם,אָדָ֑ם,121,,adam,ἀνθρώπων,man


In [167]:
hebrew_names = set(hebrew_names)
hebrew_matched_names = set(hebrew_matched_names)
list(hebrew_names - hebrew_matched_names)

['מָכִֽי',
 'יְה֗וּד',
 'מִיכָיָ֜ה',
 'זִ֥יעַ',
 'נָב֗וֹת',
 'נְבַלָּֽט',
 'כִּנֶּ֔רֶת',
 'נְבֽוּכַדְרֶאצַּ֥ר',
 'זְבֻל֑וּן',
 'רֶ֧גֶם',
 'מּוֹאֲבִיָּ֜ה',
 'הֶ֨בֶל֙',
 'עֵ֠ילָם',
 'מֵידָ֔ד',
 'יִּשְׂרְאֵלִֽי',
 'אֱלִישָׁ֖ה',
 'הוֹמָ֑ם',
 'בְאֵרָֽא',
 'אַשְׁבֵּלִ֑י',
 'אֲגַ֥ג',
 'רֽאוּבֵנִ֗י',
 'תַפֻּ֖חַ',
 'אָצַל֒',
 'כִּסְלֵיו֙',
 'רְפָיָ֧ה',
 'מִיכָיָ֧ה',
 'בִּתִּ֞',
 'שִׁמְעָתִ֖ים',
 'תַּ֠תְּנַי',
 'יִצְחָ֨ק',
 'אֲשִׁימָֽא',
 'יֵשׁ֛וּעַ',
 'אֲחַזְיָ֤ה',
 'אֲרִיאֵ֡ל',
 'גִּבְעָ֨תָ',
 'דָּ֣ן',
 'שְׁבָ֔ט',
 'בָּשָׁ֞ן',
 'יִבְלְעָ֨ם',
 'עִיּ֡וֹן',
 'תֻּבַ֣ל',
 'בִּנְיָמִ֣ין',
 'צָפ֥וֹן',
 'חֶלְקָֽי',
 'שָׁמָע֙',
 'מֹּ֣רַשְׁתִּ֔י',
 'כְּנַעֲנִ֨י',
 'אֲחִיה֖וּד',
 'פַּלְטִ֔י',
 'קְעִלָ֔ה',
 'צוֹפִ֖ים',
 'אַבְרָ֫הָ֥ם',
 'שִׁרְטַ֖י',
 'בַקְבֻּֽקְיָ֛ה',
 'מֹּצָֽה',
 'כְנָ֑נִי',
 'יַּעֲקֹ֑ב',
 'יְהוּדִי֙',
 'נַעֲמָֽן',
 'מָע֖וֹךְ',
 'פְּרָ֔ת',
 'שַׁלְמַ֖י',
 'שִׁמְשַׁ֥י',
 'חֲשֻׂפָ֖א',
 'שִׁפְעִ֧י',
 'אַמְנֹ֗ן',
 'יַֽאֲזַנְיָ֨הוּ֙',
 'בּוּנִּֽי',
 'עֵיבָ֖ל',
 'ירִימ֤וֹת',
 'אֵ֔צֶר',
 'אֲשַׂרְ

In [227]:
df[df['source'] == 'judith']

Unnamed: 0.1,Unnamed: 0,source,target,align_count,word score,verse score,normalized_source,normalized_target,jac_sim,match_counts,total_score
48917,48917,judith,אִשָּׁה֙,1,0.333115,0.380322,judith,אשה,-1.0,0,-0.0
48918,48918,judith,יִּקַּ֤ח,1,0.332959,0.380322,judith,יקח,-1.0,0,-0.0


In [249]:
bible_df[bible_df['text'] == 'יְהוּדִ֔ית']

Unnamed: 0,book,chapter,verse,word,subword,id,marble,ot-nt,unicode,text,lemma,normalized,english,greek,hebrew,mandarin,domain_codes,domains,sdbh_codes,top_domains
15003,GEN,26,34,9,1,o010260340091,102603400022,OT,יְהוּדִ֔ית,יְהוּדִ֔ית,3067,,Judith,ιουδιν,יְהוּדִ֔ית,犹滴,[003001007],[Names of People],[002822001001000],[003001]
231296,2KI,18,26,21,1,o120180260211,1201802600048,OT,יְהוּדִ֔ית,יְהוּדִ֔ית,3066,,in Hebrew,ιουδαϊστί,יְהוּדִ֔ית,用希伯来语,[003001009],[Names of Languages],[002820001001000],[003001]
366489,ISA,36,11,19,1,o230360110191,2303601100044,OT,יְהוּדִ֔ית,יְהוּדִ֔ית,3066,,language of Judah,ιουδαϊστί,יְהוּדִ֔ית,犹大语,[003001009],[Names of Languages],[002820001001000],[003001]


In [251]:
df[df['normalized_target'] == 'יהודית']

Unnamed: 0.1,Unnamed: 0,source,target,align_count,word score,verse score,normalized_source,normalized_target,jac_sim,match_counts,total_score
10354,10354,beeri,יְהוּדִ֔ית,1,0.195174,0.380322,beeri,יהודית,0.125,1,0.009279
48762,48762,judean,יְהוּדִ֑ית,2,0.57045,0.31968,judean,יהודית,0.571429,4,0.416827
48763,48763,judean,יְהוּדִ֔ית,2,0.523498,0.333,judean,יהודית,0.571429,4,0.398457


In [229]:
bible_df[bible_df['english'] == 'Judith']

Unnamed: 0,book,chapter,verse,word,subword,id,marble,ot-nt,unicode,text,lemma,normalized,english,greek,hebrew,mandarin,domain_codes,domains,sdbh_codes,top_domains
15003,GEN,26,34,9,1,o010260340091,102603400022,OT,יְהוּדִ֔ית,יְהוּדִ֔ית,3067,,Judith,ιουδιν,יְהוּדִ֔ית,犹滴,[003001007],[Names of People],[002822001001000],[003001]


In [250]:
normalize_word('יְהוּדִ֔ית')

'יהודית'

In [252]:
old_matched_names = matched_names