In [324]:
from pathlib import Path
from combined import run_fa, run_match_words, combine_df
import os
from match import normalize_word
import pandas as pd
from tqdm import tqdm
from machine.tokenization import LatinWordTokenizer
from typing import Iterable, Tuple, List
import re
tqdm.pandas()
from collections import Counter

pd.set_option('display.max_rows', 500)

In [646]:
df = pd.read_csv('pfs/out/en-NIV84_combined.csv')
bible_df = pd.read_parquet('../../../../new2old/data/bible_words.parquet')

In [635]:
vref_file = 'pfs/ref_data/vref.txt'
with open(vref_file) as f:
    vref_data = f.readlines()

In [427]:
def text_to_words(text: str) -> List[str]:
    """
    Inputs:
        text:   Normally a sentence, or Bible verse
    Outputs:
        A list of words, where the sentence has had its punctuation removed, and words splits into a list of words
    """
    # word_list = re.sub("[^\w\s]", "", text.lower()).split()
    # word_list = text.lower().replace('.', '').replace(',', '').replace('?', '').replace('!', '').split()
    word_tokenizer = LatinWordTokenizer()
    word_list = [normalize_word(word) for word in word_tokenizer.tokenize(text)]
    return word_list


def vref_to_df(file):
    with open(file, 'r') as f:
        bible_data = f.readlines()
    words = [text_to_words(line) for line in bible_data]
    normalized_words = [[normalize_word(word) if word else '' for word in word_list] if word_list else [] for word_list in words]
    df = pd.DataFrame({'text': bible_data, 'words': words, 'normalized_words': normalized_words})
    df = df[df['text'].apply(lambda x: len(x) > 2)]
    df = df[df['text'] != "b'\n'"]
    return df

In [428]:
NIV84_df = vref_to_df('pfs/text_data/en-NIV84.txt')

In [357]:
gold_data = pd.read_parquet('../../../../new2old/data/gold_translation_data.parquet')

In [652]:
df.loc[:, 'total_score'] = df.apply(lambda row: row['word score'] * row['jac_sim'], axis=1)

In [404]:
domains = ['003001001', '003001002','003001004','003001006', '003001007','003001008','003001009','003001010', '003001012' ,'003001013' , '003001014', '003001015' , '003001017']
names_df = bible_df[bible_df['domain_codes'].apply(lambda x: len(set(domains).intersection(set(x))) > 0)]
names_df.loc[:, 'english'] = names_df['english'].apply(lambda x: x.lower() if x is not None else x)
# print(names)
english_names = names_df['english'].unique()
english_names = [name.lower() for name in english_names if name is not None]
hebrew_names = names_df['text'].unique()
hebrew_names = [name.lower() for name in hebrew_names if name is not None]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  names_df.loc[:, 'english'] = names_df['english'].apply(lambda x: x.lower() if x is not None else x)


In [361]:
hebrew_matched_names = list(set(hebrew_names).intersection(df[df['jac_sim'] > 0]['target'].unique()))
english_matched_names = list(set(english_names).intersection(df[df['jac_sim'] > 0]['source'].unique()))


In [571]:
print(len(hebrew_names))
print(len(hebrew_matched_names))
print(len(english_names))
print(len(english_matched_names))

9744
7245
2649
2155


In [406]:
names = names_df[names_df['text'].apply(lambda x: x in hebrew_names)]
# print(names_df.iloc[:, :14][names_df['english'].apply(lambda x: x in english_matched_names)])

In [407]:
names = names[gold_data['NIV84'].apply(lambda x: x is not None)]

  names = names[gold_data['NIV84'].apply(lambda x: x is not None)]


In [659]:
max_rows = -1
counter = Counter()
for index, row in names.iloc[:max_rows, :].iterrows():
    hebrew_source = row['text']
    hebrew_source_normalized = normalize_word(hebrew_source)    
    book = row['book']
    chapter = row['chapter']
    verse = row['verse']
#     word = row['word']
#     subword = row['subword']
    ref = f'{book} {chapter}:{verse}\n'
    vref_index = vref_data.index(ref)
#     print(ref)
#     print(vref_index)
#     candidate_words = set(gold_data[(gold_data['book'] == book) & (gold_data['chapter'] == chapter) & (gold_data['verse'] == verse)]['NIV84'])
#     candidate_words = [word.lower() for word in candidate_words if word is not None]
    candidate_words = set(NIV84_df.loc[vref_index, 'words'])  if vref_index in NIV84_df.index else set([])
    candidate_words = [word.lower().split("ʼ")[0] for word in candidate_words if word is not None]

    candidate_df = df[(df['normalized_target'] == hebrew_source_normalized) | (df['normalized_source'] == hebrew_source_normalized)]
#     actual = row['english']
    actual = gold_data.loc[index, ['NIV84']].values[0] if index in gold_data.index else None
    actual = actual.lower() if actual else ''
    if len(actual.split()) > 1:
        counter['multiple words'] += 1
    elif candidate_df[candidate_df.apply(lambda x: x['normalized_source'] in list(candidate_words) or x['normalized_target'] in list(candidate_words), axis=1)].shape[0] > 0:
        pred = candidate_df.loc[candidate_df[candidate_df.apply(lambda x: x['normalized_source'] in list(candidate_words) or x['normalized_target'] in list(candidate_words), axis=1)]['total_score'].idxmax()]['source']
        correct = pred == actual or pred == actual.split("ʼ")[0] or actual == pred.split("ʼ")[0] 
        counter.update([correct]) 
    else:
        pred = ''
        counter['no prediction'] += 1

    if not correct:
        print(index)
        print(f"actual: {actual}   pred: {pred}")
#         print(candidate_df)
        print(counter)

920
actual: pishon   pred: first
Counter({True: 7, False: 1})
1591
actual: adam   pred: of
Counter({True: 29, False: 2})
1670
actual: eve   pred: would
Counter({True: 29, False: 3})
1682
actual: adam   pred: of
Counter({True: 30, False: 4})
1779
actual: eve   pred: pregnant
Counter({True: 34, False: 5})
2231
actual: adam   pred: of
Counter({True: 88, False: 6})
2281
actual: adam   pred: man
Counter({True: 94, False: 7})
2314
actual: adam   pred: in
Counter({True: 94, False: 8})
2777
actual: shem   pred: the
Counter({True: 141, False: 9})
3519
actual: shem   pred: the
Counter({True: 168, False: 10})
4687
actual: shem   pred: the
Counter({True: 187, False: 11})
4750
actual: shem   pred: in
Counter({True: 195, False: 12})
4813
actual: shem   pred: the
Counter({True: 199, False: 13})
4828
actual: shem   pred: in
Counter({True: 201, False: 14})
4865
actual: shem   pred: the
Counter({True: 205, False: 15})
4883
actual: madai   pred: gomer
Counter({True: 210, False: 16})
4933
actual: mizraim 

22470
actual: magdiel   pred: iram
Counter({True: 1735, False: 94, 'multiple words': 14, 'no prediction': 5})
22486
actual: edomites   pred: edom
Counter({True: 1738, False: 95, 'multiple words': 14, 'no prediction': 5})
23305
actual: adullam   pred: hirah
Counter({True: 1776, False: 96, 'multiple words': 14, 'no prediction': 5})
23517
actual: adullamite   pred: hirah
Counter({True: 1803, False: 97, 'multiple words': 14, 'no prediction': 5})
25074
actual: riverbank   pred: nile
Counter({True: 1876, False: 98, 'multiple words': 14, 'no prediction': 5})
25911
actual: on   pred: potiphera
Counter({True: 1928, False: 99, 'multiple words': 14, 'no prediction': 5})
26025
actual: on   pred: potiphera
Counter({True: 1940, False: 100, 'multiple words': 14, 'no prediction': 5})
26119
actual: egyptians   pred: egypt
Counter({True: 1949, False: 101, 'multiple words': 14, 'no prediction': 5})
26148
actual: egyptians   pred: egypt
Counter({True: 1951, False: 102, 'multiple words': 14, 'no prediction

72905
actual: israelites   pred: israel
Counter({True: 4317, False: 169, 'multiple words': 18, 'no prediction': 6})
72914
actual: israelite   pred: israel
Counter({True: 4317, False: 170, 'multiple words': 18, 'no prediction': 6})
74294
actual: israelite   pred: blasphemed
Counter({True: 4395, False: 171, 'multiple words': 18, 'no prediction': 7})
74312
actual: dibri   pred: of
Counter({True: 4397, False: 172, 'multiple words': 18, 'no prediction': 7})
75183
actual: levitical   pred: levites
Counter({True: 4416, False: 173, 'multiple words': 18, 'no prediction': 7})
77407
actual: israelite   pred: the
Counter({True: 4467, False: 174, 'multiple words': 18, 'no prediction': 8})
77518
actual: asher   pred: of
Counter({True: 4497, False: 175, 'multiple words': 18, 'no prediction': 8})
77994
actual: asher   pred: the
Counter({True: 4533, False: 176, 'multiple words': 18, 'no prediction': 8})
78021
actual: asher   pred: the
Counter({True: 4533, False: 177, 'multiple words': 18, 'no predictio

95438
actual: shuni   pred: haggi
Counter({True: 5907, False: 241, 'multiple words': 20, 'no prediction': 11})
95441
actual: shunite   pred: haggi
Counter({True: 5907, False: 242, 'multiple words': 20, 'no prediction': 11})
95443
actual: ozni   pred: oznite
Counter({True: 5907, False: 243, 'multiple words': 20, 'no prediction': 11})
95448
actual: eri   pred: the
Counter({True: 5908, False: 244, 'multiple words': 20, 'no prediction': 11})
95451
actual: erite   pred: the
Counter({True: 5908, False: 245, 'multiple words': 20, 'no prediction': 11})
95456
actual: arodite   pred: arodi
Counter({True: 5909, False: 246, 'multiple words': 20, 'no prediction': 11})
95461
actual: arelite   pred: areli
Counter({True: 5910, False: 247, 'multiple words': 20, 'no prediction': 11})
95680
actual: noah   pred: mahlah
Counter({True: 5971, False: 248, 'multiple words': 20, 'no prediction': 12})
95841
actual: asher   pred: the
Counter({True: 6010, False: 249, 'multiple words': 20, 'no prediction': 12})
958

129214
actual: israelites   pred: israel
Counter({True: 7820, False: 314, 'multiple words': 23, 'no prediction': 21})
129405
actual: lord   pred: sovereign
Counter({True: 7838, False: 315, 'multiple words': 24, 'no prediction': 21})
129407
actual: sovereign   pred: lord
Counter({True: 7838, False: 316, 'multiple words': 24, 'no prediction': 21})
129790
actual: a   pred: from
Counter({True: 7876, False: 317, 'multiple words': 24, 'no prediction': 21})
130636
actual: israelites   pred: israel
Counter({True: 7925, False: 318, 'multiple words': 25, 'no prediction': 21})
130708
actual: israelites   pred: israel
Counter({True: 7929, False: 319, 'multiple words': 25, 'no prediction': 21})
131173
actual: perhaps   pred: hivites
Counter({True: 7979, False: 320, 'multiple words': 25, 'no prediction': 21})
131437
actual: israelites   pred: israel
Counter({True: 7996, False: 321, 'multiple words': 25, 'no prediction': 21})
132017
actual: israelites   pred: israel
Counter({True: 8051, False: 322, '

139476
actual: west   pred: jordan
Counter({True: 9061, False: 386, 'multiple words': 39, 'no prediction': 27})
139639
actual: israelite   pred: israelites
Counter({True: 9080, False: 387, 'multiple words': 39, 'no prediction': 27})
139709
actual: israelite   pred: israel
Counter({True: 9090, False: 388, 'multiple words': 39, 'no prediction': 27})
141000
actual: egyptians   pred: egypt
Counter({True: 9205, False: 389, 'multiple words': 39, 'no prediction': 27})
142459
actual: asher   pred: in
Counter({True: 9362, False: 390, 'multiple words': 40, 'no prediction': 28})
142464
actual: acco   pred: 
Counter({True: 9362, False: 390, 'multiple words': 40, 'no prediction': 29})
142471
actual: ahlab   pred: or
Counter({True: 9363, False: 391, 'multiple words': 40, 'no prediction': 29})
142480
actual: aphek   pred: helbah
Counter({True: 9365, False: 392, 'multiple words': 40, 'no prediction': 29})
143327
actual: israelites   pred: israel
Counter({True: 9445, False: 393, 'multiple words': 41, '

154598
actual: danites   pred: dan
Counter({True: 10336, False: 457, 'multiple words': 42, 'no prediction': 33})
154607
actual: moses   pred: tribe
Counter({True: 10338, False: 458, 'multiple words': 42, 'no prediction': 33})
154617
actual: dan   pred: the
Counter({True: 10338, False: 459, 'multiple words': 42, 'no prediction': 33})
155093
actual: benjamites   pred: fields
Counter({True: 10354, False: 460, 'multiple words': 42, 'no prediction': 33})
156338
actual: israelite   pred: israel
Counter({True: 10434, False: 461, 'multiple words': 42, 'no prediction': 33})
156385
actual: israelites   pred: israel
Counter({True: 10439, False: 462, 'multiple words': 42, 'no prediction': 33})
156387
actual: benjamites   pred: benjamin
Counter({True: 10439, False: 463, 'multiple words': 42, 'no prediction': 33})
156407
actual: benjamites   pred: benjamin
Counter({True: 10439, False: 464, 'multiple words': 42, 'no prediction': 33})
156548
actual: israelites   pred: israel
Counter({True: 10450, Fals

166529
actual: michmash   pred: micmash
Counter({True: 11166, False: 529, 'multiple words': 53, 'no prediction': 37})
166615
actual: gibeah   pred: gilgal
Counter({True: 11175, False: 530, 'multiple words': 53, 'no prediction': 37})
166653
actual: michmash   pred: micmash
Counter({True: 11182, False: 531, 'multiple words': 53, 'no prediction': 37})
166801
actual: michmash   pred: micmash
Counter({True: 11195, False: 532, 'multiple words': 53, 'no prediction': 37})
166862
actual: ichabod   pred: was
Counter({True: 11203, False: 533, 'multiple words': 53, 'no prediction': 37})
166866
actual: eli   pred: the
Counter({True: 11204, False: 534, 'multiple words': 53, 'no prediction': 37})
166928
actual: michmash   pred: micmash
Counter({True: 11212, False: 535, 'multiple words': 53, 'no prediction': 37})
167056
actual: philistine   pred: philistines
Counter({True: 11219, False: 536, 'multiple words': 53, 'no prediction': 37})
167585
actual: michmash   pred: micmash
Counter({True: 11253, False

186470
actual: jerub-besheth   pred: jerubbesheth
Counter({True: 13058, False: 599, 'multiple words': 65, 'no prediction': 44})
186565
actual: hittite   pred: uriah
Counter({True: 13064, False: 600, 'multiple words': 65, 'no prediction': 44})
187272
actual: lord   pred: gracious
Counter({True: 13104, False: 601, 'multiple words': 66, 'no prediction': 44})
187328
actual: lord   pred: the
Counter({True: 13105, False: 602, 'multiple words': 66, 'no prediction': 44})
187490
actual: returned   pred: ammonite
Counter({True: 13117, False: 603, 'multiple words': 66, 'no prediction': 44})
187493
actual: entire   pred: david
Counter({True: 13117, False: 604, 'multiple words': 66, 'no prediction': 44})
188801
actual: lord   pred: lives
Counter({True: 13201, False: 605, 'multiple words': 66, 'no prediction': 45})
189626
actual: israelites   pred: israel
Counter({True: 13237, False: 606, 'multiple words': 66, 'no prediction': 45})
190542
actual: ziba   pred: to
Counter({True: 13315, False: 607, 'mu

208592
actual: israelites   pred: israel
Counter({True: 14719, False: 667, 'multiple words': 75, 'no prediction': 56})
208658
actual: israelites   pred: israel
Counter({True: 14729, False: 668, 'multiple words': 75, 'no prediction': 56})
208691
actual: jerusalem   pred: rehoboam
Counter({True: 14733, False: 669, 'multiple words': 75, 'no prediction': 56})
208702
actual: a hundred   pred: rehoboam
Counter({True: 14733, False: 669, 'multiple words': 76, 'no prediction': 56})
208721
actual: son   pred: rehoboam
Counter({True: 14733, False: 670, 'multiple words': 76, 'no prediction': 56})
211069
actual: lord   pred: temple
Counter({True: 14911, False: 671, 'multiple words': 77, 'no prediction': 56})
211149
actual: king   pred: aram
Counter({True: 14923, False: 672, 'multiple words': 77, 'no prediction': 57})
211843
actual: elah   pred: god
Counter({True: 15029, False: 673, 'multiple words': 77, 'no prediction': 57})
211865
actual: elah   pred: are
Counter({True: 15032, False: 674, 'multipl

231276
actual: shebna   pred: joah
Counter({True: 16680, False: 736, 'multiple words': 93, 'no prediction': 64})
231795
actual: cushite   pred: tirhakah
Counter({True: 16725, False: 737, 'multiple words': 96, 'no prediction': 64})
232151
actual: egypt   pred: streams
Counter({True: 16763, False: 738, 'multiple words': 97, 'no prediction': 64})
232431
actual: nisroch   pred: worshiping
Counter({True: 16775, False: 739, 'multiple words': 97, 'no prediction': 64})
232435
actual: adrammelech   pred: nisroch
Counter({True: 16775, False: 740, 'multiple words': 97, 'no prediction': 64})
232437
actual: sharezer   pred: adrammelech
Counter({True: 16775, False: 741, 'multiple words': 97, 'no prediction': 64})
233470
actual: meshullemeth   pred: haruz
Counter({True: 16871, False: 742, 'multiple words': 98, 'no prediction': 64})
233714
actual: lord   pred: temple
Counter({True: 16896, False: 743, 'multiple words': 98, 'no prediction': 65})
233974
actual: harhas   pred: tikvah
Counter({True: 16922,

238459
actual: hathath   pred: meonothai
Counter({True: 17754, False: 804, 'multiple words': 105, 'no prediction': 72})
238525
actual: judean   pred: jered
Counter({True: 17778, False: 805, 'multiple words': 106, 'no prediction': 73})
238528
actual: jered   pred: the
Counter({True: 17778, False: 806, 'multiple words': 106, 'no prediction': 73})
238575
actual: ishi   pred: sons
Counter({True: 17793, False: 807, 'multiple words': 106, 'no prediction': 74})
238576
actual: zoheth   pred: ishi
Counter({True: 17793, False: 808, 'multiple words': 106, 'no prediction': 74})
238733
actual: shaaraim   pred: the
Counter({True: 17829, False: 809, 'multiple words': 107, 'no prediction': 74})
238764
actual: baalath   pred: the
Counter({True: 17831, False: 810, 'multiple words': 107, 'no prediction': 74})
238774
actual: meshobab   pred: jamlech
Counter({True: 17831, False: 811, 'multiple words': 107, 'no prediction': 74})
238776
actual: jamlech   pred: son
Counter({True: 17831, False: 812, 'multiple 

241990
actual: talmon   pred: ahiman
Counter({True: 18556, False: 877, 'multiple words': 107, 'no prediction': 78})
242066
actual: meshelemiah   pred: was
Counter({True: 18569, False: 878, 'multiple words': 107, 'no prediction': 78})
242256
actual: levite   pred: the
Counter({True: 18576, False: 879, 'multiple words': 107, 'no prediction': 78})
242400
actual: esh-baal   pred: eshbaal
Counter({True: 18604, False: 880, 'multiple words': 107, 'no prediction': 78})
242417
actual: melech   pred: of
Counter({True: 18608, False: 881, 'multiple words': 107, 'no prediction': 78})
242486
actual: israelites   pred: israel
Counter({True: 18632, False: 882, 'multiple words': 107, 'no prediction': 78})
243413
actual: tekoa   pred: ikkesh
Counter({True: 18738, False: 883, 'multiple words': 109, 'no prediction': 78})
243516
actual: reubenite   pred: reubenites
Counter({True: 18800, False: 884, 'multiple words': 109, 'no prediction': 78})
243535
actual: shama   pred: and
Counter({True: 18806, False: 88

255307
actual: israelite   pred: israel
Counter({True: 20074, False: 947, 'multiple words': 113, 'no prediction': 87})
255540
actual: jeduthun   pred: asaph
Counter({True: 20093, False: 948, 'multiple words': 113, 'no prediction': 87})
255593
actual: lord   pred: thanks
Counter({True: 20093, False: 949, 'multiple words': 113, 'no prediction': 87})
257704
actual: fifty   pred: solomon
Counter({True: 20198, False: 950, 'multiple words': 113, 'no prediction': 87})
259147
actual: israelites   pred: israel
Counter({True: 20295, False: 951, 'multiple words': 113, 'no prediction': 90})
259534
actual: david   pred: abihail
Counter({True: 20361, False: 952, 'multiple words': 113, 'no prediction': 91})
259536
actual: daughter   pred: eliab
Counter({True: 20361, False: 953, 'multiple words': 113, 'no prediction': 91})
260952
actual: oded   pred: god
Counter({True: 20532, False: 954, 'multiple words': 114, 'no prediction': 93})
261937
actual: tobijah   pred: the
Counter({True: 20638, False: 955, '

275831
actual: lord   pred: to
Counter({True: 21995, False: 1020, 'no prediction': 135, 'multiple words': 126})
276022
actual: hodaviah   pred: of
Counter({True: 22014, False: 1021, 'no prediction': 136, 'multiple words': 126})
276336
actual: tabeel   pred: the
Counter({True: 22046, False: 1022, 'no prediction': 139, 'multiple words': 126})
276343
actual: king   pred: persia
Counter({True: 22046, False: 1023, 'no prediction': 139, 'multiple words': 126})
276349
actual: written   pred: aramaic
Counter({True: 22046, False: 1024, 'no prediction': 139, 'multiple words': 126})
277464
actual: jewish   pred: jews
Counter({True: 22101, False: 1025, 'no prediction': 155, 'multiple words': 126})
277754
actual: twelve   pred: israel
Counter({True: 22117, False: 1026, 'no prediction': 157, 'multiple words': 126})
277834
actual: israelites   pred: israel
Counter({True: 22121, False: 1027, 'no prediction': 157, 'multiple words': 126})
278632
actual: ebed   pred: the
Counter({True: 22203, False: 1028

285411
actual: bani   pred: the
Counter({True: 22745, False: 1086, 'no prediction': 195, 'multiple words': 130})
285412
actual: hashabneiah   pred: shebaniah
Counter({True: 22745, False: 1087, 'no prediction': 195, 'multiple words': 130})
286516
actual: azaniah   pred: binnui
Counter({True: 22801, False: 1088, 'no prediction': 196, 'multiple words': 130})
286537
actual: bani   pred: and
Counter({True: 22816, False: 1089, 'no prediction': 196, 'multiple words': 130})
286547
actual: bani   pred: the
Counter({True: 22820, False: 1090, 'no prediction': 196, 'multiple words': 130})
287166
actual: gabbai   pred: followers
Counter({True: 22906, False: 1091, 'no prediction': 198, 'multiple words': 130})
287167
actual: sallai   pred: gabbai
Counter({True: 22906, False: 1092, 'no prediction': 198, 'multiple words': 130})
287184
actual: hassenuah   pred: district
Counter({True: 22909, False: 1093, 'no prediction': 198, 'multiple words': 130})
287250
actual: amashsai   pred: עֲמַשְׁסַ֧י
Counter({T

315952
actual: zaphon   pred: the
Counter({True: 24009, False: 1153, 'no prediction': 247, 'multiple words': 138})
316329
actual: asaph   pred: the
Counter({True: 24015, False: 1154, 'no prediction': 248, 'multiple words': 138})
319488
actual: lord   pred: praise
Counter({True: 24058, False: 1155, 'no prediction': 281, 'multiple words': 138})
319673
actual: lord   pred: dwell
Counter({True: 24068, False: 1156, 'no prediction': 281, 'multiple words': 138})
319697
actual: sovereign   pred: lord
Counter({True: 24069, False: 1157, 'no prediction': 281, 'multiple words': 138})
319698
actual: lord   pred: sovereign
Counter({True: 24069, False: 1158, 'no prediction': 281, 'multiple words': 138})
320483
actual: lord   pred: sovereign
Counter({True: 24095, False: 1159, 'no prediction': 283, 'multiple words': 138})
320485
actual: sovereign   pred: lord
Counter({True: 24095, False: 1160, 'no prediction': 283, 'multiple words': 138})
320616
actual: lord   pred: sovereign
Counter({True: 24095, Fals

335708
actual: david   pred: lord
Counter({True: 24699, False: 1224, 'no prediction': 301, 'multiple words': 145})
335898
actual: david   pred: lord
Counter({True: 24703, False: 1225, 'no prediction': 301, 'multiple words': 145})
335920
actual: stronghold   pred: fortress
Counter({True: 24704, False: 1226, 'no prediction': 301, 'multiple words': 145})
336117
actual: david   pred: my
Counter({True: 24708, False: 1227, 'no prediction': 301, 'multiple words': 145})
336364
actual: lord   pred: praise
Counter({True: 24717, False: 1228, 'no prediction': 301, 'multiple words': 145})
336486
actual: lord   pred: praise
Counter({True: 24728, False: 1229, 'no prediction': 301, 'multiple words': 145})
336488
actual: lord   pred: praise
Counter({True: 24728, False: 1230, 'no prediction': 301, 'multiple words': 145})
336696
actual: lord   pred: praise
Counter({True: 24739, False: 1231, 'no prediction': 301, 'multiple words': 145})
336698
actual: lord   pred: praise
Counter({True: 24739, False: 1232,

379707
actual: kittim   pred: coasts
Counter({True: 26177, False: 1294, 'no prediction': 308, 'multiple words': 152})
379807
actual: shaved   pred: tahpanhes
Counter({True: 26181, False: 1295, 'no prediction': 308, 'multiple words': 152})
380318
actual: nomad   pred: waiting
Counter({True: 26201, False: 1296, 'no prediction': 308, 'multiple words': 152})
380708
actual: northern   pred: the
Counter({True: 26227, False: 1297, 'no prediction': 308, 'multiple words': 152})
380988
actual: zion   pred: signal
Counter({True: 26247, False: 1298, 'no prediction': 308, 'multiple words': 152})
383752
actual: lord   pred: do
Counter({True: 26340, False: 1299, 'no prediction': 308, 'multiple words': 152})
386526
actual: north   pred: from
Counter({True: 26455, False: 1300, 'no prediction': 308, 'multiple words': 152})
386582
actual: ethiopian   pred: change
Counter({True: 26455, False: 1301, 'no prediction': 308, 'multiple words': 152})
387324
actual: son   pred: manasseh
Counter({True: 26480, Fals

431367
actual: put   pred: lydia
Counter({True: 29178, False: 1364, 'no prediction': 321, 'multiple words': 165})
431576
actual: thebes   pred: to
Counter({True: 29200, False: 1365, 'no prediction': 321, 'multiple words': 166})
431589
actual: thebes   pred: the
Counter({True: 29202, False: 1366, 'no prediction': 321, 'multiple words': 166})
431599
actual: thebes   pred: to
Counter({True: 29204, False: 1367, 'no prediction': 321, 'multiple words': 166})
435267
actual: lord   pred: sovereign
Counter({True: 29364, False: 1368, 'no prediction': 322, 'multiple words': 166})
435292
actual: lord   pred: sovereign
Counter({True: 29364, False: 1369, 'no prediction': 322, 'multiple words': 166})
436180
actual: israelites   pred: israel
Counter({True: 29405, False: 1370, 'no prediction': 322, 'multiple words': 166})
436260
actual: israelite   pred: the
Counter({True: 29412, False: 1371, 'no prediction': 322, 'multiple words': 166})
436630
actual: put   pred: cush
Counter({True: 29434, False: 1372

471344
actual: tobijah   pred: jedaiah
Counter({True: 31003, False: 1430, 'no prediction': 337, 'multiple words': 178})
471451
actual: tobijah   pred: jedaiah
Counter({True: 31012, False: 1431, 'no prediction': 338, 'multiple words': 178})
471457
actual: hen   pred: in
Counter({True: 31013, False: 1432, 'no prediction': 338, 'multiple words': 178})


In [647]:
line = 9318

In [648]:
gold_data.iloc[line-8: line+8, [0, 1, 2, 3, 4, 5, 10,11, 32]]

Unnamed: 0,index,book,chapter,verse,word,subword,text,lemma,NIV84
9310,9310,GEN,19,6,6,1,סָגַ֥ר,5462,shut
9311,9311,GEN,19,6,7,1,אַחֲרָֽי,310 a,
9312,9312,GEN,19,6,7,2,ו,,
9313,9313,GEN,19,7,1,1,וַ,c,
9314,9314,GEN,19,7,1,2,יֹּאמַ֑ר,559,said
9315,9315,GEN,19,7,2,1,אַל,408,Don
9316,9316,GEN,19,7,3,1,נָ֥א,4994,
9317,9317,GEN,19,7,5,1,תָּרֵֽעוּ,7489 a,do this wicked thing
9318,9318,GEN,19,7,4,1,אַחַ֖,251,friends
9319,9319,GEN,19,7,4,2,י,,friends


In [649]:
word = bible_df.loc[line, ['text']].values[0][0]
hebrew_source_normalized = normalize_word(word)
book = bible_df.loc[line, ['book']].values[0][0]
chapter = bible_df.loc[line, ['chapter']].values[0][0]
verse = bible_df.loc[line, ['verse']].values[0][0]
ref = f'{book} {chapter}:{verse}\n'
vref_index = vref_data.index(ref)

def normalize_word(word):
    return re.sub("[^\w\s]", "", word.lower()) if word else ''

candidate_words = set(NIV84_df.loc[vref_index, 'words']) if vref_index in NIV84_df.index else set([])
candidate_words = [word.lower().split("ʼ")[0] for word in candidate_words if word is not None]
candidate_words

['',
 'this',
 'said',
 'my',
 'and',
 'friends',
 'don',
 'thing',
 'wicked',
 'no',
 'do']

In [653]:
candidate_df = df[(df['normalized_target'] == hebrew_source_normalized) | (df['normalized_source'] == hebrew_source_normalized)]
candidate_df

Unnamed: 0.1,Unnamed: 0,source,target,align_count,word score,verse score,normalized_source,normalized_target,jac_sim,match_counts,total_score
668,668,a,אָֽח,2,0.120882,0.207583,a,אח,-1.0,0,-0.120882
5550,5550,an,אָ֗ח,1,0.137852,0.223522,an,אח,-1.0,0,-0.137852
16505,16505,brother,אָ֖ח,2,0.516493,0.20281,brother,אח,0.172996,41,0.089351
16506,16506,brother,אָ֗ח,2,0.370858,0.166233,brother,אח,0.172996,41,0.064157
16507,16507,brother,אָ֣ח,3,0.591563,0.198404,brother,אח,0.172996,41,0.102338
16508,16508,brother,אָ֥ח,3,0.53279,0.223882,brother,אח,0.172996,41,0.09217
16509,16509,brother,אָֽח,3,0.578469,0.233705,brother,אח,0.172996,41,0.100073
16510,16510,brother,אָחִ,3,0.599111,0.256289,brother,אח,0.172996,41,0.103644
16511,16511,brother,אָחִ֑,3,0.477107,0.275923,brother,אח,0.172996,41,0.082537
16514,16514,brother,אָחִ֕,1,0.33854,0.222177,brother,אח,0.172996,41,0.058566


In [655]:
candidate_df[candidate_df.apply(lambda x: x['normalized_source'] in list(candidate_words) or x['normalized_target'] in list(candidate_words), axis=1)].sort_values('total_score', ascending=False)


Unnamed: 0.1,Unnamed: 0,source,target,align_count,word score,verse score,normalized_source,normalized_target,jac_sim,match_counts,total_score
71220,71220,my,אַחַ֣,4,0.382741,0.228364,my,אח,0.018008,49,0.006892
136655,136655,אַחַ֖,wicked,1,0.093528,0.179815,אח,wicked,-1.0,0,-0.093528
136653,136653,אַחַ֖,thing,1,0.095583,0.179815,אח,thing,-1.0,0,-0.095583
110511,110511,this,אַחַ֖,1,0.095848,0.211885,this,אח,-1.0,0,-0.095848
136654,136654,אַחַ֖,this,1,0.095848,0.179815,אח,this,-1.0,0,-0.095848
136690,136690,אַחַ֣,my,5,0.382741,0.214986,אח,my,-1.0,0,-0.382741


Total number of names:

In [604]:
word

'שֶׁמְאֵ֨בֶר֙'

In [603]:
df[df['source'] == 'shemeber']

Unnamed: 0.1,Unnamed: 0,source,target,align_count,word score,verse score,normalized_source,normalized_target,jac_sim,match_counts,total_score
80989,80989,shemeber,אַדְמָ֗ה,1,0.78108,0.330563,shemeber,אדמה,-1.0,0,-0.78108


Number of matched names:

In [633]:
vref_index

464

In [243]:
df[df['target'] == 'עֵ֖דֶן']

Unnamed: 0.1,Unnamed: 0,source,target,align_count,word score,verse score,normalized_source,normalized_target,jac_sim,match_counts,total_score
26110,26110,eden,עֵ֖דֶן,2,0.799089,0.322273,eden,עדן,0.818182,18,3.79264


In [244]:
matched_names['predicted'] = matched_names['text'].progress_apply(lambda x: df.loc[df[df['target'] == x]['total_score'].idxmax()]['source'])

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32282/32282 [06:18<00:00, 85.21it/s]


In [245]:
matched_names

Unnamed: 0,book,chapter,verse,word,subword,id,marble,ot-nt,unicode,text,lemma,normalized,english,greek,predicted
774,GEN,2,4,8,1,o010020040081,00100200400026,OT,יְהוָ֥ה,יְהוָ֥ה,3068,,lord,,lord
799,GEN,2,5,15,1,o010020050151,00100200500042,OT,יְהוָ֤ה,יְהוָ֤ה,3068,,lord,,lord
827,GEN,2,7,2,1,o010020070021,00100200700006,OT,יְהוָ֨ה,יְהוָ֨ה,3068,,lord,,lord
852,GEN,2,8,2,1,o010020080021,00100200800006,OT,יְהוָ֧ה,יְהוָ֧ה,3068,,lord,κύριος,lord
856,GEN,2,8,5,2,o010020080052,00100200800014,OT,עֵ֖דֶן,עֵ֖דֶן,5731 b,,eden,εδεμ,eden
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475752,MAL,3,21,14,1,o390030210141,03900302100034,OT,יְהוָ֥ה,יְהוָ֥ה,3068,,lord,κύριος,lord
475756,MAL,3,22,3,1,o390030220031,03900302200006,OT,מֹשֶׁ֣ה,מֹשֶׁ֣ה,4872,,moses,μωυσῆ,moses
475767,MAL,3,22,11,1,o390030220111,03900302200024,OT,יִשְׂרָאֵ֔ל,יִשְׂרָאֵ֔ל,3478,,israel,ισραηλ,israel
475777,MAL,3,23,6,1,o390030230061,03900302300012,OT,אֵלִיָּ֣ה,אֵלִיָּ֣ה,452,,elijah,ηλιαν,elijah


In [246]:
def is_word_correct(row):
    word = row['english'].split("’")[0] if row['english'] is not None else ''
    return word == row['predicted']

In [247]:
matched_names[matched_names.apply(is_word_correct, axis=1)]

Unnamed: 0,book,chapter,verse,word,subword,id,marble,ot-nt,unicode,text,lemma,normalized,english,greek,predicted
774,GEN,2,4,8,1,o010020040081,00100200400026,OT,יְהוָ֥ה,יְהוָ֥ה,3068,,lord,,lord
799,GEN,2,5,15,1,o010020050151,00100200500042,OT,יְהוָ֤ה,יְהוָ֤ה,3068,,lord,,lord
827,GEN,2,7,2,1,o010020070021,00100200700006,OT,יְהוָ֨ה,יְהוָ֨ה,3068,,lord,,lord
852,GEN,2,8,2,1,o010020080021,00100200800006,OT,יְהוָ֧ה,יְהוָ֧ה,3068,,lord,κύριος,lord
856,GEN,2,8,5,2,o010020080052,00100200800014,OT,עֵ֖דֶן,עֵ֖דֶן,5731 b,,eden,εδεμ,eden
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475752,MAL,3,21,14,1,o390030210141,03900302100034,OT,יְהוָ֥ה,יְהוָ֥ה,3068,,lord,κύριος,lord
475756,MAL,3,22,3,1,o390030220031,03900302200006,OT,מֹשֶׁ֣ה,מֹשֶׁ֣ה,4872,,moses,μωυσῆ,moses
475767,MAL,3,22,11,1,o390030220111,03900302200024,OT,יִשְׂרָאֵ֔ל,יִשְׂרָאֵ֔ל,3478,,israel,ισραηλ,israel
475777,MAL,3,23,6,1,o390030230061,03900302300012,OT,אֵלִיָּ֣ה,אֵלִיָּ֣ה,452,,elijah,ηλιαν,elijah


In [248]:
matched_names[~matched_names.apply(is_word_correct, axis=1)].head(100)

Unnamed: 0,book,chapter,verse,word,subword,id,marble,ot-nt,unicode,text,lemma,normalized,english,greek,predicted
1591,GEN,3,17,1,3,o010030170013,100301700006,OT,אָדָ֣ם,אָדָ֣ם,121,,adam,αδαμ,man
1682,GEN,3,21,4,2,o010030210042,100302100012,OT,אָדָ֧ם,אָדָ֧ם,120,,adam,αδαμ,of
1809,GEN,4,2,11,2,o010040020112,100400200030,OT,קַ֕יִן,קַ֕יִן,7014 b,,cain,καιν,tiller
1845,GEN,4,4,11,1,o010040040111,100400400032,OT,הֶ֖בֶל,הֶ֖בֶל,1893,,abel,αβελ,vanity
1927,GEN,4,8,12,1,o010040080121,100400800036,OT,הֶ֥בֶל,הֶ֥בֶל,1893,,abel,αβελ,futility
2114,GEN,4,18,2,2,o010040180022,100401800008,OT,חֲנוֹךְ֙,חֲנוֹךְ֙,2585,,enoch,ενωξ,hanoch
2198,GEN,4,23,4,1,o010040230041,100402300012,OT,עָדָ֤ה,עָדָ֤ה,5711,,adah,αδα,zillah
2231,GEN,4,25,2,1,o010040250021,100402500006,OT,אָדָ֥ם,אָדָ֥ם,121,,adam,αδαμ,man
2270,GEN,4,26,9,1,o010040260091,100402600024,OT,אֱנ֑וֹשׁ,אֱנ֑וֹשׁ,583,,enosh,ενως,man
2281,GEN,5,1,4,1,o010050010041,100500100008,OT,אָדָ֑ם,אָדָ֑ם,121,,adam,ἀνθρώπων,man


In [167]:
hebrew_names = set(hebrew_names)
hebrew_matched_names = set(hebrew_matched_names)
list(hebrew_names - hebrew_matched_names)

['מָכִֽי',
 'יְה֗וּד',
 'מִיכָיָ֜ה',
 'זִ֥יעַ',
 'נָב֗וֹת',
 'נְבַלָּֽט',
 'כִּנֶּ֔רֶת',
 'נְבֽוּכַדְרֶאצַּ֥ר',
 'זְבֻל֑וּן',
 'רֶ֧גֶם',
 'מּוֹאֲבִיָּ֜ה',
 'הֶ֨בֶל֙',
 'עֵ֠ילָם',
 'מֵידָ֔ד',
 'יִּשְׂרְאֵלִֽי',
 'אֱלִישָׁ֖ה',
 'הוֹמָ֑ם',
 'בְאֵרָֽא',
 'אַשְׁבֵּלִ֑י',
 'אֲגַ֥ג',
 'רֽאוּבֵנִ֗י',
 'תַפֻּ֖חַ',
 'אָצַל֒',
 'כִּסְלֵיו֙',
 'רְפָיָ֧ה',
 'מִיכָיָ֧ה',
 'בִּתִּ֞',
 'שִׁמְעָתִ֖ים',
 'תַּ֠תְּנַי',
 'יִצְחָ֨ק',
 'אֲשִׁימָֽא',
 'יֵשׁ֛וּעַ',
 'אֲחַזְיָ֤ה',
 'אֲרִיאֵ֡ל',
 'גִּבְעָ֨תָ',
 'דָּ֣ן',
 'שְׁבָ֔ט',
 'בָּשָׁ֞ן',
 'יִבְלְעָ֨ם',
 'עִיּ֡וֹן',
 'תֻּבַ֣ל',
 'בִּנְיָמִ֣ין',
 'צָפ֥וֹן',
 'חֶלְקָֽי',
 'שָׁמָע֙',
 'מֹּ֣רַשְׁתִּ֔י',
 'כְּנַעֲנִ֨י',
 'אֲחִיה֖וּד',
 'פַּלְטִ֔י',
 'קְעִלָ֔ה',
 'צוֹפִ֖ים',
 'אַבְרָ֫הָ֥ם',
 'שִׁרְטַ֖י',
 'בַקְבֻּֽקְיָ֛ה',
 'מֹּצָֽה',
 'כְנָ֑נִי',
 'יַּעֲקֹ֑ב',
 'יְהוּדִי֙',
 'נַעֲמָֽן',
 'מָע֖וֹךְ',
 'פְּרָ֔ת',
 'שַׁלְמַ֖י',
 'שִׁמְשַׁ֥י',
 'חֲשֻׂפָ֖א',
 'שִׁפְעִ֧י',
 'אַמְנֹ֗ן',
 'יַֽאֲזַנְיָ֨הוּ֙',
 'בּוּנִּֽי',
 'עֵיבָ֖ל',
 'ירִימ֤וֹת',
 'אֵ֔צֶר',
 'אֲשַׂרְ

In [227]:
df[df['source'] == 'judith']

Unnamed: 0.1,Unnamed: 0,source,target,align_count,word score,verse score,normalized_source,normalized_target,jac_sim,match_counts,total_score
48917,48917,judith,אִשָּׁה֙,1,0.333115,0.380322,judith,אשה,-1.0,0,-0.0
48918,48918,judith,יִּקַּ֤ח,1,0.332959,0.380322,judith,יקח,-1.0,0,-0.0


In [249]:
bible_df[bible_df['text'] == 'יְהוּדִ֔ית']

Unnamed: 0,book,chapter,verse,word,subword,id,marble,ot-nt,unicode,text,lemma,normalized,english,greek,hebrew,mandarin,domain_codes,domains,sdbh_codes,top_domains
15003,GEN,26,34,9,1,o010260340091,102603400022,OT,יְהוּדִ֔ית,יְהוּדִ֔ית,3067,,Judith,ιουδιν,יְהוּדִ֔ית,犹滴,[003001007],[Names of People],[002822001001000],[003001]
231296,2KI,18,26,21,1,o120180260211,1201802600048,OT,יְהוּדִ֔ית,יְהוּדִ֔ית,3066,,in Hebrew,ιουδαϊστί,יְהוּדִ֔ית,用希伯来语,[003001009],[Names of Languages],[002820001001000],[003001]
366489,ISA,36,11,19,1,o230360110191,2303601100044,OT,יְהוּדִ֔ית,יְהוּדִ֔ית,3066,,language of Judah,ιουδαϊστί,יְהוּדִ֔ית,犹大语,[003001009],[Names of Languages],[002820001001000],[003001]


In [251]:
df[df['normalized_target'] == 'יהודית']

Unnamed: 0.1,Unnamed: 0,source,target,align_count,word score,verse score,normalized_source,normalized_target,jac_sim,match_counts,total_score
10354,10354,beeri,יְהוּדִ֔ית,1,0.195174,0.380322,beeri,יהודית,0.125,1,0.009279
48762,48762,judean,יְהוּדִ֑ית,2,0.57045,0.31968,judean,יהודית,0.571429,4,0.416827
48763,48763,judean,יְהוּדִ֔ית,2,0.523498,0.333,judean,יהודית,0.571429,4,0.398457


In [229]:
bible_df[bible_df['english'] == 'Judith']

Unnamed: 0,book,chapter,verse,word,subword,id,marble,ot-nt,unicode,text,lemma,normalized,english,greek,hebrew,mandarin,domain_codes,domains,sdbh_codes,top_domains
15003,GEN,26,34,9,1,o010260340091,102603400022,OT,יְהוּדִ֔ית,יְהוּדִ֔ית,3067,,Judith,ιουδιν,יְהוּדִ֔ית,犹滴,[003001007],[Names of People],[002822001001000],[003001]


In [250]:
normalize_word('יְהוּדִ֔ית')

'יהודית'

In [252]:
old_matched_names = matched_names