In [324]:
from pathlib import Path
from combined import run_fa, run_match_words, combine_df
import os
from match import normalize_word
import pandas as pd
from tqdm import tqdm
from machine.tokenization import LatinWordTokenizer
from typing import Iterable, Tuple, List
import re
tqdm.pandas()
from collections import Counter

pd.set_option('display.max_rows', 500)

In [402]:
df = pd.read_csv('pfs/out/en-NIV84_combined.csv')
bible_df = pd.read_parquet('../../../../new2old/data/bible_words.parquet')


In [432]:
vref_file = 'pfs/ref_data/vref.txt'
with open(vref_file) as f:
    vref_data = f.readlines()
vref_data

['GEN 1:1\n',
 'GEN 1:2\n',
 'GEN 1:3\n',
 'GEN 1:4\n',
 'GEN 1:5\n',
 'GEN 1:6\n',
 'GEN 1:7\n',
 'GEN 1:8\n',
 'GEN 1:9\n',
 'GEN 1:10\n',
 'GEN 1:11\n',
 'GEN 1:12\n',
 'GEN 1:13\n',
 'GEN 1:14\n',
 'GEN 1:15\n',
 'GEN 1:16\n',
 'GEN 1:17\n',
 'GEN 1:18\n',
 'GEN 1:19\n',
 'GEN 1:20\n',
 'GEN 1:21\n',
 'GEN 1:22\n',
 'GEN 1:23\n',
 'GEN 1:24\n',
 'GEN 1:25\n',
 'GEN 1:26\n',
 'GEN 1:27\n',
 'GEN 1:28\n',
 'GEN 1:29\n',
 'GEN 1:30\n',
 'GEN 1:31\n',
 'GEN 2:1\n',
 'GEN 2:2\n',
 'GEN 2:3\n',
 'GEN 2:4\n',
 'GEN 2:5\n',
 'GEN 2:6\n',
 'GEN 2:7\n',
 'GEN 2:8\n',
 'GEN 2:9\n',
 'GEN 2:10\n',
 'GEN 2:11\n',
 'GEN 2:12\n',
 'GEN 2:13\n',
 'GEN 2:14\n',
 'GEN 2:15\n',
 'GEN 2:16\n',
 'GEN 2:17\n',
 'GEN 2:18\n',
 'GEN 2:19\n',
 'GEN 2:20\n',
 'GEN 2:21\n',
 'GEN 2:22\n',
 'GEN 2:23\n',
 'GEN 2:24\n',
 'GEN 2:25\n',
 'GEN 3:1\n',
 'GEN 3:2\n',
 'GEN 3:3\n',
 'GEN 3:4\n',
 'GEN 3:5\n',
 'GEN 3:6\n',
 'GEN 3:7\n',
 'GEN 3:8\n',
 'GEN 3:9\n',
 'GEN 3:10\n',
 'GEN 3:11\n',
 'GEN 3:12\n',
 'GEN 3

In [427]:
def text_to_words(text: str) -> List[str]:
    """
    Inputs:
        text:   Normally a sentence, or Bible verse
    Outputs:
        A list of words, where the sentence has had its punctuation removed, and words splits into a list of words
    """
    # word_list = re.sub("[^\w\s]", "", text.lower()).split()
    # word_list = text.lower().replace('.', '').replace(',', '').replace('?', '').replace('!', '').split()
    word_tokenizer = LatinWordTokenizer()
    word_list = [normalize_word(word) for word in word_tokenizer.tokenize(text)]
    return word_list


def vref_to_df(file):
    with open(file, 'r') as f:
        bible_data = f.readlines()
    words = [text_to_words(line) for line in bible_data]
    normalized_words = [[normalize_word(word) if word else '' for word in word_list] if word_list else [] for word_list in words]
    df = pd.DataFrame({'text': bible_data, 'words': words, 'normalized_words': normalized_words})
    df = df[df['text'].apply(lambda x: len(x) > 2)]
    df = df[df['text'] != "b'\n'"]
    return df

In [428]:
NIV84_df = vref_to_df('pfs/text_data/en-NIV84.txt')

In [357]:
gold_data = pd.read_parquet('../../../../new2old/data/gold_translation_data.parquet')

In [508]:
df.loc[:, 'total_score'] = df.apply(lambda row: row['word score'] * row['jac_sim'], axis=1)

In [404]:
domains = ['003001001', '003001002','003001004','003001006', '003001007','003001008','003001009','003001010', '003001012' ,'003001013' , '003001014', '003001015' , '003001017']
names_df = bible_df[bible_df['domain_codes'].apply(lambda x: len(set(domains).intersection(set(x))) > 0)]
names_df.loc[:, 'english'] = names_df['english'].apply(lambda x: x.lower() if x is not None else x)
# print(names)
english_names = names_df['english'].unique()
english_names = [name.lower() for name in english_names if name is not None]
hebrew_names = names_df['text'].unique()
hebrew_names = [name.lower() for name in hebrew_names if name is not None]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  names_df.loc[:, 'english'] = names_df['english'].apply(lambda x: x.lower() if x is not None else x)


In [361]:
hebrew_matched_names = list(set(hebrew_names).intersection(df[df['jac_sim'] > 0]['target'].unique()))
english_matched_names = list(set(english_names).intersection(df[df['jac_sim'] > 0]['source'].unique()))


In [571]:
print(len(hebrew_names))
print(len(hebrew_matched_names))
print(len(english_names))
print(len(english_matched_names))

9744
7245
2649
2155


In [406]:
names = names_df[names_df['text'].apply(lambda x: x in hebrew_names)]
# print(names_df.iloc[:, :14][names_df['english'].apply(lambda x: x in english_matched_names)])

In [407]:
names = names[gold_data['NIV84'].apply(lambda x: x is not None)]

  names = names[gold_data['NIV84'].apply(lambda x: x is not None)]


In [572]:
max_rows = -1
counter = Counter()
for index, row in names.iloc[:max_rows, :].iterrows():
    hebrew_source = row['text']
    hebrew_source_normalized = normalize_word(hebrew_source)    
    book = row['book']
    chapter = row['chapter']
    verse = row['verse']
#     word = row['word']
#     subword = row['subword']
    ref = f'{book} {chapter}:{verse}\n'
    vref_index = vref_data.index(ref)
#     print(ref)
#     print(vref_index)
#     candidate_words = set(gold_data[(gold_data['book'] == book) & (gold_data['chapter'] == chapter) & (gold_data['verse'] == verse)]['NIV84'])
#     candidate_words = [word.lower() for word in candidate_words if word is not None]
    candidate_words = set(NIV84_df.loc[vref_index, 'words'])  if vref_index in NIV84_df.index else set([])
    candidate_words = [word.lower().split("ʼ")[0] for word in candidate_words if word is not None]

    candidate_df = df[df['normalized_target'] == hebrew_source_normalized]
#     actual = row['english']
    actual = gold_data.loc[index, ['NIV84']].values[0] if index in gold_data.index else None
    actual = actual.lower() if actual else ''
    if len(actual.split()) > 1:
        counter['multiple words'] += 1
    elif candidate_df[candidate_df.apply(lambda x: x['normalized_source'] in list(candidate_words), axis=1)].shape[0] > 0:
        pred = candidate_df.loc[candidate_df[candidate_df.apply(lambda x: x['normalized_source'] in list(candidate_words), axis=1)]['total_score'].idxmax()]['source']
        correct = pred == actual or pred == actual.split("ʼ")[0] or actual == pred.split("ʼ")[0] 
        counter.update([correct]) 
    else:
        pred = ''
        counter['no prediction'] += 1

    if not correct:
        print(index)
        print(f"actual: {actual}   pred: {pred}")
#         print(candidate_df)
        print(counter)

920
actual: pishon   pred: first
Counter({True: 7, False: 1})
1591
actual: adam   pred: of
Counter({True: 29, False: 2})
1670
actual: eve   pred: would
Counter({True: 29, False: 3})
1682
actual: adam   pred: of
Counter({True: 30, False: 4})
2231
actual: adam   pred: of
Counter({True: 88, False: 5, 'no prediction': 1})
2281
actual: adam   pred: man
Counter({True: 94, False: 6, 'no prediction': 1})
2314
actual: adam   pred: in
Counter({True: 94, False: 7, 'no prediction': 1})
2777
actual: shem   pred: the
Counter({True: 141, False: 8, 'no prediction': 1})
3519
actual: shem   pred: the
Counter({True: 168, False: 9, 'no prediction': 1})
4687
actual: shem   pred: the
Counter({True: 187, False: 10, 'no prediction': 1})
4750
actual: shem   pred: in
Counter({True: 195, False: 11, 'no prediction': 1})
4813
actual: shem   pred: the
Counter({True: 199, False: 12, 'no prediction': 1})
4828
actual: shem   pred: in
Counter({True: 201, False: 13, 'no prediction': 1})
4865
actual: shem   pred: the
Cou

KeyboardInterrupt: 

In [628]:
line = 9318

In [629]:
gold_data.iloc[line-8: line+8, [0, 1, 2, 3, 4, 5, 10,11, 32]]

Unnamed: 0,index,book,chapter,verse,word,subword,text,lemma,NIV84
9310,9310,GEN,19,6,6,1,סָגַ֥ר,5462,shut
9311,9311,GEN,19,6,7,1,אַחֲרָֽי,310 a,
9312,9312,GEN,19,6,7,2,ו,,
9313,9313,GEN,19,7,1,1,וַ,c,
9314,9314,GEN,19,7,1,2,יֹּאמַ֑ר,559,said
9315,9315,GEN,19,7,2,1,אַל,408,Don
9316,9316,GEN,19,7,3,1,נָ֥א,4994,
9317,9317,GEN,19,7,5,1,תָּרֵֽעוּ,7489 a,do this wicked thing
9318,9318,GEN,19,7,4,1,אַחַ֖,251,friends
9319,9319,GEN,19,7,4,2,י,,friends


In [630]:
word = bible_df.loc[line, ['text']].values[0][0]
hebrew_source_normalized = normalize_word(word)
book = bible_df.loc[line, ['book']].values[0][0]
chapter = bible_df.loc[line, ['chapter']].values[0][0]
verse = bible_df.loc[line, ['verse']].values[0][0]
ref = f'{book} {chapter}:{verse}\n'
vref_index = vref_data.index(ref)

def normalize_word(word):
    return re.sub("[^\w\s]", "", word.lower()) if word else ''

candidate_words = set(NIV84_df.loc[vref_index, 'words']) if vref_index in NIV84_df.index else set([])
candidate_words = [word.lower().split("ʼ")[0] for word in candidate_words if word is not None]
candidate_words

['',
 'this',
 'said',
 'my',
 'and',
 'friends',
 'don',
 'thing',
 'wicked',
 'no',
 'do']

In [631]:
candidate_df = df[df['normalized_target'] == hebrew_source_normalized]
candidate_df

Unnamed: 0.1,Unnamed: 0,source,target,align_count,word score,verse score,normalized_source,normalized_target,jac_sim,match_counts,total_score
619,619,a,אָֽח,2,0.120882,0.207583,a,אח,-1.0,0,-0.120882
4708,4708,an,אָ֗ח,1,0.137852,0.223522,an,אח,-1.0,0,-0.137852
13798,13798,brother,אָ֖ח,2,0.516493,0.20281,brother,אח,0.172996,41,0.089351
13799,13799,brother,אָ֗ח,2,0.370858,0.166233,brother,אח,0.172996,41,0.064157
13800,13800,brother,אָ֣ח,3,0.591563,0.198404,brother,אח,0.172996,41,0.102338
13801,13801,brother,אָ֥ח,3,0.53279,0.223882,brother,אח,0.172996,41,0.09217
13802,13802,brother,אָֽח,3,0.578469,0.233705,brother,אח,0.172996,41,0.100073
13803,13803,brother,אָחִ,3,0.599111,0.256289,brother,אח,0.172996,41,0.103644
13804,13804,brother,אָחִ֑,3,0.477107,0.275923,brother,אח,0.172996,41,0.082537
13807,13807,brother,אָחִ֕,1,0.33854,0.222177,brother,אח,0.172996,41,0.058566


In [632]:
candidate_df[candidate_df.apply(lambda x: x['normalized_source'] in list(candidate_words), axis=1)].sort_values('total_score', ascending=False)


Unnamed: 0.1,Unnamed: 0,source,target,align_count,word score,verse score,normalized_source,normalized_target,jac_sim,match_counts,total_score
60293,60293,my,אַחַ֣,4,0.382741,0.228364,my,אח,0.018008,49,0.006892
93619,93619,this,אַחַ֖,1,0.095848,0.211885,this,אח,-1.0,0,-0.095848


Total number of names:

In [604]:
word

'שֶׁמְאֵ֨בֶר֙'

In [603]:
df[df['source'] == 'shemeber']

Unnamed: 0.1,Unnamed: 0,source,target,align_count,word score,verse score,normalized_source,normalized_target,jac_sim,match_counts,total_score
80989,80989,shemeber,אַדְמָ֗ה,1,0.78108,0.330563,shemeber,אדמה,-1.0,0,-0.78108


Number of matched names:

In [242]:
matched_names.shape[0]

32282

In [243]:
df[df['target'] == 'עֵ֖דֶן']

Unnamed: 0.1,Unnamed: 0,source,target,align_count,word score,verse score,normalized_source,normalized_target,jac_sim,match_counts,total_score
26110,26110,eden,עֵ֖דֶן,2,0.799089,0.322273,eden,עדן,0.818182,18,3.79264


In [244]:
matched_names['predicted'] = matched_names['text'].progress_apply(lambda x: df.loc[df[df['target'] == x]['total_score'].idxmax()]['source'])

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32282/32282 [06:18<00:00, 85.21it/s]


In [245]:
matched_names

Unnamed: 0,book,chapter,verse,word,subword,id,marble,ot-nt,unicode,text,lemma,normalized,english,greek,predicted
774,GEN,2,4,8,1,o010020040081,00100200400026,OT,יְהוָ֥ה,יְהוָ֥ה,3068,,lord,,lord
799,GEN,2,5,15,1,o010020050151,00100200500042,OT,יְהוָ֤ה,יְהוָ֤ה,3068,,lord,,lord
827,GEN,2,7,2,1,o010020070021,00100200700006,OT,יְהוָ֨ה,יְהוָ֨ה,3068,,lord,,lord
852,GEN,2,8,2,1,o010020080021,00100200800006,OT,יְהוָ֧ה,יְהוָ֧ה,3068,,lord,κύριος,lord
856,GEN,2,8,5,2,o010020080052,00100200800014,OT,עֵ֖דֶן,עֵ֖דֶן,5731 b,,eden,εδεμ,eden
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475752,MAL,3,21,14,1,o390030210141,03900302100034,OT,יְהוָ֥ה,יְהוָ֥ה,3068,,lord,κύριος,lord
475756,MAL,3,22,3,1,o390030220031,03900302200006,OT,מֹשֶׁ֣ה,מֹשֶׁ֣ה,4872,,moses,μωυσῆ,moses
475767,MAL,3,22,11,1,o390030220111,03900302200024,OT,יִשְׂרָאֵ֔ל,יִשְׂרָאֵ֔ל,3478,,israel,ισραηλ,israel
475777,MAL,3,23,6,1,o390030230061,03900302300012,OT,אֵלִיָּ֣ה,אֵלִיָּ֣ה,452,,elijah,ηλιαν,elijah


In [246]:
def is_word_correct(row):
    word = row['english'].split("’")[0] if row['english'] is not None else ''
    return word == row['predicted']

In [247]:
matched_names[matched_names.apply(is_word_correct, axis=1)]

Unnamed: 0,book,chapter,verse,word,subword,id,marble,ot-nt,unicode,text,lemma,normalized,english,greek,predicted
774,GEN,2,4,8,1,o010020040081,00100200400026,OT,יְהוָ֥ה,יְהוָ֥ה,3068,,lord,,lord
799,GEN,2,5,15,1,o010020050151,00100200500042,OT,יְהוָ֤ה,יְהוָ֤ה,3068,,lord,,lord
827,GEN,2,7,2,1,o010020070021,00100200700006,OT,יְהוָ֨ה,יְהוָ֨ה,3068,,lord,,lord
852,GEN,2,8,2,1,o010020080021,00100200800006,OT,יְהוָ֧ה,יְהוָ֧ה,3068,,lord,κύριος,lord
856,GEN,2,8,5,2,o010020080052,00100200800014,OT,עֵ֖דֶן,עֵ֖דֶן,5731 b,,eden,εδεμ,eden
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475752,MAL,3,21,14,1,o390030210141,03900302100034,OT,יְהוָ֥ה,יְהוָ֥ה,3068,,lord,κύριος,lord
475756,MAL,3,22,3,1,o390030220031,03900302200006,OT,מֹשֶׁ֣ה,מֹשֶׁ֣ה,4872,,moses,μωυσῆ,moses
475767,MAL,3,22,11,1,o390030220111,03900302200024,OT,יִשְׂרָאֵ֔ל,יִשְׂרָאֵ֔ל,3478,,israel,ισραηλ,israel
475777,MAL,3,23,6,1,o390030230061,03900302300012,OT,אֵלִיָּ֣ה,אֵלִיָּ֣ה,452,,elijah,ηλιαν,elijah


In [248]:
matched_names[~matched_names.apply(is_word_correct, axis=1)].head(100)

Unnamed: 0,book,chapter,verse,word,subword,id,marble,ot-nt,unicode,text,lemma,normalized,english,greek,predicted
1591,GEN,3,17,1,3,o010030170013,100301700006,OT,אָדָ֣ם,אָדָ֣ם,121,,adam,αδαμ,man
1682,GEN,3,21,4,2,o010030210042,100302100012,OT,אָדָ֧ם,אָדָ֧ם,120,,adam,αδαμ,of
1809,GEN,4,2,11,2,o010040020112,100400200030,OT,קַ֕יִן,קַ֕יִן,7014 b,,cain,καιν,tiller
1845,GEN,4,4,11,1,o010040040111,100400400032,OT,הֶ֖בֶל,הֶ֖בֶל,1893,,abel,αβελ,vanity
1927,GEN,4,8,12,1,o010040080121,100400800036,OT,הֶ֥בֶל,הֶ֥בֶל,1893,,abel,αβελ,futility
2114,GEN,4,18,2,2,o010040180022,100401800008,OT,חֲנוֹךְ֙,חֲנוֹךְ֙,2585,,enoch,ενωξ,hanoch
2198,GEN,4,23,4,1,o010040230041,100402300012,OT,עָדָ֤ה,עָדָ֤ה,5711,,adah,αδα,zillah
2231,GEN,4,25,2,1,o010040250021,100402500006,OT,אָדָ֥ם,אָדָ֥ם,121,,adam,αδαμ,man
2270,GEN,4,26,9,1,o010040260091,100402600024,OT,אֱנ֑וֹשׁ,אֱנ֑וֹשׁ,583,,enosh,ενως,man
2281,GEN,5,1,4,1,o010050010041,100500100008,OT,אָדָ֑ם,אָדָ֑ם,121,,adam,ἀνθρώπων,man


In [167]:
hebrew_names = set(hebrew_names)
hebrew_matched_names = set(hebrew_matched_names)
list(hebrew_names - hebrew_matched_names)

['מָכִֽי',
 'יְה֗וּד',
 'מִיכָיָ֜ה',
 'זִ֥יעַ',
 'נָב֗וֹת',
 'נְבַלָּֽט',
 'כִּנֶּ֔רֶת',
 'נְבֽוּכַדְרֶאצַּ֥ר',
 'זְבֻל֑וּן',
 'רֶ֧גֶם',
 'מּוֹאֲבִיָּ֜ה',
 'הֶ֨בֶל֙',
 'עֵ֠ילָם',
 'מֵידָ֔ד',
 'יִּשְׂרְאֵלִֽי',
 'אֱלִישָׁ֖ה',
 'הוֹמָ֑ם',
 'בְאֵרָֽא',
 'אַשְׁבֵּלִ֑י',
 'אֲגַ֥ג',
 'רֽאוּבֵנִ֗י',
 'תַפֻּ֖חַ',
 'אָצַל֒',
 'כִּסְלֵיו֙',
 'רְפָיָ֧ה',
 'מִיכָיָ֧ה',
 'בִּתִּ֞',
 'שִׁמְעָתִ֖ים',
 'תַּ֠תְּנַי',
 'יִצְחָ֨ק',
 'אֲשִׁימָֽא',
 'יֵשׁ֛וּעַ',
 'אֲחַזְיָ֤ה',
 'אֲרִיאֵ֡ל',
 'גִּבְעָ֨תָ',
 'דָּ֣ן',
 'שְׁבָ֔ט',
 'בָּשָׁ֞ן',
 'יִבְלְעָ֨ם',
 'עִיּ֡וֹן',
 'תֻּבַ֣ל',
 'בִּנְיָמִ֣ין',
 'צָפ֥וֹן',
 'חֶלְקָֽי',
 'שָׁמָע֙',
 'מֹּ֣רַשְׁתִּ֔י',
 'כְּנַעֲנִ֨י',
 'אֲחִיה֖וּד',
 'פַּלְטִ֔י',
 'קְעִלָ֔ה',
 'צוֹפִ֖ים',
 'אַבְרָ֫הָ֥ם',
 'שִׁרְטַ֖י',
 'בַקְבֻּֽקְיָ֛ה',
 'מֹּצָֽה',
 'כְנָ֑נִי',
 'יַּעֲקֹ֑ב',
 'יְהוּדִי֙',
 'נַעֲמָֽן',
 'מָע֖וֹךְ',
 'פְּרָ֔ת',
 'שַׁלְמַ֖י',
 'שִׁמְשַׁ֥י',
 'חֲשֻׂפָ֖א',
 'שִׁפְעִ֧י',
 'אַמְנֹ֗ן',
 'יַֽאֲזַנְיָ֨הוּ֙',
 'בּוּנִּֽי',
 'עֵיבָ֖ל',
 'ירִימ֤וֹת',
 'אֵ֔צֶר',
 'אֲשַׂרְ

In [227]:
df[df['source'] == 'judith']

Unnamed: 0.1,Unnamed: 0,source,target,align_count,word score,verse score,normalized_source,normalized_target,jac_sim,match_counts,total_score
48917,48917,judith,אִשָּׁה֙,1,0.333115,0.380322,judith,אשה,-1.0,0,-0.0
48918,48918,judith,יִּקַּ֤ח,1,0.332959,0.380322,judith,יקח,-1.0,0,-0.0


In [249]:
bible_df[bible_df['text'] == 'יְהוּדִ֔ית']

Unnamed: 0,book,chapter,verse,word,subword,id,marble,ot-nt,unicode,text,lemma,normalized,english,greek,hebrew,mandarin,domain_codes,domains,sdbh_codes,top_domains
15003,GEN,26,34,9,1,o010260340091,102603400022,OT,יְהוּדִ֔ית,יְהוּדִ֔ית,3067,,Judith,ιουδιν,יְהוּדִ֔ית,犹滴,[003001007],[Names of People],[002822001001000],[003001]
231296,2KI,18,26,21,1,o120180260211,1201802600048,OT,יְהוּדִ֔ית,יְהוּדִ֔ית,3066,,in Hebrew,ιουδαϊστί,יְהוּדִ֔ית,用希伯来语,[003001009],[Names of Languages],[002820001001000],[003001]
366489,ISA,36,11,19,1,o230360110191,2303601100044,OT,יְהוּדִ֔ית,יְהוּדִ֔ית,3066,,language of Judah,ιουδαϊστί,יְהוּדִ֔ית,犹大语,[003001009],[Names of Languages],[002820001001000],[003001]


In [251]:
df[df['normalized_target'] == 'יהודית']

Unnamed: 0.1,Unnamed: 0,source,target,align_count,word score,verse score,normalized_source,normalized_target,jac_sim,match_counts,total_score
10354,10354,beeri,יְהוּדִ֔ית,1,0.195174,0.380322,beeri,יהודית,0.125,1,0.009279
48762,48762,judean,יְהוּדִ֑ית,2,0.57045,0.31968,judean,יהודית,0.571429,4,0.416827
48763,48763,judean,יְהוּדִ֔ית,2,0.523498,0.333,judean,יהודית,0.571429,4,0.398457


In [229]:
bible_df[bible_df['english'] == 'Judith']

Unnamed: 0,book,chapter,verse,word,subword,id,marble,ot-nt,unicode,text,lemma,normalized,english,greek,hebrew,mandarin,domain_codes,domains,sdbh_codes,top_domains
15003,GEN,26,34,9,1,o010260340091,102603400022,OT,יְהוּדִ֔ית,יְהוּדִ֔ית,3067,,Judith,ιουδιν,יְהוּדִ֔ית,犹滴,[003001007],[Names of People],[002822001001000],[003001]


In [250]:
normalize_word('יְהוּדִ֔ית')

'יהודית'

In [252]:
old_matched_names = matched_names