In [2]:
import pandas as pd
import json
from enum import Enum
from functools import reduce

with open('genesis.json') as fp:
    data = json.load(fp)["versions"][0]


df = pd.DataFrame()
for index, chapter in enumerate(data["text"]):
    chapterNum = index + 1
    
    chapterDf = pd.DataFrame({ "chapter": chapterNum, "text": chapter })
    chapterDf['verse'] = range(1, len(chapterDf) + 1)
    df = pd.concat([df, chapterDf])
   


def strip_non_hebrew(str):
    def is_heb(c): 
        code = ord(c)
        return (code >= 1425 and code <= 1524) or code == 32

    res = [x for x in str if is_heb(x)]
    str = ""
    for x in res:
        str += x
    return str

gen1_1_string = df['text'].iloc[0]

df['heb_text'] = df['text'].apply(lambda x: strip_non_hebrew(x))

df

Unnamed: 0,chapter,text,verse,heb_text
0,1,<big>בְּ</big>רֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת...,1,בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖י...
1,1,וְהָאָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָבֹ֔הוּ וְחֹ֖שֶׁך...,2,וְהָאָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָבֹ֔הוּ וְחֹ֖שֶׁך...
2,1,וַיֹּ֥אמֶר אֱלֹהִ֖ים יְהִ֣י א֑וֹר וַֽיְהִי־אֽוֹר׃,3,וַיֹּ֥אמֶר אֱלֹהִ֖ים יְהִ֣י א֑וֹר וַֽיְהִי־אֽוֹר׃
3,1,וַיַּ֧רְא אֱלֹהִ֛ים אֶת־הָא֖וֹר כִּי־ט֑וֹב וַי...,4,וַיַּ֧רְא אֱלֹהִ֛ים אֶת־הָא֖וֹר כִּי־ט֑וֹב וַי...
4,1,וַיִּקְרָ֨א אֱלֹהִ֤ים&thinsp;<small>׀</small>&...,5,וַיִּקְרָ֨א אֱלֹהִ֤ים׀לָאוֹר֙ י֔וֹם וְלַחֹ֖שֶׁ...
...,...,...,...,...
21,50,וַיֵּ֤שֶׁב יוֹסֵף֙ בְּמִצְרַ֔יִם ה֖וּא וּבֵ֣ית...,22,וַיֵּ֤שֶׁב יוֹסֵף֙ בְּמִצְרַ֔יִם ה֖וּא וּבֵ֣ית...
22,50,וַיַּ֤רְא יוֹסֵף֙ לְאֶפְרַ֔יִם בְּנֵ֖י שִׁלֵּש...,23,וַיַּ֤רְא יוֹסֵף֙ לְאֶפְרַ֔יִם בְּנֵ֖י שִׁלֵּש...
23,50,וַיֹּ֤אמֶר יוֹסֵף֙ אֶל־אֶחָ֔יו אָנֹכִ֖י מֵ֑ת ו...,24,וַיֹּ֤אמֶר יוֹסֵף֙ אֶל־אֶחָ֔יו אָנֹכִ֖י מֵ֑ת ו...
24,50,וַיַּשְׁבַּ֣ע יוֹסֵ֔ף אֶת־בְּנֵ֥י יִשְׂרָאֵ֖ל ...,25,וַיַּשְׁבַּ֣ע יוֹסֵ֔ף אֶת־בְּנֵ֥י יִשְׂרָאֵ֖ל ...


In [15]:
# https://www.ssec.wisc.edu/~tomw/java/unicode.html#x0590
gen1_1_string = df['text'].iloc[0]

# for i in range(len(gen1_1_string)):
#   code_point = ord(gen1_1_string[i])
#   if code_point < 1425 or code_point > 1525:
#       print(f"The Unicode code point of the character '{gen1_1_string[i]}' at index {i} is U+{code_point:04X}.")

teamim = {
    chr(1425): "Etnachta",
    chr(1426): "Segol",
    chr(1427): "Shalshelet",
    chr(1428): "Zakef",
    chr(1429): "Zakef Gadol",
    chr(1430): "Tippecha",
    chr(1431): "Revia",
    chr(1432): "Zarka",
    chr(1433): "Pashta",
    chr(1434): "Yetiv",
    chr(1435): "Tevir",
    chr(1436): "Geresh",
    chr(1437): "Geresh mukdam",
    chr(1438): "Gereshayim",
    chr(1439): "Qarnei Parah",
    chr(1440): "Telisha Gedolah",
    chr(1441): "Pazer",
    chr(1443): "Munah",
    chr(1444): "Mahpakh",
    chr(1445): "Merkha",
    chr(1446): "Merkha kefula",
    chr(1447): "Darga",
    chr(1448): "Kadma",
    chr(1449): "Telisha",
    chr(1450): "Galgal",
    chr(1469): "Meteg",
    chr(1472): "Pasek",
    chr(1475): "Sof pasuk",
}

teamim_map = { v: k for k, v in teamim.items() }
    
def get_teamim_chars(str):
    def is_taam(char):
        code_point = char
        return code_point in teamim.keys()
    
    return [c for c in str if is_taam(c)]

def get_teamim_eng(teamim_list):
    return list(map(lambda a: teamim[a], teamim_list))


df['teamim'] = df.apply(lambda x: get_teamim_chars(x['text']), axis=1)
#df['teamim_eng'] = df.apply(lambda x: get_teamim_eng(x['teamim']), axis=1)
df = df.drop(['text'], axis=1)
df


# for char in gen1_1_string:
#     if char in teamim:
#         print(teamim[char])


Unnamed: 0,chapter,verse,heb_text,teamim
0,1,1,בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖י...,"[֖, ֣, ֑, ֥, ֖, ֥, ֽ, ׃]"
1,1,2,וְהָאָ֗רֶץ הָיְתָ֥ה תֹ֙הוּ֙ וָבֹ֔הוּ וְחֹ֖שֶׁך...,"[֗, ֥, ֙, ֙, ֔, ֖, ֣, ֑, ֣, ֔, ֖, ֥, ֽ, ׃]"
2,1,3,וַיֹּ֥אמֶר אֱלֹהִ֖ים יְהִ֣י א֑וֹר וַֽיְהִי־אֽוֹר׃,"[֥, ֖, ֣, ֑, ֽ, ֽ, ׃]"
3,1,4,וַיַּ֧רְא אֱלֹהִ֛ים אֶת־הָא֖וֹר כִּי־ט֑וֹב וַי...,"[֧, ֛, ֖, ֑, ֣, ֔, ֥, ֖, ֥, ֽ, ׃]"
4,1,5,וַיִּקְרָ֨א אֱלֹהִ֤ים׀לָאוֹר֙ י֔וֹם וְלַחֹ֖שֶׁ...,"[֨, ֤, ׀, ֙, ֔, ֖, ֣, ֑, ֽ, ֥, ֽ, ֖, ֥, ֽ, ׃]"
...,...,...,...,...
21,50,22,וַיֵּ֤שֶׁב יוֹסֵף֙ בְּמִצְרַ֔יִם ה֖וּא וּבֵ֣ית...,"[֤, ֙, ֔, ֖, ֣, ֑, ֣, ֔, ֥, ֖, ֽ, ׃]"
22,50,23,וַיַּ֤רְא יוֹסֵף֙ לְאֶפְרַ֔יִם בְּנֵ֖י שִׁלֵּש...,"[֤, ֙, ֔, ֖, ֑, ֗, ֤, ֙, ֔, ֖, ֥, ֽ, ׃]"
23,50,24,וַיֹּ֤אמֶר יוֹסֵף֙ אֶל־אֶחָ֔יו אָנֹכִ֖י מֵ֑ת ו...,"[֤, ֙, ֔, ֖, ֑, ֽ, ֞, ֧, ֣, ֗, ֤, ֙, ֣, ֔, ֕, ..."
24,50,25,וַיַּשְׁבַּ֣ע יוֹסֵ֔ף אֶת־בְּנֵ֥י יִשְׂרָאֵ֖ל ...,"[֣, ֔, ֥, ֖, ֑, ֨, ֤, ֙, ֔, ֥, ֖, ֽ, ׃]"


In [4]:

get_teamim_eng(get_teamim_chars(gen1_1_string))


['Tippecha',
 'Munah',
 'Etnachta',
 'Merkha',
 'Tippecha',
 'Merkha',
 'Meteg',
 'Sof pasuk']

In [16]:

class Taam(Enum):
    MUNAH = 1
    MERKHA = 2
    TIPPECHA = 3
    ETNACHTA = 4
    YETIV = 5
    KADMA = 6
    MAHPAKH = 7
    PASHTA = 8
    ZAKEF = 9
    ZAKEF_GADOL = 10
    LEGARMEH = 11
    REVIA = 12
    ZARKA = 13
    SEGOL = 14
    TELISHA = 15
    TELISHA_GEDOLAH = 16
    DARGA = 17
    TEVIR = 18
    GERESH = 19
    GERESHAYIM = 20
    GERESH_MUKDAM = 21
    PAZER = 22
    SHALSHELET = 23
    MERKHA_KEFULA = 24
    GALGAL = 25
    QARNEI_PARAH = 26
    SOF_PASUK = 27
    
chars_to_teamim = {
    chr(1425): Taam.ETNACHTA,
    chr(1426): Taam.SEGOL,
    chr(1427): Taam.SHALSHELET,
    chr(1428): Taam.ZAKEF,
    chr(1429): Taam.ZAKEF_GADOL,
    chr(1430): Taam.TIPPECHA,
    chr(1431): Taam.REVIA,
    chr(1432): Taam.ZARKA,
    chr(1433): Taam.PASHTA,
    chr(1434): Taam.YETIV,
    chr(1435): Taam.TEVIR,
    chr(1436): Taam.GERESH,
    chr(1437): Taam.GERESH_MUKDAM,
    chr(1438): Taam.GERESHAYIM,
    chr(1439): Taam.QARNEI_PARAH,
    chr(1440): Taam.TELISHA_GEDOLAH,
    chr(1441): Taam.PAZER,
    chr(1443): Taam.MUNAH,
    chr(1444): Taam.MAHPAKH,
    chr(1445): Taam.MERKHA,
    chr(1446): Taam.MERKHA_KEFULA,
    chr(1447): Taam.DARGA,
    chr(1448): Taam.KADMA,
    chr(1449): Taam.TELISHA,
    chr(1450): Taam.GALGAL,
    chr(1475): Taam.SOF_PASUK,
}



def sof_pasuk(char_list):
    if(char_list[-2:]) == [teamim_map["Meteg"],teamim_map["Sof pasuk"]]:
        return char_list[0:-2] + [teamim_map["Sof pasuk"]]

def starts_sof_pasuk(char_list):
    return char_list[0:2] == [teamim_map["Meteg"],teamim_map["Sof pasuk"]]

def starts_legarmeh(char_list):
    return char_list[0:4] == [teamim_map["Munah"], teamim_map["Pasek"], teamim_map["Munah"], teamim_map["Revia"]]


# chars_to_teamim.keys()

In [18]:
cl = [chr(1445),chr(1430),chr(1469),chr(1475)]
sof_pasuk(cl)

def get_verse_teamim(d, chapter, verse):
    return d.loc[(d["chapter"] == chapter) & (df["verse"] == verse)]["teamim"].iloc[0]

# gen_1_29 = df.loc[(df["chapter"] == 1) & (df["verse"] == 29)]["teamim"].iloc[0]
gen_1_29 = get_verse_teamim(df, 1, 29)

# gen_1_29["heb_text"].iloc[0]
result = []
def translate_to_teamim(char_list, acc):
    if not hasattr(char_list, '__len__') or not len(char_list):
        return acc
    
    if starts_legarmeh(char_list):
        return acc + [Taam.LEGARMEH] + translate_to_teamim(char_list[2:], acc)
    elif starts_sof_pasuk(char_list):
        return acc + [Taam.SOF_PASUK]
    elif char_list[0] in chars_to_teamim.keys():
        return acc + [chars_to_teamim[char_list[0]]] + translate_to_teamim(char_list[1:], acc)
    else:
        return acc + translate_to_teamim(char_list[1:], acc)
    
print(gen_1_29)
# translate_to_teamim(get_verse_teamim(df, 2, 4), [])

df['trop'] = df.apply(lambda x: translate_to_teamim(x['teamim'], []), axis=1)
df[df.has_etnachta.map(set([Taam.ETNACHTA]).issubset)]

# df['has_etnachta'] = df.apply(lambda x: df['trop'].to_list()[0], axis=1)
# null_mask = df.isnull().any(axis = 1)
# null_rows = df[null_mask]
# null_rows


['֣', '֗', '֩', '֨', '֜', '֣', '׀', '֣', '֗', '֙', '֣', '֔', '֛', '֥', '֖', '֣', '֑', '֥', 'ֽ', '֖', 'ֽ', '׃']


<class 'AttributeError'>: 'DataFrame' object has no attribute 'has_etnachta'