In [24]:
import pandas as pd

In [25]:
# Long book names to USFM (3 uppercase letters) format
book_name_mapping = {
    "Genesis": "GEN",
    "Exodus": "EXO",
    "Leviticus": "LEV",
    "Numbers": "NUM",
    "Deuteronomy": "DEU",
    "Joshua": "JOS",
    "Judges": "JDG",
    "Ruth": "RUT",
    "1 Samuel": "1SA",
    "2 Samuel": "2SA",
    "1 Kings": "1KI",
    "2 Kings": "2KI",
    "1 Chronicles": "1CH",
    "2 Chronicles": "2CH",
    "Ezra": "EZR",
    "Nehemiah": "NEH",
    "Esther": "EST",
    "Job": "JOB",
    "Psalms": "PSA",
    "Psalm": "PSA",
    "Proverbs": "PRO",
    "Ecclesiastes": "ECC",
    "Song of Solomon": "SNG",
    "Isaiah": "ISA",
    "Jeremiah": "JER",
    "Lamentations": "LAM",
    "Ezekiel": "EZK",
    "Daniel": "DAN",
    "Hosea": "HOS",
    "Joel": "JOL",
    "Amos": "AMO",
    "Obadiah": "OBA",
    "Jonah": "JON",
    "Micah": "MIC",
    "Nahum": "NAM",
    "Habakkuk": "HAB",
    "Zephaniah": "ZEP",
    "Haggai": "HAG",
    "Zechariah": "ZEC",
    "Malachi": "MAL",
    "Matthew": "MAT",
    "Mark": "MRK",
    "Luke": "LUK",
    "John": "JHN",
    "Acts": "ACT",
    "Romans": "ROM",
    "1 Corinthians": "1CO",
    "2 Corinthians": "2CO",
    "Galatians": "GAL",
    "Ephesians": "EPH",
    "Philippians": "PHP",
    "Colossians": "COL",
    "1 Thessalonians": "1TH",
    "2 Thessalonians": "2TH",
    "1 Timothy": "1TI",
    "2 Timothy": "2TI",
    "Titus": "TIT",
    "Philemon": "PHM",
    "Hebrews": "HEB",
    "James": "JAS",
    "1 Peter": "1PE",
    "2 Peter": "2PE",
    "1 John": "1JN",
    "2 John": "2JN",
    "3 John": "3JN",
    "Jude": "JUD",
    "Revelation": "REV"
}
reverse_book_name_mapping = {v:k for k, v in book_name_mapping.items()}

In [26]:
macula_greek_df = pd.read_csv('../data/preprocessed-macula-dataframes/mg.csv', header=0)
print('greek columns', macula_greek_df.columns)

# The macula hebrew csv is split into three files
filenames = ['../data/preprocessed-macula-dataframes/final_part_0.csv', 
             '../data/preprocessed-macula-dataframes/final_part_1.csv', 
             '../data/preprocessed-macula-dataframes/final_part_2.csv']

# Use a list comprehension to load all the dataframes
df_list = [pd.read_csv(f, header=0) for f in filenames]

# Concatenate them together
macula_hebrew_df = pd.concat(df_list)

# rename hebrew 'english' as 'gloss'
macula_hebrew_df = macula_hebrew_df.rename(columns={'english': 'gloss'})
print('hebrew columns', macula_hebrew_df.columns)

# combined_df = pd.concat([macula_greek_df, macula_hebrew_df])

# # Replace 'missing' with None in 'after' or 'text' columns
macula_hebrew_df['after'] = macula_hebrew_df['after'].replace('missing', None)
macula_hebrew_df['text'] = macula_hebrew_df['text'].replace('missing', None)

def join_source_text(group):
    text_strs = group['text'].to_list()
    after_strs = [str(a) + ' ' if a is not None and str(a) != 'missing' else '' for a in group['after'].to_list()]
    combined = [None]*(len(text_strs)+len(after_strs))
    combined[::2] = text_strs
    combined[1::2] = after_strs
    return ''.join(filter(None, combined)).strip()

def join_source_gloss(group):
    text_strs = group['gloss'].to_list()
    after_strs = [str(a) + ' ' if a is not None and str(a) != 'missing' else ' ' for a in group['after'].to_list()]
    combined = [None]*(len(text_strs)+len(after_strs))
    combined[::2] = text_strs
    combined[1::2] = after_strs
    return ''.join(filter(None, combined)).strip()

def join_greek_lemmas(group):
    lemmas = group['lemma'].to_list()
    lemmas = [str(l) if l is not None and str(l) != 'nan' else '' for l in lemmas]
    return ' '.join(lemmas).strip()

def join_hebrew_lemmas(group):
    lemmas = group['unicodelemma'].to_list()
    lemmas = [str(l) if l is not None and str(l) != 'nan' else '' for l in lemmas]
    return ' '.join(lemmas).strip()

print('...creating vref format')

hebrew_vref = macula_hebrew_df.groupby('book_chapter_verse').apply(join_source_text).reset_index()
hebrew_vref.columns = ['vref', 'content']
# hebrew_vref['lemmas'] = macula_hebrew_df.groupby('book_chapter_verse').apply(join_hebrew_lemmas).values

greek_vref = macula_greek_df.groupby('book_chapter_verse').apply(join_source_text).reset_index()
greek_vref.columns = ['vref', 'content']
# greek_vref['lemmas'] = macula_greek_df.groupby('book_chapter_verse').apply(join_greek_lemmas).values

greek columns Index(['ref', 'role', 'class', 'type', 'gloss', 'text', 'after', 'lemma',
       'normalized', 'strong', 'morph', 'person', 'number', 'gender', 'case',
       'tense', 'voice', 'mood', 'degree', 'domain', 'ln', 'frame', 'subjref',
       'referent', 'id', 'book', 'chapter', 'verse', 'book_chapter',
       'book_chapter_verse', 'domain_label'],
      dtype='object')


FileNotFoundError: [Errno 2] No such file or directory: '../data/preprocessed-macula-dataframes/final_part_0.csv'

In [27]:
hebrew_vref.iloc[3]

vref                                               1CH 10:12
content    וַיָּקוּמוּ֮  כָּל־ אִ֣ישׁ  חַיִל֒  וַיִּשְׂא֞...
Name: 3, dtype: object

In [28]:
combined_greek_hebrew_vref = pd.concat([hebrew_vref, greek_vref])
# rename book_chapter_verse as 'usfm_vref'
# combined_greek_hebrew_vref = combined_greek_hebrew_vref.rename(columns={'vref': 'usfm_vref'})
# get book name from vref and expand (e.g., GEN 1:1 -> Genesis 1:1)
# combined_greek_hebrew_vref['vref'] = combined_greek_hebrew_vref['vref'].apply(lambda x: f"{reverse_book_name_mapping[x.split(' ')[0]]} {x.split(' ')[1]}")
combined_greek_hebrew_vref.head(-20)

Unnamed: 0,vref,content
0,1CH 10:1,וּפְלִשְׁתִּ֖ים נִלְחֲמ֣וּ בְיִשְׂרָאֵ֑ל וַ...
1,1CH 10:10,וַיָּשִׂ֨ימוּ֙ אֶת־ כֵּלָ֔יו בֵּ֖ית אֱלֹהֵי...
2,1CH 10:11,וַֽיִּשְׁמְע֔וּ כֹּ֖ל יָבֵ֣ישׁ גִּלְעָ֑ד א...
3,1CH 10:12,וַיָּקוּמוּ֮ כָּל־ אִ֣ישׁ חַיִל֒ וַיִּשְׂא֞...
4,1CH 10:13,וַיָּ֣מָת שָׁא֗וּל בְּמַֽעֲלוֹ֙ אֲשֶׁ֣ר מָ...
...,...,...
7918,TIT 2:14,ὃς ἔδωκεν ἑαυτὸν ὑπὲρ ἡμῶν ἵνα λυτρώσητα...
7919,TIT 2:15,Ταῦτα λάλει καὶ παρακάλει καὶ ἔλεγχε μετ...
7920,TIT 2:2,"πρεσβύτας νηφαλίους εἶναι, σεμνούς, σώφρονας..."
7921,TIT 2:3,πρεσβύτιδας ὡσαύτως ἐν καταστήματι ἱεροπρε...


In [29]:
# save combined_greek_hebrew_vref to file
combined_greek_hebrew_vref.to_csv('../data/combined_greek_hebrew_vref.csv', index=False)