# Tagging NMV with strongs numbers
Getting translations of farsi words to match with english words with strongs number tags

## Convert NMV JSON to pandas dataframe

In [7]:
import pandas as pd
from hazm import word_tokenize

def NMVToDF():
    import json
    nmv = json.load(open("inputs/NMV.json", encoding="utf-8"))
    for book in nmv["books"]:
        for idx_chapter, chapter in enumerate(nmv["books"][book]):
            for idx_verse, verse in enumerate(chapter):
                print([book,idx_chapter+1,idx_verse+1], end="\r", flush=True)
                for idx_word, word in enumerate(word_tokenize(verse)):
                    yield pd.DataFrame({"book":[book],"idx_chapter":[idx_chapter], "idx_verse":[idx_verse], "idx_word":[idx_word], "word":[word]})

nmv_df = pd.concat(NMVToDF()).sort_values(by=["book", "idx_chapter", "idx_verse", "idx_word"])
nmv_df.to_parquet("transformations/NMV_hazm.parquet")
nmv_df.head()

['Revelation of John', 22, 21]

Unnamed: 0,book,idx_chapter,idx_verse,idx_word,word
0,Acts,0,0,0,من
0,Acts,0,0,1,کتاب
0,Acts,0,0,2,نخست
0,Acts,0,0,3,خود
0,Acts,0,0,4,را


## Convert English Bible JSON to pandas dataframe

In [3]:
import pandas as pd

def BibleJSONToDF(version: str):
    from tqdm import tqdm
    import json
    bbl = json.load(open(f"inputs/{version}.json", encoding="utf-8"))
    for book in tqdm(bbl["books"]):
        for idx_chapter, chapter in enumerate(bbl["books"][book]):
            for idx_verse, verse in enumerate(chapter):
                for idx_word, word in enumerate(verse):
                    yield pd.DataFrame(
                        {
                            "book":[book],
                            "idx_chapter":[idx_chapter], 
                            "idx_verse":[idx_verse], 
                            "idx_word":[idx_word], 
                            "eng_word":[word[0]], 
                            "strongs":[word[1]] if len(word)>1 else [None],
                            "morphology":[word[2]] if len(word)>2 else [None]
                        }
                    )

In [None]:
esv_df = pd.concat(BibleJSONToDF("ESV"))
esv_df.to_parquet("transformations/ESV.parquet")
esv_df.head()

In [8]:
# books are out of order and originals have no sentences... 
original_df = pd.concat(BibleJSONToDF("original")).sort_values(by=["book", "idx_chapter", "idx_verse", "idx_word"])
original_df.to_parquet("transformations/original.parquet")
original_df.head()

100%|██████████| 66/66 [03:53<00:00,  3.54s/it]


Unnamed: 0,book,idx_chapter,idx_verse,idx_word,eng_word,strongs,morphology
0,Acts,0,0,0,τον,G3588,T-ASM
0,Acts,0,0,1,μεν,G3303,PRT
0,Acts,0,0,2,πρωτον,G4413,A-ASM-S
0,Acts,0,0,3,λογον,G3056,N-ASM
0,Acts,0,0,4,εποιησαμην,G4160,V-AMI-1S


In [None]:
kjv_df = pd.concat(BibleJSONToDF("KJV")).sort_values(by=["book", "idx_chapter", "idx_verse", "idx_word"])
kjv_df.to_parquet("transformations/KJV.parquet")
kjv_df.head()

100%|██████████| 66/66 [07:06<00:00,  6.47s/it]


Unnamed: 0,book,idx_chapter,idx_verse,idx_word,eng_word,strongs,morphology
0,Genesis,0,0,0,In the beginning,H7225,
0,Genesis,0,0,1,God,H430,
0,Genesis,0,0,2,created,H853 H1254,TH8804
0,Genesis,0,0,3,the heaven,H8064,
0,Genesis,0,0,4,and,H853,


## Implementing Google Cloud translations and synsets approach

In [2]:
import pandas as pd
import json
from nltk.corpus import wordnet
from tqdm import tqdm
import six
from google.cloud import translate_v2 as translate
from nltk.corpus import stopwords

def GetSynsets(word):
    if word:
        return wordnet.synsets(word)

def GetEngSynsets(row):
    def CompareSynsets(eng_synsets):
        # print(eng_synsets)
        if row["farsi_synsets"] and eng_synsets:
            for t_synset in row["farsi_synsets"]:
                if t_synset:
                    for e_synset in eng_synsets:
                        if e_synset:
                            yield t_synset.wup_similarity(e_synset)
    # print(row)
    eng_verse = esv_df.loc[
        (esv_df.book == row["book"]) & 
        (esv_df.idx_chapter == row["idx_chapter"]) & 
        (esv_df.idx_verse == row["idx_verse"]) & 
        (esv_df.strongs.notna())]

    eng_verse["english_synsets"] = eng_verse.eng_word.apply(wordnet.synsets)

    eng_verse["max_similarity"] = eng_verse.english_synsets.apply(lambda x: max([val for val in CompareSynsets(x)] + [0]))
    # print(eng_verse)
    eng_verse = eng_verse[["idx_word", "eng_word", "strongs", "max_similarity"]].sort_values(by="max_similarity").tail(n=1)

    return eng_verse.values if eng_verse.max_similarity.squeeze()==1 else None

def TranslateText(text):
    if isinstance(text, six.binary_type):
        text = text.decode("utf-8")
    result = translate_client.translate(text, target_language="en", source_language="fa")
    return [r["translatedText"] if r["translatedText"] else None for r in result]

def RemoveStopWords(phrase):
    if type(phrase) == str:
        phrase_split = phrase.split(" ")
        if len(phrase_split) > 1:
            processed_phrase = " ".join((wrd for wrd in phrase_split if wrd not in en_stops))
            return processed_phrase
        else:
            return phrase
    else:
        return phrase

def TranslateChapter(df, books):
    for book in books:
        book_filtered_df = df.loc[df.book==book].copy()
        for chapter_id in tqdm(set(book_filtered_df.idx_chapter), desc=book):
            chapter_filtered_df = book_filtered_df.loc[book_filtered_df.idx_chapter == chapter_id].copy()

            for verse_id in set(chapter_filtered_df.idx_verse):
                verse_filtered_df = chapter_filtered_df.loc[chapter_filtered_df.idx_verse == verse_id].copy()

                word_list = verse_filtered_df.word_no_punctuation.tolist()

                verse_filtered_df["translated_word"] = TranslateText(word_list) 
                verse_filtered_df["translated_word_no_stopwords"] = verse_filtered_df.translated_word.apply(RemoveStopWords)

                yield verse_filtered_df

en_stops = set(stopwords.words('english'))

nmv = json.load(open("inputs/NMV.json", encoding="utf-8"))
books = list(nmv["books"].keys())
nmv_df = pd.read_parquet("transformations/NMV.parquet")

farsi_chars = "آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی~۰۱۲۳۴۵۶۷۸۹"
farsi_chars = list(farsi_chars)
tqdm.pandas(desc="Stripping punctuation")
nmv_df.loc[:,"word_no_punctuation"] = nmv_df.word.progress_apply(lambda x:''.join((char for char in x if char in farsi_chars)))

# For translate api v3
# translate_client = translate.TranslationServiceClient.from_service_account_json(
#                 'rugged-truck-342720-4470f5b1c878.json'
#             )

# For translate API v2
translate_client = translate.Client.from_service_account_json(
    'rugged-truck-342720-4470f5b1c878.json'
)
books_df = pd.concat(TranslateChapter(nmv_df,books))
books_df.to_parquet(f"transformations/NMV_full.parquet")


### Trying synsets/ wu-palmer similarity method

Seems to have about a 35% match rate based on results for Genesis.
Very slow. Forgot to time for benchmarking.

In [1]:

nmv_full = pd.read_parquet("transformations/NMV_full.parquet")

tqdm.pandas(desc="Farsi synsets")

# nmv_full["farsi_synsets"] = nmv_full.translated_word_no_stopwords.progress_apply(GetSynsets)
esv_df = pd.read_parquet("transformations/ESV.parquet")

nmv_partial = nmv_full.head(n=100)
tqdm.pandas(desc="Match to english")
nmv_partial["eng_match"] = nmv_partial.progress_apply(GetEngSynsets, axis=1)
nmv_partial.info()

## Implementing word embeddings approach

In [6]:
from gensim.models import Word2Vec as w2v
from hazm import sent_tokenize, word_tokenize
import json
import pandas as pd

# TODO: find workaround for lack of sentences in original text json
# Will it work to tokenize on words only and build model on one big sentence?
# Can text be chunked into arbitrary sentence sizes without affecting model accuracy?

nmv_df = pd.read_parquet("transformations/NMV_hazm.parquet")
fa_bible_corpus = nmv_df.word.to_list()
fa_bible_sentences = sent_tokenize(" ".join(fa_bible_corpus)) # nmv_df.groupby(["book", "idx_chapter", "idx_verse"])["word"].agg(list).to_list()
fa_bible_sentence_tokens = [word_tokenize(s) for s in fa_bible_sentences]

singles = set([w for w in fa_bible_corpus if len(w)==1])
punctuation = set([s for s in singles if s not in ['آ', 'و', '\u200c', '\u200f']])
fa_bible_sentence_tokens_no_punctuation = []
for s in fa_bible_sentence_tokens:
    s_new = []
    for w in s:
        if w not in punctuation:
            split_w = w.split("_")
            if type(split_w) == list:
                s_new.extend(split_w)
            else:
                s_new.append(split_w)
    fa_bible_sentence_tokens_no_punctuation.append(s_new)
# fa_bible_sentence_tokens_no_punctuation = [[w for w in s if w not in punctuation] for s in fa_bible_sentence_tokens]
with open(f"transformations/NMV_sentences.json","w",encoding="utf8") as out_f:
    json.dump(fa_bible_sentence_tokens_no_punctuation, out_f, ensure_ascii=False)
# w2v_model = w2v(fa_bible_sentence_tokens_no_punctuation)


In [5]:
from nltk import sent_tokenize as eng_sent_tokenize
# from nltk import word_tokenize as eng_word_tokenize
from string import punctuation as eng_punctuation
import numpy as np
import pandas as pd
import json
import re

def ParquetToStrongsSentences(eng_version: str):
    eng_df = pd.read_parquet(f"transformations/{eng_version}.parquet")
    eng_df["strongs_punctuation"] = np.where(
        (eng_df.strongs.isna()) & (eng_df.eng_word.isin([a for a in eng_punctuation])), 
        eng_df.eng_word,
        eng_df.strongs
    )
    strongs_bible_corpus = eng_df.strongs_punctuation.dropna().to_list()
    strongs_bible_sentences = eng_sent_tokenize(" ".join([w.replace(" ", "_") for w in strongs_bible_corpus])) # nmv_df.groupby(["book", "idx_chapter", "idx_verse"])["word"].agg(list).to_list()
    strongs_bible_sentence_tokens = [[w.replace("_"," ") for w in re.split(" /",s)] for s in strongs_bible_sentences]

    strongs_bible_sentence_tokens_no_punctuation = [[w for w in s if w not in [a for a in eng_punctuation] + ["added"]] for s in strongs_bible_sentence_tokens]

    with open(f"transformations/{eng_version}_strongs_sentences.json","w") as out_f:
        json.dump(strongs_bible_sentence_tokens_no_punctuation, out_f)

ParquetToStrongsSentences("original")


### Create training data from words matched using index.mjs and wupsimilarity.py

implements https://arxiv.org/pdf/1309.4168.pdf approach to train linear relationship between strongs and farsi wordvec models using transvec module

consider merging training pairs created from multiple english versions for better coverage. Drop pairs with multiple strongs numbers that were derived from ESV?


In [1]:
from gensim.models import Word2Vec as w2v
from hazm import sent_tokenize, word_tokenize
import json
import pandas as pd
from string import punctuation as eng_punctuation
from itertools import chain


def LoadTrainingPairs(eng_version: str, book_name: str):
    with open(f"transformations/NMV_{eng_version}_strongs_{book_name}.json", encoding="utf8") as f:
        nmv_strongs_dict = json.load(f)
    training_set = []
    for book in nmv_strongs_dict["books"]:
        for chapter in nmv_strongs_dict["books"][book]:
            for verse in chapter:
                for word in verse:
                    if len(word)>1:
                        if word[1] not in [None, "added"]:
                            word[0] = "".join([a for a in word[0] if a not in list(punctuation)+[a for a in eng_punctuation]+["’"]])
                            training_set.append(tuple(word)[::-1])
    training_set = list(set(training_set))
    return training_set

nmv_df = pd.read_parquet("transformations/NMV_hazm.parquet")
eng_version = "KJV"

fa_bible_corpus = nmv_df.word.to_list()
fa_bible_sentences = sent_tokenize(" ".join(fa_bible_corpus)) # nmv_df.groupby(["book", "idx_chapter", "idx_verse"])["word"].agg(list).to_list()
fa_bible_sentence_tokens = [word_tokenize(s) for s in fa_bible_sentences]
singles = set([w for w in fa_bible_corpus if len(w)==1])
punctuation = set([s for s in singles if s not in ['آ', 'و', '\u200c', '\u200f']])
train = list(
    set(
        chain(
            LoadTrainingPairs(eng_version, "Genesis"),
            LoadTrainingPairs(eng_version, "Psalms"),
            LoadTrainingPairs(eng_version, "Habakkuk"),
            LoadTrainingPairs(eng_version, "Matthew"),
            LoadTrainingPairs(eng_version, "I Corinthians")
        )
    )
)

with open(f"transformations/{eng_version}_training_pairs.json", mode="w", encoding="utf8") as f:
        json.dump(train, f, ensure_ascii=False)

### Build and train bilingual model

Use separate virtual environment because of dependency clash between hazm and transvec packages

In [2]:
import json
from gensim.models import Word2Vec as w2v
from transvec.transformers import TranslationWordVectorizer
import pandas as pd

def GetStrongsWordSimilarities(row: pd.Series, eng_df: pd.DataFrame, combined_model: TranslationWordVectorizer):
    from numpy import dot
    from gensim import matutils
    import numpy as np
    print(f"{round((row/ nmv_df.shape[0])*100, 3)}% complete", end="\r", flush=True)
    word = nmv_df.iloc[row,4]
    
    if word in combined_model.sources[0]:
        word_vec = combined_model.get_vector(word)
        choices_df = eng_df.loc[
            (eng_df.book == nmv_df.iloc[row,0]) &
            (eng_df.idx_chapter == nmv_df.iloc[row,1]) &
            (eng_df.idx_verse == nmv_df.iloc[row,2]) &
            (eng_df.strongs.notna()) &
            (eng_df.strongs != "added"),
            ["idx_word", "strongs"]
        ]
        choices = zip(choices_df.idx_word, choices_df.strongs)

        return [{"idx_word": choice[0],"strongs":choice[1], "similarity":dot(matutils.unitvec(word_vec), matutils.unitvec(combined_model.get_vector(choice[1])))} for choice in choices]
    else:
        return None

def BuildBilingualModel(eng_version: str):
    with open(f"transformations/{eng_version}_training_pairs.json", encoding="utf8") as f:
        train = json.load(f)
    with open("transformations/NMV_sentences.json", encoding="utf8") as f:
        fa = json.load(f)
    fa_model = w2v(fa, window=10, min_count=1)
    with open(f"transformations/{eng_version}_strongs_sentences.json", encoding="utf8") as f:
        strongs = json.load(f)
    strongs_model = w2v(strongs, window = 10, min_count=1)

    combined_model = TranslationWordVectorizer(strongs_model, fa_model).fit(train)

    return combined_model

# esv_df = pd.read_parquet("transformations/ESV.parquet").reset_index(drop=True)
kjv_df = pd.read_parquet("transformations/KJV.parquet").reset_index(drop=True)
nmv_df = pd.read_parquet("transformations/NMV_hazm.parquet").reset_index(drop=True)
kjv_combined_model = BuildBilingualModel("KJV")

kjv_test_df = nmv_df.head(n=20)
kjv_test_df["similarities"] = kjv_test_df.reset_index()["index"].apply(GetStrongsWordSimilarities,args=(kjv_df, kjv_combined_model))
kjv_test_df

0.002% complete

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kjv_test_df["similarities"] = kjv_test_df.reset_index()["index"].apply(GetStrongsWordSimilarities,args=(kjv_df, kjv_combined_model))


Unnamed: 0,book,idx_chapter,idx_verse,idx_word,word,similarities
0,Genesis,0,0,0,در,"[{'idx_word': 0, 'strongs': 'H7225', 'similari..."
1,Genesis,0,0,1,آغاز,"[{'idx_word': 0, 'strongs': 'H7225', 'similari..."
2,Genesis,0,0,2,،,
3,Genesis,0,0,3,خدا,"[{'idx_word': 0, 'strongs': 'H7225', 'similari..."
4,Genesis,0,0,4,آسمانها,"[{'idx_word': 0, 'strongs': 'H7225', 'similari..."
5,Genesis,0,0,5,و,"[{'idx_word': 0, 'strongs': 'H7225', 'similari..."
6,Genesis,0,0,6,زمین,"[{'idx_word': 0, 'strongs': 'H7225', 'similari..."
7,Genesis,0,0,7,را,"[{'idx_word': 0, 'strongs': 'H7225', 'similari..."
8,Genesis,0,0,8,آفرید,"[{'idx_word': 0, 'strongs': 'H7225', 'similari..."
9,Genesis,0,0,9,.,


In [3]:
def MaxSimilarity(row):
    if row["similarities"] == None:
        return [None, None, None]
        
    else:
        similarities = pd.DataFrame(row["similarities"])
        return similarities.sort_values(by="similarity").tail(n=1).squeeze().to_list()

kjv_test_df[["eng_idx_word", "strongs", "similarity"]] = pd.DataFrame(kjv_test_df.apply(MaxSimilarity, axis=1).to_list())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kjv_test_df[["eng_idx_word", "strongs", "similarity"]] = pd.DataFrame(kjv_test_df.apply(MaxSimilarity, axis=1).to_list())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kjv_test_df[["eng_idx_word", "strongs", "similarity"]] = pd.DataFrame(kjv_test_df.apply(MaxSimilarity, axis=1).to_list())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

In [4]:
kjv_test_df

Unnamed: 0,book,idx_chapter,idx_verse,idx_word,word,similarities,eng_idx_word,strongs,similarity
0,Genesis,0,0,0,در,"[{'idx_word': 0, 'strongs': 'H7225', 'similari...",4.0,H853,0.092953
1,Genesis,0,0,1,آغاز,"[{'idx_word': 0, 'strongs': 'H7225', 'similari...",0.0,H7225,0.061042
2,Genesis,0,0,2,،,,,,
3,Genesis,0,0,3,خدا,"[{'idx_word': 0, 'strongs': 'H7225', 'similari...",1.0,H430,0.050172
4,Genesis,0,0,4,آسمانها,"[{'idx_word': 0, 'strongs': 'H7225', 'similari...",1.0,H430,0.062915
5,Genesis,0,0,5,و,"[{'idx_word': 0, 'strongs': 'H7225', 'similari...",4.0,H853,0.067851
6,Genesis,0,0,6,زمین,"[{'idx_word': 0, 'strongs': 'H7225', 'similari...",5.0,H776,0.061576
7,Genesis,0,0,7,را,"[{'idx_word': 0, 'strongs': 'H7225', 'similari...",5.0,H776,0.058538
8,Genesis,0,0,8,آفرید,"[{'idx_word': 0, 'strongs': 'H7225', 'similari...",0.0,H7225,0.069265
9,Genesis,0,0,9,.,,,,


How to get the most similar scores without duplicating strongs words?

Could use numpy.meshgrid to get all combinations of word indices for a verse, join to fa word and strongs word, calculate similarity for each combination.
Need to then discard all but the set of complete indices that totals the highest similarity score. Not sure how to do that yet...

In [8]:
import numpy as np
gen1_word_idx_fa = nmv_df.loc[(nmv_df.book == "Genesis") & (nmv_df.idx_chapter == 0) & (nmv_df.idx_verse == 0),"idx_word"].values

gen1_word_idx_en = esv_df.loc[(esv_df.book == "Genesis") & (esv_df.idx_chapter == 0) & (esv_df.idx_verse == 0),"idx_word"].values
np.array(np.meshgrid(gen1_word_idx_fa, gen1_word_idx_en)).T.reshape(-1,2)


array([[0, 0],
       [0, 1],
       [0, 2],
       [0, 3],
       [0, 4],
       [0, 5],
       [0, 6],
       [0, 7],
       [0, 8],
       [0, 9],
       [1, 0],
       [1, 1],
       [1, 2],
       [1, 3],
       [1, 4],
       [1, 5],
       [1, 6],
       [1, 7],
       [1, 8],
       [1, 9],
       [2, 0],
       [2, 1],
       [2, 2],
       [2, 3],
       [2, 4],
       [2, 5],
       [2, 6],
       [2, 7],
       [2, 8],
       [2, 9],
       [3, 0],
       [3, 1],
       [3, 2],
       [3, 3],
       [3, 4],
       [3, 5],
       [3, 6],
       [3, 7],
       [3, 8],
       [3, 9],
       [4, 0],
       [4, 1],
       [4, 2],
       [4, 3],
       [4, 4],
       [4, 5],
       [4, 6],
       [4, 7],
       [4, 8],
       [4, 9],
       [5, 0],
       [5, 1],
       [5, 2],
       [5, 3],
       [5, 4],
       [5, 5],
       [5, 6],
       [5, 7],
       [5, 8],
       [5, 9],
       [6, 0],
       [6, 1],
       [6, 2],
       [6, 3],
       [6, 4],
       [6, 5],
       [6,

# Downloading Bible JSON from github

In [7]:
import json
import requests
bible = requests.get("https://raw.githubusercontent.com/syncbible/syncbible/gh-pages/bibles/original.json").json()


In [11]:
bible["books"]["Genesis"][0]

[[['ב/ראשית', 'Hb/H7225', 'HR/Ncfsa'],
  ['ברא', 'H1254', 'HVqp3ms'],
  ['אלהים', 'H430', 'HNcmpa'],
  ['את', 'H853', 'HTo'],
  ['ה/שמים', 'Hd/H8064', 'HTd/Ncmpa'],
  ['ו/את', 'Hc/H853', 'HC/To'],
  ['ה/ארץ', 'Hd/H776', 'HTd/Ncbsa']],
 [['ו/ה/ארץ', 'Hc/Hd/H776', 'HC/Td/Ncbsa'],
  ['היתה', 'H1961', 'HVqp3fs'],
  ['תהו', 'H8414', 'HNcmsa'],
  ['ו/בהו', 'Hc/H922', 'HC/Ncmsa'],
  ['ו/חשך', 'Hc/H2822', 'HC/Ncmsa'],
  ['על', 'H5921', 'HR'],
  ['פני', 'H6440', 'HNcbpc'],
  ['תהום', 'H8415', 'HNcbsa'],
  ['ו/רוח', 'Hc/H7307', 'HC/Ncbsc'],
  ['אלהים', 'H430', 'HNcmpa'],
  ['מרחפת', 'H7363', 'HVprfsa'],
  ['על', 'H5921', 'HR'],
  ['פני', 'H6440', 'HNcbpc'],
  ['ה/מים', 'Hd/H4325', 'HTd/Ncmpa']],
 [['ו/יאמר', 'Hc/H559', 'HC/Vqw3ms'],
  ['אלהים', 'H430', 'HNcmpa'],
  ['יהי', 'H1961', 'HVqj3ms'],
  ['אור', 'H216', 'HNcbsa'],
  ['ו/יהי', 'Hc/H1961', 'HC/Vqw3ms'],
  ['אור', 'H216', 'HNcbsa']],
 [['ו/ירא', 'Hc/H7200', 'HC/Vqw3ms'],
  ['אלהים', 'H430', 'HNcmpa'],
  ['את', 'H853', 'HTo'],
  ['ה/אור', 'H

In [12]:
with open("inputs/original.json", "w") as f:
    json.dump(bible, f)