# Tagging NMV with strongs numbers
Getting translations of farsi words to match with english words with strongs number tags

## Convert NMV JSON to pandas dataframe

In [1]:
import pandas as pd
from hazm import word_tokenize

def NMVToDF():
    import json
    nmv = json.load(open("inputs/NMV.json", encoding="utf-8"))
    for book in nmv["books"]:
        for idx_chapter, chapter in enumerate(nmv["books"][book]):
            for idx_verse, verse in enumerate(chapter):
                print([book,idx_chapter+1,idx_verse+1], end="\r", flush=True)
                for idx_word, word in enumerate(word_tokenize(verse)):
                    yield pd.DataFrame({"book":[book],"idx_chapter":[idx_chapter], "idx_verse":[idx_verse], "idx_word":[idx_word], "word":[word]})

nmv_df = pd.concat(NMVToDF())
nmv_df.head()
nmv_df.to_parquet("transformations/NMV_hazm.parquet")

['Revelation of John', 22, 21]

Unnamed: 0,book,idx_chapter,idx_verse,idx_word,word
0,Genesis,0,0,0,در
0,Genesis,0,0,1,آغاز
0,Genesis,0,0,2,،
0,Genesis,0,0,3,خدا
0,Genesis,0,0,4,آسمانها


## Convert English Bible JSON to pandas dataframe

In [3]:
import pandas as pd

def BibleJSONToDF(version: str):
    from tqdm import tqdm
    import json
    bbl = json.load(open(f"inputs/{version}.json", encoding="utf-8"))
    for book in tqdm(bbl["books"]):
        for idx_chapter, chapter in enumerate(bbl["books"][book]):
            for idx_verse, verse in enumerate(chapter):
                for idx_word, word in enumerate(verse):
                    yield pd.DataFrame(
                        {
                            "book":[book],
                            "idx_chapter":[idx_chapter], 
                            "idx_verse":[idx_verse], 
                            "idx_word":[idx_word], 
                            "eng_word":[word[0]], 
                            "strongs":[word[1]] if len(word)>1 else [None],
                            "morphology":[word[2]] if len(word)>2 else [None]
                        }
                    )

In [None]:
esv_df = pd.concat(BibleJSONToDF("ESV"))
esv_df.to_parquet("transformations/ESV.parquet")
esv_df.head()

In [4]:
kjv_df = pd.concat(BibleJSONToDF("KJV"))
kjv_df.to_parquet("transformations/KJV.parquet")
kjv_df.head()

100%|██████████| 66/66 [07:06<00:00,  6.47s/it]


Unnamed: 0,book,idx_chapter,idx_verse,idx_word,eng_word,strongs,morphology
0,Genesis,0,0,0,In the beginning,H7225,
0,Genesis,0,0,1,God,H430,
0,Genesis,0,0,2,created,H853 H1254,TH8804
0,Genesis,0,0,3,the heaven,H8064,
0,Genesis,0,0,4,and,H853,


## Implementing Google Cloud translations and synsets approach

In [2]:
import pandas as pd
import json
from nltk.corpus import wordnet
from tqdm import tqdm
import six
from google.cloud import translate_v2 as translate
from nltk.corpus import stopwords

def GetSynsets(word):
    if word:
        return wordnet.synsets(word)

def GetEngSynsets(row):
    def CompareSynsets(eng_synsets):
        # print(eng_synsets)
        if row["farsi_synsets"] and eng_synsets:
            for t_synset in row["farsi_synsets"]:
                if t_synset:
                    for e_synset in eng_synsets:
                        if e_synset:
                            yield t_synset.wup_similarity(e_synset)
    # print(row)
    eng_verse = esv_df.loc[
        (esv_df.book == row["book"]) & 
        (esv_df.idx_chapter == row["idx_chapter"]) & 
        (esv_df.idx_verse == row["idx_verse"]) & 
        (esv_df.strongs.notna())]

    eng_verse["english_synsets"] = eng_verse.eng_word.apply(wordnet.synsets)

    eng_verse["max_similarity"] = eng_verse.english_synsets.apply(lambda x: max([val for val in CompareSynsets(x)] + [0]))
    # print(eng_verse)
    eng_verse = eng_verse[["idx_word", "eng_word", "strongs", "max_similarity"]].sort_values(by="max_similarity").tail(n=1)

    return eng_verse.values if eng_verse.max_similarity.squeeze()==1 else None

def TranslateText(text):
    if isinstance(text, six.binary_type):
        text = text.decode("utf-8")
    result = translate_client.translate(text, target_language="en", source_language="fa")
    return [r["translatedText"] if r["translatedText"] else None for r in result]

def RemoveStopWords(phrase):
    if type(phrase) == str:
        phrase_split = phrase.split(" ")
        if len(phrase_split) > 1:
            processed_phrase = " ".join((wrd for wrd in phrase_split if wrd not in en_stops))
            return processed_phrase
        else:
            return phrase
    else:
        return phrase

def TranslateChapter(df, books):
    for book in books:
        book_filtered_df = df.loc[df.book==book].copy()
        for chapter_id in tqdm(set(book_filtered_df.idx_chapter), desc=book):
            chapter_filtered_df = book_filtered_df.loc[book_filtered_df.idx_chapter == chapter_id].copy()

            for verse_id in set(chapter_filtered_df.idx_verse):
                verse_filtered_df = chapter_filtered_df.loc[chapter_filtered_df.idx_verse == verse_id].copy()

                word_list = verse_filtered_df.word_no_punctuation.tolist()

                verse_filtered_df["translated_word"] = TranslateText(word_list) 
                verse_filtered_df["translated_word_no_stopwords"] = verse_filtered_df.translated_word.apply(RemoveStopWords)

                yield verse_filtered_df

en_stops = set(stopwords.words('english'))

nmv = json.load(open("inputs/NMV.json", encoding="utf-8"))
books = list(nmv["books"].keys())
nmv_df = pd.read_parquet("transformations/NMV.parquet")

farsi_chars = "آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی~۰۱۲۳۴۵۶۷۸۹"
farsi_chars = list(farsi_chars)
tqdm.pandas(desc="Stripping punctuation")
nmv_df.loc[:,"word_no_punctuation"] = nmv_df.word.progress_apply(lambda x:''.join((char for char in x if char in farsi_chars)))

# For translate api v3
# translate_client = translate.TranslationServiceClient.from_service_account_json(
#                 'rugged-truck-342720-4470f5b1c878.json'
#             )

# For translate API v2
translate_client = translate.Client.from_service_account_json(
    'rugged-truck-342720-4470f5b1c878.json'
)
books_df = pd.concat(TranslateChapter(nmv_df,books))
books_df.to_parquet(f"transformations/NMV_full.parquet")


### Trying synsets/ wu-palmer similarity method

Seems to have about a 35% match rate based on results for Genesis.
Very slow. Forgot to time for benchmarking.

In [1]:

nmv_full = pd.read_parquet("transformations/NMV_full.parquet")

tqdm.pandas(desc="Farsi synsets")

# nmv_full["farsi_synsets"] = nmv_full.translated_word_no_stopwords.progress_apply(GetSynsets)
esv_df = pd.read_parquet("transformations/ESV.parquet")

nmv_partial = nmv_full.head(n=100)
tqdm.pandas(desc="Match to english")
nmv_partial["eng_match"] = nmv_partial.progress_apply(GetEngSynsets, axis=1)
nmv_partial.info()

## Implementing word embeddings approach

In [6]:
from gensim.models import Word2Vec as w2v
from hazm import sent_tokenize, word_tokenize
import json
import pandas as pd

nmv_df = pd.read_parquet("transformations/NMV_hazm.parquet")
fa_bible_corpus = nmv_df.word.to_list()
fa_bible_sentences = sent_tokenize(" ".join(fa_bible_corpus)) # nmv_df.groupby(["book", "idx_chapter", "idx_verse"])["word"].agg(list).to_list()
fa_bible_sentence_tokens = [word_tokenize(s) for s in fa_bible_sentences]

singles = set([w for w in fa_bible_corpus if len(w)==1])
punctuation = set([s for s in singles if s not in ['آ', 'و', '\u200c', '\u200f']])
fa_bible_sentence_tokens_no_punctuation = []
for s in fa_bible_sentence_tokens:
    s_new = []
    for w in s:
        if w not in punctuation:
            split_w = w.split("_")
            if type(split_w) == list:
                s_new.extend(split_w)
            else:
                s_new.append(split_w)
    fa_bible_sentence_tokens_no_punctuation.append(s_new)
# fa_bible_sentence_tokens_no_punctuation = [[w for w in s if w not in punctuation] for s in fa_bible_sentence_tokens]
with open(f"transformations/NMV_sentences.json","w",encoding="utf8") as out_f:
    json.dump(fa_bible_sentence_tokens_no_punctuation, out_f, ensure_ascii=False)
# w2v_model = w2v(fa_bible_sentence_tokens_no_punctuation)


In [1]:
from nltk import sent_tokenize as eng_sent_tokenize
# from nltk import word_tokenize as eng_word_tokenize
from string import punctuation as eng_punctuation
import numpy as np
import pandas as pd
import json

esv_df = pd.read_parquet("transformations/ESV.parquet")
esv_df["strongs_punctuation"] = np.where(
    (esv_df.strongs.isna()) & (esv_df.eng_word.isin([a for a in eng_punctuation])), 
    esv_df.eng_word,
    esv_df.strongs
)
strongs_bible_corpus = esv_df.strongs_punctuation.dropna().to_list()
strongs_bible_sentences = eng_sent_tokenize(" _".join(strongs_bible_corpus)) # nmv_df.groupby(["book", "idx_chapter", "idx_verse"])["word"].agg(list).to_list()
strongs_bible_sentence_tokens = [[w.replace("_","") for w in s.split("_")] for s in strongs_bible_sentences]

strongs_bible_sentence_tokens_no_punctuation = [[w for w in s if w not in [a for a in eng_punctuation]] for s in strongs_bible_sentence_tokens]

with open(f"transformations/strongs_sentences.json","w") as out_f:
    json.dump(strongs_bible_sentence_tokens_no_punctuation, out_f)


### Create training data from words matched using index.mjs and wupsimilarity.py

need to implement https://arxiv.org/pdf/1309.4168.pdf approach to train linear relationship between strongs and farsi wordvec models


In [2]:
from gensim.models import Word2Vec as w2v
from hazm import sent_tokenize, word_tokenize
import json
import pandas as pd
from string import punctuation as eng_punctuation
from itertools import chain


def LoadTrainingPairs(book_name):
    with open(f"transformations/NMV_strongs_{book_name}.json", encoding="utf8") as f:
        nmv_strongs_dict = json.load(f)
    training_set = []
    for book in nmv_strongs_dict["books"]:
        for chapter in nmv_strongs_dict["books"][book]:
            for verse in chapter:
                for word in verse:
                    if len(word)>1:
                        if word[1] != None:
                            word[0] = "".join([a for a in word[0] if a not in list(punctuation)+[a for a in eng_punctuation]+["’"]])
                            training_set.append(tuple(word)[::-1])
    training_set = list(set(training_set))
    return training_set

nmv_df = pd.read_parquet("transformations/NMV_hazm.parquet")

fa_bible_corpus = nmv_df.word.to_list()
fa_bible_sentences = sent_tokenize(" ".join(fa_bible_corpus)) # nmv_df.groupby(["book", "idx_chapter", "idx_verse"])["word"].agg(list).to_list()
fa_bible_sentence_tokens = [word_tokenize(s) for s in fa_bible_sentences]
singles = set([w for w in fa_bible_corpus if len(w)==1])
punctuation = set([s for s in singles if s not in ['آ', 'و', '\u200c', '\u200f']])
train = list(
    set(
        chain(
            LoadTrainingPairs("Genesis"),
            LoadTrainingPairs("Psalms"),
            LoadTrainingPairs("Matthew"),
            LoadTrainingPairs("I Corinthians")
        )
    )
)

with open(f"transformations/training_pairs.json", mode="w", encoding="utf8") as f:
        json.dump(train, f, ensure_ascii=False)

### Build and train bilingual model

Use separate virtual environment because of dependency clash between hazm and transvec packages

In [1]:
import json
from gensim.models import Word2Vec as w2v
from transvec.transformers import TranslationWordVectorizer
import pandas as pd

def GetStrongsWordSimilarities(row):
    from numpy import dot
    from gensim import matutils
    import numpy as np
    print(f"{round((row/ nmv_df.shape[0])*100, 3)}% complete", end="\r", flush=True)
    word = nmv_df.iloc[row,4]
    
    if word in combined_model.sources[0]:
        word_vec = combined_model.get_vector(word)
        choices_df = esv_df.loc[
            (esv_df.book == nmv_df.iloc[row,0]) &
            (esv_df.idx_chapter == nmv_df.iloc[row,1]) &
            (esv_df.idx_verse == nmv_df.iloc[row,2]) &
            (esv_df.strongs.notna()),
            ["idx_word", "strongs"]
        ]
        choices = zip(choices_df.idx_word, choices_df.strongs)

        return [{"idx_word": choice[0],"strongs":choice[1], "similarity":dot(matutils.unitvec(word_vec), matutils.unitvec(combined_model.get_vector(choice[1])))} for choice in choices]
    else:
        return None

with open("transformations/training_pairs.json", encoding="utf8") as f:
    train = json.load(f)
with open("transformations/NMV_sentences.json", encoding="utf8") as f:
    fa = json.load(f)
fa_model = w2v(fa, window=10, min_count=1)
with open("transformations/strongs_sentences.json", encoding="utf8") as f:
    strongs = json.load(f)
strongs_model = w2v(strongs, window = 10, min_count=1)

combined_model = TranslationWordVectorizer(strongs_model, fa_model).fit(train)

esv_df = pd.read_parquet("transformations/ESV.parquet").reset_index(drop=True)
nmv_df = pd.read_parquet("transformations/NMV_hazm.parquet").reset_index(drop=True)

test_df = nmv_df.head(n=20)
test_df["similarities"] = test_df.reset_index()["index"].apply(GetStrongsWordSimilarities)
test_df

0.002% complete

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["similarities"] = test_df.reset_index()["index"].apply(GetStrongsWordSimilarities)


Unnamed: 0,book,idx_chapter,idx_verse,idx_word,word,similarities
0,Genesis,0,0,0,در,"[{'idx_word': 1, 'strongs': 'H7225', 'similari..."
1,Genesis,0,0,1,آغاز,"[{'idx_word': 1, 'strongs': 'H7225', 'similari..."
2,Genesis,0,0,2,،,
3,Genesis,0,0,3,خدا,"[{'idx_word': 1, 'strongs': 'H7225', 'similari..."
4,Genesis,0,0,4,آسمانها,"[{'idx_word': 1, 'strongs': 'H7225', 'similari..."
5,Genesis,0,0,5,و,"[{'idx_word': 1, 'strongs': 'H7225', 'similari..."
6,Genesis,0,0,6,زمین,"[{'idx_word': 1, 'strongs': 'H7225', 'similari..."
7,Genesis,0,0,7,را,"[{'idx_word': 1, 'strongs': 'H7225', 'similari..."
8,Genesis,0,0,8,آفرید,"[{'idx_word': 1, 'strongs': 'H7225', 'similari..."
9,Genesis,0,0,9,.,


In [43]:
def MaxSimilarity(row):
    if row["similarities"] == None:
        return [None, None, None]
        
    else:
        similarities = pd.DataFrame(row["similarities"])
        return similarities.sort_values(by="similarity").tail(n=1).squeeze().to_list()

test_df[["eng_idx_word", "strongs", "similarity"]] = pd.DataFrame(test_df.apply(MaxSimilarity, axis=1).to_list())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[["eng_idx_word", "strongs", "similarity"]] = pd.DataFrame(test_df.apply(MaxSimilarity, axis=1).to_list())


In [44]:
test_df

Unnamed: 0,book,idx_chapter,idx_verse,idx_word,word,similarities,eng_idx_word,strongs,similarity
0,Genesis,0,0,0,در,"[{'idx_word': 1, 'strongs': 'H7225', 'similari...",1.0,H7225,0.058196
1,Genesis,0,0,1,آغاز,"[{'idx_word': 1, 'strongs': 'H7225', 'similari...",1.0,H7225,0.057581
2,Genesis,0,0,2,،,,,,
3,Genesis,0,0,3,خدا,"[{'idx_word': 1, 'strongs': 'H7225', 'similari...",3.0,H430,0.050702
4,Genesis,0,0,4,آسمانها,"[{'idx_word': 1, 'strongs': 'H7225', 'similari...",3.0,H430,0.057734
5,Genesis,0,0,5,و,"[{'idx_word': 1, 'strongs': 'H7225', 'similari...",8.0,H776,0.047344
6,Genesis,0,0,6,زمین,"[{'idx_word': 1, 'strongs': 'H7225', 'similari...",8.0,H776,0.056552
7,Genesis,0,0,7,را,"[{'idx_word': 1, 'strongs': 'H7225', 'similari...",3.0,H430,0.052457
8,Genesis,0,0,8,آفرید,"[{'idx_word': 1, 'strongs': 'H7225', 'similari...",1.0,H7225,0.06216
9,Genesis,0,0,9,.,,,,


How to get the most similar scores without duplicating strongs words?

Could use numpy.meshgrid to get all combinations of word indices for a verse, join to fa word and strongs word, calculate similarity for each combination.
Need to then discard all but the set of complete indices that totals the highest similarity score. Not sure how to do that yet...

In [8]:
import numpy as np
gen1_word_idx_fa = nmv_df.loc[(nmv_df.book == "Genesis") & (nmv_df.idx_chapter == 0) & (nmv_df.idx_verse == 0),"idx_word"].values

gen1_word_idx_en = esv_df.loc[(esv_df.book == "Genesis") & (esv_df.idx_chapter == 0) & (esv_df.idx_verse == 0),"idx_word"].values
np.array(np.meshgrid(gen1_word_idx_fa, gen1_word_idx_en)).T.reshape(-1,2)


array([[0, 0],
       [0, 1],
       [0, 2],
       [0, 3],
       [0, 4],
       [0, 5],
       [0, 6],
       [0, 7],
       [0, 8],
       [0, 9],
       [1, 0],
       [1, 1],
       [1, 2],
       [1, 3],
       [1, 4],
       [1, 5],
       [1, 6],
       [1, 7],
       [1, 8],
       [1, 9],
       [2, 0],
       [2, 1],
       [2, 2],
       [2, 3],
       [2, 4],
       [2, 5],
       [2, 6],
       [2, 7],
       [2, 8],
       [2, 9],
       [3, 0],
       [3, 1],
       [3, 2],
       [3, 3],
       [3, 4],
       [3, 5],
       [3, 6],
       [3, 7],
       [3, 8],
       [3, 9],
       [4, 0],
       [4, 1],
       [4, 2],
       [4, 3],
       [4, 4],
       [4, 5],
       [4, 6],
       [4, 7],
       [4, 8],
       [4, 9],
       [5, 0],
       [5, 1],
       [5, 2],
       [5, 3],
       [5, 4],
       [5, 5],
       [5, 6],
       [5, 7],
       [5, 8],
       [5, 9],
       [6, 0],
       [6, 1],
       [6, 2],
       [6, 3],
       [6, 4],
       [6, 5],
       [6,

# Downloading Bible JSON from github

In [None]:
import json
import requests
kjv = json.loads(requests.get("https://raw.githubusercontent.com/syncbible/syncbible/gh-pages/bibles/KJV.json").text)
with open("inputs/KJV.json", "w") as f:
    json.dump(kjv, f)