# Tagging NMV with strongs numbers
Getting translations of farsi words to match with english words with strongs number tags

## Convert NMV JSON to pandas dataframe

In [1]:
import pandas as pd
from hazm import word_tokenize

def NMVToDF():
    import json
    nmv = json.load(open("inputs/NMV.json", encoding="utf-8"))
    for book in nmv["books"]:
        for idx_chapter, chapter in enumerate(nmv["books"][book]):
            for idx_verse, verse in enumerate(chapter):
                print([book,idx_chapter+1,idx_verse+1], end="\r", flush=True)
                for idx_word, word in enumerate(word_tokenize(verse)):
                    yield pd.DataFrame({"book":[book],"idx_chapter":[idx_chapter], "idx_verse":[idx_verse], "idx_word":[idx_word], "word":[word]})

nmv_df = pd.concat(NMVToDF())
nmv_df.head()

['Revelation of John', 22, 21]

Unnamed: 0,book,idx_chapter,idx_verse,idx_word,word
0,Genesis,0,0,0,در
0,Genesis,0,0,1,آغاز
0,Genesis,0,0,2,،
0,Genesis,0,0,3,خدا
0,Genesis,0,0,4,آسمانها


In [33]:
nmv_df.to_parquet("transformations/NMV_hazm.parquet")

## Convert English Bible JSON to pandas dataframe

In [3]:
import pandas as pd

def BibleJSONToDF(version: str):
    from tqdm import tqdm
    import json
    bbl = json.load(open(f"inputs/{version}.json", encoding="utf-8"))
    for book in tqdm(bbl["books"]):
        for idx_chapter, chapter in enumerate(bbl["books"][book]):
            for idx_verse, verse in enumerate(chapter):
                for idx_word, word in enumerate(verse):
                    yield pd.DataFrame(
                        {
                            "book":[book],
                            "idx_chapter":[idx_chapter], 
                            "idx_verse":[idx_verse], 
                            "idx_word":[idx_word], 
                            "eng_word":[word[0]], 
                            "strongs":[word[1]] if len(word)>1 else [None],
                            "morphology":[word[2]] if len(word)>2 else [None]
                        }
                    )

In [None]:
esv_df = pd.concat(BibleJSONToDF("ESV"))
esv_df.to_parquet("transformations/ESV.parquet")
esv_df.head()

In [4]:
kjv_df = pd.concat(BibleJSONToDF("KJV"))
kjv_df.to_parquet("transformations/KJV.parquet")
kjv_df.head()

100%|██████████| 66/66 [07:06<00:00,  6.47s/it]


Unnamed: 0,book,idx_chapter,idx_verse,idx_word,eng_word,strongs,morphology
0,Genesis,0,0,0,In the beginning,H7225,
0,Genesis,0,0,1,God,H430,
0,Genesis,0,0,2,created,H853 H1254,TH8804
0,Genesis,0,0,3,the heaven,H8064,
0,Genesis,0,0,4,and,H853,


In [5]:
kjv_df.tail()

Unnamed: 0,book,idx_chapter,idx_verse,idx_word,eng_word,strongs,morphology
0,Revelation of John,21,20,7,you,G5216,P-2GP
0,Revelation of John,21,20,8,all,G3956,A-GPM
0,Revelation of John,21,20,9,.,,
0,Revelation of John,21,20,10,Amen,G281,HEB
0,Revelation of John,21,20,11,.,,


## Implementing Google Cloud translations and synsets approach

In [2]:
import pandas as pd
import json
from nltk.corpus import wordnet
from tqdm import tqdm
import six
from google.cloud import translate_v2 as translate
from nltk.corpus import stopwords
en_stops = set(stopwords.words('english'))


def TranslateText(text):
    if isinstance(text, six.binary_type):
        text = text.decode("utf-8")
    result = translate_client.translate(text, target_language="en", source_language="fa")
    return [r["translatedText"] if r["translatedText"] else None for r in result]

def RemoveStopWords(phrase):
    if type(phrase) == str:
        phrase_split = phrase.split(" ")
        if len(phrase_split) > 1:
            processed_phrase = " ".join((wrd for wrd in phrase_split if wrd not in en_stops))
            return processed_phrase
        else:
            return phrase
    else:
        return phrase

def TranslateChapter(df, books):
    for book in books:
        book_filtered_df = df.loc[df.book==book].copy()
        for chapter_id in tqdm(set(book_filtered_df.idx_chapter), desc=book):
            chapter_filtered_df = book_filtered_df.loc[book_filtered_df.idx_chapter == chapter_id].copy()

            for verse_id in set(chapter_filtered_df.idx_verse):
                verse_filtered_df = chapter_filtered_df.loc[chapter_filtered_df.idx_verse == verse_id].copy()

                word_list = verse_filtered_df.word_no_punctuation.tolist()

                verse_filtered_df["translated_word"] = TranslateText(word_list) 
                verse_filtered_df["translated_word_no_stopwords"] = verse_filtered_df.translated_word.apply(RemoveStopWords)

                yield verse_filtered_df


In [66]:
nmv = json.load(open("inputs/NMV.json", encoding="utf-8"))
books = list(nmv["books"].keys())
nmv_df = pd.read_parquet("transformations/NMV.parquet")

# For translate api v3
# translate_client = translate.TranslationServiceClient.from_service_account_json(
#                 'rugged-truck-342720-4470f5b1c878.json'
#             )

# For translate API v2
translate_client = translate.Client.from_service_account_json(
    'rugged-truck-342720-4470f5b1c878.json'
)

In [67]:
books_df = pd.concat(TranslateChapter(nmv_df,books))
books_df.to_parquet(f"transformations/NMV_full.parquet")

Genesis: 100%|██████████| 50/50 [02:50<00:00,  3.41s/it]
Exodus: 100%|██████████| 40/40 [02:12<00:00,  3.32s/it]
Leviticus: 100%|██████████| 27/27 [01:34<00:00,  3.52s/it]
Numbers: 100%|██████████| 36/36 [02:24<00:00,  4.01s/it]
Deuteronomy: 100%|██████████| 34/34 [01:48<00:00,  3.19s/it]
Joshua: 100%|██████████| 24/24 [01:13<00:00,  3.04s/it]
Judges: 100%|██████████| 21/21 [01:09<00:00,  3.31s/it]
Ruth: 100%|██████████| 4/4 [00:09<00:00,  2.31s/it]
I Samuel: 100%|██████████| 31/31 [01:28<00:00,  2.86s/it]
II Samuel: 100%|██████████| 24/24 [01:17<00:00,  3.21s/it]
I Kings: 100%|██████████| 22/22 [01:28<00:00,  4.03s/it]
II Kings: 100%|██████████| 25/25 [01:20<00:00,  3.22s/it]
I Chronicles: 100%|██████████| 29/29 [01:42<00:00,  3.55s/it]
II Chronicles: 100%|██████████| 36/36 [01:30<00:00,  2.53s/it]
Ezra: 100%|██████████| 10/10 [00:30<00:00,  3.09s/it]
Nehemiah: 100%|██████████| 13/13 [00:44<00:00,  3.39s/it]
Esther: 100%|██████████| 10/10 [00:18<00:00,  1.86s/it]
Job: 100%|██████████|

### Stripping punctuation

In [None]:
nmv_df = pd.read_parquet("transformations/NMV.parquet")

farsi_chars = "آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی~۰۱۲۳۴۵۶۷۸۹"
farsi_chars = list(farsi_chars)
tqdm.pandas(desc="Stripping punctuation")
nmv_df.loc[:,"word_no_punctuation"] = nmv_df.word.progress_apply(lambda x:''.join((char for char in x if char in farsi_chars)))
nmv_df.head(n=10)

### Trying synsets similarity

In [1]:
import pandas as pd
from tqdm import tqdm
from nltk.corpus import wordnet

def GetSynsets(word):
    if word:
        return wordnet.synsets(word)

def GetEngSynsets(row):
    def CompareSynsets(eng_synsets):
        # print(eng_synsets)
        if row["farsi_synsets"] and eng_synsets:
            for t_synset in row["farsi_synsets"]:
                if t_synset:
                    for e_synset in eng_synsets:
                        if e_synset:
                            yield t_synset.wup_similarity(e_synset)
    # print(row)
    eng_verse = esv_df.loc[
        (esv_df.book == row["book"]) & 
        (esv_df.idx_chapter == row["idx_chapter"]) & 
        (esv_df.idx_verse == row["idx_verse"]) & 
        (esv_df.strongs.notna())]

    eng_verse["english_synsets"] = eng_verse.eng_word.apply(wordnet.synsets)

    eng_verse["max_similarity"] = eng_verse.english_synsets.apply(lambda x: max([val for val in CompareSynsets(x)] + [0]))
    # print(eng_verse)
    eng_verse = eng_verse[["idx_word", "eng_word", "strongs", "max_similarity"]].sort_values(by="max_similarity").tail(n=1)

    return eng_verse.values if eng_verse.max_similarity.squeeze()==1 else None

nmv_full = pd.read_parquet("transformations/NMV_full.parquet")

tqdm.pandas(desc="Farsi synsets")

nmv_full["farsi_synsets"] = nmv_full.translated_word_no_stopwords.progress_apply(GetSynsets)
esv_df = pd.read_parquet("transformations/ESV.parquet")


In [21]:
nmv_partial = nmv_full.head(n=100)
tqdm.pandas(desc="Match to english")
nmv_partial["eng_match"] = nmv_partial.progress_apply(GetEngSynsets, axis=1)

Match to english: 100%|██████████| 100/100 [00:24<00:00,  4.03it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nmv_partial["eng_match"] = nmv_partial.progress_apply(GetEngSynsets, axis=1)


Seems to have about a 35% match rate

In [23]:
nmv_partial.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 0
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   book                          100 non-null    object
 1   idx_chapter                   100 non-null    int64 
 2   idx_verse                     100 non-null    int64 
 3   idx_word                      100 non-null    int64 
 4   word                          100 non-null    object
 5   word_no_punctuation           100 non-null    object
 6   translated_word               100 non-null    object
 7   translated_word_no_stopwords  100 non-null    object
 8   farsi_synsets                 100 non-null    object
 9   eng_match                     35 non-null     object
dtypes: int64(3), object(7)
memory usage: 8.6+ KB


## Implementing word embeddings approach

In [2]:
import pandas as pd

nmv_df = pd.read_parquet("transformations/NMV_hazm.parquet")

In [6]:
from gensim.models import Word2Vec as w2v
from hazm import sent_tokenize, word_tokenize
import json

fa_bible_corpus = nmv_df.word.to_list()
fa_bible_sentences = sent_tokenize(" ".join(fa_bible_corpus)) # nmv_df.groupby(["book", "idx_chapter", "idx_verse"])["word"].agg(list).to_list()
fa_bible_sentence_tokens = [word_tokenize(s) for s in fa_bible_sentences]

singles = set([w for w in fa_bible_corpus if len(w)==1])
punctuation = set([s for s in singles if s not in ['آ', 'و', '\u200c', '\u200f']])
fa_bible_sentence_tokens_no_punctuation = []
for s in fa_bible_sentence_tokens:
    s_new = []
    for w in s:
        if w not in punctuation:
            split_w = w.split("_")
            if type(split_w) == list:
                s_new.extend(split_w)
            else:
                s_new.append(split_w)
    fa_bible_sentence_tokens_no_punctuation.append(s_new)
# fa_bible_sentence_tokens_no_punctuation = [[w for w in s if w not in punctuation] for s in fa_bible_sentence_tokens]
with open(f"transformations/NMV_sentences.json","w",encoding="utf8") as out_f:
    json.dump(fa_bible_sentence_tokens_no_punctuation, out_f, ensure_ascii=False)
# w2v_model = w2v(fa_bible_sentence_tokens_no_punctuation)


In [31]:
def get_vector_wrapper(s):
    try:
        v = w2v_model.wv.get_vector(s)
        return v
    except:
        return None


nmv_df["vector"] = nmv_df.word.apply(get_vector_wrapper)

In [8]:
nmv_hazm_df = pd.read_parquet("transformations/NMV_hazm.parquet")

Unnamed: 0,book,idx_chapter,idx_verse,idx_word,word,vector
0,Genesis,0,0,0,در,"[-0.7641278, -0.5622938, -0.9979115, -0.506839..."
0,Genesis,0,0,1,آغاز,"[-0.06881103, 0.12053305, 0.06698449, 0.401715..."
0,Genesis,0,0,2,،,
0,Genesis,0,0,3,خدا,"[1.5973173, 1.1801493, 0.50769883, -0.2529177,..."
0,Genesis,0,0,4,آسمانها,"[0.258966, 0.03336788, -0.028665742, -0.094442..."
...,...,...,...,...,...,...
0,Revelation of John,21,20,5,شما,"[1.3056972, 1.2187053, 1.3966638, -0.58368856,..."
0,Revelation of John,21,20,6,باد,"[0.4446129, 0.04062367, -0.15836331, 0.0167003..."
0,Revelation of John,21,20,7,.,
0,Revelation of John,21,20,8,آمین,"[0.31524122, 0.36675677, 0.35467657, -0.285576..."


In [38]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saaam\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [6]:
from nltk import sent_tokenize as eng_sent_tokenize
# from nltk import word_tokenize as eng_word_tokenize
from string import punctuation as eng_punctuation
import numpy as np
import pandas as pd
import json

esv_df = pd.read_parquet("transformations/ESV.parquet")
esv_df["strongs_punctuation"] = np.where(
    (esv_df.strongs.isna()) & (esv_df.eng_word.isin([a for a in eng_punctuation])), 
    esv_df.eng_word,
    esv_df.strongs
)
strongs_bible_corpus = esv_df.strongs_punctuation.dropna().to_list()
strongs_bible_sentences = eng_sent_tokenize(" _".join(strongs_bible_corpus)) # nmv_df.groupby(["book", "idx_chapter", "idx_verse"])["word"].agg(list).to_list()
strongs_bible_sentence_tokens = [[w.replace("_","") for w in s.split("_")] for s in strongs_bible_sentences]

strongs_bible_sentence_tokens_no_punctuation = [[w for w in s if w not in [a for a in eng_punctuation]] for s in strongs_bible_sentence_tokens]

with open(f"transformations/strongs_sentences.json","w") as out_f:
    json.dump(strongs_bible_sentence_tokens_no_punctuation, out_f)
# strongs_w2v_model = w2v(strongs_bible_sentence_tokens_no_punctuation)



NameError: name 'w2v' is not defined

In [43]:
def strongs_get_vector_wrapper(s):
    try:
        v = strongs_w2v_model.wv.get_vector(s)
        return v
    except:
        return None

esv_df["strongs_vector"] = esv_df.strongs.apply(strongs_get_vector_wrapper)
esv_df.head()

Unnamed: 0,book,idx_chapter,idx_verse,idx_word,eng_word,strongs,strongs_punctuation,strongs_vector
0,Genesis,0,0,0,In the,,,
0,Genesis,0,0,1,beginning,H7225,H7225,"[-0.047161054, -0.007426138, -0.018998928, 0.1..."
0,Genesis,0,0,2,",",,",",
0,Genesis,0,0,3,God,H430,H430,"[-0.9601267, 0.5998295, -0.28738803, 0.9247632..."
0,Genesis,0,0,4,created,H1254,H1254,"[-0.07384865, 0.014700174, -0.047103573, 0.193..."


use following to restrict similarity to words in verse as a key list
w2v_model.wv.most_similar_to_given()

need to implement https://arxiv.org/pdf/1309.4168.pdf approach to train linear relationship between strongs and farsi wordvec models


### Create training data from words matched using index.mjs and wupsimilarity.py

In [2]:
from gensim.models import Word2Vec as w2v
from hazm import sent_tokenize, word_tokenize
import json
import pandas as pd
from string import punctuation as eng_punctuation
from itertools import chain


def LoadTrainingPairs(book_name):
    with open(f"transformations/NMV_strongs_{book_name}.json", encoding="utf8") as f:
        nmv_strongs_dict = json.load(f)
    training_set = []
    for book in nmv_strongs_dict["books"]:
        for chapter in nmv_strongs_dict["books"][book]:
            for verse in chapter:
                for word in verse:
                    if len(word)>1:
                        if word[1] != None:
                            word[0] = "".join([a for a in word[0] if a not in list(punctuation)+[a for a in eng_punctuation]+["’"]])
                            training_set.append(tuple(word)[::-1])
    training_set = list(set(training_set))
    return training_set

nmv_df = pd.read_parquet("transformations/NMV_hazm.parquet")


fa_bible_corpus = nmv_df.word.to_list()
fa_bible_sentences = sent_tokenize(" ".join(fa_bible_corpus)) # nmv_df.groupby(["book", "idx_chapter", "idx_verse"])["word"].agg(list).to_list()
fa_bible_sentence_tokens = [word_tokenize(s) for s in fa_bible_sentences]
singles = set([w for w in fa_bible_corpus if len(w)==1])
punctuation = set([s for s in singles if s not in ['آ', 'و', '\u200c', '\u200f']])
train = list(
    set(
        chain(
            LoadTrainingPairs("Genesis"),
            LoadTrainingPairs("Psalms"),
            LoadTrainingPairs("Matthew"),
            LoadTrainingPairs("I Corinthians")
        )
    )
)

with open(f"transformations/training_pairs.json", mode="w", encoding="utf8") as f:
        json.dump(train, f, ensure_ascii=False)

### Build and train bilingual model

Use separate virtual environment because of dependency clash between hazm and transvec packages

In [1]:
import json
from gensim.models import Word2Vec as w2v
from transvec.transformers import TranslationWordVectorizer
import pandas as pd

with open("transformations/training_pairs.json", encoding="utf8") as f:
    train = json.load(f)
with open("transformations/NMV_sentences.json", encoding="utf8") as f:
    fa = json.load(f)
fa_model = w2v(fa, window=10, min_count=1)
with open("transformations/strongs_sentences.json", encoding="utf8") as f:
    strongs = json.load(f)
strongs_model = w2v(strongs, window = 10, min_count=1)

combined_model = TranslationWordVectorizer(strongs_model, fa_model).fit(train)

esv_df = pd.read_parquet("transformations/ESV.parquet").reset_index(drop=True)
nmv_df = pd.read_parquet("transformations/NMV_hazm.parquet").reset_index(drop=True)

def GetStrongsWordSimilarities(row):
    from numpy import dot
    from gensim import matutils
    import numpy as np
    print(f"{round((row/ nmv_df.shape[0])*100, 3)}% complete", end="\r", flush=True)
    word = nmv_df.iloc[row,4]
    
    if word in combined_model.sources[0]:
        word_vec = combined_model.get_vector(word)
        choices_df = esv_df.loc[
            (esv_df.book == nmv_df.iloc[row,0]) &
            (esv_df.idx_chapter == nmv_df.iloc[row,1]) &
            (esv_df.idx_verse == nmv_df.iloc[row,2]) &
            (esv_df.strongs.notna()),
            ["idx_word", "strongs"]
        ]
        choices = zip(choices_df.idx_word, choices_df.strongs)

        return [{"idx_word": choice[0],"strongs":choice[1], "similarity":dot(matutils.unitvec(word_vec), matutils.unitvec(combined_model.get_vector(choice[1])))} for choice in choices]
    else:
        return None

nmv_df["similarities"] = nmv_df.reset_index()["index"].apply(GetStrongsWordSimilarities)

In [49]:

nmv_df.shape

(772044, 6)

In [None]:
strongs_model.wv.most_similar_to_given()

In [15]:
combined_model.index_to_key

AttributeError: 'TranslationWordVectorizer' object has no attribute 'index_to_key'

In [None]:
import json
import requests
kjv = json.loads(requests.get("https://raw.githubusercontent.com/syncbible/syncbible/gh-pages/bibles/KJV.json").text)
with open("inputs/KJV.json", "w") as f:
    json.dump(kjv, f)