# Tagging NMV with strongs numbers
Getting translations of farsi words to match with english words with strongs number tags

## Convert NMV JSON to pandas dataframe

In [1]:
import pandas as pd
from hazm import word_tokenize

def NMVToDF():
    import json
    nmv = json.load(open("NMV.json", encoding="utf-8"))
    for book in nmv["books"]:
        for idx_chapter, chapter in enumerate(nmv["books"][book]):
            for idx_verse, verse in enumerate(chapter):
                print([book,idx_chapter+1,idx_verse+1], end="\r", flush=True)
                for idx_word, word in enumerate(word_tokenize(verse)):
                    yield pd.DataFrame({"book":[book],"idx_chapter":[idx_chapter], "idx_verse":[idx_verse], "idx_word":[idx_word], "word":[word]})

nmv_df = pd.concat(NMVToDF())
nmv_df.head()

['Revelation of John', 22, 21]

Unnamed: 0,book,idx_chapter,idx_verse,idx_word,word
0,Genesis,0,0,0,در
0,Genesis,0,0,1,آغاز
0,Genesis,0,0,2,،
0,Genesis,0,0,3,خدا
0,Genesis,0,0,4,آسمانها


## Convert ESV JSON to pandas dataframe

In [34]:
import pandas as pd

def ESVToDF():
    import json
    esv = json.load(open("ESV.json", encoding="utf-8"))
    for book in tqdm(esv["books"]):
        for idx_chapter, chapter in enumerate(esv["books"][book]):
            for idx_verse, verse in enumerate(chapter):
                for idx_word, word in enumerate(verse):
                    yield pd.DataFrame(
                        {
                            "book":[book],
                            "idx_chapter":[idx_chapter], 
                            "idx_verse":[idx_verse], 
                            "idx_word":[idx_word], 
                            "eng_word":[word[0]], 
                            "strongs":[word[1]] if len(word)>1 else [None]
                        }
                    )

esv_df = pd.concat(ESVToDF())
esv_df.head()

100%|██████████| 66/66 [05:08<00:00,  4.67s/it]


Unnamed: 0,book,idx_chapter,idx_verse,idx_word,eng_word,strongs
0,Genesis,0,0,0,In the,
0,Genesis,0,0,1,beginning,H7225
0,Genesis,0,0,2,",",
0,Genesis,0,0,3,God,H430
0,Genesis,0,0,4,created,H1254
...,...,...,...,...,...,...
0,Revelation of John,21,20,6,with,G3326
0,Revelation of John,21,20,7,all,G3956
0,Revelation of John,21,20,8,.,
0,Revelation of John,21,20,9,Amen,G281


In [44]:
esv_df.to_parquet("ESV_parquet")

## Translations and synsets

In [2]:
import pandas as pd
import json
from nltk.corpus import wordnet
from tqdm import tqdm
import six
from google.cloud import translate_v2 as translate
from nltk.corpus import stopwords
en_stops = set(stopwords.words('english'))


def TranslateText(text):
    if isinstance(text, six.binary_type):
        text = text.decode("utf-8")
    result = translate_client.translate(text, target_language="en", source_language="fa")
    return [r["translatedText"] if r["translatedText"] else None for r in result]

def RemoveStopWords(phrase):
    if type(phrase) == str:
        phrase_split = phrase.split(" ")
        if len(phrase_split) > 1:
            processed_phrase = " ".join((wrd for wrd in phrase_split if wrd not in en_stops))
            return processed_phrase
        else:
            return phrase
    else:
        return phrase

def TranslateChapter(df, books):
    for book in books:
        book_filtered_df = df.loc[df.book==book].copy()
        for chapter_id in tqdm(set(book_filtered_df.idx_chapter), desc=book):
            chapter_filtered_df = book_filtered_df.loc[book_filtered_df.idx_chapter == chapter_id].copy()

            for verse_id in set(chapter_filtered_df.idx_verse):
                verse_filtered_df = chapter_filtered_df.loc[chapter_filtered_df.idx_verse == verse_id].copy()

                word_list = verse_filtered_df.word_no_punctuation.tolist()

                verse_filtered_df["translated_word"] = TranslateText(word_list) 
                verse_filtered_df["translated_word_no_stopwords"] = verse_filtered_df.translated_word.apply(RemoveStopWords)

                yield verse_filtered_df


In [66]:
nmv = json.load(open("NMV.json", encoding="utf-8"))
books = list(nmv["books"].keys())
nmv_df = pd.read_parquet("NMV_parquet")

# For translate api v3
# translate_client = translate.TranslationServiceClient.from_service_account_json(
#                 'rugged-truck-342720-4470f5b1c878.json'
#             )

# For translate API v2
translate_client = translate.Client.from_service_account_json(
    'rugged-truck-342720-4470f5b1c878.json'
)

In [67]:
books_df = pd.concat(TranslateChapter(nmv_df,books))
books_df.to_parquet(f"NMV_full_parquet")

Genesis: 100%|██████████| 50/50 [02:50<00:00,  3.41s/it]
Exodus: 100%|██████████| 40/40 [02:12<00:00,  3.32s/it]
Leviticus: 100%|██████████| 27/27 [01:34<00:00,  3.52s/it]
Numbers: 100%|██████████| 36/36 [02:24<00:00,  4.01s/it]
Deuteronomy: 100%|██████████| 34/34 [01:48<00:00,  3.19s/it]
Joshua: 100%|██████████| 24/24 [01:13<00:00,  3.04s/it]
Judges: 100%|██████████| 21/21 [01:09<00:00,  3.31s/it]
Ruth: 100%|██████████| 4/4 [00:09<00:00,  2.31s/it]
I Samuel: 100%|██████████| 31/31 [01:28<00:00,  2.86s/it]
II Samuel: 100%|██████████| 24/24 [01:17<00:00,  3.21s/it]
I Kings: 100%|██████████| 22/22 [01:28<00:00,  4.03s/it]
II Kings: 100%|██████████| 25/25 [01:20<00:00,  3.22s/it]
I Chronicles: 100%|██████████| 29/29 [01:42<00:00,  3.55s/it]
II Chronicles: 100%|██████████| 36/36 [01:30<00:00,  2.53s/it]
Ezra: 100%|██████████| 10/10 [00:30<00:00,  3.09s/it]
Nehemiah: 100%|██████████| 13/13 [00:44<00:00,  3.39s/it]
Esther: 100%|██████████| 10/10 [00:18<00:00,  1.86s/it]
Job: 100%|██████████|

## Stripping punctuation

In [None]:
nmv_df = pd.read_parquet("NMV_parquet")

farsi_chars = "آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی~۰۱۲۳۴۵۶۷۸۹"
farsi_chars = list(farsi_chars)
tqdm.pandas(desc="Stripping punctuation")
nmv_df.loc[:,"word_no_punctuation"] = nmv_df.word.progress_apply(lambda x:''.join((char for char in x if char in farsi_chars)))
nmv_df.head(n=10)

## Rough Workings

In [1]:
import pandas as pd
from tqdm import tqdm
from nltk.corpus import wordnet

def GetSynsets(word):
    if word:
        return wordnet.synsets(word)

def GetEngSynsets(row):
    def CompareSynsets(eng_synsets):
        # print(eng_synsets)
        if row["farsi_synsets"] and eng_synsets:
            for t_synset in row["farsi_synsets"]:
                if t_synset:
                    for e_synset in eng_synsets:
                        if e_synset:
                            yield t_synset.wup_similarity(e_synset)
    # print(row)
    eng_verse = esv_df.loc[
        (esv_df.book == row["book"]) & 
        (esv_df.idx_chapter == row["idx_chapter"]) & 
        (esv_df.idx_verse == row["idx_verse"]) & 
        (esv_df.strongs.notna())]

    eng_verse["english_synsets"] = eng_verse.eng_word.apply(wordnet.synsets)

    eng_verse["max_similarity"] = eng_verse.english_synsets.apply(lambda x: max([val for val in CompareSynsets(x)] + [0]))
    # print(eng_verse)
    eng_verse = eng_verse[["idx_word", "eng_word", "strongs", "max_similarity"]].sort_values(by="max_similarity").tail(n=1)

    return eng_verse.values if eng_verse.max_similarity.squeeze()==1 else None

nmv_full = pd.read_parquet("NMV_full_parquet")

tqdm.pandas(desc="Farsi synsets")

nmv_full["farsi_synsets"] = nmv_full.translated_word_no_stopwords.progress_apply(GetSynsets)
esv_df = pd.read_parquet("ESV_parquet")


In [21]:
nmv_partial = nmv_full.head(n=100)
tqdm.pandas(desc="Match to english")
nmv_partial["eng_match"] = nmv_partial.progress_apply(GetEngSynsets, axis=1)

Match to english: 100%|██████████| 100/100 [00:24<00:00,  4.03it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nmv_partial["eng_match"] = nmv_partial.progress_apply(GetEngSynsets, axis=1)


In [22]:
nmv_partial

Unnamed: 0,book,idx_chapter,idx_verse,idx_word,word,word_no_punctuation,translated_word,translated_word_no_stopwords,farsi_synsets,eng_match
0,Genesis,0,0,0,در,در,At,At,"[Synset('astatine.n.01'), Synset('at.n.02')]",
0,Genesis,0,0,1,آغاز،,آغاز,the beginning,beginning,"[Synset('beginning.n.01'), Synset('beginning.n...","[[1, beginning, H7225, 1.0]]"
0,Genesis,0,0,2,خدا,خدا,God,God,"[Synset('god.n.01'), Synset('deity.n.01'), Syn...","[[3, God, H430, 1.0]]"
0,Genesis,0,0,3,آسمانها,آسمانها,Skies,Skies,"[Synset('sky.n.01'), Synset('flip.v.06'), Syns...",
0,Genesis,0,0,4,و,و,And,And,[],
...,...,...,...,...,...,...,...,...,...,...
0,Genesis,0,7,1,فَلَک,فلک,فلک,فلک,[],
0,Genesis,0,7,2,را,را,را,را,[],
0,Genesis,0,7,3,’آسمان‘,آسمان,the sky,sky,"[Synset('sky.n.01'), Synset('flip.v.06')]",
0,Genesis,0,7,4,نامید.,نامید,Called,Called,"[Synset('name.v.01'), Synset('call.v.02'), Syn...","[[2, called, H7121, 1.0]]"


Seems to have about a 35% match rate

In [23]:
nmv_partial.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 0
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   book                          100 non-null    object
 1   idx_chapter                   100 non-null    int64 
 2   idx_verse                     100 non-null    int64 
 3   idx_word                      100 non-null    int64 
 4   word                          100 non-null    object
 5   word_no_punctuation           100 non-null    object
 6   translated_word               100 non-null    object
 7   translated_word_no_stopwords  100 non-null    object
 8   farsi_synsets                 100 non-null    object
 9   eng_match                     35 non-null     object
dtypes: int64(3), object(7)
memory usage: 8.6+ KB


In [24]:
from gensim.models import Word2Vec as w2v
from hazm import sent_tokenize

fa_bible_corpus = nmv_df.word.to_list()
fa_bible_sentences = sent_tokenize(" ".join(fa_bible_corpus)) # nmv_df.groupby(["book", "idx_chapter", "idx_verse"])["word"].agg(list).to_list()
fa_bible_sentence_tokens = [word_tokenize(s) for s in fa_bible_sentences]
singles = set([w for w in fa_bible_corpus if len(w)==1])
punctuation = set([s for s in singles if s not in ['آ', 'و', '\u200c', '\u200f']])
fa_bible_sentence_tokens_no_punctuation = [[w for w in s if w not in punctuation] for s in fa_bible_sentence_tokens]

w2v_model = w2v(fa_bible_sentence_tokens_no_punctuation)


In [31]:
def get_vector_wrapper(s):
    try:
        v = w2v_model.wv.get_vector(s)
        return v
    except:
        return None


nmv_df["vector"] = nmv_df.word.apply(get_vector_wrapper)

In [32]:
nmv_df.h

Unnamed: 0,book,idx_chapter,idx_verse,idx_word,word,vector
0,Genesis,0,0,0,در,"[-0.7641278, -0.5622938, -0.9979115, -0.506839..."
0,Genesis,0,0,1,آغاز,"[-0.06881103, 0.12053305, 0.06698449, 0.401715..."
0,Genesis,0,0,2,،,
0,Genesis,0,0,3,خدا,"[1.5973173, 1.1801493, 0.50769883, -0.2529177,..."
0,Genesis,0,0,4,آسمانها,"[0.258966, 0.03336788, -0.028665742, -0.094442..."


In [None]:
pd.read_parquet("NMV_parquet").head()

In [33]:
nmv_df.to_parquet("NMV_hazm_parquet")

In [38]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saaam\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [39]:
from nltk import sent_tokenize as eng_sent_tokenize
from nltk import word_tokenize as eng_word_tokenize
from string import punctuation as eng_punctuation
import numpy as np

esv_df = pd.read_parquet("ESV_parquet")
esv_df["strongs_punctuation"] = np.where(
    (esv_df.strongs.isna()) & (esv_df.eng_word.isin([a for a in eng_punctuation])), 
    esv_df.eng_word,
    esv_df.strongs
)
strongs_bible_corpus = esv_df.strongs_punctuation.dropna().to_list()
strongs_bible_sentences = eng_sent_tokenize(" ".join(strongs_bible_corpus)) # nmv_df.groupby(["book", "idx_chapter", "idx_verse"])["word"].agg(list).to_list()
strongs_bible_sentence_tokens = [eng_word_tokenize(s) for s in strongs_bible_sentences]


In [40]:
strongs_bible_sentence_tokens


[['H7225', ',', 'H430', 'H1254', 'H8064', 'H776', '.'],
 ['H776',
  'H1961',
  'H8414',
  'H922',
  'H2822',
  'H5921',
  'H6440',
  'H8415',
  'H7307',
  'H430',
  'H7363',
  'H5921',
  'H6440',
  'H4325',
  '.'],
 ['H430', 'H559', ',', 'H1961', 'H216', ',', 'H216', '.'],
 ['H430',
  'H7200',
  'H216',
  'H3588',
  'H2896',
  'H430',
  'H914',
  'H996',
  'H216',
  'H996',
  'H2822',
  '.'],
 ['H430',
  'H7121',
  'H216',
  'H3117',
  'H2822',
  'H7121',
  'H3915',
  'H6153',
  'H1242',
  'H259',
  'H3117',
  '.'],
 ['H430',
  'H559',
  ',',
  'H7549',
  'H8432',
  'H4325',
  'H914',
  'H4325',
  'H4325',
  '.'],
 ['H430',
  'H6213',
  'H7549',
  'H914',
  'H4325',
  'H834',
  'H8478',
  'H7549',
  'H4325',
  'H834',
  'H5921',
  'H7549',
  'H3651',
  '.'],
 ['H430', 'H7121', 'H7549', 'H8064', '.'],
 ['H6153', 'H1242', 'H8145', 'H3117', '.'],
 ['H430',
  'H559',
  ',',
  'H4325',
  'H4480',
  'H8064',
  'H6960',
  'H413',
  'H259',
  'H4725',
  'H3004',
  'H7200',
  '.'],
 ['H3651', '

In [41]:
strongs_bible_sentence_tokens_no_punctuation = [[w for w in s if w not in [a for a in eng_punctuation]] for s in strongs_bible_sentence_tokens]

strongs_w2v_model = w2v(strongs_bible_sentence_tokens_no_punctuation)



In [43]:
def strongs_get_vector_wrapper(s):
    try:
        v = strongs_w2v_model.wv.get_vector(s)
        return v
    except:
        return None

esv_df["strongs_vector"] = esv_df.strongs.apply(strongs_get_vector_wrapper)
esv_df.head()

Unnamed: 0,book,idx_chapter,idx_verse,idx_word,eng_word,strongs,strongs_punctuation,strongs_vector
0,Genesis,0,0,0,In the,,,
0,Genesis,0,0,1,beginning,H7225,H7225,"[-0.047161054, -0.007426138, -0.018998928, 0.1..."
0,Genesis,0,0,2,",",,",",
0,Genesis,0,0,3,God,H430,H430,"[-0.9601267, 0.5998295, -0.28738803, 0.9247632..."
0,Genesis,0,0,4,created,H1254,H1254,"[-0.07384865, 0.014700174, -0.047103573, 0.193..."


In [None]:
# use following to restrict similarity to words in verse as a key list
w2v_model.wv.most_similar_to_given()

# need to implement https://arxiv.org/pdf/1309.4168.pdf approach to train linear relationship between strongs and farsi wordvec models