In [30]:
import pandas as pd
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity

def ActivateModel(modelname="bert-base-multilingual-cased"):
    from transformers import AutoConfig, AutoTokenizer, AutoModel

    # Activate model and tokenizer

    # "HooshvareLab/bert-fa-base-uncased"

    config = AutoConfig.from_pretrained(modelname)
    tokenizer = AutoTokenizer.from_pretrained(modelname)
    model = AutoModel.from_pretrained(modelname)
    return config, tokenizer, model

def LoadBible(parquet_filename):
    # Read pre-processed farsi bible as pandas dataframe
    return pd.read_parquet(f"transformations/{parquet_filename}.parquet")

def GenerateBibleWords(bible_df, target):     
    for bk in set(bible_df.book):
        for ic in bible_df.loc[bible_df.book == bk].idx_chapter:
            for iv in bible_df.loc[(bible_df.book == bk) & (bible_df.idx_chapter == ic)].idx_verse:
                # Prepare subset of verse(s) for feeding to model
                
                bible_words = (
                    bible_df
                    .loc[
                        (bible_df.book == bk) &
                        (bible_df.idx_chapter == ic) &
                        (bible_df.idx_verse == iv)
                    ]
                    .groupby(["book", "idx_chapter", "idx_verse", "idx_word"])[target]
                    .agg(" ".join)
                    .to_list()
                )
                split_bible_words = [a for sub in [s.split("/") for s in bible_words] for a in sub]
                yield split_bible_words, (bk, ic, iv)

def GetWordEmbeddings(words, model, tokenizer):

    def mean_pooling(model_output, attention_mask):
        token_embeddings = model_output[0] #First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask
        
    # Tokenize sentences
    encoded_input = tokenizer(words, padding=True, truncation=False, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)
    embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    words_embeddings_df = pd.DataFrame({"word": words, "embeddings": embeddings.tolist()})
    return words_embeddings_df

def max_cos_sim(row, comparison):
    if len(row["word"]) > 1 or row["word"] in ['آ', 'و', '\u200c', '\u200f']:
        compare = comparison.copy()
        compare["arr"] = compare.embeddings.apply(lambda x:np.array(x).reshape(1,-1))
        row_arr = np.array(row["embeddings"]).reshape(1,-1)
        compare["sim"] = compare.arr.apply(lambda x:cosine_similarity(row_arr, x))
        out = compare.sort_values("sim").tail(n=1)[["word", "strongs", "sim"]].squeeze()
        return out

def GenerateMatches():
    original = LoadBible("original")
    nmv = LoadBible("NMV_hazm")
    config, tokenizer, model = ActivateModel()
    nmv_list = GenerateBibleWords(nmv, "word")
    orig_list = GenerateBibleWords(original, "eng_word")
    strongs_list = GenerateBibleWords(original, "strongs")
# figure out how to iterate through in parallel and avoid restarting generator each time. 
    for nmv_verse, orig_verse, strongs in zip(
        nmv_list, 
        orig_list,
        strongs_list
    ):
        nmv_embeddings = GetWordEmbeddings(nmv_verse[0], model, tokenizer)
        orig_embeddings = GetWordEmbeddings(orig_verse[0], model, tokenizer)
        orig_embeddings["strongs"] = strongs[0]
        nmv_embeddings[["orig_word", "strongs", "similarity"]] = nmv_embeddings.apply(max_cos_sim, axis=1, args=[orig_embeddings])
        nmv_embeddings[["book", "idx_chapter", "idx_verse"]] = nmv_verse[1]
        nmv_embeddings = nmv_embeddings.set_index(["book", "idx_chapter", "idx_verse"])
        yield nmv_embeddings.drop(columns="embeddings")


In [31]:
match_generator = GenerateMatches()

In [33]:
next(match_generator)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,word,orig_word,strongs,similarity
book,idx_chapter,idx_verse,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,0,Matthew,Matthew,Matthew,[[1.0000000000000013]]


Worked really well! except it used אלהים twice - for خدا and آفرید. Find a way to limit to one use per word.
Has a problem with names and confusing Jesus with Christ. Check titles of God.

In [43]:
next(GenerateBibleWords(nmv, "word"))

Unnamed: 0,verse,embeddings,orig_word
0,در,"[0.4512456953525543, -0.31960389018058777, 0.5...",את
1,آغاز,"[0.6214501261711121, -0.20138870179653168, 0.7...",בראשית
2,،,"[-0.10916757583618164, -0.277116984128952, 0.2...",את
3,خدا,"[0.3473285734653473, -0.14002932608127594, 1.4...",אלהים
4,آسمانها,"[0.24371211230754852, -0.4253353774547577, 1.4...",השמים
5,و,"[-0.2500661015510559, -0.6296098232269287, 0.3...",ואת
6,زمین,"[0.08398868888616562, -0.539448082447052, 1.50...",הארץ
7,را,"[0.31364795565605164, -0.2202995866537094, 0.6...",ברא
8,آفرید,"[0.5046108961105347, -0.2856839597225189, 0.43...",אלהים
9,.,"[-0.04137072339653969, -0.3036855161190033, 0....",את


In [None]:


nmv_words_embeddings["orig_word"] = nmv_words_embeddings.apply(max_cos_sim, axis=1)
nmv_words_embeddings



In [19]:
numpy.array(orig_words_embeddings.embeddings.values[2]).reshape(-1,1)

array([[ 0.30579352],
       [-0.35263091],
       [ 0.52627289],
       [ 0.19887884],
       [ 0.09378666],
       [-0.39361244],
       [-0.18285608],
       [-0.23386887],
       [-0.00737345],
       [ 0.1719248 ],
       [ 0.11668895],
       [-0.15767533],
       [ 0.03278651],
       [ 0.4927997 ],
       [-0.19492441],
       [-0.13549076],
       [-0.0706151 ],
       [ 0.63727289],
       [-0.23499364],
       [ 0.42784393],
       [ 0.71955812],
       [ 0.12057144],
       [-0.14444244],
       [-0.13567054],
       [-0.05788551],
       [-1.04369688],
       [-0.23726423],
       [ 0.00696002],
       [ 0.02243754],
       [-0.008213  ],
       [ 0.33700311],
       [ 0.36881605],
       [ 0.01510219],
       [ 0.25999826],
       [-0.50559771],
       [ 0.26791051],
       [-0.77487803],
       [-0.01248471],
       [ 0.22467226],
       [-0.19753914],
       [-0.02008138],
       [ 0.29175508],
       [-0.06818126],
       [ 0.15745869],
       [ 0.1348556 ],
       [ 0