In [3]:
import spacy
from scipy.spatial.distance import euclidean
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from pathlib import Path
from sentence_transformers import SentenceTransformer
import modal
from collections import Counter
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [91]:
# Load the spaCy pipeline 'en_core_web_lg'. Later cells call `model(text)` and read
# each token's `.vector` and `.pos_`.
model = spacy.load('en_core_web_lg')

In [4]:
# Directory whose entries are read by the loading cell: the file with stem 'vref'
# supplies the verse references, every other file is stored in `data` under its stem.
files = Path('data/training')

In [158]:
# Load every file under `files` as a list of whitespace-stripped lines.
#   * the file whose stem is 'vref' becomes `vrefs` (line -> reference) and
#     `vref_dict` (reference -> line index);
#   * every other file is stored in `data`, keyed by its stem.
data = {}
for file in files.iterdir():
    # Skip sub-directories and other non-files; open() would fail on them.
    if not file.is_file():
        continue
    # Explicit encoding so the result does not depend on the platform default.
    # NOTE(review): assumes the corpus is UTF-8 (the word filter in this notebook
    # lists curly quotes, so the texts are not pure ASCII) - confirm.
    with open(file, 'r', encoding='utf-8') as f:
        current_text = [x.strip() for x in f]
    if file.stem == 'vref':
        vrefs = current_text
        vref_dict = {vref: i for i, vref in enumerate(vrefs)}
    else:
        data[file.stem] = current_text

In [45]:
# Token strings that the expected-word selection skips outright
# (articles, prepositions and punctuation).
excluded_words = ['the', 'a', ',', '.', 'in', 'to', 'of', '-', '“', '”']

In [154]:
# Half-open range of line indices processed by the loops below, which iterate
# `range(first_line, last_line)` and index `vrefs` and each text with the result.
# NOTE(review): 41899 is presumably the number of lines in the vref file - confirm.
first_line = 0
last_line = 41899

In [None]:
# Build `expected_words`: for each verse line in [first_line, last_line), the
# tokens that every translation with text on that line contains, or nearly
# contains.
#
# A token qualifies when, in each translation that has at least one token on the
# line, the closest token vector is strictly nearer than `max_match_distance`
# (Euclidean). Translations with no tokens on the line are skipped rather than
# counted as a miss.
words = {}
grammar = {}  # NOTE(review): not read or filled by this cell; kept in case another cell uses it
expected_words = {}
max_match_distance = 20
skipped_tokens = set(excluded_words)  # hoisted so membership tests are constant-time
for line_no in range(first_line, last_line):
    vref = vrefs[line_no]
    print(vref)
    expected_words[vref] = {}
    words = {}
    checked_words = set()  # token strings already evaluated for this verse

    # Tokenise the lower-cased verse in every translation.
    for current_text in data:
        d = model(data[current_text][line_no].lower())
        words[current_text] = [
            {'token': str(token), 'vector': token.vector, 'pos': token.pos_}
            for token in d
        ]

    for current_text in data:
        for expected_word in words[current_text]:
            token_text = expected_word['token']
            if token_text in checked_words or token_text in skipped_tokens:
                continue
            # All-zero vectors are skipped (presumably tokens without an
            # embedding, for which a distance would be meaningless).
            if np.count_nonzero(expected_word['vector']) == 0:
                continue
            checked_words.add(token_text)

            # Distance to the nearest token in each translation that has text here.
            # The candidate's own translation always contributes, so the list is
            # never empty.
            closest_per_text = []
            for other_text in data:
                distances = [
                    euclidean(expected_word['vector'], other_word['vector'])
                    for other_word in words[other_text]
                ]
                if distances:
                    closest_per_text.append(np.min(distances))

            if np.max(closest_per_text) < max_match_distance:
                expected_words[vref][token_text] = {
                    'vector': expected_word['vector'],
                    'pos': expected_word['pos'],
                }

In [149]:
# Flatten `expected_words` into one record per (verse reference, expected word).
rows = [
    {
        'vref': vref,
        'line_no': vref_dict[vref],
        'word': word,
        'pos': word_info['pos'],
        'vector': word_info['vector'],
    }
    for vref, word_data in expected_words.items()
    for word, word_info in word_data.items()
]

In [6]:
# Tabulate the records and replace any missing values with empty strings.
df = pd.DataFrame(rows).fillna('')

In [None]:
# Add one column per translation holding that translation's text for the row's verse.
for translation, translation_lines in data.items():
    df[translation] = [translation_lines[vref_dict[ref]] for ref in df['vref']]

In [8]:
# Persist the table as CSV.
# NOTE(review): the 'vector' column holds array objects, so the CSV stores their text
# form, which the reload cell parses back with np.fromstring. This presumably keeps
# only printed precision - confirm whether that loss matters.
df.to_csv('data/expected_words.csv')

In [119]:
# Read the text that is checked against the expected words: one stripped,
# lower-cased string per line.
# Explicit encoding so the result does not depend on the platform default.
# NOTE(review): assumes the file is UTF-8 - confirm.
with open('data/en-GNBUK.txt', 'r', encoding='utf-8') as f:
    current_text = [x.strip().lower() for x in f]
verses = current_text

In [120]:
# Tokenise every line of `verses` and index the tokens by verse reference:
# inference_words[vref][token_text] -> {'token': token_text, 'vector': token vector}.
# A token string that repeats within a line keeps only its last occurrence.
inference_words = {}
for line_no in range(first_line, last_line):
    per_verse = inference_words[vrefs[line_no]] = {}
    verse_words = []  # not used by this cell
    for token in model(verses[line_no]):
        token_text = str(token)
        per_verse[token_text] = {'token': token_text, 'vector': token.vector}

In [84]:
def find_non_range_vref(line_no, vrefs, inference_words):
    """Return the closest reference at or before ``line_no`` that is not a range line.

    A line counts as a range line when its token table contains all three of
    ``'<'``, ``'>'`` and ``'range'`` (presumably the tokenised form of a
    ``<range>`` placeholder).

    Args:
        line_no: Index into ``vrefs`` at which the backwards search starts.
        vrefs: Sequence of verse references, indexed by line number.
        inference_words: Mapping from verse reference to a container of that
            verse's token strings.

    Returns:
        The first matching reference found walking back towards index 0, or
        ``None`` when every line down to index 0 is a range line (or when
        ``line_no`` is negative).
    """
    range_markers = ('<', '>', 'range')
    for index in range(line_no, -1, -1):
        candidate = vrefs[index]
        tokens = inference_words[candidate]
        if all(marker in tokens for marker in range_markers):
            continue
        return candidate
    return None

In [121]:
# Report expected words that appear to be absent from the text under test: for
# each verse, print every expected word whose nearest token (Euclidean distance
# between vectors) is farther than `max_missing_distance`.
max_missing_distance = 40
for line_no in range(first_line, last_line):
    vref = vrefs[line_no]
    # Range lines are replaced by the closest earlier line that is not a range
    # line, i.e. the previous verse is used.
    inference_vref = find_non_range_vref(line_no, vrefs, inference_words)
    verse_words = inference_words[inference_vref] if inference_vref is not None else {}
    if not verse_words:
        # No tokens to measure against: np.min of an empty list would raise, and
        # a None reference would be a KeyError. Report once and move on.
        if expected_words[vref]:
            print(f'{vref}: no text to compare against')
        continue
    for expected_word, expected_word_info in expected_words[vref].items():
        distances = [
            euclidean(word['vector'], expected_word_info['vector'])
            for word in verse_words.values()
        ]
        closest = np.min(distances)  # computed once, reused for the test and the message
        if closest > max_missing_distance:
            print(f'{vref}: {expected_word}: {closest}')

MAT 1:6: king: 58.570579528808594
MAT 1:11: at: 118.23568725585938
MAT 1:16: is: 94.45436096191406
MAT 1:17: all: 59.84769821166992
MAT 1:19: her: 82.86419677734375
MAT 1:19: and: 50.42655563354492
MAT 1:20: son: 57.62818908691406
MAT 1:20: her: 64.83782196044922
MAT 1:23: they: 59.232181549072266
MAT 1:24: did: 72.64629364013672
MAT 1:24: his: 81.50347900390625
MAT 1:24: wife: 50.55107498168945
MAT 2:6: among: 57.05116271972656
MAT 2:7: then: 43.94667053222656
MAT 2:15: death: 62.96446990966797
MAT 2:16: under: 53.71119689941406
MAT 2:18: and: 53.76637649536133
MAT 2:21: land: 73.59423065185547
MAT 3:1: those: 51.08821105957031
MAT 3:1: days: 73.4655532836914
MAT 3:2: for: 66.38871765136719
MAT 3:3: voice: 51.382080078125
MAT 3:3: way: 60.681278228759766
MAT 3:9: say: 48.72941207885742
MAT 3:9: we: 81.80244445800781
MAT 3:9: children: 44.098846435546875
MAT 3:12: is: 105.95774841308594
MAT 3:13: then: 43.94667053222656


MAT 3:17: and: 51.96299362182617
MAT 4:1: was: 82.08447265625
MAT 4:2: he: 123.3864517211914
MAT 4:3: become: 59.81270980834961
MAT 4:4: mouth: 54.353843688964844
MAT 4:6: he: 100.50391387939453
MAT 4:6: foot: 41.56648635864258
MAT 4:10: for: 67.30860900878906
MAT 4:17: for: 66.38871765136719
MAT 4:18: sea: 63.23916244506836
MAT 4:18: into: 73.010498046875
MAT 4:18: for: 67.638427734375
MAT 4:18: they: 59.02048110961914
MAT 4:24: he: 100.50391387939453
MAT 5:3: is: 102.53864288330078
MAT 5:4: they: 45.875858306884766
MAT 5:4: be: 133.22691345214844
MAT 5:5: inherit: 40.54157257080078
MAT 5:6: and: 56.576560974121094
MAT 5:6: for: 63.84164810180664
MAT 5:6: they: 45.875858306884766
MAT 5:6: be: 133.22691345214844
MAT 5:7: they: 45.875858306884766
MAT 5:9: they: 45.875858306884766
MAT 5:9: be: 133.22691345214844
MAT 5:9: called: 53.478511810302734
MAT 5:10: for: 63.84164810180664
MAT 5:10: is: 101.4598617553711
MAT 5:13: ?: 65.57511901855469
MAT 5:13: be: 131.36361694335938
MAT 5:16: let

In [122]:
# Inspect the row(s) for the word 'son' in MAT 1:20.
is_target_verse = df['vref'] == 'MAT 1:20'
is_target_word = df['word'] == 'son'
df[is_target_verse & is_target_word]

Unnamed: 0,vref,word,pos,vector,en-NLT07,eng-engylt,eng-eng-rv,eng-engasvbt,en-NASB,en-ESVUK,eng-engDBY,en-NIV11,en-NRSV
67,MAT 1:20,son,NOUN,"[-1.8598, -2.4616, -5.7032, -2.3932, -0.86175,...","as he considered this, an angel of the lord ap...","and on his thinking of these things, lo, a mes...","but when he thought on these things, behold, a...","but when he thought on these things, behold, a...","but when he had considered this, behold, an an...","but as he considered these things, behold, an ...","but while he pondered on these things, behold,...","but after he had considered this, an angel of ...","but just when he had resolved to do this, an a..."


In [4]:
# Reload the saved table, using the first CSV column as the index. This rebinds `df`,
# so the 'vector' column now holds text until the parsing cell converts it.
df = pd.read_csv('data/expected_words.csv', index_col=0)

In [129]:
def _parse_vector(value):
    """Convert a bracketed, whitespace-separated number string into a float array.

    Values that are not strings (for example arrays produced by an earlier run of
    this cell) are returned unchanged, so the cell can safely be executed again.
    """
    if isinstance(value, str):
        # Drop the surrounding '[' and ']' before parsing the numbers.
        return np.fromstring(value[1:-1], sep=' ')
    return value


# Turn the text form of each vector, as read back from the CSV, into a numpy array.
df['vector'] = df['vector'].apply(_parse_vector)

In [136]:
# Display the reloaded table (the rendered output is truncated by pandas).
df

Unnamed: 0,vref,line_no,word,pos,vector,en-NLT07,eng-engylt,eng-eng-rv,eng-engasvbt,en-NASB,en-ESVUK,eng-engDBY,en-NIV11,en-NRSV
0,MAT 1:1,23213,jesus,PROPN,[ 2.1244 0.69442 -1.6792 -0.14436 -...,this is a record of the ancestors of jesus the...,"a roll of the birth of jesus christ, son of da...","the book of the generation of jesus christ, th...","the book of the generation of jesus christ, th...",the record of the genealogy of jesus the messi...,"the book of the genealogy of jesus christ, the...","book of the generation of jesus christ, son of...",this is the genealogy of jesus the messiah the...,an account of the genealogy of jesus the messi...
1,MAT 1:1,23213,david,PROPN,[-1.0181 -0.25792 -3.2743 -0.58327 -...,this is a record of the ancestors of jesus the...,"a roll of the birth of jesus christ, son of da...","the book of the generation of jesus christ, th...","the book of the generation of jesus christ, th...",the record of the genealogy of jesus the messi...,"the book of the genealogy of jesus christ, the...","book of the generation of jesus christ, son of...",this is the genealogy of jesus the messiah the...,an account of the genealogy of jesus the messi...
2,MAT 1:1,23213,abraham,PROPN,[-1.0802e+00 1.3458e-01 -2.1085e+00 -5.6739e-...,this is a record of the ancestors of jesus the...,"a roll of the birth of jesus christ, son of da...","the book of the generation of jesus christ, th...","the book of the generation of jesus christ, th...",the record of the genealogy of jesus the messi...,"the book of the genealogy of jesus christ, the...","book of the generation of jesus christ, son of...",this is the genealogy of jesus the messiah the...,an account of the genealogy of jesus the messi...
3,MAT 1:2,23214,abraham,PROPN,[-1.0802e+00 1.3458e-01 -2.1085e+00 -5.6739e-...,abraham was the father of isaac. isaac was the...,"abraham begat isaac, and isaac begat jacob, an...",abraham begat isaac; and isaac begat jacob; an...,abraham begat isaac; and isaac begat jacob; an...,"abraham was the father of isaac, isaac the fat...","abraham was the father of isaac, and isaac the...","abraham begat isaac; and isaac begat jacob, an...","abraham was the father of isaac, isaac the fat...","abraham was the father of isaac, and isaac the..."
4,MAT 1:2,23214,isaac,PROPN,[-0.851 -0.55611 -0.93684 -0.18849 -0.78...,abraham was the father of isaac. isaac was the...,"abraham begat isaac, and isaac begat jacob, an...",abraham begat isaac; and isaac begat jacob; an...,abraham begat isaac; and isaac begat jacob; an...,"abraham was the father of isaac, isaac the fat...","abraham was the father of isaac, and isaac the...","abraham begat isaac; and isaac begat jacob, an...","abraham was the father of isaac, isaac the fat...","abraham was the father of isaac, and isaac the..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2334,MAT 13:3,23610,went,VERB,[ 3.1810e-01 2.0360e-01 -3.1375e+00 -2.3839e+...,"he told many stories in the form of parables, ...","and he spake to them many things in similes, s...","and he spake to them many things in parables, ...","and he spake to them many things in parables, ...","and he spoke many things to them in parables, ...","and he told them many things in parables, sayi...","and he spoke to them many things in parables, ...","then he told them many things in parables, say...","and he told them many things in parables, sayi..."
2335,MAT 13:4,23611,some,DET,[-9.1138e-01 -1.2467e+00 -1.4404e+00 3.2280e+...,"as he scattered them across his field, some se...","and in his sowing, some indeed fell by the way...","and as he sowed, some seeds fell by the way si...","and as he sowed, some seeds fell by the waysid...","and as he sowed, some seeds fell beside the ro...","and as he sowed, some seeds fell along the pat...","and as he sowed, some [grains] fell along the ...","as he was scattering the seed, some fell along...","and as he sowed, some seeds fell on the path, ..."
2336,MAT 13:4,23611,fell,VERB,[ 1.1734 -7.2025 0.27734 -5.4665 ...,"as he scattered them across his field, some se...","and in his sowing, some indeed fell by the way...","and as he sowed, some seeds fell by the way si...","and as he sowed, some seeds fell by the waysid...","and as he sowed, some seeds fell beside the ro...","and as he sowed, some seeds fell along the pat...","and as he sowed, some [grains] fell along the ...","as he was scattering the seed, some fell along...","and as he sowed, some seeds fell on the path, ..."
2337,MAT 13:4,23611,and,CCONJ,[-3.3477e+00 -6.0854e+00 -3.6366e+00 5.3480e-...,"as he scattered them across his field, some se...","and in his sowing, some indeed fell by the way...","and as he sowed, some seeds fell by the way si...","and as he sowed, some seeds fell by the waysid...","and as he sowed, some seeds fell beside the ro...","and as he sowed, some seeds fell along the pat...","and as he sowed, some [grains] fell along the ...","as he was scattering the seed, some fell along...","and as he sowed, some seeds fell on the path, ..."


In [143]:
# Every column that is not word metadata is a translation text column.
metadata_cols = ('vref', 'line_no', 'word', 'pos', 'vector')
text_cols = list(filter(lambda name: name not in metadata_cols, df.columns))

In [144]:
# Show which columns were classified as translation text.
text_cols

['en-NLT07',
 'eng-engylt',
 'eng-eng-rv',
 'eng-engasvbt',
 'en-NASB',
 'en-ESVUK',
 'eng-engDBY',
 'en-NIV11',
 'en-NRSV']

In [140]:
# Take the first row of the table as a sample record.
row = df.iloc[0]

In [145]:
# Show the sample row's verse text in each translation.
dict((name, row[name]) for name in text_cols)

{'en-NLT07': 'this is a record of the ancestors of jesus the messiah, a descendant of david and of abraham:',
 'eng-engylt': 'a roll of the birth of jesus christ, son of david, son of abraham.',
 'eng-eng-rv': 'the book of the generation of jesus christ, the son of david, the son of abraham.',
 'eng-engasvbt': 'the book of the generation of jesus christ, the son of david, the son of abraham.',
 'en-NASB': 'the record of the genealogy of jesus the messiah, the son of david, the son of abraham:',
 'en-ESVUK': 'the book of the genealogy of jesus christ, the son of david, the son of abraham.',
 'eng-engDBY': 'book of the generation of jesus christ, son of david, son of abraham.',
 'en-NIV11': 'this is the genealogy of jesus the messiah the son of david, the son of abraham:',
 'en-NRSV': 'an account of the genealogy of jesus the messiah, the son of david, the son of abraham.'}