In [1]:

import pandas as pd
import os
import requests
import json
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
import gradio as gr
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the translation model and tokenizer (shared by all three pipelines)
print('Loading translation model and tokenizer...')
translation_model_name = "facebook/nllb-200-distilled-600M"
translation_model = AutoModelForSeq2SeqLM.from_pretrained(
    translation_model_name)
translation_tokenizer = AutoTokenizer.from_pretrained(translation_model_name)

# Instantiate the translation pipelines.
# NLLB-200 identifies languages by FLORES-200 codes ("eng_Latn", "deu_Latn",
# ...), not by two-letter ISO codes, so those are what must be passed here.
deu_translator = pipeline('translation', model=translation_model,
                          tokenizer=translation_tokenizer,
                          src_lang="eng_Latn", tgt_lang="deu_Latn")
spa_translator = pipeline('translation', model=translation_model,
                          tokenizer=translation_tokenizer,
                          src_lang="eng_Latn", tgt_lang="spa_Latn")
fra_translator = pipeline('translation', model=translation_model,
                          tokenizer=translation_tokenizer,
                          src_lang="eng_Latn", tgt_lang="fra_Latn")

# Load the model and tokenizer
print('Loading QA model and tokenizer...')
# model_name = "deepset/roberta-base-squad2"
model_name = "deepset/bert-large-uncased-whole-word-masking-squad2"

model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the objects loaded above; passing the checkpoint name instead would
# make the pipeline load a second copy of the same weights.
qa_pipeline = pipeline('question-answering',
                       model=model, tokenizer=tokenizer)


# Set up MACULA data as pandas dataframe
print('Downloading MACULA Greek data...')


def download_file(url, file_name, timeout=60):
    """Download ``url`` and write the response body to ``file_name``.

    Args:
        url: Address to fetch with an HTTP GET.
        file_name: Path of the file to create or overwrite (binary mode).
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail before touching the disk: callers skip the download when the file
    # already exists, so a saved error page would be reused on later runs.
    response.raise_for_status()
    with open(file_name, 'wb') as file:
        file.write(response.content)


# Local file names and the remote locations they are fetched from
file1_name = 'macula-greek.tsv'
file1_url = 'https://raw.githubusercontent.com/Clear-Bible/macula-greek/main/Nestle1904/TSV/macula-greek.tsv'
file2_name = 'marble-domain-label-mapping.json'
file2_url = 'https://raw.githubusercontent.com/Clear-Bible/macula-greek/main/sources/MARBLE/SDBG/marble-domain-label-mapping.json'

# Fetch each asset only when the working directory has no entry of that name
for _asset_url, _asset_name in ((file1_url, file1_name),
                                (file2_url, file2_name)):
    if _asset_name in os.listdir():
        continue
    download_file(_asset_url, _asset_name)

# Import Macula Greek data.
# dtype=str keeps every column as text exactly as written in the TSV. The
# earlier converters={'*': str} had no effect, because converter keys must be
# column labels or positions, so pandas was still inferring column types.
mg = pd.read_csv('macula-greek.tsv', index_col='xml:id', sep='\t',
                 header=0, dtype=str).fillna('missing')

# Extract book, chapter, and verse into separate columns
mg[['book', 'chapter', 'verse']] = mg['ref'].str.extract(
    r'(\d?[A-Z]+)\s(\d+):(\d+)')

# Add columns for book + chapter, and book + chapter + verse for easier grouping
mg['book_chapter'] = mg['book'] + ' ' + mg['chapter'].astype(str)
mg['book_chapter_verse'] = mg['book_chapter'] + ':' + mg['verse'].astype(str)

# Import domain-label mapping.
# The encoding is stated explicitly so the result does not depend on the
# platform's default text encoding.
with open('marble-domain-label-mapping.json', 'r', encoding='utf-8') as f:
    domain_labels = json.load(f)

# Placeholder codes used for rows without a domain ('missing' is the fill
# value applied to the dataframe above).
domain_labels['missing'] = 'no domain'
domain_labels['nan'] = 'no domain'

# Use domain labels to create a new column


def get_domain_label(domain_string_number):
    """Look up the label of every domain code in a space-separated string.

    Args:
        domain_string_number: One or more domain codes separated by single
            spaces.

    Returns:
        A list with one entry from ``domain_labels`` per code, in the order
        the codes appear.

    Raises:
        KeyError: If a code is not a key of ``domain_labels``.
    """
    resolved = []
    for code in domain_string_number.split(' '):
        resolved.append(domain_labels[code])
    return resolved


# Each cell of the new 'domain_label' column holds the list that
# get_domain_label returns for that row's 'domain' string.
mg['domain_label'] = mg['domain'].apply(get_domain_label)

# to get data for a specific word, use the following code:
# mg.loc['n40001001002'].to_dict() where n40001001002 is the word ID

# Maps a MACULA Greek attribute (column) name to an English description.
# generate_prosaic_context uses the keys as its default field list and puts
# the description text in front of each value it reports.
# NOTE(review): 'xml:id' is the dataframe index rather than a column, so a row
# converted with .to_dict() has no 'xml:id' key and that entry is skipped.
attribute_descriptions = {
    "after": "Encodes the following character, including a blank space.",
    "articular": "'true' if the word has an article (i.e., modified by the word 'the').",
    "case": "Grammatical case: nominative, genitive, dative, accusative, or vocative",
    "class": "On words, the class is the word's part of speech",
    "cltype": "Explicitly marks Verbless Clauses, Verb Elided Clauses, and Minor Clauses",
    "degree": "A derivative lexical category, indicating the degree of the adjective",
    "discontinuous": "'true' if the word is discontinuous with respect to sentence order due to reordering in the syntax tree",
    "domain": "Semantic domain information from the Semantic Dictionary of Biblical Greek (SDBG)",
    "frame": "Frames of verbs, refers to the arguments of the verb",
    "gender": "Grammatical gender values",
    "gloss": "SIL data, not Berean",
    "lemma": "Form of the word as it appears in a dictionary.",
    "ln": "Short for Louw-Nida, representing the semantic domain entry in Johannes P. Louw and Eugene Albert Nida, Greek-English Lexicon of the New Testament: Based on Semantic Domains (New York: United Bible Societies, 1996).",
    "mood": "Grammatical mood",
    "morph": "Morphological parsing codes",
    "normalized": "The normalized form of the token (i.e., no trailing or leading punctuation or accent shifting depending on context)",
    "number": "Grammatical number",
    "person": "Grammatical person",
    "ref": "Verse!word reference to this edition of the Nestle1904 text by USFM id",
    "referent": "The xml:id of the node to which a pronoun (i.e., 'he') refers. Note that some of these IDs are not word IDs but rather phrase or clause IDs.",
    "role": "The clause-level role of the word.",
    "strong": "Strong's number for the lemma",
    "subjref": "The xml:id of the node that is the implied subject of a verb (for verbs without an explicit subject). Note that some of these IDs are not word IDs but rather phrase or clause IDs.",
    "tense": "Grammatical tense form",
    "text": "Text content associated with the ID",
    "type": "Indicates different types of pronominals",
    "voice": "Grammatical voice",
    "xml:id": "XML ids occur on every word and encode the corpus ('n' for New Testament), the book (40 for Matthew), the chapter (001), verse (001), and word (001)."
}


def generate_prosaic_context(word_id, selected_fields=None):
    """Describe one word of the ``mg`` dataframe as a single line of text.

    The result starts with "<lemma>'s " followed by a comma-separated list of
    "<description> (<field>): <value>" entries, one per selected field that
    has a value.

    Args:
        word_id: Index label of the word's row in ``mg``.
        selected_fields: Field names to report. When empty or None, every key
            of ``attribute_descriptions`` is used.

    Returns:
        The assembled description string.

    Raises:
        KeyError: If ``word_id`` is not in the index of ``mg``.
    """
    word_data = mg.loc[word_id].to_dict()

    if not selected_fields:
        selected_fields = list(attribute_descriptions.keys())

    descriptions = []
    for key in selected_fields:
        value = word_data.get(key)
        # Fields absent from the row or holding a placeholder are left out
        if value in (None, 'missing', 'nan'):
            continue
        attribute_description = attribute_descriptions.get(key)
        if attribute_description is None:
            # A caller-selected field without a description: report it by
            # name instead of printing the text "None".
            descriptions.append(f"{key}: {value}")
        else:
            descriptions.append(f"{attribute_description} ({key}): {value}")

    return f"{word_data['lemma']}'s " + ", ".join(descriptions)

# to generate prosaic context: prosaic_context = generate_prosaic_context(word_id)


# to return a sentence based on the book_chapter_verse value:
# mg[mg['book_chapter_verse'] == 'MAT 1:1'] # this will return every row for this verse
# for row in mg[mg['book_chapter_verse'] == 'MAT 1:1'].itertuples():
#     print(row)

# Distinct 'book_chapter_verse' values found in mg, as a set
unique_book_chapter_verse = {
    reference for reference in mg['book_chapter_verse'].unique()
}

# Example verse reference
verseRef = 'JHN 3:16'

# def extract_text_and_gloss(verseRef):
#     if verseRef not in unique_book_chapter_verse:
#         return {text: 'verse not found', gloss: ''}
#     result = {}
#     for _, row in mg[mg['book_chapter_verse'] == verseRef].iterrows():
#         result[row['text']] = row['gloss']
#     return result

# text_and_gloss = extract_text_and_gloss(verseRef) # to get text and gloss




Loading translation model and tokenizer...
Loading QA model and tokenizer...
Downloading MACULA Greek data...


In [2]:
# Define some helper functions

def get_contextual_data(tokenId):
    """Return the text produced by generate_prosaic_context for ``tokenId``."""
    return generate_prosaic_context(tokenId)


def get_verse_content(verseRef, dataframe):
    """List the words of one verse.

    Args:
        verseRef: Value to look for in the 'book_chapter_verse' column.
        dataframe: DataFrame with 'book_chapter_verse', 'text' and 'gloss'
            columns; its index supplies each word's id.

    Returns:
        A list of ``{"text", "gloss", "id"}`` dicts in row order, or an empty
        list when no row has that reference.
    """
    # The filter alone decides whether the verse exists, so there is no need
    # to build a set of the whole column first.
    matching_rows = dataframe[dataframe['book_chapter_verse'] == verseRef]
    if matching_rows.empty:
        return []
    return [
        {"text": text, "gloss": gloss, "id": idx}
        for text, gloss, idx in zip(matching_rows['text'],
                                    matching_rows['gloss'],
                                    matching_rows.index)
    ]


def answer_question(question, context):
    """Run ``qa_pipeline`` on a question/context pair and return its 'answer'."""
    result = qa_pipeline({'question': question, 'context': context})
    return result['answer']


def translate_text(text):
    """Translate ``text`` with the three module-level translators.

    Returns:
        A ``(deu, spa, fra)`` tuple holding the 'translation_text' of the
        first result of each translator, in that order.
    """
    translators = (deu_translator, spa_translator, fra_translator)
    return tuple(
        translate(text)[0]['translation_text'] for translate in translators
    )


def gradio_get_verse_content(verseRef):
    """Gradio callback: look ``verseRef`` up in the module-level ``mg`` frame."""
    return get_verse_content(verseRef, mg)


def gradio_wrapper(inputs, context):
    """Gradio callback: answer a question and translate the answer.

    Args:
        inputs: The question text.
        context: The text the answer is extracted from.

    Returns:
        A 4-tuple ``(answer, deu, spa, fra)``: the answer followed by the
        three values returned by ``translate_text``.
    """
    answer = answer_question(inputs, context)
    german, spanish, french = translate_text(answer)
    return (answer, german, spanish, french)


In [3]:
#TODO: add a function to get discourse features for token using Jake's API

In [4]:
demo = gr.Blocks()

with demo:
    gr.Markdown(
        "Look up a verse, build a context description for one of its words, "
        "then ask a question about that context.")

    # Step 1: list the words of a verse
    with gr.Row():
        verse_reference = gr.Textbox(lines=1, label="Verse Reference", placeholder="Enter verse reference here (e.g. 'JHN 3:16')", value="JHN 3:16")
        text_output = gr.Textbox(placeholder="Verse data will appear here", max_lines=10)
    get_verse_content_button = gr.Button("Get Verse Content")

    get_verse_content_button.click(
        gradio_get_verse_content, inputs=verse_reference, outputs=text_output)

    # Step 2: build the context description for one word id
    word_id_input = gr.Textbox(lines=1, label="Word ID", placeholder="Enter word ID here (e.g. 'n43003016012')", value="n43003016012")
    context_output = gr.Textbox(placeholder="Context will appear here", max_lines=10)

    get_context_button = gr.Button("Get Context")

    get_context_button.click(
        get_contextual_data, inputs=word_id_input, outputs=context_output)

    # Step 3: answer a question against that context and translate the answer
    question_input = gr.Textbox(lines=2, label="Question", placeholder="Enter question here")
    answer_output = gr.Textbox(placeholder="Answer will appear here", max_lines=10)
    with gr.Row():
        deu_output = gr.Textbox(label="German", placeholder="German translation will appear here", max_lines=10)
        spa_output = gr.Textbox(label="Spanish", placeholder="Spanish translation will appear here", max_lines=10)
        fra_output = gr.Textbox(label="French", placeholder="French translation will appear here", max_lines=10)

    get_answer_button = gr.Button("Get Answer")

    # gradio_wrapper returns four values (answer + three translations), so it
    # needs four output components, listed in the same order.
    get_answer_button.click(
        gradio_wrapper, inputs=[question_input, context_output],
        outputs=[answer_output, deu_output, spa_output, fra_output])

demo.launch()

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


