In [1]:
!pip show transformers

Name: transformers
Version: 4.27.0.dev0
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /Users/ryderwishart/miniforge3/lib/python3.10/site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, tokenizers, tqdm
Required-by: 


In [2]:

import pandas as pd
import os
import requests
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the translation model and tokenizer
print('Loading translation model and tokenizer...')
translation_model_name = "facebook/nllb-200-distilled-600M"
translation_model = AutoModelForSeq2SeqLM.from_pretrained(
    translation_model_name)
translation_tokenizer = AutoTokenizer.from_pretrained(translation_model_name)

# Instantiate the translation pipelines (English -> German / Spanish / French).
# NLLB-200 identifies languages by FLORES-200 codes such as "eng_Latn"; two-letter
# codes like "en"/"de" are not language tokens of this tokenizer, so they cannot
# select the source/target language.
deu_translator = pipeline('translation', model=translation_model,
                          tokenizer=translation_tokenizer, src_lang="eng_Latn", tgt_lang="deu_Latn")
spa_translator = pipeline('translation', model=translation_model,
                          tokenizer=translation_tokenizer, src_lang="eng_Latn", tgt_lang="spa_Latn")
fra_translator = pipeline('translation', model=translation_model,
                          tokenizer=translation_tokenizer, src_lang="eng_Latn", tgt_lang="fra_Latn")

# Load the model and tokenizer
print('Loading inference model and tokenizer...')
# model_name = "bigscience/bloom-7b1"
# model_name = "decapoda-research/llama-7b-hf"
model_name = "openai-gpt"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the model/tokenizer objects loaded just above instead of passing the
# checkpoint name, which made the pipeline load a second copy of the weights.
qa_pipeline = pipeline('text-generation',
                       model=model, tokenizer=tokenizer)


# Set up MACULA data as pandas dataframe
print('Downloading MACULA Greek data...')


def download_file(url, file_name, timeout=60):
    """Download ``url`` and write the response body to ``file_name``.

    Args:
        url: Address to fetch with an HTTP GET request.
        file_name: Path of the local file to (over)write in binary mode.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
            The check runs before the file is opened, so a failed download
            never leaves an error page on disk under ``file_name``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    with open(file_name, 'wb') as file:
        file.write(response.content)


# Remote locations of the MACULA Greek TSV and the MARBLE domain-label mapping,
# and the local file names they are stored under.
file1_url = 'https://raw.githubusercontent.com/Clear-Bible/macula-greek/main/Nestle1904/TSV/macula-greek.tsv'
file2_url = 'https://raw.githubusercontent.com/Clear-Bible/macula-greek/main/sources/MARBLE/SDBG/marble-domain-label-mapping.json'
file1_name = 'macula-greek.tsv'
file2_name = 'marble-domain-label-mapping.json'

# Fetch each file only when it is not already in the current working directory.
for _source_url, _local_name in ((file1_url, file1_name), (file2_url, file2_name)):
    if _local_name in os.listdir():
        continue
    download_file(_source_url, _local_name)

# Import Macula Greek data.
# Every column is read as text via dtype=str. The previous
# `converters={'*': str}` had no effect: converter keys are column names or
# positions, and no column is called '*', so pandas still inferred dtypes.
# Empty cells become the sentinel string 'missing'.
mg = pd.read_csv('macula-greek.tsv', index_col='xml:id', sep='\t',
                 header=0, dtype=str).fillna('missing')

# Extract book, chapter, and verse into separate columns
mg[['book', 'chapter', 'verse']] = mg['ref'].str.extract(
    r'(\d?[A-Z]+)\s(\d+):(\d+)')

# Add columns for book + chapter, and book + chapter + verse for easier grouping
mg['book_chapter'] = mg['book'] + ' ' + mg['chapter'].astype(str)
mg['book_chapter_verse'] = mg['book_chapter'] + ':' + mg['verse'].astype(str)

# Import domain-label mapping

# Open the JSON file. The encoding is pinned to UTF-8 so decoding does not
# depend on the platform's default locale encoding.
with open('marble-domain-label-mapping.json', 'r', encoding='utf-8') as f:

    # Load the contents of the file as a dictionary
    domain_labels = json.load(f)

# Sentinel entries for words that carry no domain value.
domain_labels['missing'] = 'no domain'
domain_labels['nan'] = 'no domain'

# Use domain labels to create a new column


def get_domain_label(domain_string_number):
    """Map a string of space-separated domain codes to their labels.

    Args:
        domain_string_number: One or more domain codes separated by single
            spaces.

    Returns:
        A list with the ``domain_labels`` entry for each code, in input order.

    Raises:
        KeyError: If a code has no entry in ``domain_labels``.
    """
    resolved = []
    for code in domain_string_number.split(' '):
        resolved.append(domain_labels[code])
    return resolved


mg['domain_label'] = mg['domain'].apply(get_domain_label)

# to get data for a specific word, use the following code:
# mg.loc['n40001001002'].to_dict() where n40001001002 is the word ID

# Human-readable description for each MACULA word attribute, keyed by the
# attribute (column) name. generate_prosaic_context() uses the keys as its
# default field list and interpolates some of the descriptions into its output.
attribute_descriptions = {
    "after": "Encodes the following character, including a blank space.",
    "articular": "'true' if the word has an article (i.e., modified by the word 'the').",
    "case": "Grammatical case: nominative, genitive, dative, accusative, or vocative",
    "class": "On words, the class is the word's part of speech",
    "cltype": "Explicitly marks Verbless Clauses, Verb Elided Clauses, and Minor Clauses",
    "degree": "A derivative lexical category, indicating the degree of the adjective",
    "discontinuous": "'true' if the word is discontinuous with respect to sentence order due to reordering in the syntax tree",
    "domain": "Semantic domain information from the Semantic Dictionary of Biblical Greek (SDBG)",
    "frame": "Frames of verbs, refers to the arguments of the verb",
    "gender": "Grammatical gender values",
    "gloss": "SIL data, not Berean",
    "lemma": "Form of the word as it appears in a dictionary.",
    "ln": "The semantic domain entry in Louw and Nida's, 'Greek-English Lexicon of the New Testament: Based on Semantic Domains'.",
    "mood": "Grammatical mood",
    "morph": "Morphological parsing codes",
    "normalized": "The normalized form of the token (i.e., no trailing or leading punctuation or accent shifting depending on context)",
    "number": "Grammatical number",
    "person": "Grammatical person",
    "ref": "Verse!word reference to this edition of the Nestle1904 text by USFM id",
    "referent": "The xml:id of the node to which a pronoun (i.e., 'he') refers. Note that some of these IDs are not word IDs but rather phrase or clause IDs.",
    "role": "The clause-level role of the word.",
    "strong": "Strong's number for the lemma",
    "subjref": "The xml:id of the node that is the implied subject of a verb (for verbs without an explicit subject). Note that some of these IDs are not word IDs but rather phrase or clause IDs.",
    "tense": "Grammatical tense form",
    "text": "Text content associated with the ID",
    "type": "Indicates different types of pronominals",
    "voice": "Grammatical voice",
    "xml:id": "XML ids occur on every word and encode the corpus ('n' for New Testament), the book (40 for Matthew), the chapter (001), verse (001), and word (001)."
}

# Prose definitions of discourse annotation types. Each key is an annotation
# label and maps to a dict holding a single 'description' string.
# generate_prosaic_context() looks entries up by the labels returned from
# get_discourse_annotation_types(), so the keys must match those labels exactly.
# NOTE(review): 'Historical Perfect', 'Futuristic Present' and 'Historical Present'
# carry an identical description — confirm this is intended and not a copy-paste.
discourse_types = {
    'Main clauses': {'description': 'Main clauses are the top-level clauses in a sentence. They are the clauses that are not embedded in other clauses.'},
    'Historical Perfect': {'description': 'Highlights not the speech or act to which it refers but the event(s) that follow (DFNTG §12.2).'},
    'Specific Circumstance': {'description': 'The function of ἐγενετο ‘it came about’ and an immediately following temporal expression varies with the author (see DFNTG §10.3). In Matthew’s Gospel, it usually marks major divisions in the book (e.g. Mt 7:28). In Luke-Acts, in contrast, ‘it picks out from the general background the specific circumstance for the foreground events that are to follow’ (ibid.), as in Acts 9:37 (see also Mt 9:10).'},
    'Verb Focus+': {'description': 'Verb in final position in clause demonstrates verb focus.'},
    'Articular Pronoun': {'description': 'Articular pronoun, which often introduces an ‘intermediate step’ in a reported conversation.'},
    'Topical Genitive': {'description': 'A genitival constituent that is nominal is preposed within the noun phrase for two purposes: 1) to bring it into focus; 2) within a point of departure, to indicate that it is the genitive in particular which relates to a corresponding constituent of the context.(DFNTG §4.5)'},
    'Embedded DFE': {'description': "'Dominant focal elements' embedded within a constituent in P1."},
    'Reported Speech': {'description': 'Reported speech.'},
    'Ambiguous': {'description': 'Marked but ambiguous constituent order.'},
    'Over-encoding': {'description': 'Any instance in which more encoding than the default is employed to refer to an active participant or prop. Over-encoding is used in Greek, as in other languages: to mark the beginning of a narrative unit (e.g. Mt 4:5); and to highlight the action or speech concerned (e.g. Mt 4:7).'},
    'Highlighter': {'description': 'Presentatives - Interjections such as ἰδού and ἴδε ‘look!, see!’ typically highlight what immediately follows (Narr §5.4.2, NonNarr §7.7.3).'},
    'Referential PoD': {'description': 'Pre-verbal topical subject other referential point of departure (NARR §3.1, NonNarr §4.3, DFNTG §§2.2, 2.8; as in 1 Th 1:6).'},
    'annotations': {'description': 'Inline annotations.'},
    'Left-Dislocation': {'description': 'Point of departure - A type of SENTENCE in which one of the CONSTITUENTS appears in INITIAL position and its CANONICAL position is filled by a PRONOUN or a full LEXICAL NOUN PHRASE with the same REFERENCE, e.g. John, I like him/the old chap.”'},
    'Focus+': {'description': 'Constituents placed in P2 to give them focal prominence.'},
    'Tail-Head linkage': {'description': 'Point of departure involving renewal - Tail-head linkage involves “the repetition in a subordinate clause, at the beginning (the ‘head’) of a new sentence, of at least the main verb of the previous sentence (the tail)” (Dooley & Levinsohn 2001:16).'},
    'Postposed them subject': {'description': 'When a subject is postposed to the end of its clause (following nominals or adjuncts), it is marked ThS+ (e.g. Lk 1:41 [twice]). Such postposing typically marks as salient the participant who performs the next event in chronological sequence in the story (see Levinsohn 2014).'},
    'EmbeddedRepSpeech': {'description': 'Embedded reported speech - speech that is reported within a reported speech.'},
    'Futuristic Present': {'description': 'Highlights not the speech or act to which it refers but the event(s) that follow (DFNTG §12.2).'},
    'OT quotes': {'description': 'Old Testament quotations.'},
    'Constituent Negation': {'description': 'Negative pro-forms when they are in P2 indicate that the constituent has been negated rather than the clause as a whole.'},
    'Split Focal': {'description': 'The second part of a focal constituent with only the first part in P2 (NonNarr §5.5, DFNTG §4.4).'},
    'Right-Dislocated': {'description': 'Point of departure - A type of SENTENCE in which one of the CONSTITUENTS appears in FINAL position and its CANONICAL position is filled by a PRONOUN with the same REFERENCE, e.g. ... He’s always late, that chap.'},
    'Appositive': {'description': 'Appositive'},
    'Situational PoD': {'description': 'Situational point of departure (e.g. temporal, spatial, conditional―(NARR §3.1, NonNarr §4.3, DFNTG §§2.2, 2.8; as in 1 Th 3:4).'},
    'Historical Present': {'description': 'Highlights not the speech or act to which it refers but the event(s) that follow (DFNTG §12.2).'},
    'Noun Incorporation': {'description': 'Some nominal objects that appear to be in P2 may precede their verb because they have been “incorporated” (Rosen 1989) in the verb phrase. Typically, the phrase consists of an indefinite noun and a “light verb” such as “do, give, have, make, take” (Wikipedia entry on Light Verbs).'},
    'Thematic Prominence': {'description': 'Thematic prominence - In Greek, prominence is given to active participants and props who are the current centre of attention (NARR §4.6) by omitting the article (DFNTG §§9.2.3-9.4), by adding αυτος ‘-self’ (e.g. in 1 Th 3:11), by using the proximal demonstrative οὗτος (NARR chap. 9, Appendix 1; e.g. in 3:3), and by postposing the constituent concerned (e.g. Mt 14:29). If such constituents are NOT in postion P1, they are demonstrating topical prominence.'},
    'Cataphoric Focus': {'description': 'An expression that points forward to and highlights something which ‘is about to be expressed.’'},
    'Cataphoric referent': {'description': 'The clause or sentence to which a cataphoric reference refers when NOT introduced with ὅτι or ἵνα.'},
    'DFE': {'description': 'Constituents that may be moved from their default position to the end of a proposition to give them focal prominence include verbs, pronominals and objects that follow adjuncts (NonNarr §5.3, DFNTG §3.5). Such constituents, also called ‘dominant focal elements’or DFEs (Heimedinger 1999:167).'},
    'Embedded Focus+': {'description': 'A constituent of a phrase or embedded clause preposed for focal prominence.'}
}

# Distinct "BOOK chapter:verse" references present in the MACULA data, as a set.
unique_book_chapter_verse = {*mg['book_chapter_verse'].unique()}



Loading translation model and tokenizer...
Loading inference model and tokenizer...


Some weights of OpenAIGPTLMHeadModel were not initialized from the model checkpoint at openai-gpt and are newly initialized: ['position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of OpenAIGPTLMHeadModel were not initialized from the model checkpoint at openai-gpt and are newly initialized: ['position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading MACULA Greek data...


In [3]:
# GraphQL endpoint queried by get_discourse_annotation_types().
# NOTE(review): the hostname contains "qa" — presumably a QA deployment; confirm
# before relying on it being available.
ENDPOINT = 'https://macula-atlas-api-qa-25c5xl4maa-uk.a.run.app/graphql/'
# The request body is sent as JSON.
headers = {"Content-Type": "application/json"}

# GraphQL document: requests annotation features (label, uri), their instances,
# and the instances' word tokens (ref, wordValue, xmlId, lemma). The three
# $filters variables are supplied per request by the caller.
query = """
query AnnotationFeatures($filters1: AnnotationFeatureFilter, $filters2: AnnotationFilter, $filters3: WordTokenFilter ) {
  annotationFeatures(filters: $filters1) {
    label
    uri
    instances(filters: $filters2) {
      uri
      tokens(filters: $filters3) {
        ref
        wordValue
        xmlId
        lemma {
          value
          prose
        }

      }
    }
  }
}
"""

def get_discourse_annotation_types(xmlId, timeout=30):
    """Return the labels of the annotation features reported for a word.

    The word's verse reference is taken from ``mg`` (the part of ``ref`` before
    the ``!``) and sent, together with the word id, as filters of the GraphQL
    ``query`` posted to ``ENDPOINT``.

    Args:
        xmlId: Index label of the word in ``mg``; also sent as the token filter.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        A list with the ``label`` of every returned annotation feature; empty
        when the API reports none.

    Raises:
        KeyError: If ``xmlId`` is not in ``mg``.
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        RuntimeError: If the reply carries no ``data`` payload (for example a
            GraphQL-level error); the server's ``errors`` are included.
    """
    tokenData = mg.loc[xmlId].to_dict()
    passage = tokenData['ref'].split('!')[0]

    variables = {
        "filters1": {"reference": passage},
        "filters2": {"reference": passage},
        "filters3": {"xmlId": xmlId},
    }
    payload = {'query': query, 'variables': variables}

    response = requests.post(ENDPOINT, json=payload, headers=headers,
                             timeout=timeout)
    # Fail with the HTTP status instead of a JSON decode error on an error page.
    response.raise_for_status()
    response_data = response.json()

    data = response_data.get("data")
    if not data:
        # Without this check a failed query surfaced as an opaque
        # KeyError/TypeError on the subscripting below.
        raise RuntimeError(
            f"No data returned for {xmlId}: {response_data.get('errors')}")

    annotation_features = data.get("annotationFeatures") or []
    return [feature["label"] for feature in annotation_features]

In [24]:
def generate_prosaic_context(word_id, selected_fields=None):
    """Describe one word of the MACULA data as sectioned, bulleted prose.

    Args:
        word_id: Index label of the word in ``mg``.
        selected_fields: Attribute names to include. When empty or None, every
            key of ``attribute_descriptions`` is used.

    Returns:
        A string made of ``## <section>`` headers, each followed by its bullet
        lines. Sections without any content are omitted.

    Raises:
        KeyError: If ``word_id`` is not in ``mg``.
    """
    word_data = mg.loc[word_id].to_dict()
    lemma = word_data['lemma']

    if not selected_fields:
        selected_fields = list(attribute_descriptions.keys())

    context_data = {
        "1. Lexical features": [],
        "2. Syntactic context and function": [],
        "3. Discourse context": [],
        "4. Social context": [],  # To be implemented
        "5. Cultural/encyclopedic knowledge": []
    }
    lexical = context_data["1. Lexical features"]
    syntactic = context_data["2. Syntactic context and function"]
    discourse = context_data["3. Discourse context"]
    cultural = context_data["5. Cultural/encyclopedic knowledge"]

    grammatical_keys = ("person", "number", "gender", "case", "tense",
                        "voice", "mood", "degree", "type")

    for key in selected_fields:
        value = word_data.get(key)
        # Skip attributes that are absent or hold a "no value" sentinel.
        if value in (None, 'missing', 'nan'):
            continue

        if key == "class":
            lexical.append(f"- {lemma} is a {value},")
        elif key == "gloss":
            lexical.append(f"- meaning \"{value}.\"")
        elif key == "lemma":
            lexical.append(f"- The lemma form of this word is {value},")
        elif key == "morph":
            # TODO: expand morphological parse codes into prose - although, is this necessary given the other data points?
            lexical.append(f"- and it is parsed as a {value}")
        elif key == "strong":
            lexical.append(f"- with a Strong's number of {value}.")
        elif key in grammatical_keys:
            syntactic.append(f"- {attribute_descriptions[key]}: {value},")
        elif key == "ln":
            # Was `key in ("ln")`: a substring test against the string "ln",
            # not membership in a tuple.
            cultural.append(f"- {attribute_descriptions[key]}: {value},")
        elif key == "domain":
            # Was `key in ("domain_label")`, which only matched because
            # "domain" is a substring of that string. A word may carry several
            # space-separated domain codes, so each one is looked up on its
            # own; a code without a label is shown as-is rather than raising.
            labels = [domain_labels.get(code, code)
                      for code in str(value).split()]
            joined_labels = "; ".join(labels)
            cultural.append(f"- {attribute_descriptions[key]}: {joined_labels},")

    discourse_features = get_discourse_annotation_types(word_id)
    if discourse_features:
        discourse.append(
            f"This word functions within {len(discourse_features)} discourse features:")
        for feature in discourse_features:
            # The API may return a label with no local definition; list it
            # without a description instead of failing the whole request.
            description = discourse_types.get(feature, {}).get('description')
            if description:
                discourse.append(f"- {feature} is defined as {description}")
            else:
                discourse.append(f"- {feature}")

    output_lines = []
    for header, sentences in context_data.items():
        if sentences:
            output_lines.append(f"## {header}\n")
            output_lines.append("\n".join(sentences))
            output_lines.append("\n")

    prosaic_context = "".join(output_lines)
    return prosaic_context


In [25]:
# Define some helper functions

def get_contextual_data(tokenId):
    """Build the prose context for ``tokenId``, print it, and return it."""
    context_text = generate_prosaic_context(tokenId)
    print(context_text)
    return context_text


def get_verse_content(verseRef, dataframe):
    """Return the word tokens of one verse.

    Args:
        verseRef: Verse reference to look up, compared against the
            ``book_chapter_verse`` column. Surrounding whitespace on a string
            reference is ignored.
        dataframe: Frame with ``book_chapter_verse``, ``text`` and ``gloss``
            columns; its index supplies the token ids.

    Returns:
        A list of ``{"text", "gloss", "id"}`` dicts in row order, or an empty
        list when no row matches.
    """
    if isinstance(verseRef, str):
        # Tolerate stray spaces around a reference typed into a text box.
        verseRef = verseRef.strip()

    # The boolean filter already yields no rows for an unknown reference, so
    # building a set of every reference on each call is unnecessary.
    matching_rows = dataframe[dataframe['book_chapter_verse'] == verseRef]
    return [
        {"text": text, "gloss": gloss, "id": idx}
        for idx, text, gloss in zip(
            matching_rows.index, matching_rows['text'], matching_rows['gloss'])
    ]


# def answer_question(question, context):
#     input_dict = {'question': question, 'context': context}
#     answer = qa_pipeline(input_dict)
#     return answer['answer']

def generate_answer(prompt, model, tokenizer, temperature=0.7, max_length=100):
    """Sample a continuation of ``prompt`` and return the decoded sequence.

    Args:
        prompt: Text to continue.
        model: Object exposing ``generate(input_ids, ...)``.
        tokenizer: Object exposing ``encode`` and ``decode``.
        temperature: Sampling temperature forwarded to ``generate``.
        max_length: Maximum number of tokens to generate beyond the prompt.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    # The budget is passed as `max_new_tokens`. As `max_length` it counted the
    # prompt tokens too, so a prompt that embeds a long context used up the
    # whole budget and left no room for an answer.
    output = model.generate(input_ids, max_new_tokens=max_length,
                            temperature=temperature, do_sample=True)
    return tokenizer.decode(output[0], skip_special_tokens=True)

def answer_question(question, context, temperature=0.7):
    """Answer ``question`` given ``context`` using the module-level model.

    The context and question are embedded in a fixed prompt template, which is
    passed to ``generate_answer`` with the given sampling temperature.
    """
    prompt = f"Context for question: {context}\nQuestion: {question}\nAnswer to question: "
    return generate_answer(prompt, model, tokenizer, temperature)


def translate_text(text):
    """Translate ``text`` with the German, Spanish and French pipelines.

    Returns:
        A ``(german, spanish, french)`` tuple holding the ``translation_text``
        of each pipeline's first result.
    """
    translators = (deu_translator, spa_translator, fra_translator)
    deu_result, spa_result, fra_result = (
        translate(text)[0]['translation_text'] for translate in translators)
    return deu_result, spa_result, fra_result


def gradio_get_verse_content(verseRef):
    """Gradio callback: look up the tokens of ``verseRef`` in the ``mg`` frame."""
    return get_verse_content(verseRef, mg)


def gradio_wrapper(inputs, context, temperature=0.7):
    """Gradio callback: answer the question in ``inputs`` using ``context``.

    Args:
        inputs: The question text.
        context: Context text placed in the prompt ahead of the question.
        temperature: Sampling temperature for generation.

    Returns:
        The answer string. The earlier ``return answer,`` carried a trailing
        comma left over from the disabled multi-output version and therefore
        returned a one-element tuple, although the callback is wired to a
        single output component.
    """
    question = inputs
    answer = answer_question(question, context, temperature)
    # deu_translation, spa_translation, fra_translation = translate_text(answer)
    return answer  # , deu_translation, spa_translation, fra_translation


In [26]:
# Gradio UI: look up a verse, build the prose context for a word, then ask the
# language model a question about that context.
demo = gr.Blocks()

with demo:
    gr.Markdown("Generate insights about a given word using MACULA data")
    with gr.Row():
        verse_reference = gr.Textbox(lines=1, label="Verse Reference", placeholder="Enter verse reference here (e.g. 'JHN 3:16')", value="JHN 3:16")
        text_output = gr.Textbox(placeholder="Verse data will appear here", max_lines=10)
    get_verse_content_button = gr.Button("Get Verse Content")

    get_verse_content_button.click(
        gradio_get_verse_content, inputs=verse_reference, outputs=text_output)

    word_id_input = gr.Textbox(lines=1, label="Word ID", placeholder="Enter word ID here (e.g. 'n43003016012')", value="n43003016012")
    context_output = gr.Textbox(placeholder="Context will appear here", max_lines=10)

    get_context_button = gr.Button("Get Context")

    get_context_button.click(
        get_contextual_data, inputs=word_id_input, outputs=context_output)

    # Temperature slider. The starting position is given with `value`; `default`
    # is not the initial-value parameter of gr.Slider, so 0.7 was not applied.
    temperature_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature")

    question_input = gr.Textbox(lines=2, label="Question", placeholder="Enter question here")
    answer_output = gr.Textbox(placeholder="Answer will appear here", max_lines=10)

    get_answer_button = gr.Button("Get Answer")

    # The generated context box doubles as the context input for the question.
    get_answer_button.click(
        gradio_wrapper, inputs=[question_input, context_output, temperature_slider], outputs=answer_output)

demo.launch()



Running on local URL:  http://127.0.0.1:7867

To create a public link, set `share=True` in `launch()`.




## 1. Lexical features
- μονογενής is a adj,
- meaning "only begotten."
- The lemma form of this word is μονογενής,
- and it is parsed as a A-ASM
- with a Strong's number of 3439.
## 2. Syntactic context and function
- Grammatical case: nominative, genitive, dative, accusative, or vocative: accusative,
- Grammatical gender values: masculine,
- Grammatical number: singular,
## 3. Discourse context
This word functions within 3 discourse features:
- Focus+ is defined as Constituents placed in P2 to give them focal prominence.
- Verb Focus+ is defined as Verb in final position in clause demonstrates verb focus.
- Main clauses is defined as Main clauses are the top-level clauses in a sentence. They are the clauses that are not embedded in other clauses.
## 5. Cultural/encyclopedic knowledge
- Semantic domain information from the Semantic Dictionary of Biblical Greek (SDBG): Distinctive, Unique,
- The semantic domain entry in Louw and Nida's, 'Greek-English Lexicon of the New Testament: Base