In [2]:
!pip3.10 install gensim

Collecting gensim
  Using cached gensim-4.3.1-cp310-cp310-macosx_11_0_arm64.whl (24.0 MB)
Collecting smart-open>=1.8.1 (from gensim)
  Using cached smart_open-6.3.0-py3-none-any.whl (56 kB)
Installing collected packages: smart-open, gensim
Successfully installed gensim-4.3.1 smart-open-6.3.0


In [3]:
import requests, json, re, os
import pandas as pd
from gensim.models.tfidfmodel import TfidfModel

## Populate Greek token data

In [4]:

def download_file(url, file_name):
    response = requests.get(url)
    with open(file_name, 'wb') as file:
        file.write(response.content)


file1_url = 'https://raw.githubusercontent.com/Clear-Bible/macula-greek/main/Nestle1904/TSV/macula-greek.tsv'
file2_url = 'https://raw.githubusercontent.com/Clear-Bible/macula-greek/main/sources/MARBLE/SDBG/marble-domain-label-mapping.json'
file1_name = 'macula-greek.tsv'
file2_name = 'marble-domain-label-mapping.json'

if file1_name not in os.listdir():
    download_file(file1_url, file1_name)

if file2_name not in os.listdir():
    download_file(file2_url, file2_name)

# Import Macula Greek data
mg = pd.read_csv('macula-greek.tsv', index_col='xml:id', sep='\t',
                 header=0, converters={'*': str}).fillna('missing')
# add an 'id' column
mg['id'] = mg.index

# mg['domain'] = mg['domain'].astype(str).fillna('missing')

# Extract book, chapter, and verse into separate columns
mg[['book', 'chapter', 'verse']] = mg['ref'].str.extract(
    r'(\d?[A-Z]+)\s(\d+):(\d+)')

# Add columns for book + chapter, and book + chapter + verse for easier grouping
mg['book_chapter'] = mg['book'] + ' ' + mg['chapter'].astype(str)
mg['book_chapter_verse'] = mg['book_chapter'] + ':' + mg['verse'].astype(str)

# Import domain-label mapping

# Open the JSON file
with open('marble-domain-label-mapping.json', 'r') as f:

    # Load the contents of the file as a dictionary
    domain_labels = json.load(f)

domain_labels['missing'] = 'no domain'
domain_labels['nan'] = 'no domain'

# Use domain labels to create a new column


def get_domain_label(domain_string_number):
    labels = [domain_labels[label]
              for label in domain_string_number.split(' ')]
    return labels


mg['domain_label'] = mg['domain'].apply(get_domain_label)
mg.head()

Unnamed: 0_level_0,ref,role,class,type,gloss,text,after,lemma,normalized,strong,...,frame,subjref,referent,id,book,chapter,verse,book_chapter,book_chapter_verse,domain_label
xml:id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n40001001001,MAT 1:1!1,missing,noun,common,[The] book,Βίβλος,,βίβλος,Βίβλος,976,...,missing,missing,missing,n40001001001,MAT,1,1,MAT 1,MAT 1:1,[Written Language]
n40001001002,MAT 1:1!2,missing,noun,common,of [the] genealogy,γενέσεως,,γένεσις,γενέσεως,1078,...,missing,missing,missing,n40001001002,MAT,1,1,MAT 1,MAT 1:1,[Kinship Relations Involving Successive Genera...
n40001001003,MAT 1:1!3,missing,noun,proper,of Jesus,Ἰησοῦ,,Ἰησοῦς,Ἰησοῦ,2424,...,missing,missing,missing,n40001001003,MAT,1,1,MAT 1,MAT 1:1,[Persons]
n40001001004,MAT 1:1!4,missing,noun,proper,Christ,Χριστοῦ,,Χριστός,Χριστοῦ,5547,...,missing,missing,missing,n40001001004,MAT,1,1,MAT 1,MAT 1:1,[Persons]
n40001001005,MAT 1:1!5,missing,noun,common,son,υἱοῦ,,υἱός,υἱοῦ,5207,...,missing,missing,missing,n40001001005,MAT,1,1,MAT 1,MAT 1:1,[Kinship Relations Involving Successive Genera...


## Populate speaker quotation data locally

In [5]:
file_3_url = 'https://raw.githubusercontent.com/Clear-Bible/speaker-quotations/main/json/SpeakerProjections-clear.json'
file_4_url = 'https://raw.githubusercontent.com/Clear-Bible/speaker-quotations/main/json/character_detail.semantic_data.json'
file_3_name = 'SpeakerProjections-clear.json'
file_4_name = 'character_detail.semantic_data.json' # stores info about each unique character id (unique string value)

if file_3_name not in os.listdir():
    download_file(file_3_url, file_3_name)

if file_4_name not in os.listdir():
    download_file(file_4_url, file_4_name)
    
# Create a dataframe from the SpeakerProjections-clear.json file
with open('SpeakerProjections-clear.json', 'r') as f:
    speaker_projections = json.load(f)
    speaker_data = pd.DataFrame(speaker_projections)
    
# Create a dataframe from the character_detail.semantic_data.json file
with open('character_detail.semantic_data.json', 'r') as f:
    # this data is an array of JSON objects
    character_detail = json.load(f)
    character_data = pd.DataFrame(character_detail)


In [6]:
speaker_data.head() # I will reformat this data to make it easier to work with below

Unnamed: 0,GEN 1:3|GEN 1:3|God,GEN 1:6|GEN 1:6|God,GEN 1:9|GEN 1:9|God,GEN 1:11|GEN 1:11|God,GEN 1:14|GEN 1:15|God,GEN 1:20|GEN 1:20|God,GEN 1:22|GEN 1:22|God,GEN 1:24|GEN 1:24|God,GEN 1:26|GEN 1:26|God,GEN 1:28|GEN 1:30|God,...,REV 21:3|REV 21:4|voice from throne,REV 21:5|REV 21:8|God,REV 21:9|REV 21:9|angel (one of the seven),REV 22:6|REV 22:6|angel (one of the seven),REV 22:7|REV 22:7|Jesus,REV 22:9|REV 22:11|angel (one of the seven),REV 22:12|REV 22:16|Jesus,REV 22:17|REV 22:17|him who hears,"REV 22:17|REV 22:17|Holy Spirit, the",REV 22:20|REV 22:20|Jesus
SpeakerInstance,"{'StartVerse': 'GEN 1:3', 'EndVerse': 'GEN 1:3...","{'StartVerse': 'GEN 1:6', 'EndVerse': 'GEN 1:6...","{'StartVerse': 'GEN 1:9', 'EndVerse': 'GEN 1:9...","{'StartVerse': 'GEN 1:11', 'EndVerse': 'GEN 1:...","{'StartVerse': 'GEN 1:14', 'EndVerse': 'GEN 1:...","{'StartVerse': 'GEN 1:20', 'EndVerse': 'GEN 1:...","{'StartVerse': 'GEN 1:22', 'EndVerse': 'GEN 1:...","{'StartVerse': 'GEN 1:24', 'EndVerse': 'GEN 1:...","{'StartVerse': 'GEN 1:26', 'EndVerse': 'GEN 1:...","{'StartVerse': 'GEN 1:28', 'EndVerse': 'GEN 1:...",...,"{'StartVerse': 'REV 21:3', 'EndVerse': 'REV 21...","{'StartVerse': 'REV 21:5', 'EndVerse': 'REV 21...","{'StartVerse': 'REV 21:9', 'EndVerse': 'REV 21...","{'StartVerse': 'REV 22:6', 'EndVerse': 'REV 22...","{'StartVerse': 'REV 22:7', 'EndVerse': 'REV 22...","{'StartVerse': 'REV 22:9', 'EndVerse': 'REV 22...","{'StartVerse': 'REV 22:12', 'EndVerse': 'REV 2...","{'StartVerse': 'REV 22:17', 'EndVerse': 'REV 2...","{'StartVerse': 'REV 22:17', 'EndVerse': 'REV 2...","{'StartVerse': 'REV 22:20', 'EndVerse': 'REV 2..."
Projections,"[{'StartWord': 'o010010030031', 'EndWord': 'o0...","[{'StartWord': 'o010010060031', 'EndWord': 'o0...","[{'StartWord': 'o010010090031', 'EndWord': 'o0...","[{'StartWord': 'o010010110031', 'EndWord': 'o0...","[{'StartWord': 'o010010140031', 'EndWord': 'o0...","[{'StartWord': 'o010010200031', 'EndWord': 'o0...","[{'StartWord': 'o010010220051', 'EndWord': 'o0...","[{'StartWord': 'o010010240031', 'EndWord': 'o0...","[{'StartWord': 'o010010260031', 'EndWord': 'o0...","[{'StartWord': 'o010010280071', 'EndWord': 'o0...",...,"[{'StartWord': 'n66021003009', 'EndWord': 'n66...","[{'StartWord': 'n66021005008', 'EndWord': 'n66...","[{'StartWord': 'n66021009025', 'EndWord': 'n66...","[{'StartWord': 'n66022006004', 'EndWord': 'n66...","[{'StartWord': 'n66022007002', 'EndWord': 'n66...","[{'StartWord': 'n66022009004', 'EndWord': 'n66...","[{'StartWord': 'n66022012001', 'EndWord': 'n66...","[{'StartWord': 'n66022017008', 'EndWord': 'n66...","[{'StartWord': 'n66022017013', 'EndWord': 'n66...","[{'StartWord': 'n66022020005', 'EndWord': 'n66..."


In [7]:
character_data.head()

Unnamed: 0,CharacterId,MaxSpeakers,Gender,Age,Comment,SDBH,LouwNida,FCBHCharacter,Divinity
0,2 other disciples,2,Male,Adult,"Not Peter (Simon), Thomas, Nathaniel, James, o...",,,,
1,250 Israelite leaders,250,Male,Adult,,,,,
2,a Jew,1,Male,Adult,,,,,
3,Aaron,1,Male,Adult,,[000172001001000],[93.1],,
4,Abednego,1,Male,Adult,original Hebrew name: Azariah,,,,


In [8]:
print(speaker_data['GEN 1:3|GEN 1:3|God'].to_dict())

{'SpeakerInstance': {'StartVerse': 'GEN 1:3', 'EndVerse': 'GEN 1:3', 'Verses': ['GEN 1:3'], 'CharacterIds': ['God'], 'Alias': 'God (Yahweh)', 'QuoteType': 'Normal', 'DefaultCharacterId': 'God', 'Alternates': []}, 'Projections': [{'StartWord': 'o010010030031', 'EndWord': 'o010010030041', 'ClearSubjectReferents': ['o010010030021'], 'ClearSubjectReferentLabels': ['God'], 'Depth': 2, 'StartVerse': 'GEN 1:3', 'EndVerse': 'GEN 1:3', 'Verses': ['GEN 1:3'], 'Words': [{'Id': 'o010010030031', 'Text': 'יְהִ֣י\u200e', 'Depth': 2}, {'Id': 'o010010030041', 'Text': 'א֑וֹר\u200e', 'Depth': 2}]}]}


In [9]:
# Transpose the DataFrame
transposed_speaker_data = speaker_data.transpose()

# Reset the index
transposed_speaker_data.reset_index(inplace=True)

# Rename the columns
transposed_speaker_data.columns = ["row_id", "instance_data", "projections"]

print(transposed_speaker_data.iloc[0].to_dict())


{'row_id': 'GEN 1:3|GEN 1:3|God', 'instance_data': {'StartVerse': 'GEN 1:3', 'EndVerse': 'GEN 1:3', 'Verses': ['GEN 1:3'], 'CharacterIds': ['God'], 'Alias': 'God (Yahweh)', 'QuoteType': 'Normal', 'DefaultCharacterId': 'God', 'Alternates': []}, 'projections': [{'StartWord': 'o010010030031', 'EndWord': 'o010010030041', 'ClearSubjectReferents': ['o010010030021'], 'ClearSubjectReferentLabels': ['God'], 'Depth': 2, 'StartVerse': 'GEN 1:3', 'EndVerse': 'GEN 1:3', 'Verses': ['GEN 1:3'], 'Words': [{'Id': 'o010010030031', 'Text': 'יְהִ֣י\u200e', 'Depth': 2}, {'Id': 'o010010030041', 'Text': 'א֑וֹר\u200e', 'Depth': 2}]}]}


In [10]:
# Normalize the 'instance_data' column
flattened_instance_data = pd.json_normalize(transposed_speaker_data['instance_data'])

# Merge the normalized DataFrame with the original transposed DataFrame
merged_speaker_data = pd.concat([transposed_speaker_data.drop(columns=['instance_data']), flattened_instance_data], axis=1)

print(merged_speaker_data.iloc[0].to_dict())

{'row_id': 'GEN 1:3|GEN 1:3|God', 'projections': [{'StartWord': 'o010010030031', 'EndWord': 'o010010030041', 'ClearSubjectReferents': ['o010010030021'], 'ClearSubjectReferentLabels': ['God'], 'Depth': 2, 'StartVerse': 'GEN 1:3', 'EndVerse': 'GEN 1:3', 'Verses': ['GEN 1:3'], 'Words': [{'Id': 'o010010030031', 'Text': 'יְהִ֣י\u200e', 'Depth': 2}, {'Id': 'o010010030041', 'Text': 'א֑וֹר\u200e', 'Depth': 2}]}], 'StartVerse': 'GEN 1:3', 'EndVerse': 'GEN 1:3', 'Verses': ['GEN 1:3'], 'CharacterIds': ['God'], 'Alias': 'God (Yahweh)', 'QuoteType': 'Normal', 'DefaultCharacterId': 'God', 'Alternates': [], 'Delivery': nan}


### Turn speaker quotation data into one row per projection

In [11]:
# Create an empty DataFrame to store the result
expanded_speaker_data = pd.DataFrame()

# Iterate through the rows in the merged_speaker_data DataFrame
for idx, row in merged_speaker_data.iterrows():
    projections = row['projections']
    
    # Iterate through the projections
    for proj_idx, projection in enumerate(projections):
        # Create a new row with the inherited speaker instance data
        new_row = row.drop('projections').to_dict()
        new_row.update(projection)
        
        # Set the row ID with the projection index
        new_row['row_id'] = f"{row['row_id']}|{proj_idx}"
        
        # Append the new row to the expanded_speaker_data DataFrame
        expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)

AttributeError: 'DataFrame' object has no attribute 'append'

### Add 'tokens' and 'token_ids' columns to speaker quotation data

In [None]:
# Add 'tokens' and 'token_ids' columns with default empty lists
expanded_speaker_data = expanded_speaker_data.assign(tokens=[[]]*len(expanded_speaker_data), token_ids=[[]]*len(expanded_speaker_data))

# Iterate through the rows in the expanded_speaker_data DataFrame
for idx, row in expanded_speaker_data.iterrows():
    words = row['Words']
    
    # Extract the 'Text' and 'Id' values from the 'Words' data
    tokens = [word['Text'] for word in words]
    token_ids = [word['Id'] for word in words]
    
    # Assign the 'tokens' and 'token_ids' fields to the row
    expanded_speaker_data.at[idx, 'tokens'] = tokens
    expanded_speaker_data.at[idx, 'token_ids'] = token_ids


In [None]:
print(expanded_speaker_data.iloc[0].to_dict())

{'row_id': 'GEN 1:3|GEN 1:3|God|0', 'StartVerse': 'GEN 1:3', 'EndVerse': 'GEN 1:3', 'Verses': ['GEN 1:3'], 'CharacterIds': ['God'], 'Alias': 'God (Yahweh)', 'QuoteType': 'Normal', 'DefaultCharacterId': 'God', 'Alternates': [], 'Delivery': nan, 'StartWord': 'o010010030031', 'EndWord': 'o010010030041', 'ClearSubjectReferents': ['o010010030021'], 'ClearSubjectReferentLabels': ['God'], 'Depth': 2, 'Words': [{'Id': 'o010010030031', 'Text': 'יְהִ֣י\u200e', 'Depth': 2}, {'Id': 'o010010030041', 'Text': 'א֑וֹר\u200e', 'Depth': 2}], 'tokens': ['יְהִ֣י\u200e', 'א֑וֹר\u200e'], 'token_ids': ['o010010030031', 'o010010030041']}


## Populate semantic role data

In [None]:
# Create a dataframe from 'word_level_semantic_data.tsv'. First row is header
semantic_role_data = pd.read_csv('word_level_semantic_data.tsv', sep='\t', header=0)
semantic_role_data = semantic_role_data.fillna('')
for line in semantic_role_data.head(20).to_string().split('\n'):
    print(line)

          xml:id         ref       word    lemma               gloss                                         frame frame_verb_id frame_verb frame_type frame_node_head adjunct_label adjunct_node_head preposition_label preposition_node_head semantic_role_label semantic_role_node_head embedded_semantic_role_label embedded_semantic_role_head
0   n40001001001   MAT 1:1!1     Βίβλος   βίβλος          [The] book                                                                                                                                                                                     Attribute            n40001001001                                                         
1   n40001001002   MAT 1:1!2   γενέσεως  γένεσις  of [the] genealogy                                                                                                                                                                                                                                                            

In [None]:
# Create a new dataframe that has each column name in semantic_role_data as a row, and an array of the unique values in that column as the second column, plus the number of unique values as the third column.
semantic_role_data_summary = pd.DataFrame([[column, semantic_role_data[column].unique(), len(semantic_role_data[column].unique())] for column in semantic_role_data.columns], columns=['column_name', 'unique_values', 'num_unique_values'])
semantic_role_data_summary

Unnamed: 0,column_name,unique_values,num_unique_values
0,xml:id,"[n40001001001, n40001001002, n40001001003, n40...",137779
1,ref,"[MAT 1:1!1, MAT 1:1!2, MAT 1:1!3, MAT 1:1!4, M...",137779
2,word,"[Βίβλος, γενέσεως, Ἰησοῦ, Χριστοῦ, υἱοῦ, Δαυεὶ...",19477
3,lemma,"[βίβλος, γένεσις, Ἰησοῦς, Χριστός, υἱός, Δαυίδ...",5401
4,gloss,"[[The] book, of [the] genealogy, of Jesus, Chr...",20022
5,frame,"[, A0:n40001002001 A1:n40001002004, A0:n400010...",20299
6,frame_verb_id,"[, n40001002002, n40001002007, n40001002012, n...",28629
7,frame_verb,"[, γεννάω, μνηστεύω, μνηστεύω|ἔχω, τίκτω, συνέ...",5764
8,frame_type,"[, Agent, Patient, Patient|Agent, Recipient, A...",414
9,frame_node_head,"[, n40001002001, n40001002004, n40001002005, n...",24796


### Add additional lookup table for complete wordings for every semantic role

In [None]:
# Read roles.json and load it into a Python object
with open("roles.json", "r") as f:
    roles_data = json.load(f)

def extract_data(node):
    node_data = {
        "node_category": node["node_category"],
        "node_head": node["node_head"],
        "field_name": node["field_name"],
        "field_type": node["field_type"],
        "forms": [],
        "lemmas": [],
        "glosses": [],
        "ids": []
    }

    for word_data in node["words"].values():
        node_data["forms"].append(word_data["word"])
        node_data["lemmas"].append(word_data["lemma"])
        node_data["glosses"].append(word_data["gloss"])
        node_data["ids"].append(word_data["word_identifier"])

    return node_data

processed_data = [extract_data(node) for node in roles_data]

semantic_role_wordings_lookup = pd.DataFrame(processed_data)

for i in range(0, len(semantic_role_wordings_lookup)):
    # if len(semantic_role_wordings_lookup.iloc[i]['ids']) > 1:
    #     semantic_role_wordings_lookup.iloc[i].to_dict()
    for j in semantic_role_wordings_lookup.iloc[i]['ids']:
        if j.startswith('n66020004033'):
            print(semantic_role_wordings_lookup.iloc[i].to_dict())
            
# one-liner to get the semantic role wordings for a given word id
def get_complete_wordings_for_role(word_id):
    return semantic_role_wordings_lookup[semantic_role_wordings_lookup['ids'].apply(lambda x: word_id in x)].to_dict('records')
    

{'node_category': 'np', 'node_head': 'n66020004033', 'field_name': 'Role', 'field_type': 'Negation', 'forms': ['τὸ', 'θηρίον', 'οὐδὲ', 'τὴν', 'εἰκόνα', 'αὐτοῦ'], 'lemmas': ['ὁ', 'θηρίον', 'οὐδέ', 'ὁ', 'εἰκών', 'αὐτός'], 'glosses': ['', '', '', '', '', ''], 'ids': ['n66020004031', 'n66020004032', 'n66020004033', 'n66020004034', 'n66020004035', 'n66020004036']}


In [None]:
get_complete_wordings_for_role('n66020004033')

[{'node_category': 'np',
  'node_head': 'n66020004033',
  'field_name': 'Role',
  'field_type': 'Negation',
  'forms': ['τὸ', 'θηρίον', 'οὐδὲ', 'τὴν', 'εἰκόνα', 'αὐτοῦ'],
  'lemmas': ['ὁ', 'θηρίον', 'οὐδέ', 'ὁ', 'εἰκών', 'αὐτός'],
  'glosses': ['', '', '', '', '', ''],
  'ids': ['n66020004031',
   'n66020004032',
   'n66020004033',
   'n66020004034',
   'n66020004035',
   'n66020004036']}]

# Prompt expansion

In [None]:
# Need token ids for MAT 3:15
# Need token data for the token with with a gloss like 'him'
# Need to get the @Referent for that token
# Need to get the token data for the token that matches @Referent
# Need to get social situation data for this passage
# Need to get description of the social situation based on the token

## Define prosaic annotation category and value glosses

In [None]:
attribute_descriptions = {
    "after": "Encodes the following character, including a blank space.",
    "articular": "'true' if the word has an article (i.e., modified by the word 'the').",
    "case": "Grammatical case: nominative, genitive, dative, accusative, or vocative",
    "class": "On words, the class is the word's part of speech",
    "cltype": "Explicitly marks Verbless Clauses, Verb Elided Clauses, and Minor Clauses",
    "degree": "A derivative lexical category, indicating the degree of the adjective",
    "discontinuous": "'true' if the word is discontinuous with respect to sentence order due to reordering in the syntax tree",
    "domain": "Semantic domain information from the Semantic Dictionary of Biblical Greek (SDBG)",
    "frame": "Frames of verbs, refers to the arguments of the verb",
    "gender": "Grammatical gender values",
    "gloss": "SIL data, not Berean",
    "lemma": "Form of the word as it appears in a dictionary.",
    "ln": "The semantic domain entry in Louw and Nida's, 'Greek-English Lexicon of the New Testament: Based on Semantic Domains'.",
    "mood": "Grammatical mood",
    "morph": "Morphological parsing codes",
    "normalized": "The normalized form of the token (i.e., no trailing or leading punctuation or accent shifting depending on context)",
    "number": "Grammatical number",
    "person": "Grammatical person",
    "ref": "Verse!word reference to this edition of the Nestle1904 text by USFM id",
    "referent": "The xml:id of the node to which a pronoun (i.e., 'he') refers. Note that some of these IDs are not word IDs but rather phrase or clause IDs.",
    "role": "The clause-level role of the word.",
    "strong": "Strong's number for the lemma",
    "subjref": "The xml:id of the node that is the implied subject of a verb (for verbs without an explicit subject). Note that some of these IDs are not word IDs but rather phrase or clause IDs.",
    "tense": "Grammatical tense form",
    "text": "Text content associated with the ID",
    "type": "Indicates different types of pronominals",
    "voice": "Grammatical voice",
    "xml:id": "XML ids occur on every word and encode the corpus ('n' for New Testament), the book (40 for Matthew), the chapter (001), verse (001), and word (001)."
}

discourse_types = {
    'Main clauses': {'description': 'Main clauses are the top-level clauses in a sentence. They are the clauses that are not embedded in other clauses.'},
    'Historical Perfect': {'description': 'Highlights not the speech or act to which it refers but the event(s) that follow (DFNTG §12.2).'},
    'Specific Circumstance': {'description': 'The function of ἐγενετο ‘it came about’ and an immediately following temporal expression varies with the author (see DFNTG §10.3). In Matthew’s Gospel, it usually marks major divisions in the book (e.g. Mt 7:28). In Luke-Acts, in contrast, ‘it picks out from the general background the specific circumstance for the foreground events that are to follow’ (ibid.), as in Acts 9:37 (see also Mt 9:10).'},
    'Verb Focus+': {'description': 'Verb in final position in clause demonstrates verb focus.'},
    'Articular Pronoun': {'description': 'Articular pronoun, which often introduces an ‘intermediate step’ in a reported conversation.'},
    'Topical Genitive': {'description': 'A genitival constituent that is nominal is preposed within the noun phrase for two purposes: 1) to bring it into focus; 2) within a point of departure, to indicate that it is the genitive in particular which relates to a corresponding constituent of the context.(DFNTG §4.5)'},
    'Embedded DFE': {'description': "'Dominant focal elements' embedded within a constituent in P1."},
    'Reported Speech': {'description': 'Reported speech.'},
    'Ambiguous': {'description': 'Marked but ambiguous constituent order.'},
    'Over-encoding': {'description': 'Any instance in which more encoding than the default is employed to refer to an active participant or prop. Over-encoding is used in Greek, as in other languages: to mark the beginning of a narrative unit (e.g. Mt 4:5); and to highlight the action or speech concerned (e.g. Mt 4:7).'},
    'Highlighter': {'description': 'Presentatives - Interjections such as ἰδού and ἴδε ‘look!, see!’ typically highlight what immediately follows (Narr §5.4.2, NonNarr §7.7.3).'},
    'Referential PoD': {'description': 'Pre-verbal topical subject other referential point of departure (NARR §3.1, NonNarr §4.3, DFNTG §§2.2, 2.8; as in 1 Th 1:6).'},
    'annotations': {'description': 'Inline annotations.'},
    'Left-Dislocation': {'description': 'Point of departure - A type of SENTENCE in which one of the CONSTITUENTS appears in INITIAL position and its CANONICAL position is filled by a PRONOUN or a full LEXICAL NOUN PHRASE with the same REFERENCE, e.g. John, I like him/the old chap.”'},
    'Focus+': {'description': 'Constituents placed in P2 to give them focal prominence.'},
    'Tail-Head linkage': {'description': 'Point of departure involving renewal - Tail-head linkage involves “the repetition in a subordinate clause, at the beginning (the ‘head’) of a new sentence, of at least the main verb of the previous sentence (the tail)” (Dooley & Levinsohn 2001:16).'},
    'Postposed them subject': {'description': 'When a subject is postposed to the end of its clause (following nominals or adjuncts), it is marked ThS+ (e.g. Lk 1:41 [twice]). Such postposing typically marks as salient the participant who performs the next event in chronological sequence in the story (see Levinsohn 2014).'},
    'EmbeddedRepSpeech': {'description': 'Embedded reported speech - speech that is reported within a reported speech.'},
    'Futuristic Present': {'description': 'Highlights not the speech or act to which it refers but the event(s) that follow (DFNTG §12.2).'},
    'OT quotes': {'description': 'Old Testament quotations.'},
    'Constituent Negation': {'description': 'Negative pro-forms when they are in P2 indicate that the constituent has been negated rather than the clause as a whole.'},
    'Split Focal': {'description': 'The second part of a focal constituent with only the first part in P2 (NonNarr §5.5, DFNTG §4.4).'},
    'Right-Dislocated': {'description': 'Point of departure - A type of SENTENCE in which one of the CONSTITUENTS appears in FINAL position and its CANONICAL position is filled by a PRONOUN with the same REFERENCE, e.g. ... He’s always late, that chap.'},
    'Appositive': {'description': 'Appositive'},
    'Situational PoD': {'description': 'Situational point of departure (e.g. temporal, spatial, conditional―(NARR §3.1, NonNarr §4.3, DFNTG §§2.2, 2.8; as in 1 Th 3:4).'},
    'Historical Present': {'description': 'Highlights not the speech or act to which it refers but the event(s) that follow (DFNTG §12.2).'},
    'Noun Incorporation': {'description': 'Some nominal objects that appear to be in P2 may precede their verb because they have been “incorporated” (Rosen 1989) in the verb phrase. Typically, the phrase consists of an indefinite noun and a “light verb” such as “do, give, have, make, take” (Wikipedia entry on Light Verbs).'},
    'Thematic Prominence': {'description': 'Thematic prominence - In Greek, prominence is given to active participants and props who are the current centre of attention (NARR §4.6) by omitting the article (DFNTG §§9.2.3-9.4), by adding αυτος ‘-self’ (e.g. in 1 Th 3:11), by using the proximal demonstrative οὗτος (NARR chap. 9, Appendix 1; e.g. in 3:3), and by postposing the constituent concerned (e.g. Mt 14:29). If such constituents are NOT in postion P1, they are demonstrating topical prominence.'},
    'Cataphoric Focus': {'description': 'An expression that points forward to and highlights something which ‘is about to be expressed.’'},
    'Cataphoric referent': {'description': 'The clause or sentence to which a cataphoric reference refers when NOT introduced with ὅτι or ἵνα.'},
    'DFE': {'description': 'Constituents that may be moved from their default position to the end of a proposition to give them focal prominence include verbs, pronominals and objects that follow adjuncts (NonNarr §5.3, DFNTG §3.5). Such constituents, also called ‘dominant focal elements’or DFEs (Heimedinger 1999:167).'},
    'Embedded Focus+': {'description': 'A constituent of a phrase or embedded clause preposed for focal prominence.'}
}

## Define data provisioning functions

Each of these functions should accept a token or range of tokens. They should return a list of annotations for each token or range of tokens. If the annotation is not applicable to a token or range of tokens, the function should return an empty list.

In [None]:
ENDPOINT = 'https://macula-atlas-api-qa-25c5xl4maa-uk.a.run.app/graphql/'
headers = {"Content-Type": "application/json"}

In [None]:
# Levinsohn discourse features query

discourse_features_query = """
query AnnotationFeatures($filters1: AnnotationFeatureFilter, $filters2: AnnotationFilter, $filters3: WordTokenFilter ) {
  annotationFeatures(filters: $filters1) {
    label
    uri
    instances(filters: $filters2) {
      uri
      tokens(filters: $filters3) {
        ref
        wordValue
        xmlId
      }
    }
  }
}
"""

def get_discourse_annotation_types(xmlId):
    tokenData = mg.loc[xmlId].to_dict()
    passage = tokenData['ref'].split('!')[0]
    
    variables = {
        "filters1": {
            "reference": passage,
        },
        "filters2": {
            "reference": passage,
        },
        "filters3": {
            "xmlId": xmlId,
        }
        }
    
    payload = {'query': discourse_features_query, 'variables': variables}
    
    response = requests.post(ENDPOINT, json=payload, headers=headers)
    
    response_data = json.loads(response.text) 
    annotation_features = response_data["data"]["annotationFeatures"]

    labels = [feature["label"] for feature in annotation_features]
    return labels

In [5]:
situations_lookup_json = json.loads('''
                                    {
    "parameters": [
        {
            "name": "field",
            "type": "register_parameter",
            "summary": "Field is the subject matter of a situation and concerns the nature and the structure of the activity being carried out. It involves three parameters: abstractness, activity focus, and goals.",
            "system": "field"
        },
        {
            "name": "tenor",
            "type": "register_parameter",
            "summary": "Tenor pertains to the social roles and relationships among the participants involved in a situation. It covers five parameters: value orientation predisposition, publicity, number of speaking participants, control, and social distance.",
            "system": "tenor"
        },
        {
            "name": "mode",
            "type": "register_parameter",
            "summary": "Mode is the dimension of a situation through which participants are brought into contact with each other. It involves the systems of material contact and semantic contact, focusing on four parameters: language role, process sharing, channel, and medium.",
            "system": "mode"
        }
    ],
    "systems": [
        {
            "name": "abstractness",
            "type": "system",
            "summary": "The abstractness of a situation refers to the distinction between conceptual and practical activities, with the former involving theoretical or abstract activities, while the latter involves concrete or immediate actions."
        },
        {
            "name": "activity focus",
            "type": "system",
            "summary": "The activity focus of a situation involves the domain of experience that participants are focusing on. An experiential focus refers to what is happening, an interpersonal focus refers to why it is happening, and a logical focus refers to how, when, or where it is happening. The analysis only captures the beginning and the end of the focus, even if it changes during the episode."
        },
        {
            "name": "goals",
            "type": "system",
            "summary": "The goals of a situational activity involve the motivation of actions, and can be instructing, projecting, or asserting."
        },
        {
            "name": "control",
            "type": "system",
            "summary": "Control involves social tendencies related to deference between participants based on their relative status, power, authority, or institutional roles. Situations may be hierarchic or non-hierarchic, with the former being unequal and the latter being equal. In unequal relationships, there may be numerous subjects that cannot typically be discussed, whereas equal relationships allow for a greater range of meanings to be exchanged."
        },
        {
            "name": "plurality",
            "type": "system",
            "summary": "The plurality system pertains to the number of speaking participants in a situation, and includes the parameters of monological, dialogical, and multilogical. This system recognizes that more than two participants may be engaged in dialogical activity, interacting with each other in various overlapping arrangements over the course of a situation."
        },
        {
            "name": "value-orientation-disposition",
            "type": "system",
            "summary": "Value-orientation disposition refers to the nature of relative alliance or opposition between agents in a situation, and answers the question of whether the situation is presented as if there is agreement or opposition between participants. The system distinguishes between an allying disposition that realizes agreement and an opposing disposition that realizes disagreement."
        },
        {
            "name": "social-distance",
            "type": "system",
            "summary": "Social distance refers to the level of familiarity between participants in a situation. Close participants may exchange more kinds of meanings and require less explicitness in their communication, while distant participants tend to require more explicitness and have a more restricted set of possible meaning exchanges."
        },
        {
            "name": "publicity",
            "type": "system",
            "summary": "Publicity is a dimension of tenor that refers to the presence or absence of onlookers with regard to a social act, and the various levels of engagement such onlookers might reveal. It includes disinterested, interested (neutral or biased), and private situations."
        },
        {
            "name": "language-role",
            "type": "system",
            "summary": "Language role refers to the amount of work language does in accomplishing a situation's activity, and can be constitutive or ancillary depending on whether language is the primary means of accomplishing the activity or simply assists in the unfolding of non-linguistic actions."
        },
        {
            "name": "process-sharing",
            "type": "system",
            "summary": "Process sharing refers to the degree of active participation by more than one participant in the unfolding of text, and can be active or passive depending on whether participants share in the creation of the text or engage with it more passively."
        },
        {
            "name": "channel",
            "type": "system",
            "summary": "Channel refers to the physical mechanics of the addressee's interaction with the text, and can be phonic or graphic. It is closely related to process sharing, and is decided by the nature of the social activity and of the social relation between the participants."
        },
        {
            "name": "medium",
            "type": "system",
            "summary": "Medium refers to the style or patterning of the wordings themselves, and can be spoken or written. It is a matter of style, and is related to the extemporaneousness of the language realizing a situation."
        }
    ],
    "features": [
        {
            "name": "conceptual-ie-internally-oriented",
            "type": "feature",
            "register_parameter": "field",
            "description": "Conceptual field values involve abstract or theoretical activities, such as theological or philosophical discussions.",
            "system": "field",
            "parameter": "abstractness"
        },
        {
            "name": "practical-ie-outwardly-oriented",
            "type": "feature",
            "register_parameter": "field",
            "description": "Practical field values involve activities that are concrete or focused on immediate actions, such as fishing, a healing, or a miracle.",
            "system": "field",
            "parameter": "abstractness"
        },
        {
            "name": "experiential",
            "type": "feature",
            "register_parameter": "field",
            "description": "Experiential activity focus characterizes a situation whose linguistic activity chiefly relates to the unfolding of events or happenings, where participants are involved in carrying out or observing the activity.",
            "system": "field",
            "parameter": "activity_focus"
        },
        {
            "name": "interpersonal",
            "type": "feature",
            "register_parameter": "field",
            "description": "Interpersonal activity focus pertains to the social interaction between participants, focusing on their roles, relationships, and attitudes.",
            "system": "field",
            "parameter": "activity_focus"
        },
        {
            "name": "logical",
            "type": "feature",
            "register_parameter": "field",
            "description": "Logical activity focus involves reasoning, argumentation, or explanation, where participants engage in activities that require logical thinking.",
            "system": "field",
            "parameter": "activity_focus"
        },
        {
            "name": "instructing",
            "type": "feature",
            "register_parameter": "field",
            "description": "Instructing goals are centered around teaching, explaining, or providing guidance to others.",
            "system": "field",
            "parameter": "goals"
        },
        {
            "name": "projecting",
            "type": "feature",
            "register_parameter": "field",
            "description": "Projecting goals involve making predictions, prophesying, or discussing future events or possibilities.",
            "system": "field",
            "parameter": "goals"
        },
        {
            "name": "asserting",
            "type": "feature",
            "register_parameter": "field",
            "description": "Asserting goals involve stating or affirming beliefs, claims, or opinions, often in a declarative manner.",
            "system": "field",
            "parameter": "goals"
        },
        {
            "name": "allying",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Allying value orientation predisposition refers to participants who share the same views or are supportive of each other's positions.",
            "system": "tenor",
            "parameter": "value_orientation_predisposition"
        },
        {
            "name": "opposing",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Opposing value orientation predisposition refers to participants who hold different views or are antagonistic toward each other's positions.",
            "system": "tenor",
            "parameter": "value_orientation_predisposition"
        },
        {
            "name": "disinterested",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Disinterested publicity refers to a neutral stance where participants are not personally invested in the outcome or do not take sides.",
            "system": "tenor",
            "parameter": "publicity"
        },
        {
            "name": "neutral",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Neutral publicity refers to a situation where participants neither support nor oppose a particular stance or outcome.",
            "system": "tenor",
            "parameter": "publicity"
        },
        {
            "name": "on-someones-side",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "On-someones-side publicity refers to a situation where participants actively support a particular stance or outcome.",
            "system": "tenor",
            "parameter": "publicity"
        },
        {
            "name": "private",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Private publicity refers to a situation where participants actively oppose a particular stance or outcome.",
            "system": "tenor",
            "parameter": "publicity"
        },
        {
            "name": "monological",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Monological refers to situations with only one speaking participant, such as a monologue or a soliloquy.",
            "system": "tenor",
            "parameter": "number_of_speaking_participants"
        },
        {
            "name": "dialogical",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Dialogical refers to situations with two speaking participants, such as a dialogue or conversation.",
            "system": "tenor",
            "parameter": "number_of_speaking_participants"
        },
        {
            "name": "multilogical",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Multilogical refers to situations with three or more speaking participants, such as group discussions or debates.",
            "system": "tenor",
            "parameter": "number_of_speaking_participants"
        },
        {
            "name": "institutional",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Institutional control refers to situations where one participant or a group of participants hold authority or power over others.",
            "system": "tenor",
            "parameter": "control"
        },
        {
            "name": "non-institutional-or-neutralized",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Non-institutional or neutralized control refers to situations where no specific participant or group holds authority or power over others.",
            "system": "tenor",
            "parameter": "control"
        },
        {
            "name": "unclear",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Unclear control refers to situations where it is not evident who holds authority or power.",
            "system": "tenor",
            "parameter": "control"
        },
        {
            "name": "equalized",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Equalized control refers to situations where all participants have an equal share of authority or power.",
            "system": "tenor",
            "parameter": "control"
        },
        {
            "name": "close",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Close social distance refers to situations where participants have a close relationship or are familiar with each other.",
            "system": "tenor",
            "parameter": "social_distance"
        },
        {
            "name": "distant",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Distant social distance refers to situations where participants have a distant relationship or are not familiar with each other.",
            "system": "tenor",
            "parameter": "social_distance"
        },
        {
            "name": "constitutive",
            "type": "feature",
            "register_parameter": "mode",
            "description": "Constitutive language role refers to situations where language is the primary means of carrying out the activity or achieving the goal.",
            "system": "mode",
            "parameter": "language_role"
        },
        {
            "name": "ancillary",
            "type": "feature",
            "register_parameter": "mode",
            "description": "Ancillary language role refers to situations where language plays a secondary or supporting role in carrying out the activity or achieving the goal.",
            "system": "mode",
            "parameter": "language_role"
        },
        {
            "name": "addressee-more-active",
            "type": "feature",
            "register_parameter": "mode",
            "description": "Addressee-more-active process sharing refers to situations where the recipient of the message is more actively involved in the communication process, such as asking questions or providing feedback.",
            "system": "mode",
            "parameter": "process_sharing"
        },
        {
            "name": "addressee-more-passive",
            "type": "feature",
            "register_parameter": "mode",
            "description": "Addressee-more-passive process sharing refers to situations where the recipient of the message is less actively involved in the communication process, such as listening or reading without providing feedback.",
            "system": "mode",
            "parameter": "process_sharing"
        },
        {
            "name": "phonic",
            "type": "feature",
            "register_parameter": "mode",
            "description": "Phonic channel refers to communication through sound, such as spoken language or music.",
            "system": "mode",
            "parameter": "channel"
        },
        {
            "name": "graphic",
            "type": "feature",
            "register_parameter": "mode",
            "description": "Graphic channel refers to communication through visual means, such as written language, images, or symbols.",
            "system": "mode",
            "parameter": "channel"
        },
        {
            "name": "spoken",
            "type": "feature",
            "register_parameter": "mode",
            "description": "Spoken medium refers to communication that takes place through speech, either in face-to-face conversations or through audio recordings.",
            "system": "mode",
            "parameter": "medium"
        },
        {
            "name": "written",
            "type": "feature",
            "register_parameter": "mode",
            "description": "Written medium refers to communication that takes place through text, either in print or digital formats.",
            "system": "mode",
            "parameter": "medium"
        }
    ]
}
''')

In [6]:
# Situation data query
import json
SITUATIONS_ENDPOINT = "https://gospelgenre.ryderwishart.com/api/token/"

# just fetch the raw JSON from the situations endpoint plus the ref (e.g., MAT 3:13)
# Note, you only need to submit one token to get the social situation data, if available
# def get_situations_data(tokenRef):
#     # validate tokenRef
#     if not re.match(r'^\d?[A-Z]+ \d+:\d+$', tokenRef):
#         return {'error': 'invalid ref'};
#     expanded_endpoint = SITUATIONS_ENDPOINT + tokenRef
#     response = requests.get(expanded_endpoint)
#     return response.json()

clusterLabels = {
  0: 'narration/account',
  1: 'denouncement',
  2: 'forewarning/private discussion', # and predictions? look into this
  3: 'assignment', # with predictions of how it will go?
  4: 'charge',
  5: 'appraisal',
  6: 'questioning',
  7: 'controversial action', # jesus heals a person, and people ask questions about it
  8: 'disputation',
  9: 'vilifying story',
  10: 'rebuke',
  11: 'organizing',
  12: 'judicial examination',
  13: 'public execution',
  14: 'presumptive interaction',
  15: 'announcement',
  16: 'examination',
  17: 'public spectacle/novelty',
  18: 'correction',
  19: 'surprising turn of events',
  20: 'redirection', # NOTE: not sure about this one
  21: 'solicitation',
  22: 'illustrated lesson',
  23: 'conflict',
  24: 'oration',
  25: 'accommodation',
  26: 'challenge',
  27: 'disappointing request',
  28: 'disagreement',
} # NOTE: these should probably be reworded to represent situational activities better

with open("situations_prose_descriptions.json", "r") as f:
    situations_prose_descriptions = json.load(f)
    # each description has a title and description, and the title corresponds to the clusterLabels *value*
    # add descriptions to the clusterLabels dict
    for cluster in clusterLabels:
        matching_description = next((item for item in situations_prose_descriptions if item["title"].lower() == clusterLabels[cluster]), None)
        clusterLabels[cluster] = {
            'number': cluster,
            'title': matching_description['title'],
            'description': matching_description['description']
        }


# I'm instead just going to load the situations data locally, since there seems to be some problem with the next.js endpoint for some tokens/pericopes
def get_situations_data(tokenRef):
    print(tokenRef)
    with open("situations_data.json", "r") as f:
        situations_data = json.load(f)

        """
        Array of situations, like this:
                  
          [
            {
              "situation": "01-01",
              "title": [
                "The Genealogy of Christ"
              ],
              "preTextFeatures": "no embedded discourse",
              "viaTextFeatures": "no embedded discourse",
              "start": [
                "SBLGNT.Matt.1.1.w1"
              ],
              "section": [
                "01-01"
              ],
              "morphGntId": [
                "010101"
              ],
              "ref": "MAT 1:1!1 MAT 1:1!1 MAT 1:1!1 MAT 1:1!1 ...",
              "text": "Βίβλος γενέσεως Ἰησοῦ χριστοῦ υἱοῦ Δαυὶδ...",
              "token_ids": "n40001001001 n40001001002 n40001001003...",
              "token_refs": [
                "MAT 1:2!14",
                "MAT 1:3!9",
                "MAT 1:7!4",
                ...
        """
        # Find the situation where tokenRef is in token_refs array
        match = None
        for situation in situations_data:
            try:
                if not situation.get("token_refs"):
                    print("no token_refs for situation", situation["situation"])
                    continue
                if tokenRef in situation["token_refs"]:
                    match = situation
            except:
                continue
        
        if not situation['cluster']:
            return match
        else:
            # FIXME: this is so hacky it makes my eyes water
            type = clusterLabels[int(situation['cluster'][0])]
            return {
                **match,
                'type': type
            }
            
        

def process_features(lookup, feature_list):
    feature_descriptions = []
    system_descriptions = set()

    for feature_name in feature_list:
        feature = next(
            (item for item in lookup["features"] if item["name"] == feature_name), None
        )
        if feature:
            feature_descriptions.append(feature["description"])

            system = next(
                (
                    item
                    for item in lookup["systems"]
                    if item["name"] == feature["system"]
                ),
                None,
            )
            if system:
                system_descriptions.add(system["summary"])

    return feature_descriptions, system_descriptions


def generate_mutations(pre_text_features, via_text_features, lookup):
    mutations = []

    pre_text_features_set = set(pre_text_features)
    via_text_features_set = set(via_text_features)

    gained_features = via_text_features_set - pre_text_features_set
    lost_features = pre_text_features_set - via_text_features_set

    if gained_features:
        mutations.append("gained the following features: " + ", ".join(gained_features))

    if lost_features:
        mutations.append("lost the following features: " + ", ".join(lost_features))

    # These features still need descriptions (i.e., process_features), since they don't occur in the pre-text features
    gained_feature_descriptions, _ = process_features(lookup, gained_features)
    mutations.append("\n".join(gained_feature_descriptions))

    return mutations
  
# test mutations
test_sit = get_situations_data('MAT 3:15!1')
generate_mutations(test_sit['preTextFeatures'], test_sit['viaTextFeatures'], situations_lookup_json)


MAT 3:15!1
no token_refs for situation 02-15
no token_refs for situation 02-16


['gained the following features: asserting, close, multilogical, addressee-more-passive, allying, conceptual-ie-internally-oriented',
 'lost the following features: practical-ie-outwardly-oriented, addressee-more-active, opposing, distant, dialogical, instructing',
 "Asserting goals involve stating or affirming beliefs, claims, or opinions, often in a declarative manner.\nClose social distance refers to situations where participants have a close relationship or are familiar with each other.\nMultilogical refers to situations with three or more speaking participants, such as group discussions or debates.\nAddressee-more-passive process sharing refers to situations where the recipient of the message is less actively involved in the communication process, such as listening or reading without providing feedback.\nAllying value orientation predisposition refers to participants who share the same views or are supportive of each other's positions.\nConceptual field values involve abstract or 

In [9]:
print(clusterLabels)
# save the updated clusterLabels as a 'situation_types.json' file
with open('situation_types.json', 'w') as f:
    json.dump(clusterLabels, f, indent=2)



In [None]:
# # Token features query

# passage_tokens_query = """
# query PassageByReference($filters: PassageFilter) {
#   passage(filters: $filters) {
#     usfmRef
#     textContent
#     tokens {
#       ref
#       xmlId
#       data
#     }
#   }
# }
# """

# def get_passage_token_features(startRef, endRef = None):
#     ref = startRef
#     if endRef and endRef != startRef: # FIXME: add some additional validation here...?
#         ref = startRef + '-' + endRef
    
#     variables = {
#         "filters": {
#             "reference": ref,
#         }
#     }
    
#     payload = {'query': passage_tokens_query, 'variables': variables}
    
#     response = requests.post(ENDPOINT, json=payload, headers=headers)
    
#     response_data = json.loads(response.text)
#     # print(response_data)
#     tokens = response_data["data"]["passage"][0]["tokens"]
    
#     return tokens
  
# def get_token(tokens, query_string=None):
#     print('get_tokens', tokens, query_string)
#     """
#     Accepts an array of tokens, and a query string. If the string is not supplied, return None.
    
#     Filters the tokens by checking each property of the token against the query string. Return the first match.
#     # TODO: add a way to return multiple matches?
#     """
#     if not query_string:
#         return None
    
#     for token in tokens:
#         for key in token:
#             if query_string.lower() in str(token[key]).lower():
#                 return token
#             if key == 'data':
#                 for data_key in token[key]:
#                     if query_string.lower() in str(token[key][data_key]).lower():
#                         return token
#     return None
    

In [None]:
# def get_all_annotations_by_token_and_query_string(token_ref: str, query_string: str = None):
#     """
#     Accepts a token ref, and a query string. If the string is not supplied, return the token from the passage that has the same ref as the token ref.
    
#     If the query string is supplied, return the token from the passage that has some property that matches the query string.
#     """
    
#     # Use sets to avoid storing duplicates
#     query_token_discourse_features = set()
#     query_tokens_data = set()
    
#     # Get situation data
    
#     # you only need to get the situation data once, assuming the tokens all belong to one pericope
#     query_situation_data = get_situations_data(token_ref) 
#     # FIXME: add some validation here to check the result and ensure 
#     # all the token refs are in the resulting 
#     # { "matchingSituation": 
#     #     "token_ids": [...all token refs should be in here], 
#     #     [...etc.]
#     # }
    
#     # Get token data, including the XML id for every token_ref 
#     # in the token_refs, and the discourse features
    
#     # for each token ref, get execute all the retrieval functions defined above
#     print(token_ref)
#     passage_tokens = get_passage_token_features(token_ref)
#     print(len(passage_tokens), 'tokens found in passage for ref', token_ref)
#     # for tok in passage_tokens:
#     #     xml_id = tok['xmlId']
#     #     query_tokens_data.add({xml_id: token_data})
#     #     query_token_xml_ids.add(xml_id)
#     #     discourse_features = get_discourse_annotation_types(xml_id)
#     # query_token_discourse_features.add(discourse_features)
#     print(passage_tokens)
#     if query_string:
#         passage_token_matching_query_string = get_token(passage_tokens, query_string)
#     else:
#         passage_token_matching_query_string = passage_tokens
#     return {
#         "matchingSituation": query_situation_data,
        
#     } # TODO: does it make more sense to incrementally build up a pandas DF?
    

In [None]:
# get_all_annotations_by_token_and_query_string('MAT 3:14', 'him')

In [None]:
# Get speaker quotation data for a token
"""
speaker_data.columns =
['CharacterId',
'MaxSpeakers',
'Gender',
'Age',
'Comment',
'SDBH',
'LouwNida',
'FCBHCharacter',
'Divinity']
"""
def get_speaker_quotation_data(token_ref: str):
    """
    Accepts a token ref, and returns the speaker quotation data (from expanded_speaker_data) for that token.
    """
    # The token id is the row Name
    token_data = mg[mg["ref"] == token_ref]
    # print(token_data)
    token_id = token_data.index[0]
    # print(token_ref, 'matched to', token_id)
    speaker_data_for_token = expanded_speaker_data[expanded_speaker_data["token_ids"].apply(lambda x: token_id in x)]
    # print(speaker_data_for_token)
    
    if speaker_data_for_token.empty:
        return None
    
    speaker_ids = speaker_data_for_token["CharacterIds"].iloc[0]
    # print(speaker_ids)
    results = []
    for speaker_id in speaker_ids:
            
        speaker_character_data = character_data[character_data["CharacterId"] == speaker_id]
        speaker_character_data = {key: value for key, value in speaker_character_data.iloc[0].items() if not type(value) == float}
        # print('speaker_character_data', speaker_character_data)
        # print(speaker_character_data)
    
        result = {
            "who_is_speaking": speaker_id,
            "delivery_tone": speaker_data_for_token["Delivery"].iloc[0],
            # "contained_in_speech_by": # TODO: somehow I would like to note that the Baptist's speech is contained in the Narrator's speech
            "what_is_said_truncated": ' '.join(speaker_data_for_token["tokens"].iloc[0][:10]) + '...',
            "what_is_said_complete": ' '.join(speaker_data_for_token["tokens"].iloc[0]),
        }
        for character_item in speaker_character_data:
            result[character_item] = speaker_character_data[character_item]
            
        results.append(result)
        
    return results
    
# test
get_speaker_quotation_data('MAT 3:14!6')

[{'who_is_speaking': 'John the Baptist',
  'delivery_tone': 'humble',
  'what_is_said_truncated': 'ἐγὼ χρείαν ἔχω ὑπὸ σοῦ βαπτισθῆναι καὶ σὺ ἔρχῃ πρός...',
  'what_is_said_complete': 'ἐγὼ χρείαν ἔχω ὑπὸ σοῦ βαπτισθῆναι καὶ σὺ ἔρχῃ πρός με',
  'CharacterId': 'John the Baptist',
  'MaxSpeakers': 1,
  'Gender': 'Male',
  'LouwNida': ['93.190a',
   '93.190b',
   '93.190e',
   '93.190f',
   '93.190d',
   '93.190c']}]

### Functions to provision semantic role data

In [None]:
semantic_role_wordings_lookup.head()

# find row that contains 'n57001020001' in 'ids' column. Note that 'ids' is an array of strings
print(semantic_role_wordings_lookup[semantic_role_wordings_lookup['ids'].apply(lambda x: 'n41001005003' in x)])
semantic_role_data[semantic_role_data['xml:id'] == 'n41001005003']
# semantic_role_data.loc[semantic_role_data['xml:id'].apply(lambda x: 'n40003014' in x)]
# semantic_role_wordings_lookup[semantic_role_wordings_lookup['ids'].apply(lambda x: 'n41001005003' in x)].iloc[0].to_dict()
# NOTE: id n40003014006 is apparently an error in the data that I just happened to be looking at.

Empty DataFrame
Columns: [node_category, node_head, field_name, field_type, forms, lemmas, glosses, ids]
Index: []


Unnamed: 0,xml:id,ref,word,lemma,gloss,frame,frame_verb_id,frame_verb,frame_type,frame_node_head,adjunct_label,adjunct_node_head,preposition_label,preposition_node_head,semantic_role_label,semantic_role_node_head,embedded_semantic_role_label,embedded_semantic_role_head
18355,n41001005003,MRK 1:5!3,πρὸς,πρός,to,,,,,,Goal,n41001005003,Directed towards,n41001005003,,,,


In [None]:
# Use semantic_role_data to get the semantic role data for a token

import pandas as pd
import re

def describe_semantic_configuration(query_id, df, output_template):
    # Find the row with the query ID
    row = df.loc[df['xml:id'] == query_id].iloc[0]

    # Find the matching rows for the verb frame IDs, handling multiple frame verb rows
    frame_verb_ids = row['frame_verb_id'].split('|')
    
    # FIXME: If there are no frame verb IDs, we probably just need to find the syntactic clause containing this token and return the treedown
    
    
    frame_verb_rows = [df.loc[df['xml:id'] == frame_verb_id].iloc[0] for frame_verb_id in frame_verb_ids if not df.loc[df['xml:id'] == frame_verb_id].empty]

    outputs = []

    for frame_verb_row in frame_verb_rows:
        # Extract all frame roles and IDs using regex, and store them in a dictionary where the role (e.g., A0) is the key and the value is a list of xml:ids
        role_id_dict = {role: ids.split(';') for role, ids in re.findall(r'(A[0-9A]+):([^;\s]+)', frame_verb_row['frame'])}

        for role_id in role_id_dict:
            id = role_id_dict[role_id][0] 
            # get the complete wordings relating to the id:
            wordings_dict = get_complete_wordings_for_role(id) # TODO: This is still not working properly, but I'm not sure why. I don't understand the complete contents of each data file yet.
            # print('********', id, wordings_dict)
            
        # Find the rows with the extracted role IDs
        role_rows = {role: df.loc[df['xml:id'].isin(ids)] for role, ids in role_id_dict.items()}

        # Generate the output
        frame_verb_lemma, frame_verb_gloss, frame_verb_role = frame_verb_row['lemma'], frame_verb_row['gloss'], 'Process'

        a0_rows = role_rows.get('A0', pd.DataFrame())
        if a0_rows.empty or not 'lemma' in a0_rows:
            '''''' # I'm not sure of the syntax to 'continue/pass' without skipping the rest of the loop... -\_(ツ)_/-
            # 'no a0 rows lemma', a0_rows
        else:
            a0_lemmas, a0_glosses, a0_roles = a0_rows['lemma'].tolist(), a0_rows['gloss'].tolist(), a0_rows['semantic_role_label'].tolist()
            a0_string = f'A0: {", ".join(a0_lemmas)}'
            a0_gloss = f'{", ".join(a0_glosses)}'

        all_other_role_strings = []
        all_other_role_gloss_strings = []
        for role, rows in role_rows.items():
            if role == 'A0':
                continue
            lemmas, glosses, roles = rows['lemma'].tolist(), rows['gloss'].tolist(), rows['semantic_role_label'].tolist()
            role_string = f'[{role}: {", ".join(lemmas)}]'
            role_gloss_string = f'[{roles[0]}: {", ".join(glosses)}]'
            all_other_role_strings.append(role_string)
            all_other_role_gloss_strings.append(role_gloss_string)

        output = output_template.format(
            a0_string=a0_string,
            frame_verb_lemma=frame_verb_lemma,
            all_other_role_strings=" ".join(all_other_role_strings),
            a0_role=a0_roles[0],
            a0_gloss=a0_gloss,
            frame_verb_role=frame_verb_role,
            frame_verb_gloss=frame_verb_gloss,
            all_other_role_gloss_strings=" ".join(all_other_role_gloss_strings)
        )

        outputs.append(output)

    return "\n".join(outputs)

# Example usage:
query_id = 'n41001005008'
output_template = "[{a0_string}] [{frame_verb_lemma}] {all_other_role_strings} / [{a0_role}: {a0_gloss}] [{frame_verb_role}: {frame_verb_gloss}] {all_other_role_gloss_strings}"
result = describe_semantic_configuration(query_id, semantic_role_data, output_template)
print(result)


[A0: χώρα] [ἐκπορεύομαι]  / [Source: region] [Process: were going out] 
[A0: χώρα] [βαπτίζω] [A1: χώρα] / [Source: region] [Process: were being baptized] [Source: region]
[A0: χώρα] [ἐξομολογέω] [A1: ἁμαρτία] / [Source: region] [Process: confessing] [Goal: sins]


### Get Treedown plaintext

In [None]:
# Get the plain treedown representation for a token's sentence

# example endpoint: "https://labs.clear.bible/symphony-dev/api/GNT/Nestle1904/lowfat?usfm-ref=JHN%2014:1" - JHN 14:1

from lxml import etree

def process_element(element, usfm_ref, indent=0):
    treedown_str = ""
    
    if element.get("class") == "cl":
        treedown_str += "\n" + "  " * indent

    if element.get("role"):
        role = element.attrib["role"]
        if role == "adv":
            role = '+'
        treedown_str += "\n" + "  " * indent + role + ": "
        
    # bold the matching token using usfm ref
    if element.tag == "w" and element.get("ref") == usfm_ref:
        treedown_str += "**" + element.text + "**"
        treedown_str += element.attrib.get("after", "") + ' '

    if element.tag == "w" and element.text:
        treedown_str += element.attrib.get("gloss", "") + f'[{element.text}]'
        treedown_str += element.attrib.get("after", "") + ' '

    for child in element:
        treedown_str += process_element(child, usfm_ref, indent + 1)

    return treedown_str


def get_treedown_by_ref(usfm_ref):
    print('Getting treedown for ref', usfm_ref)
    usfm_passage = usfm_ref.split('!')[0]
    endpoint = "https://labs.clear.bible/symphony-dev/api/GNT/Nestle1904/lowfat?usfm-ref=" + usfm_passage
    
    # Note: the response is XML like this:
    '''
    <sentences xml:lang="grc" ref="JHN 14:1">
        <sentence>
        <p>
        <milestone unit="verse" id="JHN 14:1">JHN 14:1</milestone>
        Μὴ ταρασσέσθω ὑμῶν ἡ καρδία·
        </p>
        <wg>
        <wg class="cl" rule="ADV-V-S">
        <w role="adv" ref="JHN 14:1!1" after=" " class="adv" id="n43014001001" lemma="μή" normalized="Μή" strong="3361" gloss="Not" domain="069002" ln="69.3" morph="PRT-N" unicode="Μὴ">Μὴ</w>
        ...
    '''
    text_response = requests.get(endpoint).text
    # print(text_response)
    
    xml = etree.fromstring(text_response.encode('utf-8'))
    # turn xml into simple treedown, with all text on one line, except a new line for <wg class="cl".../> elements, and a new indented line for <w role.../> elements
    
    treedown = process_element(xml, usfm_passage)
    return treedown
# test

get_treedown_by_ref('MAT 3:14')

Getting treedown for ref MAT 3:14


'But[δὲ]  \n        \n          s: -[ὁ]  \n          v: was hindering[διεκώλυεν]  \n          o: Him[αὐτὸν]  \n          \n          +: \n            v: saying[λέγων]  \n            o: \n              \n                s: I[Ἐγὼ]  \n                o: need[χρείαν]  \n                v: have[ἔχω]  \n                \n                +: \n                  +: by[ὑπὸ]  You[σοῦ]  \n                  v: to be baptized[βαπτισθῆναι], and[καὶ]  \n                \n                  s: You[σὺ]  \n                  v: come[ἔρχῃ]  \n                  +: to[πρός]  me[με]; '

### Get OpenText syntax

In [None]:
OPENTEXT_ENDPOINT = "https://ww-network-annotations---macula-atlas-api-qa-25c5xl4maa-uk.a.run.app/graphql/"

opentext_query = """
query TokensWithAllOpenTextSystemValues(
  $wordTokensFilters: WordTokenFilter
  $wordTokensPagination: OffsetPaginationInput
  $annotationInstancesFilters: AnnotationFilter
) {
  wordTokens(filters: $wordTokensFilters, pagination: $wordTokensPagination) {
    ref
    annotationInstances(filters: $annotationInstancesFilters) {
      uri
      feature {
        label
        data
      }
    }
  }
}
"""


def get_opentext_syntax_data(xmlId):
    tokenData = mg.loc[xmlId].to_dict()
    passage = tokenData["ref"].split("!")[0]

    opentext_variables = {
        "wordTokensFilters": {"passageReference": passage},
        "annotationInstancesFilters": {
            "uri": {
                "regex": "https://github.com/OpenText-org/placeholder-data:system-values..+.all"
            }
        },
    }

    opentext_payload = {"query": opentext_query, "variables": opentext_variables}

    response = requests.post(OPENTEXT_ENDPOINT, json=opentext_payload, headers=headers)
    response_data = json.loads(response.text)
    words = response_data["data"]["wordTokens"]

    results = []
    for word in words:
        ref = word["ref"]
        if ref == tokenData["ref"]:
            annotation_features = word["annotationInstances"]
            for feature in annotation_features:
                feature_name = feature["feature"]["label"]
                if feature_name.startswith("$"):
                    feature_description = f'The grammar requires a {feature_name} lemma here'
                else:
                    feature_description = feature["feature"]["data"]["description"]
                    if feature_name.endswith("_tbd"):
                          feature_description += " (Fall back to morphological description of this wording)"
                    results.append(
                    {
                        "feature_name": feature_name,
                        "feature_description": feature_description,
                    }
                )
    # for feature in annotation_features:
    #     feature_name = feature["uri"].split(".")[-1]
    #     feature_description = feature["data"]["description"]
    #     instances = feature["instances"]
    #     results.append(
    #         {
    #             "feature_name": feature_name,
    #             "feature_description": feature_description,
    #             "instances": instances,
    #         }
    #     )

    return results


# test
# get_opentext_syntax_data("MAT 3:14!6")

In [None]:
get_opentext_syntax_data("n57001001001")


[{'feature_name': '#articulation',
  'feature_description': 'The indicate exploits the most general type of indication (i.e. ὁ) as a derivational marker (i.e. "the article"), which makes it possible to indicate an experience as conspicuous while also defining it using some other explicit definition. The resulting indication is expressed by an articular substantive.'},
 {'feature_name': '#predicating',
  'feature_description': 'The move combines a generalization (a predicate) with an explicit introduction of an instantiating entity (a subject). This is the most frequently selected speech act type. Notably, it activates a number of additional modal systems related to interpersonal meaning that are not activated in the simpler speech act types (which are purely ideational).'},
 {'feature_name': 'alertness_tbd',
  'feature_description': 'This tense form has not yet been properly analyzed. (Fall back to morphological description of this wording)'},
 {'feature_name': 'categorized',
  'featur

## Get TFIDF score for string

In [None]:
from collections.abc import Mapping
import numpy
from unidecode import unidecode
from gensim.models.tfidfmodel import TfidfModel
from gensim.corpora import Dictionary
tfidf_dictionary = Dictionary.load('tfidf.dictionary.model')
tfidf_model = TfidfModel().load('tfidf.model')

perseus_stopwords = "μή, ἑαυτοῦ, ἄν, ἀλλ', ἀλλά, ἄλλος, ἀπό, ἄρα, αὐτός, δ', δέ, δή, διά, δαί, δαίς, ἔτι, ἐγώ, ἐκ, ἐμός, ἐν, ἐπί, εἰ, εἰμί, εἴμι, εἰς, γάρ, γε, γα, ἡ, ἤ, καί, κατά, μέν, μετά, μή, ὁ, ὅδε, ὅς, ὅστις, ὅτι, οὕτως, οὗτος, οὔτε, οὖν, οὐδείς, οἱ, οὐ, οὐδέ, οὐκ, περί, πρός, σύ, σύν, τά, τε, τήν, τῆς, τῇ, τι, τί, τις, τίς, τό, τοί, τοιοῦτος, τόν, τούς, τοῦ, τῶν, τῷ, ὑμός, ὑπέρ, ὑπό, ὡς, ὦ, ὥστε, ἐάν, παρά, σός".split(', ')
perseus_stopwords += "συ δ μοι".split(' ')
perseus_stopwords = [unidecode(w) for w in perseus_stopwords]

def tfidf_tokenize(string):
    output = string
    # Filter numeric digits from token
    output = ''.join(filter(lambda x: x.isalpha() or x == ' ', string))
    return [token.lower() for token in output.split() if unidecode(token.lower()) not in perseus_stopwords] # use unidecode to strip accents temporarily

# example
input_text = "Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος."
input_tokens = [w for w in tfidf_tokenize(input_text)]
input_bow = tfidf_dictionary.doc2bow(input_tokens)
input_tfidf = tfidf_model[input_bow]
summary = sorted(input_tfidf, key=lambda x: x[1], reverse=True)[:10]
print('Most significant words in input text: ')
for result in summary:
    id, score = result
    token = tfidf_dictionary[id]
    print(f'{score:.2f}: {token}')
    
def get_tfidf_summary(input_text):
    input_tokens = [w for w in tfidf_tokenize(input_text)]
    input_bow = tfidf_dictionary.doc2bow(input_tokens)
    input_tfidf = tfidf_model[input_bow]
    summary = sorted(input_tfidf, key=lambda x: x[1], reverse=True)[:10]
    output = []
    for result in summary:
        id, score = result
        token = tfidf_dictionary[id]
        output.append((f'{score:.2f}',token))
    return output

print(get_tfidf_summary(input_text))

Most significant words in input text: 
0.56: λόγος
0.56: θεόν
0.48: ἀρχῇ
0.39: θεὸς
[('0.56', 'λόγος'), ('0.56', 'θεόν'), ('0.48', 'ἀρχῇ'), ('0.39', 'θεὸς')]


## Define function to synthesize all available annotations into prose

This function is at the heart of this project. It should accept a token or range of tokens. It should return a string that synthesizes all available annotations for the token or range of tokens. 

If no annotations are available, it should return a generic explanation of the fact that the user should try asking more precisely about a word or phrase, with a verse reference if a specific context is in mind. 

If no passage refs are applicable, then return information about the lemma, lemmas, cultural/encyclopedic, or grammatical phenomena in question.

If an error occurs, return a message to the user that an error occurred and that they should try again with a different query.

In [None]:
# Generate prosaic context function
def generate_prosaic_context(word_id, selected_fields=None):
    # The user may pass in a verse!word ref or an id
    if not '!' in word_id:
        word_data = mg.loc[word_id].to_dict()
        # Get annotations using combined annotations function
        word_ref = word_data['ref']
    else:
        # copy the value of word_id into word_ref (not just the variable reference)
        word_ref = f'{word_id}'
        word_data = mg.loc[mg['ref'] == word_ref].iloc[0].to_dict()
        # word_id will be the 'id' column of word_data
        word_id = word_data['id']
    
    lemma = word_data['lemma']
        
    # print(word_ref)

    if not selected_fields:
        selected_fields = list(attribute_descriptions.keys())

    context_data = {
        "1. Lexical features": ['This data is useful for identifying the meaning of a word across all of its various contexts.'],
        "2. Syntactic context and function": ['This data is useful for identifying the meaning of a word in a specific sentential context.'],
        "3. Discourse context": ['This data is useful for identifying the way a word contributes to a larger discourse context.'],
        "4. Social context": ['This data is useful for giving top-down clarification about what is going on in a passage, which helps narrow down ambiguities in more low-level data.'],
        "5. Cultural/encyclopedic knowledge": ['This data is useful for filling in background historical or cultural information that is not explicit or is assumed in the text.'],
    }

    for key in selected_fields:
        value = word_data.get(key)
        if value not in (None, 'missing', 'nan'):
            if key == "class":
                context_data["1. Lexical features"].append(f"- {key}: {lemma} is a {value},")
            elif key == "gloss":
                context_data["1. Lexical features"].append(f"- {key}: meaning \"{value}.\"")
            elif key == "lemma":
                context_data["1. Lexical features"].append(f"- {key}: The lemma form of this word is {value},")
            elif key == "morph":
                # context_data["1. Lexical features"].append(f"- {key}: and it is parsed as a {value}") # TODO: expand morphological parse codes into prose - although, is this necessary given the other data points?
                pass
            elif key == "strong":
                # context_data["1. Lexical features"].append(f"- {key}: with a Strong's number of {value}.")
                pass
            elif key == "subjref":
                # The id of the implied subject of the verb  - need to retrieve this from the macula greek 'mg' dataframe
                subject_referent_data = mg.loc[value].to_dict()
                # get the 'text' 'gloss' 'lemma' and semantic role label
                subject_referent_text = subject_referent_data['text']
                subject_referent_gloss = subject_referent_data['gloss']
                subject_referent_lemma = subject_referent_data['lemma']
                # subject_referent_semantic_role_label, complete_wordings = semantic_role_data.loc[semantic_role_data['xml:id'] == value].iloc[0]['semantic_role_label'], get_complete_wordings_for_role(value)
                # for wording in complete_wordings:
                #     complete_wordings_array = []
                #     if wording['forms'] != subject_referent_text:
                #         complete_wordings_array.append(' '.join(wording['forms']))
                #         break
                #     else:
                #         complete_wordings = None
                # complete_wordings_string = ', '.join(complete_wordings_array) if complete_wordings_array else None
                
                context_data["2. Syntactic context and function"].append(f"- Subject referent: {attribute_descriptions[key]}: {subject_referent_text} ({subject_referent_gloss}, lemma {subject_referent_lemma}) is the subject of the verb.") # and it plays the role of {subject_referent_semantic_role_label}")
                # if complete_wordings_string:
                #     context_data["2. Syntactic context and function"].append(f" (complete wording: {complete_wordings_string})")
                # context_data["2. Syntactic context and function"].append(f" in its semantic configuration.")
            elif key == "referent":
                # The id of the (usually pronominal) referent - need to retrieve this from the macula greek 'mg' dataframe
                referent_data = mg.loc[value].to_dict()
                # get the 'text' 'gloss' 'lemma' and semantic role label
                referent_text = referent_data['text']
                referent_gloss = referent_data['gloss']
                referent_lemma = referent_data['lemma']
                # referent_semantic_role_label, complete_wordings = semantic_role_data.loc[semantic_role_data['xml:id'] == value].iloc[0]['semantic_role_label'], get_complete_wordings_for_role(value)
                # for wording in complete_wordings:
                #     complete_wordings_array = []
                #     if wording['forms'] != referent_text:
                #         complete_wordings_array.append(' '.join(wording['forms']))
                #         break
                #     else:
                #         complete_wordings = None
                # complete_wordings_string = ', '.join(complete_wordings_array) if complete_wordings_array else None
                
                context_data["2. Syntactic context and function"].append(f"- Referent: {attribute_descriptions[key]}: {referent_text} ({referent_gloss}, lemma {referent_lemma}) is the referent of the pronoun") # playing the role of {referent_semantic_role_label}")
                # if complete_wordings_string:
                #     context_data["2. Syntactic context and function"].append(f" (complete wording: {complete_wordings_string})")
                # context_data["2. Syntactic context and function"].append(f" in its semantic configuration.")
            elif key in ("person", "number", "gender", "case", "tense", "voice", "mood", "degree", "type"):
                context_data["2. Syntactic context and function"].append(f"- {key}: {attribute_descriptions[key]}: {value},")
            elif key in ("ln"):
                context_data["5. Cultural/encyclopedic knowledge"].append(f"- {key}: {attribute_descriptions[key]}: {value},")
            elif key in ("domain_label"):
                context_data["5. Cultural/encyclopedic knowledge"].append(f"- {key} (relates to the general subject matter if the lemma): {attribute_descriptions[key]}: {domain_labels[value]},")

    # Add semantic role data to syntactic context: 'this word is the {semantic_role} in the configuration {semantic_configuration}' 
    semantic_configuration = describe_semantic_configuration(word_id, semantic_role_data, output_template) # output_template was defined above
    # Find the matching row from semantic_role_data, and get all of the labels plus the column names
    role_data_row = semantic_role_data.loc[semantic_role_data['xml:id'] == word_id].iloc[0]
    semantic_role_info = {key: value for key, value in role_data_row.items() if not value == ''}
    if semantic_role_info.get('semantic_role_label'):
        semantic_role = semantic_role_info['semantic_role_label']    
        context_data["2. Syntactic context and function"].append(f"- Semantic configuration (useful for figuring out what is taking place in the sentence and how this word plays a role): This word is the {semantic_role_info['semantic_role_label']} {'in `' + semantic_configuration + '`' if semantic_configuration else ''}, and it has the following data: {semantic_role_info}")
    # TODO: need to exploit the roles.json file in order to get all related wordings for each frame. 
    
    
    # Add opentext syntax data to syntactic context
    opentext_syntax_data = get_opentext_syntax_data(word_id)
    if opentext_syntax_data:
        context_data["2. Syntactic context and function"].append(f"- OpenText syntax (useful for identifying all of the grammatical choices that led up to this word, such as whether it is part of a derived 'entity' definition or a nested 'turn' or a particular kind of speech act):\n  This word has the following syntactic selection features:")
        for feature in opentext_syntax_data:
            context_data["2. Syntactic context and function"].append(f"  - {feature['feature_name']}: {feature['feature_description']}")
    
    # Add treedown syntax data
    treedown_data = get_treedown_by_ref(word_ref)
    if treedown_data:
        context_data["2. Syntactic context and function"].append(f"- Treedown syntax: This word is part of the following sentence:\n{treedown_data}")
    
    # Add Levinsohn discourse features
    discourse_features = get_discourse_annotation_types(word_id)
    if discourse_features:
        context_data["3. Discourse context"].append(f"This word functions within {len(discourse_features)} discourse features (these are useful heuristic interpretive annotations that tell you about the nature of the proposition a word is in):")
        for feature in discourse_features:
            context_data["3. Discourse context"].append(f"- {feature} is defined as {discourse_types[feature]['description']}")

    speaker_information = get_speaker_quotation_data(word_ref)
    """
    Speaker information is an array of speakers, like this:
    [{'who_is_speaking': 'John the Baptist',
        'delivery_tone': 'humble',
        'what_is_said_truncated': 'ἐγὼ χρείαν ἔχω ὑπὸ σοῦ βαπτισθῆναι καὶ σὺ ἔρχῃ πρός...',
        'what_is_said_complete': 'ἐγὼ χρείαν ἔχω ὑπὸ σοῦ βαπτισθῆναι καὶ σὺ ἔρχῃ πρός με',
        'CharacterId': 'John the Baptist',
        'MaxSpeakers': 1, # this represents the number of speakers who are speaking at the same time (up to n)
        'Gender': 'Male', # this value, if present, can inform the pronouns used in the prosaic description below
        'LouwNida': ['93.190a',
        '93.190b',
        '93.190e',
        '93.190f',
        '93.190d',
        '93.190c']}]
        
    With possible additional values for the speaker, like this:
    'Age', # this value, if present, can be appended to the parenthetical content after the speaker's name (e.g. "John the Baptist ({Age} old)"
    'Comment', # this value, such as an alternate name for the speaker, can be appended to the parenthetical content after the speaker's name (e.g. "John the Baptist ({Age} old, {Comment})"
    'SDBH', # not used, available in macula greek database, but potentially useful for profiling the subject matter of the speech
    'LouwNida', # not used, available in macula greek database, but potentially useful for profiling the subject matter of the speech
    'FCBHCharacter', # not used, alternate id
    'Divinity' # will be 'Y' if present, otherwise not present. This value can be used to determine whether to use "he" or "He" in the prosaic description below, with "He" used for divinities.
    """
    if speaker_information:
        context_data["3. Discourse context"].append(f"Speaker data is critical to identifying quoted material and relating it to the proper speaker.")
        if len(speaker_information) == 1:
            speaker = speaker_information[0]
            context_data["3. Discourse context"].append(f"This word is spoken by {speaker['who_is_speaking']}")
            if speaker.get("Divinity") == "Y":
                context_data["3. Discourse context"].append(f"- {speaker['who_is_speaking']} is a divinity")
            if speaker.get("Age"):
                context_data["3. Discourse context"].append(f", age: {speaker['Age']}")
            if speaker.get("Comment"):
                context_data["3. Discourse context"].append(f" ({speaker['Comment']})")
            speech = speaker.get("what_is_said_complete")
            if len(speech) > 100:
                speech = speaker.get("what_is_said_truncated")
            context_data["3. Discourse context"].append(f", who says (in a {speaker['delivery_tone']} tone), \"{speech}\"")
                
        else:
            context_data["3. Discourse context"].append(f"This word is spoken by {len(speaker_information)} speaker(s): ")
            for speaker in speaker_information:
                context_data["3. Discourse context"].append(f"This word is spoken by {speaker['who_is_speaking']}")
                if speaker.get("Divinity") == "Y":
                    context_data["3. Discourse context"].append(f"- {speaker['who_is_speaking']} is a divinity")
                if speaker.get("Age"):
                    context_data["3. Discourse context"].append(f", {speaker['Age']} years old")
                if speaker.get("Comment"):
                    context_data["3. Discourse context"].append(f" ({speaker['Comment']})")
                speech = speaker.get("what_is_said_complete")
                if len(speech) > 100:
                    speech = speaker.get("what_is_said_truncated")
                context_data["3. Discourse context"].append(f", who says (in a {speaker['Delivery']} tone), \"{speech}\"")

    lookup = situations_lookup_json
    situation_data = get_situations_data(word_ref)
    # if situation_data and situation_data.get('matchingSituation'):
    #     situation_data = situation_data['matchingSituation']
    if situation_data:
        pre_text_features = situation_data['preTextFeatures']
        via_text_features = situation_data['viaTextFeatures']

        pre_text_feature_descriptions, pre_text_system_descriptions = process_features(lookup, pre_text_features)
        via_text_feature_descriptions, via_text_system_descriptions = process_features(lookup, via_text_features)

        mutations = generate_mutations(pre_text_features, via_text_features, lookup)
        print(mutations)

        context_data["4. Social context"].append(f"This word is part of the passage '{situation_data['title'][0]}'")

        # Add situation type information
        if situation_data.get('type'):
            situation_type = situation_data['type']
            context_data["4. Social context"].append(f"This passage is a {situation_type['title']} situation, which can be described in typical terms as follows: {situation_type['description']}")

        context_data["4. Social context"].append(f"It begins as a {' '.join(pre_text_features)} situation")
        context_data["4. Social context"].extend(pre_text_feature_descriptions)
        # context_data["4. Social context"].extend(pre_text_system_descriptions)

        # context_data["4. Social context"].append(f"And ends as a {' '.join(via_text_features)} situation")
        # context_data["4. Social context"].extend(via_text_feature_descriptions)
        # context_data["4. Social context"].extend(via_text_system_descriptions)

        if mutations and mutations[-1] != '':
            context_data["4. Social context"].append("During the passage, the situation:")
            context_data["4. Social context"].extend('\n'.join(mutations))
    output_lines = []
    for header, sentences in context_data.items():
        if sentences:
            output_lines.append(f"## {header}\n")
            output_lines.append("\n".join(sentences))
            output_lines.append("\n")

    prosaic_context = "".join(output_lines)
    # print(prosaic_context)
    return prosaic_context


In [None]:
# generate_prosaic_context('n40001020001')

In [None]:
generate_prosaic_context('MAT 1:5!4')

Getting treedown for ref MAT 1:5!4
MAT 1:5!4
no token_refs for situation 02-15
no token_refs for situation 02-16
['']




## Specific prompt expansion functions

In [None]:
def get_lexical_information(word_id):
    # Retrieve and return lexical prose only for the given word
    word_data = mg.loc[word_id].to_dict()
    lemma = word_data['lemma']
    text = word_data['text']
    output = f"Lexical information for {lemma}:\n"
    for key, value in word_data.items():
        if value not in (None, 'missing', 'nan'):
            if key == "class":
                output += f"- {key}: {text} is a {value},\n"
            elif key == "gloss":
                output += f"- {key}: meaning \"{value}.\"\n"
            elif key == "lemma":
                output += f"- {key}: The lemma form of this word is {value},\n"
            elif key == "morph":
                # output += f"- {key}: and it is parsed as a {value}\n" # TODO: expand morphological parse codes into prose - although, is this necessary given the other data points?
                pass
            elif key == "strong":
                # output += f"- {key}: with a Strong's number of {value}\n"
                pass
            elif key in ("ln"):
                # output += f"- {key}: {attribute_descriptions[key]}: {value}\n"
                pass
            elif key in ("domain_label"):
                # output += f"- {key} (relates to the general subject matter if the lemma): {attribute_descriptions[key]}: {domain_labels[value]}\n"
                pass
    
    return output

def get_tfidf_filtered_lexical_information(token_ids=None):
    print('getting token data for token ids', token_ids)
    # Retrieve and return TFIDF-filtered lexical information for the given passage
    tokens = [mg.loc[token_id].to_dict() for token_id in token_ids]
    
    passage_string = ' '.join([token['text'] for token in tokens])
    most_significant_token_tuples_in_passage = get_tfidf_summary(passage_string) # returns an array of (score,token) tuples
    print(most_significant_token_tuples_in_passage)
    
    tokens_data = []
    for score, token in most_significant_token_tuples_in_passage:
        token_data = mg.loc[mg['text'].apply(lambda x: x.lower()) == token].iloc[0].to_dict()
        token_data['tfidf_score'] = score
        tokens_data.append(token_data)
    
    return tokens_data

def get_syntactic_information(word_id):
    # Retrieve and return syntactic information for the given word
    word_data = mg.loc[word_id].to_dict()
    word_ref = word_data['ref']
    lemma = word_data['lemma']
    text = word_data['text']
    output = f"Syntactic information for {lemma}:\n"
    for key, value in word_data.items():
        if value not in (None, 'missing', 'nan'):
            if key == "subjref":
                # The id of the implied subject of the verb  - need to retrieve this from the macula greek 'mg' dataframe
                subject_referent_data = mg.loc[value].to_dict()
                # get the 'text' 'gloss' 'lemma' and semantic role label
                subject_referent_text = subject_referent_data['text']
                subject_referent_gloss = subject_referent_data['gloss']
                subject_referent_lemma = subject_referent_data['lemma']
                # subject_referent_semantic_role_label, complete_wordings = semantic_role_data.loc[semantic_role_data['xml:id'] == value].iloc[0]['semantic_role_label'], get_complete_wordings_for_role(value)
                # for wording in complete_wordings:
                #     complete_wordings_array = []
                #     if wording['forms'] != subject_referent_text:
                #         complete_wordings_array.append(' '.join(wording['forms']))
                #         break
                #     else:
                #         complete_wordings = None
                # complete_wordings_string = ', '.join(complete_wordings_array) if complete_wordings_array else None
                
                output += f"- Subject referent: {attribute_descriptions[key]}: {subject_referent_text} ({subject_referent_gloss}, lemma {subject_referent_lemma}) is the subject of the verb.\n" # and it plays the role of {subject_referent_semantic_role_label}")
                # if complete_wordings_string:
                #     output += f" (complete wording: {complete_wordings_string})\n"
                # output += f" in its semantic configuration.\n"
            elif key == "referent":
                # The id of the (usually pronominal) referent - need to retrieve this from the macula greek 'mg' dataframe
                referent_data = mg.loc[value].to_dict()
                # get the 'text' 'gloss' 'lemma' and semantic role label
                referent_text = referent_data['text']
                referent_gloss = referent_data['gloss']
                referent_lemma = referent_data['lemma']
                # referent_semantic_role_label, complete_wordings = semantic_role_data.loc[semantic_role_data['xml:id'] == value].iloc[0]['semantic_role_label'], get_complete_wordings_for_role(value)
                # for wording in complete_wordings:
                #     complete_wordings_array = []
                #     if wording['forms'] != referent_text:
                #         complete_wordings_array.append(' '.join(wording['forms']))
                #         break
                #     else:
                #         complete_wordings = None
                # complete_wordings_string = ', '.join(complete_wordings_array) if complete_wordings_array else None
                
                output += f"- Referent: {attribute_descriptions[key]}: {referent_text} ({referent_gloss}, lemma {referent_lemma}) is the referent of the pronoun.\n" # playing the role of {referent_semantic_role_label}\n"
                # if complete_wordings_string:
                #     output += f" (complete wording: {complete_wordings_string})\n"
                # output += f" in its semantic configuration.\n"
            elif key in ("person", "number", "gender", "case", "tense", "voice", "mood", "degree", "type"):
                output += f"- {key}: {attribute_descriptions[key]}: {value},\n"
     
    # Add semantic role data to syntactic context: 'this word is the {semantic_role} in the configuration {semantic_configuration}' 
    semantic_configuration = describe_semantic_configuration(word_id, semantic_role_data, output_template) # output_template was defined above
    # Find the matching row from semantic_role_data, and get all of the labels plus the column names
    role_data_row = semantic_role_data.loc[semantic_role_data['xml:id'] == word_id].iloc[0]
    semantic_role_info = {key: value for key, value in role_data_row.items() if not value == ''}
    if semantic_role_info.get('semantic_role_label'):
        semantic_role = semantic_role_info['semantic_role_label']    
        output += f"- Semantic configuration (useful for figuring out what is taking place in the sentence and how this word plays a role): This word is the {semantic_role} {'in `' + semantic_configuration + '`' if semantic_configuration else ''}, and it has the following data: {semantic_role_info}\n"
    # TODO: need to exploit the roles.json file in order to get all related wordings for each frame. 
    
    
    # Add opentext syntax data to syntactic context
    # opentext_syntax_data = get_opentext_syntax_data(word_id)
    # if opentext_syntax_data:
    #     output += f"- OpenText syntax (useful for identifying all of the grammatical choices that led up to this word, such as whether it is part of a derived 'entity' definition or a nested 'turn' or a particular kind of speech act):\n  This word has the following syntactic selection features:\n"
    #     for feature in opentext_syntax_data:
    #         output += f"  - {feature['feature_name']}: {feature['feature_description']}\n"
    
    # Add treedown syntax data
    treedown_data = get_treedown_by_ref(word_ref)
    if treedown_data:
        output += f"- Treedown syntax: This word is part of the following sentence:\n{treedown_data}\n"
    
    return output

# def get_syntactic_information_for_verse(verse_ref):
    

def get_discourse_information(word_id):
    # Retrieve and return discourse information for the given word
    output = ''
    word_data = mg.loc[word_id].to_dict()
    word_ref = word_data['ref']
    
    discourse_features = get_discourse_annotation_types(word_id)
    if discourse_features:
        output += f"This word functions within {len(discourse_features)} discourse features (these are useful heuristic interpretive annotations that tell you about the nature of the proposition a word is in):\n"
        for feature in discourse_features:
            output += f"- {feature} is defined as {discourse_types[feature]['description']}\n"
    
    speaker_information = get_speaker_quotation_data(word_ref)
    if speaker_information:
        output += f"\nSpeaker data is critical to identifying quoted material and relating it to the proper speaker.\n"
        if len(speaker_information) == 1:
            speaker = speaker_information[0]
            output += f"This word is spoken by {speaker['who_is_speaking']}"
            if speaker.get("Divinity") == "Y":
                output += f"- {speaker['who_is_speaking']} is a divinity"
            if speaker.get("Age"):
                output += f", age: {speaker['Age']}"
            if speaker.get("Comment"):
                output += f" ({speaker['Comment']})"
            if len(speech) > 100:
                speech = speaker.get("what_is_said_truncated")
            else:
                speech = speaker.get("what_is_said_complete")
            output += f", who says (in a {speaker['delivery_tone']} tone), \"{speech}\"\n"
        
        else:
            output += f"This word is spoken by {len(speaker_information)} speaker(s): \n"
            for speaker in speaker_information:
                output += f"This word is spoken by {speaker['who_is_speaking']}"
                if speaker.get("Divinity") == "Y":
                    output += f"- {speaker['who_is_speaking']} is a divinity"
                if speaker.get("Age"):
                    output += f", {speaker['Age']} years old"
                if speaker.get("Comment"):
                    output += f" ({speaker['Comment']})"
                if len(speech) > 100:
                    speech = speaker.get("what_is_said_truncated")
                else:
                    speech = speaker.get("what_is_said_complete")
                output += f", who says (in a {speaker['Delivery']} tone), \"{speech}\"\n"
    return output

def get_social_information(word_id):
    # Retrieve and return social information for the given word
    word_data = mg.loc[word_id].to_dict()
    word_ref = word_data['ref']
    
    lookup = situations_lookup_json
    situation_data = get_situations_data(word_ref)
    # if situation_data and situation_data.get('matchingSituation'):
    #     situation_data = situation_data['matchingSituation']
    if situation_data:
        pre_text_features = situation_data['preTextFeatures']
        via_text_features = situation_data['viaTextFeatures']
        
        if pre_text_features == 'no embedded discourse':
            return None

        pre_text_feature_descriptions, pre_text_system_descriptions = process_features(lookup, pre_text_features)
        via_text_feature_descriptions, via_text_system_descriptions = process_features(lookup, via_text_features)

        mutations = generate_mutations(pre_text_features, via_text_features, lookup)
        print(mutations)

        output = f"This word is part of the passage '{situation_data['title'][0]}'\n"

        # Add situation type information
        if situation_data.get('type'):
            situation_type = situation_data['type']
            output += f"This passage is a {situation_type['title']} situation, which can be described in typical terms as follows: {situation_type['description']}\n"

        output += f"It begins as a {' '.join(pre_text_features)} situation\n"
        output += '\n'.join(pre_text_feature_descriptions)
        # output += '\n'.join(pre_text_system_descriptions)

        # output += f"And ends as a {' '.join(via_text_features)} situation\n"
        # output += '\n'.join(via_text_feature_descriptions)
        # output += '\n'.join(via_text_system_descriptions)

        if mutations and mutations[-1] != '':
            output += "During the passage, the situation:\n"
            output += '\n'.join(mutations)
    
    return output

def get_cultural_information(word_id):
    # Retrieve and return cultural information for the given word
    word_data = mg.loc[word_id].to_dict()
    lemma = word_data['lemma']
    text = word_data['text']
    output = f"Cultural information for {lemma}:\n"
    
    ln_data = word_data['ln']
    domain_data = word_data['domain_label']
    if not(domain_data):
        print('no domain data for word!', word_data)
    
    if domain_data not in (None, 'missing', 'nan'):
        domain_string = '; '.join(domain_data)
        output += f"- Domain label (relates to the general subject matter if the lemma): {domain_string},\n"
    if ln_data not in (None, 'missing', 'nan'):
        output += f"- Louw and Nida domain: {attribute_descriptions['ln']}: {ln_data},\n"
                
    return output

def get_context_for_word(word_id, selected_fields=['lexis']):
    output = ''
    # Generate and return the prosaic context for the given word
    for field in selected_fields:
        if field not in ('lexis', 'syntax', 'discourse', 'social', 'cultural'):
            raise ValueError(f"Invalid field name '{field}'")
        elif field == 'lexis':
            lexical_data = get_lexical_information(word_id)
            if lexical_data:
                output += lexical_data
        elif field == 'syntax':
            syntax_data = get_syntactic_information(word_id)
            if syntax_data:
                output += syntax_data
        elif field == 'discourse':
            discourse_data = get_discourse_information(word_id)
            if discourse_data:
                output += discourse_data
        elif field == 'social':
            social_data = get_social_information(word_id)
            if social_data:
                output += social_data
        elif field == 'cultural':
            cultural_data = get_cultural_information(word_id)
            if cultural_data:
                output += cultural_data
    return output
            
def get_context_for_verse(verse_ref):
    # Generate and return the prosaic context for the given verse
    if '!' in verse_ref:
        # If a word ref gets passed in, just get all of the words for the word's verse
        verse_tokens = mg.loc[mg['book_chapter_verse'] == verse_ref.split('!')[0]].to_dict('records')
    else:
        # If a verse ref gets passed in, get all of the words for the verse
        verse_tokens = mg.loc[mg['book_chapter_verse'] == verse_ref].to_dict('records')
    
    token_ids = [token['id'] for token in verse_tokens]
    print(token_ids)
    # Get the most distinctive words in the verse
    most_significant_tokens = get_tfidf_filtered_lexical_information(token_ids)
    print('most_significant_tokens in verse', len(most_significant_tokens), f'of {len(token_ids)} total tokens')
    
    word_data = []
    for token in most_significant_tokens:
        print('Processing token', token['text'])
        # Get lexical and cultural context for those words
        word_data.append(get_context_for_word(token['id']))
    
        
    # Get discourse context for first word in verse
    word_data.append(get_context_for_word(most_significant_tokens[0]['id'], selected_fields=['discourse']))
    # Get syntactic context for first word in verse
    word_data.append(get_context_for_word(most_significant_tokens[0]['id'], selected_fields=['syntax']))
    
    # Get social context for verse using first word
    word_data.append(get_context_for_word(most_significant_tokens[0]['id'], selected_fields=['social']))
    
    
    
    # filter out duplicate lines in word data
    word_data = list(set(word_data))
    
    return '\n'.join(word_data)
    
        
def get_context_for_chapter(chapter):
    # Generate and return the prosaic context for the given chapter
    pass

def get_context_for_book(book):
    # Generate and return the prosaic context for the given book
    pass

def get_context_for_pericope(pericope):
    # Generate and return the prosaic context for the given pericope
    pass


In [None]:
print(get_context_for_verse('MAT 1:5!3'))

['n40001005001', 'n40001005002', 'n40001005003', 'n40001005004', 'n40001005005', 'n40001005006', 'n40001005007', 'n40001005008', 'n40001005009', 'n40001005010', 'n40001005011', 'n40001005012', 'n40001005013', 'n40001005014', 'n40001005015', 'n40001005016', 'n40001005017', 'n40001005018', 'n40001005019', 'n40001005020', 'n40001005021']
getting token data for token ids ['n40001005001', 'n40001005002', 'n40001005003', 'n40001005004', 'n40001005005', 'n40001005006', 'n40001005007', 'n40001005008', 'n40001005009', 'n40001005010', 'n40001005011', 'n40001005012', 'n40001005013', 'n40001005014', 'n40001005015', 'n40001005016', 'n40001005017', 'n40001005018', 'n40001005019', 'n40001005020', 'n40001005021']
[('0.68', 'ἰωβὴδ'), ('0.41', 'ἐγέννησεν'), ('0.32', 'ῥαχάβ'), ('0.29', 'σαλμὼν'), ('0.27', 'βόες'), ('0.26', 'ῥούθ'), ('0.22', 'ἰεσσαί')]
most_significant_tokens in verse 7 of 21 total tokens
Processing token Ἰωβὴδ
Processing token ἐγέννησεν
Processing token Ῥαχάβ
Processing token Σαλμὼν
Proc

In [None]:
# get_tfidf_filtered_lexical_information(token_ids=['n40001020001', 'n40001020002', 'n40001020003', 'n40001020004'])

In [None]:
# print(get_lexical_information('n40001020001'))

In [None]:
# print(get_syntactic_information('n40001020001'))

In [None]:
# print(get_discourse_information('n41010020001'))

In [None]:
# print(get_social_information('n43010020001'))

In [None]:
# print(get_cultural_information('n43010020001'))

# Question answering pipeline

Example questions:

In MAT 3:15, it says, "Jesus said to him", but who is the "him" (αὐτῷ) referring to?
What is the social context of this passage?

## Build Gradio UI for question answering pipeline

In [None]:
def get_contextual_data(tokenId):
    prosaic_context = generate_prosaic_context(tokenId)
    return prosaic_context

def generate_answer(prompt, model, tokenizer, temperature=0.7, max_length=100):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = model.generate(input_ids, max_length=max_length, temperature=temperature, do_sample=True)
    return tokenizer.decode(output[0], skip_special_tokens=True)

def answer_question(question, context, temperature=0.7):
    prompt = f"Context for question: {context}\nQuestion: {question}\nAnswer to question: "
    answer = generate_answer(prompt, model, tokenizer, temperature)
    return answer

def gradio_wrapper(inputs, context, temperature=0.7):
    question = inputs
    answer = answer_question(question, context, temperature)
    # deu_translation, spa_translation, fra_translation = translate_text(answer)
    return answer, # deu_translation, spa_translation, fra_translation


In [None]:
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
# import gradio as gr
# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer

# # Load the translation model and tokenizer
# # print('Loading translation model and tokenizer...')
# # translation_model_name = "facebook/nllb-200-distilled-600M"
# # translation_model = AutoModelForSeq2SeqLM.from_pretrained(
# #     translation_model_name)
# # translation_tokenizer = AutoTokenizer.from_pretrained(translation_model_name)

# # # Instantiate the translation pipelines
# # deu_translator = pipeline('translation', model=translation_model,
# #                           tokenizer=translation_tokenizer, src_lang="en", tgt_lang="de")
# # spa_translator = pipeline('translation', model=translation_model,
# #                           tokenizer=translation_tokenizer, src_lang="en", tgt_lang="es")
# # fra_translator = pipeline('translation', model=translation_model,
# #                           tokenizer=translation_tokenizer, src_lang="en", tgt_lang="fr")

# # Load the model and tokenizer
# print('Loading inference model and tokenizer...')
# # model_name = "bigscience/bloom-7b1"
# # model_name = "decapoda-research/llama-7b-hf"
# # model_name = "mosaicml/mpt-1b-redpajama-200b-dolly"
# # model_name = 'chtan/gpt4-alpaca-lora_mlp-7b'
# model_name = "databricks/dolly-v2-3b"

# # model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
# # tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# qa_pipeline = pipeline('text-generation',
#                        model=model_name, tokenizer=model_name, trust_remote_code=True)
# # qa_pipeline.to('mps')
# # pipeline.enable_attention_slicing()

In [None]:
# demo = gr.Blocks()

# with demo:
#     gr.Markdown("Generate insights about a given word using MACULA data")
#     # with gr.Row():
#     #     verse_reference = gr.Textbox(lines=1, label="Verse Reference", placeholder="Enter verse reference here (e.g. 'JHN 3:16')", value="JHN 3:16")
#     #     text_output = gr.Textbox(placeholder="Verse data will appear here", max_lines=10)
#     # get_verse_content_button = gr.Button("Get Verse Content")

#     # get_verse_content_button.click(
#     #     gradio_get_verse_content, inputs=verse_reference, outputs=text_output)

#     word_id_input = gr.Textbox(lines=1, label="Word ID", placeholder="Enter word ID here (e.g. 'n40003014006')", value="n40003014006")
#     context_output = gr.Textbox(placeholder="Context will appear here", max_lines=10)
    
#     get_context_button = gr.Button("Get Context")
    
#     get_context_button.click(
#         get_contextual_data, inputs=word_id_input, outputs=context_output)
    
#     # Temperature slider. Note: Slider.__init__() takes from 1 to 4 positional arguments
#     temperature_slider = gr.Slider(minimum=0.1, maximum=1.0, default=0.7, step=0.1, label="Temperature")
    
#     question_input = gr.Textbox(lines=2, label="Question", placeholder="Enter question here", value="Who is 'I' in this passage, and what is going on here between him and whoever he's speaking to?")
#     answer_output = gr.Textbox(placeholder="Answer will appear here", max_lines=10)
    
#     get_answer_button = gr.Button("Get Answer")
    
#     get_answer_button.click(
#         gradio_wrapper, inputs=[question_input, context_output, temperature_slider], outputs=answer_output)
    
# demo.launch()

In [None]:
import getpass
secret_key = getpass.getpass('Enter OpenAI secret key: ')

In [None]:
os.environ['OPENAI_API_KEY'] = secret_key

## Non-chat completion of questions

In [None]:
import openai

# def generate_autocomplete(
#         text,
#         summary_context=None,
#         full_context=None,
#         context_selection="Summary",
#         prompt_template="## Context:\n{context}\n\n## Question:\n{user_text}\n\n## Answer based on context:",
#     ):
#         if "Summarized" in context_selection:
#             context = summary_context
#         elif "Full" in context_selection:
#             context = full_context
#         else:
#             context = None
#         if not context:
#             prompt = f"## Based on the context, complete the following:\n{text}"
#         else:
#             prompt = prompt_template.format(
#                 context=context, user_text=text
#             )
#         print("+++/ Prompt:\n", prompt, "\n/+++")
#         response = openai.Completion.create(
#             # engine="text-davinci-003",
#             engine="gpt-3.5-turbo",
#             prompt=prompt,
#             # max_tokens=50,
#             n=1,
#             stop=None,
#             temperature=0.8,
#         )
#         autocomplete = response.choices[0].text.strip()
#         return autocomplete
    
# def ada_recursive_summary(
#         context,
#     ):
#         print("+++ Context:\n", context, "\n+++")
#         prompt = f"## Context: {context}\n\n## Summary of the most significant contextual information about this word's syntax, semantics, social, and cultural contexts (does not include exhaustive situational description):"
#         response = openai.Completion.create(
#             # engine="text-ada-001",
#             # engine="text-curie-001",
#             # engine="text-davinci-003",
#             engine="gpt-3.5-turbo",
#             prompt=prompt,
#             temperature=0.6,
#             top_p=1,
#             frequency_penalty=0.0,
#             presence_penalty=0.6,
#             # max_tokens=350,
#             n=1,
#             stop=None,
#         )
#         summary = response.choices[0].text.strip()
#         # print("+++ Summary:\n", summary, "\n+++")
#         return summary

## Chat-model completions

In [None]:
import openai

def ada_recursive_summary(context, temperature):
    print("+++ Context:\n", context, "\n+++")

    messages = [
        {
            "role": "system",
            "content": "You are a language model trained in linguistics, and you are great at summarizing structured data while focusing on linguistic features without delving into theological issues. Please provide a concise summary of the given structured data, focusing on the key lexical, syntactic, discourse, and social context features that would be helpful in answering qualitative questions: Your summaries do not include exhaustive situational descriptions.",
        },
        {"role": "user", "content": f"Summarize the following context: {context}"},
    ]

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=temperature,
        top_p=1,
        frequency_penalty=0.0,
        presence_penalty=0.6,
        n=1,
        stop=None,
    )
    summary = response.choices[0].message.content.strip()
    return summary

def generate_autocomplete(
    text,
    summary_context=None,
    full_context=None,
    context_selection="Summary",
    temperature=0,
):
    if "Summarized" in context_selection:
        context = summary_context
    elif "Full" in context_selection:
        context = full_context
    else:
        context = None

    messages = []
    if context:
        messages.append(
            {"role": "system", "content": f"You are a language model trained in linguistics, and you are great at answering qualitative questions based on structured data while focusing on evaluative, exegetical, and interpretive issues without delving into theological matters. Here is the context: {context}"}
        )

    messages.append({"role": "user", "content": f"Based on this data, please answer the following qualitative question: {text}"})

    print("+++/ Messages:\n", messages, "\n/+++")

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        n=1,
        stop=None,
        temperature=temperature,
    )
    autocomplete = response.choices[0].message.content.strip()
    return autocomplete

"""

messages = [
{
"role": "system",
"content": "You are a language model trained in linguistics, and you are great at summarizing structured data while focusing on linguistic features without delving into theological issues. Please provide a concise textual commentary on the given {linguistic_data} (use the treedown representation for the larger text context of the target word) by examining its key lexical choices, syntactic structures, discourse organization, social context, and cultural references. Avoid personal opinions and maintain objectivity. Illuminate the passage's nuances, foster clarity, and establish connections within the work, empowering readers to grasp the author's intentions and the interplay between language and content.",
}
]

Add in this paragraph:

A strong textual commentary utilizes linguistic data to provide a detailed analysis of a passage, enhancing comprehension and interpretation. It examines lexical choices, syntactic structures, discourse organization, social context, and cultural information. By focusing on the text itself, it uncovers how language conveys meaning, while maintaining objectivity and avoiding personal opinions and speculative theological claims. This approach illuminates the passage's nuances, fosters clarity, and establishes connections within the work. It empowers readers to grasp the author's intentions, the text's impact, and the interplay between language and content.
"""

def generate_commentary_article_of_passage(linguistic_data, temperature=0.7):
    messages = [
        {
            "role": "system",
            "content": (
                "You are a language model trained in linguistics," 
                " and you are great at summarizing structured data while"
                " focusing on linguistic features without delving into theological issues."
                " Please provide a concise textual commentary on the given {linguistic_data}"
                " (use the treedown representation for the larger text context of the target word)"
                " by examining its key lexical choices, syntactic structures, discourse organization,"
                " social context, and cultural references. Avoid personal opinions and maintain objectivity."
                " Illuminate the passage's nuances, foster clarity, and establish connections within the work,"
                " empowering readers to grasp the author's intentions and the interplay between language and content."
                " Please format your output using the following headings:"
                " 1. Lexical features"
                " 2. Syntactic context and function"
                " 3. Discourse context"
                " 4. Social context"
                " 5. Cultural/encyclopedic knowledge"
                
                ).format(linguistic_data=linguistic_data),
        }
    ]

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        n=1,
        stop=None,
        temperature=temperature,
    )
    
    commentary = response.choices[0].message.content.strip()
    return commentary

def generate_comprehension_questions_from_context(context, temperature=0.7):
    messages = [
        {
            "role": "system",
            "content": (
                "Please provide a set of comprehension questions and answers for the passage based on the given {context}.\n\nFocus on how the linguistics data drives correct comprehension and do not include questions that are not directly related to the linguistics data. Your output should be formatted like this:\n\nQuestion: [question here]\nAnswer: [answer here]\n\n"
                ).format(context=context),
        }
    ]

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        n=1,
        stop=None,
        temperature=temperature,
    )
    
    commentary = response.choices[0].message.content.strip()
    return commentary

# Generate translation directives:
"""
Creating source-language translation directives from the dataset you shared would involve formulating guidelines that help translators accurately and effectively translate the original text into the target language. The goal is to preserve as much of the original meaning, context, and nuance as possible, while also ensuring the translation is accessible and understandable to the target audience.

Here's an example of what such directives might look like, using the data you provided for 'MRK 1:5!4':

Translator's Tips for MRK 1:5!4
Lexical Features: The word 'αὐτὸν' is a pronoun meaning 'him.' Be aware that it can have different meanings in different contexts. The syntactic context is potentially needed to clarify the functional significance of its case. Make sure your translation reflects the pronoun’s masculine gender, and singular number appropriately for your target language.

Syntactic Context and Function: 'αὐτὸν' plays the role of 'Direction' in the sentence, which is indicated in part by its accusative case, which construes a very general relation of some kind. In addition, the word is affected by its participation in the sentence, as indicated by the use of a lexeme that is always in the middle voice. Also, the word is part of a circumstance (informational or spatio-temporal environment) and is reiterated, which means it refers to "the same" experience again, expressed by the intensive pronoun αὐτός. Ensure the translation reflects the complex syntactic role of this word.

Discourse Context: 'αὐτὸν' is used in a main clause. Make sure the translation reflects the fact that this word is not part of a subordinate clause or embedded speech.

Social Context: The word appears in the passage 'The Ministry of John the Baptist,' a forewarning/private discussion situation. Pay attention to the overall tone and context of the passage while translating. The passage is neither procedural nor flexible in terms of the formality of its language, and institutional norms and expectations play a key role.

Cultural/Encyclopedic Knowledge: The word 'αὐτὸν' falls under the semantic domain of 'Whom or What Spoken or Written About.' While this is a simple pronoun and is not likely a source of cross-cultural confusion, it is important to recognize the word is an intensive pronoun (often glossed 'self'), in which sense it differs from pronouns in many languages. Be sure to translate this word based on its functional significance, not strictly its decontextualized lexical meaning.

Remember, each of these points should guide your translation, but they are not hard and fast rules. The ultimate goal is to create a translation that is accurate, understandable, and meaningful to the target audience.
"""

def generate_translation_directives(context, temperature=0.7):
    messages = [
        {
            "role": "system",
            "content": (
                "You are an experienced translation consultant, capable"
                " at instructing translators on how to accurately and effectively"
                " translate the original text into the target language.\n"
                "Here is some linguistic data: {context}\n\nI would like to generate source-language translation"
                " directives using this linguistic data, similar to  SIL's 'Translator's Notes' resource."
                " Please avoid generic comments about 'taking x or y into account' and instead focus on the"
                " specific linguistic features of the"
                " text that translators need to be aware of in order to accurately translate the text into the"
                " target language. The goal is to preserve as much of the original meaning, context, and nuance"
                " as possible, while also ensuring the translation is accessible and understandable to the target audience."
            ).format(context=context)
        }
    ]
    
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        n=1,
        stop=None,
        temperature=temperature,
    )
    directives = response.choices[0].message.content.strip()
    return directives

        
# TODO: gather all non-repeated data points for each word in a verse

In [None]:
import gradio as gr

app = gr.Blocks(theme='bethecloud/storj_theme')

with app:
    gr.Markdown("## Gather context data")

    with gr.Column():
        with gr.Row():
            verse_ref_or_id_input = gr.Textbox(lines=1, label="Verse Reference", value="MRK 1:5!4")
            get_context_button = gr.Button("Get Linguistic Data for Context")
        context_output = gr.Textbox(lines=5, label="Context", format="markdown")
    get_context_button.click(get_contextual_data, inputs=verse_ref_or_id_input, outputs=context_output)

    gr.Markdown("## Summarize context data")

    with gr.Column():
        summary_temperature_slider = gr.Slider(minimum=0.1, maximum=1.0, default=0.7, step=0.1, label="Temperature")
        summarize_button = gr.Button("Summarize context")
        summary_output = gr.Textbox(label="Summary")
    summarize_button.click(ada_recursive_summary, inputs=[context_output, summary_temperature_slider], outputs=summary_output)

    gr.Markdown("## Answer question based on context data")

    with gr.Row():
        with gr.Column():
            context_selection_box = gr.CheckboxGroup(["Summarized", "Full"], label="Context Selection")
            question_input = gr.Textbox(lines=2, label="Question", value="What does this word refer to in this passage, who is involved in the passage, and what are the participants trying to accomplish?")
            answer_temperature_slider = gr.Slider(minimum=0.1, maximum=1.0, default=0.7, step=0.1, label="Temperature")
            answer_button = gr.Button("Answer question")
        answer_output = gr.Textbox(label="Answer")
    answer_button.click(
        generate_autocomplete,
        inputs=[question_input, summary_output, context_output, context_selection_box, answer_temperature_slider],
        outputs=answer_output,
    )
    
    # Add row for generating commentary from full output
    with gr.Column():
        commentary_temperature_slider = gr.Slider(minimum=0.1, maximum=1.0, default=0.7, step=0.1, label="Temperature")
        commentary_button = gr.Button("Generate Commentary")
    commentary_output = gr.Textbox(label="Commentary")
    commentary_button.click(generate_commentary_article_of_passage, inputs=[context_output, commentary_temperature_slider], outputs=commentary_output)
    
    # Add row for generating comprehension questions from full output
    with gr.Column():
        questions_temperature_slider = gr.Slider(minimum=0.1, maximum=1.0, default=0.7, step=0.1, label="Temperature")
        questions_button = gr.Button("Generate Comprehension Questions")
    questions_output = gr.Textbox(label="Comprehension Questions")
    questions_button.click(generate_comprehension_questions_from_context, inputs=[context_output, questions_temperature_slider], outputs=questions_output)
    
    # Add row for generating translation directives from full output
    with gr.Column():
        directives_temperature_slider = gr.Slider(minimum=0.1, maximum=1.0, default=0.7, step=0.1, label="Temperature")
        directives_button = gr.Button("Generate Translation Directives")
    directives_output = gr.Textbox(label="Translation Directives")
    directives_button.click(generate_translation_directives, inputs=[context_output, directives_temperature_slider], outputs=directives_output)
    
    # Add a placeholder row for generating simplified audio content
    # with gr.Column():
    #     audio_temperature_slider = gr.Slider(minimum=0.1, maximum=1.0, default=0.7, step=0.1, label="Temperature")
    #     audio_button = gr.Button("Generate Simplified Audio Content")
    # audio_output = gr.Textbox(label="Simplified Audio Content")
    
    
app.launch()




Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.




Getting treedown for ref MRK 1:5!4
MRK 1:5!4
no token_refs for situation 02-15
no token_refs for situation 02-16
['']
Getting treedown for ref MRK 1:5!8
MRK 1:5!8
no token_refs for situation 02-15
no token_refs for situation 02-16
['']
+++ Context:
 ## 1. Lexical features
This data is useful for identifying the meaning of a word across all of its various contexts.
- class: χώρα is a noun,
- gloss: meaning "region."
- lemma: The lemma form of this word is χώρα,
## 2. Syntactic context and function
This data is useful for identifying the meaning of a word in a specific sentential context.
- case: Grammatical case: nominative, genitive, dative, accusative, or vocative: nominative,
- gender: Grammatical gender values: feminine,
- number: Grammatical number: singular,
- type: Indicates different types of pronominals: common,
- Semantic configuration (useful for figuring out what is taking place in the sentence and how this word plays a role): This word is the Source in `[A0: χώρα] [ἐκπορεύο

In [None]:
# Break down prose generation into the various sub-categories

# (in each case, filter out any duplicate data points before generating prose)
# get lexical information for word or TFIDF-filtered lexical information for passage
# get syntactic information for word or passage
# get discourse information for word or passage
# get social information for word or passage
# get cultural information for word or passage

# get context for word (above)
# get context for verse
# get context for chapter
# get context for book
# get context for pericope

# identify the passage being queried about


# def get_lexical_information(word_id):
#     if not '!' in word_id:
#         word_data = mg.loc[word_id].to_dict()
#         word_ref = word_data['ref']
#     else:
#         word_ref = f'{word_id}'
#         word_data = mg.loc[mg['ref'] == word_ref].iloc[0].to_dict()
#         word_id = word_data['id']
    
#     lemma = word_data['lemma']
#     lexical_data = {
#         "class": None,
#         "gloss": None,
#         "lemma": lemma,
#         "morph": None,
#         "strong": None,
#         "subjref": None,
#         "referent": None,
#         "person": None,
#         "number": None,
#         "gender": None,
#         "case": None,
#         "tense": None,
#         "voice": None,
#         "mood": None,
#         "degree": None,
#         "type": None,
#         "ln": None,
#         "domain_label": None
#     }

#     for key in lexical_data.keys():
#         value = word_data.get(key)
#         if value not in (None, 'missing', 'nan'):
#             lexical_data[key] = value

#     return lexical_data


# def get_syntactic_information(word_id):
#     syntactic_data = {
#         "semantic_configuration": None,
#         "semantic_role_info": None,
#         "opentext_syntax_data": None,
#         "treedown_data": None
#     }

#     semantic_configuration = describe_semantic_configuration(word_id, semantic_role_data, output_template)
#     role_data_row = semantic_role_data.loc[semantic_role_data['xml:id'] == word_id].iloc[0]
#     semantic_role_info = {key: value for key, value in role_data_row.items() if not value == ''}
#     opentext_syntax_data = get_opentext_syntax_data(word_id)
#     treedown_data = get_treedown_by_ref(word_ref)

#     syntactic_data["semantic_configuration"] = semantic_configuration
#     syntactic_data["semantic_role_info"] = semantic_role_info
#     syntactic_data["opentext_syntax_data"] = opentext_syntax_data
#     syntactic_data["treedown_data"] = treedown_data

#     return syntactic_data


# def get_discourse_information(word_id):
#     discourse_data = {
#         "discourse_features": None,
#         "speaker_information": None
#     }

#     discourse_features = get_discourse_annotation_types(word_id)
#     speaker_information = get_speaker_quotation_data(word_ref)

#     discourse_data["discourse_features"] = discourse_features
#     discourse_data["speaker_information"] = speaker_information

#     return discourse_data


# def get_social_information(word_id):
#     social_data = {
#         "situation_data": None
#     }

#     lookup = situations_lookup_json
#     situation_data = get_situations_data(word_ref)

#     social_data["situation_data"] = situation_data

#     return social_data


# def get_cultural_information(word_id):
#     cultural_data = {
#         "lexical_notes": None
#     }

#     lexical_notes = get_lexical_notes(word_id)

#     cultural_data["lexical_notes"] = lexical_notes

#     return cultural_data


# def get_context_for_word(word_id, selected_fields=None):
#     lexical_data = get_lexical_information(word_id)
#     syntactic_data = get_syntactic_information(word_id)
#     discourse_data = get_discourse_information(word_id)
#     social_data = get_social_information(word_id)
#     cultural_data = get_cultural_information(word_id)

#     context_data = {
#         "1. Lexical features": ['This data is useful for identifying the meaning of a word across all of its various contexts.'],
#         "2. Syntactic context and function": ['This data is useful for identifying the meaning of a word in a specific sentential context.'],
#         "3. Discourse context": ['This data is useful for identifying the way a word contributes to a larger discourse context.'],
#         "4. Social context": ['This data is useful for giving top-down clarification about what is going on in a passage, which helps narrow down ambiguities in more low-level data.'],
#         "5. Cultural/encyclopedic knowledge": ['This data is useful for filling in background historical or cultural information that is not explicit or is assumed in the text.'],
#     }

#     context_data["1. Lexical features"].extend(get_lexical_features(lexical_data, selected_fields))
#     context_data["2. Syntactic context and function"].extend(get_syntactic_context(syntactic_data))
#     context_data["3. Discourse context"].extend(get_discourse_context(discourse_data))
#     context_data["4. Social context"].extend(get_social_context(social_data))
#     context_data["5. Cultural/encyclopedic knowledge"].extend(get_cultural_knowledge(cultural_data))

#     output_lines = []
#     for header, sentences in context_data.items():
#         if sentences:
#             output_lines.append(f"## {header}\n")
#             output_lines.append("\n".join(sentences))
#             output_lines.append("\n")

#     prosaic_context = "".join(output_lines)

#     return prosaic_context

# def get_lexical_features(lexical_data, selected_fields=None):
#     attribute_descriptions = {
#     "class": "This word belongs to the class",
#     "gloss": "The meaning of this word is",
#     "lemma": "The lemma form of this word is",
#     "morph": "This word is parsed as",
#     "strong": "This word has a Strong's number of",
#     "subjref": "This word is the subject referent of the verb",
#     "referent": "This word is the referent of the pronoun",
#     "person": "The person of this word is",
#     "number": "The number of this word is",
#     "gender": "The gender of this word is",
#     "case": "The case of this word is",
#     "tense": "The tense of this word is",
#     "voice": "The voice of this word is",
#     "mood": "The mood of this word is",
#     "degree": "The degree of this word is",
#     "type": "The type of this word is",
#     "ln": "The Louw-Nida domain of this word is",
#     "domain_label": "This word relates to the general subject matter of the lemma and is labeled as"
#     }
#     if not selected_fields:
#         selected_fields = list(attribute_descriptions.keys())

#     lexical_features = []
#     for key in selected_fields:
#         value = lexical_data[key]
#         if value not in (None, 'missing', 'nan'):
#             if key == "class":
#                 lexical_features.append(f"- {key}: {attribute_descriptions[key]} {value},")
#             elif key == "gloss":
#                 lexical_features.append(f"- {key}: {attribute_descriptions[key]} \"{value}.\"")
#             elif key == "lemma":
#                 lexical_features.append(f"- {key}: {attribute_descriptions[key]} {value},")
#             elif key == "morph":
#                 pass
#             elif key == "strong":
#                 pass
#             elif key == "subjref":
#                 subject_referent_text = get_word_text(value)
#                 subject_referent_gloss = get_word_gloss(value)
#                 subject_referent_lemma = get_word_lemma(value)
#                 lexical_features.append(f"- Subject referent: {attribute_descriptions[key]} {subject_referent_text} ({subject_referent_gloss}, lemma {subject_referent_lemma}) is the subject of the verb.")
#             elif key == "referent":
#                 referent_text = get_word_text(value)
#                 referent_gloss = get_word_gloss(value)
#                 referent_lemma = get_word_lemma(value)
#                 lexical_features.append(f"- Referent: {attribute_descriptions[key]} {referent_text} ({referent_gloss}, lemma {referent_lemma}) is the referent of the pronoun.")
#             elif key in ("person", "number", "gender", "case", "tense", "voice", "mood", "degree", "type"):
#                 lexical_features.append(f"- {key}: {attribute_descriptions[key]} {value},")
#             elif key in ("ln"):
#                 lexical_features.append(f"- {key}: {attribute_descriptions[key]} {value},")
#             elif key in ("domain_label"):
#                 domain_label = get_domain_label(value)
#                 lexical_features.append(f"- {key} ({attribute_descriptions[key]}): {domain_label},")
            
#     return lexical_features


# def get_syntactic_context(syntactic_data):
#     context_sentences = []
#     semantic_configuration = syntactic_data["semantic_configuration"]
#     semantic_role_info = syntactic_data["semantic_role_info"]
#     opentext_syntax_data = syntactic_data["opentext_syntax_data"]
#     treedown_data = syntactic_data["treedown_data"]

#     if semantic_configuration:
#         context_sentences.append("- Semantic configuration (useful for figuring out what is taking place in the sentence and how this word plays a role):")
#         context_sentences.append(f"  This word is the {semantic_role_info['semantic_role_label']} {'in `' + semantic_configuration + '`' if semantic_configuration else ''}, and it has the following data:")
#         for key, value in semantic_role_info.items():
#             if key != "semantic_role_label":
#                 context_sentences.append(f"  - {key}: {value}")

#     if opentext_syntax_data:
#         context_sentences.append("- OpenText syntax (useful for identifying all of the grammatical choices that led up to this word, such as whether it is part of a derived 'entity' definition or a nested 'turn' or a particular kind of speech act):")
#         for feature in opentext_syntax_data:
#             context_sentences.append(f"  - {feature['feature_name']}: {feature['feature_description']}")

#     if treedown_data:
#         context_sentences.append("- Treedown syntax: This word is part of the following sentence:")
#         context_sentences.append(treedown_data)

#     return context_sentences


# def get_discourse_context(discourse_data):
#     context_sentences = []
#     discourse_features = discourse_data["discourse_features"]
#     speaker_information = discourse_data["speaker_information"]

#     if discourse_features:
#         context_sentences.append(f"This word functions within {len(discourse_features)} discourse features (these are useful heuristic interpretive annotations that tell you about the nature of the proposition a word is in):")
#         for feature in discourse_features:
#             context_sentences.append(f"- {feature} is defined as {discourse_types[feature]['description']}")

#     if speaker_information:
#             context_sentences.append("Speaker data is critical to identifying quoted material and relating it to the proper speaker.")
#             if len(speaker_information) == 1:
#                 speaker = speaker_information[0]
#                 context_sentences.append(f"This word is spoken by {speaker['who_is_speaking']}")
#                 if speaker.get("Divinity") == "Y":
#                     context_sentences.append(f"- {speaker['who_is_speaking']} is a divinity")
#                 if speaker.get("Age"):
#                     context_sentences.append(f", age: {speaker['Age']}")
#                 if speaker.get("Comment"):
#                     context_sentences.append(f" ({speaker['Comment']})")
#                 speech = speaker.get("what_is_said_complete")
#                 if len(speech) > 100:
#                     speech = speaker.get("what_is_said_truncated")
#                 context_sentences.append(f", who says (in a {speaker['delivery_tone']} tone), \"{speech}\"")
                    
#             else:
#                 context_sentences.append(f"This word is spoken by {len(speaker_information)} speaker(s): ")
#                 for speaker in speaker_information:
#                     context_sentences.append(f"This word is spoken by {speaker['who_is_speaking']}")
#                     if speaker.get("Divinity") == "Y":
#                         context_sentences.append(f"- {speaker['who_is_speaking']} is a divinity")
#                     if speaker.get("Age"):
#                         context_sentences.append(f", {speaker['Age']} years old")
#                     if speaker.get("Comment"):
#                         context_sentences.append(f" ({speaker['Comment']})")
#                     speech = speaker.get("what_is_said_complete")
#                     if len(speech) > 100:
#                         speech = speaker.get("what_is_said_truncated")
#                     context_sentences.append(f", who says (in a {speaker['Delivery']} tone), \"{speech}\"")

#     return context_sentences

# def get_social_information(word_id):
#     social_data = {
#         "sociolinguistic_data": ["This data provides insights into the sociolinguistic aspects of the word, such as its usage in different dialects or social contexts."],
#         "register": ["This data indicates the register or formality level of the word, which can help determine its appropriate usage."],
#         "style": ["This data describes the style or literary usage of the word, which can provide additional nuances in its meaning or connotation."],
#         "socio_cultural_context": ["This data highlights the socio-cultural context associated with the word, including its usage in specific cultural or historical settings."],
#     }

#     word_data = mg.loc[word_id].to_dict()
#     selected_fields = ["sociolinguistic_data", "register", "style", "socio_cultural_context"]

#     for key in selected_fields:
#         value = word_data.get(key)
#         if value not in (None, 'missing', 'nan'):
#             social_data[key].append(f"- {attribute_descriptions[key]}: {value}")

#     context_sentences = []
#     for key, sentences in social_data.items():
#         if sentences:
#             context_sentences.append(f"## {key.capitalize()}\n")
#             context_sentences.extend(sentences)
#             context_sentences.append("\n")

#     return context_sentences




In [None]:


# def get_similar_docs(query_string):
#     return collection.search(query_string, search_type='similarity')

# def get_range_of_examples(query_string):
#     return collection.search(query_string, search_type='mmr')

# def get_prosaic_context_for_verse(verse_ref_string):
#     return get_context_for_verse(verse_ref_string)

# # prompt should be something like this:
# """
# (
#     "You are a language model trained in linguistics," 
#     " and you are great at summarizing structured data while"
#     " focusing on linguistic features without delving into theological issues."
#     " Please provide a concise textual commentary on the given {linguistic_data}"
#     " (use the treedown representation for the larger text context of the target word)"
#     " by examining its key lexical choices, syntactic structures, discourse organization,"
#     " social context, and cultural references. Avoid personal opinions and maintain objectivity."
#     " Illuminate the passage's nuances, foster clarity, and establish connections within the work,"
#     " empowering readers to grasp the author's intentions and the interplay between language and content."
#     " Please format your output using the following headings:"
#     " 1. Lexical features"
#     " 2. Syntactic context and function"
#     " 3. Discourse context"
#     " 4. Social context"
#     " 5. Cultural/encyclopedic knowledge"    
# ).format(linguistic_data=linguistic_data)
# """

# def answer_question_with_context(input_question):
#     # first, find the relevant bible verse source ids
#     similar_verses = get_similar_docs(input_question)
#     # second, get the prosaic context for the first verse (later this could be a multi-verse analysis...)
#     verse_refs = [verse['source'] for verse in similar_verses]
#     print('>>>>>>,' verse_refs)
#     prose = get_prosaic_context_for_verse(verse_refs[0])
#     return prose

SyntaxError: invalid syntax. Perhaps you forgot a comma? (1085990072.py, line 36)

In [None]:
# from langchain.chains.base import Chain
# from langchain.retrievers.tfidf import TFIDFRetriever
# from langchain.schema import Document
# from langchain.callbacks.manager import (
#     AsyncCallbackManagerForChainRun,
#     CallbackManagerForChainRun,
# )

# class RetrieverWithContextChain(Chain):
#     retriever: TFIDFRetriever
#     context_provider: MyCustomChain
#     output_key: str = "text"

#     @property
#     def input_keys(self) -> List[str]:
#         return ["query"]

#     @property
#     def output_keys(self) -> List[str]:
#         return [self.output_key]

#     def _call(
#         self,
#         inputs: Dict[str, Any],
#         run_manager: Optional[CallbackManagerForChainRun] = None,
#     ) -> Dict[str, str]:
#         # Retrieve relevant documents based on the query
#         relevant_docs = self.retriever.get_relevant_documents(inputs["query"])
        
#         # Assume the first document is the most relevant one
#         most_relevant_doc = relevant_docs[0] if relevant_docs else Document()

#         # Provide context for the verse reference in the most relevant document
#         context_response = self.context_provider.run(
#             {"verse_reference": most_relevant_doc.metadata.get("source")},
#             callbacks=run_manager.get_child() if run_manager else None,
#         )

#         return {self.output_key: context_response.get("text", "")}

#     # TODO: define `_acall` method for asynchronous operation