<a href="https://colab.research.google.com/github/ryderwishart/biblical-machine-learning/blob/main/gpt-inferences/agentive_bible_querying.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# General Setup

In [9]:
import getpass
secret_key = getpass.getpass('Enter OpenAI secret key: ')
os.environ['OPENAI_API_KEY'] = secret_key

Enter OpenAI secret key: ··········


In [None]:
# !pip install langchain openai chromadb tiktoken unidecode

# Setup Bible Passage Retrieval Using MACULA Data

In [6]:
import os
os.environ["LANGCHAIN_TRACING"] = "false"

In [148]:
import requests, json, re, os
import pandas as pd

def download_file(url, file_name):
    response = requests.get(url)
    with open(file_name, "wb") as file:
        file.write(response.content)

# file1_url = "https://raw.githubusercontent.com/Clear-Bible/macula-greek/main/Nestle1904/TSV/macula-greek.tsv"
file1_url = "https://github.com/Clear-Bible/macula-greek/raw/feature/add-sentence-id-to-tsv/Nestle1904/TSV/macula-greek.tsv" # PR version with sentence IDs
file1_name = "macula-greek.tsv"

# file1_url = "https://raw.githubusercontent.com/Clear-Bible/macula-greek/main/Nestle1904/TSV/macula-greek.tsv"
file2_url = "https://raw.githubusercontent.com/Clear-Bible/macula-greek/main/sources/MARBLE/SDBG/marble-domain-label-mapping.json"
file2_name = "marble-domain-label-mapping.json"

if file1_name not in os.listdir():
    download_file(file1_url, file1_name)

if file2_name not in os.listdir():
    download_file(file2_url, file2_name)


In [149]:
# Import Macula Greek data
mg = pd.read_csv(
    "macula-greek.tsv", index_col="xml:id", sep="\t", header=0, converters={"*": str}
).fillna("missing")
# add an 'id' column
mg["id"] = mg.index

# mg['domain'] = mg['domain'].astype(str).fillna('missing')

# Extract book, chapter, and verse into separate columns
mg[["book", "chapter", "verse"]] = mg["ref"].str.extract(r"(\d?[A-Z]+)\s(\d+):(\d+)")

# Add columns for book + chapter, and book + chapter + verse for easier grouping
mg["book_chapter"] = mg["book"] + " " + mg["chapter"].astype(str)
mg["book_chapter_verse"] = mg["book_chapter"] + ":" + mg["verse"].astype(str)



In [14]:
mg.head()

Unnamed: 0_level_0,sentence,ref,role,class,type,gloss,text,after,lemma,normalized,...,ln,frame,subjref,referent,id,book,chapter,verse,book_chapter,book_chapter_verse
xml:id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n40001001001,n40001001001_n40001001008,MAT 1:1!1,missing,noun,common,[The] book,Βίβλος,,βίβλος,Βίβλος,...,33.38,missing,missing,missing,n40001001001,MAT,1,1,MAT 1,MAT 1:1
n40001001002,n40001001001_n40001001008,MAT 1:1!2,missing,noun,common,of [the] genealogy,γενέσεως,,γένεσις,γενέσεως,...,10.24 33.19,missing,missing,missing,n40001001002,MAT,1,1,MAT 1,MAT 1:1
n40001001003,n40001001001_n40001001008,MAT 1:1!3,missing,noun,proper,of Jesus,Ἰησοῦ,,Ἰησοῦς,Ἰησοῦ,...,93.169a,missing,missing,missing,n40001001003,MAT,1,1,MAT 1,MAT 1:1
n40001001004,n40001001001_n40001001008,MAT 1:1!4,missing,noun,proper,Christ,Χριστοῦ,,Χριστός,Χριστοῦ,...,93.387,missing,missing,missing,n40001001004,MAT,1,1,MAT 1,MAT 1:1
n40001001005,n40001001001_n40001001008,MAT 1:1!5,missing,noun,common,son,υἱοῦ,,υἱός,υἱοῦ,...,10.30,missing,missing,missing,n40001001005,MAT,1,1,MAT 1,MAT 1:1


In [150]:


# Import domain-label mapping
# Open the JSON file
with open("marble-domain-label-mapping.json", "r") as f:
    # Load the contents of the file as a dictionary
    domain_labels = json.load(f)

domain_labels["missing"] = "no domain"
domain_labels["nan"] = "no domain"

# Use domain labels to create a new column


def get_domain_label(domain_string_number):
    labels = [domain_labels[label] for label in domain_string_number.split(" ")]
    return labels


mg["domain_label"] = mg["domain"].apply(get_domain_label)
mg.head()


# Create a dataframe from the SpeakerProjections-clear.json file
with open("SpeakerProjections-clear.json", "r") as f:
    speaker_projections = json.load(f)
    speaker_data = pd.DataFrame(speaker_projections)

# Create a dataframe from the character_detail.semantic_data.json file
with open("character_detail.semantic_data.json", "r") as f:
    # this data is an array of JSON objects
    character_detail = json.load(f)
    character_data = pd.DataFrame(character_detail)

# Transpose the DataFrame
transposed_speaker_data = speaker_data.transpose()

# Reset the index
transposed_speaker_data.reset_index(inplace=True)

# Rename the columns
transposed_speaker_data.columns = ["row_id", "instance_data", "projections"]

# Normalize the 'instance_data' column
flattened_instance_data = pd.json_normalize(transposed_speaker_data["instance_data"])

# Merge the normalized DataFrame with the original transposed DataFrame
merged_speaker_data = pd.concat(
    [transposed_speaker_data.drop(columns=["instance_data"]), flattened_instance_data],
    axis=1,
)

# Create an empty DataFrame to store the result
expanded_speaker_data = pd.DataFrame()

# Iterate through the rows in the merged_speaker_data DataFrame
for idx, row in merged_speaker_data.iterrows():
    projections = row["projections"]

    # Iterate through the projections
    for proj_idx, projection in enumerate(projections):
        # Create a new row with the inherited speaker instance data
        new_row = row.drop("projections").to_dict()
        new_row.update(projection)

        # Set the row ID with the projection index
        new_row["row_id"] = f"{row['row_id']}|{proj_idx}"

        # Append the new row to the expanded_speaker_data DataFrame
        expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)

# Add 'tokens' and 'token_ids' columns with default empty lists
expanded_speaker_data = expanded_speaker_data.assign(
    tokens=[[]] * len(expanded_speaker_data),
    token_ids=[[]] * len(expanded_speaker_data),
)

# Iterate through the rows in the expanded_speaker_data DataFrame
for idx, row in expanded_speaker_data.iterrows():
    words = row["Words"]

    # Extract the 'Text' and 'Id' values from the 'Words' data
    tokens = [word["Text"] for word in words]
    token_ids = [word["Id"] for word in words]

    # Assign the 'tokens' and 'token_ids' fields to the row
    expanded_speaker_data.at[idx, "tokens"] = tokens
    expanded_speaker_data.at[idx, "token_ids"] = token_ids


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_dat

In [40]:
### SENTENCE-BASED TEXTS

import pandas as pd
import numpy as np

# Initialize lists
text_list = []
dict_list = []
id_list = []

# Group the DataFrame by 'sentence'
# grouped = mg.groupby('sentence')
grouped = mg.groupby('book_chapter_verse')

for name, group in grouped:
    # Combine the 'text' and 'after' fields into a single string for each group
    greek_text = ''.join(group['text'] + group['after'].replace(np.nan, ' ', regex=True))
    sentence_gloss = ''.join(group['gloss'].replace(np.nan, '[no gloss]', regex=True) + group['after'].replace(np.nan, '', regex=True) + ' ')
    text_list.append(sentence_gloss)
    
    # Extract book, chapter, and verse from the group
    book = group['book'].values[0]
    chapter = group['book_chapter'].str.split().str[1].values[0]
    verse = group['book_chapter_verse'].str.split(':').str[1].values[0]
    b_c_v = group['book_chapter_verse'].values[0]
    # id_entry = group['sentence'].values[0]
    id_entry = '|'.join(group['id'].tolist())
    
    
    # dict_entry = {'book': book, 'chapter': chapter, 'verse': verse, 'greek': greek_text}
    dict_entry = {'source': b_c_v, 'book': book, 'chapter': chapter, 'verse': verse, 'ids': id_entry, 'greek': greek_text}
    dict_list.append(dict_entry)
    
    # Use the 'xml:id' field to create an ID for the verse
    id_entry = group['id'].tolist()
    id_list.append(id_entry)

# Print the lists for testing
print(text_list[:5])
print(dict_list[:5])
print(id_list[:5])

['Not  I want  for  you  to be ignorant, brothers, that  the  fathers  of us  all  under  the  cloud  were  and  all  through  the  sea  passed, ', 'Neither  are you to grumble, as  some  of them  grumbled, and  perished  by  the  Destroyer. ', 'These things  now  [as] types  happened  to them, were written  then  for  admonition  of us, to  whom  the  ends  of the  ages  are arrived. ', 'Therefore  the [one]  thinking  to stand  let him take heed  lest  he fall, ', 'Temptation  you  not  has seized  if  not  what is common to man· faithful  now  -  [is] God, who  not  will allow  you  to be tempted  beyond  what  you are able, but  will provide  with  the  temptation  also  the  escape  -  to be able  to endure [it]. ']
[{'source': '1CO 10:1', 'book': '1CO', 'chapter': '10', 'verse': '1', 'ids': 'n46010001001|n46010001002|n46010001003|n46010001004|n46010001005|n46010001006|n46010001007|n46010001008|n46010001009|n46010001010|n46010001011|n46010001012|n46010001013|n46010001014|n46010001

In [28]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain
from langchain.document_loaders import TextLoader
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType
from pathlib import Path

# Load Language Model
llm = OpenAI(temperature=0)

In [41]:
# Create embeddings and store in a vectorstore
embeddings = OpenAIEmbeddings()
collection = Chroma("bible-qa", embeddings)



In [None]:
# Add greek texts with metadata
collection.add_texts(
    texts=text_list,
    metadatas=dict_list,
    ids=['|'.join(id) for id in id_list],
    
    )

In [44]:
# Inspect some texts 
print('MAT 1:1 -->', collection.search('MAT 1:1', search_type='similarity'))
print('blind -->', collection.search('blind', search_type='similarity'))
# TODO: sort out metadata filtering
print('blind with filters -->', collection.search('blind in 1PE', search_type='similarity'))

MAT 1:1 --> [Document(page_content='Jacob  then  begat  -  Joseph  the  husband  of Mary, out of  whom  was born  Jesus  the [One]  being called  Christ. ', metadata={'source': 'MAT 1:16', 'book': 'MAT', 'chapter': '1', 'verse': '16', 'ids': 'n40001016001|n40001016002|n40001016003|n40001016004|n40001016005|n40001016006|n40001016007|n40001016008|n40001016009|n40001016010|n40001016011|n40001016012|n40001016013|n40001016014|n40001016015', 'greek': 'Ἰακὼβ δὲ ἐγέννησεν τὸν Ἰωσὴφ τὸν ἄνδρα Μαρίας,ἐξ ἧς ἐγεννήθη Ἰησοῦς ὁ λεγόμενος Χριστός.'}), Document(page_content='and  behold  you will conceive  in  womb  and  will bring forth  a son, and  you shall call  the  name  of Him  Jesus. ', metadata={'source': 'LUK 1:31', 'book': 'LUK', 'chapter': '1', 'verse': '31', 'ids': 'n42001031001|n42001031002|n42001031003|n42001031004|n42001031005|n42001031006|n42001031007|n42001031008|n42001031009|n42001031010|n42001031011|n42001031012|n42001031013|n42001031014', 'greek': 'καὶ ἰδοὺ συλλήμψῃ ἐν γαστρὶ καὶ 

In [None]:
# TODO: create some filter information for retriever

# from langchain.llms import OpenAI
# from langchain.retrievers.self_query.base import SelfQueryRetriever
# from langchain.chains.query_constructor.base import AttributeInfo

# metadata_field_info=[
#     AttributeInfo(
#         name="genre",
#         description="The genre of the movie", 
#         type="string or list[string]", 
#     ),
#     AttributeInfo(
#         name="year",
#         description="The year the movie was released", 
#         type="integer", 
#     ),
#     AttributeInfo(
#         name="director",
#         description="The name of the movie director", 
#         type="string", 
#     ),
#     AttributeInfo(
#         name="rating",
#         description="A 1-10 rating for the movie",
#         type="float"
#     ),
# ]
# document_content_description = "Brief summary of a movie"
# llm = OpenAI(temperature=0)
# retriever = SelfQueryRetriever.from_llm(llm, vectorstore, document_content_description, metadata_field_info, verbose=True)

# Setup Context Retrieval Functions

These functions are designed to accept either a token ID, or a Bible passage reference, and return expanded context data in natural language.

In [153]:
attribute_descriptions = {
    "after": "Encodes the following character, including a blank space.",
    "articular": "'true' if the word has an article (i.e., modified by the word 'the').",
    "case": "Grammatical case: nominative, genitive, dative, accusative, or vocative",
    "class": "On words, the class is the word's part of speech",
    "cltype": "Explicitly marks Verbless Clauses, Verb Elided Clauses, and Minor Clauses",
    "degree": "A derivative lexical category, indicating the degree of the adjective",
    "discontinuous": "'true' if the word is discontinuous with respect to sentence order due to reordering in the syntax tree",
    "domain": "Semantic domain information from the Semantic Dictionary of Biblical Greek (SDBG)",
    "frame": "Frames of verbs, refers to the arguments of the verb",
    "gender": "Grammatical gender values",
    "gloss": "SIL data, not Berean",
    "lemma": "Form of the word as it appears in a dictionary.",
    "ln": "The semantic domain entry in Louw and Nida's, 'Greek-English Lexicon of the New Testament: Based on Semantic Domains'.",
    "mood": "Grammatical mood",
    "morph": "Morphological parsing codes",
    "normalized": "The normalized form of the token (i.e., no trailing or leading punctuation or accent shifting depending on context)",
    "number": "Grammatical number",
    "person": "Grammatical person",
    "ref": "Verse!word reference to this edition of the Nestle1904 text by USFM id",
    "referent": "The xml:id of the node to which a pronoun (i.e., 'he') refers. Note that some of these IDs are not word IDs but rather phrase or clause IDs.",
    "role": "The clause-level role of the word.",
    "strong": "Strong's number for the lemma",
    "subjref": "The xml:id of the node that is the implied subject of a verb (for verbs without an explicit subject). Note that some of these IDs are not word IDs but rather phrase or clause IDs.",
    "tense": "Grammatical tense form",
    "text": "Text content associated with the ID",
    "type": "Indicates different types of pronominals",
    "voice": "Grammatical voice",
    "xml:id": "XML ids occur on every word and encode the corpus ('n' for New Testament), the book (40 for Matthew), the chapter (001), verse (001), and word (001)."
}

discourse_types = {
    'Main clauses': {'description': 'Main clauses are the top-level clauses in a sentence. They are the clauses that are not embedded in other clauses.'},
    'Historical Perfect': {'description': 'Highlights not the speech or act to which it refers but the event(s) that follow (DFNTG §12.2).'},
    'Specific Circumstance': {'description': 'The function of ἐγενετο ‘it came about’ and an immediately following temporal expression varies with the author (see DFNTG §10.3). In Matthew’s Gospel, it usually marks major divisions in the book (e.g. Mt 7:28). In Luke-Acts, in contrast, ‘it picks out from the general background the specific circumstance for the foreground events that are to follow’ (ibid.), as in Acts 9:37 (see also Mt 9:10).'},
    'Verb Focus+': {'description': 'Verb in final position in clause demonstrates verb focus.'},
    'Articular Pronoun': {'description': 'Articular pronoun, which often introduces an ‘intermediate step’ in a reported conversation.'},
    'Topical Genitive': {'description': 'A genitival constituent that is nominal is preposed within the noun phrase for two purposes: 1) to bring it into focus; 2) within a point of departure, to indicate that it is the genitive in particular which relates to a corresponding constituent of the context.(DFNTG §4.5)'},
    'Embedded DFE': {'description': "'Dominant focal elements' embedded within a constituent in P1."},
    'Reported Speech': {'description': 'Reported speech.'},
    'Ambiguous': {'description': 'Marked but ambiguous constituent order.'},
    'Over-encoding': {'description': 'Any instance in which more encoding than the default is employed to refer to an active participant or prop. Over-encoding is used in Greek, as in other languages: to mark the beginning of a narrative unit (e.g. Mt 4:5); and to highlight the action or speech concerned (e.g. Mt 4:7).'},
    'Highlighter': {'description': 'Presentatives - Interjections such as ἰδού and ἴδε ‘look!, see!’ typically highlight what immediately follows (Narr §5.4.2, NonNarr §7.7.3).'},
    'Referential PoD': {'description': 'Pre-verbal topical subject other referential point of departure (NARR §3.1, NonNarr §4.3, DFNTG §§2.2, 2.8; as in 1 Th 1:6).'},
    'annotations': {'description': 'Inline annotations.'},
    'Left-Dislocation': {'description': 'Point of departure - A type of SENTENCE in which one of the CONSTITUENTS appears in INITIAL position and its CANONICAL position is filled by a PRONOUN or a full LEXICAL NOUN PHRASE with the same REFERENCE, e.g. John, I like him/the old chap.”'},
    'Focus+': {'description': 'Constituents placed in P2 to give them focal prominence.'},
    'Tail-Head linkage': {'description': 'Point of departure involving renewal - Tail-head linkage involves “the repetition in a subordinate clause, at the beginning (the ‘head’) of a new sentence, of at least the main verb of the previous sentence (the tail)” (Dooley & Levinsohn 2001:16).'},
    'Postposed them subject': {'description': 'When a subject is postposed to the end of its clause (following nominals or adjuncts), it is marked ThS+ (e.g. Lk 1:41 [twice]). Such postposing typically marks as salient the participant who performs the next event in chronological sequence in the story (see Levinsohn 2014).'},
    'EmbeddedRepSpeech': {'description': 'Embedded reported speech - speech that is reported within a reported speech.'},
    'Futuristic Present': {'description': 'Highlights not the speech or act to which it refers but the event(s) that follow (DFNTG §12.2).'},
    'OT quotes': {'description': 'Old Testament quotations.'},
    'Constituent Negation': {'description': 'Negative pro-forms when they are in P2 indicate that the constituent has been negated rather than the clause as a whole.'},
    'Split Focal': {'description': 'The second part of a focal constituent with only the first part in P2 (NonNarr §5.5, DFNTG §4.4).'},
    'Right-Dislocated': {'description': 'Point of departure - A type of SENTENCE in which one of the CONSTITUENTS appears in FINAL position and its CANONICAL position is filled by a PRONOUN with the same REFERENCE, e.g. ... He’s always late, that chap.'},
    'Appositive': {'description': 'Appositive'},
    'Situational PoD': {'description': 'Situational point of departure (e.g. temporal, spatial, conditional―(NARR §3.1, NonNarr §4.3, DFNTG §§2.2, 2.8; as in 1 Th 3:4).'},
    'Historical Present': {'description': 'Highlights not the speech or act to which it refers but the event(s) that follow (DFNTG §12.2).'},
    'Noun Incorporation': {'description': 'Some nominal objects that appear to be in P2 may precede their verb because they have been “incorporated” (Rosen 1989) in the verb phrase. Typically, the phrase consists of an indefinite noun and a “light verb” such as “do, give, have, make, take” (Wikipedia entry on Light Verbs).'},
    'Thematic Prominence': {'description': 'Thematic prominence - In Greek, prominence is given to active participants and props who are the current centre of attention (NARR §4.6) by omitting the article (DFNTG §§9.2.3-9.4), by adding αυτος ‘-self’ (e.g. in 1 Th 3:11), by using the proximal demonstrative οὗτος (NARR chap. 9, Appendix 1; e.g. in 3:3), and by postposing the constituent concerned (e.g. Mt 14:29). If such constituents are NOT in postion P1, they are demonstrating topical prominence.'},
    'Cataphoric Focus': {'description': 'An expression that points forward to and highlights something which ‘is about to be expressed.’'},
    'Cataphoric referent': {'description': 'The clause or sentence to which a cataphoric reference refers when NOT introduced with ὅτι or ἵνα.'},
    'DFE': {'description': 'Constituents that may be moved from their default position to the end of a proposition to give them focal prominence include verbs, pronominals and objects that follow adjuncts (NonNarr §5.3, DFNTG §3.5). Such constituents, also called ‘dominant focal elements’or DFEs (Heimedinger 1999:167).'},
    'Embedded Focus+': {'description': 'A constituent of a phrase or embedded clause preposed for focal prominence.'}
}

In [154]:
ENDPOINT = 'https://macula-atlas-api-qa-25c5xl4maa-uk.a.run.app/graphql/'
headers = {"Content-Type": "application/json"}

## Syntax
- [x] Lowfat analysis
  - [x] Perhaps the lowfat data should be displayed using brackets instead of treedown?
- [x] We want all of the semantic configurations listed... this could be something like Rick's display, or something else, but for now it will function as a partial inventory of semantic configurations reminiscent of thematic formations `[x] [did] [y]` 
- [x] OpenText morphosyntax values (note: currently just all opentext values, not limited to morphosyntax)

In [None]:
# def get_syntactic_information(word_id):
#     # Retrieve and return syntactic information for the given word
#     word_data = mg.loc[word_id].to_dict()
#     word_ref = word_data['ref']
#     lemma = word_data['lemma']
#     text = word_data['text']
#     output = f"Syntactic information for {lemma}:\n"
#     for key, value in word_data.items():
#         if value not in (None, 'missing', 'nan'):
#             if key == "subjref":
#                 # The id of the implied subject of the verb  - need to retrieve this from the macula greek 'mg' dataframe
#                 subject_referent_data = mg.loc[value].to_dict()
#                 # get the 'text' 'gloss' 'lemma' and semantic role label
#                 subject_referent_text = subject_referent_data['text']
#                 subject_referent_gloss = subject_referent_data['gloss']
#                 subject_referent_lemma = subject_referent_data['lemma']
#                 # subject_referent_semantic_role_label, complete_wordings = semantic_role_data.loc[semantic_role_data['xml:id'] == value].iloc[0]['semantic_role_label'], get_complete_wordings_for_role(value)
#                 # for wording in complete_wordings:
#                 #     complete_wordings_array = []
#                 #     if wording['forms'] != subject_referent_text:
#                 #         complete_wordings_array.append(' '.join(wording['forms']))
#                 #         break
#                 #     else:
#                 #         complete_wordings = None
#                 # complete_wordings_string = ', '.join(complete_wordings_array) if complete_wordings_array else None
                
#                 output += f"- Subject referent: {attribute_descriptions[key]}: {subject_referent_text} ({subject_referent_gloss}, lemma {subject_referent_lemma}) is the subject of the verb.\n" # and it plays the role of {subject_referent_semantic_role_label}")
#                 # if complete_wordings_string:
#                 #     output += f" (complete wording: {complete_wordings_string})\n"
#                 # output += f" in its semantic configuration.\n"
#             elif key == "referent":
#                 # The id of the (usually pronominal) referent - need to retrieve this from the macula greek 'mg' dataframe
#                 referent_data = mg.loc[value].to_dict()
#                 # get the 'text' 'gloss' 'lemma' and semantic role label
#                 referent_text = referent_data['text']
#                 referent_gloss = referent_data['gloss']
#                 referent_lemma = referent_data['lemma']
#                 # referent_semantic_role_label, complete_wordings = semantic_role_data.loc[semantic_role_data['xml:id'] == value].iloc[0]['semantic_role_label'], get_complete_wordings_for_role(value)
#                 # for wording in complete_wordings:
#                 #     complete_wordings_array = []
#                 #     if wording['forms'] != referent_text:
#                 #         complete_wordings_array.append(' '.join(wording['forms']))
#                 #         break
#                 #     else:
#                 #         complete_wordings = None
#                 # complete_wordings_string = ', '.join(complete_wordings_array) if complete_wordings_array else None
                
#                 output += f"- Referent: {attribute_descriptions[key]}: {referent_text} ({referent_gloss}, lemma {referent_lemma}) is the referent of the pronoun.\n" # playing the role of {referent_semantic_role_label}\n"
#                 # if complete_wordings_string:
#                 #     output += f" (complete wording: {complete_wordings_string})\n"
#                 # output += f" in its semantic configuration.\n"
#             elif key in ("person", "number", "gender", "case", "tense", "voice", "mood", "degree", "type"):
#                 output += f"- {key}: {attribute_descriptions[key]}: {value},\n"
     
#     # Add semantic role data to syntactic context: 'this word is the {semantic_role} in the configuration {semantic_configuration}' 
#     semantic_configuration = describe_semantic_configuration(word_id, semantic_role_data, output_template) # output_template was defined above
#     # Find the matching row from semantic_role_data, and get all of the labels plus the column names
#     role_data_row = semantic_role_data.loc[semantic_role_data['xml:id'] == word_id].iloc[0]
#     semantic_role_info = {key: value for key, value in role_data_row.items() if not value == ''}
#     if semantic_role_info.get('semantic_role_label'):
#         semantic_role = semantic_role_info['semantic_role_label']    
#         output += f"- Semantic configuration (useful for figuring out what is taking place in the sentence and how this word plays a role): This word is the {semantic_role} {'in `' + semantic_configuration + '`' if semantic_configuration else ''}, and it has the following data: {semantic_role_info}\n"
#     # TODO: need to exploit the roles.json file in order to get all related wordings for each frame. 
    
    
#     # Add opentext syntax data to syntactic context
#     # opentext_syntax_data = get_opentext_syntax_data(word_id)
#     # if opentext_syntax_data:
#     #     output += f"- OpenText syntax (useful for identifying all of the grammatical choices that led up to this word, such as whether it is part of a derived 'entity' definition or a nested 'turn' or a particular kind of speech act):\n  This word has the following syntactic selection features:\n"
#     #     for feature in opentext_syntax_data:
#     #         output += f"  - {feature['feature_name']}: {feature['feature_description']}\n"
    
#     # Add treedown syntax data
#     treedown_data = get_treedown_by_ref(word_ref)
#     if treedown_data:
#         output += f"- Treedown syntax: This word is part of the following sentence:\n{treedown_data}\n"
    
#     return output

# # def get_syntactic_information_for_verse(verse_ref):
    


In [68]:
# 1. Get treedown sentence context using word ref...

from lxml import etree

# Get the plain treedown representation for a token's sentence

# example endpoint: "https://labs.clear.bible/symphony-dev/api/GNT/Nestle1904/lowfat?usfm-ref=JHN%2014:1" - JHN 14:1

def process_element(element, usfm_ref, indent=0, brackets=False):
    if brackets:
        indent = 0
    treedown_str = ""
    open_bracket = "[" if brackets else ""
    close_bracket = "] " if brackets else ""

    if element.get("class") == "cl":
        treedown_str += "\n" + open_bracket + ("  " * indent)

    if element.get("role"):
        role = element.attrib["role"]
        if role == "adv":
            role = "+"
        if not brackets:
            treedown_str += "\n"
        treedown_str += open_bracket + ("  " * indent) + role + ": "

    # # bold the matching token using usfm ref # NOTE: not applicable, since I think you have to use a USFM ref without the word
    # if element.tag == "w" and element.get("ref") == usfm_ref:
    #     treedown_str += "**" + element.text + "**"
    #     treedown_str += element.attrib.get("after", "") + close_bracket

    if element.tag == "w" and element.text:
        treedown_str += element.attrib.get("gloss", "") + element.attrib.get("after", "") + f"({element.text})"
        treedown_str += close_bracket

    for child in element:
        treedown_str += process_element(child, usfm_ref, indent + 1, brackets)

    return treedown_str

def get_treedown_by_ref(usfm_ref, brackets=False):
    usfm_passage = usfm_ref.split("!")[0]
    endpoint = (
        "https://labs.clear.bible/symphony-dev/api/GNT/Nestle1904/lowfat?usfm-ref="
        + usfm_passage
    )

    text_response = requests.get(endpoint).text
    xml = etree.fromstring(text_response.encode("utf-8"))

    treedown = process_element(xml, usfm_passage, brackets=brackets)
    return treedown


# test

print(get_treedown_by_ref("MAT 3:14", brackets=True))

# TODO: allow passing in bracket notation

# # Add treedown syntax data
# treedown_data = get_treedown_by_ref(word_ref)
# if treedown_data:
#     # output += f"- Treedown syntax: This word is part of the following sentence:\n{treedown_data}\n"
    

But (δὲ)] 
[[s: - (ὁ)] [v: was hindering (διεκώλυεν)] [o: Him (αὐτὸν)] 
[[+: [v: saying (λέγων)] [o: 
[[s: I (Ἐγὼ)] [o: need (χρείαν)] [v: have (ἔχω)] 
[[+: [+: by (ὑπὸ)] You (σοῦ)] [v: to be baptized,(βαπτισθῆναι)] and (καὶ)] 
[[s: You (σὺ)] [v: come (ἔρχῃ)] [+: to (πρός)] me;(με)] 


In [83]:
# 2. Get all semantic configurations in the sentence scope

file_3_url = 'https://raw.githubusercontent.com/Clear-Bible/speaker-quotations/main/json/SpeakerProjections-clear.json'
file_4_url = 'https://raw.githubusercontent.com/Clear-Bible/speaker-quotations/main/json/character_detail.semantic_data.json'
file_3_name = 'SpeakerProjections-clear.json'
file_4_name = 'character_detail.semantic_data.json' # stores info about each unique character id (unique string value)
file_5_url = 'https://raw.githubusercontent.com/ryderwishart/biblical-machine-learning/main/gpt-inferences/word_level_semantic_data.tsv'
file_5_name = 'word_level_semantic_data.tsv'
file_6_url = 'https://raw.githubusercontent.com/ryderwishart/biblical-machine-learning/main/gpt-inferences/roles.json'
file_6_name = 'roles.json'

if file_3_name not in os.listdir():
    download_file(file_3_url, file_3_name)

if file_4_name not in os.listdir():
    download_file(file_4_url, file_4_name)

if file_5_name not in os.listdir():
    download_file(file_5_url, file_5_name)

if file_6_name not in os.listdir():
    download_file(file_6_url, file_6_name)
    
# Create a dataframe from the SpeakerProjections-clear.json file
with open('SpeakerProjections-clear.json', 'r') as f:
    speaker_projections = json.load(f)
    speaker_data = pd.DataFrame(speaker_projections)
    
# Create a dataframe from the character_detail.semantic_data.json file
with open('character_detail.semantic_data.json', 'r') as f:
    # this data is an array of JSON objects
    character_detail = json.load(f)
    character_data = pd.DataFrame(character_detail)


# Transpose the DataFrame
transposed_speaker_data = speaker_data.transpose()

# Reset the index
transposed_speaker_data.reset_index(inplace=True)

# Rename the columns
transposed_speaker_data.columns = ["row_id", "instance_data", "projections"]

print(transposed_speaker_data.iloc[0].to_dict())

# Normalize the 'instance_data' column
flattened_instance_data = pd.json_normalize(transposed_speaker_data['instance_data'])

# Merge the normalized DataFrame with the original transposed DataFrame
merged_speaker_data = pd.concat([transposed_speaker_data.drop(columns=['instance_data']), flattened_instance_data], axis=1)

print(merged_speaker_data.iloc[0].to_dict())

# Create an empty DataFrame to store the result
expanded_speaker_data = pd.DataFrame()

# Iterate through the rows in the merged_speaker_data DataFrame
for idx, row in merged_speaker_data.iterrows():
    projections = row['projections']
    
    # Iterate through the projections
    for proj_idx, projection in enumerate(projections):
        # Create a new row with the inherited speaker instance data
        new_row = row.drop('projections').to_dict()
        new_row.update(projection)
        
        # Set the row ID with the projection index
        new_row['row_id'] = f"{row['row_id']}|{proj_idx}"
        
        # Append the new row to the expanded_speaker_data DataFrame
        expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)

# Add 'tokens' and 'token_ids' columns with default empty lists
expanded_speaker_data = expanded_speaker_data.assign(tokens=[[]]*len(expanded_speaker_data), token_ids=[[]]*len(expanded_speaker_data))

# Iterate through the rows in the expanded_speaker_data DataFrame
for idx, row in expanded_speaker_data.iterrows():
    words = row['Words']
    
    # Extract the 'Text' and 'Id' values from the 'Words' data
    tokens = [word['Text'] for word in words]
    token_ids = [word['Id'] for word in words]
    
    # Assign the 'tokens' and 'token_ids' fields to the row
    expanded_speaker_data.at[idx, 'tokens'] = tokens
    expanded_speaker_data.at[idx, 'token_ids'] = token_ids

# Create a dataframe from 'word_level_semantic_data.tsv'. First row is header
semantic_role_data = pd.read_csv('word_level_semantic_data.tsv', sep='\t', header=0)
semantic_role_data = semantic_role_data.fillna('')

# Read roles.json and load it into a Python object
with open("roles.json", "r") as f:
    roles_data = json.load(f)

def extract_data(node):
    node_data = {
        "node_category": node["node_category"],
        "node_head": node["node_head"],
        "field_name": node["field_name"],
        "field_type": node["field_type"],
        "forms": [],
        "lemmas": [],
        "glosses": [],
        "ids": []
    }

    for word_data in node["words"].values():
        node_data["forms"].append(word_data["word"])
        node_data["lemmas"].append(word_data["lemma"])
        node_data["glosses"].append(word_data["gloss"])
        node_data["ids"].append(word_data["word_identifier"])

    return node_data

processed_data = [extract_data(node) for node in roles_data]

semantic_role_wordings_lookup = pd.DataFrame(processed_data)

for i in range(0, len(semantic_role_wordings_lookup)):
    # if len(semantic_role_wordings_lookup.iloc[i]['ids']) > 1:
    #     semantic_role_wordings_lookup.iloc[i].to_dict()
    for j in semantic_role_wordings_lookup.iloc[i]['ids']:
        if j.startswith('n66020004033'):
            print(semantic_role_wordings_lookup.iloc[i].to_dict())


{'row_id': 'GEN 1:3|GEN 1:3|God', 'instance_data': {'StartVerse': 'GEN 1:3', 'EndVerse': 'GEN 1:3', 'Verses': ['GEN 1:3'], 'CharacterIds': ['God'], 'Alias': 'God (Yahweh)', 'QuoteType': 'Normal', 'DefaultCharacterId': 'God', 'Alternates': []}, 'projections': [{'StartWord': 'o010010030031', 'EndWord': 'o010010030041', 'ClearSubjectReferents': ['o010010030021'], 'ClearSubjectReferentLabels': ['God'], 'Depth': 2, 'StartVerse': 'GEN 1:3', 'EndVerse': 'GEN 1:3', 'Verses': ['GEN 1:3'], 'Words': [{'Id': 'o010010030031', 'Text': 'יְהִ֣י\u200e', 'Depth': 2}, {'Id': 'o010010030041', 'Text': 'א֑וֹר\u200e', 'Depth': 2}]}]}
{'row_id': 'GEN 1:3|GEN 1:3|God', 'projections': [{'StartWord': 'o010010030031', 'EndWord': 'o010010030041', 'ClearSubjectReferents': ['o010010030021'], 'ClearSubjectReferentLabels': ['God'], 'Depth': 2, 'StartVerse': 'GEN 1:3', 'EndVerse': 'GEN 1:3', 'Verses': ['GEN 1:3'], 'Words': [{'Id': 'o010010030031', 'Text': 'יְהִ֣י\u200e', 'Depth': 2}, {'Id': 'o010010030041', 'Text': 'א֑

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_dat

{'node_category': 'np', 'node_head': 'n66020004033', 'field_name': 'Role', 'field_type': 'Negation', 'forms': ['τὸ', 'θηρίον', 'οὐδὲ', 'τὴν', 'εἰκόνα', 'αὐτοῦ'], 'lemmas': ['ὁ', 'θηρίον', 'οὐδέ', 'ὁ', 'εἰκών', 'αὐτός'], 'glosses': ['', '', '', '', '', ''], 'ids': ['n66020004031', 'n66020004032', 'n66020004033', 'n66020004034', 'n66020004035', 'n66020004036']}


In [119]:
# one-liner to get the semantic role wordings for a given word id
def get_complete_wordings_for_role(word_id):
    return semantic_role_wordings_lookup[semantic_role_wordings_lookup['ids'].apply(lambda x: word_id in x)].to_dict('records')

# Use semantic_role_data to get the semantic role data for a token

import pandas as pd
import re

def describe_semantic_configuration(query_id, df, output_template="[{a0_string}] [{frame_verb_lemma}] {all_other_role_strings} / [{a0_role}: {a0_gloss}] [{frame_verb_role}: {frame_verb_gloss}] {all_other_role_gloss_strings}"):
    # Find the row with the query ID
    row = df.loc[df['xml:id'] == query_id].iloc[0]

    # Find the matching rows for the verb frame IDs, handling multiple frame verb rows
    frame_verb_ids = row['frame_verb_id'].split('|')
    
    # FIXME: If there are no frame verb IDs, we probably just need to find the syntactic clause containing this token and return the treedown
    
    
    frame_verb_rows = [df.loc[df['xml:id'] == frame_verb_id].iloc[0] for frame_verb_id in frame_verb_ids if not df.loc[df['xml:id'] == frame_verb_id].empty]

    outputs = []

    for frame_verb_row in frame_verb_rows:
        a0_string = ''
        frame_verb_lemma = ''
        all_other_role_strings = []
        a0_gloss = ''
        frame_verb_role = ''
        frame_verb_gloss = ''
        all_other_role_gloss_strings = []
        a0_lemmas, a0_glosses, a0_roles = None, None, None
        
        # Extract all frame roles and IDs using regex, and store them in a dictionary where the role (e.g., A0) is the key and the value is a list of xml:ids
        role_id_dict = {role: ids.split(';') for role, ids in re.findall(r'(A[0-9A]+):([^;\s]+)', frame_verb_row['frame'])}

        for role_id in role_id_dict:
            id = role_id_dict[role_id][0] 
            # get the complete wordings relating to the id:
            wordings_dict = get_complete_wordings_for_role(id) # TODO: This is still not working properly, but I'm not sure why. I don't understand the complete contents of each data file yet.
            
        # Find the rows with the extracted role IDs
        role_rows = {role: df.loc[df['xml:id'].isin(ids)] for role, ids in role_id_dict.items()}

        # Generate the output
        frame_verb_lemma, frame_verb_gloss, frame_verb_role = frame_verb_row['lemma'], frame_verb_row['gloss'], 'Process'

        a0_rows = role_rows.get('A0', pd.DataFrame())
        if a0_rows.empty or not 'lemma' in a0_rows:
            '''''' # I'm not sure of the syntax to 'continue/pass' without skipping the rest of the loop... -\_(ツ)_/-
            # 'no a0 rows lemma', a0_rows
        else:
            a0_lemmas, a0_glosses, a0_roles = a0_rows['lemma'].tolist(), a0_rows['gloss'].tolist(), a0_rows['semantic_role_label'].tolist()
            a0_string = f'A0: {", ".join(a0_lemmas)}'
            a0_gloss = f'{", ".join(a0_glosses)}'

        all_other_role_strings = []
        all_other_role_gloss_strings = []
        for role, rows in role_rows.items():
            if role == 'A0':
                continue
            lemmas, glosses, roles = rows['lemma'].tolist(), rows['gloss'].tolist(), rows['semantic_role_label'].tolist()
            role_string = f'[{role}: {", ".join(lemmas)}]'
            role_gloss_string = f'[{roles[0]}: {", ".join(glosses)}]'
            all_other_role_strings.append(role_string)
            all_other_role_gloss_strings.append(role_gloss_string)

        output = output_template.format(
            a0_string=a0_string,
            frame_verb_lemma=frame_verb_lemma,
            all_other_role_strings=" ".join(all_other_role_strings),
            a0_role=a0_roles[0] if a0_roles else '',
            a0_gloss=a0_gloss,
            frame_verb_role=frame_verb_role,
            frame_verb_gloss=frame_verb_gloss,
            all_other_role_gloss_strings=" ".join(all_other_role_gloss_strings)
        )

        outputs.append(output)

    return "\n".join(outputs)

# Example usage:
query_id = 'n41001005008'

result = describe_semantic_configuration(query_id, semantic_role_data)
# print(result)

def get_semantic_configuration_for_word(word_id):
    """Submit a word id (not ref) and get any semantic configurations associated with that word.
    
    This function can be run repeatedly for a given verse, but ensure that you filter the output for repeated constructions.
    """   
    
    output = ''
    # Add semantic role data to syntactic context: 'this word is the {semantic_role} in the configuration {semantic_configuration}' 
    semantic_configuration = describe_semantic_configuration(word_id, semantic_role_data) # output_template was defined above
    # Find the matching row from semantic_role_data, and get all of the labels plus the column names
    role_data_row = semantic_role_data.loc[semantic_role_data['xml:id'] == word_id].iloc[0]
    semantic_role_info = {key: value for key, value in role_data_row.items() if not value == ''}
    if semantic_role_info.get('semantic_role_label'):
        semantic_role = semantic_role_info['semantic_role_label']    
        output += f"- Semantic configuration (useful for figuring out what is taking place in the sentence and how this word plays a role): This word is the {semantic_role} {'in `' + semantic_configuration + '`' if semantic_configuration else ''}, and it has the following data: {semantic_role_info}\n"
    # TODO: need to exploit the roles.json file in order to get all related wordings for each frame. 
    return output

def get_semantic_configurations_for_tokens_and_prune_dups(token_ids):
    all_configs = []
    for token in token_ids:
        configurations = describe_semantic_configuration(token, semantic_role_data)
        if configurations:
            all_configs.extend(configurations.split('\n'))
    return list(set(all_configs))

# Test usage
test_configurations = get_semantic_configurations_for_tokens_and_prune_dups(['n41001005005', 'n41001005006', 'n41001005007', 'n41001005008', 'n41001005009', 'n41001005010','n44003001001'])
for config in test_configurations:
    print('>', config)

> [A0: χώρα] [ἐκπορεύομαι]  / [Source: region] [Process: were going out] 
> [A0: χώρα] [ἐξομολογέω] [A1: ἁμαρτία] / [Source: region] [Process: confessing] [Goal: sins]
> [A0: Πέτρος] [ἀναβαίνω]  / [Source: Peter] [Process: were going up] 
> [] [βαπτίζω] [A1: χώρα] / [: ] [Process: were being baptized] [Source: region]


In [120]:
# 3. Get OpenText values filtered by morphosyntax? Need to fill out the API endpoint for OpenText values a bit more to filter by `(ww | w)/sem/@class`

OPENTEXT_ENDPOINT = "https://ww-network-annotations---macula-atlas-api-qa-25c5xl4maa-uk.a.run.app/graphql/"

opentext_query = """
query TokensWithAllOpenTextSystemValues(
  $wordTokensFilters: WordTokenFilter
  $wordTokensPagination: OffsetPaginationInput
  $annotationInstancesFilters: AnnotationFilter
) {
  wordTokens(filters: $wordTokensFilters, pagination: $wordTokensPagination) {
    ref
    annotationInstances(filters: $annotationInstancesFilters) {
      uri
      feature {
        label
        data
      }
    }
  }
}
"""

def get_opentext_syntax_data(xmlId):
    tokenData = mg.loc[xmlId].to_dict()
    passage = tokenData["ref"].split("!")[0]

    opentext_variables = {
        "wordTokensFilters": {"passageReference": passage},
        "annotationInstancesFilters": {
            "uri": {
                "regex": "https://github.com/OpenText-org/placeholder-data:system-values..+.all"
            }
        },
    }

    opentext_payload = {"query": opentext_query, "variables": opentext_variables}

    response = requests.post(OPENTEXT_ENDPOINT, json=opentext_payload, headers=headers)
    response_data = json.loads(response.text)
    words = response_data["data"]["wordTokens"]

    results = []
    for word in words:
        ref = word["ref"]
        if ref == tokenData["ref"]:
            annotation_features = word["annotationInstances"]
            for feature in annotation_features:
                feature_name = feature["feature"]["label"]
                if feature_name.startswith("$"):
                    feature_description = (
                        f"The grammar requires a {feature_name} lemma here"
                    )
                else:
                    feature_description = feature["feature"]["data"]["description"]
                    if feature_name.endswith("_tbd"):
                        feature_description += (
                            " (Fall back to morphological description of this wording)"
                        )
                    results.append(
                        {
                            "feature_name": feature_name,
                            "feature_description": feature_description,
                        }
                    )

    return results

#     # Add opentext syntax data to syntactic context
#     # opentext_syntax_data = get_opentext_syntax_data(word_id)
#     # if opentext_syntax_data:
#     #     output += f"- OpenText syntax (useful for identifying all of the grammatical choices that led up to this word, such as whether it is part of a derived 'entity' definition or a nested 'turn' or a particular kind of speech act):\n  This word has the following syntactic selection features:\n"
#     #     for feature in opentext_syntax_data:
#     #         output += f"  - {feature['feature_name']}: {feature['feature_description']}\n"
    

In [140]:

# def get_syntax_data_by_verse(verse_ref):
#     all_token_ids_in_verse = mg.loc[mg['ref'] == verse_ref]['xml:id'].tolist()
#     first_word = all_token_ids_in_verse[0]

#     # get lowfat data for sentence scope
#     treedown = get_treedown_by_ref(verse_ref, brackets=True)

#     # get semantic configurations
#     configurations = get_semantic_configurations_for_tokens_and_prune_dups(all_token_ids_in_verse)
    
#     # TODO: get opentext values relating to morphosyntax
#     # currently this returns all values
#     opentext_values = get_opentext_syntax_data(first_word)

#     return f"""
#     ## Syntactic context and function
#     - Treedown syntax: This word is part of the following sentence:
#     {treedown}
#     - Semantic configurations (useful for figuring out what is taking place in the sentence and how this word plays a role):
#     {configurations}
#     - OpenText syntax (useful for identifying all of the grammatical choices that led up to this word, such as whether it is part of a derived 'entity' definition or a nested 'turn' or a particular kind of speech act):
#     {opentext_values}
#     """
def get_syntax_data_by_verse(verse_ref):
    output = ''
    all_token_ids_in_verse = mg.loc[mg['book_chapter_verse'] == verse_ref]['id'].tolist()
    first_word = all_token_ids_in_verse[0]

    # get lowfat data for sentence scope
    treedown = get_treedown_by_ref(verse_ref, brackets=True)
    if treedown:
        treedown_string = f"""
- Treedown syntax: This word is part of the following sentence:
        {treedown}
        """

    # get semantic configurations
    configurations = get_semantic_configurations_for_tokens_and_prune_dups(all_token_ids_in_verse)
    if configurations:
        configurations_string = f"""
- Semantic configurations (useful for figuring out what is taking place in the sentence and how this word plays a role):
        {configurations}
        """
    
    # TODO: get opentext values relating to morphosyntax
    # currently this returns all values
    opentext_values = get_opentext_syntax_data(first_word)
    if opentext_values:

        opentext_values_string = """
- OpenText syntax (useful for identifying all of the grammatical choices that led up to this word, such as whether it is part of a derived 'entity' definition or a nested 'turn' or a particular kind of speech act):"""
        for value in opentext_values:
            opentext_values_string += f"\n   - {value['feature_name']}: {value['feature_description']}"

    if not treedown_string and not configurations_string and not opentext_values_string:
        return None
    else:
        ouput = """
## Syntactic context and function
        """
        if treedown_string:
            output += treedown_string
        if configurations_string:
            output += configurations_string
        if opentext_values_string:
            output += opentext_values_string
        return output

In [141]:
# Test
print(get_syntax_data_by_verse('MAT 5:4'))


- Treedown syntax: This word is part of the following sentence:
        
[[p: Blessed [are] (μακάριοι)] [s: those (οἱ)] 
[[v: mourning,(πενθοῦντες)] [+: for (ὅτι)] 
[[s: they (αὐτοὶ)] [v: will be comforted.(παρακληθήσονται)] 
        
- Semantic configurations (useful for figuring out what is taking place in the sentence and how this word plays a role):
        ['[] [παρακαλέω] [A1: αὐτός] / [: ] [Process: will be comforted] [Goal: they]', '[] [πενθέω]  / [: ] [Process: mourning] ']
        
- OpenText syntax (useful for identifying all of the grammatical choices that led up to this word, such as whether it is part of a derived 'entity' definition or a nested 'turn' or a particular kind of speech act):
   - #characterization: The circumstance construes an "environment" by means of a generalization that is instantiated by an entity. This entails a circumstantial generalization either of an entire grammatical unit or of some part of it, expressed by a wording that appears "predicative" 

## Discourse
- [ ] Levinsohn features we want a line-by-line representation of the Levinsohn data... With each feature intance related to any of the tokens displayed next to the tokens, and a glossary of what those feature labels mean below
- [ ] Specifically the speech act values from opentext?
- [ ] Add speaker quotation data

In [151]:

ENDPOINT = "https://macula-atlas-api-qa-25c5xl4maa-uk.a.run.app/graphql/"
headers = {"Content-Type": "application/json"}

# Levinsohn discourse features query

discourse_features_query = """
query AnnotationFeatures($filters1: AnnotationFeatureFilter, $filters2: AnnotationFilter, $filters3: WordTokenFilter ) {
  annotationFeatures(filters: $filters1) {
    label
    uri
    instances(filters: $filters2) {
      uri
      tokens(filters: $filters3) {
        ref
        wordValue
        xmlId
      }
    }
  }
}
"""


def get_discourse_annotation_types(xmlId):
    tokenData = mg.loc[xmlId].to_dict()
    passage = tokenData["ref"].split("!")[0]

    variables = {
        "filters1": {
            "reference": passage,
        },
        "filters2": {
            "reference": passage,
        },
        "filters3": {
            "xmlId": xmlId,
        },
    }

    payload = {"query": discourse_features_query, "variables": variables}

    response = requests.post(ENDPOINT, json=payload, headers=headers)

    response_data = json.loads(response.text)
    annotation_features = response_data["data"]["annotationFeatures"]

    labels = [feature["label"] for feature in annotation_features]
    return labels




def get_discourse_information(word_id):
    # Retrieve and return discourse information for the given word
    output = ''
    word_data = mg.loc[word_id].to_dict()
    word_ref = word_data['ref']
    
    discourse_features = get_discourse_annotation_types(word_id)
    if discourse_features:
        output += f"This word functions within {len(discourse_features)} discourse features (these are useful heuristic interpretive annotations that tell you about the nature of the proposition a word is in):\n"
        for feature in discourse_features:
            output += f"- {feature} is defined as {discourse_types[feature]['description']}\n"
    
    speaker_information = get_speaker_quotation_data(word_ref)
    if speaker_information:
        output += f"\nSpeaker data is critical to identifying quoted material and relating it to the proper speaker.\n"
        if len(speaker_information) == 1:
            speaker = speaker_information[0]
            output += f"This word is spoken by {speaker['who_is_speaking']}"
            if speaker.get("Divinity") == "Y":
                output += f"- {speaker['who_is_speaking']} is a divinity"
            if speaker.get("Age"):
                output += f", age: {speaker['Age']}"
            if speaker.get("Comment"):
                output += f" ({speaker['Comment']})"
            if len(speech) > 100:
                speech = speaker.get("what_is_said_truncated")
            else:
                speech = speaker.get("what_is_said_complete")
            output += f", who says (in a {speaker['delivery_tone']} tone), \"{speech}\"\n"
        
        else:
            output += f"This word is spoken by {len(speaker_information)} speaker(s): \n"
            for speaker in speaker_information:
                output += f"This word is spoken by {speaker['who_is_speaking']}"
                if speaker.get("Divinity") == "Y":
                    output += f"- {speaker['who_is_speaking']} is a divinity"
                if speaker.get("Age"):
                    output += f", {speaker['Age']} years old"
                if speaker.get("Comment"):
                    output += f" ({speaker['Comment']})"
                if len(speech) > 100:
                    speech = speaker.get("what_is_said_truncated")
                else:
                    speech = speaker.get("what_is_said_complete")
                output += f", who says (in a {speaker['Delivery']} tone), \"{speech}\"\n"
    return output

# TODO: get discourse context for a passage... all features plus wordings in sentence scope I suppose

## Lexical and Cultural context
- [ ] The domain labels for the TFIDF top hits in the passage
- [ ] A synthetic summary (generated by LLM?) of the subject matter on that basis

In [45]:
if 'tfidf.dictionary.model' not in os.listdir():
    !wget "https://github.com/ryderwishart/biblical-machine-learning/raw/main/gpt-inferences/tfidf.dictionary.model"
if 'tfidf.model' not in os.listdir():
    !wget "https://github.com/ryderwishart/biblical-machine-learning/raw/main/gpt-inferences/tfidf.model"

--2023-05-17 00:20:15--  https://github.com/ryderwishart/biblical-machine-learning/raw/main/gpt-inferences/tfidf.dictionary.model
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/ryderwishart/biblical-machine-learning/main/gpt-inferences/tfidf.dictionary.model [following]
--2023-05-17 00:20:15--  https://raw.githubusercontent.com/ryderwishart/biblical-machine-learning/main/gpt-inferences/tfidf.dictionary.model
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 40357556 (38M) [application/octet-stream]
Saving to: ‘tfidf.dictionary.model’


2023-05-17 00:20:15 (185 MB/s) - ‘tfidf.dictionary.model’ sav

In [46]:
from collections.abc import Mapping
import numpy
from unidecode import unidecode
from gensim.models.tfidfmodel import TfidfModel
from gensim.corpora import Dictionary
tfidf_dictionary = Dictionary.load('tfidf.dictionary.model')
tfidf_model = TfidfModel().load('tfidf.model')

perseus_stopwords = "μή, ἑαυτοῦ, ἄν, ἀλλ', ἀλλά, ἄλλος, ἀπό, ἄρα, αὐτός, δ', δέ, δή, διά, δαί, δαίς, ἔτι, ἐγώ, ἐκ, ἐμός, ἐν, ἐπί, εἰ, εἰμί, εἴμι, εἰς, γάρ, γε, γα, ἡ, ἤ, καί, κατά, μέν, μετά, μή, ὁ, ὅδε, ὅς, ὅστις, ὅτι, οὕτως, οὗτος, οὔτε, οὖν, οὐδείς, οἱ, οὐ, οὐδέ, οὐκ, περί, πρός, σύ, σύν, τά, τε, τήν, τῆς, τῇ, τι, τί, τις, τίς, τό, τοί, τοιοῦτος, τόν, τούς, τοῦ, τῶν, τῷ, ὑμός, ὑπέρ, ὑπό, ὡς, ὦ, ὥστε, ἐάν, παρά, σός".split(', ')
perseus_stopwords += "συ δ μοι".split(' ')
perseus_stopwords = [unidecode(w) for w in perseus_stopwords]

def tfidf_tokenize(string):
    output = string
    # Filter numeric digits from token
    output = ''.join(filter(lambda x: x.isalpha() or x == ' ', string))
    return [token.lower() for token in output.split() if unidecode(token.lower()) not in perseus_stopwords] # use unidecode to strip accents temporarily

# example
input_text = "Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος."
input_tokens = [w for w in tfidf_tokenize(input_text)]
input_bow = tfidf_dictionary.doc2bow(input_tokens)
input_tfidf = tfidf_model[input_bow]
summary = sorted(input_tfidf, key=lambda x: x[1], reverse=True)[:10]
print('Most significant words in input text: ')
for result in summary:
    id, score = result
    token = tfidf_dictionary[id]
    print(f'{score:.2f}: {token}')
    


def get_tfidf_summary(input_text):
    input_tokens = [w for w in tfidf_tokenize(input_text) if w not in stopwords]
    input_bow = tfidf_dictionary.doc2bow(input_tokens)
    input_tfidf = tfidf_model[input_bow]
    summary = sorted(input_tfidf, key=lambda x: x[1], reverse=True)[:10]
    output = []
    for result in summary:
        id, score = result
        token = tfidf_dictionary[id]
        output.append((f'{score:.2f}',token))
    return output

print(get_tfidf_summary(input_text))

Most significant words in input text: 
0.56: λόγος
0.56: θεόν
0.48: ἀρχῇ
0.39: θεὸς
[('0.56', 'λόγος'), ('0.56', 'θεόν'), ('0.48', 'ἀρχῇ'), ('0.39', 'θεὸς')]


In [195]:
def get_lexical_information(word_id):
    # Retrieve and return lexical prose only for the given word
    word_data = mg.loc[word_id].to_dict()
    lemma = word_data.get('lemma', '')
    text = word_data.get('text', '')
    
    key_descriptions = {
        "class": f"{text} is a ",
        "gloss": 'meaning \"',
        "lemma": "The lemma form of this word is "
    }

    output = f"Lexical information for {lemma}:\n"
    for key, value in word_data.items():
        if value not in (None, 'missing', 'nan'):
            if key in key_descriptions:
                output += f"    - {key}: {key_descriptions[key]}{value},\n"

    return output.strip(',\n')

def get_tfidf_filtered_lexical_information(token_ids=None):
    print('getting token data for token ids', token_ids)
    # Retrieve and return TFIDF-filtered lexical information for the given passage
    tokens = [mg.loc[token_id].to_dict() for token_id in token_ids]
    
    passage_string = ' '.join([token['text'] for token in tokens])
    most_significant_token_tuples_in_passage = get_tfidf_summary(passage_string) # returns an array of (score,token) tuples
    print('most_significant_token_tuples_in_passage', most_significant_token_tuples_in_passage, f'({passage_string})')
    
    tokens_data = []
    for score, token in most_significant_token_tuples_in_passage:
        token_data = mg.loc[mg['text'].apply(lambda x: x.lower()) == token].iloc[0].to_dict()
        token_data['tfidf_score'] = score
        tokens_data.append(token_data['id'])
    
    return tokens_data

In [196]:
# Test 
print(get_lexical_information('n41001003002'))
print(get_tfidf_filtered_lexical_information(['n41001003002', 'n41001003003']))

Lexical information for βοάω:
    - class: βοῶντος is a verb,
    - gloss: meaning "of one crying,
    - lemma: The lemma form of this word is βοάω
getting token data for token ids ['n41001003002', 'n41001003003']
most_significant_token_tuples_in_passage [('1.00', 'βοῶντος')] (βοῶντος ἐν)
['n40003003012']


In [197]:
def get_cultural_information(word_id):
    # Retrieve and return cultural information for the given word
    word_data = mg.loc[word_id].to_dict()

    output = f"Cultural information for {word_data.get('lemma', '')}:\n"
    
    ln_data = word_data.get('ln', '')
    domain_data = word_data.get('domain_label', '')
    
    if domain_data not in (None, 'missing', 'nan'):
        domain_string = '; '.join(domain_data)
        output += f"- Domain label: {domain_string},\n"
    if ln_data not in (None, 'missing', 'nan'):
        # output += f"- Louw and Nida domain: {ln_data},\n"
        pass
                
    return output.strip(',\n')

In [198]:
# Test
print(get_cultural_information('n41001003002'))

Cultural information for βοάω:
- Domain label: Speak, Talk


In [203]:
def get_verse_significant_lexical_data(verse_ref):
    token_ids = mg.loc[mg['book_chapter_verse'] == verse_ref]['id'].tolist()
    
    # get tfidf tokens
    tfidf_tokens = get_tfidf_filtered_lexical_information(token_ids)
    tfidf_token_forms = [mg.loc[mg['id'] == token_id]['text'].tolist()[0] for token_id in tfidf_tokens]
    
    # get lexical and cultural data for each token
    lexical_data = [get_lexical_information(token) for token in tfidf_tokens]
    cultural_data = [get_cultural_information(token) for token in tfidf_tokens]
    
    output = f"""
## Lexical features
    - TF-IDF tokens: {', '.join(tfidf_token_forms)}
"""
    
    output += '\n'.join(f"- {data}" for data in lexical_data)

    if cultural_data:
        output += f"\n## Cultural/encyclopedic knowledge"
        output += '\n'.join(f"    - {data}" for data in cultural_data)
    
    return output

In [204]:
# Test
print(get_verse_significant_lexical_data('MAT 5:3'))

getting token data for token ids ['n40005003001', 'n40005003002', 'n40005003003', 'n40005003004', 'n40005003005', 'n40005003006', 'n40005003007', 'n40005003008', 'n40005003009', 'n40005003010', 'n40005003011', 'n40005003012']
most_significant_token_tuples_in_passage [('0.61', 'πτωχοὶ'), ('0.46', 'μακάριοι'), ('0.45', 'οὐρανῶν'), ('0.38', 'βασιλεία'), ('0.27', 'πνεύματι'), ('0.03', 'ἐστιν'), ('0.03', 'αὐτῶν')] (Μακάριοι οἱ πτωχοὶ τῷ πνεύματι ὅτι αὐτῶν ἐστιν ἡ βασιλεία τῶν οὐρανῶν)

## Lexical features
    - TF-IDF tokens: πτωχοὶ, Μακάριοι, οὐρανῶν, βασιλεία, Πνεύματι, ἐστιν, αὐτῶν
- Lexical information for πτωχός:
    - class: πτωχοὶ is a adj,
    - gloss: meaning "poor,
    - lemma: The lemma form of this word is πτωχός
- Lexical information for μακάριος:
    - class: Μακάριοι is a adj,
    - gloss: meaning "Blessed [are],
    - lemma: The lemma form of this word is μακάριος
- Lexical information for οὐρανός:
    - class: οὐρανῶν is a noun,
    - gloss: meaning "heavens,
    - lemma: T

## Social Situation
- [ ] The social situation information for the passage should only be accessed once


In [None]:


def get_social_information(word_id):
    """Return situation information for a word, if available.
    
    It should suffice to submit any word in a passage."""
    # Retrieve and return social information for the given word
    word_data = mg.loc[word_id].to_dict()
    word_ref = word_data['ref']
    
    lookup = situations_lookup_json
    situation_data = get_situations_data(word_ref)
    # if situation_data and situation_data.get('matchingSituation'):
    #     situation_data = situation_data['matchingSituation']
    if situation_data:
        pre_text_features = situation_data['preTextFeatures']
        via_text_features = situation_data['viaTextFeatures']
        
        if pre_text_features == 'no embedded discourse':
            return None

        pre_text_feature_descriptions, pre_text_system_descriptions = process_features(lookup, pre_text_features)
        via_text_feature_descriptions, via_text_system_descriptions = process_features(lookup, via_text_features)

        mutations = generate_mutations(pre_text_features, via_text_features, lookup)
        print(mutations)

        output = f"This word is part of the passage '{situation_data['title'][0]}'\n"

        # Add situation type information
        if situation_data.get('type'):
            situation_type = situation_data['type']
            output += f"This passage is a {situation_type['title']} situation, which can be described in typical terms as follows: {situation_type['description']}\n"

        output += f"It begins as a {' '.join(pre_text_features)} situation\n"
        output += '\n'.join(pre_text_feature_descriptions)
        # output += '\n'.join(pre_text_system_descriptions)

        # output += f"And ends as a {' '.join(via_text_features)} situation\n"
        # output += '\n'.join(via_text_feature_descriptions)
        # output += '\n'.join(via_text_system_descriptions)

        if mutations and mutations[-1] != '':
            output += "During the passage, the situation:\n"
            output += '\n'.join(mutations)
    
    return output

## Synthesizing Context Functions

You should be able to return the context for a word, or else the context for a passage.

In [205]:
def get_context_for_verse(verse_ref):
    """Submit a verse reference (e.g., 'MAT 1:1') and return
    natural-language linguistic data.
    """
    # Generate and return the prosaic context for the given verse
    if '!' in verse_ref:
        # If a word ref gets passed in, just get all of the words for the word's verse
        verse_tokens = mg.loc[mg['book_chapter_verse'] == verse_ref.split('!')[0]].to_dict('records')
    else:
        # If a verse ref gets passed in, get all of the words for the verse
        verse_tokens = mg.loc[mg['book_chapter_verse'] == verse_ref].to_dict('records')
    
    token_ids = [token['id'] for token in verse_tokens]
    print(token_ids)


    ### LEXIS ###

    lexical_data = get_verse_significant_lexical_data(verse_ref)
    
    ### SYNTAX ###

    syntax_data = get_syntax_data_by_verse(verse_ref)
    
    ### DISCOURSE ###
        
    # Get discourse context for first word in verse
    # word_data.append(get_context_for_word(most_significant_tokens[0]['id'], selected_fields=['discourse']))
    
    ### SOCIAL CONTEXT ###

    # Get social context for verse using first word
    # word_data.append(get_context_for_word(most_significant_tokens[0]['id'], selected_fields=['social']))
    
    
    
    # filter out duplicate lines in word data
    # word_data = list(set(word_data))
    
    results = [syntax_data, lexical_data]
    return '\n'.join(results)

In [208]:
def get_similar_docs(query_string):
    return collection.search(query_string, search_type='similarity')

def get_range_of_examples(query_string):
    return collection.search(query_string, search_type='mmr')

def get_prosaic_context_for_verse(verse_ref_string):
    return get_context_for_verse(verse_ref_string)

# prompt should be something like this:
"""
(
    "You are a language model trained in linguistics," 
    " and you are great at summarizing structured data while"
    " focusing on linguistic features without delving into theological issues."
    " Please provide a concise textual commentary on the given {linguistic_data}"
    " (use the treedown representation for the larger text context of the target word)"
    " by examining its key lexical choices, syntactic structures, discourse organization,"
    " social context, and cultural references. Avoid personal opinions and maintain objectivity."
    " Illuminate the passage's nuances, foster clarity, and establish connections within the work,"
    " empowering readers to grasp the author's intentions and the interplay between language and content."
    " Please format your output using the following headings:"
    " 1. Lexical features"
    " 2. Syntactic context and function"
    " 3. Discourse context"
    " 4. Social context"
    " 5. Cultural/encyclopedic knowledge"    
).format(linguistic_data=linguistic_data)
"""

def answer_question_with_context(input_question):
    # first, find the relevant bible verse source ids
    similar_verses = get_similar_docs(input_question)
    # second, get the prosaic context for the first verse (later this could be a multi-verse analysis...)
    verse_refs = [verse.metadata['source'] for verse in similar_verses]
    print('>>>>>>', verse_refs)
    prose = get_prosaic_context_for_verse(verse_refs[0])
    return prose

In [210]:
# Test
print(answer_question_with_context('MAT 5:3'))

>>>>>> ['JHN 12:36', 'MAT 19:18', 'JHN 18:4', 'ROM 13:9']
['n43012036001', 'n43012036002', 'n43012036003', 'n43012036004', 'n43012036005', 'n43012036006', 'n43012036007', 'n43012036008', 'n43012036009', 'n43012036010', 'n43012036011', 'n43012036012', 'n43012036013', 'n43012036014', 'n43012036015', 'n43012036016', 'n43012036017', 'n43012036018', 'n43012036019', 'n43012036020']
getting token data for token ids ['n43012036001', 'n43012036002', 'n43012036003', 'n43012036004', 'n43012036005', 'n43012036006', 'n43012036007', 'n43012036008', 'n43012036009', 'n43012036010', 'n43012036011', 'n43012036012', 'n43012036013', 'n43012036014', 'n43012036015', 'n43012036016', 'n43012036017', 'n43012036018', 'n43012036019', 'n43012036020']
most_significant_token_tuples_in_passage [('0.46', 'ἐκρύβη'), ('0.41', 'πιστεύετε'), ('0.36', 'γένησθε'), ('0.32', 'ἀπελθὼν'), ('0.31', 'ἐλάλησεν'), ('0.26', 'φῶς'), ('0.25', 'ἰησοῦς'), ('0.24', 'υἱοὶ'), ('0.23', 'ἔχετε'), ('0.19', 'φωτὸς')] (ὡς τὸ φῶς ἔχετε πιστεύετ

In [282]:
collection.search('How did Jesus heal the man born blind?', search_type='similarity')

[Document(page_content='And  came  to Him  blind  and  lame  in  the  temple, and  He healed  them. ', metadata={'source': 'MAT 21:14', 'book': 'MAT', 'chapter': '21', 'verse': '14', 'ids': 'n40021014001|n40021014002|n40021014003|n40021014004|n40021014005|n40021014006|n40021014007|n40021014008|n40021014009|n40021014010|n40021014011|n40021014012', 'greek': 'Καὶ προσῆλθον αὐτῷ τυφλοὶ καὶ χωλοὶ ἐν τῷ ἱερῷ,καὶ ἐθεράπευσεν αὐτούς.'}),
 Document(page_content='And  -  Jesus  said  to him  Receive sight· The  faith  of you  has healed  you. ', metadata={'source': 'LUK 18:42', 'book': 'LUK', 'chapter': '18', 'verse': '42', 'ids': 'n42018042001|n42018042002|n42018042003|n42018042004|n42018042005|n42018042006|n42018042007|n42018042008|n42018042009|n42018042010|n42018042011', 'greek': 'καὶ ὁ Ἰησοῦς εἶπεν αὐτῷ Ἀνάβλεψον·ἡ πίστις σου σέσωκέν σε.'}),
 Document(page_content='And  answering  him  -  Jesus  says  What  to you  do you desire  I should do; -  And  the blind [man]  said  to Him  Rabboni, t

# Chain Actions Together

This step brings together two phases into a user experience:

1. The user inputs a query, and the Retriever finds the most relevant biblical passage.
2. Context is provided by `answer_question_with_context()` for the most relevant passage's verse reference. 

In [216]:
# Inspect some texts 
print('MAT 1:1 -->', collection.search('MAT 1:1', search_type='similarity'))
print('if the user queries "blindess" -->', collection.search('blindess', search_type='similarity')) # note the typo. still works -\_(ツ)_/-
# TODO: sort out metadata filtering
print('blind with filters -->', collection.search('blind in 1PE', search_type='similarity'))

MAT 1:1 --> [Document(page_content='Jacob  then  begat  -  Joseph  the  husband  of Mary, out of  whom  was born  Jesus  the [One]  being called  Christ. ', metadata={'source': 'MAT 1:16', 'book': 'MAT', 'chapter': '1', 'verse': '16', 'ids': 'n40001016001|n40001016002|n40001016003|n40001016004|n40001016005|n40001016006|n40001016007|n40001016008|n40001016009|n40001016010|n40001016011|n40001016012|n40001016013|n40001016014|n40001016015', 'greek': 'Ἰακὼβ δὲ ἐγέννησεν τὸν Ἰωσὴφ τὸν ἄνδρα Μαρίας,ἐξ ἧς ἐγεννήθη Ἰησοῦς ὁ λεγόμενος Χριστός.'}), Document(page_content='and  behold  you will conceive  in  womb  and  will bring forth  a son, and  you shall call  the  name  of Him  Jesus. ', metadata={'source': 'LUK 1:31', 'book': 'LUK', 'chapter': '1', 'verse': '31', 'ids': 'n42001031001|n42001031002|n42001031003|n42001031004|n42001031005|n42001031006|n42001031007|n42001031008|n42001031009|n42001031010|n42001031011|n42001031012|n42001031013|n42001031014', 'greek': 'καὶ ἰδοὺ συλλήμψῃ ἐν γαστρὶ καὶ 

In [218]:
from langchain.chains.base import Chain
from langchain.retrievers.tfidf import TFIDFRetriever
from langchain.schema import Document

from __future__ import annotations

from typing import Any, Dict, List, Optional

from pydantic import Extra

from langchain.base_language import BaseLanguageModel
from langchain.callbacks.manager import (
    AsyncCallbackManagerForChainRun,
    CallbackManagerForChainRun,
)
from langchain.chains.base import Chain
from langchain.prompts.base import BasePromptTemplate

class RetrieverWithContextChain(Chain):
    retriever: TFIDFRetriever
    llm: BaseLanguageModel
    output_key: str = "text"

    @property
    def input_keys(self) -> List[str]:
        return ["query"]

    @property
    def output_keys(self) -> List[str]:
        return [self.output_key]

    def _call(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Dict[str, str]:
        # Retrieve relevant documents based on the query
        relevant_docs = self.retriever.get_relevant_documents(inputs["query"])
        
        # Assume the first document is the most relevant one
        most_relevant_doc = relevant_docs[0] if relevant_docs else Document()

        # Provide context for the verse reference in the most relevant document
        context_response = self.llm.generate_from_context(
            {"verse_reference": most_relevant_doc.metadata.get("source")},
            callbacks=run_manager.get_child() if run_manager else None,
        )

        return {self.output_key: context_response.get("text", "")}

    # TODO: define `_acall` method for asynchronous operation



In [220]:
# bible_tool = RetrieverWithContextChain(
#     retriever=collection.as_retriever(),
#     llm=llm
# )


In [222]:
from langchain import PromptTemplate, OpenAI, LLMChain
# Multiple inputs example

# template = """Tell me a {adjective} joke about {subject}."""
template = """Please answer this question "{question}", using only this context:\n\n{context}"""

prompt = PromptTemplate(template=template, input_variables=["question", "context"])
llm_chain = LLMChain(
    prompt=prompt, 
    llm=OpenAI(temperature=0)
)

llm_chain.predict(question="sad", context="ducks")



'\n\nMournful'

# Use Agents to Execute Informed QA

In [223]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType
from pathlib import Path

# Load Language Model
llm = OpenAI(temperature=0)

In [245]:
# Import things that are needed generically
from langchain import LLMMathChain, SerpAPIWrapper
from langchain.agents import AgentType, initialize_agent
from langchain.chat_models import ChatOpenAI
from langchain.tools import BaseTool, StructuredTool, Tool, tool

# Create a RetrievalQA tool with this vectorstore
bible_tool = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=collection.as_retriever())
 
@tool("Accurate Bible Context System", return_direct=True)
def context_expansion_tool(verse_ref: str) -> str:
    """useful for when you know what passage a question refers to,
    but you want to check your data to be sure your answer is accurate.
    Input should be a verse reference like MAT 3:5 or LUK 1:1.
    Use the sources from Bible Passage Finder System."""
    return "MACULA Context:" + answer_question_with_context(verse_ref)


tools = [
    Tool(
        name = "Bible Passage Finder System",
        func=bible_tool.run,
        description="useful for when you need to answer questions about the Bible, but aren't sure which verse to look at. Input should be a fully formed question.",
        return_direct=True  # If you want to use the agent as a router and return results directly
    ),
    context_expansion_tool,
    # Tool(
    #     name = "Accurate Bible Context System",
    #     func=bible_tool.run,
    #     description=("useful for when you know what passage a question refers to,"
    #         " but you want to check your data to be sure your answer is accurate."
    #         " Input should be a verse reference like MAT 3:5 or LUK 1:1."
    #         " Use the sources from Bible Passage Finder System."),
    #     return_direct=True  # If you want to use the agent as a router and return results directly
    # ),
]

In [246]:
# Initialize the agent
agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)


In [227]:
agent.run("Who heals the man born blind? How do people respond to that healing?")





[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m I need to find a Bible passage that talks about this.
Action: Bible Passage Finder System
Action Input: Who heals the man born blind? How do people respond to that healing?[0m




Observation: [36;1m[1;3m Jesus heals the man born blind. People respond to the healing by asking how it is possible and Jesus responds by telling them to go and tell John what they have seen and heard - that the blind receive sight, the lame walk, lepers are cleansed, the deaf hear, the dead are raised, and the poor are gospelized.[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m


' Jesus heals the man born blind. People respond to the healing by asking how it is possible and Jesus responds by telling them to go and tell John what they have seen and heard - that the blind receive sight, the lame walk, lepers are cleansed, the deaf hear, the dead are raised, and the poor are gospelized.'

In [228]:
agent.run("Based on the Accurate Bible Context System, who heals the man born blind? How do people respond to that healing?")





[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m I need to find the passage that talks about the healing of the man born blind.
Action: Bible Passage Finder System
Action Input: healing of the man born blind[0m




Observation: [36;1m[1;3m Yes, He healed the man born blind.[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m


' Yes, He healed the man born blind.'

## Set up a custom agent

In [268]:
from langchain.agents import Tool, AgentExecutor, LLMSingleActionAgent, AgentOutputParser
from langchain.prompts import StringPromptTemplate
from langchain import OpenAI, SerpAPIWrapper, LLMChain
from typing import List, Union
from langchain.schema import AgentAction, AgentFinish
import re

In [269]:
# Set up the base template
template = """You are ChatGPT. You are notoriously bad for hallucinating convincing answers that could be completely inaccurate. Therefore, ensure you always gather appropriate context using the tools available to you and do not try to interpret a passage of the Bible directly.

Answer the following questions as best you can. You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}] (Bible Passage Finder System if I do not have a specific verse reference, otherwise Accurate Bible Context System if I have a verse reference but have not used Accurate Bible Context System yet.)
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times until you have retrieved linguistic data context for that passage)
Thought: I now know the final answer
Final Answer: the final answer to the original input question based on the observation of Accurate Bible Context System's output

Begin! Remember to that you can look up passages using Bible Passage Finder System to figure out what passage a question relates to, but do not try to answer questions directly without looking up the verse reference using Accurate Bible Context System.

Question: {input}
{agent_scratchpad}"""

In [283]:
# Set up a prompt template
class CustomPromptTemplate(StringPromptTemplate):
    # The template to use
    template: str
    # The list of tools available
    tools: List[Tool]
    
    def format(self, **kwargs) -> str:
        # Get the intermediate steps (AgentAction, Observation tuples)
        # Format them in a particular way
        intermediate_steps = kwargs.pop("intermediate_steps")
        thoughts = ""
        for action, observation in intermediate_steps:
            print('in custom prompt formatter:', action, observation)
            thoughts += action.log
            thoughts += f"\nObservation: {observation}\nThought: "
        # Set the agent_scratchpad variable to that value
        kwargs["agent_scratchpad"] = thoughts
        # Create a tools variable from the list of tools provided
        kwargs["tools"] = "\n".join([f"{tool.name}: {tool.description}" for tool in self.tools])
        # Create a list of tool names for the tools provided
        kwargs["tool_names"] = ", ".join([tool.name for tool in self.tools])
        print('formatted prompt:', self.template.format(**kwargs))
        return self.template.format(**kwargs)

In [284]:
prompt = CustomPromptTemplate(
    template=template,
    tools=tools,
    # This omits the `agent_scratchpad`, `tools`, and `tool_names` variables because those are generated dynamically
    # This includes the `intermediate_steps` variable because that is needed
    input_variables=["input", "intermediate_steps"]
)

In [285]:
class CustomOutputParser(AgentOutputParser):
    
    def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
        # Check if agent should finish
        if "MACULA" in llm_output:
            print('macula data found in output')
            if "Final Answer:" in llm_output:
                return AgentFinish(
                    # Return values is generally always a dictionary with a single `output` key
                    # It is not recommended to try anything else at the moment :)
                    return_values={"output": llm_output.split("Final Answer:")[-1].strip()},
                    log=llm_output,
                )
        # Parse out the action and action input
        regex = r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)"
        match = re.search(regex, llm_output, re.DOTALL)
        if not match:
            raise ValueError(f"Could not parse LLM output: `{llm_output}`")
        action = match.group(1).strip()
        action_input = match.group(2)
        # Return the action and action input
        return AgentAction(tool=action, tool_input=action_input.strip(" ").strip('"'), log=llm_output)

In [286]:
output_parser = CustomOutputParser()

In [287]:
# LLM chain consisting of the LLM and a prompt
llm_chain = LLMChain(llm=llm, prompt=prompt)

In [288]:
tool_names = [tool.name for tool in tools]
# agent = LLMSingleActionAgent(
#     llm_chain=llm_chain, 
#     output_parser=output_parser,
#     stop=["\nObservation:"], 
#     allowed_tools=tool_names
# )

print(tool_names)

['Bible Passage Finder System', 'Accurate Bible Context System']


In [289]:
from langchain.agents import Tool, AgentExecutor, BaseMultiActionAgent
from langchain import OpenAI, SerpAPIWrapper
from typing import List, Tuple, Any, Union
from langchain.schema import AgentAction, AgentFinish

class MaculaAgent(BaseMultiActionAgent):
    """MACULA QA Agent."""
    
    @property
    def input_keys(self):
        return ["input"]
    
    def plan(
        self, intermediate_steps: List[Tuple[AgentAction, str]], **kwargs: Any
    ) -> Union[List[AgentAction], AgentFinish]:
        """Given input, decided what to do.

        Args:
            intermediate_steps: Steps the LLM has taken to date,
                along with observations
            **kwargs: User inputs.

        Returns:
            Action specifying what tool to use.
        """
        if len(intermediate_steps) == 0:
            return [
                AgentAction(tool='Bible Passage Finder System', tool_input=kwargs["input"], log=""),
                AgentAction(tool='Accurate Bible Context System', tool_input=kwargs["input"], log=""),
            ]
        else:
            # TODO: ensure output of both actions is parsed
            return AgentFinish(return_values={"output": "MACULA:"}, log="")

    async def aplan(
        self, intermediate_steps: List[Tuple[AgentAction, str]], **kwargs: Any
    ) -> Union[List[AgentAction], AgentFinish]:
        """Given input, decided what to do.

        Args:
            intermediate_steps: Steps the LLM has taken to date,
                along with observations
            **kwargs: User inputs.

        Returns:
            Action specifying what tool to use.
        """
        if len(intermediate_steps) == 0:
            return [
                AgentAction(tool="Search", tool_input=kwargs["input"], log=""),
                AgentAction(tool="RandomWord", tool_input=kwargs["input"], log=""),
            ]
        else:
            return AgentFinish(return_values={"output": "bar"}, log="")

In [266]:
tool_names

['Bible Passage Finder System', 'Accurate Bible Context System']

In [279]:
agent = MaculaAgent()

In [280]:
agent_executor = AgentExecutor.from_agent_and_tools(agent=agent, tools=tools, verbose=True)

ValidationError: ignored

In [261]:
agent_executor.run("Who heals the man born blind? How do people respond to that healing?")





[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to look up the passage that this question relates to.
Action: Bible Passage Finder System
Action Input: Who heals the man born blind?[0m





Observation:[36;1m[1;3m Jesus.[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m


' Jesus.'

In [244]:
agent_executor.run("What are the key lexical terms in the healing of the man born blind?")





[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to find the passage that this question is referring to.
Action: Bible Passage Finder System
Action Input: What is the passage about the healing of the man born blind?[0m





Observation:[36;1m[1;3m The passage does not mention the healing of a man born blind.[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m


' The passage does not mention the healing of a man born blind.'