In [97]:
import requests, json, re, os
import pandas as pd

## Populate Greek token data

In [98]:
def download_file(url, file_name):
    response = requests.get(url)
    with open(file_name, 'wb') as file:
        file.write(response.content)


file1_url = 'https://raw.githubusercontent.com/Clear-Bible/macula-greek/main/Nestle1904/TSV/macula-greek.tsv'
file2_url = 'https://raw.githubusercontent.com/Clear-Bible/macula-greek/main/sources/MARBLE/SDBG/marble-domain-label-mapping.json'
file1_name = 'macula-greek.tsv'
file2_name = 'marble-domain-label-mapping.json'

if file1_name not in os.listdir():
    download_file(file1_url, file1_name)

if file2_name not in os.listdir():
    download_file(file2_url, file2_name)

# Import Macula Greek data
mg = pd.read_csv('macula-greek.tsv', index_col='xml:id', sep='\t',
                 header=0, converters={'*': str}).fillna('missing')
# mg['domain'] = mg['domain'].astype(str).fillna('missing')

# Extract book, chapter, and verse into separate columns
mg[['book', 'chapter', 'verse']] = mg['ref'].str.extract(
    r'(\d?[A-Z]+)\s(\d+):(\d+)')

# Add columns for book + chapter, and book + chapter + verse for easier grouping
mg['book_chapter'] = mg['book'] + ' ' + mg['chapter'].astype(str)
mg['book_chapter_verse'] = mg['book_chapter'] + ':' + mg['verse'].astype(str)

# Import domain-label mapping

# Open the JSON file
with open('marble-domain-label-mapping.json', 'r') as f:

    # Load the contents of the file as a dictionary
    domain_labels = json.load(f)

domain_labels['missing'] = 'no domain'
domain_labels['nan'] = 'no domain'

# Use domain labels to create a new column


def get_domain_label(domain_string_number):
    labels = [domain_labels[label]
              for label in domain_string_number.split(' ')]
    return labels


mg['domain_label'] = mg['domain'].apply(get_domain_label)
mg.head()

Unnamed: 0_level_0,ref,role,class,type,gloss,text,after,lemma,normalized,strong,...,ln,frame,subjref,referent,book,chapter,verse,book_chapter,book_chapter_verse,domain_label
xml:id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n40001001001,MAT 1:1!1,missing,noun,common,[The] book,Βίβλος,,βίβλος,Βίβλος,976,...,33.38,missing,missing,missing,MAT,1,1,MAT 1,MAT 1:1,[Written Language]
n40001001002,MAT 1:1!2,missing,noun,common,of [the] genealogy,γενέσεως,,γένεσις,γενέσεως,1078,...,10.24 33.19,missing,missing,missing,MAT,1,1,MAT 1,MAT 1:1,[Kinship Relations Involving Successive Genera...
n40001001003,MAT 1:1!3,missing,noun,proper,of Jesus,Ἰησοῦ,,Ἰησοῦς,Ἰησοῦ,2424,...,93.169a,missing,missing,missing,MAT,1,1,MAT 1,MAT 1:1,[Persons]
n40001001004,MAT 1:1!4,missing,noun,proper,Christ,Χριστοῦ,,Χριστός,Χριστοῦ,5547,...,93.387,missing,missing,missing,MAT,1,1,MAT 1,MAT 1:1,[Persons]
n40001001005,MAT 1:1!5,missing,noun,common,son,υἱοῦ,,υἱός,υἱοῦ,5207,...,10.30,missing,missing,missing,MAT,1,1,MAT 1,MAT 1:1,[Kinship Relations Involving Successive Genera...


## Populate speaker quotation data locally

In [99]:
file_3_url = 'https://raw.githubusercontent.com/Clear-Bible/speaker-quotations/main/json/SpeakerProjections-clear.json'
file_4_url = 'https://raw.githubusercontent.com/Clear-Bible/speaker-quotations/main/json/character_detail.semantic_data.json'
file_3_name = 'SpeakerProjections-clear.json'
file_4_name = 'character_detail.semantic_data.json' # stores info about each unique character id (unique string value)

if file_3_name not in os.listdir():
    download_file(file_3_url, file_3_name)

if file_4_name not in os.listdir():
    download_file(file_4_url, file_4_name)
    
# Create a dataframe from the SpeakerProjections-clear.json file
with open('SpeakerProjections-clear.json', 'r') as f:
    speaker_projections = json.load(f)
    speaker_data = pd.DataFrame(speaker_projections)
    
# Create a dataframe from the character_detail.semantic_data.json file
with open('character_detail.semantic_data.json', 'r') as f:
    # this data is an array of JSON objects
    character_detail = json.load(f)
    character_data = pd.DataFrame(character_detail)


In [157]:
speaker_data.head() # I will reformat this data to make it easier to work with below

Unnamed: 0,GEN 1:3|GEN 1:3|God,GEN 1:6|GEN 1:6|God,GEN 1:9|GEN 1:9|God,GEN 1:11|GEN 1:11|God,GEN 1:14|GEN 1:15|God,GEN 1:20|GEN 1:20|God,GEN 1:22|GEN 1:22|God,GEN 1:24|GEN 1:24|God,GEN 1:26|GEN 1:26|God,GEN 1:28|GEN 1:30|God,...,REV 21:3|REV 21:4|voice from throne,REV 21:5|REV 21:8|God,REV 21:9|REV 21:9|angel (one of the seven),REV 22:6|REV 22:6|angel (one of the seven),REV 22:7|REV 22:7|Jesus,REV 22:9|REV 22:11|angel (one of the seven),REV 22:12|REV 22:16|Jesus,REV 22:17|REV 22:17|him who hears,"REV 22:17|REV 22:17|Holy Spirit, the",REV 22:20|REV 22:20|Jesus
SpeakerInstance,"{'StartVerse': 'GEN 1:3', 'EndVerse': 'GEN 1:3...","{'StartVerse': 'GEN 1:6', 'EndVerse': 'GEN 1:6...","{'StartVerse': 'GEN 1:9', 'EndVerse': 'GEN 1:9...","{'StartVerse': 'GEN 1:11', 'EndVerse': 'GEN 1:...","{'StartVerse': 'GEN 1:14', 'EndVerse': 'GEN 1:...","{'StartVerse': 'GEN 1:20', 'EndVerse': 'GEN 1:...","{'StartVerse': 'GEN 1:22', 'EndVerse': 'GEN 1:...","{'StartVerse': 'GEN 1:24', 'EndVerse': 'GEN 1:...","{'StartVerse': 'GEN 1:26', 'EndVerse': 'GEN 1:...","{'StartVerse': 'GEN 1:28', 'EndVerse': 'GEN 1:...",...,"{'StartVerse': 'REV 21:3', 'EndVerse': 'REV 21...","{'StartVerse': 'REV 21:5', 'EndVerse': 'REV 21...","{'StartVerse': 'REV 21:9', 'EndVerse': 'REV 21...","{'StartVerse': 'REV 22:6', 'EndVerse': 'REV 22...","{'StartVerse': 'REV 22:7', 'EndVerse': 'REV 22...","{'StartVerse': 'REV 22:9', 'EndVerse': 'REV 22...","{'StartVerse': 'REV 22:12', 'EndVerse': 'REV 2...","{'StartVerse': 'REV 22:17', 'EndVerse': 'REV 2...","{'StartVerse': 'REV 22:17', 'EndVerse': 'REV 2...","{'StartVerse': 'REV 22:20', 'EndVerse': 'REV 2..."
Projections,"[{'StartWord': 'o010010030031', 'EndWord': 'o0...","[{'StartWord': 'o010010060031', 'EndWord': 'o0...","[{'StartWord': 'o010010090031', 'EndWord': 'o0...","[{'StartWord': 'o010010110031', 'EndWord': 'o0...","[{'StartWord': 'o010010140031', 'EndWord': 'o0...","[{'StartWord': 'o010010200031', 'EndWord': 'o0...","[{'StartWord': 'o010010220051', 'EndWord': 'o0...","[{'StartWord': 'o010010240031', 'EndWord': 'o0...","[{'StartWord': 'o010010260031', 'EndWord': 'o0...","[{'StartWord': 'o010010280071', 'EndWord': 'o0...",...,"[{'StartWord': 'n66021003009', 'EndWord': 'n66...","[{'StartWord': 'n66021005008', 'EndWord': 'n66...","[{'StartWord': 'n66021009025', 'EndWord': 'n66...","[{'StartWord': 'n66022006004', 'EndWord': 'n66...","[{'StartWord': 'n66022007002', 'EndWord': 'n66...","[{'StartWord': 'n66022009004', 'EndWord': 'n66...","[{'StartWord': 'n66022012001', 'EndWord': 'n66...","[{'StartWord': 'n66022017008', 'EndWord': 'n66...","[{'StartWord': 'n66022017013', 'EndWord': 'n66...","[{'StartWord': 'n66022020005', 'EndWord': 'n66..."


In [155]:
character_data.head()

Unnamed: 0,CharacterId,MaxSpeakers,Gender,Age,Comment,SDBH,LouwNida,FCBHCharacter,Divinity
0,2 other disciples,2,Male,Adult,"Not Peter (Simon), Thomas, Nathaniel, James, o...",,,,
1,250 Israelite leaders,250,Male,Adult,,,,,
2,a Jew,1,Male,Adult,,,,,
3,Aaron,1,Male,Adult,,[000172001001000],[93.1],,
4,Abednego,1,Male,Adult,original Hebrew name: Azariah,,,,


In [100]:
print(speaker_data['GEN 1:3|GEN 1:3|God'].to_dict())

{'SpeakerInstance': {'StartVerse': 'GEN 1:3', 'EndVerse': 'GEN 1:3', 'Verses': ['GEN 1:3'], 'CharacterIds': ['God'], 'Alias': 'God (Yahweh)', 'QuoteType': 'Normal', 'DefaultCharacterId': 'God', 'Alternates': []}, 'Projections': [{'StartWord': 'o010010030031', 'EndWord': 'o010010030041', 'ClearSubjectReferents': ['o010010030021'], 'ClearSubjectReferentLabels': ['God'], 'Depth': 2, 'StartVerse': 'GEN 1:3', 'EndVerse': 'GEN 1:3', 'Verses': ['GEN 1:3'], 'Words': [{'Id': 'o010010030031', 'Text': 'יְהִ֣י\u200e', 'Depth': 2}, {'Id': 'o010010030041', 'Text': 'א֑וֹר\u200e', 'Depth': 2}]}]}


In [101]:
# Transpose the DataFrame
transposed_speaker_data = speaker_data.transpose()

# Reset the index
transposed_speaker_data.reset_index(inplace=True)

# Rename the columns
transposed_speaker_data.columns = ["row_id", "instance_data", "projections"]

print(transposed_speaker_data.iloc[0].to_dict())


{'row_id': 'GEN 1:3|GEN 1:3|God', 'instance_data': {'StartVerse': 'GEN 1:3', 'EndVerse': 'GEN 1:3', 'Verses': ['GEN 1:3'], 'CharacterIds': ['God'], 'Alias': 'God (Yahweh)', 'QuoteType': 'Normal', 'DefaultCharacterId': 'God', 'Alternates': []}, 'projections': [{'StartWord': 'o010010030031', 'EndWord': 'o010010030041', 'ClearSubjectReferents': ['o010010030021'], 'ClearSubjectReferentLabels': ['God'], 'Depth': 2, 'StartVerse': 'GEN 1:3', 'EndVerse': 'GEN 1:3', 'Verses': ['GEN 1:3'], 'Words': [{'Id': 'o010010030031', 'Text': 'יְהִ֣י\u200e', 'Depth': 2}, {'Id': 'o010010030041', 'Text': 'א֑וֹר\u200e', 'Depth': 2}]}]}


In [102]:
# Normalize the 'instance_data' column
flattened_instance_data = pd.json_normalize(transposed_speaker_data['instance_data'])

# Merge the normalized DataFrame with the original transposed DataFrame
merged_speaker_data = pd.concat([transposed_speaker_data.drop(columns=['instance_data']), flattened_instance_data], axis=1)

print(merged_speaker_data.iloc[0].to_dict())

{'row_id': 'GEN 1:3|GEN 1:3|God', 'projections': [{'StartWord': 'o010010030031', 'EndWord': 'o010010030041', 'ClearSubjectReferents': ['o010010030021'], 'ClearSubjectReferentLabels': ['God'], 'Depth': 2, 'StartVerse': 'GEN 1:3', 'EndVerse': 'GEN 1:3', 'Verses': ['GEN 1:3'], 'Words': [{'Id': 'o010010030031', 'Text': 'יְהִ֣י\u200e', 'Depth': 2}, {'Id': 'o010010030041', 'Text': 'א֑וֹר\u200e', 'Depth': 2}]}], 'StartVerse': 'GEN 1:3', 'EndVerse': 'GEN 1:3', 'Verses': ['GEN 1:3'], 'CharacterIds': ['God'], 'Alias': 'God (Yahweh)', 'QuoteType': 'Normal', 'DefaultCharacterId': 'God', 'Alternates': [], 'Delivery': nan}


### Turn speaker quotation data into one row per projection

In [103]:
# Create an empty DataFrame to store the result
expanded_speaker_data = pd.DataFrame()

# Iterate through the rows in the merged_speaker_data DataFrame
for idx, row in merged_speaker_data.iterrows():
    projections = row['projections']
    
    # Iterate through the projections
    for proj_idx, projection in enumerate(projections):
        # Create a new row with the inherited speaker instance data
        new_row = row.drop('projections').to_dict()
        new_row.update(projection)
        
        # Set the row ID with the projection index
        new_row['row_id'] = f"{row['row_id']}|{proj_idx}"
        
        # Append the new row to the expanded_speaker_data DataFrame
        expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)

  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  expanded_speaker_data = expanded_speaker_data.append(new_row, ignore_index=True)
  ex

### Add 'tokens' and 'token_ids' columns to speaker quotation data

In [104]:
# Add 'tokens' and 'token_ids' columns with default empty lists
expanded_speaker_data = expanded_speaker_data.assign(tokens=[[]]*len(expanded_speaker_data), token_ids=[[]]*len(expanded_speaker_data))

# Iterate through the rows in the expanded_speaker_data DataFrame
for idx, row in expanded_speaker_data.iterrows():
    words = row['Words']
    
    # Extract the 'Text' and 'Id' values from the 'Words' data
    tokens = [word['Text'] for word in words]
    token_ids = [word['Id'] for word in words]
    
    # Assign the 'tokens' and 'token_ids' fields to the row
    expanded_speaker_data.at[idx, 'tokens'] = tokens
    expanded_speaker_data.at[idx, 'token_ids'] = token_ids


In [105]:
print(expanded_speaker_data.iloc[0].to_dict())

{'row_id': 'GEN 1:3|GEN 1:3|God|0', 'StartVerse': 'GEN 1:3', 'EndVerse': 'GEN 1:3', 'Verses': ['GEN 1:3'], 'CharacterIds': ['God'], 'Alias': 'God (Yahweh)', 'QuoteType': 'Normal', 'DefaultCharacterId': 'God', 'Alternates': [], 'Delivery': nan, 'StartWord': 'o010010030031', 'EndWord': 'o010010030041', 'ClearSubjectReferents': ['o010010030021'], 'ClearSubjectReferentLabels': ['God'], 'Depth': 2, 'Words': [{'Id': 'o010010030031', 'Text': 'יְהִ֣י\u200e', 'Depth': 2}, {'Id': 'o010010030041', 'Text': 'א֑וֹר\u200e', 'Depth': 2}], 'tokens': ['יְהִ֣י\u200e', 'א֑וֹר\u200e'], 'token_ids': ['o010010030031', 'o010010030041']}


# Prompt expansion

In [106]:
# Need token ids for MAT 3:15
# Need token data for the token with with a gloss like 'him'
# Need to get the @Referent for that token
# Need to get the token data for the token that matches @Referent
# Need to get social situation data for this passage
# Need to get description of the social situation based on the token

## Define prosaic annotation category and value glosses

In [107]:
attribute_descriptions = {
    "after": "Encodes the following character, including a blank space.",
    "articular": "'true' if the word has an article (i.e., modified by the word 'the').",
    "case": "Grammatical case: nominative, genitive, dative, accusative, or vocative",
    "class": "On words, the class is the word's part of speech",
    "cltype": "Explicitly marks Verbless Clauses, Verb Elided Clauses, and Minor Clauses",
    "degree": "A derivative lexical category, indicating the degree of the adjective",
    "discontinuous": "'true' if the word is discontinuous with respect to sentence order due to reordering in the syntax tree",
    "domain": "Semantic domain information from the Semantic Dictionary of Biblical Greek (SDBG)",
    "frame": "Frames of verbs, refers to the arguments of the verb",
    "gender": "Grammatical gender values",
    "gloss": "SIL data, not Berean",
    "lemma": "Form of the word as it appears in a dictionary.",
    "ln": "The semantic domain entry in Louw and Nida's, 'Greek-English Lexicon of the New Testament: Based on Semantic Domains'.",
    "mood": "Grammatical mood",
    "morph": "Morphological parsing codes",
    "normalized": "The normalized form of the token (i.e., no trailing or leading punctuation or accent shifting depending on context)",
    "number": "Grammatical number",
    "person": "Grammatical person",
    "ref": "Verse!word reference to this edition of the Nestle1904 text by USFM id",
    "referent": "The xml:id of the node to which a pronoun (i.e., 'he') refers. Note that some of these IDs are not word IDs but rather phrase or clause IDs.",
    "role": "The clause-level role of the word.",
    "strong": "Strong's number for the lemma",
    "subjref": "The xml:id of the node that is the implied subject of a verb (for verbs without an explicit subject). Note that some of these IDs are not word IDs but rather phrase or clause IDs.",
    "tense": "Grammatical tense form",
    "text": "Text content associated with the ID",
    "type": "Indicates different types of pronominals",
    "voice": "Grammatical voice",
    "xml:id": "XML ids occur on every word and encode the corpus ('n' for New Testament), the book (40 for Matthew), the chapter (001), verse (001), and word (001)."
}

discourse_types = {
    'Main clauses': {'description': 'Main clauses are the top-level clauses in a sentence. They are the clauses that are not embedded in other clauses.'},
    'Historical Perfect': {'description': 'Highlights not the speech or act to which it refers but the event(s) that follow (DFNTG §12.2).'},
    'Specific Circumstance': {'description': 'The function of ἐγενετο ‘it came about’ and an immediately following temporal expression varies with the author (see DFNTG §10.3). In Matthew’s Gospel, it usually marks major divisions in the book (e.g. Mt 7:28). In Luke-Acts, in contrast, ‘it picks out from the general background the specific circumstance for the foreground events that are to follow’ (ibid.), as in Acts 9:37 (see also Mt 9:10).'},
    'Verb Focus+': {'description': 'Verb in final position in clause demonstrates verb focus.'},
    'Articular Pronoun': {'description': 'Articular pronoun, which often introduces an ‘intermediate step’ in a reported conversation.'},
    'Topical Genitive': {'description': 'A genitival constituent that is nominal is preposed within the noun phrase for two purposes: 1) to bring it into focus; 2) within a point of departure, to indicate that it is the genitive in particular which relates to a corresponding constituent of the context.(DFNTG §4.5)'},
    'Embedded DFE': {'description': "'Dominant focal elements' embedded within a constituent in P1."},
    'Reported Speech': {'description': 'Reported speech.'},
    'Ambiguous': {'description': 'Marked but ambiguous constituent order.'},
    'Over-encoding': {'description': 'Any instance in which more encoding than the default is employed to refer to an active participant or prop. Over-encoding is used in Greek, as in other languages: to mark the beginning of a narrative unit (e.g. Mt 4:5); and to highlight the action or speech concerned (e.g. Mt 4:7).'},
    'Highlighter': {'description': 'Presentatives - Interjections such as ἰδού and ἴδε ‘look!, see!’ typically highlight what immediately follows (Narr §5.4.2, NonNarr §7.7.3).'},
    'Referential PoD': {'description': 'Pre-verbal topical subject other referential point of departure (NARR §3.1, NonNarr §4.3, DFNTG §§2.2, 2.8; as in 1 Th 1:6).'},
    'annotations': {'description': 'Inline annotations.'},
    'Left-Dislocation': {'description': 'Point of departure - A type of SENTENCE in which one of the CONSTITUENTS appears in INITIAL position and its CANONICAL position is filled by a PRONOUN or a full LEXICAL NOUN PHRASE with the same REFERENCE, e.g. John, I like him/the old chap.”'},
    'Focus+': {'description': 'Constituents placed in P2 to give them focal prominence.'},
    'Tail-Head linkage': {'description': 'Point of departure involving renewal - Tail-head linkage involves “the repetition in a subordinate clause, at the beginning (the ‘head’) of a new sentence, of at least the main verb of the previous sentence (the tail)” (Dooley & Levinsohn 2001:16).'},
    'Postposed them subject': {'description': 'When a subject is postposed to the end of its clause (following nominals or adjuncts), it is marked ThS+ (e.g. Lk 1:41 [twice]). Such postposing typically marks as salient the participant who performs the next event in chronological sequence in the story (see Levinsohn 2014).'},
    'EmbeddedRepSpeech': {'description': 'Embedded reported speech - speech that is reported within a reported speech.'},
    'Futuristic Present': {'description': 'Highlights not the speech or act to which it refers but the event(s) that follow (DFNTG §12.2).'},
    'OT quotes': {'description': 'Old Testament quotations.'},
    'Constituent Negation': {'description': 'Negative pro-forms when they are in P2 indicate that the constituent has been negated rather than the clause as a whole.'},
    'Split Focal': {'description': 'The second part of a focal constituent with only the first part in P2 (NonNarr §5.5, DFNTG §4.4).'},
    'Right-Dislocated': {'description': 'Point of departure - A type of SENTENCE in which one of the CONSTITUENTS appears in FINAL position and its CANONICAL position is filled by a PRONOUN with the same REFERENCE, e.g. ... He’s always late, that chap.'},
    'Appositive': {'description': 'Appositive'},
    'Situational PoD': {'description': 'Situational point of departure (e.g. temporal, spatial, conditional―(NARR §3.1, NonNarr §4.3, DFNTG §§2.2, 2.8; as in 1 Th 3:4).'},
    'Historical Present': {'description': 'Highlights not the speech or act to which it refers but the event(s) that follow (DFNTG §12.2).'},
    'Noun Incorporation': {'description': 'Some nominal objects that appear to be in P2 may precede their verb because they have been “incorporated” (Rosen 1989) in the verb phrase. Typically, the phrase consists of an indefinite noun and a “light verb” such as “do, give, have, make, take” (Wikipedia entry on Light Verbs).'},
    'Thematic Prominence': {'description': 'Thematic prominence - In Greek, prominence is given to active participants and props who are the current centre of attention (NARR §4.6) by omitting the article (DFNTG §§9.2.3-9.4), by adding αυτος ‘-self’ (e.g. in 1 Th 3:11), by using the proximal demonstrative οὗτος (NARR chap. 9, Appendix 1; e.g. in 3:3), and by postposing the constituent concerned (e.g. Mt 14:29). If such constituents are NOT in postion P1, they are demonstrating topical prominence.'},
    'Cataphoric Focus': {'description': 'An expression that points forward to and highlights something which ‘is about to be expressed.’'},
    'Cataphoric referent': {'description': 'The clause or sentence to which a cataphoric reference refers when NOT introduced with ὅτι or ἵνα.'},
    'DFE': {'description': 'Constituents that may be moved from their default position to the end of a proposition to give them focal prominence include verbs, pronominals and objects that follow adjuncts (NonNarr §5.3, DFNTG §3.5). Such constituents, also called ‘dominant focal elements’or DFEs (Heimedinger 1999:167).'},
    'Embedded Focus+': {'description': 'A constituent of a phrase or embedded clause preposed for focal prominence.'}
}

## Define data provisioning functions

Each of these functions should accept a token or range of tokens. They should return a list of annotations for each token or range of tokens. If the annotation is not applicable to a token or range of tokens, the function should return an empty list.

In [108]:
ENDPOINT = 'https://macula-atlas-api-qa-25c5xl4maa-uk.a.run.app/graphql/'
headers = {"Content-Type": "application/json"}

In [109]:
# Levinsohn discourse features query

discourse_features_query = """
query AnnotationFeatures($filters1: AnnotationFeatureFilter, $filters2: AnnotationFilter, $filters3: WordTokenFilter ) {
  annotationFeatures(filters: $filters1) {
    label
    uri
    instances(filters: $filters2) {
      uri
      tokens(filters: $filters3) {
        ref
        wordValue
        xmlId
        lemma {
          value
          prose
        }

      }
    }
  }
}
"""

def get_discourse_annotation_types(xmlId):
    tokenData = mg.loc[xmlId].to_dict()
    passage = tokenData['ref'].split('!')[0]
    
    variables = {
        "filters1": {
            "reference": passage,
        },
        "filters2": {
            "reference": passage,
        },
        "filters3": {
            "xmlId": xmlId,
        }
        }
    
    payload = {'query': discourse_features_query, 'variables': variables}
    
    response = requests.post(ENDPOINT, json=payload, headers=headers)
    
    response_data = json.loads(response.text) 
    annotation_features = response_data["data"]["annotationFeatures"]

    labels = [feature["label"] for feature in annotation_features]
    return labels

In [220]:
# Situation data query

SITUATIONS_ENDPOINT = 'https://gospelgenre.ryderwishart.com/api/token/'

# just fetch the raw JSON from the situations endpoint plus the ref (e.g., MAT 3:13)
# Note, you only need to submit one token to get the social situation data, if available
def get_situations_data(tokenRef):
    # validate tokenRef
    if not re.match(r'^\d?[A-Z]+ \d+:\d+$', tokenRef):
        return {'error': 'invalid ref'};
    expanded_endpoint = SITUATIONS_ENDPOINT + tokenRef
    response = requests.get(expanded_endpoint)
    print(response.text)
    return response.json()

def process_features(lookup, feature_list):
    feature_descriptions = []
    system_descriptions = set()

    for feature_name in feature_list:
        feature = next((item for item in lookup["features"] if item["name"] == feature_name), None)
        if feature:
            feature_descriptions.append(feature["description"])

            system = next((item for item in lookup["systems"] if item["name"] == feature["system"]), None)
            if system:
                system_descriptions.add(system["summary"])

    return feature_descriptions, system_descriptions

def generate_mutations(pre_text_features, via_text_features):
    mutations = []

    pre_text_features_set = set(pre_text_features)
    via_text_features_set = set(via_text_features)

    gained_features = via_text_features_set - pre_text_features_set
    lost_features = pre_text_features_set - via_text_features_set

    if gained_features:
        mutations.append("gained the following features: " + ', '.join(gained_features))

    if lost_features:
        mutations.append("lost the following features: " + ', '.join(lost_features))

    return mutations

In [221]:
# # Token features query

# passage_tokens_query = """
# query PassageByReference($filters: PassageFilter) {
#   passage(filters: $filters) {
#     usfmRef
#     textContent
#     tokens {
#       ref
#       xmlId
#       data
#     }
#   }
# }
# """

# def get_passage_token_features(startRef, endRef = None):
#     ref = startRef
#     if endRef and endRef != startRef: # FIXME: add some additional validation here...?
#         ref = startRef + '-' + endRef
    
#     variables = {
#         "filters": {
#             "reference": ref,
#         }
#     }
    
#     payload = {'query': passage_tokens_query, 'variables': variables}
    
#     response = requests.post(ENDPOINT, json=payload, headers=headers)
    
#     response_data = json.loads(response.text)
#     # print(response_data)
#     tokens = response_data["data"]["passage"][0]["tokens"]
    
#     return tokens
  
# def get_token(tokens, query_string=None):
#     print('get_tokens', tokens, query_string)
#     """
#     Accepts an array of tokens, and a query string. If the string is not supplied, return None.
    
#     Filters the tokens by checking each property of the token against the query string. Return the first match.
#     # TODO: add a way to return multiple matches?
#     """
#     if not query_string:
#         return None
    
#     for token in tokens:
#         for key in token:
#             if query_string.lower() in str(token[key]).lower():
#                 return token
#             if key == 'data':
#                 for data_key in token[key]:
#                     if query_string.lower() in str(token[key][data_key]).lower():
#                         return token
#     return None
    

In [222]:
# def get_all_annotations_by_token_and_query_string(token_ref: str, query_string: str = None):
#     """
#     Accepts a token ref, and a query string. If the string is not supplied, return the token from the passage that has the same ref as the token ref.
    
#     If the query string is supplied, return the token from the passage that has some property that matches the query string.
#     """
    
#     # Use sets to avoid storing duplicates
#     query_token_discourse_features = set()
#     query_tokens_data = set()
    
#     # Get situation data
    
#     # you only need to get the situation data once, assuming the tokens all belong to one pericope
#     query_situation_data = get_situations_data(token_ref) 
#     # FIXME: add some validation here to check the result and ensure 
#     # all the token refs are in the resulting 
#     # { "matchingSituation": 
#     #     "token_ids": [...all token refs should be in here], 
#     #     [...etc.]
#     # }
    
#     # Get token data, including the XML id for every token_ref 
#     # in the token_refs, and the discourse features
    
#     # for each token ref, get execute all the retrieval functions defined above
#     print(token_ref)
#     passage_tokens = get_passage_token_features(token_ref)
#     print(len(passage_tokens), 'tokens found in passage for ref', token_ref)
#     # for tok in passage_tokens:
#     #     xml_id = tok['xmlId']
#     #     query_tokens_data.add({xml_id: token_data})
#     #     query_token_xml_ids.add(xml_id)
#     #     discourse_features = get_discourse_annotation_types(xml_id)
#     # query_token_discourse_features.add(discourse_features)
#     print(passage_tokens)
#     if query_string:
#         passage_token_matching_query_string = get_token(passage_tokens, query_string)
#     else:
#         passage_token_matching_query_string = passage_tokens
#     return {
#         "matchingSituation": query_situation_data,
        
#     } # TODO: does it make more sense to incrementally build up a pandas DF?
    

In [223]:
# get_all_annotations_by_token_and_query_string('MAT 3:14', 'him')

In [224]:
list(character_data.columns)

['CharacterId',
 'MaxSpeakers',
 'Gender',
 'Age',
 'Comment',
 'SDBH',
 'LouwNida',
 'FCBHCharacter',
 'Divinity']

In [225]:
# Get speaker quotation data for a token
"""
speaker_data.columns =
['CharacterId',
'MaxSpeakers',
'Gender',
'Age',
'Comment',
'SDBH',
'LouwNida',
'FCBHCharacter',
'Divinity']
"""
def get_speaker_quotation_data(token_ref: str):
    """
    Accepts a token ref, and returns the speaker quotation data (from expanded_speaker_data) for that token.
    """
    # The token id is the row Name
    token_data = mg[mg["ref"] == token_ref]
    print(token_data)
    token_id = token_data.index[0]
    print(token_ref, 'matched to', token_id)
    speaker_data_for_token = expanded_speaker_data[expanded_speaker_data["token_ids"].apply(lambda x: token_id in x)]
    # print(speaker_data_for_token)
    
    if speaker_data_for_token.empty:
        return None
    
    speaker_ids = speaker_data_for_token["CharacterIds"].iloc[0]
    print(speaker_ids)
    results = []
    for speaker_id in speaker_ids:
            
        speaker_character_data = character_data[character_data["CharacterId"] == speaker_id]
        speaker_character_data = {key: value for key, value in speaker_character_data.iloc[0].items() if not type(value) == float}
        print('speaker_character_data', speaker_character_data)
        print(speaker_character_data)
    
        result = {
            "who_is_speaking": speaker_id,
            "delivery_tone": speaker_data_for_token["Delivery"].iloc[0],
            # "contained_in_speech_by": # TODO: somehow I would like to note that the Baptist's speech is contained in the Narrator's speech
            "what_is_said_truncated": ' '.join(speaker_data_for_token["tokens"].iloc[0][:10]) + '...',
            "what_is_said_complete": ' '.join(speaker_data_for_token["tokens"].iloc[0]),
        }
        for character_item in speaker_character_data:
            result[character_item] = speaker_character_data[character_item]
            
        results.append(result)
        
    return results
    
# test
get_speaker_quotation_data('MAT 3:14!6')

                     ref role class      type gloss text after lemma  \
xml:id                                                                 
n40003014006  MAT 3:14!6    s  pron  personal     I  Ἐγὼ         ἐγώ   

             normalized  strong  ...    ln    frame  subjref      referent  \
xml:id                           ...                                         
n40003014006        Ἐγώ    1473  ...  92.1  missing  missing  n40003013013   

             book chapter verse book_chapter book_chapter_verse domain_label  
xml:id                                                                        
n40003014006  MAT       3    14        MAT 3           MAT 3:14    [Speaker]  

[1 rows x 30 columns]
MAT 3:14!6 matched to n40003014006
['John the Baptist']
speaker_character_data {'CharacterId': 'John the Baptist', 'MaxSpeakers': 1, 'Gender': 'Male', 'LouwNida': ['93.190a', '93.190b', '93.190e', '93.190f', '93.190d', '93.190c']}
{'CharacterId': 'John the Baptist', 'MaxSpeakers': 1, 'Ge

[{'who_is_speaking': 'John the Baptist',
  'delivery_tone': 'humble',
  'what_is_said_truncated': 'ἐγὼ χρείαν ἔχω ὑπὸ σοῦ βαπτισθῆναι καὶ σὺ ἔρχῃ πρός...',
  'what_is_said_complete': 'ἐγὼ χρείαν ἔχω ὑπὸ σοῦ βαπτισθῆναι καὶ σὺ ἔρχῃ πρός με',
  'CharacterId': 'John the Baptist',
  'MaxSpeakers': 1,
  'Gender': 'Male',
  'LouwNida': ['93.190a',
   '93.190b',
   '93.190e',
   '93.190f',
   '93.190d',
   '93.190c']}]

In [226]:
mg.head()

Unnamed: 0_level_0,ref,role,class,type,gloss,text,after,lemma,normalized,strong,...,ln,frame,subjref,referent,book,chapter,verse,book_chapter,book_chapter_verse,domain_label
xml:id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n40001001001,MAT 1:1!1,missing,noun,common,[The] book,Βίβλος,,βίβλος,Βίβλος,976,...,33.38,missing,missing,missing,MAT,1,1,MAT 1,MAT 1:1,[Written Language]
n40001001002,MAT 1:1!2,missing,noun,common,of [the] genealogy,γενέσεως,,γένεσις,γενέσεως,1078,...,10.24 33.19,missing,missing,missing,MAT,1,1,MAT 1,MAT 1:1,[Kinship Relations Involving Successive Genera...
n40001001003,MAT 1:1!3,missing,noun,proper,of Jesus,Ἰησοῦ,,Ἰησοῦς,Ἰησοῦ,2424,...,93.169a,missing,missing,missing,MAT,1,1,MAT 1,MAT 1:1,[Persons]
n40001001004,MAT 1:1!4,missing,noun,proper,Christ,Χριστοῦ,,Χριστός,Χριστοῦ,5547,...,93.387,missing,missing,missing,MAT,1,1,MAT 1,MAT 1:1,[Persons]
n40001001005,MAT 1:1!5,missing,noun,common,son,υἱοῦ,,υἱός,υἱοῦ,5207,...,10.30,missing,missing,missing,MAT,1,1,MAT 1,MAT 1:1,[Kinship Relations Involving Successive Genera...


## Define function to synthesize all available annotations into prose

This function is at the heart of this project. It should accept a token or range of tokens. It should return a string that synthesizes all available annotations for the token or range of tokens. 

If no annotations are available, it should return a generic explanation of the fact that the user should try asking more precisely about a word or phrase, with a verse reference if a specific context is in mind. 

If no passage refs are applicable, then return information about the lemma, lemmas, cultural/encyclopedic, or grammatical phenomena in question.

If an error occurs, return a message to the user that an error occurred and that they should try again with a different query.

In [227]:
for i in get_situations_data('MAT 3:14')['matchingSituation'].items():
    print(i)

{"matchingSituation":{"situation":"01-06","title":["The Baptism of Jesus"],"preTextFeatures":["non-institutional-or-neutralized","dialogical","opposing","practical-ie-outwardly-oriented","addressee-more-active","spoken","phonic","neutral","distant","instructing","ancillary","interpersonal"],"viaTextFeatures":["non-institutional-or-neutralized","asserting","multilogical","close","conceptual-ie-internally-oriented","addressee-more-passive","allying","spoken","phonic","neutral","ancillary","interpersonal"],"start":["SBLGNT.Matt.3.13.w1"],"section":["01-06"],"morphGntId":["010313"],"ref":"MAT 3:13!1 MAT 3:13!1 MAT 3:13!1 MAT 3:13!1 MAT 3:13!1 MAT 3:13!1 MAT 3:13!1 MAT 3:13!1 MAT 3:13!1 MAT 3:13!1 MAT 3:13!1 MAT 3:13!1 MAT 3:13!1 MAT 3:13!1 MAT 3:13!1 MAT 3:13!1 MAT 3:13!1 MAT 3:13!1 MAT 3:13!1","text":"Τότε παραγίνεται ὁ Ἰησοῦς ἀπὸ τῆς Γαλιλαίας ἐπὶ τὸν Ἰορδάνην πρὸς τὸν Ἰωάννην τοῦ βαπτισθῆναι ὑπ’ αὐτοῦ. ὁ δὲ Ἰωάννης διεκώλυεν αὐτὸν λέγων· Ἐγὼ χρείαν ἔχω ὑπὸ σοῦ βαπτισθῆναι, καὶ σὺ ἔρχῃ π

In [228]:
situations_lookup_json = json.loads('''
                                    {
    "parameters": [
        {
            "name": "field",
            "type": "register_parameter",
            "summary": "Field is the subject matter of a situation and concerns the nature and the structure of the activity being carried out. It involves three parameters: abstractness, activity focus, and goals.",
            "system": "field"
        },
        {
            "name": "tenor",
            "type": "register_parameter",
            "summary": "Tenor pertains to the social roles and relationships among the participants involved in a situation. It covers five parameters: value orientation predisposition, publicity, number of speaking participants, control, and social distance.",
            "system": "tenor"
        },
        {
            "name": "mode",
            "type": "register_parameter",
            "summary": "Mode is the dimension of a situation through which participants are brought into contact with each other. It involves the systems of material contact and semantic contact, focusing on four parameters: language role, process sharing, channel, and medium.",
            "system": "mode"
        }
    ],
    "systems": [
        {
            "name": "abstractness",
            "type": "system",
            "summary": "The abstractness of a situation refers to the distinction between conceptual and practical activities, with the former involving theoretical or abstract activities, while the latter involves concrete or immediate actions."
        },
        {
            "name": "activity focus",
            "type": "system",
            "summary": "The activity focus of a situation involves the domain of experience that participants are focusing on. An experiential focus refers to what is happening, an interpersonal focus refers to why it is happening, and a logical focus refers to how, when, or where it is happening. The analysis only captures the beginning and the end of the focus, even if it changes during the episode."
        },
        {
            "name": "goals",
            "type": "system",
            "summary": "The goals of a situational activity involve the motivation of actions, and can be instructing, projecting, or asserting."
        },
        {
            "name": "control",
            "type": "system",
            "summary": "Control involves social tendencies related to deference between participants based on their relative status, power, authority, or institutional roles. Situations may be hierarchic or non-hierarchic, with the former being unequal and the latter being equal. In unequal relationships, there may be numerous subjects that cannot typically be discussed, whereas equal relationships allow for a greater range of meanings to be exchanged."
        },
        {
            "name": "plurality",
            "type": "system",
            "summary": "The plurality system pertains to the number of speaking participants in a situation, and includes the parameters of monological, dialogical, and multilogical. This system recognizes that more than two participants may be engaged in dialogical activity, interacting with each other in various overlapping arrangements over the course of a situation."
        },
        {
            "name": "value-orientation-disposition",
            "type": "system",
            "summary": "Value-orientation disposition refers to the nature of relative alliance or opposition between agents in a situation, and answers the question of whether the situation is presented as if there is agreement or opposition between participants. The system distinguishes between an allying disposition that realizes agreement and an opposing disposition that realizes disagreement."
        },
        {
            "name": "social-distance",
            "type": "system",
            "summary": "Social distance refers to the level of familiarity between participants in a situation. Close participants may exchange more kinds of meanings and require less explicitness in their communication, while distant participants tend to require more explicitness and have a more restricted set of possible meaning exchanges."
        },
        {
            "name": "publicity",
            "type": "system",
            "summary": "Publicity is a dimension of tenor that refers to the presence or absence of onlookers with regard to a social act, and the various levels of engagement such onlookers might reveal. It includes disinterested, interested (neutral or biased), and private situations."
        },
        {
            "name": "language-role",
            "type": "system",
            "summary": "Language role refers to the amount of work language does in accomplishing a situation's activity, and can be constitutive or ancillary depending on whether language is the primary means of accomplishing the activity or simply assists in the unfolding of non-linguistic actions."
        },
        {
            "name": "process-sharing",
            "type": "system",
            "summary": "Process sharing refers to the degree of active participation by more than one participant in the unfolding of text, and can be active or passive depending on whether participants share in the creation of the text or engage with it more passively."
        },
        {
            "name": "channel",
            "type": "system",
            "summary": "Channel refers to the physical mechanics of the addressee's interaction with the text, and can be phonic or graphic. It is closely related to process sharing, and is decided by the nature of the social activity and of the social relation between the participants."
        },
        {
            "name": "medium",
            "type": "system",
            "summary": "Medium refers to the style or patterning of the wordings themselves, and can be spoken or written. It is a matter of style, and is related to the extemporaneousness of the language realizing a situation."
        }
    ],
    "features": [
        {
            "name": "conceptual-ie-internally-oriented",
            "type": "feature",
            "register_parameter": "field",
            "description": "Conceptual field values involve abstract or theoretical activities, such as theological or philosophical discussions.",
            "system": "field",
            "parameter": "abstractness"
        },
        {
            "name": "practical-ie-outwardly-oriented",
            "type": "feature",
            "register_parameter": "field",
            "description": "Practical field values involve activities that are concrete or focused on immediate actions, such as fishing, a healing, or a miracle.",
            "system": "field",
            "parameter": "abstractness"
        },
        {
            "name": "experiential",
            "type": "feature",
            "register_parameter": "field",
            "description": "Experiential activity focus characterizes a situation whose linguistic activity chiefly relates to the unfolding of events or happenings, where participants are involved in carrying out or observing the activity.",
            "system": "field",
            "parameter": "activity_focus"
        },
        {
            "name": "interpersonal",
            "type": "feature",
            "register_parameter": "field",
            "description": "Interpersonal activity focus pertains to the social interaction between participants, focusing on their roles, relationships, and attitudes.",
            "system": "field",
            "parameter": "activity_focus"
        },
        {
            "name": "logical",
            "type": "feature",
            "register_parameter": "field",
            "description": "Logical activity focus involves reasoning, argumentation, or explanation, where participants engage in activities that require logical thinking.",
            "system": "field",
            "parameter": "activity_focus"
        },
        {
            "name": "instructing",
            "type": "feature",
            "register_parameter": "field",
            "description": "Instructing goals are centered around teaching, explaining, or providing guidance to others.",
            "system": "field",
            "parameter": "goals"
        },
        {
            "name": "projecting",
            "type": "feature",
            "register_parameter": "field",
            "description": "Projecting goals involve making predictions, prophesying, or discussing future events or possibilities.",
            "system": "field",
            "parameter": "goals"
        },
        {
            "name": "asserting",
            "type": "feature",
            "register_parameter": "field",
            "description": "Asserting goals involve stating or affirming beliefs, claims, or opinions, often in a declarative manner.",
            "system": "field",
            "parameter": "goals"
        },
        {
            "name": "allying",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Allying value orientation predisposition refers to participants who share the same views or are supportive of each other's positions.",
            "system": "tenor",
            "parameter": "value_orientation_predisposition"
        },
        {
            "name": "opposing",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Opposing value orientation predisposition refers to participants who hold different views or are antagonistic toward each other's positions.",
            "system": "tenor",
            "parameter": "value_orientation_predisposition"
        },
        {
            "name": "disinterested",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Disinterested publicity refers to a neutral stance where participants are not personally invested in the outcome or do not take sides.",
            "system": "tenor",
            "parameter": "publicity"
        },
        {
            "name": "neutral",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Neutral publicity refers to a situation where participants neither support nor oppose a particular stance or outcome.",
            "system": "tenor",
            "parameter": "publicity"
        },
        {
            "name": "on-someones-side",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "On-someones-side publicity refers to a situation where participants actively support a particular stance or outcome.",
            "system": "tenor",
            "parameter": "publicity"
        },
        {
            "name": "private",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Private publicity refers to a situation where participants actively oppose a particular stance or outcome.",
            "system": "tenor",
            "parameter": "publicity"
        },
        {
            "name": "monological",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Monological refers to situations with only one speaking participant, such as a monologue or a soliloquy.",
            "system": "tenor",
            "parameter": "number_of_speaking_participants"
        },
        {
            "name": "dialogical",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Dialogical refers to situations with two speaking participants, such as a dialogue or conversation.",
            "system": "tenor",
            "parameter": "number_of_speaking_participants"
        },
        {
            "name": "multilogical",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Multilogical refers to situations with three or more speaking participants, such as group discussions or debates.",
            "system": "tenor",
            "parameter": "number_of_speaking_participants"
        },
        {
            "name": "institutional",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Institutional control refers to situations where one participant or a group of participants hold authority or power over others.",
            "system": "tenor",
            "parameter": "control"
        },
        {
            "name": "non-institutional-or-neutralized",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Non-institutional or neutralized control refers to situations where no specific participant or group holds authority or power over others.",
            "system": "tenor",
            "parameter": "control"
        },
        {
            "name": "unclear",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Unclear control refers to situations where it is not evident who holds authority or power.",
            "system": "tenor",
            "parameter": "control"
        },
        {
            "name": "equalized",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Equalized control refers to situations where all participants have an equal share of authority or power.",
            "system": "tenor",
            "parameter": "control"
        },
        {
            "name": "close",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Close social distance refers to situations where participants have a close relationship or are familiar with each other.",
            "system": "tenor",
            "parameter": "social_distance"
        },
        {
            "name": "distant",
            "type": "feature",
            "register_parameter": "tenor",
            "description": "Distant social distance refers to situations where participants have a distant relationship or are not familiar with each other.",
            "system": "tenor",
            "parameter": "social_distance"
        },
        {
            "name": "constitutive",
            "type": "feature",
            "register_parameter": "mode",
            "description": "Constitutive language role refers to situations where language is the primary means of carrying out the activity or achieving the goal.",
            "system": "mode",
            "parameter": "language_role"
        },
        {
            "name": "ancillary",
            "type": "feature",
            "register_parameter": "mode",
            "description": "Ancillary language role refers to situations where language plays a secondary or supporting role in carrying out the activity or achieving the goal.",
            "system": "mode",
            "parameter": "language_role"
        },
        {
            "name": "addressee-more-active",
            "type": "feature",
            "register_parameter": "mode",
            "description": "Addressee-more-active process sharing refers to situations where the recipient of the message is more actively involved in the communication process, such as asking questions or providing feedback.",
            "system": "mode",
            "parameter": "process_sharing"
        },
        {
            "name": "addressee-more-passive",
            "type": "feature",
            "register_parameter": "mode",
            "description": "Addressee-more-passive process sharing refers to situations where the recipient of the message is less actively involved in the communication process, such as listening or reading without providing feedback.",
            "system": "mode",
            "parameter": "process_sharing"
        },
        {
            "name": "phonic",
            "type": "feature",
            "register_parameter": "mode",
            "description": "Phonic channel refers to communication through sound, such as spoken language or music.",
            "system": "mode",
            "parameter": "channel"
        },
        {
            "name": "graphic",
            "type": "feature",
            "register_parameter": "mode",
            "description": "Graphic channel refers to communication through visual means, such as written language, images, or symbols.",
            "system": "mode",
            "parameter": "channel"
        },
        {
            "name": "spoken",
            "type": "feature",
            "register_parameter": "mode",
            "description": "Spoken medium refers to communication that takes place through speech, either in face-to-face conversations or through audio recordings.",
            "system": "mode",
            "parameter": "medium"
        },
        {
            "name": "written",
            "type": "feature",
            "register_parameter": "mode",
            "description": "Written medium refers to communication that takes place through text, either in print or digital formats.",
            "system": "mode",
            "parameter": "medium"
        }
    ]
}
''')

In [229]:

def generate_prosaic_context(word_id, selected_fields=None):
    word_data = mg.loc[word_id].to_dict()
    # Get annotations using combined annotations function
    lemma = word_data['lemma']
    word_ref = word_data['ref']
    print(word_ref)

    if not selected_fields:
        selected_fields = list(attribute_descriptions.keys())

    context_data = {
        "1. Lexical features": [],
        "2. Syntactic context and function": [],
        "3. Discourse context": [],
        "4. Social context": [],  # To be implemented
        "5. Cultural/encyclopedic knowledge": []
    }

    for key in selected_fields:
        value = word_data.get(key)
        if value not in (None, 'missing', 'nan'):
            if key == "class":
                context_data["1. Lexical features"].append(f"- {key}: {lemma} is a {value},")
            elif key == "gloss":
                context_data["1. Lexical features"].append(f"- {key}: meaning \"{value}.\"")
            elif key == "lemma":
                context_data["1. Lexical features"].append(f"- {key}: The lemma form of this word is {value},")
            elif key == "morph":
                context_data["1. Lexical features"].append(f"- {key}: and it is parsed as a {value}") # TODO: expand morphological parse codes into prose - although, is this necessary given the other data points?
            elif key == "strong":
                context_data["1. Lexical features"].append(f"- {key}: with a Strong's number of {value}.")
            elif key in ("person", "number", "gender", "case", "tense", "voice", "mood", "degree", "type"):
                context_data["2. Syntactic context and function"].append(f"- {key}: {attribute_descriptions[key]}: {value},")
            elif key in ("ln"):
                context_data["5. Cultural/encyclopedic knowledge"].append(f"- {key}: {attribute_descriptions[key]}: {value},")
            elif key in ("domain_label"):
                context_data["5. Cultural/encyclopedic knowledge"].append(f"- {key}: {attribute_descriptions[key]}: {domain_labels[value]},")

    # discourse_features = get_discourse_annotation_types(word_id)
    # if discourse_features:
    #     context_data["3. Discourse context"].append(f"This word functions within {len(discourse_features)} discourse features:")
    #     for feature in discourse_features:
    #         context_data["3. Discourse context"].append(f"- {feature} is defined as {discourse_types[feature]['description']}")

    speaker_information = get_speaker_quotation_data(word_ref)
    """
    Speaker information is an array of speakers, like this:
    [{'who_is_speaking': 'John the Baptist',
        'delivery_tone': 'humble',
        'what_is_said_truncated': 'ἐγὼ χρείαν ἔχω ὑπὸ σοῦ βαπτισθῆναι καὶ σὺ ἔρχῃ πρός...',
        'what_is_said_complete': 'ἐγὼ χρείαν ἔχω ὑπὸ σοῦ βαπτισθῆναι καὶ σὺ ἔρχῃ πρός με',
        'CharacterId': 'John the Baptist',
        'MaxSpeakers': 1, # this represents the number of speakers who are speaking at the same time (up to n)
        'Gender': 'Male', # this value, if present, can inform the pronouns used in the prosaic description below
        'LouwNida': ['93.190a',
        '93.190b',
        '93.190e',
        '93.190f',
        '93.190d',
        '93.190c']}]
        
    With possible additional values for the speaker, like this:
    'Age', # this value, if present, can be appended to the parenthetical content after the speaker's name (e.g. "John the Baptist ({Age} old)"
    'Comment', # this value, such as an alternate name for the speaker, can be appended to the parenthetical content after the speaker's name (e.g. "John the Baptist ({Age} old, {Comment})"
    'SDBH', # not used, available in macula greek database, but potentially useful for profiling the subject matter of the speech
    'LouwNida', # not used, available in macula greek database, but potentially useful for profiling the subject matter of the speech
    'FCBHCharacter', # not used, alternate id
    'Divinity' # will be 'Y' if present, otherwise not present. This value can be used to determine whether to use "he" or "He" in the prosaic description below, with "He" used for divinities.
    """
    if speaker_information:
        if len(speaker_information) == 1:
            speaker = speaker_information[0]
            context_data["3. Discourse context"].append(f"This word is spoken by {speaker['who_is_speaking']}")
            if speaker.get("Divinity") == "Y":
                context_data["3. Discourse context"].append(f"- {speaker['who_is_speaking']} is a divinity")
            if speaker.get("Age"):
                context_data["3. Discourse context"].append(f", age: {speaker['Age']}")
            if speaker.get("Comment"):
                context_data["3. Discourse context"].append(f" ({speaker['Comment']})")
            speech = speaker.get("what_is_said_complete")
            if len(speech) > 100:
                speech = speaker.get("what_is_said_truncated")
            context_data["3. Discourse context"].append(f", who says (in a {speaker['delivery_tone']} tone), \"{speech}\"")
                
        else:
            context_data["3. Discourse context"].append(f"This word is spoken by {len(speaker_information)} speaker(s): ")
            for speaker in speaker_information:
                context_data["3. Discourse context"].append(f"This word is spoken by {speaker['who_is_speaking']}")
                if speaker.get("Divinity") == "Y":
                    context_data["3. Discourse context"].append(f"- {speaker['who_is_speaking']} is a divinity")
                if speaker.get("Age"):
                    context_data["3. Discourse context"].append(f", {speaker['Age']} years old")
                if speaker.get("Comment"):
                    context_data["3. Discourse context"].append(f" ({speaker['Comment']})")
                speech = speaker.get("what_is_said_complete")
                if len(speech) > 100:
                    speech = speaker.get("what_is_said_truncated")
                context_data["3. Discourse context"].append(f", who says (in a {speaker['Delivery']} tone), \"{speech}\"")
    
    print('>>>>>>>>', word_ref)
    lookup = situations_lookup_json
    situation_data = get_situations_data(word_ref.split('!')[0])['matchingSituation']

    if situation_data:
        pre_text_features = situation_data['preTextFeatures']
        via_text_features = situation_data['viaTextFeatures']

        pre_text_feature_descriptions, pre_text_system_descriptions = process_features(lookup, pre_text_features)
        via_text_feature_descriptions, via_text_system_descriptions = process_features(lookup, via_text_features)

        mutations = generate_mutations(pre_text_features, via_text_features)

        context_data["4. Social context"].append(f"This word is part of the passage '{situation_data['title'][0]}'")

        context_data["4. Social context"].append(f"Which begins as a {' '.join(pre_text_features)} situation")
        context_data["4. Social context"].extend(pre_text_feature_descriptions)
        context_data["4. Social context"].extend(pre_text_system_descriptions)

        context_data["4. Social context"].append(f"And ends as a {' '.join(via_text_features)} situation")
        context_data["4. Social context"].extend(via_text_feature_descriptions)
        context_data["4. Social context"].extend(via_text_system_descriptions)

        if mutations:
            context_data["4. Social context"].append("During the passage, the situation:")
            context_data["4. Social context"].extend(mutations)
    # situation_data = get_situations_data(word_ref.split('!')[0])['matchingSituation']
    # if situation_data:
    #     """
    #     Situation data keys:
    #     ['situation', # situation id, matches morph_gnt but with additional [a-z] suffixes for split situations
    #     'title', # title of the situation
    #     'preTextFeatures', 
    #     'viaTextFeatures', 
        
    #     # id and morph gnt-derived ids in the situation
    #     ('situation', '01-06')
    #     ('section', ['01-06'])
        
    #     # title of the passage
    #     ('title', ['The Baptism of Jesus'])
        
    #     # features of the situation at the outset of the text
    #     ('preTextFeatures', ['non-institutional-or-neutralized', 'dialogical', 'opposing', 'practical-ie-outwardly-oriented', 'addressee-more-active', 'spoken', 'phonic', 'neutral', 'distant', 'instructing', 'ancillary', 'interpersonal'])
        
    #     # features of the situation at the conclusion of the text
    #     ('viaTextFeatures', ['non-institutional-or-neutralized', 'asserting', 'multilogical', 'close', 'conceptual-ie-internally-oriented', 'addressee-more-passive', 'allying', 'spoken', 'phonic', 'neutral', 'ancillary', 'interpersonal'])
        
    #     # first id of the situation
    #     ('start', ['SBLGNT.Matt.3.13.w1'])
        
    #     # first token id in morph gnt
    #     ('morphGntId', ['010313'])
        
    #     # first USFM word id of situation.. repeated n number of times for some reason
    #     ('ref', 'MAT 3:13!1 MAT 3:13!1 ... MAT 3:13!1') # string.. perhaps split by whitespace and join the first two values?
    #     ('text', 'Τότε παραγίνεται ὁ Ἰησοῦς ... ἐν ᾧ εὐδόκησα.') # string
    #     ('token_ids', 'n40003013001 n40003013002 ... n40003017017') # string
    #     ('token_refs', ['MAT 3:13!16', 'MAT 3:16!2', ... 'MAT 3:16!13']) # list
    #     """
    #     context_data["4. Social context"].append(f"This word is part of the passage '{situation_data['title'][0]}'")
        
    #     context_data["4. Social context"].append(f"Which begins as a {' '.join(situation_data['preTextFeatures'])} situation")
        
    #     context_data["4. Social context"].append(f"And ends as a {' '.join(situation_data['viaTextFeatures'])} situation")
    output_lines = []
    for header, sentences in context_data.items():
        if sentences:
            output_lines.append(f"## {header}\n")
            output_lines.append("\n".join(sentences))
            output_lines.append("\n")

    prosaic_context = "".join(output_lines)
    print(prosaic_context)
    return prosaic_context


In [230]:
generate_prosaic_context('n40003014006')

MAT 3:14!6
                     ref role class      type gloss text after lemma  \
xml:id                                                                 
n40003014006  MAT 3:14!6    s  pron  personal     I  Ἐγὼ         ἐγώ   

             normalized  strong  ...    ln    frame  subjref      referent  \
xml:id                           ...                                         
n40003014006        Ἐγώ    1473  ...  92.1  missing  missing  n40003013013   

             book chapter verse book_chapter book_chapter_verse domain_label  
xml:id                                                                        
n40003014006  MAT       3    14        MAT 3           MAT 3:14    [Speaker]  

[1 rows x 30 columns]
MAT 3:14!6 matched to n40003014006
['John the Baptist']
speaker_character_data {'CharacterId': 'John the Baptist', 'MaxSpeakers': 1, 'Gender': 'Male', 'LouwNida': ['93.190a', '93.190b', '93.190e', '93.190f', '93.190d', '93.190c']}
{'CharacterId': 'John the Baptist', 'MaxSpeake

'## 1. Lexical features\n- class: ἐγώ is a pron,\n- gloss: meaning "I."\n- lemma: The lemma form of this word is ἐγώ,\n- morph: and it is parsed as a P-1NS\n- strong: with a Strong\'s number of 1473.\n## 2. Syntactic context and function\n- case: Grammatical case: nominative, genitive, dative, accusative, or vocative: nominative,\n- number: Grammatical number: singular,\n- type: Indicates different types of pronominals: personal,\n## 3. Discourse context\nThis word is spoken by John the Baptist\n, who says (in a humble tone), "ἐγὼ χρείαν ἔχω ὑπὸ σοῦ βαπτισθῆναι καὶ σὺ ἔρχῃ πρός με"\n## 4. Social context\nThis word is part of the passage \'The Baptism of Jesus\'\nWhich begins as a non-institutional-or-neutralized dialogical opposing practical-ie-outwardly-oriented addressee-more-active spoken phonic neutral distant instructing ancillary interpersonal situation\nNon-institutional or neutralized control refers to situations where no specific participant or group holds authority or power o

# Question answering pipeline

Example questions:

In MAT 3:15, it says, "Jesus said to him", but who is the "him" (αὐτῷ) referring to?
What is the social context of this passage?

## Build Gradio UI for question answering pipeline