The goal is to annotate every row of the conditionals data with Macula word ids, recorded separately for the protasis and for the apodosis of that row.

Ingest Excel sheets

In [1]:
import re
import os
import pandas as pd
import openai
import getpass

In [2]:
# Lift pandas' display caps on both axes; a value of None means "no limit".
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [3]:
# Show cell contents in full instead of truncating long strings.
pd.options.display.max_colwidth = None

In [4]:
# Tab-separated table of Macula word ids with their English text and glosses
# (loaded further down with pd.read_csv(..., sep='\t')).
ids_file = 'macula_ids_with_glosses.tsv'
# Excel workbook holding the conditionals analysis; its sheets are read one by
# one further down (presumably one sheet per NT book -- confirm against the file).
excel_path = 'CanIL Analysis of NT Conditionals by book220831.xlsx'

In [5]:
# Function to merge columns with similar names
def merge_columns(df):
    """Merge columns whose names share the same first word.

    Column labels are grouped by the text before their first space. For every
    group holding two or more columns, the non-null values of each row are
    converted to strings and joined with a single space; the result is stored
    in a column named after the shared first word. All but the first original
    column of the group are then dropped.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be merged.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after merging.
    """
    # Bucket the labels by first word, preserving left-to-right column order.
    # Done on plain labels rather than groupby(axis=1), which is deprecated.
    columns_by_prefix = {}
    for column in df.columns:
        columns_by_prefix.setdefault(str(column).split(" ")[0], []).append(column)

    # Only prefixes shared by several columns need work. Visiting them in
    # sorted order keeps the position of newly appended columns deterministic.
    shared_groups = [
        (prefix, columns_by_prefix[prefix])
        for prefix in sorted(columns_by_prefix)
        if len(columns_by_prefix[prefix]) > 1
    ]

    # Combine grouped columns. Building a plain list (instead of a row-wise
    # apply) also works for a frame that has headers but no rows.
    for prefix, columns in shared_groups:
        df[prefix] = [
            ' '.join(row.dropna().astype(str))
            for _, row in df[columns].iterrows()
        ]

    # Drop the now-redundant originals. The merged column itself is never
    # dropped, even when a column with that exact name was not the first
    # member of its group (e.g. "notes extra" followed by "notes").
    for prefix, columns in shared_groups:
        redundant = [column for column in columns[1:] if column != prefix]
        df.drop(columns=redundant, inplace=True)
    return df

In [6]:

# Open the workbook once: every sheet is parsed from this single handle, and
# the underlying file is closed again when the block exits.
with pd.ExcelFile(excel_path) as xl:
    # Skip the first sheet and last sheet, as they are just introductory material
    sheet_names = xl.sheet_names[1:-1]

    # Read every remaining sheet, normalise it, and collect the frames
    df_list = []
    for sheet in sheet_names:
        df = xl.parse(sheet)
        # Lowercase the column names; str() guards against non-string headers
        # (e.g. numbers or dates), which str.lower alone would reject.
        df.columns = [str(column).lower() for column in df.columns]
        df = merge_columns(df)  # Merge similar columns

        # Prefix each reference with the sheet name + ' ' (e.g. '4:3' -> 'MAT 4:3').
        # Missing references become the text 'nan' here and are left
        # unmatched by the later verse-key extraction.
        if 'reference' in df.columns:
            df['reference'] = sheet + ' ' + df['reference'].astype(str)
        else:
            print('Sheet name', sheet, 'does not have a "reference" column.')

        df_list.append(df)

In [7]:
# Concatenate all DataFrames in df_list into one table with a fresh 0..n-1 index
concatenated_df = pd.concat(df_list, ignore_index=True)

# mark_df is the working table of conditionals. It is the same object as
# concatenated_df until the fillna() call at the end of this cell.
mark_df = concatenated_df

# Word-level table: one row per Macula word id
ids_df = pd.read_csv(ids_file, sep='\t')

# Add 'macula_ids_in_protasis' and 'macula_ids_in_apodosis' columns to mark_df.
# Each row gets its own empty list (a comprehension, so no list is shared).
mark_df['macula_ids_in_protasis'] = [[] for _ in range(len(mark_df))]
mark_df['macula_ids_in_apodosis'] = [[] for _ in range(len(mark_df))]

# Extract the verse key into a 'b_ch_v' column: e.g., MRK 1:1!1 -> MRK 1:1 and
# MAT 5:13a -> MAT 5:13. expand=False yields a Series rather than a one-column
# DataFrame; values that do not match the pattern become NaN.
ids_df['b_ch_v'] = ids_df['ref'].str.extract(r'(\w+ \d+:\d+)', expand=False)
mark_df['b_ch_v'] = mark_df['reference'].str.extract(r'(\w+ \d+:\d+)', expand=False)

# Subset of ids_df limited to verses that occur in mark_df. The comparison uses
# the extracted verse keys on both sides: 'reference' may carry a suffix
# (e.g. 'MAT 5:13a'), so comparing against it directly would miss those verses.
subset = ids_df[ids_df['b_ch_v'].isin(mark_df['b_ch_v'].dropna())]

# Replace every NaN in mark_df with an empty string (ids_df is left as read)
mark_df = mark_df.fillna('')

In [8]:
# Bookkeeping columns for the word matching. Every row starts with its own
# empty list in each of them; the order below is the order in which the
# columns are appended to mark_df.
match_list_columns = (
    'matched_english_in_protasis',
    'matched_english_in_apodosis',
    'matched_gloss_in_protasis',
    'matched_gloss_in_apodosis',
    'all_verse_word_tuples',
    'matched_protasis_words',
    'all_protasis_words',
    'matched_apodosis_words',
    'all_apodosis_words',
    'unmatched_protasis_words',
    'unmatched_apodosis_words',
)
for match_column in match_list_columns:
    mark_df[match_column] = [[] for _ in range(len(mark_df))]

# Snapshot of the table as it stands before any matching is done
backup_df = mark_df.copy()

In [9]:
# Preview the first rows of the working table (head() defaults to five rows).
mark_df.head()

Unnamed: 0,reference,scope of conditional (esv unless noted),class,inv.,probability,time orientation,illocutionary force,english translations,notes,parallel passage(s),unnamed: 10,unnamed:,parallel passages,unnamed: 9,scope of conditional (esv unless otherwise indicated),scope of conditional (esv),scope of conditional (esv unless stated otherwise),macula_ids_in_protasis,macula_ids_in_apodosis,b_ch_v,matched_english_in_protasis,matched_english_in_apodosis,matched_gloss_in_protasis,matched_gloss_in_apodosis,all_verse_word_tuples,matched_protasis_words,all_protasis_words,matched_apodosis_words,all_apodosis_words,unmatched_protasis_words,unmatched_apodosis_words
0,MAT 4:3,p: (if you are the Son of God)\nq: (command these stones to become bread),1,,Factual,Present,Exhort,"ESV, NASB, NRSV, NIV, NLT: ""if""","P presents a fact that the temptor knew to be true. The devil certainly knows Jesus' identity. Hagner (1993) notes that this might as well be translated ""since"", as the devil is testing Jesus' obedience to the Father rather than questioning his identity. Q presents the subsequent command Jesus is being tempted to obey. The conditional as a whole amounts to a taunt or a challenge. Fong (2014: 30-31) discusses this as an example given by Young (1994) of a rhetorical conditional. That is, it is not being presented as a real condition, but the conditional is used for its logical connection in order to affect some other speech act, in this case ""manipulation"", according to Young. Fong, however, thinks it would be better classified as a ""challenge.""",Luke 4:3,,,,,,,,[],[],MAT 4:3,[],[],[],[],[],[],[],[],[],[],[]
1,MAT 4:6,p: (if you are the Son of God)\nq: (throw yourself down),1,,Factual,Present,Exhort,"ESV, NASB, NRSV, NIV, NLT: ""if""","As with 4:3, this conditional expresses no doubt about Jesus' identity, but is being used to manipulate, tempt, and challenge him.",Luke 4:9,,,,,,,,[],[],MAT 4:6,[],[],[],[],[],[],[],[],[],[],[]
2,MAT 4:9,q:(All these I will give you)\np: (if you will fall down and worship me),3,x,Very Unlikely,Present,Promise / Exhort,"ESV, NASB, NRSV, NIV, NLT: ""if""","In 4:3 and 4:6 the exhortations (or temptations) are direct and occur in q; in this verse the exhortation (or temptation) is made indirectly in p. Though highly unlikely, and though it did not come to pass, this was still a (remote) possibility when the devil spoke it. It was a real option for Jesus, though he did not take it. Unlike the two previous temptations, this one is a third class, using εαν with the subjunctive because ( according to Hagner 1993) it involves an actual unmet condition.",Luke 4:7,,,,,,,,[],[],MAT 4:9,[],[],[],[],[],[],[],[],[],[],[]
3,MAT 5:13a,p: (if salt has lost its taste)\nq: (how shall its saltiness be restored?),3,,Unlikely,Gnomic,Assert,"ESV, NASB, NRSV, NIV, NLT: ""if""","This conditional must be read in its context of describing discipleship. The connection between salt and discipleship is that, as Marshall (1978) puts it, ""a false form of discipleship may look like salt, but the gradual process of leeching leaves only a zestless pile of waste."" Nolland (2005) claims that discussions of how salt of the day may have lost its flavour are pointless, as the point being made is that such a thing would be ""bizarre and unnatural."" It would be terrible for salt to lose saltiness because it is itself the thing that adds flavour to tasteless food.",Mark 9:50; Luke 14:34,,,,,,,,[],[],MAT 5:13,[],[],[],[],[],[],[],[],[],[],[]
4,MAT 5:20,p: (unless your righteousness exceeds that of the scribes and Pharisees)\nq: (you will never enter the kingdom of heaven),3,,Very Unlikely,Gnomic,Warn,"ESV, NASB, NRSV, NIV, NLT: ""unless""","Greek: εαν μη. Nolland, Hagner, and others agree that the righteousness Jesus discusses here cannot mean that he is actually commanding the disciples to live by the letter of the law as the Pharisees do (and to a greater extent, which is nigh impossible). \nParaphrase (to avoid multiple negatives in languages without 'unless' or 'except'): For I say to you that only if you are more righteous than the scribes and Pharisees, will you enter the kingdom of heaven.",,,,,,,,,[],[],MAT 5:20,[],[],[],[],[],[],[],[],[],[],[]


Now we need to try to match up strings. 
Here's an example. 

In the `Scope of conditional (ESV unless noted)` column, we have the string:

"p: (If you will) q: (you can make me clean)"
`p: \(.*?\)` matches the protasis
`q: \(.*?\)` matches the apodosis

Now, in `ids_df`, take the rows whose `b_ch_v` value matches a single row of `mark_df` (i.e., the subset of words belonging to just that one verse). For the example above, these are the rows:

xml:id	ref	english	gloss	text	b_ch_v
603	n41001040001	MRK 1:40!1	and	And	Καὶ	1:40
604	n41001040002	MRK 1:40!2	came	comes	ἔρχεται	1:40
605	n41001040003	MRK 1:40!3	to	to	πρὸς	1:40
606	n41001040004	MRK 1:40!4	him	Him	αὐτὸν	1:40
607	n41001040005	MRK 1:40!5	leper	a leper	λεπρὸς	1:40

So, here we note that there is both an 'english' and 'gloss' column.

We want to find all `ids_df` rows whose 'b_ch_v' value occurs in the 'reference' column of `mark_df`, take their 'english' and 'gloss' values, and match them against the English words in the 'scope of conditional (ESV unless noted)' column of `mark_df` (stripping parentheses and square brackets on both sides of the comparison).
Then we want to populate the columns 'macula_ids_in_protasis' and 'macula_ids_in_apodosis' with the 'xml:id' values of the matching `ids_df` rows.

In [10]:
# Word rows of the verse referenced by the first conditional in mark_df.
verse_df = ids_df.loc[ids_df['b_ch_v'].eq(mark_df['b_ch_v'].iloc[0])]
verse_df

Unnamed: 0,xml:id,ref,english,gloss,text,b_ch_v
1247,n40004003001,MAT 4:3!1,and,And,καὶ,MAT 4:3
1248,n40004003002,MAT 4:3!2,came,having come,προσελθὼν,MAT 4:3
1249,n40004003003,MAT 4:3!3,the,the,ὁ,MAT 4:3
1250,n40004003004,MAT 4:3!4,tempter,[one] tempting,πειράζων,MAT 4:3
1251,n40004003005,MAT 4:3!5,said,he said,εἶπεν,MAT 4:3
1252,n40004003006,MAT 4:3!6,him,to Him,αὐτῷ,MAT 4:3
1253,n40004003007,MAT 4:3!7,if,If,Εἰ,MAT 4:3
1254,n40004003008,MAT 4:3!8,son,Son,υἱὸς,MAT 4:3
1255,n40004003009,MAT 4:3!9,are,You are,εἶ,MAT 4:3
1256,n40004003010,MAT 4:3!10,,-,τοῦ,MAT 4:3


In [11]:
def generate_prompt(verse, conditionals_df=None, words_df=None):
    """Build the alignment prompt for one verse.

    The prompt contains the word table of the verse (as context), a fixed
    worked example, and the protasis/apodosis text of every conditional
    recorded for that verse.

    Parameters
    ----------
    verse : str
        Verse key as stored in the 'b_ch_v' columns, e.g. 'MAT 4:3'.
    conditionals_df : pandas.DataFrame, optional
        Table of conditionals with a 'b_ch_v' column and one or more columns
        whose name starts with 'scope of conditional'. Defaults to the
        notebook-level ``mark_df``.
    words_df : pandas.DataFrame, optional
        Word table with a 'b_ch_v' column. Defaults to the notebook-level
        ``ids_df``.

    Returns
    -------
    str
        The prompt text.
    """
    if conditionals_df is None:
        conditionals_df = mark_df
    if words_df is None:
        words_df = ids_df

    p_q = conditionals_df.loc[conditionals_df['b_ch_v'] == verse]
    verse_words = words_df[words_df['b_ch_v'] == verse]

    # The sheets name their scope column slightly differently
    # ('... (esv unless noted)', '... (esv)', ...), so for each conditional
    # take the first scope column that actually holds text.
    scope_columns = [
        column for column in p_q.columns
        if str(column).startswith('scope of conditional')
    ]
    scope_texts = []
    for _, conditional in p_q.iterrows():
        for column in scope_columns:
            cell = conditional[column]
            if isinstance(cell, str) and cell.strip():
                scope_texts.append(cell.strip())
                break
    scope_block = '\n'.join(scope_texts)

    # to_string() renders the whole table regardless of the pandas display
    # options, so the prompt never contains wrapped or truncated output.
    context_table = verse_words.to_string()

    prompt = f'''
    ## Instruction:
    Use the rows from the table below to associate xml:ids with the protasis and apodosis listed below to create 2 csv files. One for the protasis, and one for the apodosis:

    ## Context
    {context_table}

    Here is an example:
    Protasis: if you are the Son of God
    ```
    if, If, Εἰ, n40004003007
    son, Son, υἱὸς, n40004003008
    are, You are, εἶ, n40004003009
    ...
    ```

    Apodosis: command these stones to become bread
    ```
    command, speak, εἰπὲ, n40004003012
    to, that, ἵνα, n40004003013
    NaN, the, οἱ, n40004003014
    ...
    ```

    Now follow a similar format with this protasis and apodosis pair:
    {scope_block}


    ## Results:

    '''
    return prompt

In [12]:
# Ask for the OpenAI secret key interactively (getpass does not echo the
# input), so the key is never written into the notebook itself.
openai_pass = getpass.getpass('Enter OpenAI secret key: ')

In [13]:
# define your GPT completion function

# Hand the key entered above to the openai client.
openai.api_key = openai_pass

# Chat model requested by align().
model = 'gpt-3.5-turbo' # or 4

# Upper bound on the number of attempts align() makes for one prompt.
MAX_RETRIES = 10
def align(prompt):
    """Send one alignment prompt to the chat model and return its reply.

    Transient API failures (connection problems, server errors, rate limits,
    timeouts, service unavailable) are retried up to MAX_RETRIES times with an
    exponentially growing pause between attempts.

    Parameters
    ----------
    prompt : str
        User message, as produced by generate_prompt().

    Returns
    -------
    str or dict
        The text of the first completion choice on success. If every attempt
        fails, a dict of the form {"error": "<message>"} is returned instead,
        so callers must check the type before treating the result as text.
    """
    import time  # local import: only the retry back-off needs it

    system_prompt = "Analyze the p-q phrases and align the individual words to ids with the table the user provides"
    messages = [
        {"role": 'system', "content": system_prompt},
        {"role": 'user', 'content': prompt}
    ]
    # Error types that are worth retrying; anything else (bad request,
    # authentication failure, ...) propagates to the caller unchanged.
    retryable_errors = (
        openai.error.APIConnectionError,
        openai.error.APIError,
        openai.error.RateLimitError,
        openai.error.ServiceUnavailableError,
        openai.error.Timeout,
    )
    last_error = None
    for attempt in range(MAX_RETRIES):
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=messages,
                temperature=0.1,
            )
            return response["choices"][0].message["content"]
        except retryable_errors as e:
            last_error = e
            print('Error in alignment:', e)
            if attempt < MAX_RETRIES - 1:
                # Back off 1s, 2s, 4s, ... capped at 30s before the next try.
                time.sleep(min(2 ** attempt, 30))
    # Retries exhausted (or MAX_RETRIES <= 0): report instead of raising.
    return {"error": str(last_error)}

In [14]:
# Verse key of every mark_df row, kept in row order; peek at the first five.
refs = mark_df.loc[:, 'b_ch_v'].to_list()
refs[:5]

['MAT 4:3', 'MAT 4:6', 'MAT 4:9', 'MAT 5:13', 'MAT 5:20']

In [19]:
# Query the model once per conditional and append every reply to
# pq_macula.txt, separated by two blank lines. The file is opened once and
# flushed after each record so finished work survives an interruption.
# NOTE: append mode means re-running this cell adds a second copy of every
# record; delete or rename the file first when starting over.
with open('pq_macula.txt', 'a', encoding='utf-8') as file:
    for ref in refs:
        prompt = generate_prompt(ref)
        results = align(prompt)
        if not isinstance(results, str):
            # align() returns {"error": ...} once its retries are exhausted.
            # Write a one-line placeholder instead of crashing in write(), so
            # record N in the file still belongs to refs[N].
            detail = results.get('error') if isinstance(results, dict) else results
            results = 'ERROR: ' + ' '.join(str(detail).split())
        file.write(results)
        file.write('\n\n\n')
        file.flush()

In [23]:
# Read the saved model replies back in.
with open('pq_macula.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Records were written with a trailing '\n\n\n' separator, so splitting leaves
# one empty element at the end. Drop it so that alignments lines up
# one-to-one with the records (and therefore with refs).
alignments = text.split('\n\n\n')
if alignments and not alignments[-1].strip():
    alignments.pop()

# Show only the first few records rather than the whole file.
alignments[:3]

['Protasis:\n```\nif, If, Εἰ, n40004003007\nyou, You are, εἶ, n40004003009\nare, You are, εἶ, n40004003009\nthe, the, ὁ, n40004003003\nSon, Son, υἱὸς, n40004003008\nof, of God, θεοῦ, n40004003011\nGod, of God, θεοῦ, n40004003011\n```\n\nApodosis:\n```\ncommand, speak, εἰπὲ, n40004003012\nthese, these, οὗτοι, n40004003016\nstones, stones, λίθοι, n40004003015\nto, that, ἵνα, n40004003013\nbecome, might become, γένωνται, n40004003018\nbread, loaves of bread, ἄρτοι, n40004003017\n```',
 'Protasis:\n```\nif, If, Εἰ, n40004006004\nyou, You, σοῦ, n40004006020\nare, You are, εἶ, n40004006006\nthe, the, τὸν, n40004006030\nSon, Son, υἱὸς, n40004006005\nof, of, τοῦ, n40004006007\nGod, God, θεοῦ, n40004006008\n```\n\nApodosis:\n```\nthrow, throw, βάλε, n40004006009\nyourself, Yourself, σεαυτὸν, n40004006010\ndown, down, κάτω, n40004006011\n```',
 'Protasis:\n```\nall, All, πάντα, n40004009006\nthese, These things, Ταῦτά, n40004009004\nI, I will give, δώσω, n40004009007\nyou, to You, σοι, n40004009

In [37]:
# Result columns for the validated alignments; None marks "not processed yet".
for clean_column in ('clean_protasis', 'clean_protasis_id',
                     'clean_apodosis', 'clean_apodosis_id'):
    mark_df[clean_column] = None

In [33]:
def clean_list(values, words_df=None):
    """Keep only the alignment lines that agree with the word table.

    Each line is expected to look like ``english, gloss, greek, xml:id``. A
    line is kept when the table has a row with that xml:id whose 'english',
    'gloss' and 'text' values equal the three fields, ignoring case. A
    missing table value matches the literal text ``NaN``, which is how the
    prompt example writes a word without an English counterpart.

    Parameters
    ----------
    values : iterable of str
        Candidate lines; empty or malformed lines are skipped.
    words_df : pandas.DataFrame, optional
        Word table with 'xml:id', 'english', 'gloss' and 'text' columns.
        Defaults to the notebook-level ``verse_df``.

    Returns
    -------
    tuple of (list of str, list of str)
        The accepted lines and, in the same order, their xml:ids.
    """
    if words_df is None:
        words_df = verse_df

    def _normalized(column):
        # Lowercased text of every cell, with missing values spelled 'nan'.
        return column.map(lambda cell: 'nan' if pd.isna(cell) else str(cell).lower())

    clean_values = []
    clean_value_ids = []

    for value in values:
        parts = value.split(', ')
        if len(parts) < 4:
            # Blank lines, '...' and other malformed lines carry no alignment.
            continue
        # Take the fixed fields from both ends so that a gloss which itself
        # contains ', ' stays in one piece.
        eng = parts[0]
        grc = parts[-2]
        word_id = parts[-1]
        gloss = ', '.join(parts[1:-2])

        word_row = words_df[words_df['xml:id'] == word_id]
        agrees = (
            (_normalized(word_row['english']) == eng.lower()) &
            (_normalized(word_row['gloss']) == gloss.lower()) &
            (_normalized(word_row['text']) == grc.lower())
        )

        if agrees.any():
            clean_values.append(value)
            clean_value_ids.append(word_id)

    return clean_values, clean_value_ids

In [38]:
def custom_key(item):
    """Sort key: the last three characters of an entry, read as an integer.

    Both the kept lines and the ids end in an xml:id, so this orders them by
    the trailing three digits of that id.
    """
    return int(item[-3:])


# Walk refs and alignments in parallel. zip() stops at the shorter of the two,
# so a surplus element in alignments (such as the empty string left by the
# trailing separator) can no longer run past the end of refs.
for i, (verse, alignment) in enumerate(zip(refs, alignments)):
    # clean_list() validates against the notebook-level verse_df, so it has to
    # be pointed at the current verse before the calls below.
    verse_df = ids_df[ids_df['b_ch_v'] == verse]
    # Split the text into sections for Protasis and Apodosis
    sections = alignment.split('\n\n')

    # Create two lists for Protasis and Apodosis
    protasis_list = []
    apodosis_list = []

    # Process each section and add items to the respective lists
    for section in sections:
        if section.startswith("Protasis:"):
            target_list = protasis_list
        elif section.startswith("Apodosis:"):
            target_list = apodosis_list
        else:
            continue
        # Skip the header line, strip code fences, and keep what is left.
        # Filtering on content (rather than always discarding the last line)
        # keeps the final entry when the reply has no closing fence.
        for line in section.split('\n')[1:]:
            entry = line.replace('```', '').strip()
            if entry:
                target_list.append(entry)

    clean_protasis, clean_protasis_id = clean_list(protasis_list)
    clean_apodosis, clean_apodosis_id = clean_list(apodosis_list)

    clean_protasis = sorted(clean_protasis, key=custom_key)
    clean_protasis_id = sorted(clean_protasis_id, key=custom_key)
    clean_apodosis = sorted(clean_apodosis, key=custom_key)
    clean_apodosis_id = sorted(clean_apodosis_id, key=custom_key)

    # refs was built from mark_df in row order, so record i belongs to row i.
    # Looking the row up by verse alone would always hit the first conditional
    # of a verse and leave a second one (e.g. 5:13a / 5:13b) unfilled.
    if i < len(mark_df) and mark_df['b_ch_v'].iloc[i] == verse:
        row_index = mark_df.index[i]
    else:
        # Row order no longer matches refs: fall back to the first match.
        row_index = mark_df[mark_df['b_ch_v'] == verse].index[0]

    mark_df.at[row_index, 'clean_protasis'] = clean_protasis
    mark_df.at[row_index, 'clean_protasis_id'] = clean_protasis_id
    mark_df.at[row_index, 'clean_apodosis'] = clean_apodosis
    mark_df.at[row_index, 'clean_apodosis_id'] = clean_apodosis_id

MAT 4:3
MAT 4:6
MAT 4:9
MAT 5:13
