In [39]:
import os
import json
from typing import List, Tuple
import getpass
import openai
import time

In [40]:
try:
    openai_key = getpass.getpass()
    openai.api_key = openai_key
except Exception as error:
    print('ERROR', error)

In [41]:
def read_files(directory: str) -> dict:
    """
    Reads all files in the specified directory and returns a dictionary
    where the keys are filenames and values are file contents.
    """
    files_content = {}
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                files_content[filename] = file.read()
    return files_content

In [42]:
def chunk_text(text: str) -> List[str]:
    """
    Splits text into paragraphs based on empty lines.
    """
    return [paragraph for paragraph in text.split('\n\n') if paragraph]

In [43]:
def extract_references(chunk: str) -> List[Tuple[str, str]]:
    prompt = f'''
    here is a paragraph from a commentary:

    {chunk}

    This paragraph may contain a number of references on a number of biblical texts. 
    If there are references to biblical events, characters, etc., please create tuples that show `(verseOrPassageReference: str (use full book name, like 'Genesis', not 'Gen'), relevantCommentaryText: str (this should be a direct quote from the commentary), relevantWordingsFromBiblePassage: str (this should be the text of the verse that is referenced in the commentary))`
    If no references to biblical texts are found, simply say "No references found.".
    '''

    messages = [
        {"role": "system", "content": f"You are LangAlignerGPT. Analyze the user-suppliedcommentary chunks below and follow any instructions the user gives."},
        {"role": "user", "content": prompt},
    ]
    
    print(prompt)

    retries = 3
    for _ in range(retries):
        try:
            response = openai.Completion.create(
                model="gpt-3.5-turbo-instruct",
                prompt=prompt,
                temperature=0.7,
                max_tokens=500,
                n=1,
                stop=None,
                presence_penalty=0.5,
                frequency_penalty=0.5,
            )
            
            references = [
                # choice.message["content"].strip() for choice in response["choices"]
                choice["text"].strip() for choice in response["choices"]
            ]
            
            return references
        except Exception as e:
            time.sleep(1)

In [44]:
def save_annotations(filename: str, annotations: dict, output_directory: str):
    """
    Saves annotations to the output directory.
    """
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    
    output_filename = f'{filename}_annotations.jsonl'
    output_path = os.path.join(output_directory, output_filename)
    with open(output_path, 'a', encoding='utf-8') as file:
        json.dump(annotations[0], file, indent=2)

In [48]:
def annotate_text(filename: str, text: str, output_dir):
    """
    Processes the text paragraph by paragraph, extracts references,
    and returns a list of annotations.
    """
    annotations = []
    for paragraph in chunk_text(text):
        for reference in extract_references(paragraph):
            if 'no references found' in reference.lower():
                pass
            else:
                annotations.append({
                    'reference': reference[0],
                    'commentary': reference[1],
                    'verse_text': reference[2]
                })
                save_annotations(filename, annotations, output_dir)
    return annotations

In [46]:
def main():
    input_directory = 'C:/Users/natha/OneDrive/Desktop/clear-genesis/sample_input'
    output_directory = 'C:/Users/natha/OneDrive/Desktop/clear-genesis/sample_output'
    
    files_content = read_files(input_directory)
    for filename, text in files_content.items():
        annotate_text(filename, text, output_directory)


In [49]:
if __name__ == '__main__':
    main()


    here is a paragraph from a commentary:

    CHAPTER II

    This paragraph may contain a number of references on a number of biblical texts. 
    If there are references to biblical events, characters, etc., please create tuples that show `(verseOrPassageReference: str (use full book name, like 'Genesis', not 'Gen'), relevantCommentaryText: str (this should be a direct quote from the commentary), relevantWordingsFromBiblePassage: str (this should be the text of the verse that is referenced in the commentary))`
    If no references to biblical texts are found, simply say "No references found.".
    

    here is a paragraph from a commentary:

      TRUE LOVE TESTED

    This paragraph may contain a number of references on a number of biblical texts. 
    If there are references to biblical events, characters, etc., please create tuples that show `(verseOrPassageReference: str (use full book name, like 'Genesis', not 'Gen'), relevantCommentaryText: str (this should be a direct quot

KeyboardInterrupt: 