In [None]:
# !pip install langchain openai chromadb tiktoken

In [1]:
import os
import getpass
secret_key = getpass.getpass('Enter OpenAI secret key: ')
os.environ['OPENAI_API_KEY'] = secret_key

# Create Chroma Database for Prose Context

In [2]:
context_documents_path = '/Users/ryderwishart/genesis/itemized_prose_contexts' # NOTE: this is the directory for the itemized prose contexts
context_documents_path = '/Users/ryderwishart/genesis/prose_contexts' # NOTE: this is the directory for the full prose contexts 


In [3]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType
from pathlib import Path

# Load Language Model
llm = OpenAI(temperature=0)

## Set up embeddings model

In [4]:
persist_directory = '/Users/ryderwishart/genesis/databases/prose-contexts' # NOTE: this is the db for the itemized prose contexts
# persist_directory = '/Users/ryderwishart/genesis/databases/itemized-prose-contexts' # NOTE: this is the db for the itemized prose contexts
# persist_directory = '/Users/ryderwishart/biblical-machine-learning/gpt-inferences/db' # NOTE: this is the db for the full prose contexts 

In [5]:
# !pip install sentence_transformers > /dev/null

In [6]:
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
embeddings = HuggingFaceEmbeddings()

  from .autonotebook import tqdm as notebook_tqdm


Use following code if creating new chroma DB

In [7]:
# !pip install unstructured > /dev/null

Use the following code to create a new database from the documents in `context_documents_path`

In [8]:
# Long book names to USFM (3 uppercase letters) format
book_name_mapping = {
    "Genesis": "GEN",
    "Exodus": "EXO",
    "Leviticus": "LEV",
    "Numbers": "NUM",
    "Deuteronomy": "DEU",
    "Joshua": "JOS",
    "Judges": "JDG",
    "Ruth": "RUT",
    "1 Samuel": "1SA",
    "2 Samuel": "2SA",
    "1 Kings": "1KI",
    "2 Kings": "2KI",
    "1 Chronicles": "1CH",
    "2 Chronicles": "2CH",
    "Ezra": "EZR",
    "Nehemiah": "NEH",
    "Esther": "EST",
    "Job": "JOB",
    "Psalms": "PSA",
    "Psalm": "PSA",
    "Proverbs": "PRO",
    "Ecclesiastes": "ECC",
    "Song of Solomon": "SNG",
    "Isaiah": "ISA",
    "Jeremiah": "JER",
    "Lamentations": "LAM",
    "Ezekiel": "EZK",
    "Daniel": "DAN",
    "Hosea": "HOS",
    "Joel": "JOL",
    "Amos": "AMO",
    "Obadiah": "OBA",
    "Jonah": "JON",
    "Micah": "MIC",
    "Nahum": "NAM",
    "Habakkuk": "HAB",
    "Zephaniah": "ZEP",
    "Haggai": "HAG",
    "Zechariah": "ZEC",
    "Malachi": "MAL",
    "Matthew": "MAT",
    "Mark": "MRK",
    "Luke": "LUK",
    "John": "JHN",
    "Acts": "ACT",
    "Romans": "ROM",
    "1 Corinthians": "1CO",
    "2 Corinthians": "2CO",
    "Galatians": "GAL",
    "Ephesians": "EPH",
    "Philippians": "PHP",
    "Colossians": "COL",
    "1 Thessalonians": "1TH",
    "2 Thessalonians": "2TH",
    "1 Timothy": "1TI",
    "2 Timothy": "2TI",
    "Titus": "TIT",
    "Philemon": "PHM",
    "Hebrews": "HEB",
    "James": "JAS",
    "1 Peter": "1PE",
    "2 Peter": "2PE",
    "1 John": "1JN",
    "2 John": "2JN",
    "3 John": "3JN",
    "Jude": "JUD",
    "Revelation": "REV"
}
reverse_book_name_mapping = {v:k for k, v in book_name_mapping.items()}

In [9]:

# data_scope = docs[0].metadata['source'].split('_')[-1].split('.')[0] # example data_scope
# verse_ref = docs[0].metadata['source'].split('/')[-1].split('.')[0] # example verse ref
# book = reverse_book_name_mapping[verse_ref.split(' ')[0]]
# chapter = verse_ref.split(' ')[1].split(':')[0]
# verse = verse_ref.split(' ')[1].split(':')[1]

# example 
# docs[0].metadata['source'].split('_')[-1].split('.')[0]


In [23]:
# Create embeddings and store in a vectorstore

# root_dir = context_documents_path
root_dir = "/Users/ryderwishart/genesis/prose_contexts_shorter_itemized"
persist_directory = "/Users/ryderwishart/genesis/databases/prose-contexts-shorter-itemized"

docs = []
for dirpath, dirnames, filenames in os.walk(root_dir):
    for i, file in enumerate(filenames):
        # report number of files done of total
        print('file: ', i + 1, 'of', len(filenames), sep=" ", flush=True, end='\r')
        # if file.endswith('.md') or file.endswith('.txt') and '/.venv/' not in dirpath:
        if file.endswith('.txt'):
            try: 
                # loader = TextLoader(os.path.join(dirpath, file), encoding='utf-8')
                # loader = UnstructuredMarkdownLoader(os.path.join(dirpath, file)) #, mode="elements")
                # doc = loader.load()[0]
                loader = TextLoader(os.path.join(dirpath, file), encoding='utf-8')
                documents = loader.load_and_split()
                for doc in documents:
                    # add scope, book, chapter, verse to metadata
                    doc.metadata['data_scope'] = doc.metadata['source'].split('_')[-1].split('.')[0]
                    doc.metadata['verse_ref'] = doc.metadata['source'].split('/')[-1].split('.')[0]
                    doc.metadata['book'] = reverse_book_name_mapping[doc.metadata['verse_ref'].split(' ')[0]]
                    doc.metadata['chapter'] = doc.metadata['verse_ref'].split(' ')[1].split(':')[0]
                    doc.metadata['verse'] = doc.metadata['verse_ref'].split(' ')[1].split(':')[1]
                    docs.append(doc)
            except Exception as e: 
                print(f'Error loading {os.path.join(dirpath, file)}', e)
                pass
print(f'{len(docs)}')

20410  14436 of 14436 of 14436 137 of 14436 141 of 14436 143 of 14436 144 of 14436 145 of 14436 226 of 14436 229 of 14436 240 of 14436 250 of 14436 319 of 14436 363 of 14436 366 of 14436 368 of 14436 381 of 14436 383 of 14436 390 of 14436 392 of 14436 394 of 14436 395 of 14436 406 of 14436 416 of 14436 418 of 14436 425 of 14436 504 of 14436 515 of 14436 527 of 14436 528 of 14436 530 of 14436 542 of 14436 545 of 14436 561 of 14436 565 of 14436 576 of 14436 580 of 14436 582 of 14436 645 of 14436 683 of 14436 721 of 14436 727 of 14436 731 of 14436 741 of 14436 762 of 14436 770 of 14436 771 of 14436 775 of 14436 781 of 14436 782 of 14436 815 of 14436 830 of 14436 833 of 14436 841 of 14436 862 of 14436 873 of 14436 875 of 14436 882 of 14436 884 of 14436 893 of 14436 896 of 14436 904 of 14436 937 of 14436 939 of 14436 940 of 14436 941 of 14436 949 of 14436 992 of 14436 993 of 14436 1005 of 14436 1009 of 14436 1012 of 14436 1023 of 14436 1026 of 14436 1032 of 14436 1047 of 14436 1054 of 14436

Use the following code to actually create the DB (took me 40 minutes on my laptop)

In [24]:
context_chroma = Chroma.from_documents(docs, embeddings, collection_name="prosaic_contexts_shorter_itemized", persist_directory=persist_directory)
# # Save the database
context_chroma.persist()

Using embedded DuckDB with persistence: data will be stored in: /Users/ryderwishart/genesis/databases/prose-contexts-shorter-itemized


Use the following code if using existing chroma DB

In [17]:
# Load the persisted database from disk and use it as normal
context_chroma = Chroma(persist_directory=persist_directory, embedding_function=embeddings, collection_name="prosaic_contexts")

Using embedded DuckDB with persistence: data will be stored in: /Users/ryderwishart/genesis/databases/prose-contexts


Make sure you can query the DB

In [16]:
print(context_chroma.search('jesus speaks to peter', search_type='similarity', k=1))
print(context_chroma.search('peter is spoken to by jesus', search_type='similarity', k=1))
print(context_chroma.search('jesus (subj) speaks to peter (obj)', search_type='similarity', k=1))

[Document(page_content='This verse has 2 discourse features (these are useful heuristic interpretive annotations that tell you about the nature of the proposition a word is in):\n- Main clauses is defined as Main clauses are the top-level clauses in a sentence. They are the clauses that are not embedded in other clauses.\n  - Τότε (Then) προσελθὼν (having come) ὁ (-) Πέτρος (Peter)\n  - εἶπεν (said) αὐτῷ (to Him)\n  - Κύριε (Lord) ποσάκις (how often) ἁμαρτήσει (will sin) εἰς (against) ἐμὲ (me) ὁ (the) ἀδελφός (brother) μου (of me)\n  - καὶ (and) ἀφήσω (I will forgive) αὐτῷ (him)\n  - ἕως (Up to) ἑπτάκις (seven times)\n- Reported Speech is defined as Reported speech.\n  - Κύριε (Lord) ποσάκις (how often) ἁμαρτήσει (will sin) εἰς (against) ἐμὲ (me) ὁ (the) ἀδελφός (brother) μου (of me) καὶ (and) ἀφήσω (I will forgive) αὐτῷ (him) ἕως (Up to) ἑπτάκις (seven times)\n\nSpeaker data is critical to identifying quoted material and relating it to the proper speaker. In this verse, there are the 

# Add plaintext English Bible in a second DB

In [10]:
import os

english_bible_url = 'https://bereanbible.com/bsb.txt'

if not os.path.exists('bsb.txt'):
#     # import requests
#     # r = requests.get(english_bible_url, allow_redirects=True)
#     # with open('bsb.txt', 'w') as f:
#     #     new_testament = r.split('\nMatthew 1:1')[1]
#     # #     f.write('Matthew 1:1' + new_testament)
    # urllib.request.urlretrieve(english_bible_url, 'bsb.txt')
    
#     # with open('bsb.txt', 'r') as f:
#     #     new_testament = f.read().split('\nMatthew 1:1')[1]
#     #     with open('bsb.txt', 'w') as f:
#     #         f.write('Matthew 1:1' + new_testament)
    !wget https://bereanbible.com/bsb.txt

In [11]:
# Turn the BSB bible into the proper encoding...
import chardet

def find_encoding(fname):
    rawdata = open(fname, 'rb').read()
    result = chardet.detect(rawdata)
    charenc = result['encoding']
    return charenc

my_encoding = find_encoding('bsb.txt')

with open('bsb.txt', 'r', encoding=my_encoding) as f:
    text = f.read().replace('�', '')

with open('bsb.txt', 'w', encoding='utf-8') as f:
    f.write(text)


In [12]:
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings

In [13]:
bible_persist_directory = '/Users/ryderwishart/genesis/databases/berean-bible-database'

In [23]:
### SENTENCE-BASED TEXTS

def get_usfm_verse(ref):
    spaces_in_ref = len(ref.split(' '))
    if spaces_in_ref > 1:
        name = ' '.join(ref.split(' ')[0:-1])
        ref_num = ref.split(' ')[-1]
    else:
        name = ref.split(' ')[0]
        ref_num = ref.split(' ')[1]
    return book_name_mapping[name] + ' ' + ref_num

# Initialize lists
text_list = []
dict_list = []
id_list = []

# Group the DataFrame by 'sentence'
# grouped = mg.groupby('sentence')
# grouped = mg.groupby('book_chapter_verse')
with open('bsb.txt', 'r', encoding='utf-8') as f:
    # create a verses variable to store each line starting with the line that begins 'Genesis 1:1\t'
    verses = f.read().split('Genesis 1:1\t')[1]
    verses = verses.split('\n')
    # drop the final empty string
    verses = verses[:-1]
    # fix the first verse
    verses[0] = 'Genesis 1:1\t' + verses[0]
    for verse in verses:
        ref, text = verse.split('\t')
        text_list.append(verse)
        print(verse) # NOTE: I'm adding the verse, not just the text, because I want the bible ref in the text content for similarity searching
        usfm_verse = get_usfm_verse(ref)
        dict_entry = {'source': ref, 'usfm': usfm_verse}
        dict_list.append(dict_entry)
        id_list.append(ref)
        break
# Print the lists for testing
print(len(text_list), text_list[:5])
print(len(dict_list), dict_list[:5])
print(len(id_list), id_list[:5])

Genesis 1:1	In the beginning God created the heavens and the earth.
1 ['Genesis 1:1\tIn the beginning God created the heavens and the earth.']
1 [{'source': 'Genesis 1:1', 'usfm': 'GEN 1:1'}]
1 ['Genesis 1:1']


Use this code if creating new chroma DB for the Berean bible texts

In [18]:
bible_chroma = Chroma("berean-bible", embeddings, persist_directory=bible_persist_directory)

Using embedded DuckDB with persistence: data will be stored in: /Users/ryderwishart/genesis/databases/berean-bible-database


In [20]:
# Add greek texts with metadata
bible_chroma.add_texts(
    texts=text_list,
    metadatas=dict_list,
    ids=[str(i) + v.split('\t')[0] for i, v in enumerate(text_list)]
    )

['0Genesis 1:1',
 '1Genesis 1:2',
 '2Genesis 1:3',
 '3Genesis 1:4',
 '4Genesis 1:5',
 '5Genesis 1:6',
 '6Genesis 1:7',
 '7Genesis 1:8',
 '8Genesis 1:9',
 '9Genesis 1:10',
 '10Genesis 1:11',
 '11Genesis 1:12',
 '12Genesis 1:13',
 '13Genesis 1:14',
 '14Genesis 1:15',
 '15Genesis 1:16',
 '16Genesis 1:17',
 '17Genesis 1:18',
 '18Genesis 1:19',
 '19Genesis 1:20',
 '20Genesis 1:21',
 '21Genesis 1:22',
 '22Genesis 1:23',
 '23Genesis 1:24',
 '24Genesis 1:25',
 '25Genesis 1:26',
 '26Genesis 1:27',
 '27Genesis 1:28',
 '28Genesis 1:29',
 '29Genesis 1:30',
 '30Genesis 1:31',
 '31Genesis 2:1',
 '32Genesis 2:2',
 '33Genesis 2:3',
 '34Genesis 2:4',
 '35Genesis 2:5',
 '36Genesis 2:6',
 '37Genesis 2:7',
 '38Genesis 2:8',
 '39Genesis 2:9',
 '40Genesis 2:10',
 '41Genesis 2:11',
 '42Genesis 2:12',
 '43Genesis 2:13',
 '44Genesis 2:14',
 '45Genesis 2:15',
 '46Genesis 2:16',
 '47Genesis 2:17',
 '48Genesis 2:18',
 '49Genesis 2:19',
 '50Genesis 2:20',
 '51Genesis 2:21',
 '52Genesis 2:22',
 '53Genesis 2:23',
 '

In [24]:
bible_chroma.persist()

Use this code if loading the db from file

In [26]:
bible_chroma = Chroma("berean-bible", embeddings, persist_directory=bible_persist_directory)

Using embedded DuckDB with persistence: data will be stored in: /Users/ryderwishart/genesis/databases/berean-bible-database


In [34]:
# test bible chroma
print(bible_chroma.search('jesus speaks to peter', search_type='similarity', k=1))
test_doc_usfm_source = bible_chroma.search('jesus speaks to peter', search_type='similarity', k=1)[0].metadata['usfm']
print(test_doc_usfm_source)

[Document(lc_kwargs={'page_content': '“But what about you?” Jesus asked. “Who do you say I am?” Peter answered, “The Christ of God.”', 'metadata': {'source': 'Luke 9:20', 'usfm': 'LUK 9:20'}}, page_content='“But what about you?” Jesus asked. “Who do you say I am?” Peter answered, “The Christ of God.”', metadata={'source': 'Luke 9:20', 'usfm': 'LUK 9:20'})]
LUK 9:20


In [None]:
print(context_chroma.similarity_search_with_score('jesus speaks to peter', search_type='similarity', k=1, kwargs={'where': {'usfm': 'MAT 9'}}))

### To update the context chroma with verse ref and context in each document:

In [37]:
# print(context_chroma.get().keys())
# # dict_keys(['ids', 'embeddings', 'documents', 'metadatas'])
# for k, v in context_chroma.get().items():
#     if k == "embeddings":
#         continue
#     print(k, v[0:5])
#     # update each document in this loop

zipper = zip(
    context_chroma.get()["ids"],
    context_chroma.get()["documents"],
    context_chroma.get()["metadatas"],
)

bible_chroma_zipper = list(zip(
    bible_chroma.get()["ids"],
    bible_chroma.get()["documents"],
    bible_chroma.get()["metadatas"],
))

# Make a lookup dict for the bible chroma based on bible_verse.metadata['source']
bible_chroma_lookup = {}
for bible_verse in bible_chroma_zipper:
    bible_chroma_lookup[bible_verse[2]['source']] = bible_verse[1]

for doc in zipper:
    verse_ref = f"{doc[2]['book']} {doc[2]['chapter']}:{doc[2]['verse']}"
    # Add f"{doc[2]['book']} {doc[2]['chapter']}:{doc[2]['verse']}" to the beginning of doc[1]
    try:
        matching_verse_content = bible_chroma_lookup[verse_ref]
    except KeyError:
        print(f"KeyError: {verse_ref} not found in bible_chroma_lookup")
    # Only update the doc if it does not already start with the matching_verse_content
    if doc[1].startswith(matching_verse_content):
        continue
    else:
        new_doc_page_content = f"{matching_verse_content}\n\n{doc[1]}"
        # Create a new Document instance with the updated content
        updated_doc = Document(page_content=new_doc_page_content, metadata=doc[2])
        # Update the document in the context_chroma db
        context_chroma.update_document(document_id=doc[0], document=updated_doc)
        
context_chroma.persist()


## Create tyndale resources db

In [11]:
tyndale_data_path = '/Users/ryderwishart/biblical-machine-learning/data/tyndale-resources'

tyndale_persist_path = '/Users/ryderwishart/genesis/databases/tyndale'

tyndale_chroma = Chroma("tyndale", embeddings, persist_directory=tyndale_persist_path)

# Chunk the texts into 1000 word chunks
chunked_texts = []
for txt_file in [i for i in os.listdir(tyndale_data_path) if i.endswith('.txt')]:
    loader = TextLoader(os.path.join(tyndale_data_path, txt_file), encoding='utf-8')
    documents = loader.load_and_split()
    chunked_texts.extend(documents)
    
tyndale_chroma.add_documents(chunked_texts)
tyndale_chroma.persist()

Using embedded DuckDB with persistence: data will be stored in: /Users/ryderwishart/genesis/databases/tyndale


## Add a third DB with some public domain theology texts

In [None]:
calvin_urls = [
    "https://www.gutenberg.org/cache/epub/45001/pg45001.txt", # Calvin's Institutes vol 1
    "https://www.gutenberg.org/cache/epub/64392/pg64392.txt", # Calvin's Institutes vol 2
]

other_theology_urls = [
    "https://www.gutenberg.org/cache/epub/35354/pg35354.txt", # Moral Theology, by John A. McHugh and Charles J. Callan
    "https://www.gutenberg.org/files/52648/52648-0.txt", # The Fundamental Doctrines of the Christian faith, by R. A. Torrey
]

strong_sys_theo_urls = [
    "https://www.gutenberg.org/files/44035/44035-0.txt", # Systematic Theology (Volume 1 of 3) by Augustus Hopkins Strong
    "https://www.gutenberg.org/files/44555/44555-0.txt", # Systematic Theology (Volume 2 of 3) by Augustus Hopkins Strong
    "https://www.gutenberg.org/files/45283/45283-0.txt", # Systematic Theology (Volume 3 of 3) by Augustus Hopkins Strong
]

bible_encyclopedic_urls = [
    "https://www.gutenberg.org/files/43070/43070-0.txt", # Archæology and the Bible, by George A. Barton
    "https://www.gutenberg.org/files/40747/40747-0.txt", # Biblical Geography and History, by Charles Foster Kent
]

theology_urls = calvin_urls + other_theology_urls + strong_sys_theo_urls
encyclopedic_urls = bible_encyclopedic_urls


In [None]:
!mkdir -p theology

import os
os.chdir('theology')

for url in theology_urls:
    !wget -nc {url}
    
os.chdir('..')

!mkdir -p encyclopedic

os.chdir('encyclopedic')

for url in encyclopedic_urls:
    !wget -nc {url}

os.chdir('..')

In [None]:
# Chunk the theology texts
from langchain.document_loaders import TextLoader

root_dir = 'theology'

theology_docs = []
for dirpath, dirnames, filenames in os.walk(root_dir):
    for file in filenames:
        if file.endswith('.txt') and '/.venv/' not in dirpath:
            try: 
                loader = TextLoader(os.path.join(dirpath, file), encoding='utf-8')
                theology_docs.extend(loader.load_and_split())
            except Exception as e: 
                pass
print(f'Number of theology document chunks: {len(theology_docs)}')

In [None]:
# # !pip install PyPDF2
# NOTE: PyPDF2 is not extracting text from high-res PDFs
# from PyPDF2 import PdfReader

# reader = PdfReader("/Users/ryderwishart/biblical-machine-learning/gpt-inferences/encyclopedic/brill_awdl000063_hi.pdf")
# number_of_pages = len(reader.pages)
# page = reader.pages[0]
# text = page.extract_text()
# print('number_of_pages', number_of_pages, 'text', text)

In [None]:
!pip install pdf2image pytesseract
# pytesseract, pytesseract, PIL

In [None]:
# # Use Google's Tesseract OCR to extract text from PDFs - NOTE: also does not work 
# from PIL import Image
# from pdf2image import convert_from_path
# import pytesseract
# import os

# filePath = '/Users/ryderwishart/biblical-machine-learning/gpt-inferences/encyclopedic/brill_awdl000063_hi.pdf'
# doc = convert_from_path(filePath)
# path, fileName = os.path.split(filePath)
# fileBaseName, fileExtension = os.path.splitext(fileName)

# for page_number, page_data in enumerate(doc):
#     txt = pytesseract.image_to_string(page_data) #.encode("utf-8") for bytes output
#     print("Page # {} - {}".format(str(page_number),txt))
#     break


In [None]:
# Chunk the encyclopedic texts
from langchain.document_loaders import PyPDFLoader

root_dir = 'encyclopedic'

encyclopedic_docs = []
for dirpath, dirnames, filenames in os.walk(root_dir):
    for file in filenames:
        if file.endswith('.pdf'):
            try:
                loader = PyPDFLoader(os.path.join(dirpath, file))
                print(loader.load_and_split())
                docs.extend(loader.load_and_split())
            except Exception as e:
                print('Error loading PDF', e)
                pass
        if file.endswith('.txt') and '/.venv/' not in dirpath:
            try: 
                loader = TextLoader(os.path.join(dirpath, file), encoding='utf-8')
                encyclopedic_docs.extend(loader.load_and_split())
            except Exception as e: 
                pass
print(f'Number of encyclopedic document chunks: {len(encyclopedic_docs)}')
    

In [None]:
unique_sources = []
for doc in encyclopedic_docs:
    if doc.metadata['source'] not in unique_sources:
        unique_sources.append(doc.metadata['source'])
print(unique_sources)

In [None]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

theology_texts = text_splitter.split_documents(docs)
encyclopedic_texts = text_splitter.split_documents(encyclopedic_docs)

print(f"Number of theology texts for db: {len(theology_texts)}")
print(f"Number of encyclopedic texts for db: {len(encyclopedic_texts)}")

In [None]:
!mkdir -p databases/theology
!mkdir -p databases/encyclopedic

In [None]:
# secondary_sources_chroma = Chroma.from_documents(
#     documents=texts,
#     name='secondary-sources',
#     embeddings=embeddings,
#     persist_directory='databases/secondary-sources'
# )
# secondary_sources_chroma.persist()
# print(secondary_sources_chroma.search('jesus speaks to peter', search_type='similarity', k=1))

theology_persist_directory = 'databases/theology'
theology_chroma = Chroma("theology", embeddings, persist_directory=theology_persist_directory)
theology_chroma.add_documents(theology_texts)
theology_chroma.persist()

In [None]:
# encyclopedic_chroma = Chroma.from_documents(
#     documents=encyclopedic_texts,
#     name='encyclopedic',
#     embeddings=embeddings,
#     persist_directory='databases/encyclopedic'
# )
# encyclopedic_chroma.persist()

encyclopedic_persist_directory = 'databases/encyclopedic'
encyclopedic_chroma = Chroma("encyclopedic", embeddings, persist_directory=encyclopedic_persist_directory)
encyclopedic_chroma.add_documents(encyclopedic_texts)
encyclopedic_chroma.persist()

In [None]:
# theology_chroma = Chroma("theology", embeddings, persist_directory='databases/theology')


In [None]:
print(theology_chroma.search('jesus speaks to peter', search_type='similarity', k=1))

In [None]:
query = 'jesus speaks to peter'

print('Theology context doc:', theology_chroma.search(query, search_type='similarity', k=1))
print('Encyclopedic context doc:', encyclopedic_chroma.search(query, search_type='similarity', k=1))

# Create UI

A simple Gradio frontend UI to query the databases and see the query results from each DB. 

In [None]:
import gradio as gr

def gradio_wrapper(user_input_string='', k=5, metric='similarity'):
    """Take user input and return hits from the database. 
    - inputs: [
        user_input_string: str, # query to search for
        k: int, # number of docs to return
        metric: str, # metric to use for search ('similarity' | 'mmr')
      ]
    """
    bible_contexts = bible_chroma.search(user_input_string, search_type=metric, k=k)
    # bible_context_usfm_sources = [context.metadata['usfm'] for context in bible_contexts]
    
    data_contexts = context_chroma.search(user_input_string, search_type=metric, k=k) #, kwargs={'where': {'usfm': bible_context_usfm_sources[0]}}) # FIXME: use the bible contexts as metadata filters - see https://python.langchain.com/en/latest/modules/indexes/retrievers/examples/chroma_self_query.html
    # bible_contexts = bible_chroma.search(user_input_string, search_type=metric, k=k)
    
    theology_contexts = theology_chroma.search(user_input_string, search_type=metric, k=k)
    encyclopedic_contexts = encyclopedic_chroma.search(user_input_string, search_type=metric, k=k)
    
    return [bible_contexts, data_contexts, theology_contexts, encyclopedic_contexts]

app = gr.Blocks(theme='bethecloud/storj_theme')

with app:
    
    gr.Markdown("## Find relevant New Testament contexts for a given query")

    with gr.Column(width=2):
        question_input = gr.Textbox(label="Question", value="Who is involved in Mat 3:14, and what are the participants trying to accomplish?", type="text")
        # answer_temperature_slider = gr.Slider(minimum=0.1, maximum=1.0, default=0.7, step=0.1, label="Temperature")
        submit_button = gr.Button("Get contexts")
        bible_passage_output = gr.Textbox(label="Related Bible passages")
        contexts_output = gr.Textbox(label="Context documents")
        theology_output = gr.Textbox(label="Theology documents")
        encyclopedic_output = gr.Textbox(label="Encyclopedic documents")
    submit_button.click(
        gradio_wrapper,
        inputs=question_input,
        outputs=[bible_passage_output, contexts_output, theology_output, encyclopedic_output]
    )
    
app.launch()
