In [1]:
# !pip install langchain openai chromadb tiktoken

In [11]:
import os
import getpass
# Ask for the key interactively so it is never stored in the notebook itself,
# then expose it through the environment variable that is set below.
secret_key = getpass.getpass('Enter OpenAI secret key: ')
os.environ.update(OPENAI_API_KEY=secret_key)

# Create Chroma Database for Prose Context

In [12]:
# Pick ONE source directory by leaving exactly one assignment uncommented
# (same convention as the `persist_directory` cell further down).
# context_documents_path = '/Users/ryderwishart/genesis/itemized_prose_contexts' # NOTE: this is the directory for the itemized prose contexts
context_documents_path = '/Users/ryderwishart/genesis/prose_contexts' # NOTE: this is the directory for the full prose contexts


In [13]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType
from pathlib import Path

# Language model handle. temperature=0 is the only option passed; `llm` is not
# referenced again in the cells shown in this notebook.
llm = OpenAI(
    temperature=0,
)

## Set up embeddings model

In [14]:
# Where the context Chroma database is persisted. Leave exactly one line uncommented.
# persist_directory = '/Users/ryderwishart/genesis/databases/itemized-prose-contexts' # NOTE: this is the db for the itemized prose contexts
persist_directory = '/Users/ryderwishart/genesis/databases/prose-contexts' # NOTE: this is the db for the full prose contexts
# persist_directory = '/Users/ryderwishart/biblical-machine-learning/gpt-inferences/db' # NOTE: alternative db location (presumably an older copy of the full prose contexts — confirm)

In [15]:
# !pip install sentence_transformers > /dev/null

In [16]:
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
# Embedding function shared by every Chroma store in this notebook. No model
# name is passed, so whatever default the library ships with is used.
embeddings = HuggingFaceEmbeddings()

Use the following code if creating a new Chroma DB

In [17]:
# !pip install unstructured > /dev/null

In [18]:
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import MarkdownTextSplitter

Use the following code to create a new database from the documents in `context_documents_path`

In [19]:
# Lookup table: full English book name -> three-letter USFM book code.
# Both "Psalms" and "Psalm" are accepted as spellings of the same book.
book_name_mapping = {
    # --- Old Testament ---
    "Genesis": "GEN",
    "Exodus": "EXO",
    "Leviticus": "LEV",
    "Numbers": "NUM",
    "Deuteronomy": "DEU",
    "Joshua": "JOS",
    "Judges": "JDG",
    "Ruth": "RUT",
    "1 Samuel": "1SA",
    "2 Samuel": "2SA",
    "1 Kings": "1KI",
    "2 Kings": "2KI",
    "1 Chronicles": "1CH",
    "2 Chronicles": "2CH",
    "Ezra": "EZR",
    "Nehemiah": "NEH",
    "Esther": "EST",
    "Job": "JOB",
    "Psalms": "PSA",
    "Psalm": "PSA",
    "Proverbs": "PRO",
    "Ecclesiastes": "ECC",
    "Song of Solomon": "SNG",
    "Isaiah": "ISA",
    "Jeremiah": "JER",
    "Lamentations": "LAM",
    "Ezekiel": "EZK",
    "Daniel": "DAN",
    "Hosea": "HOS",
    "Joel": "JOL",
    "Amos": "AMO",
    "Obadiah": "OBA",
    "Jonah": "JON",
    "Micah": "MIC",
    "Nahum": "NAM",
    "Habakkuk": "HAB",
    "Zephaniah": "ZEP",
    "Haggai": "HAG",
    "Zechariah": "ZEC",
    "Malachi": "MAL",
    # --- New Testament ---
    "Matthew": "MAT",
    "Mark": "MRK",
    "Luke": "LUK",
    "John": "JHN",
    "Acts": "ACT",
    "Romans": "ROM",
    "1 Corinthians": "1CO",
    "2 Corinthians": "2CO",
    "Galatians": "GAL",
    "Ephesians": "EPH",
    "Philippians": "PHP",
    "Colossians": "COL",
    "1 Thessalonians": "1TH",
    "2 Thessalonians": "2TH",
    "1 Timothy": "1TI",
    "2 Timothy": "2TI",
    "Titus": "TIT",
    "Philemon": "PHM",
    "Hebrews": "HEB",
    "James": "JAS",
    "1 Peter": "1PE",
    "2 Peter": "2PE",
    "1 John": "1JN",
    "2 John": "2JN",
    "3 John": "3JN",
    "Jude": "JUD",
    "Revelation": "REV",
}

# Inverse lookup: USFM code -> English book name. Pairs are applied in
# insertion order, so when two names share a code the later one wins
# ("PSA" resolves to "Psalm").
reverse_book_name_mapping = dict(
    zip(book_name_mapping.values(), book_name_mapping.keys())
)

In [20]:

# data_scope = docs[0].metadata['source'].split('_')[-1].split('.')[0] # example data_scope
# verse_ref = docs[0].metadata['source'].split('/')[-1].split('.')[0] # example verse ref
# book = reverse_book_name_mapping[verse_ref.split(' ')[0]]
# chapter = verse_ref.split(' ')[1].split(':')[0]
# verse = verse_ref.split(' ')[1].split(':')[1]

# example 
# docs[0].metadata['source'].split('_')[-1].split('.')[0]


In [21]:
# Create embeddings and store in a vectorstore
from langchain.text_splitter import MarkdownTextSplitter

# Walk the context directory, load each .md/.txt file as a single document and
# tag it with the verse reference parsed from its file name.
root_dir = context_documents_path

docs = []
for dirpath, dirnames, filenames in os.walk(root_dir):
    for file in filenames:
        # The extension test is a single tuple check so that the .venv
        # exclusion applies to BOTH extensions. The previous
        # `a or b and c` form only excluded .txt files, because `and`
        # binds tighter than `or`.
        if file.endswith(('.md', '.txt')) and '/.venv/' not in dirpath:
            file_path = os.path.join(dirpath, file)
            try:
                loader = UnstructuredMarkdownLoader(file_path)
                doc = loader.load()[0]

                # The file name without its extension is the verse reference.
                # Its first token is looked up as a USFM code, the second is
                # split on ':' (presumably names look like 'MAT 1:1.md' — confirm).
                verse_ref = doc.metadata['source'].split('/')[-1].split('.')[0]
                ref_tokens = verse_ref.split(' ')
                doc.metadata['verse_ref'] = verse_ref
                doc.metadata['book'] = reverse_book_name_mapping[ref_tokens[0]]
                doc.metadata['chapter'] = ref_tokens[1].split(':')[0]
                doc.metadata['verse'] = ref_tokens[1].split(':')[1]

                docs.append(doc)
            except Exception as e:
                # Best effort: report the file that failed and keep going, so
                # one malformed file does not abort the whole load.
                print(f'Error loading {file_path}', e)
print(f'{len(docs)}')

7943


Use the following code to actually create the DB (took me 40 minutes on my laptop)

In [22]:
# Embed every loaded document and store the vectors in a Chroma collection
# backed by `persist_directory`.
context_chroma = Chroma.from_documents(
    docs,
    embeddings,
    collection_name="prosaic_contexts_large",
    persist_directory=persist_directory,
)
# Write the collection to disk so it can be reloaded without re-embedding.
context_chroma.persist()

Using embedded DuckDB with persistence: data will be stored in: /Users/ryderwishart/genesis/databases/prose-contexts


Use the following code if using existing chroma DB

In [7]:
# Load the persisted database from disk and use it as normal
# NOTE(review): this opens collection "prosaic_contexts", while the creation
# cell above writes to "prosaic_contexts_large". Running both cells in order
# rebinds `context_chroma` to a different collection — confirm which
# collection name matches the database in `persist_directory`.
context_chroma = Chroma(persist_directory=persist_directory, embedding_function=embeddings, collection_name="prosaic_contexts")

Using embedded DuckDB with persistence: data will be stored in: /Users/ryderwishart/genesis/databases/itemized-prose-contexts


Make sure you can query the DB

In [8]:
# Smoke-test the context store with a few (label, query) pairs.
# TODO: sort out metadata filtering — the last query only mentions the book
# in its text, it does not filter on metadata.
for label, query in (
    ('MAT 1:1', 'MAT 1:1'),
    ('blind', 'blind'),
    ('blind with filters', 'blind in 1PE'),
):
    print(f'{label} -->', context_chroma.search(query, search_type='similarity'))

MAT 1:1 --> [Document(page_content="Syntax data\n\nTreedown syntax: This word is part of the following sentence:\n        for (γὰρ)] Not even (οὐδὲ)] \n[[s: the (ὁ)] Father (Πατὴρ)] [v: judges (κρίνει)] [o: no one,(οὐδένα)] but (ἀλλὰ)] \n[[o: - (τὴν)] judgment (κρίσιν)] all (πᾶσαν)] [v: has given (δέδωκεν)] [io: to the (τῷ)] Son,(Υἱῷ)] so that (ἵνα)] \n[[s: all (πάντες)] [v: may honor (τιμῶσι)] [o: the (τὸν)] Son (Υἱὸν)] [+: even as (καθὼς)] \n[[v: they honor (τιμῶσι)] [o: the (τὸν)] Father.(Πατέρα)] \n[[s: He who (ὁ)] \n[[+: not (μὴ)] [v: is honoring (τιμῶν)] [o: the (τὸν)] Son (Υἱὸν)] [+: not (οὐ)] [v: is honoring (τιμᾷ)] [o: the (τὸν)] Father (Πατέρα)] [apposition: the [One] (τὸν)] \n[[v: having sent (πέμψαντα)] [o: Him.(αὐτόν)]\n\nSemantic configurations (useful for figuring out what is taking place in the sentence and how this word plays a role):\n        ['[A0: πᾶς] [τιμάω] [A1: υἱός] / [Source: all] [Process: may honor] [Goal: Son]', '[A0: τιμάω] [τιμάω] [A1: πατήρ] / [Source: i

In [15]:
# Compare the top hit for three phrasings of the same question.
for phrasing in (
    'jesus speaks to peter',
    'peter is spoken to by jesus',
    'jesus (subj) speaks to peter (obj)',
):
    print(context_chroma.search(phrasing, search_type='similarity', k=1))

[Document(page_content='Syntax data\n\nTreedown syntax: This word is part of the following sentence:\n        therefore (οὖν)] \n[\n[[+: [o: Him (τοῦτον)] [v: having seen (ἰδὼν)] [s: - (ὁ)] Peter (Πέτρος)] [v: says (λέγει)] [io: - (τῷ)] to Jesus (Ἰησοῦ)] [o: [aux: Lord,(Κύριε)] and (δὲ)] \n[[s: this man (οὗτος)] [p: what about;(τί)]\n\nSemantic configurations (useful for figuring out what is taking place in the sentence and how this word plays a role):\n        [\'[A0: Πέτρος] [ὁράω] [A1: οὗτος] / [Source: Peter] [Process: having seen] [Goal: Him]\', \'[A0: Πέτρος] [λέγω] [A2: Ἰησοῦς] / [Source: Peter] [Process: says] [Beneficiary: to Jesus]\']\n\nThis verse has 7 discourse features (these are useful heuristic interpretive annotations that tell you about the nature of the proposition a word is in):\n- Main clauses is defined as Main clauses are the top-level clauses in a sentence. They are the clauses that are not embedded in other clauses.\n  - τοῦτον (Him) οὖν (therefore) ἰδὼν (havin

# Add plaintext English Bible in a second DB

In [35]:
import os

english_bible_url = 'https://bereanbible.com/bsb.txt'

# Download the Berean Standard Bible once; later runs reuse the local copy.
# Pure-Python download: uses `english_bible_url` and does not depend on a
# `wget` binary being installed.
if not os.path.exists('bsb.txt'):
    import urllib.request  # local import keeps this cell self-contained

    urllib.request.urlretrieve(english_bible_url, 'bsb.txt')

In [12]:
# Turn the BSB bible into the proper encoding...
import chardet

def find_encoding(fname):
    """Guess the character encoding of a file.

    Args:
        fname: Path of the file to inspect; it is read in full, in binary mode.

    Returns:
        The value stored under 'encoding' in the result of `chardet.detect`.
    """
    # The context manager closes the handle; the bare open().read() left it open.
    with open(fname, 'rb') as raw_file:
        rawdata = raw_file.read()
    result = chardet.detect(rawdata)
    return result['encoding']

# Re-save bsb.txt as UTF-8: decode with the detected encoding, strip any
# replacement characters, then overwrite the file in place.
my_encoding = find_encoding('bsb.txt')

with open('bsb.txt', mode='r', encoding=my_encoding) as source_file:
    text = source_file.read()
text = text.replace('�', '')

with open('bsb.txt', mode='w', encoding='utf-8') as target_file:
    target_file.write(text)


In [13]:
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings

In [36]:
bible_persist_directory = '/Users/ryderwishart/genesis/databases/berean-bible-database'

In [17]:
### SENTENCE-BASED TEXTS

def get_usfm_verse(ref, mapping=None):
    """Convert a long-form verse reference to its USFM form.

    Example: 'Song of Solomon 2:4' -> 'SNG 2:4'.

    Args:
        ref: Book name, a space, then the chapter:verse token. The book name
            may itself contain spaces ('1 John', 'Song of Solomon').
        mapping: Optional dict from long book names to USFM codes. Defaults to
            the notebook-level `book_name_mapping`.

    Returns:
        The USFM book code, a space, and the unchanged chapter:verse token.

    Raises:
        ValueError: If `ref` has no space separating the book name from the
            chapter:verse token (previously an opaque IndexError).
        KeyError: If the book name is not present in the mapping.
    """
    if mapping is None:
        mapping = book_name_mapping

    # Split on the LAST space: everything before it is the book name.
    name, separator, ref_num = ref.strip().rpartition(' ')
    if not separator:
        raise ValueError(f'Expected "<book> <chapter>:<verse>", got {ref!r}')
    return mapping[name.strip()] + ' ' + ref_num

# Parallel lists handed to Chroma later: verse text, per-verse metadata, ids.
text_list = []
dict_list = []
id_list = []

FIRST_VERSE_MARKER = 'Genesis 1:1\t'

with open('bsb.txt', 'r', encoding='utf-8') as f:
    # Everything before the first verse line is header material; keep only
    # the text from 'Genesis 1:1\t' onwards.
    _header, marker, body = f.read().partition(FIRST_VERSE_MARKER)
if not marker:
    raise ValueError("bsb.txt does not contain a 'Genesis 1:1<TAB>' line")

# Skip blank lines instead of unconditionally dropping the last one, so the
# final verse survives when the file has no trailing newline.
verses = [line for line in (FIRST_VERSE_MARKER + body).split('\n') if line.strip()]

for verse in verses:
    # Split on the first tab only: the reference never contains a tab, and a
    # stray tab in the verse text must not break the unpacking.
    ref, text = verse.split('\t', 1)
    text_list.append(text)
    usfm_verse = get_usfm_verse(ref)
    dict_entry = {'source': ref, 'usfm': usfm_verse}
    dict_list.append(dict_entry)
    id_list.append(ref)

# Print the lists for testing
print(len(text_list), text_list[:5])
print(len(dict_list), dict_list[:5])
print(len(id_list), id_list[:5])

31102 ['In the beginning God created the heavens and the earth.', 'Now the earth was formless and void, and darkness was over the surface of the deep. And the Spirit of God was hovering over the surface of the waters.', 'And God said, “Let there be light,” and there was light.', 'And God saw that the light was good, and He separated the light from the darkness.', 'God called the light “day,” and the darkness He called “night.” And there was evening, and there was morning—the first day.']
31102 [{'source': 'Genesis 1:1', 'usfm': 'GEN 1:1'}, {'source': 'Genesis 1:2', 'usfm': 'GEN 1:2'}, {'source': 'Genesis 1:3', 'usfm': 'GEN 1:3'}, {'source': 'Genesis 1:4', 'usfm': 'GEN 1:4'}, {'source': 'Genesis 1:5', 'usfm': 'GEN 1:5'}]
31102 ['Genesis 1:1', 'Genesis 1:2', 'Genesis 1:3', 'Genesis 1:4', 'Genesis 1:5']


Use this code if creating new chroma DB for the Berean bible texts

In [37]:
# Open (or create) the "berean-bible" collection in its own persist directory,
# using the same embedding function as the context store.
bible_chroma = Chroma(
    collection_name="berean-bible",
    embedding_function=embeddings,
    persist_directory=bible_persist_directory,
)

Using embedded DuckDB with persistence: data will be stored in: /Users/ryderwishart/genesis/databases/berean-bible-database


In [38]:
# Add the English BSB verse texts with their metadata.
# The three lists are parallel: texts[i] gets metadatas[i] ('source' and
# 'usfm' references) and ids[i] (the long-form reference, e.g. 'Genesis 1:1').
bible_chroma.add_texts(
    texts=text_list,
    metadatas=dict_list,
    ids=id_list    
    )

['Genesis 1:1',
 'Genesis 1:2',
 'Genesis 1:3',
 'Genesis 1:4',
 'Genesis 1:5',
 'Genesis 1:6',
 'Genesis 1:7',
 'Genesis 1:8',
 'Genesis 1:9',
 'Genesis 1:10',
 'Genesis 1:11',
 'Genesis 1:12',
 'Genesis 1:13',
 'Genesis 1:14',
 'Genesis 1:15',
 'Genesis 1:16',
 'Genesis 1:17',
 'Genesis 1:18',
 'Genesis 1:19',
 'Genesis 1:20',
 'Genesis 1:21',
 'Genesis 1:22',
 'Genesis 1:23',
 'Genesis 1:24',
 'Genesis 1:25',
 'Genesis 1:26',
 'Genesis 1:27',
 'Genesis 1:28',
 'Genesis 1:29',
 'Genesis 1:30',
 'Genesis 1:31',
 'Genesis 2:1',
 'Genesis 2:2',
 'Genesis 2:3',
 'Genesis 2:4',
 'Genesis 2:5',
 'Genesis 2:6',
 'Genesis 2:7',
 'Genesis 2:8',
 'Genesis 2:9',
 'Genesis 2:10',
 'Genesis 2:11',
 'Genesis 2:12',
 'Genesis 2:13',
 'Genesis 2:14',
 'Genesis 2:15',
 'Genesis 2:16',
 'Genesis 2:17',
 'Genesis 2:18',
 'Genesis 2:19',
 'Genesis 2:20',
 'Genesis 2:21',
 'Genesis 2:22',
 'Genesis 2:23',
 'Genesis 2:24',
 'Genesis 2:25',
 'Genesis 3:1',
 'Genesis 3:2',
 'Genesis 3:3',
 'Genesis 3:4',
 '

In [39]:
# Write the Berean Bible collection to `bible_persist_directory` so it can be
# reloaded later without re-embedding every verse.
bible_chroma.persist()

Use this code if loading the db from file

In [None]:
# bible_chroma = Chroma("berean-bible", embeddings, persist_directory=bible_persist_directory)

In [26]:
# test bible chroma
# Run the query once and reuse the hits; the previous version embedded and
# searched the same string twice.
test_hits = bible_chroma.search('jesus speaks to peter', search_type='similarity', k=1)
print(test_hits)
test_doc_usfm_source = test_hits[0].metadata['usfm']
print(test_doc_usfm_source)

[Document(page_content='“But what about you?” Jesus asked. “Who do you say I am?” Peter answered, “The Christ of God.”', metadata={'source': 'Luke 9:20', 'usfm': 'LUK 9:20'})]
LUK 9:20


In [33]:
# Restrict the search to one chapter with a metadata filter. The filter goes in
# the `filter` argument; a literal `kwargs=` keyword is not interpreted as one.
# The keys are the ones the loader cell in this notebook sets on each document
# ('book' holds the long name, 'chapter' a string).
# NOTE(review): a database built elsewhere may carry different metadata keys —
# confirm before relying on this filter.
print(context_chroma.similarity_search_with_score(
    'jesus speaks to peter',
    k=1,
    filter={'$and': [{'book': 'Matthew'}, {'chapter': '9'}]},
))



# Create UI

A simple Gradio frontend UI to query the databases and see the query results from each DB. 

In [35]:
import gradio as gr

def gradio_wrapper(user_input_string='', k=5, metric='similarity'):
    """Query both vector stores with the same text and return the hits.

    Args:
        user_input_string: Query to search for.
        k: Number of documents to request from each store.
        metric: Search type passed to Chroma ('similarity' | 'mmr').

    Returns:
        A two-item list: hits from the Bible store, then hits from the
        context store.
    """
    search_options = {'search_type': metric, 'k': k}

    bible_contexts = bible_chroma.search(user_input_string, **search_options)

    # FIXME: use the bible contexts as metadata filters for the context query,
    # e.g. the 'usfm' value of the top Bible hit - see
    # https://python.langchain.com/en/latest/modules/indexes/retrievers/examples/chroma_self_query.html
    data_contexts = context_chroma.search(user_input_string, **search_options)

    return [bible_contexts, data_contexts]

# Single-column UI: one question box, one button, and one output box per
# database.
app = gr.Blocks(theme='bethecloud/storj_theme')

with app:

    gr.Markdown("## Find relevant New Testament contexts for a given query")

    # `gr.Column` has no `width` argument; `scale` is its relative-width option.
    with gr.Column(scale=2):
        question_input = gr.Textbox(label="Question", value="Who is involved in Mat 3:14, and what are the participants trying to accomplish?", type="text")
        submit_button = gr.Button("Get contexts")
        bible_passage_output = gr.Textbox(label="Related Bible passages")
        contexts_output = gr.Textbox(label="Context documents")

    # Only the question is wired in as an input, so `gradio_wrapper` runs with
    # its default `k` and `metric`.
    submit_button.click(
        gradio_wrapper,
        inputs=question_input,
        outputs=[bible_passage_output, contexts_output]
    )

app.launch()


  exec(code_obj, self.user_global_ns, self.user_ns)


Running on local URL:  http://127.0.0.1:7872

To create a public link, set `share=True` in `launch()`.


