In [1]:
from llama_index import (
    VectorStoreIndex, 
    KnowledgeGraphIndex,
    SimpleDirectoryReader, 
    ServiceContext,
    LLMPredictor,
    set_global_service_context
)
from llama_index.graph_stores import SimpleGraphStore
from llama_index.llms.openai import OpenAI
from tqdm import tqdm
import time, openai, os, re, shutil, ast
import pandas as pd
import numpy as np

from llama_index.node_parser import SimpleNodeParser

# Load variables via python-dotenv, then hand the OpenAI key to the client
# (the value is None when OPENAI_API_KEY is not set).
from dotenv import load_dotenv
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# Working directories used by the indexing cells; existing ones are left alone.
for working_dir in ('temp', 'query_engines'):
    os.makedirs(working_dir, exist_ok=True)

In [2]:
def extract_title_and_creators(filename):
    """Read a text file and pull the title and creator(s) out of its header.

    Args:
        filename: Path of the file to read; it is joined onto the current
            working directory and opened as UTF-8.

    Returns:
        A ``(title, creators, is_book)`` tuple. When the text contains more
        than three "Image of page N" markers the file is treated as not being
        a book and ``('', '', False)`` is returned. Otherwise ``is_book`` is
        True, ``title`` is the text after "Title:" up to the next "Creator"
        with its lines joined by single spaces, and ``creators`` is the text
        after "Creator(s):" with its lines joined by " - ". A field that
        cannot be found is returned as an empty string.
    """
    path = os.path.join(os.getcwd(), filename)
    with open(path, 'r', encoding='utf-8') as handle:
        text = handle.read()

    # More than 3 "Image of page N" markers means the file is not a book.
    page_image_markers = re.findall(r'Image of page \d+', text)
    if len(page_image_markers) > 3:
        return '', '', False

    def _collapse(match, separator):
        # Join the non-blank, stripped lines of the captured group.
        if match is None:
            return ''
        stripped_lines = (line.strip() for line in match.group(1).strip().split('\n'))
        return separator.join(line for line in stripped_lines if line)

    flags = re.MULTILINE | re.DOTALL
    title_hit = re.search(r'Title:\s*(.+?)(?=Creator)', text, flags)
    # Under MULTILINE the `$` alternative also matches at the end of a line.
    creator_hit = re.search(r'Creator\(s\):\s*(.+?)(?=\n\s+[A-Z][a-zA-Z]*:|$)', text, flags)

    return _collapse(title_hit, ' '), _collapse(creator_hit, ' - '), True

In [3]:
# LLM handle: gpt-3.5-turbo with temperature 0.
chatgpt = OpenAI(temperature=0, model="gpt-3.5-turbo")
# Service context bundling the LLM with a 1024 chunk-size limit.
# NOTE(review): newer llama_index releases appear to prefer `chunk_size` over
# `chunk_size_limit` -- confirm against the installed version.
service_context = ServiceContext.from_defaults(
    chunk_size_limit=1024,
    llm=chatgpt
)

# Default node parser. NOTE(review): `parser` is not referenced by any other
# cell visible in this notebook.
parser = SimpleNodeParser.from_defaults()

# Register the context as the llama_index global default.
set_global_service_context(service_context)

In [9]:
# One row per book: filename, final_bundled_category, title, authors.
books = pd.read_csv('filename_category.csv')
# One row per category; later cells read its 'Category', 'Description',
# 'Example Titles' and 'Frequent Authors' columns.
categories = pd.read_csv('category_description.csv')
# NOTE(review): `last_index` and `category_index` are not used by any cell
# visible in this notebook.
last_index = 0
# Filled by the indexing cells: category name -> list of loaded documents.
category_docs = {}
category_index = {}
# Preview the first rows of the book table.
books.head()

Unnamed: 0,filename,final_bundled_category,title,authors
0,ccel_aaberg_hymnsdenmark.txt,Historical and Biographical Texts,Hymns and Hymnwriters of Denmark,"Aaberg, Jens Christian (1877-1970)"
1,ccel_abelard_misfortunes.txt,Christian Biography,Historia Calamitatum: The Story of My Misfortunes,"Abelard, Peter"
2,ccel_addison_evidences.txt,Theology and Beliefs,"The Evidences of the Christian Religion, with ...","Addison, Joseph (1672-1719)"
3,ccel_adeney_expositoreznehes.txt,Biblical Texts and Commentaries,"The Expositor's Bible: Ezra, Nehemiah, and Esther","Adeney, Walter Frederic (1849-1920)"
4,ccel_adeney_expositorsonglament.txt,Biblical Texts and Commentaries,The Expositor's Bible: The Song of Solomon and...,"Adeney, Walter Frederic (1849-1920)"


In [11]:
# Number of books in each bundled category.
books['final_bundled_category'].value_counts()

final_bundled_category
Biblical Texts and Commentaries      171
Theology and Beliefs                 169
Christian Life and Worship           167
Reformed Theology                    144
Miscellaneous                        140
Sermons                              102
Theology                              68
Historical and Biographical Texts     63
Reformed Commentaries                 58
Christian Living                      51
Christian Fiction                     41
Christian Devotional                  12
Early Christian Fathers               12
Christian Biography                   11
Christian Poetry                      11
Early Christian Literature            11
Systematic Theology                   10
Name: count, dtype: int64

In [43]:
# Refresh the sampled example titles / authors of the two Reformed categories.
# np.random.choice draws WITH replacement by default, which could list the same
# title or author twice, so sampling is done without replacement here.
for category_name in ('Reformed Commentaries', 'Reformed Theology'):
    in_category = books['final_bundled_category'] == category_name
    for target_column, source_column in (('Example Titles', 'title'),
                                         ('Frequent Authors', 'authors')):
        # Missing values are dropped: a NaN would be written out as `nan`,
        # which ast.literal_eval cannot parse when the summary is built.
        candidates = books.loc[in_category, source_column].dropna().unique()
        sample_size = min(3, len(candidates))
        sampled = np.random.choice(candidates, size=sample_size, replace=False)
        # Stored as the repr of a Python list of strings.
        categories.loc[categories['Category'] == category_name, target_column] = str(sampled.tolist())

In [48]:
# Write the updated table back to the same CSV that `categories` was loaded
# from, overwriting it.
categories.to_csv('category_description.csv', index=False)

In [5]:
# One summary line per category: its description plus the example titles and
# frequent authors, which are stored in the table as list literals.
index_summary = []
for _, record in categories.iterrows():
    example_titles = ', '.join(ast.literal_eval(record['Example Titles']))
    frequent_authors = ', '.join(ast.literal_eval(record['Frequent Authors']))
    index_summary.append(
        f"{record['Description']} - Includes books like {example_titles} and authors like {frequent_authors}"
    )

# Save the summaries to a text file, one category per line.
summary_text = '\n'.join(index_summary)
with open('index_summary.txt', 'w', encoding='utf-8') as summary_file:
    summary_file.write(summary_text)

In [6]:
# Build one vector index per category listed in `categories` and persist it
# under query_engines/<category name, lower-cased, spaces as underscores>.
for category in categories['Category']:
    print(f"\nIndexing {category}...")
    category_docs[category] = []
    temp_index = VectorStoreIndex([], service_context=service_context)
    category_filenames = books.loc[books['final_bundled_category'] == category, 'filename']
    for book in tqdm(category_filenames):
        source_path = f'raw_data/ccel/{book}'
        # Stage the book as temp/temp.txt; the reader below loads the temp directory.
        shutil.copy(source_path, 'temp/temp.txt')
        # `validation` (the is-a-book flag) is unpacked but not acted on here.
        title, authors, validation = extract_title_and_creators(source_path)
        documents = SimpleDirectoryReader('temp').load_data()
        for doc in documents:
            doc.metadata.update({'title': title, 'authors': authors})
            category_docs[category].append(doc)
            temp_index.insert(doc)
    persist_dir = f'query_engines/{category.lower().replace(" ", "_")}'
    temp_index.storage_context.persist(persist_dir=persist_dir)


Indexing Biblical Texts and Commentaries...


100%|██████████| 174/174 [1:21:10<00:00, 27.99s/it]   



Indexing Christian Biography...


100%|██████████| 11/11 [00:54<00:00,  4.96s/it]



Indexing Christian Devotional...


100%|██████████| 12/12 [00:31<00:00,  2.65s/it]



Indexing Christian Fiction...


100%|██████████| 42/42 [04:52<00:00,  6.96s/it]



Indexing Christian Life and Worship...


100%|██████████| 254/254 [1:04:32<00:00, 15.25s/it]



Indexing Christian Living...


100%|██████████| 51/51 [04:39<00:00,  5.49s/it]



Indexing Christian Poetry...


100%|██████████| 11/11 [00:38<00:00,  3.46s/it]



Indexing Early Christian Fathers...


100%|██████████| 12/12 [01:02<00:00,  5.24s/it]



Indexing Early Christian Literature...


100%|██████████| 11/11 [07:28<00:00, 40.79s/it]



Indexing Historical and Biographical Texts...


100%|██████████| 63/63 [16:50<00:00, 16.03s/it]



Indexing Miscellaneous...


100%|██████████| 143/143 [31:07<00:00, 13.06s/it] 



Indexing Systematic Theology...


100%|██████████| 10/10 [03:51<00:00, 23.12s/it]



Indexing Theology...


100%|██████████| 68/68 [26:44<00:00, 23.59s/it]  



Indexing Theology and Beliefs...


100%|██████████| 177/177 [1:07:00<00:00, 22.71s/it]



Indexing Reformed Commentaries...


100%|██████████| 58/58 [30:22<00:00, 31.43s/it] 



Indexing Reformed Theology...


100%|██████████| 145/145 [52:39<00:00, 21.79s/it]  


In [8]:
# Export the rows of the 'Christian Life and Worship' category to their own CSV.
books.loc[books['final_bundled_category'] == 'Christian Life and Worship', :].to_csv('christian_life_and_worship.csv', index=False)

In [12]:
# Build and persist the vector indexes for two specific categories, one
# index per category under query_engines/.
for category in ['Christian Life and Worship', 'Sermons']:
    print(f"\nIndexing {category}...")
    category_docs[category] = []
    temp_index = VectorStoreIndex([], service_context=service_context)
    category_filenames = books.loc[books['final_bundled_category'] == category, 'filename']
    for book in tqdm(category_filenames):
        source_path = f'raw_data/ccel/{book}'
        # Stage the book as temp/temp.txt; the reader below loads the temp directory.
        shutil.copy(source_path, 'temp/temp.txt')
        # `validation` (the is-a-book flag) is unpacked but not acted on here.
        title, authors, validation = extract_title_and_creators(source_path)
        documents = SimpleDirectoryReader('temp').load_data()
        for doc in documents:
            doc.metadata.update({'title': title, 'authors': authors})
            category_docs[category].append(doc)
            temp_index.insert(doc)
    persist_dir = f'query_engines/{category.lower().replace(" ", "_")}'
    temp_index.storage_context.persist(persist_dir=persist_dir)


Indexing Christian Life and Worship...


100%|██████████| 167/167 [21:48<00:00,  7.84s/it]



Indexing Sermons...


100%|██████████| 102/102 [44:18<00:00, 26.06s/it]
