In [1]:
from llama_index import (
    VectorStoreIndex, 
    KnowledgeGraphIndex,
    SimpleDirectoryReader, 
    ServiceContext,
    LLMPredictor,
    set_global_service_context
)
from llama_index.graph_stores import SimpleGraphStore
from llama_index.llms.openai import OpenAI
from tqdm import tqdm
import time, openai, os, re, shutil, ast
import pandas as pd

from llama_index.node_parser import SimpleNodeParser

# Pull variables from a local .env file into the process environment, then
# hand the OpenAI key to the openai client (None when the variable is unset).
from dotenv import load_dotenv
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# Working directories referenced by later cells; created only when missing.
for _dirname in ('temp', 'query_engines'):
    os.makedirs(_dirname, exist_ok=True)

In [2]:
# Header-field patterns, compiled once and shared by every call.
# The title runs (possibly across several lines) up to the "Creator" label.
_TITLE_PATTERN = re.compile(r'Title:\s*(.+?)(?=Creator)', re.DOTALL)
# Creator(s) run until a blank line, the next "Label:" header line, or the end
# of the text. No MULTILINE `$` here: it would match at the end of the first
# line and silently drop every continuation line.
_CREATOR_PATTERN = re.compile(
    r'Creator\(s\):\s*(.+?)(?=\n[ \t]*\n|\n[ \t]*[A-Z][A-Za-z .]*:|\Z)',
    re.DOTALL,
)
# Marker such as "Image of page 1", "Image of page 2", ...
_PAGE_IMAGE_PATTERN = re.compile(r'Image of page \d+')
# More than this many page-image markers means the file is not treated as a book.
_MAX_PAGE_IMAGES = 3


def _join_nonblank_lines(text, separator):
    """Strip each line of `text`, drop empty ones, and join the rest with `separator`."""
    return separator.join(line.strip() for line in text.split('\n') if line.strip())


def extract_title_and_creators(filename):
    """Extract the title and creator(s) from the header of a text file.

    The file is expected to contain ``Title:`` and ``Creator(s):`` header
    fields. A multi-line title is collapsed into one space-separated string;
    multiple creator lines are joined with ``' - '``.

    Args:
        filename: Path to a UTF-8 text file. Relative paths are resolved
            against the current working directory.

    Returns:
        A ``(title, creators, is_book)`` tuple. When the content has more than
        three "Image of page N" markers the file is considered not to be a
        book and ``('', '', False)`` is returned. Otherwise ``is_book`` is
        ``True`` and a missing field yields an empty string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        content = file.read()

    # Too many page-image markers: treated as not a book.
    if len(_PAGE_IMAGE_PATTERN.findall(content)) > _MAX_PAGE_IMAGES:
        return '', '', False

    title_match = _TITLE_PATTERN.search(content)
    title = _join_nonblank_lines(title_match.group(1), ' ') if title_match else ''

    creator_match = _CREATOR_PATTERN.search(content)
    creators = _join_nonblank_lines(creator_match.group(1), ' - ') if creator_match else ''

    return title, creators, True

In [3]:
# LLM handle: gpt-3.5-turbo at temperature 0.
chatgpt = OpenAI(model="gpt-3.5-turbo", temperature=0)

# Service context carrying the LLM and a 1024 chunk-size limit.
# NOTE(review): `chunk_size_limit` may be a deprecated alias of `chunk_size`
# in the installed llama_index version - confirm before upgrading.
service_context = ServiceContext.from_defaults(llm=chatgpt, chunk_size_limit=1024)

# Node parser built with the library defaults.
parser = SimpleNodeParser.from_defaults()

# Register the context as the process-wide default.
set_global_service_context(service_context)

In [4]:
# Book table: the indexing cell reads its 'filename' and
# 'final_bundled_category' columns.
books = pd.read_csv('filename_category.csv')

# Category table: later cells read 'Category', 'Description',
# 'Example Titles' and 'Frequent Authors'.
categories = pd.read_csv('category_description.csv')

# Accumulators keyed by category name; `category_docs` is filled by the
# indexing cell.
# NOTE(review): `last_index` and `category_index` are not used in the visible
# cells - confirm they are still needed.
last_index = 0
category_docs, category_index = {}, {}

In [5]:
# One text summary per category row: the description followed by the example
# titles and frequent authors. Both list columns hold Python list literals
# stored as strings, hence `ast.literal_eval`.
index_summary = []
for row in categories.iterrows():
    record = row[1]  # iterrows() yields (index, Series); only the Series is needed
    description = record['Description']
    example_titles = ', '.join(ast.literal_eval(record['Example Titles']))
    frequent_authors = ', '.join(ast.literal_eval(record['Frequent Authors']))
    index_summary.append(
        f"{description}\nIncludes books like {example_titles}\nand authors like {frequent_authors}"
    )

In [7]:
# Build one vector index per category and persist it under query_engines/.
# Each book is read straight from raw_data/ccel via `input_files` instead of
# being copied to a shared temp/temp.txt and loading the whole temp directory:
# that round trip pulled any stray file in temp/ into every book's documents.
for category in categories['Category']:
    print(f"\nIndexing {category}...")
    category_docs[category] = []
    temp_index = VectorStoreIndex([], service_context=service_context)
    category_books = books.loc[books['final_bundled_category'] == category, 'filename']
    for book in tqdm(category_books):
        book_path = f'raw_data/ccel/{book}'
        # NOTE(review): `validation` is False when the file is judged not to be
        # a book, but it is not acted on here - such files are still indexed
        # with empty title/authors. Confirm whether they should be skipped.
        title, authors, validation = extract_title_and_creators(book_path)
        documents = SimpleDirectoryReader(input_files=[book_path]).load_data()
        for doc in documents:
            # Attach the parsed header fields to each document's metadata.
            doc.metadata['title'] = title
            doc.metadata['authors'] = authors
            category_docs[category].append(doc)
            temp_index.insert(doc)
    # Persist once per category, after all of its books have been inserted.
    temp_index.storage_context.persist(persist_dir=f'query_engines/{category}')


Indexing Biblical Texts and Commentaries...


100%|██████████| 232/232 [1:51:08<00:00, 28.74s/it]    



Indexing Christian Biography...


100%|██████████| 11/11 [00:48<00:00,  4.41s/it]



Indexing Christian Devotional...


100%|██████████| 12/12 [00:29<00:00,  2.48s/it]



Indexing Christian Fiction...


100%|██████████| 42/42 [05:00<00:00,  7.14s/it]



Indexing Christian Life and Worship...


100%|██████████| 254/254 [1:04:33<00:00, 15.25s/it]



Indexing Christian Living...


100%|██████████| 51/51 [04:40<00:00,  5.49s/it]



Indexing Christian Poetry...


100%|██████████| 11/11 [00:34<00:00,  3.14s/it]



Indexing Early Christian Fathers...


100%|██████████| 12/12 [01:05<00:00,  5.50s/it]



Indexing Early Christian Literature...


100%|██████████| 11/11 [07:29<00:00, 40.83s/it]



Indexing Historical and Biographical Texts...


100%|██████████| 63/63 [16:50<00:00, 16.04s/it]



Indexing Miscellaneous...


100%|██████████| 144/144 [30:49<00:00, 12.84s/it] 



Indexing Systematic Theology...


100%|██████████| 10/10 [03:52<00:00, 23.25s/it]



Indexing Theology...


100%|██████████| 68/68 [26:43<00:00, 23.59s/it] 



Indexing Theology and Beliefs...


100%|██████████| 177/177 [1:07:00<00:00, 22.71s/it]
