In [1]:
#Import necessary libraries
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone
import os
from uuid import uuid4
import json
from PyPDF2 import PdfReader, PdfWriter

  from tqdm.autonotebook import tqdm


In [2]:
#Load the JSON file that contains textbook list
book_folder = "."
f = open('{book_folder}/bcsc_books.json'.format(book_folder=book_folder))

book_list = json.load(f)

for book in book_list['books']:
    reader = PdfReader(book['path'])
    split_folder = book['id']
    print(split_folder)
    
    #Split the book into pages
    for page_num in range(1, book['pages']):
        writer = PdfWriter()
        page = reader.pages[page_num]

        # This is CPU intensive! It ZIPs the contents of the page
        # page.compress_content_streams()

        writer.add_page(page)
        file_name = "{book_folder}/{split_folder}/{page}.pdf".format(
            book_folder=book_folder,
            split_folder=split_folder,
            page=str(page_num)
        )
        print(file_name)
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        
        with open(file_name, "wb") as fh:
            writer.write(fh)

bcsc_2022_section1
./bcsc_2022_section1/1.pdf
./bcsc_2022_section1/2.pdf
./bcsc_2022_section1/3.pdf
./bcsc_2022_section1/4.pdf
./bcsc_2022_section1/5.pdf
./bcsc_2022_section1/6.pdf
./bcsc_2022_section1/7.pdf
./bcsc_2022_section1/8.pdf
./bcsc_2022_section1/9.pdf
./bcsc_2022_section1/10.pdf
./bcsc_2022_section1/11.pdf
./bcsc_2022_section1/12.pdf
./bcsc_2022_section1/13.pdf
./bcsc_2022_section1/14.pdf
./bcsc_2022_section1/15.pdf
./bcsc_2022_section1/16.pdf
./bcsc_2022_section1/17.pdf
./bcsc_2022_section1/18.pdf
./bcsc_2022_section1/19.pdf
./bcsc_2022_section1/20.pdf
./bcsc_2022_section1/21.pdf
./bcsc_2022_section1/22.pdf
./bcsc_2022_section1/23.pdf
./bcsc_2022_section1/24.pdf
./bcsc_2022_section1/25.pdf
./bcsc_2022_section1/26.pdf
./bcsc_2022_section1/27.pdf
./bcsc_2022_section1/28.pdf
./bcsc_2022_section1/29.pdf
./bcsc_2022_section1/30.pdf
./bcsc_2022_section1/31.pdf
./bcsc_2022_section1/32.pdf
./bcsc_2022_section1/33.pdf
./bcsc_2022_section1/34.pdf
./bcsc_2022_section1/35.pdf
./bcsc_202

In [3]:
#Create embedding object
embed_model_name = 'text-embedding-ada-002'

embedding = OpenAIEmbeddings(
    openai_api_key=os.environ['OPENAI_API_KEY']
)

In [4]:
import chromadb

#Load the textbook and Start embedding and upserting to Pinecone
book_folder = "."
f = open('{book_folder}/bcsc_books.json'.format(book_folder=book_folder))
book_list = json.load(f)

#Set persistent storage to use for query later
persist_directory = "./chromadb"
# client = chromadb.PersistentClient(path="./chromadb")
vectordb = Chroma(
                collection_name="bcsc", # Name of the collection
                persist_directory=persist_directory,
                embedding_function=embedding
            )

for book in book_list['books']:
    print("Processing: {book_name}".format(book_name=book['name']))
    
    sub_folder = book['id']
    print(sub_folder)
    
    pages = os.listdir("{book_folder}/{sub_folder}".format(
        book_folder=book_folder,
        sub_folder=sub_folder
    ))

    for page in pages:
        page_num = page.split('.')[0]
        print("\tPage: {page}".format(page=page_num))
        
        file = "{book_folder}/{sub_folder}/{page}".format(
            book_folder=book_folder,
            sub_folder=sub_folder,
            page=page
        )
        print("\t{file}".format(file=file))
        
        reader = PdfReader(file)
        data = reader.pages[0].extract_text()

        if(len(data) > 0):
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)

            texts = text_splitter.split_text(data)

            metadata = { #number #section: #type: question/answer
                'id': book['id'],
                'source': book['name'],
                'page': page_num,
                'year': book['year'],
                'pages': book['pages'],
                'author': book['authors'] 
            }

            ids = [str(uuid4()) for _ in range(len(texts))]

            metadatas = [metadata] * len(texts)

            vectordb.add_texts(
                texts=texts,
                ids=ids,
                metadatas=metadatas
            )

            vectordb.persist()

            

Processing: BCSC: Update on General Medicine
bcsc_2022_section1
	Page: 1
	./bcsc_2022_section1/1.pdf
	Page: 10
	./bcsc_2022_section1/10.pdf
	Page: 100
	./bcsc_2022_section1/100.pdf
	Page: 101
	./bcsc_2022_section1/101.pdf
	Page: 102
	./bcsc_2022_section1/102.pdf
	Page: 103
	./bcsc_2022_section1/103.pdf
	Page: 104
	./bcsc_2022_section1/104.pdf
	Page: 105
	./bcsc_2022_section1/105.pdf
	Page: 106
	./bcsc_2022_section1/106.pdf
	Page: 107
	./bcsc_2022_section1/107.pdf
	Page: 108
	./bcsc_2022_section1/108.pdf
	Page: 109
	./bcsc_2022_section1/109.pdf
	Page: 11
	./bcsc_2022_section1/11.pdf
	Page: 110
	./bcsc_2022_section1/110.pdf
	Page: 111
	./bcsc_2022_section1/111.pdf
	Page: 112
	./bcsc_2022_section1/112.pdf
	Page: 113
	./bcsc_2022_section1/113.pdf
	Page: 114
	./bcsc_2022_section1/114.pdf
	Page: 115
	./bcsc_2022_section1/115.pdf
	Page: 116
	./bcsc_2022_section1/116.pdf
	Page: 117
	./bcsc_2022_section1/117.pdf
	Page: 118
	./bcsc_2022_section1/118.pdf
	Page: 119
	./bcsc_2022_section1/119.pdf
	

In [5]:
print(vectordb._collection.count())

9236
