In [None]:
######################## Multiple PDF processing start ####################

In [None]:
import glob
import hashlib
import os
import pickle
import re

import chromadb
import faiss
from dotenv import load_dotenv

from llama_parse import LlamaParse
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
# from langchain_community.docstore.in_memory import InMemoryDocstore
# from langchain.agents import initialize_agent, Tool, AgentType
# from langchain.memory import ConversationBufferMemory
from langchain_openai import AzureOpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

In [5]:
# nest_asyncio patches asyncio so that an event loop can be re-entered while it
# is already running, which is the situation inside a Jupyter kernel.
# NOTE(review): presumably needed by the LlamaParse calls later in this
# notebook, which look like synchronous wrappers around async work -- confirm.
import nest_asyncio

nest_asyncio.apply()

In [6]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Credentials this notebook expects to find in the environment.
# AZURE_OPENAI_* and openai_api_key are read explicitly by the embedding cells;
# LLAMA_CLOUD_API_KEY is presumably picked up by LlamaParse itself -- confirm.
REQUIRED_ENV_VARS = (
    "LLAMA_CLOUD_API_KEY",
    "AZURE_OPENAI_API_KEY",
    "AZURE_OPENAI_ENDPOINT",
    "openai_api_key",
)

# Fail fast with a message that names the missing variables, instead of the
# opaque "TypeError: str expected, not NoneType" that
# `os.environ[name] = os.getenv(name)` raises when a variable is absent.
missing_env_vars = [name for name in REQUIRED_ENV_VARS if name not in os.environ]
if missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(missing_env_vars)
        + ". Define them in your shell or in the .env file."
    )

In [None]:
# --------------------------------------------------
# 1. Process multiple PDFs and create restructured markdown files
# --------------------------------------------------
# Input PDFs are expected to be named policy-<company>-plan-<plan>.pdf
# (e.g. policy-a-plan-1.pdf). Adjust PDF_GLOB if yours are named differently.
PDF_GLOB = './data/input_data/pdf/policy-*-plan-*.pdf'
MARKDOWN_DIR = './data/input_data/markdown'

# glob returns paths in arbitrary order; sort so every run processes the
# files in the same sequence.
pdf_files = sorted(glob.glob(PDF_GLOB))

# The output directory may not exist yet on a fresh checkout; without this the
# open() below fails with FileNotFoundError.
os.makedirs(MARKDOWN_DIR, exist_ok=True)

# Build the parser once: its configuration does not depend on the file being
# parsed, so there is no reason to re-create it on every iteration.
pdf_parser = LlamaParse(
    result_type="markdown",
    user_prompt="""Streamline the policy document for clarity and conciseness while retaining all essential details. Remove redundancy without compromising key policies, coverage, or regulatory information. Ensure the final document is precise and well-structured"""
)

for pdf_file in pdf_files:
    # Convert the PDF to markdown, applying the refinement prompt above.
    documents = pdf_parser.load_data(pdf_file)

    # Derive the output filename from the PDF name (extension stripped).
    base_name = os.path.splitext(os.path.basename(pdf_file))[0]
    output_md_path = f'{MARKDOWN_DIR}/{base_name}-restructured.md'

    # Write explicitly as UTF-8: the platform default encoding may be unable
    # to represent characters that appear in the parsed text.
    with open(output_md_path, 'w', encoding='utf-8') as f:
        for doc in documents:
            f.write(doc.text + '\n\n')

Started parsing the file under job_id 610b5674-d09e-4114-9c0e-d3db0e97d6f1
..Started parsing the file under job_id 0e1df70f-f9de-4cad-9331-9d20432c6c51
..Started parsing the file under job_id 72e6475d-b8ef-4c37-9a5a-a616e6b7edae
.Started parsing the file under job_id 07428a06-c989-4d78-b4bc-9ea9d36cf8b3
Started parsing the file under job_id c99969ed-0f4e-4c74-9b50-f48848a3511d
..Started parsing the file under job_id 188a95ad-ddf7-40bf-b380-37d1f29d1271


In [76]:
# --------------------------------------------------
# 2. Load all restructured markdown files and attach metadata
# --------------------------------------------------
# Expected filename: policy-<company>-plan-<plan>-restructured.md
# <company> and <plan> may contain letters, digits, underscores or hyphens.
# The pattern is anchored on the "-restructured.md" suffix so the whole plan
# token is captured (a letters-only character class would reject or truncate
# plans such as "1" or "gold2").
POLICY_FILENAME_RE = re.compile(
    r'policy-(?P<company>.+?)-plan-(?P<plan>.+)-restructured\.md$',
    re.IGNORECASE,
)


def parse_policy_filename(path):
    """Extract company and plan from a restructured-markdown file path.

    Only the final path component is inspected, so directory names can never
    produce a false match.

    Args:
        path: Path to a markdown file.

    Returns:
        A ``(company, plan)`` tuple with ``company`` upper-cased and ``plan``
        returned as written, or ``None`` when the filename does not follow the
        ``policy-<company>-plan-<plan>-restructured.md`` convention.
    """
    match = POLICY_FILENAME_RE.search(os.path.basename(path))
    if match is None:
        return None
    return match.group('company').upper(), match.group('plan')


# Sorted so the order of all_documents is the same on every run.
md_files = sorted(glob.glob('./data/input_data/markdown/*-restructured.md'))
all_documents = []
for md_file in md_files:
    docs = UnstructuredMarkdownLoader(md_file).load()

    # The filename is the same for every document loaded from this file, so
    # parse it once rather than once per document.
    parsed = parse_policy_filename(md_file)
    for doc in docs:
        if parsed is not None:
            doc.metadata["company"], doc.metadata["plan"] = parsed
        else:
            # Unrecognised filename: record where the document came from.
            doc.metadata["source_file"] = md_file
    all_documents.extend(docs)

In [78]:
# --------------------------------------------------
# 3. Split the loaded documents into chunks
# --------------------------------------------------
# Splitter settings kept together so they are easy to find and tune:
# a chunk size of 1000 with an overlap of 200 between neighbouring chunks.
chunking_options = {"chunk_size": 1000, "chunk_overlap": 200}

text_splitter = RecursiveCharacterTextSplitter(**chunking_options)
docs_split = text_splitter.split_documents(all_documents)

In [None]:
# # check the chunks have correct metadata
# docs_split[600].metadata

In [None]:
# --------------------------------------------------
# 4. Create embeddings using embeddings from Azure OpenAI
# --------------------------------------------------
# NOTE(review): the next cell also assigns `embedding_model` (plain OpenAI).
# Under "Run All" that later assignment wins; run only the cell for the
# provider you intend to use.
AZURE_EMBEDDING_DEPLOYMENT = "text-embedding-3-small"  # Your Azure deployment name for embeddings
AZURE_OPENAI_API_VERSION = "2024-02-01"

# Endpoint and key are read from the environment, never hard-coded.
embedding_model = AzureOpenAIEmbeddings(
    azure_deployment=AZURE_EMBEDDING_DEPLOYMENT,
    openai_api_version=AZURE_OPENAI_API_VERSION,
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
)

In [None]:
# Define the OpenAI embedding model.
# NOTE(review): this rebinds `embedding_model`. If the Azure OpenAI cell above
# has been run, its model is replaced by this one; run only the cell for the
# provider you intend to use.
OPENAI_EMBEDDING_MODEL = "text-embedding-3-small"  # You can choose a different model if needed

# The API key is read from the environment, never hard-coded.
embedding_model = OpenAIEmbeddings(
    model=OPENAI_EMBEDDING_MODEL,
    openai_api_key=os.environ["openai_api_key"],
)

In [136]:
CHROMA_PERSIST_DIR = "./data/vector_data/chroma_db"  # Path to persist data


def chunk_id(doc):
    """Return a deterministic ID for a document chunk.

    The ID is the SHA-256 hex digest of the chunk's text plus its metadata
    (sorted by key), so the same chunk gets the same ID on every run.

    Args:
        doc: Object exposing ``page_content`` (str) and ``metadata`` (dict
            with string keys).

    Returns:
        A 64-character hexadecimal string.
    """
    fingerprint = doc.page_content + "\x1f" + repr(sorted(doc.metadata.items()))
    return hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()


# Key every chunk by its content hash. Without explicit IDs, each run of this
# cell adds all chunks again under fresh random IDs, so the persisted
# collection accumulates duplicates. With stable IDs a re-run overwrites the
# existing entries instead.
# setdefault keeps the first occurrence and drops exact duplicates, because
# repeated IDs inside a single batch are rejected.
unique_chunks = {}
for chunk in docs_split:
    unique_chunks.setdefault(chunk_id(chunk), chunk)

vector_store = Chroma.from_documents(
    documents=list(unique_chunks.values()),  # List of document chunks
    embedding=embedding_model,
    ids=list(unique_chunks.keys()),
    persist_directory=CHROMA_PERSIST_DIR,
)

# Persist the data. Newer Chroma integrations write to persist_directory
# automatically and no longer expose persist(), so only call it when present.
if hasattr(vector_store, "persist"):
    vector_store.persist()


In [None]:
######################## Multiple PDF processing End ####################

In [None]:
# # Create the vector store
# vector_store = FAISS.from_documents(docs_split, embedding_model)

# # Save FAISS index and document store
# faiss.write_index(vector_store.index, "./data/vector_data/faiss_index")
# with open("./data/vector_data/faiss_documents.pkl", "wb") as f:
#     pickle.dump(vector_store.docstore._dict, f)