In [20]:
# !pip install pinecone
# !pip show pinecone
# !pip install pinecone langchain openai pypdf tiktoken
# !pip install -U langchain-community
# !pip install python-dotenv
# !pip install -U python-docx lxml pandas
# !pip install unstructured

In [117]:
#Import Required libraries
from pinecone import Pinecone #pinecone: used for vector database storage
import pandas as pd #manage dataframe manupulations
from pathlib import Path #extract the files from folder
from dotenv import load_dotenv #load the key from .ecv

#Langchain: this python framework used to connect documents, LLMs embedding, vector databases.
from langchain_community.document_loaders import PyPDFLoader, TextLoader #To load docs
from langchain.document_loaders import UnstructuredXMLLoader #Toload xml files
from langchain.text_splitter import RecursiveCharacterTextSplitter # To chunck
from langchain.embeddings import OpenAIEmbeddings #Embed the chucks using OpenAI LLM
from langchain_openai import OpenAIEmbeddings #Initialization
from langchain_pinecone import PineconeVectorStore #Partner package Langchain pinecone
from uuid import uuid4 #unique id for chunks
from langchain_core.documents import Document #add documents into pinecone
from langchain.vectorstores import Pinecone as LangChainPinecone#Store the Embed vector

#Interactive with OpenAI (embeddings and LLM)
#from langchain.chat_models import ChatOpenAI # chat with openAI wrapper from Langchain
from langchain_openai import ChatOpenAI # chat with openAI wrapper from Langchain
from langchain.chains import RetrievalQA #To connect LLM with retriever

import os #handle keys

In [79]:
load_dotenv() #Load environment variables from .env file

openai_api_key = os.getenv("OPENAI_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pinecone_env = os.getenv("PINECONE_ENV")
index_name = "medassist-index"

In [112]:
from pinecone import Pinecone, ServerlessSpec
#OpenAI key (from .env)
os.environ["OPENAI_API_KEY"] = openai_api_key

pc = Pinecone(api_key="key")
index = pc.Index(index_name) #access the index in pinecone

In [18]:
data_folder = Path("source_docs") #Folder containing all files

all_docs = [] #This list will collect all parsed documents

for file_path in data_folder.glob("*"): #Loop through every file in that folder using glob("*")
    ext = file_path.suffix.lower() #get the file extension
    try:
        if ext == ".pdf":
            loader = PyPDFLoader(str(file_path)) # Create a loader for PDFs
            docs = loader.load() #Extract text and metadata

        elif ext in [".txt",".md"]:
            loader = TextLoader(str(file_path)) #Text loader
            docs = loader.load() #Extract text and metadata

        elif ext in[".xlsx",".xls"]:
            df_sheets = pd.read_excel(file_path, sheet_name=None) #takes all sheets
            docs = []
            for sheet_name, sheet_df in df_sheets.items():
                content = sheet_df.to_string() #convert DataFrame into string
                docs.append({
                    "page_content":content,
                    "metadata":{
                        "source":f"{file_path.name} - {sheet_name}"
                    }
                })
        elif ext in [".xml",".xsd"]:
            loader = UnstructuredXMLLoader(str(file_path)) #Uses lxml internally
            docs = loader.load() #Extract text and metadata
        else:
            print(f"Skipping unsupported file: {file_path.name}")
            continue
        
        all_docs.extend(docs)
        print (f" Loaded {file_path.name}")

    except Exception as e:
        print(f"Error loading {file_path.name}: {e}")

 Loaded cpt-category3-codes-long-descriptors.cleaned.pdf
 Loaded HCPC2025_JUL_ANWEB_Corrections_for_V3.xlsx
 Loaded HCPC2025_JUL_ANWEB_Transaction Report_v3.xlsx
 Loaded HCPC2025_JUL_ANWEB_v3.txt
 Loaded HCPC2025_JUL_ANWEB_v3.xlsx
 Loaded HCPC2025_recordlayout.txt
 Loaded ICD-10-CM FY25 Guidelines October 1, 2024.cleaned.pdf
 Loaded icd10cm_drug_2025.pdf
 Loaded icd10cm_drug_2025.xml
 Loaded icd10cm_drug_neoplasm.xsd
 Loaded icd10cm_eindex_2025.pdf
 Loaded icd10cm_eindex_2025.xml
 Loaded icd10cm_index.xsd
 Loaded icd10cm_index_2025.pdf
 Loaded icd10cm_index_2025.xml
 Loaded icd10cm_neoplasm_2025.pdf
 Loaded icd10cm_neoplasm_2025.xml
 Loaded icd10cm_tabular.xsd
 Loaded icd10cm_tabular_2025.pdf
 Loaded icd10cm_tabular_2025.xml
 Loaded NOC codes_JUL2025.xlsx
 Loaded proc_notes_JUL2025.txt


In [26]:
from langchain.schema import Document
clean_docs = []

for doc in all_docs:
    if isinstance(doc,dict):
        #If it's a dictionary, convert it to a langchain document
        converted = Document(
            page_content=doc["page_content"],
            metadata=doc["metadata"]
        )
        clean_docs.append(converted)
    else:
        #Already a Document object, just add it as is
        clean_docs.append(doc)

# Replace old list with cleaned list
all_docs = clean_docs

In [85]:
splitter = RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=50)
split_docs = splitter.split_documents(all_docs) #split into chunks
print(f" Split into {len(split_docs)} chunks")

 Split into 129192 chunks


In [109]:
print(pc.list_indexes().names())
print(pc.describe_index("medassist-index"))

['medassist-index']
{'deletion_protection': 'disabled',
 'dimension': 1536,
 'host': 'medassist-index-ikzyzex.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'medassist-index',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'},
 'tags': {'embedding_model': 'text-embedding-3-small'},
 'vector_type': 'dense'}


In [108]:
# Define the OpenAI embedding model
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

#Pass the pinecone APIkey and index name and OpenAI embedding model
vector_store = PineconeVectorStore(index = index, embedding= embedding_model) 

uuids_for_split_docs = [str(uuid4()) for _ in range(len(split_docs))] #Generate unique id for each chunk
batch_size = 50 #Choose a batch size

#Upload in batches to avoid Pinecone's 4MB limit
for i in range (0,len(split_docs), batch_size):
    batch_docs = split_docs[i:i+batch_size]
    batch_ids = uuids_for_split_docs[i:i+batch_size]
    try:
        vector_store.add_documents(documents=batch_docs,ids=batch_ids) # This is where chunks added to pinecone vectore store
        print (f" Upload batch {i // batch_size+1}:{len(batch_docs)} documents")
    except Exception as e:
        print (f" Failed on batch {i // batch_size + 1}: {e}")

print(f" Added {len(split_docs)} chunks to the Pinecone vector store '{index_name}'")

 Upload batch 1:50 documents
 Upload batch 2:50 documents
 Upload batch 3:50 documents
 Upload batch 4:50 documents
 Upload batch 5:50 documents
 Upload batch 6:50 documents
 Upload batch 7:50 documents
 Upload batch 8:50 documents
 Upload batch 9:50 documents
 Upload batch 10:50 documents
 Upload batch 11:50 documents
 Upload batch 12:50 documents
 Upload batch 13:50 documents
 Upload batch 14:50 documents
 Upload batch 15:50 documents
 Upload batch 16:50 documents
 Upload batch 17:50 documents
 Upload batch 18:50 documents
 Upload batch 19:50 documents
 Upload batch 20:50 documents
 Upload batch 21:50 documents
 Upload batch 22:50 documents
 Upload batch 23:50 documents
 Upload batch 24:50 documents
 Upload batch 25:50 documents
 Upload batch 26:50 documents
 Upload batch 27:50 documents
 Upload batch 28:50 documents
 Upload batch 29:50 documents
 Upload batch 30:50 documents
 Upload batch 31:50 documents
 Upload batch 32:50 documents
 Upload batch 33:50 documents
 Upload batch 34:50

In [125]:
#convert the vectorstore into a retriever
retriever = vector_store.as_retriever() #LLM "look up" relevant chunks from index

# Initialize OpenAI with model name
llm = ChatOpenAI (model="gpt-3.5-turbo")

# Create RAG chain this chain tells LLM to first retrieve relevant document from Pinecone
# Then answer the question using those documents.
qa_chain = RetrievalQA.from_chain_type(
    llm = llm, #Use the LLM
    retriever = retriever, #Use the retriever
    chain_type ="stuff" #concatenate retrieved docs and pass them to the LLM
)

#Ask a question to the RAG system
query = "Essential (primary) hypertension (comorbidity) Electrocardiogram"
Fullquery = f"Provide the result in JSON format like Code Type(ICD10/CPC/HCPC) - Code  - Description for this {query}"
# ST elevation (STEMI) myocardial infarction of unspecified site
# Essential (primary) hypertension (comorbidity) Electrocardiogram
# Routine ECG with at least 12 leads Catheterization of left heart (cardiac procedure)
# Ambulatory infusion pump, per day (if used for medication)

#Run the query through the RAG pipeline
# fetch relevant chunks from pinecone
# pass them as context to the LLM
# return the final answer
response = qa_chain.invoke(Fullquery)

#Print the result
print(response)

{'query': 'Provide the result in JSON format like Code Type(ICD10/CPC/HCPC) - Code  - Description for this Essential (primary) hypertension (comorbidity) Electrocardiogram', 'result': '{\n    "Code Type": "ICD-10",\n    "Code": "I10",\n    "Description": "Essential (primary) hypertension"\n}'}
