In [3]:
print("OK")

OK


In [4]:
import os
# Move the working directory up one level so that the relative paths used
# later in the notebook (e.g. "data") resolve from the parent directory.
# NOTE(review): the target is relative to the *current* directory, so every
# re-run of this cell climbs one more level — run it once per kernel session.
os.chdir("../")

In [5]:
%pwd

'c:\\Users\\shara\\OneDrive\\Desktop\\programs\\git repos\\Medical chatbot\\AI-Medical-ChatBot'

In [6]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [7]:
def load_pdf_files(data):
    """
    Load every PDF found under a directory.

    Args:
        data: Directory to scan. It is handed to DirectoryLoader together
            with the glob pattern "**/*.pdf", and PyPDFLoader is used as
            the loader class for each match.

    Returns:
        Whatever DirectoryLoader.load() returns for that directory.
    """
    pdf_loader = DirectoryLoader(data, glob="**/*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [8]:
# "data" is a relative path, so it is resolved against the current working directory.
extracted_data = load_pdf_files("data")

In [9]:
len(extracted_data)

637

In [16]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Build stripped-down copies of the given documents.

    Each returned Document carries the original page_content and a metadata
    dict holding a single "source" key, whose value is taken from
    metadata.get("source") of the original. The input documents themselves
    are left untouched; a new list of new Document objects is returned.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [17]:
# Keep only the page content and the "source" metadata of each loaded document.
minimal_docs = filter_to_minimal_docs(extracted_data)

In [19]:
# Split document into small chunks

def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """
    Split documents into smaller chunks.

    Args:
        minimal_docs: Documents to split; passed unchanged to
            RecursiveCharacterTextSplitter.split_documents.
        chunk_size: Value forwarded as the splitter's chunk_size.
            Defaults to 500, the value that used to be hard-coded here.
        chunk_overlap: Value forwarded as the splitter's chunk_overlap.
            Defaults to 20, the value that used to be hard-coded here.

    Returns:
        The chunks produced by split_documents.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_docs)

In [20]:
text_chunks = text_split(minimal_docs)
# Display how many chunks the splitter produced.
len(text_chunks)

5859

In [22]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """
    Create the embedding model used to vectorise the text chunks.

    Returns:
        A HuggingFaceEmbeddings instance configured with the
        "sentence-transformers/all-MiniLM-L6-v2" model.
    """
    # NOTE(review): the recorded cell output points at this constructor call,
    # presumably a warning emitted by the library — confirm and address it.
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )

# Built once here; the same object is passed to the vector store cells below.
embedding = download_embeddings()

  embeddings = HuggingFaceEmbeddings(


In [25]:
from dotenv import load_dotenv
import os

load_dotenv()

True

In [100]:
def _require_env(name):
    """
    Return the value of the environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or empty. Failing here with
            the variable's name is clearer than the TypeError that
            `os.environ[name] = None` would raise further down.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set; add it to your .env file "
            "or export it before running this notebook."
        )
    return value


PINECONE_API_KEY = _require_env("PINECONE_API_KEY")
NVIDIA_API_KEY = _require_env("NVIDIA_API_KEY")

# Kept from the original cell: write the keys back into the process
# environment. Both values are guaranteed to be non-empty strings here, so
# these assignments can no longer fail with "str expected, not NoneType".
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["NVIDIA_API_KEY"] = NVIDIA_API_KEY

In [75]:
from pinecone import Pinecone
pinecone_api_key = PINECONE_API_KEY

# Pinecone client; used below to check for, create and open the index.
pc = Pinecone(api_key=pinecone_api_key)

In [76]:
from pinecone import ServerlessSpec

index_name = "medical-chatbot"

# Create the index only when it does not exist yet, so re-running this cell
# does not try to create it a second time.
# NOTE(review): dimension=384 has to equal the vector size produced by the
# embedding model configured earlier — change both together.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,  # Dimension of the embedding vectors
        metric="cosine",  # Similarity metric
        spec=ServerlessSpec(cloud="aws", region="us-east-1")  # Serverless configuration
    )

# Handle to the index named above (pre-existing or just created).
index = pc.Index(index_name)

In [77]:
from langchain_pinecone import PineconeVectorStore

# Embed every chunk with `embedding` and store the result in the Pinecone index.
# NOTE(review): the full chunk list is submitted on every run of this cell;
# re-running it presumably inserts the same content again under new ids —
# confirm, and prefer from_existing_index once the index is populated.
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    embedding=embedding,
    index_name=index_name
)

In [78]:
# Load existing index

from langchain_pinecone import PineconeVectorStore
# Connect to the index by name; no documents are passed in this call.

# Rebinds `docsearch` to a store backed by the existing index.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding
)

# Add more data to Pinecone index

In [None]:
# Placeholder document illustrating how extra content could be added to the store.
new_data = Document(
    page_content = "Content of page 1 of document 1",
    metadata = {"source": "document1.pdf"}
)
# Disabled on purpose: uncomment to actually send `new_data` to the vector store.
# docsearch.add_documents([new_data])

In [79]:
# Retriever over the vector store: similarity search, with k=3 results requested per query.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}
)

In [80]:
tes_query = retriever.invoke("What is acne?")

In [81]:
tes_query

[Document(id='225438c1-0e53-40bf-b153-fbfcc3840550', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='ff2cdfca-64f1-44b4-a6e3-4c60e737a329', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a womanâ€™s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'),
 Document(id='8f9a2a5a-e1d4-4bad-a852-a6d77f31ff39', metadata={'source': 'data\\Medical_book.pdf'}, page_content='Acidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when the\npores of the skin become clogged 

In [91]:
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [None]:
# Chat model used to generate answers.
# NOTE(review): no API key is passed explicitly; presumably ChatNVIDIA picks up
# NVIDIA_API_KEY from the environment — confirm.
chatmodel = ChatNVIDIA(model="meta/llama-3.3-70b-instruct")



In [98]:
# System message template: the instructions paragraph, a blank line, then the
# "{context}" placeholder that is filled in when the prompt is formatted.
system_prompt = "\n\n".join(
    (
        "You are a medical assistant. Use the context provided below to answer the user's question. "
        "If the answer is not contained within the context, strictly state that you do not know. "
        "Limit your response to 3 concise sentences.",
        "Context: {context}",
    )
)

# Two-message chat template: the system instructions followed by the user's
# question, which is substituted for the "{input}" placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}")
    ]
)

In [113]:
# Chain that feeds documents plus the question through `prompt` into `chatmodel`.
ques_ans_chain = create_stuff_documents_chain(chatmodel, prompt)
# Full RAG pipeline: `retriever` runs first and its results are handed to the chain above
# (presumably as the prompt's "{context}" value — confirm against the LangChain docs).
rag_chain = create_retrieval_chain(retriever, ques_ans_chain)

In [114]:
# The question is supplied under the "input" key, matching the "{input}" placeholder in the prompt.
response = rag_chain.invoke({"input": "What is acne?"})
# The generated text is read from the "answer" entry of the result.
print(response["answer"])

Acne is a common skin disease characterized by pimples on the face, chest, and back. It occurs when the pores of the skin become clogged with oil, dead skin cells, and bacteria. Acne is also known as acne vulgaris, which is the most common skin disease, affecting nearly 17 million people in the United States.
