In [2]:
%pwd  # project working directory

'c:\\Users\\saivi\\OneDrive\\Desktop\\Medical-Chatbot\\research'

In [3]:
# Always work from the main project directory.

In [4]:
import os

# The notebook starts inside research/; step up to the main project directory
# so project-relative paths resolve. Guarding on the current folder name keeps
# the cell idempotent: re-running it no longer climbs one level further each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")  # change to main project directory

In [5]:
%pwd

'c:\\Users\\saivi\\OneDrive\\Desktop\\Medical-Chatbot'

In [6]:
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [7]:
#Extract data from PDF file
def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``; files are
            selected with the glob pattern ``*.pdf`` and read with
            ``PyPDFLoader``.

    Returns:
        The value returned by the loader's ``load()`` call.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()

In [8]:
# Load the PDFs from Data/ (a relative path, so it depends on the current working directory).
extracted_data=load_pdf_file(data="Data/")

In [9]:
#extracted_data

In [10]:
#Split the Data into Text Chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split, passed straight to the splitter's
            ``split_documents``.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
            Defaults to 500, the value that used to be hard-coded.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to 20,
            the value that used to be hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [11]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks=text_split(extracted_data)
print(f"Total Chunks: {len(text_chunks)}")

Total Chunks: 5859


In [12]:
from langchain.embeddings import HuggingFaceEmbeddings

In [13]:
#Download Embedding Model from HuggingFace
def download_huggingface_embeddings():
    """Build the HuggingFace embedding model used throughout this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_name)

In [14]:
# Instantiate the embedding model once; it is reused for indexing and for queries below.
embeddings = download_huggingface_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Sanity check: embed a sample string and print the vector length.
# This length must match the `dimension` used when the Pinecone index is created.
query_result=embeddings.embed_query("Hello world")
print("length of query result:", len(query_result))

length of query result: 384


In [16]:
#query_result

In [17]:
# Load variables from a .env file into the environment so the API keys can be read with os.getenv.
from dotenv import load_dotenv
load_dotenv()

True

In [18]:
# Read the API keys from the environment; os.getenv returns None when a variable is unset,
# so a missing key only surfaces later when the value is first used.
PINECONE_API_KEY=os.getenv("PINECONE_API_KEY")
GOOGLE_API_KEY=os.getenv("GOOGLE_API_KEY")

In [22]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

# Client for the Pinecone control plane, authenticated with the key read earlier.
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medical-chatbot"

# Creating an index that already exists fails, so only create it when it is
# missing. This keeps the cell safe to re-run (and safe under "Run All").
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # Dimension of the embeddings
        metric="cosine",  # Similarity metric
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [23]:
import os

# Publish both API keys through the process environment in one call.
os.environ.update(
    {
        "PINECONE_API_KEY": PINECONE_API_KEY,
        "GOOGLE_API_KEY": GOOGLE_API_KEY,
    }
)

In [24]:
# Embed each chunk and upsert the embeddings into the Pinecone index.
# NOTE(review): this presumably uploads every chunk again each time the cell runs —
# confirm, and skip this cell once the index is populated (the following cell
# connects to the existing index instead).
from langchain_pinecone import PineconeVectorStore
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    index_name=index_name
)

In [25]:
# Load the existing index by name, using the same embedding model for queries.
# This rebinds `docsearch`, replacing whatever the previous cell assigned.
from langchain_pinecone import PineconeVectorStore
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [26]:
# Display the vector store object to confirm it was created.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1d77bea5300>

In [27]:
# Similarity-search retriever configured with k=3 results per query.
retriever=docsearch.as_retriever(search_type="similarity",search_kwargs={"k": 3})

In [28]:
# Smoke-test the retriever with a sample medical question.
retrieved_docs = retriever.invoke("what is cardiac arrest?")

In [29]:
# Inspect the documents returned for the sample question.
retrieved_docs

[Document(id='61e328ff-74cf-4424-b4ef-c95851996010', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 421.0, 'page_label': '422', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\medical_book.pdf', 'total_pages': 637.0}, page_content='some cases, removal of a small portion of the heart (abla-\ntion), implantation of a pacemaker or a cardioverter\ndefibrillator, or maze surgery is needed.\nIf the heart rate cannot be quickly controlled, electri-\ncal cardioversion may be used. Cardioversion, the elec-\ntric shock to the chest wall, is usually performed emer-\ngencies. This device briefly suspends the heart’s activity\nand allows it to return to a normal rhythm.\nAblation destroys the heart tissue that causes the'),
 Document(id='bf07e7ad-622f-4dac-958f-dd762fc81320', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 366.0, 'page_label

In [30]:
# NOTE: `genai` here is an alias for the GoogleGenerativeAI class (not a module);
# calling it constructs the LLM wrapper used by the chains below.
from langchain_google_genai import GoogleGenerativeAI as genai
model = genai(
    model="gemini-1.5-flash",
    google_api_key=GOOGLE_API_KEY
)

In [31]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the question-answering chain. Adjacent string
# literals are concatenated with no separator, so every fragment that continues
# a sentence must end with a space. "{context}" is the placeholder that
# receives the retrieved documents.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise. "
    "\n\n"
    " {context} "
)

# Two-message chat template: the system instructions (which contain the
# {context} placeholder) followed by the user's question as {input}.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [32]:
# Build the answer-generation chain from the model and prompt, then wrap it
# with the retriever to form the full retrieval-augmented chain.
question_answer_chain=create_stuff_documents_chain(model,prompt)
rag_chain=create_retrieval_chain(retriever,question_answer_chain)

In [34]:
# Only "input" is supplied: the retrieval chain runs the retriever on the
# question and fills "context" itself, overwriting any "context" passed in.
# The previously passed `retrieved_docs` were fetched for a different question
# ("what is cardiac arrest?"), so that entry was both ignored and misleading.
response = rag_chain.invoke({"input": "What is Acne?"})

print(response["answer"])

Acne is a common skin disease characterized by pimples on the face, chest, and back.  It happens when skin pores become clogged with oil, dead skin cells, and bacteria.  Acne vulgaris is the medical term for common acne, affecting millions.

