In [12]:
from langchain.document_loaders import PyMuPDFLoader , DirectoryLoader,PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

In [2]:
def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for files matching
        ``*.pdf``, each read with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob='*.pdf', loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [7]:
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
DATA_DIR = '/home/somanathan/Desktop/chat_bot/Data'
extracted_data = load_pdf_file(data=DATA_DIR)

In [9]:
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500,
            the value that was previously hard-coded.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            20, the value that was previously hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [10]:
# Chunk the loaded PDF documents; these chunks are what gets embedded and indexed later.
text_chunks = text_split(extracted_data)

In [13]:
def download_hugging_face_embeddings():
    """Create the embedding model used throughout the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    return HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [14]:
# Instantiate the embedding model once and reuse it for indexing and querying.
embeddings = download_hugging_face_embeddings()

  from tqdm.autonotebook import tqdm, trange


README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

In [18]:
# Sanity check: embed a short string and show the vector length.
# The Pinecone index dimension must be set to this same length.
res = embeddings.embed_query('hello world')
len(res)

384

In [21]:
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if present) so the API key is never hard-coded here.
load_dotenv()

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index_name = "medicalchatbot"

# Creating an index whose name already exists fails, so create it only when it is
# missing. This keeps the cell safe to re-run (Restart Kernel -> Run All).
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the embedding length (len(res) == 384 for this model)
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

# Display the index description whether it was just created or already existed.
pc.describe_index(index_name)

{
    "name": "medicalchatbot",
    "metric": "cosine",
    "host": "medicalchatbot-lsay9ft.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [22]:
from langchain_pinecone import PineconeVectorStore

# Embed every chunk and store it in the Pinecone index named by `index_name`.
# NOTE(review): re-running this cell presumably uploads the chunks again; confirm
# whether that creates duplicate vectors before executing it more than once.
docsearch = PineconeVectorStore.from_documents(
    embedding=embeddings,
    index_name=index_name,
    documents=text_chunks,
)

In [23]:
# Attach to the existing index without uploading anything; rebinds `docsearch`.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
)

In [24]:
# Similarity-search retriever configured with k=3.
retriever = docsearch.as_retriever(
    search_type='similarity',
    search_kwargs={"k": 3},
)

In [25]:
# Smoke-test the retriever on its own, before wiring it into the RAG chain.
result = retriever.invoke("what is Acne")

In [26]:
# Show the retrieved Document objects (id, metadata, page_content) for inspection.
result

[Document(id='7b6f8812-51a6-419b-b2dc-8f9ff815305e', metadata={'page': 37.0, 'source': '/home/somanathan/Desktop/chat_bot/Data/Medical_book.pdf'}, page_content='Acidosis seeRespiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when thepores of the skin become clogged with oil, dead skincells, and bacteria.\nDescription\nAcne vulgaris, the medical term for common acne, is'),
 Document(id='c7f95e2e-6faf-4673-bd74-5f4bf5c508a0', metadata={'page': 38.0, 'source': '/home/somanathan/Desktop/chat_bot/Data/Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25Acne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceousglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)GEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'),
 Document(id='26

In [27]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Settings passed to the chat-model constructor, collected in one place.
# NOTE(review): presumably the Google API key is read from the environment; confirm.
llm_settings = {
    "model": "gemini-2.0-flash",
    "temperature": 0,
    "max_tokens": None,
    "timeout": None,
    "max_retries": 2,
}
llm = ChatGoogleGenerativeAI(**llm_settings)

In [28]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate  

In [29]:
# System message for the RAG chain. `{context}` is a template placeholder that
# is filled in when the prompt is formatted.
# Wording fixes: "retieved" -> "retrieved", and the fallback instruction now
# contains its verb ("say that you don't know").
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"
)

In [30]:
# Two-message chat template: the system instructions, then the user's question
# supplied through the `{input}` placeholder.
chat_messages = [
    ('system', system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(chat_messages)

In [31]:
# Build the document-combining chain from the LLM and prompt, then wrap it with
# the retriever to form the full RAG chain.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [39]:
# Run the full chain on one question and print the raw result dict, which
# includes the question and the retrieved context documents.
question = "What is Brain biopsy and its uses"
response = rag_chain.invoke({'input': question})
print(response)

{'input': 'What is Brain biopsy and its uses', 'context': [Document(id='2b4c53a0-9f6a-4b97-b5e5-4143b179324e', metadata={'page': 588.0, 'source': '/home/somanathan/Desktop/chat_bot/Data/Medical_book.pdf'}, page_content='What You Need to Know About Brain Tumors. <http://rex.\nnci.nih.gov/WTNK_PUBS/brain/index.htm (28 Septem-ber 1998).\nRosalyn Carson-DeWitt, M.D.\nBreast biopsy\nDefinition\nA breast biopsy is removal of breast tissue for exam-\nination by a pathologist. This can be accomplished surgi-cally, or by withdrawing tissue through a needle.\nPurpose\nA biopsy is recommended when a significant\nabnormality is found, either on physical examination'), Document(id='15aa4fec-f249-4965-ba69-7adc4cd5e11b', metadata={'page': 580.0, 'source': '/home/somanathan/Desktop/chat_bot/Data/Medical_book.pdf'}, page_content='tice of Neurology , Ed. Martin Samuels and Steven Feske.\nNew York: Churchill Livingstone, 1996.\nWispelwey, Brian, and Carole A. Sable. “Intracranial Suppura-\ntion.” In Cur

In [38]:
# Show only the generated answer text from the chain's result dict.
response["answer"]

'A brain biopsy is the removal of a small piece of brain tissue to diagnose abnormalities. These abnormalities can include Alzheimer’s disease, tumors, infection, or inflammation. By examining the tissue sample, doctors can guide diagnosis and treatment.\n'