In [2]:
pip install pyreadline3

Note: you may need to restart the kernel to use updated packages.


In [3]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [4]:
#Extract Data From the PDF File
def load_pdf_file(data):
    loader = DirectoryLoader(
        data,
        glob ="*.pdf",
        loader_cls = PyPDFLoader
        
    )
    
    documents = loader.load()
    return documents

In [5]:
extracted_data = load_pdf_file(data = 'D:\Medical-Chatbot\Data')

  extracted_data = load_pdf_file(data = 'D:\Medical-Chatbot\Data')


In [6]:
#Split the Data into Text Chunks
def text_split(extracted_data):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [7]:
text_chunks = text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 5859


In [8]:
from langchain.embeddings import HuggingFaceEmbeddings


In [9]:
#Download the Embeddings from Hugging Face
def download_hugging_face_embeddings():
    embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return embeddings

In [10]:
pip install sentence-transformers

Note: you may need to restart the kernel to use updated packages.


In [11]:

embeddings = download_hugging_face_embeddings()


  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
  from .autonotebook import tqdm as notebook_tqdm


In [12]:
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [13]:
from dotenv import load_dotenv
load_dotenv()
import os

In [14]:
PINECONE_API_KEY=os.environ.get('PINECONE_API_KEY')
OPENAI_API_KEY=os.environ.get('OPENAI_API_KEY')

In [15]:
from pinecone import Pinecone, ServerlessSpec

In [16]:
from pinecone import Pinecone, ServerlessSpec
import os

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))




index_name = "medicalbot"

pc.create_index(
    name=index_name,
    dimension=384, # Replace with your model dimensions
    metric="cosine", # Replace with your model metric
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)

PineconeApiException: (409)
Reason: Conflict
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2025-04', 'x-cloud-trace-context': '342ba1c65b45eb8fc9f554fa98edde09', 'date': 'Thu, 22 May 2025 05:31:52 GMT', 'server': 'Google Frontend', 'Content-Length': '85', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"ALREADY_EXISTS","message":"Resource  already exists"},"status":409}


In [17]:
os.environ['PINECONE_API_KEY'] = PINECONE_API_KEY
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

In [18]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
from langchain_pinecone import PineconeVectorStore


docsearch = PineconeVectorStore.from_documents(
    documents= text_chunks,
    index_name = index_name,
    embedding = embeddings
)


In [19]:
#Load existing index
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_existing_index(
    index_name = index_name,
    embedding = embeddings
)




In [20]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1c8fc4043e0>

In [21]:
retriever = docsearch.as_retriever(search_type = "similarity", search_kwargs ={"k":3})

In [22]:
retrieved_docs = retriever.invoke("What is Acne?")

In [23]:
retrieved_docs

[Document(id='9eaa5004-5970-4776-8e26-dce419cd9ef5', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'D:\\Medical-Chatbot\\Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='fe046580-4333-4dab-844d-395404428d4f', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'D:\\Medical-Chatbot\\Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='0c039c2d-8d39-47d7-baf4-d781740c093e', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '

In [24]:
from langchain_openai import OpenAI
llm = OpenAI(temperature = 0.4, max_tokens = 100)

In [25]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt =(
     "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)



prompt = ChatPromptTemplate.from_messages(
   [ ("system", system_prompt),
    ("human","{input}")
   ]
)


In [26]:
question_answer_chain = create_stuff_documents_chain(llm,prompt)
rag_chain = create_retrieval_chain(retriever,question_answer_chain)

In [27]:
response = rag_chain.invoke({"input": "What is Acne?"})
print(response["answer"])



Acne is a skin disorder that causes inflammation of the sebaceous glands. It is also known as acne vulgaris. It typically appears on the face and is characterized by the presence of pimples, blackheads, and whiteheads.


In [28]:
response = rag_chain.invoke({"input": "What is stats?"})
print(response["answer"])



I don't know.


In [29]:
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])



Acromegaly is a disorder caused by the overproduction of a chemical from the pituitary gland in the brain, leading to excessive growth in bone and soft tissue. It can also cause other health issues throughout the body. It is also known as gigantism when it occurs during childhood.
