In [None]:
%pwd

'c:\\Users\\Muhammad Nasir\\Desktop\\MedicalChatbot\\trail'

In [1]:
import os
os.chdir("../")

In [2]:
%pwd

'c:\\Users\\Muhammad Nasir\\Desktop\\MedicalChatbot'

In [3]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [4]:
#Extract Data From the PDF File
def load_pdf_file(data):
    loader= DirectoryLoader(data,
                            glob="*.pdf",
                            loader_cls=PyPDFLoader)

    documents=loader.load()

    return documents

In [5]:
extracted_data=load_pdf_file(data='Data/')

In [6]:
#Split the Data into Text Chunks
def text_split(extracted_data):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks=text_splitter.split_documents(extracted_data)
    return text_chunks

In [7]:
text_chunks=text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 5860


In [8]:
from langchain.embeddings import HuggingFaceEmbeddings

In [9]:
!pip install -U sentence-transformers



In [10]:
#Download the Embeddings from Hugging Face
def download_hugging_face_embeddings():
    embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return embeddings


In [11]:
embeddings = download_hugging_face_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
  from .autonotebook import tqdm as notebook_tqdm


In [12]:
len(embeddings.embed_query("Hello"))

384

In [25]:
from dotenv import load_dotenv
load_dotenv()

True

In [None]:
PINECONE_API_KEY=os.environ.get('PINECONE_API_KEY')

In [None]:
# from pinecone import Pinecone, ServerlessSpec

# pc = Pinecone(api_key=PINECONE_API_KEY)

# index_name = "medicalbot"


# pc.create_index(
#     name=index_name,
#     dimension=384, 
#     metric="cosine", 
#     spec=ServerlessSpec(
#         cloud="aws", 
#         region="us-east-1"
#     ) 
# ) 



In [18]:
index_name = "medicalbot"

In [27]:
import os
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
groq_api_key = os.getenv("GROQ_API_KEY")

In [19]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings, 
)

In [20]:
# Load Existing index 

from langchain_pinecone import PineconeVectorStore
# Embed each chunk and upsert the embeddings into your Pinecone index.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [21]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [22]:
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs

[Document(id='422c211b-b974-4b3f-bdd9-e1356bb935ee', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='47ef2d61-e5f3-4ba6-86aa-cde2e625ce61', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 38.0, 'page_label': '39', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed.(Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM -

In [28]:
from langchain_groq import ChatGroq
llm=ChatGroq(groq_api_key=groq_api_key,model_name="deepseek-r1-distill-llama-70b",temperature=0.4)

In [33]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer or the answer is out of context, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [34]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [35]:
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])

<think>
Okay, so I need to figure out what acromegaly and gigantism are. Let me start by looking at the context provided. 

First, I see that acromegaly is defined as a disorder caused by an abnormal release of a chemical from the pituitary gland. This chemical is probably growth hormone since it's related to growth issues. It causes increased growth in bone and soft tissues and other body disturbances. 

Then, there's a mention that when the abnormality occurs after bone growth stops, it's called acromegaly. So, if the issue happens before the bones stop growing, that must be gigantism. That makes sense because gigantism is associated with excessive growth during childhood.

Looking further, the context says acromegaly is rare, affecting about 50 in a million people, and symptoms develop gradually, often diagnosed in middle age. It affects both men and women. 

So putting it together: Acromegaly is when the pituitary gland releases too much growth hormone after normal growth has stopp

In [36]:
response = rag_chain.invoke({"input": "What is stats?"})
print(response["answer"])

<think>
Okay, so I'm trying to figure out what "stats" refers to in the given context. The user provided a few paragraphs, but they're all about blood counts and heart conditions, specifically mentioning a complete blood count (CBC) and something about athletic heart syndrome. 

The term "stats" isn't directly mentioned in the context. It could be short for "statistics," but looking through the text, the only numbers I see are related to SUD (Sudden Unexplained Death) rates, like seven incidents per 100,000 people under 35, and about 25 athletes annually. 

But wait, the question is asking about "stats," which isn't explained here. The context talks about blood tests and heart conditions, not statistics in general. So, without more information, I can't determine what "stats" refers to in this specific context. It might be something else entirely not covered in the provided text.

I should probably say I don't know because "stats" isn't defined or discussed in the given context. It's po