In [9]:
%pwd

'S:\\Projects\\Medical-ChatBot'

In [10]:
import os 
# Raw string: "\P" and "\M" are not valid escape sequences, so the plain
# literal triggers an invalid-escape warning on current Python versions.
# NOTE(review): this is a machine-specific absolute path; the %pwd output
# above suggests the kernel already starts in this directory - confirm
# whether the chdir is still needed.
os.chdir(r"S:\Projects\Medical-ChatBot")

In [11]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Extract text from PDF files
def load_pdf_files(data):
    """
    Load the PDF files found under a directory.

    ``data`` is the directory path handed to ``DirectoryLoader``; entries
    matching the ``*.pdf`` glob are read with ``PyPDFLoader``.

    Returns whatever ``DirectoryLoader.load()`` produces for that directory.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()


In [13]:
extracted_data = load_pdf_files("data")

In [14]:
len (extracted_data)

637

In [16]:
from typing import List
from langchain_core.documents import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Build stripped-down copies of the given documents.

    Every returned Document carries the original ``page_content`` and a
    metadata dict holding only the ``source`` entry (``None`` when the input
    metadata has no ``source`` key). The input documents are not modified.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [17]:
minimal_docs = filter_to_minimal_docs(extracted_data)

In [18]:
len (minimal_docs)

637

In [19]:
# Split the documents into smaller chunks
def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """
    Split documents into smaller, overlapping chunks.

    Args:
        minimal_docs: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk size given to the splitter. Defaults to
            500, the value that used to be hard-coded here.
        chunk_overlap: Overlap between consecutive chunks given to the
            splitter. Defaults to 20, the value that used to be hard-coded.

    Returns:
        The chunked documents returned by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(minimal_docs)

In [20]:
texts_chunk = text_split(minimal_docs)
print(f"Number of chunks: {len(texts_chunk)}")

Number of chunks: 5859


In [21]:
# texts_chunk

In [22]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """
    Build the HuggingFace embedding wrapper used by this notebook.

    Returns a ``HuggingFaceEmbeddings`` instance configured with the
    ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

embedding = download_embeddings()

  embeddings = HuggingFaceEmbeddings(


In [23]:
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [24]:
vector = embedding.embed_query("Hello Embedding Model")
# vector

In [25]:
print( "Vector length:", len(vector))

Vector length: 384


In [26]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [27]:
# Read the Pinecone key from the process environment (populated by the
# load_dotenv() call earlier in the notebook).
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Fail fast with a readable message: os.getenv returns None for a missing
# variable, and assigning None into os.environ raises an opaque TypeError.
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to your .env file or environment."
    )

# Kept from the original cell: write the value back to the process environment.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [28]:
from pinecone import Pinecone 
pinecone_api_key = PINECONE_API_KEY

pc = Pinecone(api_key=pinecone_api_key)

In [29]:
pc

<pinecone.pinecone.Pinecone at 0x263d232b9d0>

In [32]:
from pinecone import ServerlessSpec
 

index_name = "medical-chatbot"

# Create the serverless index only when it does not exist yet, so re-running
# this cell is safe.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        metric="cosine",  # cosine similarity
        dimension=384,  # must equal the embedding vector length printed above
        spec=ServerlessSpec(region="us-east-1", cloud="aws"),
    )

# Handle to the index, whether it was just created or already existed.
index = pc.Index(index_name)

In [33]:
from langchain_pinecone import PineconeVectorStore

# Upload the chunks only when the index is still empty. from_documents()
# inserts a fresh record for every chunk on every call, so re-running this
# cell used to store the same passages again under new ids (visible as
# identical results with different ids in the retrieval output below).
index_stats = index.describe_index_stats()

if index_stats.total_vector_count > 0:
    # Vectors are already stored: attach to the index without re-uploading.
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embedding,
    )
else:
    # First run: embed every chunk and upsert it into the index.
    docsearch = PineconeVectorStore.from_documents(
        documents=texts_chunk,
        embedding=embedding,
        index_name=index_name,
    )

In [34]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [35]:
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs

[Document(id='f0732e42-1470-4ea1-b9ce-742de9bd3f50', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='3f2a93c1-9673-43dd-b44e-bb1fbb807549', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='9001c769-5ca4-4385-a022-1bc5ef0fc74c', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a womanâ€™s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25')]

In [36]:
# Read the Gemini key from the process environment (populated by the
# load_dotenv() call earlier in the notebook).
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# Fail fast with a readable message: os.getenv returns None for a missing
# variable, and assigning None into os.environ raises an opaque TypeError.
if not GEMINI_API_KEY:
    raise RuntimeError(
        "GEMINI_API_KEY is not set; add it to your .env file or environment."
    )

# Kept from the original cell: write the value back to the process environment.
os.environ["GEMINI_API_KEY"] = GEMINI_API_KEY

In [None]:
# print (GEMINI_API_KEY) 


In [None]:
# from google import genai

# # The client gets the API key from the environment variable `GEMINI_API_KEY`.
# client = genai.Client()

# response = client.models.generate_content(
#     model="gemini-2.5-flash", contents="Explain how AI works in a few words"
# )
# print(response.text)

AI learns patterns from data to make decisions or perform tasks.


In [None]:
# from langchain.chains import create_retrieval_chain
# from langchain.chains.combine_documents import create_stuff_documents_chain
# from langchain_core.prompts import ChatPromptTemplate

In [None]:
# system_prompt = (
#     "You are an Medical assistant for question-answering tasks. "
#     "Use the following pieces of retrieved context to answer "
#     "the question. If you don't know the answer, say that you "
#     "don't know. Use three sentences maximum and keep the "
#     "answer concise."
#     "\n\n"
#     "{context}"
# )


# prompt = ChatPromptTemplate.from_messages(
#     [
#         ("system", system_prompt),
#         ("human", "{input}"),
#     ]
# )


In [None]:
# question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
# rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
# response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
# print(response["answer"])

In [37]:
# import os
# from langchain_google_genai import ChatGoogleGenerativeAI
# from langchain_pinecone import PineconeVectorStore
# from langchain_classic.chains import create_retrieval_chain
# from langchain_classic.chains.combine_documents import create_stuff_documents_chain
# from langchain_core.prompts import ChatPromptTemplate

# # 1. Configuration
# os.environ["GOOGLE_API_KEY"] = os.getenv("GEMINI_API_KEY")
# os.environ["PINECONE_API_KEY"] = os.getenv("PINECONE_API_KEY")

# # 2. Initialize Gemini 1.5 Flash
# llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0)

# # 3. Define the Prompt (Medical Assistant Persona)
# system_prompt = (
#     "You are a professional Medical Assistant. "
#     "Use the provided context to answer the user's question accurately. "
#     "If the answer isn't in the context, state that you don't know. "
#     "Limit your response to three concise sentences."
#     "\n\n"
#     "Context: {context}"
# )

# prompt = ChatPromptTemplate.from_messages([
#     ("system", system_prompt),
#     ("human", "{input}"),
# ])

# # 4. Create the RAG Chain
# # Using your existing 'retriever' from Pinecone
# combine_docs_chain = create_stuff_documents_chain(llm, prompt)
# rag_chain = create_retrieval_chain(retriever, combine_docs_chain)

# # 5. Run the Query
# query = "What is Acne and how is it treated?"
# response = rag_chain.invoke({"input": query})

# print(f"Medical Assistant: {response['answer']}")

In [41]:
import os
import google.generativeai as genai

# 1. Setup API Key
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

# 2. Initialize the Model
model = genai.GenerativeModel("gemini-2.5-flash")

# 3. Retrieval Step (uses the Pinecone-backed `retriever` built earlier)
query = "What is Acne?"
docs = retriever.invoke(query)

# Drop repeated passages before building the context: the retriever can
# return the same text under different record ids, and sending it twice only
# wastes prompt space. dict.fromkeys keeps the first occurrence of each
# passage and preserves the retrieval order.
unique_passages = list(dict.fromkeys(d.page_content for d in docs))
context_text = "\n\n".join(unique_passages)

# 4. Prompt: instructions, retrieved context and the question in one string
prompt = f"""
You are a Medical assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, say that you don't know. 
Use three sentences maximum and keep the answer concise.

Context:
{context_text}

Question: 
{query}

Answer:
"""

# 5. Generate Response
response = model.generate_content(prompt)

print("--- Medical Assistant Response ---")
print(response.text)

--- Medical Assistant Response ---
Acne is a general name for a skin disorder. This condition occurs when the sebaceous glands become inflamed.
