In [147]:
import os
os.chdir("../")

In [158]:
%pwd

'/home/user/workspace'

In [165]:
import os
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader

def load_pdf_files(data_path):
    # Ensure the path is absolute and correct
    abs_path = os.path.abspath(data_path)
    print(f"Searching in: {abs_path}")
    
    loader = DirectoryLoader(
        abs_path, 
        glob="*.pdf", # or "**/*.pdf" for subfolders
        loader_cls=PyPDFLoader
    )

    documents = loader.load()
    return documents

extracted_data = load_pdf_files("github.com/spha-code/Medical-Bot/data")

Searching in: /home/user/workspace/github.com/spha-code/Medical-Bot/data


In [None]:
extracted_data

In [166]:
len(extracted_data)

637

In [None]:
from typing import List
from langchain_core.documents import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """"
    Given a list of documents, return a new list of Document objects
    containing only 'source' in metadata and the original page_content.
    """                           
                        
    minimal_docs: List[Document] = []
    for doc in docs:
        src = doc.metadata.get("source")
        minimal_docs.append(
            Document(
            page_content=doc.page_content,
            metadata={"source": src}
            )
        )
    return minimal_docs


In [None]:
minimal_docs = filter_to_minimal_docs(extracted_data)

In [None]:
minimal_docs

In [None]:
#Chunking

from langchain_text_splitters import RecursiveCharacterTextSplitter   # <- add this

def text_split(minimal_docs):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=20,
        length_function=len
    )
    return text_splitter.split_documents(minimal_docs)


In [None]:
    texts_chunks = text_split(minimal_docs)
    print(f"Number of Chunks: {len(texts_chunks)}")

Number of Chunks: 5860


In [None]:
texts_chunks

In [None]:
#Save embedding model producing 384 length Arrays
from pathlib import Path
from langchain_huggingface import HuggingFaceEmbeddings

# keep the model inside the same dir as the notebook:  root/research/models
models_dir = Path.cwd() / "models"
models_dir.mkdir(exist_ok=True)

MODEL_NAME = "ibm-granite/granite-embedding-small-english-r2"

def download_embeddings():
    embeddings = HuggingFaceEmbeddings(
        model_name=MODEL_NAME,
        cache_folder=str(models_dir),
        multi_process=False,
        encode_kwargs={"normalize_embeddings": True}
    )
    return embeddings

embedding = download_embeddings()
print("Model ready and cached at:", models_dir / MODEL_NAME.replace("/", "_"))

Model ready and cached at: /home/user/workspace/github.com/spha-code/Medical-Bot/models/ibm-granite_granite-embedding-small-english-r2


In [None]:
#Test the embedding model
vector = embedding.embed_query("Hello World")
print(vector)



[0.021002864465117455, -0.02443564310669899, -0.05162789300084114, -0.01234542578458786, -0.023628108203411102, -0.005379772745072842, -0.011701415292918682, -0.03672017529606819, -0.0016488833352923393, -0.06954658776521683, 0.0010515388567000628, 0.052037760615348816, 0.01886276714503765, 0.017806988209486008, 0.03734371438622475, 0.03196382895112038, 0.03222160413861275, 0.026957495138049126, -0.046651504933834076, 0.03092987649142742, 0.04854482412338257, -0.011765575967729092, 0.011466559022665024, 0.04000798240303993, -0.027761615812778473, 0.026947036385536194, 0.007618855219334364, 0.03420644253492355, -0.054677609354257584, 0.13453271985054016, -0.03166363015770912, -0.020482053980231285, -0.010511557571589947, 0.0005063044955022633, 0.04886968433856964, 0.009730384685099125, 0.026459503918886185, 0.006700490601360798, 0.028900211676955223, 0.006278106011450291, -0.0627150684595108, -0.022243812680244446, -0.047854483127593994, 0.023715773597359657, -0.004469519946724176, -0.0

In [None]:
print(len(vector))

384


In [None]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [None]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
os.environ["GEMINI_API_KEY"] = GEMINI_API_KEY

In [116]:
from pinecone import Pinecone
pinecone_api_key = PINECONE_API_KEY
pc = Pinecone(api_key=pinecone_api_key)

In [118]:
from pinecone import ServerlessSpec
index_name = "medical-chatbot" #This creates an Index in pinecone.io


In [119]:
if not pc.has_index(index_name):
    pc.create_index(
        name = index_name, # Index Container "medical-chatbot"
        dimension=384,
        metric = "cosine", # Similarity Metric: cosine (how to compare vectors)
        spec=ServerlessSpec(cloud="aws", region="us-east-1") # Location: AWS us-east-1 (eu not available in Free plan)
    )
index = pc.Index(index_name)


In [120]:
#Store Vector in Pincone Index

from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents = texts_chunks,
    embedding = embedding,
    index_name = index_name
)

In [121]:
#Load an existing Index

from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_existing_index(
    index_name = index_name,
    embedding = embedding
)

In [122]:
#Add more data (e.g. String or PDF) to an existing Pinecone Index

AddDoc = Document(
    page_content="Add this piece of Data to the Pinecone Index",
    metadata={"source" : "Manual String Insert"}
    )

In [123]:
docsearch.add_documents(documents = [AddDoc])

['7c6cb16f-3dd1-443f-8925-291531fbb48b']

In [124]:
#Create the Retrieval

retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k" : 3})

In [125]:
retrieved_docs = retriever.invoke("What is a headache?")
retrieved_docs



In [None]:
#Connect the OPENAI LLM

from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model="gpt-5-nano", use_responses_api=True)


In [None]:
#Diagnostic Script to list models available to API Key
import google.generativeai as genai
import os

# Set your key
genai.configure(api_key=GEMINI_API_KEY)

print("Available models for your API key:")
for m in genai.list_models():
    if 'generateContent' in m.supported_generation_methods:
        print(f"-> {m.name}")

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    google_api_key=GEMINI_API_KEY
)

In [129]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_classic.chains import create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain

In [None]:
system_prompt = (
    "You are a medical assistant for question-answering tasks"
    "Use the following pieces of retrieved context to answer"
    "the question. If you don't know the answer, say that you"
    "don't know. Use three sentences maximum and keep the"
    "answer concise"
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt), #sytem prompt given above
    ("human", "{input}") #human input through chatbot
])



In [131]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [134]:
response = rag_chain.invoke({"input" : "List Antimigraine drugs"})
print(response["answer"])

The provided text defines antimigraine drugs as medicines used to prevent or reduce the severity of migraine headaches, but it does not list any specific drugs.
