In [1]:
# Trivial smoke test: prints a fixed string
print('Hello')

Hello


In [4]:
# IPython magic that reports the current working directory
%pwd

'e:\\GenAi\\Medical-Doctor-Chatbot-Using-Pinecone-and-Gemini-'

In [3]:
import os

# Later cells read the relative path 'data/', which is expected to sit in the
# parent directory of the notebook. Only step up when that directory is not
# visible from the current location, so that re-running this cell does not keep
# climbing one level higher on every execution.
# NOTE(review): assumes the notebook's own folder has no 'data' directory — confirm.
if not os.path.isdir('data'):
    os.chdir('../')

In [5]:
from langchain.document_loaders import PyPDFLoader , DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [6]:
# Extract data from the PDF files
def load_pdf(file_path):
    """Load the PDF documents found under a directory.

    Args:
        file_path: Directory handed to ``DirectoryLoader``; files are matched
            with the glob pattern ``**/*.pdf`` and read with ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files.
    """
    return DirectoryLoader(
        file_path,
        glob="**/*.pdf",
        loader_cls=PyPDFLoader,
    ).load()

In [7]:
# Load every PDF under the relative 'data/' directory.
# The variable keeps its original spelling because later cells reference it.
exctracted_data = load_pdf(file_path='data/')

In [10]:
#exctracted_data 


In [11]:
# Split the loaded documents into chunks
def split_text(data, chunk_size=500, chunk_overlap=20):
    """Split documents into smaller chunks.

    Args:
        data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Forwarded as the splitter's ``chunk_size``. Defaults to
            500, the value previously hard-coded here.
        chunk_overlap: Forwarded as the splitter's ``chunk_overlap``. Defaults
            to 20, the value previously hard-coded here.

    Returns:
        Whatever ``split_documents`` returns for ``data``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)


In [12]:
# Chunk the loaded documents and report how many chunks were produced
text_chunks = split_text(data=exctracted_data)
print(
    f"Number of chunks: {len(text_chunks)}"
)

Number of chunks: 5860


In [19]:
# Downloading the Hugging Face embedding model
from langchain.embeddings import HuggingFaceEmbeddings


def download_huggingface_model():
    """Create the embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance constructed with
        ``model_name="sentence-transformers/all-MiniLM-L6-v2"``.
    """
    model_id = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_id)



In [20]:
# Downloading the embedding model
# The instance is kept in `embeddings` and reused by the vector-store cells below.
embeddings = download_huggingface_model()
# Print the model configuration as a quick visual check
print(embeddings)

client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
) model_name='sentence-transformers/all-MiniLM-L6-v2' cache_folder=None model_kwargs={} encode_kwargs={} multi_process=False show_progress=False


In [22]:
#pip install ipywidgets


In [36]:
#Initializing the Pinecone vector store
from dotenv import load_dotenv
from pinecone.grpc import PineconeGRPC as Pinecone
import os

# Load variables from a local .env file into the process environment
load_dotenv()

# Both keys come from the environment; each is None when its variable is unset
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

# Name of the Pinecone index used throughout the notebook
index_name = "doctorbot"

# Create the Pinecone client and open a handle on the index
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(index_name)

In [28]:
# Store the vector embeddings of the chunks in the Pinecone index
from langchain_pinecone import PineconeVectorStore

# Embed the chunks with `embeddings` and write them to the index named by `index_name`.
# NOTE(review): this appears to embed and upload every chunk each time the cell
# runs, so re-running it may store duplicate vectors — confirm. Once the index is
# populated, the `from_existing_index` cell below looks like the one to use.
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    index_name=index_name,
)

In [None]:
# Connect to an existing index instead of re-uploading the documents

docsearch = PineconeVectorStore.from_existing_index(
    embedding=embeddings,
    index_name=index_name,
)

In [31]:
# Initialize the retriever: similarity search with k=3

retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [32]:
# Try the retriever on a sample question before it is wired into a chain
test_retriver = retriever.invoke("What is cancer?")

In [34]:
# Display the retrieved documents (rendered as the cell's output)
test_retriver 


[Document(id='23ac73f2-4070-40f9-bafc-22a96ee95de7', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 184.0, 'page_label': '185', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='KEY TERMS\nBiopsy—A procedure in which a small piece of\nbody tissue is removed and examined under a\nmicroscope for cancer.\nChemotherapy —A cancer treatment in which\ndrugs delivered into the blood stream kill cancer\ncells or make them more vulnerable to radiation\ntherapy.\nHuman papillomavirus (HPV)—A virus with many\nsubtypes, some of which cause cell changes that\nincrease the risk of certain cancers.\nHuman immunodeficiency virus (HIV)—The virus\nthat causes acquired immune deficiency syn-'),
 Document(id='141b6c16-7cf8-4c94-abe2-393338a23d34', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 5

In [37]:
# Initializing the LLM model
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chat model; the API key is the one read from the environment earlier
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0.7,
    google_api_key=GOOGLE_API_KEY,
)


In [38]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the answering model. The text ends with the
# {context} placeholder, which is left for the prompt template to fill in.
system_prompt = "".join(
    [
        "You are an assistant for question-answering tasks. ",
        "Use the following pieces of retrieved context to answer the question. ",
        "If you don't know the answer, say that you don't know. ",
        "Use three sentences maximum and keep the answer concise.",
        "\n\n",
        "{context}",
    ]
)


# Two-message chat prompt: the system instructions, then the user's question,
# which is substituted for the {input} placeholder.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [39]:
# Chain that combines the LLM with the chat prompt defined above
question_answer_chain = create_stuff_documents_chain(llm, prompt)
# Retrieval chain: pairs the retriever with the question-answering chain
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [42]:
# Run the retrieval chain on a sample question, passed under the "input" key
response = rag_chain.invoke({"input": "what is cancer?"})
# The generated text is read from the "answer" key of the result
print(response["answer"])

Cancer is caused by the development of malignant cells. Cancer cells are characterized by uncontrolled division. In breast cancer, the malignant cells originate in the lining of the milk glands or ducts of the breast.
