In [2]:
# Show the kernel's working directory before changing it (the notebook starts in research/).
%pwd

'c:\\Users\\H P\\Desktop\\Langchain Projects\\Medical Chatbot Agent\\Medical-Chatbot-Agent\\research'

In [3]:
import os

# Move from the research/ folder up to the project root so that relative paths
# such as "Data/" resolve. A bare os.chdir("../") climbs one more level every
# time the cell is re-run; the guard makes the cell idempotent.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
# Confirm the working directory is now the project root.
%pwd

'c:\\Users\\H P\\Desktop\\Langchain Projects\\Medical Chatbot Agent\\Medical-Chatbot-Agent'

In [5]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [6]:
# Extracting Data from the PDF File

def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``; files are
            selected with the glob pattern ``*.pdf`` and parsed with
            ``PyPDFLoader``.

    Returns:
        The documents returned by ``DirectoryLoader.load()``.
    """
    pdf_loader = DirectoryLoader(data, glob = "*.pdf", loader_cls = PyPDFLoader)
    return pdf_loader.load()

In [7]:
# Read every PDF under Data/ (relative to the project root).
extracted_data = load_pdf_file("Data/")

In [None]:
# Preview only the first few documents; displaying the whole list would dump
# every loaded page into the notebook output.
extracted_data[:3]

In [9]:
# Splitting the Data into Text Chunks

def text_split(extracted_data, max_chunks = 9000, chunk_size = 500, chunk_overlap = 20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split (as returned by ``load_pdf_file``).
        max_chunks: Upper bound on the number of chunks returned; only the
            first ``max_chunks`` chunks are kept. Pass ``None`` to keep all.
        chunk_size: ``chunk_size`` forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` forwarded to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        A list with at most ``max_chunks`` chunks, in splitter order.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
    )
    text_chunks = text_splitter.split_documents(extracted_data)
    # Slicing with None keeps everything, so max_chunks=None disables the cap.
    return text_chunks[:max_chunks]

In [10]:
# Chunk the loaded documents with the default settings and report how many we kept.
text_chunks = text_split(extracted_data = extracted_data)
print(f"Length of Text Chunks {len(text_chunks)}")

Length of Text Chunks 9000


In [None]:
# Preview only the first few chunks instead of dumping the whole list
# (thousands of entries) into the notebook output.
text_chunks[:3]

In [12]:
from langchain.embeddings import HuggingFaceEmbeddings

In [13]:
# Downloading the Embeddings from Hugging Face

def download_hugging_face_embeddings(model_name = 'sentence-transformers/all-MiniLM-L6-v2'):
    """Build the Hugging Face embedding model used for indexing and querying.

    Args:
        model_name: Hugging Face model identifier passed to
            ``HuggingFaceEmbeddings``. The default model yields 384-dimensional
            vectors, which matches the ``dimension = 384`` of the Pinecone
            index created in this notebook; a different model needs an index
            of matching dimension.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name = model_name)

In [14]:
# Instantiate the embedding model once; it is reused for indexing and for queries.
embeddings = download_hugging_face_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-MiniLM-L6-v2')
  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Sanity check: embed a short string and report the vector length.
query_result = embeddings.embed_query(text = "Hello World")
print(f"Length {len(query_result)}")

Length 384


In [16]:
import os

from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec

# SECURITY: an API key was previously hardcoded in this cell. Treat that key as
# leaked and rotate it in the Pinecone console. The key is now read from the
# environment (optionally populated from a local .env file).
load_dotenv()
pinecone_api_key = os.getenv("PINECONE_API_KEY")
if not pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; define it in the environment or in a .env file."
    )

pc = Pinecone(api_key = pinecone_api_key)

index_name = "medicalbot"

# Create the index only when it does not exist yet, so the cell can be re-run.
# dimension must equal the embedding length (384 for the model used above).
if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        dimension = 384,
        metric = "cosine",
        spec = ServerlessSpec(
            cloud = "aws",
            region = "us-east-1"
        )
    )

# Display the index description whether it was just created or already existed.
pc.describe_index(index_name)

{
    "name": "medicalbot",
    "metric": "cosine",
    "host": "medicalbot-lu6hoyy.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [17]:
import os
from dotenv import load_dotenv

# Pull variables from a local .env file into the process environment, then
# read the two keys this notebook needs (each is None when not defined).
load_dotenv()

PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [18]:
# Re-export the keys into os.environ for the client libraries used below.
# Assigning None into os.environ raises an opaque TypeError, so report a
# missing key explicitly instead.
for _key_name, _key_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("OPENAI_API_KEY", OPENAI_API_KEY),
):
    if _key_value is None:
        raise RuntimeError(
            f"{_key_name} is not set; define it in the environment or in a .env file."
        )
    os.environ[_key_name] = _key_value

In [19]:
# Embed each chunk and upsert the embeddings into the Pinecone index.
from langchain_pinecone import PineconeVectorStore

# Upserting again on every re-run would store the same chunks a second time
# (no explicit ids are supplied), so similarity search would return duplicates.
# Only upsert while the index is still empty; otherwise attach to what is there.
# NOTE(review): index stats can lag briefly right after an upsert.
_index_stats = pc.Index(index_name).describe_index_stats()

if _index_stats.total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents = text_chunks,
        index_name = index_name,
        embedding = embeddings
    )
else:
    docsearch = PineconeVectorStore.from_existing_index(
        index_name = index_name,
        embedding = embeddings
    )

In [20]:
# Attach to the already-populated index without re-embedding anything.
docsearch = PineconeVectorStore.from_existing_index(embedding = embeddings, index_name = index_name)

In [21]:
# Display the vector store object to confirm it was created.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1e689daec10>

In [22]:
# Similarity-search retriever that returns the 3 closest chunks per query.
retriever = docsearch.as_retriever(
    search_kwargs = dict(k = 3),
    search_type = "similarity",
)

In [23]:
# Smoke-test retrieval on its own, before any LLM is involved.
retrieved_docs = retriever.invoke(input = "What is Acne?")

In [24]:
# Inspect the retrieved chunks (the retriever is configured with k = 3).
retrieved_docs

[Document(id='2593784d-d771-42b6-a219-5c4c85fb1554', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:45+02:00', 'page': 55.0, 'page_label': '26', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': 'Data\\Medical-Book.pdf', 'total_pages': 4505.0}, page_content='Researchers, Inc. Reproduced by permission.)\n26 GALE ENCYCLOPEDIA OF MEDICINE\nAcne'),
 Document(id='3871564b-2a21-4c5a-bf55-9c270cfbbc0e', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:45+02:00', 'page': 55.0, 'page_label': '26', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': 'Data\\Medical-Book.pdf', 'total_pages': 4505.0}, page_content='Sebaceous follicles— A structure found within the\nskin that houses the oil-producing glands and hair\nfollicles, where pimples form.\nSebum— An oily skin moisturizer produced by\nsebaceous glands.\nTretinoin— A drug that works by increasing the\nturnover 

In [25]:
# The retrieved chunks are only fragments; to get a complete definition of
# Acne, an LLM is loaded to compose an answer from them.

from langchain_openai import OpenAI

llm = OpenAI(
    max_tokens = 500,
    temperature = 0.4,
)

In [26]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System message: instructions followed by the "{context}" placeholder.
# The pieces are joined without separators, so the text is exactly their concatenation.
system_prompt = "".join([
    "You are an Assisstant for Question-Answering Tasks. ",
    "Use the following pieces of retrieved context to answer ",
    "the question. If you don't know the answer, say that you ",
    "don't know. Use three sentences maximum and keep the ",
    "answer concise.",
    "\n\n",
    "{context}",
])

# Two-message chat prompt: the system instructions, then the user's question
# supplied under the "input" key.
prompt = ChatPromptTemplate.from_messages([("system", system_prompt), ("human", "{input}")])

In [27]:
# First build the chain that feeds documents plus the prompt to the LLM,
# then wrap it with the retriever to get the full retrieval chain.
question_answer_chain = create_stuff_documents_chain(llm = llm, prompt = prompt)
rag_chain = create_retrieval_chain(
    retriever = retriever,
    combine_docs_chain = question_answer_chain,
)

In [28]:
# Ask a question covered by the indexed book and print only the answer text.
response = rag_chain.invoke(dict(input = "What is Acne?"))
print(response["answer"])



Acne is a skin disorder that is characterized by inflammation of the sebaceous glands, leading to the formation of pimples. It is caused by clogging of pores with oil, dead skin cells, and bacteria. Treatment options include drugs like tretinoin, which increases skin cell turnover.


In [29]:
# Ask about a different disease.
response = rag_chain.invoke(dict(input = "What is Acromegaly and gigantism?"))
print(response["answer"])



Acromegaly and gigantism are disorders caused by an abnormal release of a chemical from the pituitary gland, leading to increased growth in bone and soft tissue. It is a relatively rare disorder, affecting both men and women, and is often diagnosed in middle age due to the gradual onset of symptoms. If left untreated, the disease does not worsen.


In [31]:
# Ask a question that is unrelated to the medical domain.
response = rag_chain.invoke(dict(input = "What is tats?"))
print(response["answer"])



I don't know.
