In [70]:
import warnings
# Ignore DeprecationWarning messages from here on so they do not clutter the cell outputs.
warnings.filterwarnings('ignore', category=DeprecationWarning)

# import logging
# logging.getLogger().setLevel(logging.ERROR)

In [71]:

from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [72]:
# Extract Data From the PDF File
def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan. Files are selected with the
            glob pattern ``*.pdf`` and each one is read with ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [73]:
# `os` is imported here because this cell uses it before any other cell in the
# notebook imports it; without this, a fresh kernel raises NameError.
import os

# Get the current working directory
current_dir = os.getcwd()
# Move one level up and then into the 'data' folder
data_folder_path = os.path.join(current_dir, '..', 'data')
# Load every PDF in that folder
extracted_data = load_pdf_file(data_folder_path)

In [74]:
# extracted_data

In [75]:
# Split the Data into Text Chunks
# Attempts to split text at natural boundaries (sentences, paragraphs) within character limit.
# Balances between maintaining coherence and adhering to character limits.
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller text chunks.

    Args:
        extracted_data: Documents to split, in the form accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``. Defaults to
            500, the value that was previously hard-coded.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            Defaults to 20, the value that was previously hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

In [76]:
# Chunk the loaded documents and report how many chunks were produced
text_chunks = text_split(extracted_data)
print(f"Number of document chunks: {len(text_chunks)}")
if text_chunks:
    print(f"Sample chunk:\n{text_chunks[0].page_content}\n")
else:
    # Indexing an empty list would raise IndexError; explain the likely cause instead.
    print("No chunks were produced; check that the data folder contains PDF files.")

Number of document chunks: 7035
Sample chunk:
TheGALE
ENCYCLOPEDIA
ofMEDICINE
SECOND EDITION



In [77]:
# text_chunks

In [78]:
from langchain.embeddings import HuggingFaceEmbeddings

In [79]:
# Download the Embeddings from Hugging Face
def download_hugging_face_embeddings():
    """Build the embedding model used throughout this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    # This model produces 384-dimensional vectors.
    return HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [80]:
# Instantiate the embedding model once; later cells reuse this `embeddings` object.
embeddings = download_hugging_face_embeddings()

In [81]:
# Sanity check: embed a short string and report the length of the resulting vector
query_result = embeddings.embed_query("Hello world")
embedding_length = len(query_result)
print("Length", embedding_length)

Length 384


In [82]:
# query_result

In [83]:
from dotenv import load_dotenv
# Load environment variables from .env; the call's return value is displayed as the cell output.
load_dotenv()

True

In [84]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

# Initialize Pinecone gRPC client instance.
# No credentials are passed explicitly; presumably the API key is picked up
# from the environment populated by load_dotenv(). Verify.
pc = Pinecone()

# Define the index name for the medical chatbot embeddings
index_name = "medicalchatbot"

# Creating an index that already exists raises an error, so only create it
# when it is missing. This keeps the cell safe to re-run.
if index_name in pc.list_indexes().names():
    print(f"Index '{index_name}' already exists; skipping creation.")
else:
    pc.create_index(
        name=index_name,          # Name of the index to be created
        dimension=384,            # Must equal the embedding length (384 for the model used above)
        metric="cosine",          # Distance metric for nearest neighbor search
        spec=ServerlessSpec(      # Serverless configuration
            cloud="aws",          # Cloud provider where the index is hosted
            region="us-east-1",   # Specific region to deploy the index
        ),
    )
    print(f"Index '{index_name}' has been created.")
    
   

Index 'medicalchatbot' has been created.


In [85]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
# NOTE(review): re-running this cell presumably uploads every chunk again;
# confirm whether that leaves duplicate vectors in the index.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents = text_chunks,
    embedding = embeddings, 
    index_name = index_name  
)

In [86]:
# Load the existing Pinecone index as a vector store
from langchain_pinecone import PineconeVectorStore
# Attach to the index by name; unlike `from_documents`, no documents are passed to this call.
# This rebinds `docsearch`, replacing the object created by the previous cell.
docsearch = PineconeVectorStore.from_existing_index(
    index_name = index_name,
    embedding = embeddings
)

In [87]:
# Display the vector store object to confirm it exists
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1ed23e90eb0>

In [88]:
# Example question used to exercise retrieval
query = "What is Acne?"

# Similarity-search retriever configured with k=3
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})

# Fetch the documents retrieved for the example question
relevant_docs = retriever.invoke(query)

In [89]:
# Print each retrieved document, followed by its source when metadata is present
print("\n--- Relevant Documents ---")
for i, doc in enumerate(relevant_docs, start=1):
    print(f"Document {i}:\n{doc.page_content}\n")
    if not doc.metadata:
        continue
    source = doc.metadata.get('source', 'Unknown')
    print(f"Source: {source}\n")


--- Relevant Documents ---
Document 1:
Acidosis seeRespiratory acidosis; Renal
tubular acidosis; Metabolic acidosis
Acne
Definition
Acne is a common skin disease characterized by
pimples on the face, chest, and back. It occurs when thepores of the skin become clogged with oil, dead skincells, and bacteria.
Description
Acne vulgaris, the medical term for common acne, is

Source: c:\Users\yrobi\Desktop\Robin World\Data Science - Machine Learning Prep\01 - Generative AI\MedGPT-DiagnosisBot-LargeLanguageModel\research\..\data\Medical_book.pdf

Document 2:
GALE ENCYCLOPEDIA OF MEDICINE 2 25Acne
Acne vulgaris affecting a woman’s face. Acne is the general
name given to a skin disorder in which the sebaceousglands become inflamed. (Photograph by Biophoto Associ-
ates, Photo Researchers, Inc. Reproduced by permission.)GEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25

Source: c:\Users\yrobi\Desktop\Robin World\Data Science - Machine Learning Prep\01 - Generative AI\MedGPT-DiagnosisBot-LargeLan

In [90]:
# Create the LLM with the `OpenAI` class (not `ChatOpenAI`), using temperature=0.4 and max_tokens=500
from langchain_openai import OpenAI
llm = OpenAI(temperature=0.4, max_tokens=500)

In [91]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Question-answering system prompt
# Instructs the model to answer from the retrieved context (inserted at
# `{context}`), to admit when it does not know, and to stay within three sentences.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n\n{context}"
)

# Chat prompt: the system instructions followed by the user's input (role/content tuples)
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [92]:
# Create a chain to combine documents for question answering
# `create_stuff_documents_chain` feeds all retrieved context into the LLM
question_answer_chain = create_stuff_documents_chain(llm, prompt)
# Create a retrieval chain that combines the plain similarity retriever (no chat history at this stage)
# with the question answering chain
rag_chain = create_retrieval_chain(retriever, question_answer_chain)


In [93]:
# Send a medical question through the RAG chain and print the generated answer
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])



Acromegaly and gigantism are medical conditions that involve excessive growth hormone production, leading to abnormal growth of the bones and tissues. They are characterized by enlarged body parts, such as the hands, feet, and face. Treatment for these conditions may include medication, surgery, or radiation therapy.


In [94]:

# Send a non-medical question through the same chain.
# NOTE(review): the recorded answer is a general definition of statistics, so the model
# appears to answer beyond the retrieved context. Confirm whether that is acceptable.
response = rag_chain.invoke({"input": "What is stats?"})
print(response["answer"])



Stats is a shortened term for statistics, which is the practice of collecting, analyzing, and interpreting data. It involves using mathematical and analytical techniques to understand and make conclusions about a particular set of data. It is commonly used in fields such as data science, economics, and social sciences.


## Medical chatbot RAG conversation

In [106]:
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.chains import create_history_aware_retriever, create_retrieval_chain

In [107]:
# Create a ChatOpenAI model; this rebinds `llm`, replacing the `OpenAI` instance created earlier
llm = ChatOpenAI(model="gpt-4o-mini")

# System prompt for the question-rewriting step
# Asks the model to turn the latest user question into a standalone question,
# using the chat history only to resolve references, and not to answer it.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question which might "
    "reference context in the chat history, formulate a standalone "
    "question which can be understood without the chat history. "
    "Do NOT answer the question, just reformulate it if needed and "
    "otherwise return it as is."
)

# Prompt for the question-rewriting step: system instructions, then the prior
# conversation (`chat_history`), then the newest user input
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])


In [108]:
# Build a history-aware retriever from the chat model, the base retriever and
# the question-rewriting prompt
history_aware_retriever = create_history_aware_retriever(llm, retriever, contextualize_q_prompt)

In [109]:
# System prompt for the answering step
# Tells the model to answer from the retrieved context (inserted at `{context}`),
# to say so when it does not know, and to keep the answer to three sentences.
qa_system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n\n{context}"
)

In [110]:
# Prompt for the answering step: QA system instructions, then the prior
# conversation (`chat_history`), then the newest user input
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", qa_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

In [111]:
# Create a chain to combine documents for question answering
# `create_stuff_documents_chain` feeds all retrieved context into the LLM
# This rebinds `question_answer_chain`, now built from the chat model and the history-aware QA prompt.
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

In [112]:
# Create a retrieval chain that combines the history-aware retriever and the question answering chain
# This rebinds `rag_chain`, replacing the chain built earlier with the plain retriever.
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [113]:
# Function to simulate a continual chat
def continual_chat():
    """Run an interactive question/answer loop on standard input/output.

    Each entry typed by the user is sent through the module-level
    ``rag_chain`` together with the conversation so far, and the chain's
    ``"answer"`` is printed. The loop ends when the user types ``exit``
    (case-insensitive, surrounding whitespace ignored). Blank entries are
    skipped without calling the chain.

    Returns:
        None.
    """
    # Imported locally so the function does not rely on another cell's import list.
    from langchain_core.messages import AIMessage

    print("Start chatting with the AI! Type 'exit' to end the conversation.")
    chat_history = []  # Alternating human / AI messages, oldest first
    while True:
        query = input("You: ").strip()
        if query.lower() == "exit":
            break
        if not query:
            # Nothing was asked; prompt again rather than sending an empty question.
            continue
        # Process the user's query through the retrieval chain
        result = rag_chain.invoke({"input": query, "chat_history": chat_history})
        answer = result["answer"]
        # Display the AI's response
        print(f"AI: {answer}")
        # Update the chat history. Model replies are recorded as AI messages;
        # recording them as system messages would present earlier answers to
        # the model as instructions rather than as its own previous turns.
        chat_history.append(HumanMessage(content=query))
        chat_history.append(AIMessage(content=answer))

In [114]:
# Start the continual chat when this code runs as the main module
# (the recorded cell output shows the guard passes in this notebook).
if __name__ == "__main__":
    continual_chat()

Start chatting with the AI! Type 'exit' to end the conversation.
AI: Acne is a common skin disease characterized by pimples on the face, chest, and back, resulting from clogged pores due to oil, dead skin cells, and bacteria. The medical term for common acne is acne vulgaris, which involves inflammation of the sebaceous glands. Treatment options include topical lotions, soaps, gels, and, for severe cases, isotretinoin (Accutane).
AI: Robins Yadav is a data professional with a master's degree specializing in Data Science and Machine Learning. He is proficient in Python, SQL, AWS, and Big Data, and has expertise in healthcare analytics, NLP, and LLM. Robins also has experience in model development and MLOps for deploying scalable machine learning solutions.
