In [1]:
import os

# Move from the notebook's folder to its parent so that relative paths such as
# "data" resolve from there. The guard makes this cell safe to re-run: without
# it, every additional execution would climb one more directory level.
if "_WORKDIR_SET" not in globals():
    os.chdir("../")
    _WORKDIR_SET = True

In [2]:
# Display the current working directory to confirm the chdir in the first cell.
%pwd

'c:\\Users\\soura\\OneDrive\\Desktop\\langchain\\Langchain_Project\\Medical-Chatbot'

In [3]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [4]:
# Extract text from PDF files
def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        The documents returned by ``DirectoryLoader.load()``, where files
        matching the ``*.pdf`` glob are read with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()


In [5]:
# Load the PDFs from the "data" folder (path is relative to the working directory).
extracted_data = load_pdf_file("data")

In [7]:
# Number of Document objects loaded (presumably one per PDF page — TODO confirm).
len(extracted_data)

637

In [8]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """Strip each document's metadata down to its ``source`` entry.

    Args:
        docs (List[Document]): Documents to copy; the inputs are not modified.

    Returns:
        List[Document]: New ``Document`` objects, in the same order as the
        input, each keeping the original ``page_content`` and a metadata dict
        holding only ``"source"`` (``None`` when the original has no such key).
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [9]:
# Keep only the "source" metadata field on every loaded document before chunking.
minimal_docs = filter_to_minimal_docs(extracted_data)

In [10]:
# Split the documents into smaller chunks

def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into smaller chunks.

    Args:
        minimal_docs: Documents to split.
        chunk_size: Value passed as ``chunk_size`` to
            ``RecursiveCharacterTextSplitter``. Defaults to 500, the value
            that used to be hard-coded here.
        chunk_overlap: Value passed as ``chunk_overlap`` to the splitter.
            Defaults to 20, the value that used to be hard-coded here.

    Returns:
        The chunks returned by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_docs)

In [11]:
# Chunk the metadata-trimmed documents; these chunks are what gets embedded and indexed.
text_chunks = text_split(minimal_docs)

In [12]:
from langchain_huggingface import HuggingFaceEmbeddings

def download_embeddings():
    """
    Build the HuggingFace embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

# Create the embedding model once; the same object is reused for indexing and for querying.
embedding = download_embeddings()


In [13]:
from dotenv import load_dotenv
import os
# Load variables from the project's .env file into the process environment
# (the recorded run returned True).
load_dotenv()

True

In [14]:
# Read the API keys from the environment (populated by load_dotenv() above).
pinecone_api_key = os.getenv("PINECONE_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")

# Fail with a readable message when a key is absent. Previously a missing key
# surfaced as an opaque TypeError, because os.environ rejects None values.
_missing_keys = [
    name
    for name, value in (
        ("PINECONE_API_KEY", pinecone_api_key),
        ("OPENAI_API_KEY", openai_api_key),
    )
    if not value
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Add them to the .env file or export them before running this cell."
    )

# Re-export the keys so that libraries reading os.environ directly can see them.
os.environ["PINECONE_API_KEY"] = pinecone_api_key
os.environ["OPENAI_API_KEY"] = openai_api_key

In [16]:
from pinecone import Pinecone

# Pinecone client authenticated with the key read from the environment.
pc = Pinecone(api_key=pinecone_api_key)

In [20]:
from pinecone import ServerlessSpec

index_name = 'medical-chatbot'

# Create the serverless index only when it does not exist yet; later runs reuse it.
index_exists = pc.has_index(index_name)
if not index_exists:
    pc.create_index(
        name=index_name,
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
        metric='cosine',
        # NOTE(review): must equal the embedding model's vector size;
        # presumably 384 for all-MiniLM-L6-v2 — confirm.
        dimension=384,
    )

# Handle to the index, whether it was just created or already existed.
index = pc.Index(index_name)

In [22]:
from langchain_pinecone import PineconeVectorStore

import hashlib

# Give every chunk a deterministic id derived from its position, source and
# text. Without explicit ids each run of this cell uploads the whole corpus
# again under new random ids, so the index accumulates duplicate vectors (the
# retrieval output recorded further down shows identical hits with different
# ids). With stable ids a re-run overwrites the same records instead.
# The position is part of the id so repeated text (e.g. page headers) cannot
# collide within one upload.
# NOTE: duplicates already stored in the index are not removed by this change.
chunk_ids = [
    hashlib.sha256(
        f"{position}:{chunk.metadata.get('source')}:{chunk.page_content}".encode("utf-8")
    ).hexdigest()
    for position, chunk in enumerate(text_chunks)
]

# Embed all chunks and upsert them into the Pinecone index.
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    embedding=embedding,
    index_name=index_name,
    ids=chunk_ids,
)

In [25]:
# Loading Existing Index


# Connect to the index that already exists instead of uploading documents again.
docsearch = PineconeVectorStore.from_existing_index(embedding=embedding, index_name=index_name)

In [28]:
# Retriever over the vector store: similarity search, top 3 results per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [29]:
# Try retrieval on its own before wiring in the LLM.
retriever_docs = retriever.invoke("What is a acne?")

# NOTE(review): the recorded output contains two hits with identical content but
# different ids, which suggests the index holds duplicate vectors — confirm.
print(retriever_docs)

[Document(id='ae082ebd-d50b-4429-a02a-622ee3268c40', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'), Document(id='57aaa34e-91d9-4cf6-8a06-2662ffb82914', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'), Document(id='97af45c9-7dc9-4e43-8200-63096c2a6201', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25')]


In [31]:
from langchain_openai import ChatOpenAI

# Chat model used to generate answers from the retrieved context.
chat = ChatOpenAI(model_name="gpt-4o")


In [32]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [35]:
# System instructions for the answering model. Adjacent string literals are
# concatenated with nothing in between, so every sentence except the last
# carries its own trailing space; otherwise the model would receive run-together
# text such as "tasks.Use". "{context}" is the template slot for the retrieved
# passages.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question at the end. If you don't know the answer, say that you don't know. "
    "Use at most three sentences and keep the answer concise."
    "\n\n"
    "{context}"
)

# Two-message prompt: the system message holds the instructions plus the
# {context} slot; the human message holds the user's {input}.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

In [None]:
# NOTE(review): this pipeline fails when invoked (see the KeyError recorded
# below). `prompt` is the first step and receives only 'input', while the
# template also requires 'context'. The working chain is built further down
# with create_retrieval_chain.
chain = prompt | retriever | chat

response = chain.invoke({'input':'What are Acromogaly and gigantism?'})
print(response['answer'])

KeyError: "Input to ChatPromptTemplate is missing variables {'context'}.  Expected: ['context', 'input'] Received: ['input']\nNote: if you intended {context} to be part of the string and not a variable, please escape it with double curly braces like: '{{context}}'.\nFor troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/INVALID_PROMPT_INPUT "

In [36]:
# Answer-generation step: combines the supplied documents with `prompt` and sends them to `chat`.
question_answer_chain = create_stuff_documents_chain(
    chat,
    prompt=prompt,
)

# Full retrieval chain: `retriever` fetches documents for the input, then the step above answers.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
# Query the retrieval chain built in the previous cell. The earlier `chain`
# variable is the prompt-first pipeline that raises KeyError on invoke, so it
# must not be used here.
response = rag_chain.invoke({'input':'What are Acromogaly and gigantism?'})
print(response['answer'])

Acromegaly is a disorder characterized by the abnormal release of a specific chemical from the pituitary gland, leading to increased growth in bone and soft tissue and causing various disturbances throughout the body. Gigantism involves a similar mechanism but typically occurs before the fusion of growth plates in children, causing excessive growth. Both conditions result from excess growth hormone.
