In [None]:
import os

# Move the working directory up one level so later relative paths such as
# 'data/' resolve from the parent directory.
# NOTE(review): this is a relative chdir, so re-running the cell without a
# kernel restart moves up one more level each time.
os.chdir("../")
# Shell escape: print the resulting working directory as a sanity check.
!pwd


/Users/ravisohal/Projects/GenerativeAI-MedicalBot


In [25]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [26]:
#Extract Data From the PDF File
def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``; only entries
            matching the ``*.pdf`` glob are picked up, each parsed with
            ``PyPDFLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files
        (presumably a list of LangChain documents -- confirm against the
        loader's documentation).
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [None]:
# Read the PDFs from the data/ directory (relative to the current working directory).
extracted_data = load_pdf_file(data="data/")

In [9]:
#extracted_data

In [30]:
#Split the Data into Text Chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split, as returned by ``load_pdf_file``.
        chunk_size: Maximum chunk size passed to
            ``RecursiveCharacterTextSplitter``. Defaults to 500, the value
            this notebook originally hard-coded.
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter. Defaults to 20, the original hard-coded value.

    Returns:
        The chunks produced by ``split_documents`` for ``extracted_data``.
    """
    # Exposing the sizes as defaulted parameters lets callers tune chunking
    # without editing the function; calling text_split(docs) is unchanged.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

In [31]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 39994


In [32]:
from langchain.embeddings import HuggingFaceEmbeddings

In [33]:
#Download the Embeddings from Hugging Face
def download_hugging_face_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Create the Hugging Face embedding model used to vectorise text.

    Args:
        model_name: Hugging Face model identifier passed to
            ``HuggingFaceEmbeddings``. Defaults to
            ``sentence-transformers/all-MiniLM-L6-v2``, the model this
            notebook originally hard-coded.

    Returns:
        The ``HuggingFaceEmbeddings`` instance for ``model_name``.
    """
    # NOTE: the Pinecone index in this notebook is created with dimension=384;
    # a different model must produce vectors of the index's dimension.
    return HuggingFaceEmbeddings(model_name=model_name)


In [39]:
# Instantiate the embedding model once; it is reused for indexing and for querying.
embeddings = download_hugging_face_embeddings()

In [51]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment so the API keys
# can be read via os.environ below (the recorded cell output shows it returned True).
load_dotenv()

True

In [52]:
# Read the service credentials from the environment; each is None when unset.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [54]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

# Pinecone client authenticated with the API key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medicalbot"

# Creating an index that already exists fails, so only create it when it is
# missing; this keeps the cell safe to re-run (Restart & Run All).
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must match the embedding size of all-MiniLM-L6-v2
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )



In [55]:
import os

# Either key variable is None when its environment variable was missing.
# Assigning None into os.environ raises an opaque TypeError, so validate each
# key first and fail with an actionable message instead.
for _key_name, _key_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("OPENAI_API_KEY", OPENAI_API_KEY),
):
    if not _key_value:
        raise RuntimeError(
            f"{_key_name} is not set; define it in the environment or in .env "
            "before running this cell."
        )
    # Export the key so libraries that read it from the environment can find it.
    os.environ[_key_name] = _key_value

In [56]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
from langchain_pinecone import PineconeVectorStore

# Build the vector store from the text chunks, using `embeddings` to vectorise
# them and the Pinecone index named by `index_name` as the destination.
docsearch = PineconeVectorStore.from_documents(
    embedding=embeddings,
    index_name=index_name,
    documents=text_chunks,
)

In [57]:
# Load the existing index

from langchain_pinecone import PineconeVectorStore
# Attach to the index that already exists instead of uploading documents again;
# this rebinds `docsearch`, replacing the object created by from_documents.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [65]:
# Display the vector store object to confirm it was created.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x3063e0970>

In [66]:
# Similarity-search retriever that returns the 3 closest chunks per query.
retriever = docsearch.as_retriever(
    search_kwargs={"k": 3},
    search_type="similarity",
)

In [68]:
from langchain_openai import OpenAI
# OpenAI LLM used to generate answers: temperature 0.4, at most 500 tokens per completion.
llm = OpenAI(
    max_tokens=500,
    temperature=0.4,
)

In [69]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System instructions for the answering model. The trailing "{context}"
# placeholder is a template variable, so it must stay a literal brace pair.
system_prompt = "".join(
    [
        "You are an assistant for question-answering tasks. ",
        "Use the following pieces of retrieved context to answer the question. ",
        "If you don't know the answer, say that you don't know. ",
        "Use three sentences maximum and keep the answer concise.",
        "\n\n",
        "{context}",
    ]
)


# Two-message chat template: the system instructions (with the {context} slot)
# followed by the user's question supplied as {input}.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

In [70]:
# Chain that combines the retrieved documents with the prompt and calls the LLM.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
# Full RAG pipeline: retrieve documents for the input, then run the answer chain.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [71]:
# In-domain smoke test: ask a medical question and print only the generated answer.
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])



Acromegaly and gigantism are both disorders caused by abnormal release of a chemical from the pituitary gland, which leads to excessive growth in bone and soft tissue. This chemical is also responsible for other disturbances in the body. If left untreated, the disease will not worsen. There are tests that can confirm if the pituitary gland is underactive or overproducing hormones, which can lead to either hypopituitarism or hyperpituitarism.


In [72]:
# Out-of-domain probe: a question unrelated to the medical corpus.
# NOTE(review): the recorded output answers from general knowledge rather than
# declining, so the current prompt does not restrict answers to the retrieved context.
response = rag_chain.invoke({"input": "What is stats?"})
print(response["answer"])



Stats, short for statistics, is a branch of mathematics that deals with the collection, analysis, interpretation, presentation, and organization of data. It involves using mathematical methods to summarize and describe data, as well as making inferences and predictions based on the data. Stats is used in a variety of fields, including science, business, economics, and social sciences, to help understand and make decisions based on data.
