In [2]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
def load_pdf_file(data):
    loader = DirectoryLoader(data,
                             glob="*.pdf",
                             loader_cls=PyPDFLoader)
    document = loader.load()
    return document

In [4]:
extracted_data = load_pdf_file(data = '../data/')

In [5]:
#split the data into Text Chunks

def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [6]:
text_chunks = text_split(extracted_data)
print("length of text chunks : ",len(text_chunks))

length of text chunks :  5860


In [7]:
from langchain.embeddings import HuggingFaceEmbeddings
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-MiniLM-L6-v2')
    #gives embedding of dimension 384
    return embeddings

In [8]:
embeddings  = download_hugging_face_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-MiniLM-L6-v2')
  from .autonotebook import tqdm as notebook_tqdm


In [9]:
query_result = embeddings.embed_query("hello world")

print("Length", len(query_result))

Length 384


In [10]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from dotenv import load_dotenv
import os
load_dotenv()
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medicalbot"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
else:
    print(f"Index '{index_name}' already exists.")


Index 'medicalbot' already exists.


In [11]:

from dotenv import load_dotenv
import os
load_dotenv()
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
os.environ["HUGGINGFACEHUB_API_TOKEN"] =HUGGINGFACEHUB_API_TOKEN

In [73]:
from langchain_pinecone import PineconeVectorStore

#embed each chunk and upsert the embeddings into Pincone index

docsearch = PineconeVectorStore.from_documents(
    documents = text_chunks,
    index_name = index_name,
    embedding = embeddings,
)

KeyboardInterrupt: 

In [12]:
#Load existing index

from langchain_pinecone import PineconeVectorStore
docsearch = PineconeVectorStore.from_existing_index(
    index_name = index_name,
    embedding = embeddings,
)



In [13]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x21f0f053cb0>

In [13]:
retriever = docsearch.as_retriever(search_type ="similarity", search_kwargs = {"k":3})

In [14]:
retriever_docs = retriever.invoke("What is acne?")
retriever_docs

[Document(id='3e8c4133-aee0-4a3b-a499-00d0c030f05a', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': '..\\data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='ffe6c9c3-fa9b-4e19-a9a6-b21073179fc2', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 38.0, 'page_label': '39', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': '..\\data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed.(Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.

In [15]:
from langchain_huggingface import HuggingFaceEndpoint
from langchain_core.prompts import PromptTemplate
import os

TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
huggingface_repo_id = "mistralai/Mistral-7B-Instruct-v0.3"

def load_llm(hugging_face_repo_id):
    llm = HuggingFaceEndpoint(
        repo_id=hugging_face_repo_id,
        task="text-generation",
        temperature=0.3,
        huggingfacehub_api_token=TOKEN,
        model_kwargs={"max_length": 512}
    )
    return llm


In [16]:
from langchain_core.prompts import PromptTemplate

custom_prompt_template = """
Use the pieces of information provided in context to answer user's question.
If you don't know the answer, just say 'Sorry, I don't know. I can help you with medical related questions.'

Context = {context}
Question ={question}

Start thr answer directlt. No small talk please.
"""

prompt = PromptTemplate(template=custom_prompt_template,input_variables=["context","question"])


In [17]:
from langchain.chains import RetrievalQA
qa_chain = RetrievalQA.from_chain_type(
    llm = load_llm(huggingface_repo_id),
    chain_type ="stuff",
    retriever =retriever,
    return_source_documents = True,
    chain_type_kwargs ={'prompt':prompt}

)

In [18]:
response = qa_chain.invoke({'query':"What is acne?"})
print(response['result'])




Acne is a common skin disease characterized by pimples on the face, chest, and back. It occurs when the pores of the skin become clogged with oil, dead skin cells, and bacteria.


In [52]:
response = qa_chain.invoke({'query':"How to treat acne?"})
print(response['result'])




To treat acne, one can follow a few steps. For mild noninflammatory acne, reducing the formation of new comedones can be done with topical tretinoin, benzoyl peroxide, adapalene, or salicylic acid. Tretinoin is especially effective because it increases turnover (death and replacement) of skin cells. When acne is complicated by inflammation, topical antibiotics may be added to the treatment regimen. Improvement is usually seen in two to four weeks.


In [67]:
response = qa_chain.invoke({'query':"What is oesophagus"})
print(response['result'])




The esophagus is the muscular tube that leads from the back of the throat to the entrance of the stomach. It is the part of the digestive tract through which food passes on its way from the mouth to the stomach.


In [19]:
response = qa_chain.invoke({'query':"Who is stats?"})
print(response['result'])




The statistic provided in the context refers to the annual rate of Sudden Cardiac Death (SUD) in people less than 35 years of age. It is less than seven incidents per 100,000 people, and only about 8% of all SUD cases are exercise-related. This means that each year approximately 25 athletes experience SUD.
