## **DEPENDENCIES**

In [34]:
import os
from langchain_groq import ChatGroq
from pinecone import ServerlessSpec
from pinecone.grpc import PineconeGRPC as Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain.document_loaders import PyPDFLoader
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain

### **Function to Load the PDF File using PyPDF**

In [35]:
def load_pdf_file(data : str) -> list:
    """
    Loads all PDF files from the specified directory using PyPDFLoader.

    Arguments:

        `data`         {str}       : Path to the directory containing PDF files.

    Returns:

        `documents`     {list}     : A list of documents loaded from the PDF files.
                                     Each document is typically a LangChain Document object.
    """
    loader      = DirectoryLoader(data,
                                  glob       = "*.pdf",
                                  loader_cls = PyPDFLoader)

    documents   = loader.load()

    return documents

In [36]:
extracted_data = load_pdf_file(data = '/content/')

### **Split the Data into Text Chunks**

In [37]:
def text_split(extracted_data : list) -> list:
    """
    Splits extracted documents into smaller text chunks using a recursive character splitter.

    Arguments:

        - `extracted_data`     {list}        : A list of documents, typically obtained from loading PDFs.

    Returns:

        - `text_chunks`        {list}        : A list of text chunks, each chunk being a smaller portion of the original documents.

    """
    text_splitter  = RecursiveCharacterTextSplitter(chunk_size     = 500,
                                                    chunk_overlap  = 20)
    text_chunks    = text_splitter.split_documents(extracted_data)

    return text_chunks


In [38]:
text_chunks   =  text_split(extracted_data)

print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 5860


### **Download the Embeddings from Hugging Face**

In [39]:
def download_hugging_face_embeddings():

    embeddings = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-MiniLM-L6-v2')

    return embeddings

In [15]:
embeddings     = download_hugging_face_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-MiniLM-L6-v2')
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [16]:
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


## ***API Setup***

In [None]:
PINECONE_API_KEY               = "PINECONE_API_KEY"
OPENAI_API_KEY                 = "OPENAI_API_KEY"
GROQ_API_KEY                   = "GROQ_API_KEY"

index_name                     = "medical-chatbot"


os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"]   = OPENAI_API_KEY
os.environ["GROQ_API_KEY"]     = GROQ_API_KEY

### **Creating Index in PineCone Vector-Database**

In [19]:
pc            = Pinecone(api_key = PINECONE_API_KEY)

index_name    = "medical-chatbot"


pc.create_index(name       = index_name,
                dimension  = 384,
                metric     = "cosine",
                spec       = ServerlessSpec(cloud   = "aws",
                                            region  = "us-east-1"
                                            )
                )


{
    "name": "medical-chatbot",
    "metric": "cosine",
    "host": "medical-chatbot-o5kd1ml.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

### **Embed each chunk and upsert the embeddings into your Pinecone index.**

In [23]:
docsearch       = PineconeVectorStore.from_documents(documents   = text_chunks,
                                                     index_name  = index_name,
                                                     embedding   = embeddings,
                                                     )


### **Load Existing index**

In [41]:
docsearch       = PineconeVectorStore.from_existing_index(index_name  = index_name,
                                                          embedding   = embeddings
                                                          )

### **Retrieve Information based on Similarity**

In [43]:
retriever       = docsearch.as_retriever(search_type    = "similarity",
                                         search_kwargs  = {"k":3}
                                         )

In [44]:
retrieved_docs  = retriever.invoke("What is Acne?")

retrieved_docs

[Document(id='81f3ce35-11cb-40ff-9da7-4a8c9e2db73c', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': '/content/Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='3f7aad12-f34d-44f0-8b89-349b7752f89a', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 38.0, 'page_label': '39', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': '/content/Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed.(Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\

# **Initialising the LLM using Groq**

In [56]:
llm            = ChatGroq(temperature   = 0.4,
                          groq_api_key  = GROQ_API_KEY,
                          model_name    = "llama3-8b-8192"
                          )

### **Prompt Template**

In [57]:
system_prompt  = (
                    "You are an assistant for question-answering tasks. "
                    "Use the following pieces of retrieved context to answer "
                    "the question. If you don't know the answer, say that you "
                    "don't know. Use three sentences maximum and keep the "
                    "answer concise."
                    "\n\n"
                    "{context}"
)


prompt         = ChatPromptTemplate.from_messages(
        [
        ("system", system_prompt),
         ("human", "{input}"),
        ]
    )


### **Setting up QA using Retrieval-Augmented Generation**

In [58]:
question_answer_chain  = create_stuff_documents_chain(llm, prompt)
rag_chain              = create_retrieval_chain(retriever, question_answer_chain)

# ***QUESTION-ANSWERING -- OUTPUT***

In [60]:
response               = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})

print(response["answer"])

Acromegaly is a disorder in which the abnormal release of a particular chemical from the pituitary gland causes increased growth in bone and soft tissue, as well as various other disturbances throughout the body. When this abnormality occurs before bone growth stops, it is called gigantism.


In [61]:
response               = rag_chain.invoke({"input": "What is stats?"})
print(response["answer"])


I don't know. The provided context does not mention "stats". It seems to be discussing blood counts, medical tests, and athletic heart syndrome.
