In [1]:
%pwd

'c:\\iNeuron\\MedChatBot\\research'

In [2]:
import os

In [3]:
os.chdir("..")

In [4]:
%pwd

'c:\\iNeuron\\MedChatBot'

In [5]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
import time

In [6]:
# Load the .env file
load_dotenv()

# Retrieve the API key from the environment variable
api_key = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=api_key)

In [7]:
# Extracting the text from the pdf file
def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)
    
    documents = loader.load()
    
    return documents

data_extract = load_pdf("data/")

In [8]:
# Splitting the text into chunks
def split_text(data_extract):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    text_chunks = text_splitter.split_documents(data_extract)
    
    return text_chunks

text_chunks = split_text(data_extract)
print(f"The length of the data chunk is {len(text_chunks)}")

The length of the data chunk is 7093


In [9]:
# Download the embedding model
def download_HugginFace_embeddings():
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2")
    
    return embeddings

embeddings = download_HugginFace_embeddings()

embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={})

In [10]:
# Testing the embedding model
query_results = embeddings.embed_documents("Hello World")
print("length", len(query_results))
#query_results

length 11


In [11]:
index_name = "medbot" 

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

In [12]:
docsearch = PineconeVectorStore.from_documents(text_chunks, embeddings, index_name=index_name)

In [13]:
query = "What are Salivary Gland Disease"
docs = docsearch.similarity_search(query)
print(docs[0].page_content)

ications, both prescription and over-the-counter; sys-
temic diseases, such as anemia or diabetes, manifesta-
tions of Sjögren’s syndrome (as rheumatoid arthritis ,
lupus, chronic hardening and thickening of the skin, or
chronic and progressive inflammation of sketal muscles);
infections of the salivary glands; blockage of the salivary
ducts caused by stones or tumors forming in the ducts
through which the saliva passes; dehydration ; medical


In [14]:
retriever = docsearch.as_retriever(search_type="mmr")
matched_docs = retriever.invoke(query)
for i, d in enumerate(matched_docs):
    print(f"\n## Document {i}\n")
    print(d.page_content)


## Document 0

ications, both prescription and over-the-counter; sys-
temic diseases, such as anemia or diabetes, manifesta-
tions of Sjögren’s syndrome (as rheumatoid arthritis ,
lupus, chronic hardening and thickening of the skin, or
chronic and progressive inflammation of sketal muscles);
infections of the salivary glands; blockage of the salivary
ducts caused by stones or tumors forming in the ducts
through which the saliva passes; dehydration ; medical

## Document 1

Duodenal atresia seeDuodenal obstructionKEY TERMS
Salivary duct —Tube through which saliva is car-
ried from the salivary gland to the mouth.
Salivary gland —Gland in which saliva forms. Duodenal obstruction
Definition
Duodenal obstruction is a failure of food to pass out of
the stomach either from a complete or partial obstruction.
Description
The duodenum is the first part of the intestine, into
which the stomach, the gall bladder, and the pancreas

## Document 2

Sexually transmitted disease —A disease that is
pa

In [15]:
# Initialize the Pinecone Vector Store
docsearch = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=embeddings)

In [16]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [17]:
PROMPT=PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain_type_kwargs={"prompt": PROMPT}

In [18]:
llm=CTransformers(model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
                  model_type="llama",
                  config={'max_new_tokens':512,
                          'temperature':0.8})

In [24]:
retriever = docsearch.as_retriever(search_kwargs={'k': 2})
retriever

VectorStoreRetriever(tags=['PineconeVectorStore', 'HuggingFaceEmbeddings'], vectorstore=<langchain_pinecone.vectorstores.PineconeVectorStore object at 0x000001901F873280>, search_kwargs={'k': 2})

In [29]:
qa=RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=retriever,
    return_source_documents=True, 
    chain_type_kwargs={"prompt": PROMPT})

ValidationError: 1 validation error for RetrievalQA
retriever
  Can't instantiate abstract class BaseRetriever with abstract methods _aget_relevant_documents, _get_relevant_documents (type=type_error)

In [None]:
qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=vectordb.as_retriever(),
        chain_type="stuff",
        chain_type_kwargs={"prompt": prompt},
        return_source_documents=True)