In [1]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
def load_pdf(data_path):
    """Load the PDF files found under ``data_path``.

    A ``DirectoryLoader`` is pointed at ``data_path`` with the ``*.pdf`` glob
    and ``PyPDFLoader`` as the per-file loader class.

    Args:
        data_path: Directory to scan for PDF files.

    Returns:
        The result of ``DirectoryLoader.load()`` (presumably a list of
        LangChain ``Document`` objects -- confirm against the loader docs).
    """
    return DirectoryLoader(data_path, glob='*.pdf', loader_cls=PyPDFLoader).load()

# Load the PDFs under ../data/ and preview a small slice of the result.
extracted_data = load_pdf('../data/')
print(f'Length of extracted data {len(extracted_data)}')
print(extracted_data[99:105])

Length of extracted data 637


In [6]:
# Split the extracted data into text chunks
def split_text(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, slightly overlapping chunks.

    Args:
        extracted_data: Documents to split (as returned by ``load_pdf``).
        chunk_size: Maximum chunk size handed to
            ``RecursiveCharacterTextSplitter``. Defaults to 500, the value
            this notebook has always used.
        chunk_overlap: Overlap between consecutive chunks handed to
            ``RecursiveCharacterTextSplitter``. Defaults to 20.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

# Chunk the loaded documents, report how many chunks came out, and preview a few.
text_chunks = split_text(extracted_data=extracted_data)
print(f'Length of text chunks {len(text_chunks)}')
text_chunks[99:105]

Length of text chunks 5860


[Document(metadata={'source': '..\\data\\medical_book.pdf', 'page': 18}, page_content='under most types of major medical insurance. As always,\nthough, the patient would be wise to confirm that their\ncoverage extends to the specific procedure proposed. For\nnonemergency situations, most underwriters stipulate\nprior approval as a condition of coverage.\nSpecific conditions for which ultrasound may be\nselected as a treatment option—certain types of tumors,\nlesions,kidney stones and other calculi, muscle and lig-\nament injuries, etc.—are described in detail under the'),
 Document(metadata={'source': '..\\data\\medical_book.pdf', 'page': 18}, page_content='appropriate entries in this encyclopedia.\nPreparation\nA patient undergoing abdominal ultrasound will be\nadvised by their physician about what to expect and how\nto prepare. As mentioned above, preparations generally\ninclude fasting and arriving for the procedure with a full\nbladder, if necessary. This preparation is particularl

In [8]:
# text_chunks

In [14]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings_model(model_name):
    """Build a ``HuggingFaceEmbeddings`` wrapper for ``model_name``.

    Args:
        model_name: Hugging Face model identifier, e.g.
            ``'sentence-transformers/all-MiniLM-L6-v2'``.

    Returns:
        The ``HuggingFaceEmbeddings`` instance created for that model.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

# all-MiniLM-L6-v2 produces 384-dimensional embeddings.
embeddings = download_embeddings_model('sentence-transformers/all-MiniLM-L6-v2')

  embeddings = HuggingFaceEmbeddings(model_name=model_name)


In [17]:
# Inspect the embedding wrapper's fields (client, model name, kwargs, ...) as a plain dict.
dict(embeddings)

{'client': SentenceTransformer(
   (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
   (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
   (2): Normalize()
 ),
 'model_name': 'sentence-transformers/all-MiniLM-L6-v2',
 'cache_folder': None,
 'model_kwargs': {},
 'encode_kwargs': {},
 'multi_process': False,
 'show_progress': False}

In [18]:
# Sanity check: embed a short query and show its first five components.
embeddings.embed_query('hello')[:5]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


[-0.0627717450261116,
 0.05495885759592056,
 0.05216481164097786,
 0.0857900083065033,
 -0.08274893462657928]

In [19]:
# from sentence_transformers import SentenceTransformer

# def download_embeddings_model(model_name):
#     model = SentenceTransformer(model_name)
#     return model

# embeddings_sent_tranf = download_embeddings_model(model_name='sentence-transformers/all-MiniLM-L6-v2')
# embed_dimension = embeddings_sent_tranf.get_sentence_embedding_dimension()

# print(f"The embedding dimension for the model is {embed_dimension}.")

# embeddings_sent_tranf.encode('hello')[0:5]


The embedding dimension for the model is 384.


array([-0.06277175,  0.05495886,  0.05216481,  0.08579001, -0.08274893],
      dtype=float32)

In [11]:
import os
from dotenv import load_dotenv

# Pinecone client (gRPC flavour) and the serverless index spec.
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

# Pull variables from a local .env file into the process environment.
load_dotenv()

# This notebook reads the key under the (misspelled) name "PINECON_API_KEY";
# also accept the correctly spelled "PINECONE_API_KEY" so either works.
PINECON_API_KEY = os.getenv("PINECON_API_KEY") or os.getenv("PINECONE_API_KEY")
if not PINECON_API_KEY:
    # Fail here with a clear message instead of passing None further down.
    raise RuntimeError(
        "Pinecone API key not found: set PINECON_API_KEY (or PINECONE_API_KEY) "
        "in the environment or in a .env file."
    )

pc = Pinecone(api_key=PINECON_API_KEY)
index_name = "medibot"

# Create the index only when it does not exist yet, so this cell can be
# re-run (e.g. Restart & Run All) without failing on an existing index.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the embedding size (384 for all-MiniLM-L6-v2)
        metric="cosine",  # similarity metric used by the index
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )


In [12]:
# Re-export the key under the correctly spelled variable name
# (presumably the name the Pinecone integrations used below look up -- confirm).
# os.environ only accepts strings, so check first: assigning None would raise
# an opaque TypeError.
if not PINECON_API_KEY:
    raise RuntimeError(
        "PINECON_API_KEY is empty; load it from the environment/.env before running this cell."
    )
os.environ['PINECONE_API_KEY'] = PINECON_API_KEY

In [21]:
# Embed every chunk with `embeddings` and upsert the vectors into the Pinecone index
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    text_chunks,
    embedding=embeddings,
    index_name=index_name,
)

In [22]:
# Load the existing Pinecone index
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embeddings,
    index_name=index_name,
)
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1fd9bfeed60>

In [24]:
# Build a similarity retriever with k=3 and try it on a sample question.
retriever_example = docsearch.as_retriever(
    search_kwargs={'k': 3},
    search_type='similarity',
)
retriever_example.invoke('What is inflammation?')

[Document(metadata={'page': 319.0, 'source': '..\\data\\medical_book.pdf'}, page_content='Inflammation—A process occurring in body tissues,\ncharacterized by increased circulation and the\naccumulation of white blood cells. Inflammation\nalso occurs in disorders such as arthritis and causes\nharmful effects.\nInflammatory—Pertaining to inflammation.\nImmune response —Physiological response of the\nbody controlled by the immune system that\ninvolves the production of antibodies to fight off\nspecific foreign substances or agents (antigens).\nImmune system —The sum of the defence mecha-'),
 Document(metadata={'page': 615.0, 'source': '..\\data\\medical_book.pdf'}, page_content='swelling, pain, and other symptoms of inflamma-\ntion.\nBronchitis —Inflammation of the air passages of\nthe lungs.\nChronic—A word used to describe a long-lasting\ncondition. Chronic conditions often develop grad-\nually and involve slow changes.\nEmphysema —A lung disease in which breathing\nbecomes difficult.\n

In [25]:
# Import from the `langchain.prompts` subpackage: the top-level
# `from langchain import PromptTemplate` shortcut is deprecated.
from langchain.prompts import PromptTemplate

# Prompt for the QA chain: `{context}` and `{question}` are the two template
# variables declared in `input_variables` below.
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
# Passed to the chain as `chain_type_kwargs` when it is built.
chain_type_kwargs = {"prompt": PROMPT}


In [35]:
from langchain.llms import CTransformers

# Local model file loaded through CTransformers with model_type "llama".
llm = CTransformers(
    model="../model/llama-2-7b-chat.ggmlv3.q4_0.bin",
    model_type="llama",
    config={
        'max_new_tokens': 512,
        'temperature': 0.8,
    },
)

In [36]:
from langchain.chains import RetrievalQA

# "stuff" QA chain: retrieves with k=2 from the vector store, uses the custom
# prompt from `chain_type_kwargs`, and also returns the source documents.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=docsearch.as_retriever(search_kwargs={'k': 2}),
    chain_type="stuff",
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)

In [37]:
# Single question/answer round (an interactive `while True:` loop is left disabled).
user_input = input("Input Prompt:")
result = qa.invoke({"query": user_input})
print("Response : ", result["result"])

Response :  Inflammation is a process occurring in body tissues, characterized by increased circulation and the accumulation of white blood cells. It can occur in response to injury or illness and is typically associated with symptoms such as swelling, redness, heat, and pain.
