In [1]:
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

In [8]:
import os

# Secrets must not be stored in the notebook: the Pinecone key is read from the
# environment instead. The key that used to be hardcoded in this cell has been
# exposed and should be revoked/rotated in the Pinecone console.
# The empty-string default keeps the name defined (as a str) when the variable
# is missing, so the problem is reported here rather than as a NameError later.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
if not PINECONE_API_KEY:
    print("WARNING: PINECONE_API_KEY is not set; export it before starting the kernel.")

# Non-secret settings keep their previous values as defaults.
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV", "us-east-1-aws")
INDEX_NAME = "medicalbot"

In [39]:
import os

# Secrets must not be stored in the notebook: the OpenAI key is read from the
# environment instead. The key that used to be hardcoded in this cell has been
# exposed and should be revoked/rotated in the OpenAI dashboard.
# The empty-string default keeps the value a str, because a later cell assigns
# it into os.environ (which only accepts strings).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
if not OPENAI_API_KEY:
    print("WARNING: OPENAI_API_KEY is not set; export it before starting the kernel.")

In [9]:
# NOTE: this import rebinds the name `Pinecone`, which the first cell imported
# from langchain.vectorstores. From here on `Pinecone` is the class exported by
# the `pinecone` package until a later cell re-imports the LangChain one.
from pinecone import Pinecone
# Client object constructed with the API key defined in the configuration cell.
pc = Pinecone(api_key=PINECONE_API_KEY)


In [10]:
# Handle to the index used for every upsert/query below. The name comes from
# the INDEX_NAME constant so it is defined in exactly one place.
index = pc.Index(INDEX_NAME)


In [11]:
def load_pdf(data):
    """Load the PDF files of a directory through ``DirectoryLoader``.

    Args:
        data: Directory path handed to ``DirectoryLoader``; entries are
            selected with the glob ``*.pdf`` and read with ``PyPDFLoader``.

    Returns:
        The value returned by ``DirectoryLoader.load()`` for that directory.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [12]:
# Load the PDFs from ../data (the path is relative to the kernel's working directory).
extracted_data = load_pdf("../data")
# Prints how many items the loader returned.
# NOTE(review): PyPDFLoader presumably yields one document per page, so this
# count is likely pages rather than PDF files — confirm.
print(f"Loaded {len(extracted_data)} documents from PDF files.")

Loaded 637 documents from PDF files.


In [26]:
def text_split(extracted_data, chunk_size=200, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        extracted_data: Documents to split, passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 200,
            the value that was previously hard-coded.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            20, the value that was previously hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [27]:
# Split the loaded documents into chunks; these chunks are what gets embedded
# and upserted into the index further down.
text_chunks = text_split(extracted_data)
print("Length of text chunks:", len(text_chunks))

Length of text chunks: 15616


In [28]:
def download_hugging_face_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Create the HuggingFace embedding model used for indexing and querying.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to the model that was previously hard-coded.
            NOTE(review): the vectors already stored in the Pinecone index
            were produced with the default model; a different model
            presumably requires re-indexing — confirm before changing it.

    Returns:
        The ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

# Shared embedding model: the same object is used to embed the stored chunks
# and the queries in the cells below.
embeddings = download_hugging_face_embeddings()

In [25]:
# docsearch=Pinecone.from_texts([t.page_content for t in text_chunks], embeddings, index_name=index_name)

In [29]:
def store_embeddings_in_batches(index, text_chunks, embeddings, batch_size=100):
    """
    Store embeddings in Pinecone in smaller batches to avoid exceeding size limits.

    Each chunk is upserted as ``(id, vector, {"text": page_content})`` where
    the id is ``chunk-<position of the chunk in text_chunks>``.

    Args:
        index: Object exposing ``upsert(vectors)``.
        text_chunks: Sliceable sequence of objects with a ``page_content``
            attribute.
        embeddings: Object exposing ``embed_documents(list_of_texts)`` that
            returns one vector per text, in order.
        batch_size: Number of chunks embedded and upserted per request.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    A failing upsert is reported and the remaining batches are still
    attempted (best-effort, as before).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    print("Storing embeddings in batches...")
    for start in range(0, len(text_chunks), batch_size):
        batch = text_chunks[start:start + batch_size]
        batch_number = start // batch_size + 1
        texts = [chunk.page_content for chunk in batch]
        # One embedding call per batch instead of one call per chunk.
        batch_vectors = embeddings.embed_documents(texts)
        vectors = [
            (f"chunk-{start + offset}", vector, {"text": text})
            for offset, (vector, text) in enumerate(zip(batch_vectors, texts))
        ]
        try:
            index.upsert(vectors)
            print(f"Stored batch {batch_number} with {len(vectors)} embeddings.")
        except Exception as e:
            print(f"Error storing batch {batch_number}: {e}")


In [30]:
# Embed and upsert every chunk; batch_size=50 overrides the function default of 100.
store_embeddings_in_batches(index, text_chunks, embeddings, batch_size=50)


Storing embeddings in batches...
Stored batch 1 with 50 embeddings.
Stored batch 2 with 50 embeddings.
Stored batch 3 with 50 embeddings.
Stored batch 4 with 50 embeddings.
Stored batch 5 with 50 embeddings.
Stored batch 6 with 50 embeddings.
Stored batch 7 with 50 embeddings.
Stored batch 8 with 50 embeddings.
Stored batch 9 with 50 embeddings.
Stored batch 10 with 50 embeddings.
Stored batch 11 with 50 embeddings.
Stored batch 12 with 50 embeddings.
Stored batch 13 with 50 embeddings.
Stored batch 14 with 50 embeddings.
Stored batch 15 with 50 embeddings.
Stored batch 16 with 50 embeddings.
Stored batch 17 with 50 embeddings.
Stored batch 18 with 50 embeddings.
Stored batch 19 with 50 embeddings.
Stored batch 20 with 50 embeddings.
Stored batch 21 with 50 embeddings.
Stored batch 22 with 50 embeddings.
Stored batch 23 with 50 embeddings.
Stored batch 24 with 50 embeddings.
Stored batch 25 with 50 embeddings.
Stored batch 26 with 50 embeddings.
Stored batch 27 with 50 embeddings.
Stor

In [60]:
def query_index(index, query, embeddings, top_k=5):
    """
    Query the Pinecone index and return the top_k most similar results.

    Args:
        index: Object exposing ``query(vector=..., top_k=..., include_metadata=...)``.
        query: Text to search for.
        embeddings: Object exposing ``embed_query(text)``.
        top_k: Number of matches requested from the index.

    Returns:
        The ``metadata['text']`` value of every match that has one, in the
        order the index returned them. An empty list is returned when the
        query call itself fails (the error is printed).
    """
    # Generate the embedding for the query
    query_embedding = embeddings.embed_query(query)

    # Perform the query on Pinecone using keyword arguments
    try:
        result = index.query(
            vector=query_embedding,  # Specify the vector explicitly
            top_k=top_k,             # Specify the number of results to return
            include_metadata=True    # Include metadata in the results
        )
        print(f"Query results: {result}")

        # Extract the stored text of each match. A match without usable text
        # metadata is skipped on its own, so it no longer trips the outer
        # handler and discards the matches that do carry text.
        results_text = []
        for match in result.get('matches', []):
            try:
                results_text.append(match['metadata']['text'])
            except (KeyError, TypeError, AttributeError):
                continue
        return results_text
    except Exception as e:
        print(f"Error querying index: {e}")
        return []

    
# Example usage: retrieve the five closest chunks for a sample question and
# print their stored text, numbered from 1.
query = "What are Allergies"

top_k_results = query_index(index, query, embeddings, top_k=5)
for i, text in enumerate(top_k_results, 1):
    print(f"Result {i}: {text}")


Query results: {'matches': [{'id': 'chunk-2970',
              'metadata': {'text': 'Definition\n'
                                   'Allergies are abnormal reactions of the '
                                   'immune sys-\n'
                                   'tem that occur in response to otherwise '
                                   'harmless sub-\n'
                                   'stances.\n'
                                   'GALE ENCYCLOPEDIA OF MEDICINE 2114\n'
                                   'Allergies'},
              'score': 0.775235713,
              'values': []},
             {'id': 'chunk-3098',
              'metadata': {'text': 'KEY TERMS\n'
                                   'Allergen —A substance that provokes an '
                                   'allergic\n'
                                   'response.\n'
                                   'Allergic rhinitis —Inflammation of the '
                                   'mucous\n'
                         

In [34]:
# Same retrieval check as the previous example, with a single-keyword query.
# Rebinds the notebook-level names `query` and `top_k_results`.
query = "Fever"

top_k_results = query_index(index, query, embeddings, top_k=5)
for i, text in enumerate(top_k_results, 1):
    print(f"Result {i}: {text}")

Query results: {'matches': [{'id': 'chunk-9584',
              'metadata': {'text': 'fever.\n'
                                   'GALE ENCYCLOPEDIA OF MEDICINE 2 377\n'
                                   'Aspirin\n'
                                   'GEM - 0001 to 0432 - A  10/22/03 1:43 PM  '
                                   'Page 377'},
              'score': 0.740697622,
              'values': []},
             {'id': 'chunk-1520',
              'metadata': {'text': 'fever in children. This disease is most '
                                   'often caused by\n'
                                   'types 3 and 7. Symptoms, which appear '
                                   'suddenly and\n'
                                   'usually disappear in less than a week, '
                                   'include:'},
              'score': 0.667628527,
              'values': []},
             {'id': 'chunk-15201',
              'metadata': {'text': 'fevers (a source of its nickname, 

In [38]:
# Same retrieval check with another question.
# Rebinds the notebook-level names `query` and `top_k_results`.
query = "What is acne"

top_k_results = query_index(index, query, embeddings, top_k=5)
for i, text in enumerate(top_k_results, 1):
    print(f"Result {i}: {text}")

Query results: {'matches': [{'id': 'chunk-5861',
              'metadata': {'text': 'KEY TERMS\n'
                                   'Acne—A skin condition in which raised '
                                   'bumps,\n'
                                   'pimples, and cysts form on the face, neck, '
                                   'shoul-\n'
                                   'ders and upper back.\n'
                                   'Bacteria—Tiny, one-celled forms of life '
                                   'that cause'},
              'score': 0.684200048,
              'values': []},
             {'id': 'chunk-758',
              'metadata': {'text': 'Acne\n'
                                   'Acne vulgaris affecting a woman’s face. '
                                   'Acne is the general\n'
                                   'name given to a skin disorder in which the '
                                   'sebaceous\n'
                                   'glands become inflam

In [35]:
# Prompt text for the question-answering chain. `{context}` and `{question}`
# are the two placeholders declared as input variables when the PromptTemplate
# is built from this string.
prompt_template = (
    "\n"
    "Use the following pieces of information to answer the user's question.\n"
    "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n"
    "\n"
    "Context: {context}\n"
    "Question: {question}\n"
    "\n"
    "Only return the helpful answer below and nothing else.\n"
    "Helpful answer:\n"
)

In [36]:
# Wrap the prompt text in a PromptTemplate and package it in the keyword
# dictionary that is later passed as `chain_type_kwargs` to the QA chain.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
chain_type_kwargs = {"prompt": PROMPT}

In [41]:
# Copy the key into the process environment under the name OPENAI_API_KEY.
# NOTE(review): presumably the OpenAI client created in the next cell reads the
# key from this environment variable, since no key is passed to it — confirm.
import os
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

In [43]:
# OpenAI-backed LLM.
# NOTE(review): the next code cell assigns a CTransformers model to the same
# name `llm`, so when the notebook is run top to bottom this instance is
# replaced before the QA chain is built — confirm which model is intended.
from langchain.llms import OpenAI
llm = OpenAI(temperature=0.4, max_tokens=500)


In [55]:
# Model loaded through CTransformers from the given model file name.
# This assignment replaces whatever `llm` was bound to before this cell.
llm = CTransformers(
    model="llama-2-7b-chat.ggmlv3.q4_0.bin",
    model_type="llama",
    config={"max_new_tokens": 512, "temperature": 0.8},
)

In [98]:
# Re-import is required here: an earlier cell rebinds `Pinecone` to the class
# from the `pinecone` package, and this cell needs the LangChain vector store.
from langchain.vectorstores import Pinecone

# Despite its name, `retriever` holds the vector store; the QA cell calls
# `.as_retriever()` on it.
# text_key must name the metadata field the chunk text was stored under.
# store_embeddings_in_batches writes {"text": ...}, so the key is "text"
# (the previous value "text_field" matched no stored metadata).
# NOTE(review): the ValueError captured in the QA cells' output suggests the
# installed LangChain wrapper calls index.query() positionally, which the
# installed pinecone SDK rejects — the two package versions need to be aligned.
retriever = Pinecone(
    index=index,
    embedding_function=embeddings.embed_query,
    text_key="text",
)

In [None]:
# docsearch = Pinecone.from_texts([t.page_content for t in text_chunks], embeddings, index_name=index_name)


In [99]:
# Build the retrieval QA chain: the two nearest chunks (k=2) are fetched for
# each question and passed to the LLM with the custom prompt; the source
# documents are returned next to the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever.as_retriever(search_kwargs={"k": 2}),
    chain_type="stuff",
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)


In [95]:
# results = index.query(
#     vector=[-0.0480710939,0.0220199078,0.0480854511,0.0471340045,0.0228304509,0.024082629,-0.0180700477,-0.0367366113,0.0076965834,-0.0147731835,-0.0220640562,0.0626425,-0.039848987,-0.0503015742,0.0568747148,-0.0390409194,-0.0107560577,0.0152228102,-0.028261397,0.0546746776,0.018619271,0.0483462177,-0.0840559527,0.102691017,-0.0890759155,0.0551148131,-0.00726613123,-0.051211305,-0.0139321648,0.00345990644,-0.0235272218,0.0467962883,-0.0264769252,0.0547096096,-0.0301403366,0.0778461769,-0.00945522,0.0296926126,-0.0870746747,0.0225620791,-0.00737218605,-0.0219303928,-0.0272690188,-0.0016999268,0.046583496,0.0266712569,0.00464030402,-0.0397419557,0.00270164968,0.0296819583,-0.0872843191,0.0385912843,0.0276299026,-0.123561278,0.0524900071,-0.0690621734,-0.0650957897,0.0701912716,0.0267540701,0.0168243796,-0.0512761287,-0.018579945,0.0164868403,0.0194774922,-0.0107553108,-0.0576801337,-0.0321543477,-0.0052367528,0.00246993639,0.0124155572,0.0594022684,-0.0279889833,-0.030514041,-0.0358040594,0.0154721094,0.0321903676,-0.0720412508,-0.0511956811,0.0446088,-0.0578097329,0.010312153,-0.0300835408,-0.0180432722,0.058203619,0.0293400828,0.00590091385,-0.0336831361,-0.0414428525,-0.0334314257,0.066028893,-0.0152334282,-0.0067407703,0.125101149,0.087555483,0.0207560342,0.0213052817,-0.0217933767,-0.0410858355,-0.0340576544,-0.0236238409,0.0402893797,0.0862539485,-0.0561452955,0.0994284153,0.0233305395,0.00583802583,0.0498696119,-0.0265950914,0.0433041975,0.04750387,0.0445346907,0.0406458154,0.0396460295,-0.0815653205,-0.0110182548,0.00302393641,0.0130197247,0.0354106,-0.0716614425,0.0483038276,-0.0274856146,-0.135025904,-0.0785327777,-0.0151764816,-0.00941660721,0.017939141,-0.00668939203,-5.65239168e-34,0.103783928,-0.0166477524,0.00166915567,0.0130775953,0.000265182229,-0.051467225,-0.00668821624,0.0351878554,-0.0616116971,-0.00288291136,0.0970257819,-0.111432485,0.0134278154,0.0905345455,-0.0958961174,-0.00397220533,-0.044352103,0.073408179,0.0350770652,0.0511753783,-0.00
939828157,-0.0614238791,-0.053540919,0.0330665559,-0.116391182,-0.00563627435,-0.0355232432,-0.0127586564,-0.0480275676,0.0449922048,-0.0530929,-0.122893199,0.0288129952,-0.0117415283,-0.0950573534,0.0690377653,0.0109994588,0.0233480744,-0.0166500453,0.0882170647,-0.0293450654,0.0470978133,-0.0209855,0.0258144885,0.0720629692,-0.04011539,0.0318795033,-0.0116128763,-0.0951302499,-0.00799894,-0.120378889,-0.00761219673,0.0681178793,0.11095456,-0.00301376265,0.0303589087,-0.0129762441,0.00396001432,-0.0291603524,-0.0106366593,0.0412965529,0.085503459,-0.0571197048,0.100491099,0.0604595579,-0.0278961826,0.072792314,-0.00525786495,-0.0218534544,0.0471068732,0.00401403,-0.0516057387,-0.0130950138,-0.0323170237,-0.0358864367,0.0704488,-0.0792154148,0.00325535121,-0.00828292873,0.0010921139,-0.0374940373,-0.000653291587,0.11405579,-0.0112041077,0.036764767,0.0514387302,0.0454862975,-0.0973399505,-0.0624708384,-0.0812404454,0.0488413312,-0.00117818976,0.0748991743,-0.0306873322,-0.00137307192,5.59193775e-34,-0.0931831747,0.076030612,-0.0248329174,-0.0872301906,-0.0160146933,-0.07602299,-0.0743591785,-0.0271349214,-0.0627073199,-0.0815272629,0.0141088655,-0.022460686,-0.00513183419,0.0237818062,0.000524199277,-0.037881095,-0.00775378803,0.0123464782,-0.0278658327,-0.026840372,0.0435530953,-0.0422143936,0.00368586439,0.104097433,-0.0435990877,-0.0406731032,-0.139294416,-0.0482569598,0.0163217615,0.0170403756,0.013987856,-0.0172812361,-0.0490129106,0.00620007701,-0.032016322,0.0467485823,-0.0357383117,0.0836545378,0.00714643672,0.00967190135,0.0249760039,-0.013828204,0.0792837441,-0.109048709,0.0595303923,0.0309921559,0.0343028829,0.0292180479,0.00395207759,0.0505535752,-0.030698631,0.0135923252,0.0433993749,0.116730303,-0.000431084278,-0.0564798638,0.015803868,0.0491378084,-0.0582195409,0.00353888934,-0.015335436,0.0459425822,0.0189744812,0.0942206234,0.0526315,0.0252951123,0.031536743,-0.010214611,0.0411963724,-0.0125377653,0.0334128104,0.0428594,-0.0303886812,0.112260021,-0.
00659317,0.0105131269,-0.00057198311,0.100197397,0.00247866311,0.0078318771,0.0948264822,0.0565105416,0.0244708415,0.031063851,-0.11013893,0.0455361083,-0.000550756289,-0.0455176719,0.00587729,-0.0241888743,-0.106334724,-0.0246128272,-0.0718777478,-0.0086594671,-0.0458747819,-2.493476e-8,-0.0609016754,-0.0409405939,-0.00432570605,-0.077493377,-0.0200448465,-0.014709577,0.0288388729,0.115417354,0.0433455445,-0.00300591579,-0.0132852532,0.0215052553,0.097537972,-0.0581805445,0.0654124618,0.00908899866,0.0816581,0.0058848015,0.0164939556,-0.00600451045,0.0789105,-0.0548660569,0.0985500664,0.0618752949,0.0348489285,-0.0989661,-0.00186964741,-0.0380772091,0.0240659583,0.00767628383,-0.00822813064,-0.0202115346,0.0417998396,0.0545573309,0.0428555273,-0.0537735373,-0.0779182687,0.005105271,-0.0218389761,0.0500095114,-0.0521191359,0.000380685291,0.0418583415,0.0234106984,0.00823558867,-0.0655485392,0.0263501722,0.0181486011,0.00106963271,-0.0818818882,-0.046919018,0.11221,0.0638047308,-0.0338171721,-0.0537240878,-0.0215752255,-0.0418794267,0.0592523552,0.0366374478,-0.0396691151,0.0897180811,0.0533716865,-0.0401794389,0.0377415903],  
#     top_k=10 
# )
# print(results)


{'matches': [{'id': 'chunk-3315', 'score': 1.00014913, 'values': []},
             {'id': 'chunk-9973', 'score': 0.651795149, 'values': []},
             {'id': 'chunk-5207', 'score': 0.650316238, 'values': []},
             {'id': 'chunk-11952', 'score': 0.639147401, 'values': []},
             {'id': 'chunk-5206', 'score': 0.637801, 'values': []},
             {'id': 'chunk-3316', 'score': 0.61419481, 'values': []},
             {'id': 'chunk-9987', 'score': 0.606882751, 'values': []},
             {'id': 'chunk-2508', 'score': 0.575321674, 'values': []},
             {'id': 'chunk-7541', 'score': 0.526298761, 'values': []},
             {'id': 'chunk-10282', 'score': 0.525085926, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}


In [100]:
# One-off question through the QA chain.
que = "What is acne"
result = qa(que)
# The answer is read from the "result" key, the same key the interactive loop
# cell uses; "results" is not a key of the chain output.
print(result["result"])

ValueError: The argument order for `query()` has changed; please use keyword arguments instead of positional arguments. Example: index.query(vector=[0.1, 0.2, 0.3], top_k=10, namespace='my_namespace')

In [76]:
# Interactive question/answer loop.
# The loop ends on an empty line, on "exit"/"quit", or when input is
# interrupted/closed, so the cell can finish without killing the kernel.
while True:
    try:
        user_input = input("Input Prompt:")
    except (EOFError, KeyboardInterrupt):
        break
    if user_input.strip().lower() in {"", "exit", "quit"}:
        break
    result = qa({"query": user_input})
    print("Response : ", result["result"])

ValueError: The argument order for `query()` has changed; please use keyword arguments instead of positional arguments. Example: index.query(vector=[0.1, 0.2, 0.3], top_k=10, namespace='my_namespace')