In [3]:
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
from dotenv import load_dotenv
import os
import uuid

In [4]:
# Load variables from a local .env file into the process environment so the
# os.getenv("PINECONE_API_KEY") lookup in the next cell can find the key.
load_dotenv()

True

In [5]:
# Read the Pinecone API key from the environment rather than hard-coding it.
# os.getenv returns None (no exception) when the variable is unset.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

In [6]:
def data_loader(data):
    """Load every ``*.pdf`` file matched in the directory ``data``.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` using ``PyPDFLoader`` for
        each matched file.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()

In [7]:
# Load all PDFs found under ../data.
data = data_loader("../data")
# Preview only the first few entries: echoing the whole list prints the text
# of every loaded page and bloats the saved notebook.
data[:3]

[Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION', metadata={'source': '../data/Gale Encyclopedia of Medicine All 5 Volumes Combined.pdf', 'page': 0}),
 Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nC-F2', metadata={'source': '../data/Gale Encyclopedia of Medicine All 5 Volumes Combined.pdf', 'page': 1}),
 Document(page_content='STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\nAndrea Lopeman, Programmer/Analyst\nBarbara J. Yarrow, Manager, Imaging and Multimedia\nContent\nRobyn V . Young, Project Manager, Imaging and\nMultimedia Content\nDean Dauphinais, Senior Editor, Imaging and\nMultimedia Content\nKelly A.

In [9]:
import pickle

# Persist the loaded documents so the PDF extraction does not have to be
# repeated. NOTE(review): no cell visible here reads this file back.
with open("../extracted_data.pkl", "wb") as cache_file:
    pickle.dump(data, cache_file)

In [6]:
def get_text_chunks(data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks for embedding.

    Args:
        data: Documents to split; passed straight to ``split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500,
            the value that used to be hard-coded here.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            20, the value that used to be hard-coded here.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents(data)``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(data)

In [7]:
# Split the loaded pages into embedding-sized chunks.
chunks = get_text_chunks(data)
# Preview only the first few chunks; displaying the full list floods the
# notebook output with the text of the entire corpus.
chunks[:3]

[Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION', metadata={'source': '../data/Gale Encyclopedia of Medicine All 5 Volumes Combined.pdf', 'page': 0}),
 Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nC-F2', metadata={'source': '../data/Gale Encyclopedia of Medicine All 5 Volumes Combined.pdf', 'page': 1}),
 Document(page_content='STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\nAndrea Lopeman, Programmer/Analyst\nBarbara J. Yarrow, Manager, Imaging and Multimedia\nContent\nRobyn V . Young, Project Manager, Imaging and\nMultimedia Content\nDean Dauphinais, Senior Editor, Imaging and', metadata={'source': '../dat

In [8]:
def download_hf_embeddings():
    """Create the HuggingFace embedding wrapper used throughout the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    return embedding_model

In [9]:
# Instantiate the embedding model once; it is reused for both indexing and querying.
embeddings = download_hf_embeddings()
# Echo the object so its configuration shows up in the cell output.
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={})

In [25]:
# Module-level Pinecone client. langchain_pinecone_from_texts_custom_updated
# reads this global `pc` to list and open indexes, so this cell must run first.
pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)

In [37]:
def langchain_pinecone_from_texts_custom_updated(
    texts,
    embedding,
    metadatas= None,
    ids= None,
    batch_size= 32,
    text_key="text",
    index_name= None,
    namespace= None,
    
) -> Pinecone:
    """Embed ``texts`` in batches, upsert them into an existing Pinecone index,
    and return a LangChain ``Pinecone`` vector store bound to that index.

    The index is looked up through the module-level client ``pc``; it is never
    created here.

    Args:
        texts: Sliceable, sized sequence of strings to embed and store.
        embedding: Object exposing ``embed_documents(list_of_texts)`` and
            ``embed_query``.
        metadatas: Optional sequence of dicts, one per text. The dicts are
            copied before ``text_key`` is added, so the caller's objects are
            left untouched.
        ids: Optional sequence of vector ids, one per text. Random UUID4
            strings are generated when omitted or empty.
        batch_size: Number of texts embedded and upserted per request.
        text_key: Metadata key under which the raw text is stored.
        index_name: Name of an existing index in the Pinecone project.
        namespace: Namespace passed to ``upsert`` and to the returned store.

    Returns:
        ``Pinecone(index, embedding.embed_query, text_key, namespace)``.

    Raises:
        ValueError: If the project has no indexes, if ``index_name`` is not
            among them, or if ``ids`` / ``metadatas`` are provided but contain
            fewer entries than ``texts``.
    """

    indexes = pc.list_indexes().names()  # checks if provided index exists

    if index_name in indexes:
        index = pc.Index(index_name)
    elif len(indexes) == 0:
        raise ValueError(
            "No active indexes found in your Pinecone project, "
            "are you sure you're using the right API key and environment?"
        )
    else:
        raise ValueError(
            f"Index '{index_name}' not found in your Pinecone project. "
            f"Did you mean one of the following indexes: {', '.join(indexes)}"
        )

    # Fail early instead of silently dropping texts (zip truncation on short
    # `ids`) or raising a bare IndexError mid-upload (short `metadatas`).
    if ids and len(ids) < len(texts):
        raise ValueError(
            f"Got {len(ids)} ids for {len(texts)} texts; provide one id per text."
        )
    if metadatas and len(metadatas) < len(texts):
        raise ValueError(
            f"Got {len(metadatas)} metadatas for {len(texts)} texts; "
            "provide one metadata dict per text."
        )

    for start in range(0, len(texts), batch_size):
        # Clamp the batch end so the final batch may be shorter.
        end = min(start + batch_size, len(texts))
        batch_texts = texts[start:end]

        # Use caller-provided ids when given, otherwise generate random ones.
        if ids:
            batch_ids = ids[start:end]
        else:
            batch_ids = [str(uuid.uuid4()) for _ in range(start, end)]

        batch_vectors = embedding.embed_documents(batch_texts)

        # Copy each metadata dict so adding `text_key` below does not mutate
        # the dicts owned by the caller.
        if metadatas:
            batch_metadata = [dict(meta) for meta in metadatas[start:end]]
        else:
            batch_metadata = [{} for _ in range(start, end)]
        for meta, text in zip(batch_metadata, batch_texts):
            meta[text_key] = text

        # Upsert (id, vector, metadata) triples for this batch.
        index.upsert(
            vectors=list(zip(batch_ids, batch_vectors, batch_metadata)),
            namespace=namespace,
        )
    return Pinecone(index, embedding.embed_query, text_key, namespace)
        

In [38]:
# Derive a deterministic id for every chunk (source file, page, position) so
# that re-running this cell overwrites the same vectors instead of inserting a
# second copy of the corpus under fresh random ids.
# NOTE(review): vectors uploaded by earlier runs with random ids are not
# removed by this; clear the index once if duplicates already exist.
chunk_ids = [
    str(
        uuid.uuid5(
            uuid.NAMESPACE_URL,
            f"{t.metadata.get('source')}#{t.metadata.get('page')}#{position}",
        )
    )
    for position, t in enumerate(chunks)
]
docsearch = langchain_pinecone_from_texts_custom_updated(
    [t.page_content for t in chunks],
    embeddings,
    ids=chunk_ids,
    index_name="medical-chatbot",
)

In [59]:
from langchain.docstore.document import Document
class CustomPinecone(Pinecone):
    """LangChain ``Pinecone`` store whose similarity search issues the query
    with the keyword-style ``index.query(vector=..., top_k=...)`` call."""

    def similarity_search_with_score(self, query, k=4, filter=None, namespace=None):
        """Return the ``k`` best matches for ``query`` together with their scores.

        Args:
            query: Text to embed with the store's embedding function.
            k: Number of matches to request (``top_k``). Defaults to 4.
            filter: Optional metadata filter forwarded to the index query.
            namespace: Namespace to search. When None, falls back to the
                namespace stored on this store, if any.

        Returns:
            List of ``(Document, score)`` tuples. Matches whose metadata lacks
            the store's text key are skipped.
        """
        # Honour the namespace the store was constructed with when the caller
        # does not pass one explicitly; getattr keeps this safe if the base
        # class does not define the attribute.
        if namespace is None:
            namespace = getattr(self, "_namespace", None)

        query_obj = self._embedding_function(query)
        docs = []
        # NOTE(review): the embedding is wrapped in a one-element list here;
        # the Pinecone client documents `vector` as a flat list of floats —
        # confirm the nesting is intended before changing it.
        results = self._index.query(
            vector=[query_obj],
            top_k=k,
            include_metadata=True,
            namespace=namespace,
            filter=filter,
        )
        for res in results["matches"]:
            metadata = res["metadata"]
            if self._text_key in metadata:
                # Move the stored text out of the metadata into page_content.
                text = metadata.pop(self._text_key)
                score = res["score"]
                docs.append((Document(page_content=text, metadata=metadata), score))

        return docs

In [60]:
# Rebind the already-built store to CustomPinecone in place, so later searches
# use its similarity_search_with_score override without re-creating the store.
docsearch.__class__ = CustomPinecone

In [41]:
# Prompt text for the QA chain. {context} and {question} are the two
# placeholders declared as input_variables when the PromptTemplate is built.
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [42]:
# Wrap the raw template string; both placeholders must be declared here.
PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)
# Extra keyword arguments passed to RetrievalQA.from_chain_type further down.
chain_type_kwargs = {"prompt": PROMPT}

In [43]:
# Local Llama-2 chat model loaded through CTransformers from ../model.
llm = CTransformers(
    model="../model/llama-2-7b-chat.ggmlv3.q8_0.bin",
    model_type="llama",
    config={"max_new_tokens": 512, "temperature": 0.8},
)

In [63]:
# Assemble the retrieval QA chain: the Pinecone-backed retriever is configured
# with k=2, and source documents are not included in the result.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(search_kwargs={"k": 2}),
    return_source_documents=False,
    chain_type_kwargs=chain_type_kwargs,
)

In [64]:
# Smoke test: run one question through the chain and show the model's answer.
user_input = "What is Allergic Rhinitis?"
result = qa.run(user_input)
# print (rather than a bare expression) so newlines in the answer render as text.
print(result)


Allergic rhinitis (AR) is an allergic condition that affects between 10-20% of people in the United States, causing symptoms such as sneezing, congestion, runny nose, and itchy eyes. It can be caused by a variety of allergens including pollen, dust mites, mold, and pet dander. There are two types of AR: seasonal and perennial. Seasonal AR occurs during specific times of the year, while perennial AR is present throughout the year.
