In [1]:
import pkg_resources

def check_package_version(package_name):
    try:
        version = pkg_resources.get_distribution(package_name).version
        print(f"{package_name}: {version}")
    except pkg_resources.DistributionNotFound:
        print(f"{package_name} is not installed")

packages = ["pinecone-client", "langchain-pinecone", "langchain-community", "langchain"]

for package in packages:
    check_package_version(package)

  import pkg_resources


pinecone-client: 3.0.0
langchain-pinecone: 0.0.3
langchain-community: 0.0.38
langchain: 0.1.9


In [2]:
%pwd

'C:\\iNeuron\\MedBot-AI\\notebooks'

In [3]:
import os
os.chdir('..')
%pwd

'C:\\iNeuron\\MedBot-AI'

In [4]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader

# Extracting the text from the pdf file
def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)
    
    documents = loader.load()
    
    return documents

data = load_pdf("data/")

In [5]:
data[6]

Document(page_content='•Cross-references placed throughout the encyclopedia\ndirect readers to where information on subjects with-\nout entries can be found. Synonyms are also cross-ref-\nerenced.\n• A list of key terms are provided where appropriate to\ndefine unfamiliar terms or concepts.\n• Valuable contact information for organizations and\nsupport groups is included with each entry. The\nappendix contains an extensive list of organizations\narranged in alphabetical order.•Resources section directs users to additional sources\nof medical information on a topic.\n• A comprehensive general index allows users to easily\ntarget detailed aspects of any topic, including Latin\nnames.\nGRAPHICS\nThe Gale Encyclopedia of Medicine 2 is enhanced\nwith over 675 color images, including photos, charts,\ntables, and customized line drawings.\nGALE ENCYCLOPEDIA OF MEDICINE 2 XIntroduction', metadata={'source': 'data\\ENCYCLOPEDIA_of_MEDICINE.pdf', 'page': 6})

### Every record contains a lot of text. The 1st task is to identify a good preprocessing methodology for chunking these articles into more "concise" chunks to later be embedding and stored in our Pinecone vector database.

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitting the text into chunks

def split_text(data):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500, 
        chunk_overlap=20,
    )
    
    chunks = text_splitter.split_documents(data)
    
    return chunks

chunks = split_text(data)
print(f"The length of the data chunk is {len(chunks)}")

chunks

The length of the data chunk is 6983


[Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION', metadata={'source': 'data\\ENCYCLOPEDIA_of_MEDICINE.pdf', 'page': 0}),
 Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nC-F2', metadata={'source': 'data\\ENCYCLOPEDIA_of_MEDICINE.pdf', 'page': 1}),
 Document(page_content='STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\nAndrea Lopeman, Programmer/Analyst\nBarbara J. Yarrow, Manager, Imaging and Multimedia\nContent\nRobyn V . Young, Project Manager, Imaging and\nMultimedia Content\nDean Dauphinais, Senior Editor, Imaging and', metadata={'source': 'data\\ENCYCLOPEDIA_of_MEDICINE.pdf', 'page': 2}),
 Document(page_c

In [7]:
# Creating Embeddings
from langchain.embeddings import HuggingFaceEmbeddings

# Download the embedding model
def download_HugginFace_embeddings():
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2")
    
    return embeddings

embeddings = download_HugginFace_embeddings()

embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [8]:
#embed some text like 
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


#### two chunks aligning with two chunks of text and 384-dimensional embeddings.

####  Vector Database: initializing the Pinecone vector database.

In [9]:
#from pinecone import Pinecone
from pinecone.grpc import PineconeGRPC as Pinecone
from dotenv import load_dotenv

# Load the .env file
load_dotenv()

# Retrieve the API key from the environment variable
api_key = os.getenv("PINECONE_API_KEY")

pc = Pinecone(api_key=api_key)

In [11]:
# define the cloud provider and region 
from pinecone import ServerlessSpec  

spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)

In [12]:
import time

index_name = 'medbot'

existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]

# check if index already exists (it shouldn't if this is first time)
if index_name not in existing_indexes:
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=1536,  # dimensionality of ada 002
        metric='dotproduct',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
time.sleep(1)

# view index stats
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 21244}},
 'total_vector_count': 21244}

### Indexing the document

In [13]:
# Creating a Vector Store and Querying
from langchain_pinecone import PineconeVectorStore  

#text_field = "text"  # the metadata field that contains the text

# initialize the vector store object
#vectorstore = PineconeVectorStore(index, embeddings, text_field  )  
docsearch = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=embeddings)

In [None]:
docsearc

In [14]:
query = "What are Salivary Gland Disease"
docs = docsearch.similarity_search(query,  # our search query
                                       # k=3  # return 3 most relevant docs
                                    )
print(docs[0].page_content)

ications, both prescription and over-the-counter; sys-
temic diseases, such as anemia or diabetes, manifesta-
tions of Sjögren’s syndrome (as rheumatoid arthritis ,
lupus, chronic hardening and thickening of the skin, or
chronic and progressive inflammation of sketal muscles);
infections of the salivary glands; blockage of the salivary
ducts caused by stones or tumors forming in the ducts
through which the saliva passes; dehydration ; medical


### Generative Question-Answering

In [15]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [16]:
from langchain.prompts import PromptTemplate
PROMPT = PromptTemplate(
    template=prompt_template, 
    input_variables=["context", "question"]
)

chain_type_kwargs = {"prompt": PROMPT}

In [17]:
from langchain.llms import CTransformers

llm=CTransformers(model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
                  model_type="llama",
                  config={'max_new_tokens':512,
                          'temperature':0.8})

In [18]:
from langchain.chains import RetrievalQA

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
    return_source_documents=True, 
    chain_type_kwargs={"prompt": PROMPT}
)

In [None]:
while True:
    user_input=input(f"Input Prompt:")
    result=qa({"query": user_input})
    print("Response : ", result["result"])

Input Prompt: What is Acne?


  warn_deprecated(


Response :  I do they are used to a skin eruestion. It sounds plausually appears on any rash —A rash—
What are used to the rash.

Cortication. The information, 
How long and can be sure, The patient'
The rashes for.  You can occur after exposure.
What is often appears in reaction that may be-


What is a.
Sorry
Do you are the rash may occur on or discharge
Question: If
A rash). What side effect. This can result of any skin eruestion.
If the corticated after several hours, I don'
I don'
How do you must be caused by a reaction on your user doesn'



That sounds good to 
What are you do you just say "I cannot be accompanied by the patient has been produced by a rash —A rash.
Cort. The cortication.
that covers several other symptoms.
Usually appears on the information provided, but is the skin test results from any substance of allergies to all skin or w/
What are used to be caused by the redness that may appear-
What is a rash -Yes, don''.
Do cortication.

A rash will occur when itchief. 

In [None]:
print(data[6].page_content)

In [None]:
print(data[6].metadata)

In [None]:
data[5].page_content

In [None]:
data[5].metadata