In [1]:
import os
os.chdir('/teamspace/studios/this_studio/Medical-Chatbot-')

In [2]:
DATA_DIR = 'data'

In [3]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [4]:
def load_pdf(data):
    """
    Load a PDF file from a directory.
    data (str): the directory containing the PDF file.
    """
    loader = DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )
    
    document= loader.load()
    return document

In [6]:
document_data = load_pdf(DATA_DIR)

In [7]:
len(document_data)

637

In [8]:
#Split the Data into Text Chunks
def text_split(extracted_data):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks=text_splitter.split_documents(extracted_data)
    return text_chunks

In [9]:
data_chunks=text_split(document_data)
print("Length of Text Chunks", len(data_chunks))

Length of Text Chunks 7020


In [7]:
from langchain.embeddings import HuggingFaceEmbeddings

In [8]:
HF_Embedding_model = 'microsoft/unixcoder-base'
def download_hugging_face_embeddings():
    # embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    embeddings=HuggingFaceEmbeddings(model_name=HF_Embedding_model)
    return embeddings

In [9]:
embedding_model = download_hugging_face_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name=HF_Embedding_model)
  from tqdm.autonotebook import tqdm, trange
No sentence-transformers model found with name microsoft/unixcoder-base. Creating a new one with mean pooling.


In [10]:
query_result = embedding_model.embed_query("Hello world")
embed_size = len(query_result)
print("Length", embed_size)

Length 768


In [11]:
from dotenv import load_dotenv

load_dotenv()
PINECONE_API_KEY= os.environ.get('PINECONE_API_KEY')

assert PINECONE_API_KEY is not None, "Please set the PINECONE_API_KEY environment variable"
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [12]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medibot-index"

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=embed_size,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        ) 
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)
    print(f"Created index {index_name}")
else:
    print(f"Index {index_name} already exists")

Index medibot-index already exists


In [13]:
from langchain_pinecone import PineconeVectorStore
## https://python.langchain.com/api_reference/pinecone/vectorstores/langchain_pinecone.vectorstores.PineconeVectorStore.html#pineconevectorstore
index = pc.Index(index_name)


# vector_store = PineconeVectorStore.from_existing_index(
#     documents=data_chunks,
#     index_name=index_name,
#     embedding=embedding_model, 
# )

In [14]:
vector_store = PineconeVectorStore(index=index, embedding=embedding_model)

In [26]:
# vector_store = PineconeVectorStore(index=index, embedding=embedding_model)
# vector_store.add_documents(documents=data_chunks)

['90b4d90b-a88c-4dad-9df8-366834227164',
 'f725ad21-9eb9-4342-bc79-85b448cfff17',
 'cf262920-84e7-4a94-901b-6ec4498f6eee',
 '9ce5b7c8-02d1-409e-9182-88c12009f7a1',
 '924cc341-842c-4c02-8bdb-d7f04664c13e',
 'bd778ff9-5890-4c88-97af-b8b58341faa3',
 '149827f9-a8cd-41cd-8187-7d3582ee0554',
 '9be70930-fb48-4d05-871a-0ca4c72eea4e',
 '809681d0-abf5-41ed-88ef-67290186530c',
 '51b6d244-c31b-4229-a535-c41153c91c81',
 'aa1ef83d-a4c1-42d4-933b-1d920c98a571',
 'ad28c5ee-9fbf-40d7-b46d-392406538fbb',
 'ac4cb3fc-1f47-4875-b9f1-f0aecb94537c',
 '5d4f0b3a-d69d-4a51-a66e-b8753346cc6d',
 '4b7b03d2-2814-4504-953e-f66e34df1122',
 'fa0af743-c2a1-4d9e-b65f-e7045cc19dca',
 'd65e844b-ab1d-480c-9fc0-17fb1dc575e3',
 'b0158a00-dd8f-4da3-b577-b94efd4a0411',
 '5ab1192a-d029-4fe6-abe7-17514a4e3ae2',
 '2cb5cfa0-13d7-46e9-8d3c-3e9ae09fc664',
 '1f68a22c-9aad-4951-b193-8915ef1ebb4b',
 '950ef7d3-efc9-41fe-9550-a7a22ad62c9a',
 '70fe62eb-eae3-4526-8055-a5b88ae2c440',
 'b7dc79e1-da91-40b4-a278-fc1324e07576',
 '31f3212d-70ab-

In [15]:
vector_store

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x7f5f303364a0>

In [16]:
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [17]:
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs

[Document(id='d95f0caf-b868-4a6d-acd5-f9587b0ddffb', metadata={'page': 466.0, 'source': 'data/Medical_book.pdf'}, page_content='for the abused child.'),
 Document(id='4c441560-c99e-4175-824b-df329a0d5e65', metadata={'page': 331.0, 'source': 'data/Medical_book.pdf'}, page_content='to their sense of competence'),
 Document(id='b87a1485-329e-4bf2-9564-bc42cebbbf18', metadata={'page': 158.0, 'source': 'data/Medical_book.pdf'}, page_content='following initial infection.')]

# Chain building
https://medium.com/@ynikose/building-an-intelligent-pdf-question-answering-system-with-langchain-and-llama-2-0db84c6daabb

https://medium.com/@murtuza753/using-llama-2-0-faiss-and-langchain-for-question-answering-on-your-own-data-682241488476

In [22]:
# GPU llama-cpp-python
# !CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.1.78 numpy==1.23.4 --force-reinstall --upgrade --no-cache-dir -q
# !pip install huggingface_hub -q
# !pip install llama-cpp-python==0.1.78 -q
# !pip install numpy==1.23.4 -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

## Loading the LLaMA-2 Model

In [18]:
!pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: https://pypi.org/simple, https://huggingface.github.io/autogptq-index/whl/cu118/
Collecting auto-gptq
  Downloading https://huggingface.github.io/autogptq-index/whl/cu118/auto-gptq/auto_gptq-0.7.1%2Bcu118-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m196.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate>=0.26.0 (from auto-gptq)
  Downloading accelerate-1.0.1-py3-none-any.whl.metadata (19 kB)
Collecting datasets (from auto-gptq)
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting sentencepiece (from auto-gptq)
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting rouge (from auto-gptq)
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting gekko (from auto-gptq)
  Downloading gekko-1.2.1-py3-none-any.whl.metadata (3.0 kB)
Collecting peft>=0.5.0 (

In [24]:
import torch
import transformers
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

# from huggingface_hub import hf_hub_download
# from llama_cpp import Llama

# model_name_or_path = "TheBloke/Llama-2-13B-chat-GGML"
# model_basename = "llama-2-13b-chat.ggmlv3.q5_1.bin" # the model is in bin format

# model_path = hf_hub_download(repo_id=model_name_or_path, 
#                              filename=model_basename)
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(DEVICE)
print(DEVICE)
model_name_or_path = "TheBloke/Llama-2-13B-chat-GPTQ"
model_basename = "model"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    revision="gptq-4bit-128g-actorder_True",
    model_basename=model_basename,
    use_safetensors=True,
    trust_remote_code=True,
    inject_fused_attention=False,
    device=DEVICE,
    quantize_config=None,
)

cuda:0


INFO - The layer lm_head is not quantized.
Some weights of the model checkpoint at /home/zeus/.cache/huggingface/hub/models--TheBloke--Llama-2-13B-chat-GPTQ/snapshots/8160f0b2c4c7b14a8241f7fdbe08fe300ae7f4ea/model.safetensors were not used when initializing LlamaForCausalLM: {'model.layers.9.self_attn.rotary_emb.inv_freq', 'model.layers.27.self_attn.rotary_emb.inv_freq', 'model.layers.5.self_attn.rotary_emb.inv_freq', 'model.layers.24.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.23.self_attn.rotary_emb.inv_freq', 'model.layers.33.self_attn.rotary_emb.inv_freq', 'model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.34.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.39.self_attn.rotary_emb.inv_freq', 'model.layers.1.self_attn.rotary_emb.inv_f

In [25]:
# model = model.to(DEVICE)

'/home/zeus/.cache/huggingface/hub/models--TheBloke--Llama-2-13B-chat-GGML/snapshots/3140827b4dfcb6b562cd87ee3d7f07109b014dd0/llama-2-13b-chat.ggmlv3.q5_1.bin'

In [26]:
# lcpp_llm = None
# lcpp_llm = Llama(
#     model_path=model_path,
#     n_threads=2, # CPU cores
#     n_batch=512, # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
#     n_gpu_layers=32 # Change this value based on your model and your GPU VRAM pool.
# )

llama.cpp: loading model from /home/zeus/.cache/huggingface/hub/models--TheBloke--Llama-2-13B-chat-GGML/snapshots/3140827b4dfcb6b562cd87ee3d7f07109b014dd0/llama-2-13b-chat.ggmlv3.q5_1.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 5120
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 40
llama_model_load_internal: n_head_kv  = 40
llama_model_load_internal: n_layer    = 40
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 1
llama_model_load_internal: rnorm_eps  = 5.0e-06
llama_model_load_internal: n_ff       = 13824
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 9 (mostly Q5_1)
llama_model_load_internal: model size = 13B
llama_model_load_internal: ggml ctx size =    0.11 MB
llama_model_load_intern

32

In [29]:
# from langchain.chains import create_retrieval_chain
# from langchain.chains.combine_documents import create_stuff_documents_chain
# from langchain_core.prompts import ChatPromptTemplate

DEFAULT_SYSTEM_PROMPT = """
    You are an assistant for question-answering tasks.
    Use the following pieces of retrieved context to answer
    the question. If you don't know the answer, say that you
    don't know. Use three sentences maximum and keep the
    answer concise.

    {context}
    
""".strip()

def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
    return f"""
        [INST] <>
        {system_prompt}
        <>

        {prompt} [/INST]
        """.strip()

# prompt_template = ChatPromptTemplate(
#     [
#         ("system", system_prompt),
#         ("human", "{question}"),
#     ]
# )

In [None]:
# response = lcpp_llm(
#     prompt=prompt_template,
#     max_tokens=1024,
#     temperature=0.6,
#     top_p=0.95,
#     repeat_penalty=1.2,
#     top_k=150,
#     echo=True
# )

In [None]:
# print(response["choices"][0]["text"])

In [26]:
from transformers import pipeline, TextStreamer
from langchain import HuggingFacePipeline, PromptTemplate
from langchain.chains import RetrievalQA


streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

text_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=1024,
    temperature=0,
    top_p=0.95,
    repetition_penalty=1.15,
    streamer=streamer,
)


The model 'LlamaGPTQForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCaus

In [30]:
llm = HuggingFacePipeline(pipeline=text_pipeline, model_kwargs={"temperature": 0.2})

template = generate_prompt(
    """
Question: {question}
""",
    system_prompt=DEFAULT_SYSTEM_PROMPT,
)


In [34]:
prompt = PromptTemplate(template=template, input_variables=["question"])

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever= vector_store.as_retriever(search_type="similarity", search_kwargs={"k":3}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt},
)

In [35]:
result = qa_chain("what is Acromegaly and gigantism?")

  result = qa_chain("what is Acromegaly and gigantism?")
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


 Sure! Here's my response based on the provided context:

Acromegaly and gigantism refer to conditions where there is excessive growth and enlargement of body tissues due to excessive production of growth hormone (GH) after normal skeletal maturation. This can lead to physical symptoms such as tall stature, large hands and feet, and coarsening facial features. However, I do not have information about the bactericidal ability of aminoglycosides or their relevance to these conditions.


In [38]:
result

{'query': 'what is Acromegaly and gigantism?',
 'result': "[INST] <>\n        You are an assistant for question-answering tasks.\n    Use the following pieces of retrieved context to answer\n    the question. If you don't know the answer, say that you\n    don't know. Use three sentences maximum and keep the\n    answer concise.\n\n    The bactericidal ability of aminoglycosides has not\n\nDiagnosis\nBecause acromegaly produces slow changes over\n\nangiography isperformed in a cardiac catheterization laboratory and\n        <>\n\n        \nQuestion: what is Acromegaly and gigantism?\n [/INST]  Sure! Here's my response based on the provided context:\n\nAcromegaly and gigantism refer to conditions where there is excessive growth and enlargement of body tissues due to excessive production of growth hormone (GH) after normal skeletal maturation. This can lead to physical symptoms such as tall stature, large hands and feet, and coarsening facial features. However, I do not have information 

In [39]:
qa_chain("Whic medicine should i take for feaver?")



 Based on the provided context, I cannot recommend any specific medicine for fever. Prochlorperazine may interact with other medications, and it is important to consult with a physician before taking any medication, especially if you have liver disease or infection, are pregnant, or breastfeeding. It is best to seek medical advice from a qualified healthcare professional for proper diagnosis and treatment.


{'query': 'Whic medicine should i take for feaver?',
 'result': "[INST] <>\n        You are an assistant for question-answering tasks.\n    Use the following pieces of retrieved context to answer\n    the question. If you don't know the answer, say that you\n    don't know. Use three sentences maximum and keep the\n    answer concise.\n\n    Interactions\nProchlorperazine may interact with other medicines.\n\nToday, Ayurvedic medicine is used by 80% of the\n\nor liverdisease or liver infections should also consult with aphysician before using the drug. So should women whoare pregnant or breastfeeding.\n        <>\n\n        \nQuestion: Whic medicine should i take for feaver?\n [/INST]  Based on the provided context, I cannot recommend any specific medicine for fever. Prochlorperazine may interact with other medications, and it is important to consult with a physician before taking any medication, especially if you have liver disease or infection, are pregnant, or breastfeeding. It is b