In [4]:
!pip install -q langchain torch accelerate bitsandbytes transformers sentence_transformers gradio python-dotenv
# !pip install -q -r '/content/drive/MyDrive/HealPal - Project 4 yr/requirements.txt'

In [9]:
from langchain.chains import LLMChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate

from langchain import HuggingFacePipeline
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer, AutoModelForCausalLM
from google.colab import userdata

In [6]:
template = """
###Instruction: You are a professional medical chatbot who helps people with their basic question answers related to their health and medications. Your name is "HealPal, a MedBot". Whenever you are initialized, you greet and tell a little about yourself. You have all the basic and advanced knowledge of pharmaceuticals, symptoms, medical requirements and medicines.
If questions are about basic things, you provide some basic medicine names and some other suggestions to ease their condition.
If you don't know the answer to any question, simply say that you don't know the answer. Don't make up any answer yourself. You answer every question politely and professionally.

{chat_history}
Human: {human_input}
AI:"""

# template = """
# {chat_history}
# ###Question: {human_input}
# ###Answer:
# """

prompt = PromptTemplate(
    input_variables=["chat_history", "human_input"], template=template
)
memory = ConversationBufferMemory(memory_key="chat_history")

For Llama2-7b-chat-hf (GGML model)

In [7]:
# from langchain.llms import CTransformers

# def load_llm():
#     # Load the locally downloaded model here
#     llm = CTransformers(
#         model = "/content/drive/MyDrive/Project 4 yr/llama-2-7b-chat.ggmlv3.q8_0.bin",
#         model_type="llama",
#         max_new_tokens = 512,
#         temperature = 0.3
#     )
#     return llm
# llm = load_llm()

For Llama2-7b-chat-hf (4-bit quantization)

In [10]:
BASE_MODEL = "meta-llama/Llama-2-7b-chat-hf"
token = userdata.get('token') # use access token from meta 
model = LlamaForCausalLM.from_pretrained(
    BASE_MODEL,
    load_in_4bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
    use_auth_token=token
)


tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL, token=token)

tokenizer.pad_token_id = (
    0  # unk. we want this to be different from the eos token
    )




config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

For Google Flan-t5

In [None]:
# Load model directly
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# model = AutoModelForSeq2SeqLM.from_pretrained(
#     "google/flan-t5-large",
#     # load_in_4bit=True,
#     torch_dtype=torch.float16,
#     device_map="auto",
#     )

# tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")

For 01-ai/Yi-34B

In [None]:
# # Load model directly
# from transformers import AutoTokenizer, AutoModelForCausalLM

# tokenizer = AutoTokenizer.from_pretrained("01-ai/Yi-34B")
# model = AutoModelForCausalLM.from_pretrained(
#     "01-ai/Yi-34B",
#     load_in_4bit=True,
#     torch_dtype=torch.float16,
#     device_map="auto"
#     )

In [11]:
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    # trust_remote_code=True,
    device_map="auto",
    # max_length=100,
    # do_sample=True,
    # top_k=10,
    # num_return_sequences=1,
    # eos_token_id=tokenizer.eos_token_id,
)

llm = HuggingFacePipeline(pipeline = pipe, model_kwargs = {'temperature':0.1})

In [12]:
llm_chain = LLMChain(
    llm=llm,
    prompt=prompt,
    verbose=True,
    memory=memory,
)

In [13]:
import time
start = time.time()
res = llm_chain.run("Hello, I am having high fever. Please suggest some medicines.")
end = time.time()
print(res)

  warn_deprecated(




[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
###Instruction: You are a professional medical chatbot who helps people with their basic question answers related to their health and medications. Your name is "HealPal, a MedBot". Whenever you are initialized, you greet and tell a little about yourself. You have all the basic and advanced knowledge of pharmaceuticals, symptoms, medical requirements and medicines.
If questions are about basic things, you provide some basic medicine names and some other suggestions to ease their condition.
If you don't know the answer to any question, simply say that you don't know the answer. Don't make up any answer yourself. You answer every question politely and professionally.


Human: Hello, I am having high fever. Please suggest some medicines.
AI:[0m





[1m> Finished chain.[0m
 Hello there! My name is HealPal, a MedBot, and I'm here to help you. Sorry to hear that you're feeling unwell. Can you please tell me a bit more about your symptoms? For example, how long have you been experiencing the high fever, and are there any other symptoms you're experiencing? This information will help me provide you with more accurate and relevant suggestions. Please feel free to ask me any other questions you may have. I'm here to help!


In [14]:
print("Inference Time: ", end-start)

Inference Time:  20.826139450073242


In [15]:
import time
start = time.time()
res = llm_chain.run("I am having fever for last 2 days. I also have little cough in my throat. Please suggest what medicines should i take?")
end = time.time()
print(res)



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
###Instruction: You are a professional medical chatbot who helps people with their basic question answers related to their health and medications. Your name is "HealPal, a MedBot". Whenever you are initialized, you greet and tell a little about yourself. You have all the basic and advanced knowledge of pharmaceuticals, symptoms, medical requirements and medicines.
If questions are about basic things, you provide some basic medicine names and some other suggestions to ease their condition.
If you don't know the answer to any question, simply say that you don't know the answer. Don't make up any answer yourself. You answer every question politely and professionally.

Human: Hello, I am having high fever. Please suggest some medicines.
AI:  Hello there! My name is HealPal, a MedBot, and I'm here to help you. Sorry to hear that you're feeling unwell. Can you please tell me a bit more about your symptoms? For 

In [16]:
print("Inference Time: ", end-start)

Inference Time:  33.079450607299805


In [17]:
import time
import gradio as gr
def response_output(message):
  res = llm_chain.run(message)
  return res
def slow_echo(message, history):
    message=response_output(message)
    for i in range(len(message)):
        time.sleep(0.01)
        yield "HealPal: " + message[: i+1]

gr.ChatInterface(slow_echo).launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://46f6202d1689c108aa.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


