In [1]:
import os
import re
import torch
import random
import warnings
import pandas as pd
from datasets import Dataset
from peft import (
    LoraConfig,
    get_peft_model,
    PeftModel,
    PeftConfig,
    prepare_model_for_int8_training
)

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)
warnings.filterwarnings('ignore')

2023-07-27 07:57:46.372562: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-27 07:57:46.556456: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()), 'data', 'med_ques.csv'))
print(df.shape)
df.head()

(16333, 4)


Unnamed: 0,instruction,input,output,text
0,"If you are a doctor, please answer the medical...",What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,"Below is an instruction that describes a task,..."
1,"If you are a doctor, please answer the medical...",What causes Glaucoma ?,"Nearly 2.7 million people have glaucoma, a lea...","Below is an instruction that describes a task,..."
2,"If you are a doctor, please answer the medical...",What are the symptoms of Glaucoma ?,Symptoms of Glaucoma Glaucoma can develop in ...,"Below is an instruction that describes a task,..."
3,"If you are a doctor, please answer the medical...",What are the treatments for Glaucoma ?,"Although open-angle glaucoma cannot be cured, ...","Below is an instruction that describes a task,..."
4,"If you are a doctor, please answer the medical...",What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,"Below is an instruction that describes a task,..."


In [3]:
df['text'][0]

"Below is an instruction that describes a task, paired with an input that provides further context. \nWrite a response that appropriately completes the request.\n\n### Instruction: \nIf you are a doctor, please answer the medical questions based on user's query.\n\n### Input: What is (are) Glaucoma ?\n\n### Response: Glaucoma is a group of diseases that can damage the eye's optic nerve and result in vision loss and blindness. While glaucoma can strike anyone, the risk is much greater for people over 60. How Glaucoma Develops  There are several different types of glaucoma. Most of these involve the drainage system within the eye. At the front of the eye there is a small space called the anterior chamber. A clear fluid flows through this chamber and bathes and nourishes the nearby tissues. (Watch the video to learn more about glaucoma. To enlarge the video, click the brackets in the lower right-hand corner. To reduce the video, press the Escape (Esc) button on your keyboard.) In glaucoma

In [4]:
INTRO = "Below is an instruction that describes a task, paired with an input that provides further context. \nWrite a response that appropriately completes the request."
INSTRUCTION_FORMAT = (
    """{intro}\n\n### Instruction: \n{instruction}\n\n### Input: {input}\n\n ### Response: """
)

def load_model_tokenizer_for_generate(pretrained_model_name_or_path: str):
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path, padding_side="left"
    )
    model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)
    return model, tokenizer



In [5]:
def generate_response(
    instruction: str,
    input_text: str,
    *,
    model,
    tokenizer,
    do_sample: bool = True,
    max_new_tokens: int = 512,
    top_p: float = 0.92,
    top_k: int = 0,
    **kwargs,
) -> str:
    input_ids = tokenizer(
        INSTRUCTION_FORMAT.format(
            intro=INTRO, instruction=instruction, input=input_text
        ),
        return_tensors="pt",
    ).input_ids
    gen_tokens = model.generate(
        input_ids=input_ids,
        pad_token_id=tokenizer.pad_token_id,
        do_sample=do_sample,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        top_k=top_k,
        **kwargs,
    )
    decoded = tokenizer.batch_decode(gen_tokens)[0]

    # The response appears after "### Response:".  The model has been trained to append "### End" at the end.
    m = re.search(r"#+\s*Response:\s*(.+?)#+\s*End", decoded, flags=re.DOTALL)

    response = None
    if m:
        response = m.group(1).strip()
    else:
        # The model might not generate the "### End" sequence before reaching the max tokens.  In this case, return
        # everything after "### Response:".
        m = re.search(r"#+\s*Response:\s*(.+)", decoded, flags=re.DOTALL)
        if m:
            response = m.group(1).strip()
        else:
            print(f"Failed to find response in:\n{decoded}")

    return response



In [12]:
if __name__ == "__main__":
    base_model = "EleutherAI/gpt-j-6B"
    peft_model_id = os.path.join(os.path.dirname(os.getcwd()), 'results', 'checkpoint-8165')
    config = PeftConfig.from_pretrained(peft_model_id)
    model = AutoModelForCausalLM.from_pretrained(base_model, return_dict=True, 
                                                 load_in_8bit=True, torch_dtype = torch.float16)
    trained_model = PeftModel.from_pretrained(model, peft_model_id)
    tokenizer = AutoTokenizer.from_pretrained(base_model)

    print("Welcome to the response generation program!")
    while True:
        instruction = "If you are a doctor, please answer the medical questions based on user's query"
        input_text = input("Enter the input text: ")
        response = generate_response(
            instruction=instruction,
            input_text=input_text,
            model=trained_model,
            tokenizer=tokenizer,
            
            max_new_tokens = 100,
        )
        print('*' * 100)
        print("Generated Response:")
        print(response)
        print('*' * 100)

        continue_generation = input("Do you want to continue (yes/no)? ").lower()
        if continue_generation != "yes":
            print("Exiting the response generation program.")
            break



Welcome to the response generation program!


Enter the input text:  what is the symptomps of cholera?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


****************************************************************************************************
Generated Response:
The symptoms of cholera include : 
    
 - vomiting of fluids and solids 
 - rapid and profuse bowel movements 
 - extreme thirst 
 - excessive urine output 
 - weakness 
 - abdominal pain 
 - dim or bluish skin color 
 - cool and clay-colored skin 
 - sunken eyes 
    
 Vomiting of fluids and solids     When cholera enters the
****************************************************************************************************


Do you want to continue (yes/no)?  yes
Enter the input text:  Let's go for a walk.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


****************************************************************************************************
Generated Response:
-  What is the outlook for Peripheral Vascular Disease?  -  Diagnosis of Peripheral Vascular Disease   -  Treatments for Peripheral Vascular Disease   -  Key Points for the Home Healthcare Provider: Peripheral Vascular Disease   -  The first step in home care for a patient with peripheral vascular disease (PVD) is education. A home healthcare professional can teach patients and their family members how to recognize symptoms of PVD and
****************************************************************************************************


Do you want to continue (yes/no)?  Cook me some good food.


Exiting the response generation program.


In [9]:
text = """i love you too!


### Instruction: What is (are) bronchopulmonary dysplasia?
                
Bronchopulmonary dysplasia (BPD) is a term used to describe a severe breathing disorder that occurs in premature infants. The lungs of premature infants may not mature (grow larger) well enough to provide sufficient amounts of oxygen. 
                
Bronchopulmonary dysplasia affects one or both lungs. Treatment may include oxygen and medicines given by a machine (ventilators) to help with breathing. Other treatments may include medications to treat inflammation.
                
The cause of bronchopulmonary dysplasia is not well understood, but many babies are born prematurely. Possible causes may include infection during pregnancy and problems during delivery. 
                
Some infants with bronchopulmonary dysplasia do not fully recover. Infants who suffer from bronchopulmonary dysplasia have a higher risk for long-term problems and disabilities.
                
More on this topic
                
Bronchopulmonary dysplasia is similar to another respiratory disorder called chronic lung disease. In most cases, the two conditions have different causes.
                
About two out of every 1,000 infants are born prematurely with a serious lung disorder called respiratory distress syndrome (RDS). RDS develops as a complication during birth. During pregnancy, the tiny blood vessels in the lungs (capillaries) cannot grow and stretch to supply oxygen to the developing baby. This can slow down or even stop the growth of the baby's lungs.
                
Respiratory distress syndrome is the most common cause of death in premature infants. Many infants who survive are left with lifelong breathing problems, such as bronchopulmonary dysplasia.
                
More information is provided in the NID"""

In [11]:
len(text)

1827