<a href="https://colab.research.google.com/github/sAndreotti/MedicalMeadow/blob/main/ATML_part2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Libraries

In [30]:
!pip install datasets accelerate peft bitsandbytes transformers trl==0.12.0 plotly huggingface_hub
!pip install --upgrade smart_open
!pip install --upgrade gensim
!pip install ffmpeg-python
!pip install -U openai-whisper
!pip install scipy librosa unidecode inflect



In [31]:
from datasets import load_dataset
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
from trl import SFTTrainer
import re
from gensim.models.word2vec import Word2Vec
import plotly.express as px
import random
from sklearn.manifold import TSNE
from torch.utils.data import Dataset
from torch.utils.data import random_split
from peft import prepare_model_for_kbit_training, LoraConfig
from huggingface_hub import login
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments
)
from datasets import Dataset as HFDataset
from peft import AutoPeftModelForCausalLM
import whisper
from IPython.display import Audio

## Investigate Dataset

In [32]:
ds = load_dataset("medalpaca/medical_meadow_medical_flashcards")
ds = ds['train']
ds

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 33955
})

In [33]:
print(ds.features, "\n")
print("Instruction:")
print(f"length: {len(ds['instruction'])}")
print(f"example: {ds['instruction'][0]} \n")

print(f"Input:")
print(f"length: {len(ds['input'])}")
print(f"example: {ds['input'][0]} \n")

print(f"Output:")
print(f"length: {len(ds['output'])}")
print(f"example: {ds['output'][0]} \n")

{'input': Value(dtype='string', id=None), 'output': Value(dtype='string', id=None), 'instruction': Value(dtype='string', id=None)} 

Instruction:
length: 33955
example: Answer this question truthfully 

Input:
length: 33955
example: What is the relationship between very low Mg2+ levels, PTH levels, and Ca2+ levels? 

Output:
length: 33955
example: Very low Mg2+ levels correspond to low PTH levels which in turn results in low Ca2+ levels. 



### Some plots about the dataset

In [34]:
instructions = ds['instruction']
input_phrases = ds['input']
output_phrases = ds['output']

## Train and evaluate models

#### Create dataset

In [35]:
class MedDataset(Dataset):
    def __init__(self, dataset, tokenizer):
        self.dataset = dataset
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        example = self.dataset[idx]
        messages = [
            {"role": "system", "content": example['instruction']},
            {"role": "user", "content": example['input']},
            {"role": "assistant", "content": example['output']}
        ]

        prompt = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        tokens = self.tokenizer(
            prompt,
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )

        tokens['labels'] = tokens['input_ids'].clone()
        tokens['labels'][tokens['input_ids'] == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": tokens['input_ids'].squeeze(),
            "attention_mask": tokens['attention_mask'].squeeze(),
            "labels": tokens['labels'].squeeze()
        }

In [36]:
login(token="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")

base_model = "meta-llama/Llama-3.2-1B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token = tokenizer.eos_token

# tokenizer.padding_side = "right"

In [37]:
train_dataset, val_dataset, test_dataset = random_split(ds, [0.8, 0.1, 0.1])

train_dataset = MedDataset(train_dataset, tokenizer)
val_dataset = MedDataset(val_dataset, tokenizer)

print(f"Train dataset dimension: {len(train_dataset)}")
print(f"Validation dataset dimension: {len(val_dataset)}")
print(f"Test dataset dimension: {len(test_dataset)}")

Train dataset dimension: 27165
Validation dataset dimension: 3395
Test dataset dimension: 3395


In [38]:
compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_representation="nested"
)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0},
    torch_dtype=torch.float32,
    trust_remote_code=True
)
model.config.use_cache = False
model.config.pretraining_tp = 1

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

Unused kwargs: ['bnb_4bit_representation']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


In [39]:
# def convert_to_hf_dataset(med_dataset):
#     # Create lists to store all formatted text
#     formatted_texts = []

#     # Iterate through all items in the original dataset
#     for idx in range(len(med_dataset.instruction)):
#         # Get the formatted text directly using the dataset's __getitem__
#         formatted_text = med_dataset[idx]
#         formatted_texts.append(formatted_text)

#     # Create a dictionary with the required format
#     dataset_dict = {
#         'text': formatted_texts
#     }

#     # Convert to HuggingFace Dataset
#     hf_dataset = HFDataset.from_dict(dataset_dict)

#     return hf_dataset

# hf_dataset = convert_to_hf_dataset(garnachoDataset)

In [40]:
peft_params = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=16,
    bias="none",
    task_type="CAUSAL_LM",
)

In [41]:
model.train()
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    eval_strategy="steps",
    logging_steps=90,
    eval_steps = 90,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard",
    gradient_checkpointing=True
)

In [42]:
# trainer = SFTTrainer(
#     model=model,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     peft_config=peft_params,
#     max_seq_length=256,
#     tokenizer=tokenizer,
#     args=training_params,
#     packing=False,
# )

# # Train the model
# trainer.train()

# # Save the model and tokenizer
# trainer.save_model("./fine-tuned-model")
# tokenizer.save_pretrained("./fine-tuned-model")

In [43]:
# trainer.model.save_pretrained("model-chatbot-medical-mew")
# trainer.tokenizer.save_pretrained("model-chatbot-medical-mew")

## Test Trained Model

### Load pre-trained model

In [44]:
trained_model = "/kaggle/input/medicalllm/pytorch/default/1/model-chatbot-medical-mew"
question = "What does low Mobility suggest?"

model = AutoPeftModelForCausalLM.from_pretrained(
    trained_model, # change with folder where u have the files
    quantization_config=quant_config,
    device_map={"": 0},
    torch_dtype=torch.float32,
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(trained_model)

In [45]:
messages = [{"role": "system", "content": instructions[0]},
    {"role": "user", "content": question}]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

model_inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

outputs = model.generate(**model_inputs, max_new_tokens=128)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Question: {question}")
print(text.split("assistant")[1])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What does low Mobility suggest?


Low mobility may suggest a condition such as osteoporosis or atherosclerosis.


### Compare with MedLlama

In [46]:
# Prima disinstalliamo e reinstalliamo bitsandbytes per sicurezza
!pip uninstall -y bitsandbytes
!pip install bitsandbytes
!pip install accelerate transformers

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging

  pid, fd = os.forkpty()


Found existing installation: bitsandbytes 0.45.0
Uninstalling bitsandbytes-0.45.0:
  Successfully uninstalled bitsandbytes-0.45.0
Collecting bitsandbytes
  Using cached bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Using cached bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.0


In [47]:
def setup_medAlpaca():   
    try:
        # bits and bytes version
        import bitsandbytes as bnb
        print(f"bitsandbytes version: {bnb.__version__}")
        
        # device
        print(f"CUDA available: {torch.cuda.is_available()}")
        if torch.cuda.is_available():
            print(f"CUDA version: {torch.version.cuda}")
        
        # try not quantize model
        model = AutoModelForCausalLM.from_pretrained(
            "medalpaca/medalpaca-7b",
            trust_remote_code=True,
            device_map='auto',
            torch_dtype=torch.float16
        )
        
        tokenizer = AutoTokenizer.from_pretrained("medalpaca/medalpaca-7b")
        
        return model, tokenizer
        
    except Exception as e:
        print(f"Si è verificato un errore: {str(e)}")
        print("\nInformazioni di debug:")
        print(f"Python version: {sys.version}")
        if torch.cuda.is_available():
            print(f"GPU: {torch.cuda.get_device_name()}")
        return None, None

In [48]:
# Esecuzione
modelMED, tokenizerMED = setup_medAlpaca()

bitsandbytes version: 0.45.0
CUDA available: True
CUDA version: 12.1




Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [49]:
# tokenizerMED = AutoTokenizer.from_pretrained("OpenScienceReseach/Med-LLaMA-7b")
# modelMED = AutoModelForCausalLM.from_pretrained("OpenScienceReseach/Med-LLaMA-7b") funzia ma lento

# tokenizerMED = AutoTokenizer.from_pretrained("medalpaca/medalpaca-7b")
# modelMED = AutoModelForCausalLM.from_pretrained("medalpaca/medalpaca-7b")

In [50]:
!pip install nltk
import nltk
nltk.download('punkt')

from nltk.translate.bleu_score import sentence_bleu

# BLUE score between the candiate (generate answer) and the reference answer
def calculate_bleu(reference, candidate):
    try:
      score = sentence_bleu(reference, candidate)
      return score
    except Exception as e:
      print(f"Error during BLUE computing: {e}")
      return None

  pid, fd = os.forkpty()


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [51]:
test_dataset[0]

{'input': 'What is the mechanism responsible for the expression of tissue-restricted self-antigens in the thymus for negative selection, and what is the name of the protein that mediates this mechanism?',
 'output': 'The mechanism responsible for the expression of tissue-restricted self-antigens in the thymus for negative selection is mediated by the autoimmune regulator (AIRE) protein. AIRE is responsible for inducing the expression of tissue-specific antigens in thymic epithelial cells, which is important for the development of self-tolerance and prevention of autoimmune diseases.',
 'instruction': 'Answer this question truthfully'}

In [52]:
test = test_dataset[:10]

In [57]:
# questions = ["What does low REM sleep latency and experiencing hallucinations/sleep paralysis suggest?",
#              "What are some possible causes of low PTH and high calcium levels?",
#              "What is the term used to describe a condition of low sodium levels and very high proteins or lipids?"]

# dataset_answer = ["Low REM sleep latency and experiencing hallucinations/sleep paralysis suggests narcolepsy.",
#                   "PTH-independent hypercalcemia, which can be caused by cancer, granulomatous disease, or vitamin D intoxication.",
#                   "The term used to describe a condition of low sodium levels and very high proteins or lipids is pseudohyponatremia."]

alpaca_bleu = []
our_bleu = []

from transformers import pipeline
import time
import re

instruction = "Answer this question truthfully: "

# medAlpaca
print("MedAlpaca")
for i, question in enumerate(test['input']):
    print(f"\nQuestion: {question}")
    
    input_text = instruction + question

    # generate response
    input_ids = tokenizerMED(input_text, return_tensors='pt').to(modelMED.device)["input_ids"]
    outputs = modelMED.generate(input_ids, max_new_tokens=128)
    response = tokenizerMED.decode(outputs[0])
    clean_response = re.sub(r'.*?\? ', '', response, flags=re.DOTALL)
    
    print("   Response MedAlpaca:", clean_response)

    # blue score
    candidate = clean_response.split()
    bleu_score = calculate_bleu(test['output'][i], candidate)

    if bleu_score is not None:
        print(f"BLEU score for Llama answer {i}: {bleu_score}")   
        alpaca_bleu.append(bleu_score)

print("\nMedical Meadow")
# medical meadow
for i, question in enumerate(test['input']):
    print(f"\nQuestion: {question}")
    
    # generate response
    messages = [{"role": "system", "content": instructions[0]},
    {"role": "user", "content": question}]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")
    outputs = model.generate(**model_inputs, max_new_tokens=128)
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    our = text.split("assistant")[1]
    
    print("Response Medical Meadow: ", our)

    # bleu score
    candidate = our.split()
    bleu_score = calculate_bleu(test['output'][i], candidate)

    if bleu_score is not None:
        print(f"BLEU score for Medical Meadow answer {i}: {bleu_score}")   
        our_bleu.append(bleu_score)

MedAlpaca

Question: What is the mechanism responsible for the expression of tissue-restricted self-antigens in the thymus for negative selection, and what is the name of the protein that mediates this mechanism?
   Response MedAlpaca: The mechanism responsible for the expression of tissue-restricted self-antigens in the thym
BLEU score for Llama answer 0: 0

Question: What type of organisms do 1st generation cephalosporins have activity mostly against?
   Response MedAlpaca: Gram positive organisms.</s><s>
BLEU score for Llama answer 1: 0

Question: What are some of the metabolic abnormalities associated with von Gierke disease?
   Response MedAlpaca: </s> Answer this question truthfully: What are some of the metabolic abnormalities associated with von Gierke disease?

The metabolic abnormalities associated with von Gierke disease include increased levels of lactate, glucose, and lipids in the blood.</s><s>
BLEU score for Llama answer 2: 0

Question: What factor most stimulates gastri

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


   Response MedAlpaca: Etanercept is a medication that is primarily used to treat Rheumatoid Arthritis.</s><s>
BLEU score for Llama answer 4: 0.537284965911771

Medical Meadow

Question: What is the mechanism responsible for the expression of tissue-restricted self-antigens in the thymus for negative selection, and what is the name of the protein that mediates this mechanism?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Response Medical Meadow:  

The mechanism responsible for the expression of tissue-restricted self-antigens in the thymus for negative selection is mediated by the TCRγβ (CD161) protein. TCRγβ is a receptor that is expressed on the surface of T cells and helps to distinguish between self and non-self antigens. When T cells encounter self-antigens, TCRγβ binds to them and activates a signaling cascade that promotes their differentiation into a regulatory T cell, which is then deleted from the thymus. This process helps to prevent T cells from attacking self-antigens and ensures that the immune system does
BLEU score for Medical Meadow answer 0: 0.38503228868787126

Question: What type of organisms do 1st generation cephalosporins have activity mostly against?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Response Medical Meadow:  

1st generation cephalosporins have activity mostly against Gram-positive bacteria.
BLEU score for Medical Meadow answer 1: 0

Question: What are some of the metabolic abnormalities associated with von Gierke disease?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Response Medical Meadow:  

Von Gierke disease is a metabolic disorder that is characterized by a deficiency of the enzyme glucose-6-phosphofructokinase, which is involved in the conversion of fructose-6-phosphate to glucose-6-phosphate. This leads to an accumulation of fructose-6-phosphate, which can cause a range of metabolic abnormalities, including:

* Hyperglycemia (high blood sugar)
* Hyperlipidemia (elevated cholesterol and triglycerides)
* Lipid metabolism disorders
* Hypoglycemia (low blood sugar)
* Increased risk of developing fatty liver disease
* High levels
BLEU score for Medical Meadow answer 2: 0.34095107969299543

Question: What factor most stimulates gastrin secretion in the stomach?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Response Medical Meadow:  

Gastrin secretion in the stomach is most stimulated by H2R2 receptors, which are located on the parietal cells of the stomach lining. When H2R2 receptors are activated by histamine, they send a signal to the parietal cells to secrete gastrin, which stimulates the release of gastric acid and digestive enzymes. This helps to break down food in the stomach and prepare it for further digestion in the small intestine.
BLEU score for Medical Meadow answer 3: 0.346966645494591

Question: What is the primary use of etanercept, and what condition is it used to treat?
Response Medical Meadow:  

Etanercept is a monoclonal antibody that is primarily used to treat rheumatoid arthritis (RA) and psoriatic arthritis (PsA).
BLEU score for Medical Meadow answer 4: 0.4854917717073234


In [54]:
from tabulate import tabulate

table = list(zip([0,1,2,3,4,5,6,7,8,9], alpaca_bleu, our_bleu))
print('BLEU score')
print(tabulate(table, headers=['Question', 'MedAlpaca', 'MedicalMeadow'], tablefmt='grid'))

import numpy
bleu_alpaca = np.mean(alpaca_bleu)
bleu_med = np.mean(our_bleu)
print(f"\n-> Average BLEU score for MedAlpaca model: {bleu_alpaca}")
print(f"-> Average BLEU score for MedicalMeadow model: {bleu_med}")

BLEU score
+------------+-------------+-----------------+
|   Question |   MedAlpaca |   MedicalMeadow |
|          0 |    0        |        0.461737 |
+------------+-------------+-----------------+
|          1 |    0        |        0        |
+------------+-------------+-----------------+
|          2 |    0        |        0.321157 |
+------------+-------------+-----------------+
|          3 |    0.323772 |        0        |
+------------+-------------+-----------------+
|          4 |    0.326497 |        0.562341 |
+------------+-------------+-----------------+

-> Average BLEU score for MedAlpaca model: 0.1300538748354739
-> Average BLEU score for MedicalMeadow model: 0.26904708650012193
