In [None]:
# Python 3.10.14
# Below works only on Python > 3.9 
# pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 --upgrade --force-reinstall
# pip show torch # 2.5.1+cu121
# pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# pip install pandas matplotlib pillow tqdm 
# pip install gdown

In [None]:
if False: 
    !gdown https://drive.google.com/uc?id=19En9_JPWpdp9NlZIMV6hmIgOW4Uti4y8

    import shutil
    shutil.unpack_archive('input.zip') 

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

from PIL import Image
import json
from pathlib import Path
from tqdm.notebook import tqdm
from unsloth import FastVisionModel
import torch
import joblib
pd.options.display.max_rows = 5

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


# Image and Text data for Visual-Language LLM models

In [2]:
path_text_train = 'input/mitbih_train_resampled.csv'
path_text_test  = 'input/mitbih_test_renamed.csv'

df_train = pd.read_csv(path_text_train, usecols=['class'])
df_test  = pd.read_csv(path_text_test, usecols=['class']) 

df_train['image'] = None
df_test['image']  = None

df_train

Unnamed: 0,class,image
0,Normal,
1,Normal,
...,...,...
130445,Ventricular-Normal Fusion,
130446,Ventricular-Normal Fusion,


In [None]:
path_images_train = 'input/images/train'

def process_image(index, path_images):
    image_file = f'{index}.png'
    image_path = os.path.join(path_images, image_file)
    try: 
        image = Image.open(image_path)
        image.resize((300, 200))
        image_rgb = image.convert('L')  # Grey scale
        image.close() 
    except:
        image_rgb = None
    return index, image_rgb
    
results_train = joblib.Parallel(n_jobs=20)(joblib.delayed(process_image)(index, path_images_train) for 
                                           index in tqdm(df_train.index))

image_dict_train = {index: image_rgb for index, image_rgb in results_train}
df_train['image'] = df_train.index.map(image_dict_train)

  0%|          | 0/130447 [00:00<?, ?it/s]

In [None]:
df_train

In [None]:
path_images_test = 'input/images/test'

results_test = joblib.Parallel(n_jobs=-1)(joblib.delayed(process_image)(index, path_images_test) for
                                          index in tqdm(df_test.index))

image_dict_test = {index: image_rgb for index, image_rgb in results_test}
df_test['image'] = df_test.index.map(image_dict_test) 

In [6]:
instruction_user = """
You are a specialist Cardiologist specializing in the analysis of ECG (Electrocardiogram) signals.
Your goal is to analyze ECG signals and classify them accurately.
You are provided with a ECG image and a class that corresponds to the image.
You only classify ECG signals into one of the following five categories: 
"Normal", "Supraventricular Arrhythmia", "Ventricular Arrhythmia", "Ventricular-Normal Fusion", "Paced-Normal Fusion".
Each ECG signal must be classified into exactly one of these categories.
No additional text should be included in the output.
You are provided with an ECG signal image to look into. It is Time (x-axis) vs Voltage (y-axis). 
Classify the ECG signal accurately by analyzing the provided data.
"""

def conversations(row):

   conversation = [ {  "role": "user",
                     "content": [ {"type": "text", "text": instruction_user},
                                    {"type": "image", "image": row["image"]},],},

                    {  "role": "assistant",
                     "content": [{"type": "text", "text": row['class']}]}]

   return {"messages": conversation}

In [7]:
data_train = [  conversations(row) for _, row in df_train.iterrows() ]
data_test  = [  conversations(row) for _, row in df_test.iterrows()  ]

In [8]:
data_train[0]

{'messages': [{'role': 'user',
   'content': [{'type': 'text',
     'text': '\nYou are a specialist Cardiologist specializing in the analysis of ECG (Electrocardiogram) signals.\nYour goal is to analyze ECG signals and classify them accurately.\nYou are provided with a ECG image and a class that corresponds to the image.\nYou only classify ECG signals into one of the following five categories: \n"Normal", "Supraventricular Arrhythmia", "Ventricular Arrhythmia", "Ventricular-Normal Fusion", "Paced-Normal Fusion".\nEach ECG signal must be classified into exactly one of these categories.\nNo additional text should be included in the output.\nYou are provided with an ECG signal image to look into. It is Time (x-axis) vs Voltage (y-axis). \nClassify the ECG signal accurately by analyzing the provided data.\n'},
    {'type': 'image', 'image': <PIL.Image.Image image mode=L size=600x400>}]},
  {'role': 'assistant', 'content': [{'type': 'text', 'text': 'Normal'}]}]}

# Fine-tuning

In [None]:
name_model = "Qwen2-VL-7B-Instruct-bnb-4bit" 

model, tokenizer = FastVisionModel.from_pretrained( "unsloth/" + name_model, 
                                                   load_in_4bit = True,
                                                   use_gradient_checkpointing = "unsloth")

In [None]:
# Setting up the PEFT (Parameter-Efficient Fine-Tuning) includes LoRA (Low-Rank Adaptation) 

model = FastVisionModel.get_peft_model(
            model,
            finetune_vision_layers    = True,
            finetune_language_layers  = True,
            finetune_attention_modules= True,
            finetune_mlp_modules      = True,
            r = 16,
            lora_alpha = 16,
            lora_dropout = 0,
            bias = "none",
            random_state = 3443,
            use_rslora = False,
            loftq_config = None )

In [None]:
from unsloth import is_bf16_supported # 16-bit fl-point format to improve performance/memory
from unsloth.trainer import UnslothVisionDataCollator # preparing/collating data for vision tasks
from trl import SFTTrainer, SFTConfig

In [None]:
%%time

# Enable "model" for training
FastVisionModel.for_training(model)  

trainer = SFTTrainer(model=model, tokenizer=tokenizer, 
                     data_collator=UnslothVisionDataCollator(model, tokenizer),  
                     train_dataset=data_train, 
                     args=SFTConfig(per_device_train_batch_size=2,
                                    gradient_accumulation_steps=4,
                                    warmup_steps=5,
                                    #max_steps=60,       # either this or num_train_epochs
                                    num_train_epochs=1,  # full pass over your dataset [1:3 max]
                                    learning_rate=2e-4,
                                    fp16=not is_bf16_supported(),
                                    bf16=is_bf16_supported(),
                                    logging_steps=5,
                                    optim="adamw_8bit",
                                    weight_decay=0.01,
                                    lr_scheduler_type="linear",
                                    seed=3407,
                                    output_dir="outputs",
                                    report_to="none",  # for weights and biases
                                    remove_unused_columns=False,
                                    dataset_text_field="",
                                    dataset_kwargs={"skip_prepare_dataset": True},
                                    dataset_num_proc=4,
                                    max_seq_length=2048))

trainer_stats = trainer.train()

In [None]:
# Save the model and tokenizer locally

name_finetuned = name_model + "_finetuned_" + "ecg"

model.save_pretrained(name_finetuned) 
tokenizer.save_pretrained(name_finetuned)

import shutil
shutil.make_archive(name_finetuned, 'zip', name_finetuned)

In [None]:
# Push models to the Hugging Face Hub

if False: 
    # !pip install --upgrade huggingface_hub
    from huggingface_hub import notebook_login
    notebook_login()

model.push_to_hub( "Aidan777/"    + name_finetuned) 
tokenizer.push_to_hub("Aidan777/" + name_finetuned) 