## Vision Transformer for Skin Cancer Detection

In [None]:
import evaluate
import torch
from transformers import ViTForImageClassification, Trainer, TrainingArguments
from transformers import ViTImageProcessor
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
import datasets
import pandas as pd
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import h5py
import cv2
from PIL import Image
from transformers import pipeline
import seaborn as sns
import matplotlib.pyplot as plt
import safetensors.torch

## Grabbing Data
This code loads the training and test metadata tables for the ISIC 2024 challenge. The images themselves are read later from the HDF5 file.

In [None]:
# Root folder holding the ISIC 2024 challenge files
BASE_PATH = "isic-2024-challenge"

# Train + validation metadata; missing values are forward-filled from the preceding row
df = pd.read_csv(f'{BASE_PATH}/train-metadata.csv').ffill()
display(df.head(2))

# Test metadata, loaded and filled the same way
testing_df = pd.read_csv(f'{BASE_PATH}/test-metadata.csv').ffill()
display(testing_df.head(2))

## Preprocessing the Data
You'll notice the dataset is heavily skewed towards 'target:0'. If you run the following code block, you'll see the Class Distribution Before Sampling (%):

    target_0:99.902009

    target_1:0.097991

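The following code rebalances the data by keeping a 1% random sample of the 'target:0' rows and oversampling the 'target:1' rows five-fold (with replacement), so that the ratio of 'target:0' and 'target:1' is much closer: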

    target_0:67.09645

    target_1:32.90355

We will use this data in our following steps.


In [None]:
# Handle class imbalance: downsample the majority class (target == 0) and
# oversample the minority class (target == 1) with replacement.
print("Class Distribution Before Sampling (%):")
display(df.target.value_counts(normalize=True)*100)
seed = 1
neg_sample = .01  # fraction of target == 0 rows to keep
pos_sample = 5.0  # target == 1 rows are drawn 5x over (needs replace=True)

# Sampling. "negative" means target == 0 and "positive" means target == 1.
negative_df = df.query("target==0").sample(frac=neg_sample, random_state=seed)
positive_df = df.query("target==1").sample(frac=pos_sample, replace=True, random_state=seed)
# Shuffle with the same seed so the row order (and every later split that
# depends on it) is reproducible from run to run.
df = pd.concat([negative_df, positive_df], axis=0).sample(frac=1.0, random_state=seed)

print("\nClass Distribution After Sampling (%):")
display(df.target.value_counts(normalize=True)*100)

# Balanced per-class weights for the resampled frame, keyed by position in
# the sorted unique labels of df['target'].
class_weights = compute_class_weight('balanced', classes=np.unique(df['target']), y=df['target'])
class_weights = dict(enumerate(class_weights))
print("Class Weights:", class_weights)

### Training, Validation, and Test Data

This script splits our data into training (60%), validation (20%), and test (20%) sets, using five stratified folds grouped by patient so that no patient appears in more than one set.

In [None]:
from sklearn.model_selection import StratifiedGroupKFold

# Read-only handle to the HDF5 image store; later cells read images from it
training_validation_hdf5 = h5py.File(f"{BASE_PATH}/train-image.hdf5", 'r')

# The splitter yields positional indices, so make the index labels 0..n-1
df = df.reset_index(drop=True)
df["fold"] = -1

# Five stratified folds, grouped by patient_id
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=seed)
fold_splits = sgkf.split(df, y=df.target, groups=df.patient_id)

# Tag every row with the number of the fold in which it is held out
for fold_number, (_, held_out_idx) in enumerate(fold_splits):
    df.loc[held_out_idx, "fold"] = int(fold_number)

# Fold 0 -> test, fold 1 -> validation, folds 2-4 -> training
test_df = df[df["fold"] == 0]
validation_df = df[df["fold"] == 1]
training_df = df[df["fold"] > 1]

# Report the size of each split
print(f"# Num Train: {len(training_df)} | Num Valid: {len(validation_df)} | Num Test: {len(test_df)}")


# The Model
Here we load the image processor for the Vision Transformer (ViT) and preprocess the images. The model takes 224x224 inputs, so each image is resized and normalized first.

In [None]:
# Image processor of the pretrained checkpoint; supplies the normalisation statistics
processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')

# Pipeline: resize to 224x224 -> convert to tensor -> normalise with the processor's mean/std
resize_step = Resize((224, 224))
normalize_step = Normalize(mean=processor.image_mean, std=processor.image_std)
transform = Compose([resize_step, ToTensor(), normalize_step])

def preprocess_images(example):
    """Decode the image for one metadata row and attach its model input.

    Args:
        example: A dataset row (dict-like) whose ``isic_id`` value names an
            entry in ``training_validation_hdf5``.

    Returns:
        The same ``example`` with a ``pixel_values`` entry holding the
        output of ``transform`` for the decoded image.

    Raises:
        ValueError: If the stored bytes cannot be decoded as an image.
    """
    isic_id = example["isic_id"]
    byte_string = training_validation_hdf5[isic_id][()]
    nparr = np.frombuffer(byte_string, np.uint8)
    bgr = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
    if bgr is None:
        # imdecode signals failure by returning None rather than raising
        raise ValueError(f"Could not decode image for isic_id {isic_id!r}")
    # OpenCV decodes to BGR; reverse the channel axis to get RGB for PIL
    image = Image.fromarray(bgr[..., ::-1])
    example['pixel_values'] = transform(image)
    return example

# Wrap each split as a Hugging Face dataset and run the image preprocessing
# over it one row at a time.
ds_training, ds_valid, ds_test = (
    datasets.Dataset.from_pandas(pd.DataFrame(data=split_df)).map(preprocess_images, batched=False)
    for split_df in (training_df, validation_df, test_df)
)

## Evaluation Metric
Since this is a classification problem, we use accuracy as the evaluation metric.

In [18]:
# Accuracy scorer from the `evaluate` library
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score a batch of evaluation outputs with the accuracy metric.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions
        (taken over the last axis of the logits) against the labels.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted, references=references)

In [None]:
# Pretrained checkpoint used for both the image processor and the model
model_name = 'google/vit-base-patch16-224-in21k'
processor = ViTImageProcessor.from_pretrained(model_name)
# Request the two-class head explicitly (target 0 / target 1) instead of
# relying on the configuration default; the prediction cell further down
# expects exactly the labels LABEL_0 and LABEL_1.
model = ViTForImageClassification.from_pretrained(model_name, num_labels=2)

# Convert a list of dataset rows into the tensors the model expects
def collate_fn(batch):
    """Assemble one training batch.

    Args:
        batch: Sequence of rows, each with ``pixel_values`` (array-like
            image data) and ``target`` (class label).

    Returns:
        Dict with ``pixel_values`` (the per-row tensors stacked along a new
        first axis) and ``labels`` (a tensor of the targets).
    """
    images = [torch.tensor(row['pixel_values']) for row in batch]
    targets = [row['target'] for row in batch]
    return {'pixel_values': torch.stack(images), 'labels': torch.tensor(targets)}

# Fine-tuning configuration. Evaluation and checkpointing both happen once
# per epoch, and the checkpoint with the best accuracy is reloaded at the end.
training_args = TrainingArguments(
    # Output and logging locations
    output_dir='./vit-finetuned-agent0',
    logging_dir='./logs',
    logging_steps=10,
    # Schedule
    num_train_epochs=20,
    per_device_train_batch_size=16,
    # Evaluation / checkpoint selection
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Keep all dataset columns; collate_fn reads 'pixel_values' and 'target'
    remove_unused_columns=False,
)

# Trainer wired to the training/validation datasets, collator and metric
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=processor,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=ds_training,
    eval_dataset=ds_valid,
)

In [None]:
# Fine-tune the model
trainer.train()
# Save the fine-tuned model into the training output directory. The name must
# end in the digit "0" (not the letter "O") to match output_dir and the
# checkpoint path that the evaluation cell loads from.
trainer.save_model('./vit-finetuned-agent0')

In [None]:
# Score the held-out test split. Passing it as an argument leaves
# trainer.eval_dataset pointing at the validation split, so re-running the
# training cell afterwards does not select the best model on test data.
trainer.evaluate(eval_dataset=ds_test)

# Evaluation of Model
This code evaluates the final model on the test dataset and plots the confusion matrix.

In [None]:
def process_test_set(ds, num_samples):
    """Decode up to ``num_samples`` images from the start of ``ds``.

    Args:
        ds: Dataset exposing ``isic_id`` and ``target`` columns and ``len()``.
            Each ``isic_id`` must name an entry in
            ``training_validation_hdf5``.
        num_samples: Maximum number of rows to take. Values larger than
            ``len(ds)`` are clamped; values <= 0 yield empty lists.

    Returns:
        Tuple ``(inputs, labels)``: a list of RGB ``PIL.Image`` objects and
        the list of matching ``target`` values.

    Raises:
        ValueError: If the stored bytes of an image cannot be decoded.
    """
    limit = max(0, min(num_samples, len(ds)))
    # Read only the two needed columns. Indexing whole rows (ds[i]) would
    # fetch every column of the row, and the old loop did so twice per image.
    isic_ids = ds["isic_id"][:limit]
    labels = list(ds["target"][:limit])

    inputs = []
    for isic_id in isic_ids:
        byte_string = training_validation_hdf5[isic_id][()]
        nparr = np.frombuffer(byte_string, np.uint8)
        bgr = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
        if bgr is None:
            # imdecode signals failure by returning None rather than raising
            raise ValueError(f"Could not decode image for isic_id {isic_id!r}")
        # OpenCV decodes to BGR; reverse the channel axis to get RGB for PIL
        inputs.append(Image.fromarray(bgr[..., ::-1]))
    return inputs, labels

# Decode the first 1000 test rows and peek at the first ten images
test_sample = process_test_set(ds_test, 1000)
inputs, y_true = test_sample
print(inputs[:10])

In [None]:
# Rebuild the architecture, then overwrite its weights with a saved checkpoint
model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224-in21k")
model.load_state_dict(
    safetensors.torch.load_file("vit-finetuned-agent0/checkpoint-228/model.safetensors")
)

image_classifier = pipeline("image-classification", model, image_processor=processor)
raw_outputs = image_classifier(inputs)
# For each image keep the top-scoring label, mapping LABEL_1 -> 1 and anything else -> 0
predictions = [
    1 if max(candidates, key=lambda c: c['score'])['label'] == 'LABEL_1' else 0
    for candidates in raw_outputs
]


In [None]:
# Confusion matrix of predicted vs. true labels for the sampled test images
confusion_metric = evaluate.load("confusion_matrix")
confusion_matrix = confusion_metric.compute(predictions=predictions, references=y_true)
matrix = confusion_matrix['confusion_matrix']
# Take tick labels from the metric output when present; otherwise derive
# them from the values that occur in the predictions and references
labels = (
    confusion_matrix['labels']
    if 'labels' in confusion_matrix
    else np.unique(predictions + y_true)
)
ax = sns.heatmap(matrix, annot=True, fmt="d", xticklabels=labels, yticklabels=labels, cmap="Blues")
ax.set_xlabel("Predicted labels")
ax.set_ylabel("True labels")
ax.set_title("Confusion Matrix")
plt.show()