<a href="https://colab.research.google.com/github/qasimzee/isic2024/blob/main/classifier101_scikit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install fastai h5py pillow
!pip install scikit-learn==1.2.2
!pip install numpy==1.26.4
import numpy as np
import pandas as pd
import os
from fastai.vision.all import *

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data paths used below live under it.
drive.mount('/content/drive')

In [None]:
!ls /content/drive/MyDrive/kaggle/isic-2024-data/


In [None]:
# Locations, on the mounted Google Drive, of the training image archive (HDF5)
# and the training metadata table (CSV).
HDF5_FILE = "/content/drive/MyDrive/kaggle/isic-2024-data/train-image.hdf5"
METADATA_FILE = "/content/drive/MyDrive/kaggle/isic-2024-data/train-metadata.csv"

In [None]:
import pandas as pd

# Read the complete training metadata table from disk.
metadata_df = pd.read_csv(METADATA_FILE)

# Every row labelled target == 1 is kept.
df_target_1 = metadata_df.loc[metadata_df['target'] == 1]

# Of the rows labelled target == 0, only the first 5000 (in file order) are kept.
df_target_0 = metadata_df.loc[metadata_df['target'] == 0].iloc[:5000]

# Stack both subsets (target == 1 rows first) into the working table and show it.
metadata_df = pd.concat([df_target_1, df_target_0])
metadata_df

In [None]:
# Show the rows of the sampled metadata that are labelled target == 1.
metadata_df[metadata_df['target'] == 1]

In [None]:
from sklearn.model_selection import train_test_split

# metadata_df must provide the columns 'isic_id' and 'target'.
# Hold out 1% of the rows for evaluation. The split is stratified on 'target'
# so the small held-out set keeps the class ratio of the full table; without
# it the held-out set may end up with almost no positive cases.
train_df, test_df = train_test_split(
    metadata_df,
    test_size=0.01,
    random_state=42,
    stratify=metadata_df['target'],
)

# Image geometry and batching used by the data generator, the model and the
# inference loop.
target_size = (56, 56)  # handed to cv2.resize, which takes (width, height)
batch_size = 1024
# Model input is (height, width, channels); derived so it cannot drift from target_size.
input_shape = (target_size[1], target_size[0], 3)

# Show the target == 1 rows that ended up in the held-out set.
test_df[test_df['target'] == 1]

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Setup ImageDataGenerator with augmentations
# NOTE(review): the generator below feeds images already divided by 255 into
# datagen.random_transform — confirm brightness_range behaves as intended on
# inputs in [0, 1].
datagen = ImageDataGenerator(
    rotation_range=60,            # Random rotations up to 60 degrees
    width_shift_range=0.2,        # Horizontal shifts up to 20% of the width
    height_shift_range=0.2,       # Vertical shifts up to 20% of the height
    brightness_range=[0.8, 1.2],  # Random brightness factor picked from [0.8, 1.2]
    zoom_range=0.5,               # Random zoom factor picked from [0.5, 1.5]
    horizontal_flip=True,         # Random horizontal flips
    fill_mode='nearest'           # Fill mode for points outside the boundaries
)


In [None]:
import h5py
import numpy as np
import cv2

def load_and_preprocess_images(image_bytes, target_size):
  """Decode one encoded image, resize it and scale its pixel values by 1/255.

  Args:
    image_bytes: Bytes-like buffer holding the encoded image, as accepted by
      ``np.frombuffer`` / ``cv2.imdecode``.
    target_size: Output size handed to ``cv2.resize``.

  Returns:
    NumPy array containing the resized image divided by 255.0.

  Raises:
    ValueError: If the buffer is empty or cannot be decoded as an image.
  """
  encoded = np.frombuffer(image_bytes, np.uint8)
  if encoded.size == 0:
    raise ValueError("Cannot decode an image from an empty buffer")

  # Decode the bytes into an image array (cv2 decodes colour images as BGR).
  image = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
  if image is None:
    # cv2.imdecode reports failure by returning None instead of raising, which
    # would otherwise surface later as an opaque error inside cv2.resize.
    raise ValueError("Could not decode image bytes (corrupt or unsupported data)")

  # Resize the image to the target size
  resized_image = cv2.resize(image, target_size)

  # Normalize the image; the division already produces a new float array
  return np.asarray(resized_image / 255.0)


In [None]:

import h5py
import numpy as np
import cv2

def hdf5_data_generator(hdf5_file, metadata_df, batch_size, target_size=(56, 56)):
  """Endlessly yield augmented image batches and their labels from an HDF5 file.

  The rows of ``metadata_df`` are walked in order, ``batch_size`` at a time;
  after the last (possibly smaller) batch the walk restarts from the first row.

  Args:
    hdf5_file: Path of the HDF5 file; it is indexed by ``str(isic_id)`` and
      each entry is read as the encoded bytes of one image.
    metadata_df: DataFrame with the columns ``'isic_id'`` and ``'target'``
      describing the samples to serve.
    batch_size: Maximum number of samples per yielded batch (must be > 0).
    target_size: Size passed on to ``load_and_preprocess_images``.

  Yields:
    Tuple ``(augmented_images, batch_labels)``: a NumPy array of the images
    after ``datagen.random_transform`` and the matching ``'target'`` values.

  Raises:
    ValueError: If ``metadata_df`` has no rows or ``batch_size`` is not
      positive (either would otherwise make the generator loop forever
      without yielding).
  """
  # Use the frame that was passed in, not a notebook-level global.
  num_samples = len(metadata_df)
  if num_samples == 0:
    raise ValueError("metadata_df is empty; there is nothing to generate batches from")
  if batch_size <= 0:
    raise ValueError("batch_size must be a positive integer, got: %r" % (batch_size,))

  isic_ids = metadata_df['isic_id']
  labels = metadata_df['target']

  with h5py.File(hdf5_file, 'r') as hf:
    while True:
      for start in range(0, num_samples, batch_size):
        end = min(start + batch_size, num_samples)

        batch_labels = labels.iloc[start:end].values
        batch_images = [
          load_and_preprocess_images(hf[str(isic_id)][()], target_size)
          for isic_id in isic_ids.iloc[start:end]
        ]

        # Apply augmentations using datagen
        augmented_images = np.array([datagen.random_transform(image) for image in batch_images])

        yield augmented_images, batch_labels

In [None]:
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras import layers, models
from sklearn.utils.class_weight import compute_class_weight

# Load the pre-trained EfficientNetB0 model without the top layers
# NOTE(review): the Keras EfficientNet documentation states the model expects
# pixel values in [0, 255], while the generator yields images divided by 255 —
# confirm the input scaling.
base_model = EfficientNetB0(weights='imagenet', include_top=False, input_shape=input_shape)

# Freeze the base model so only the layers added below are trained
base_model.trainable = False

# Add custom layers on top
model = models.Sequential([
    base_model,
    layers.GlobalAveragePooling2D(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(1, activation='sigmoid')  # Single output for binary classification
])

# Calculate class weights based on the training set
class_weights = compute_class_weight('balanced', classes=np.unique(train_df['target']), y=train_df['target'])
class_weight_dict = dict(enumerate(class_weights))
# NOTE(review): class_weight_dict is computed but not handed to fit() below —
# confirm whether the weighting is meant to be applied.

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# The generator yields `batch_size` rows per step, so one pass over train_df
# takes ceil(len(train_df) / batch_size) steps. Dividing by a different number
# would make each "epoch" cover the training set many times over.
steps_per_epoch = int(np.ceil(len(train_df) / batch_size))

# Train the model
model.fit(hdf5_data_generator(HDF5_FILE, train_df, batch_size, target_size),
          steps_per_epoch=steps_per_epoch,
          epochs=3)  # Start with fewer epochs to see how it performs

Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Epoch 1/3
[1m 43/166[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m14:55[0m 7s/step - accuracy: 0.8632 - loss: 0.3375

In [None]:
import h5py
import numpy as np
import cv2
from tensorflow.keras.models import load_model


# Score the held-out rows batch by batch and collect one record per image.
results = []

with h5py.File(HDF5_FILE, 'r') as hf:
    num_samples = len(test_df)

    for start in range(0, num_samples, batch_size):
        end = min(start + batch_size, num_samples)
        batch_isic_ids = test_df['isic_id'].iloc[start:end].values

        # Reuse the training-time preprocessing so both paths stay in sync.
        batch_images = np.stack([
            load_and_preprocess_images(hf[str(isic_id)][()], target_size)
            for isic_id in batch_isic_ids
        ])

        # One predict call per batch instead of one per image; the model emits
        # a single sigmoid output per image, flattened here to one score each.
        batch_predictions = model.predict(batch_images).ravel()

        results.extend(
            {'isic_id': isic_id, 'target': prediction}
            for isic_id, prediction in zip(batch_isic_ids, batch_predictions)
        )

# Naming the columns keeps the frame well-formed even when there are no rows.
results_df = pd.DataFrame(results, columns=['isic_id', 'target'])

# results_df.to_csv('predictions.csv', index=False)




In [None]:
# Show the held-out rows that are labelled target == 1.
test_df[test_df['target'] == 1]

In [None]:
# Display the per-image prediction table.
results_df

In [None]:
# import joblib

# # Save the trained model to a file
# joblib_filename = 'random_forest_model.joblib'
# joblib.dump(model, joblib_filename)


In [None]:
# import pickle

# # Save the trained model to a file
# pickle_filename = '/content/drive/MyDrive/kaggle/isic-2024-data/cnn.pkl'
# with open(pickle_filename, 'wb') as file:
#     pickle.dump(model, file)


In [None]:
# import pickle

# # Load the model from the pkl file
# with open(pickle_filename, 'rb') as file:
#     model = pickle.load(file)


In [None]:
import numpy as np
import pandas as pd
import pandas.api.types
from sklearn.metrics import roc_curve, auc, roc_auc_score

class ParticipantVisibleError(Exception):
    '''Error raised by the metric for problems found in the submission itself.'''


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str, min_tpr: float=0.80) -> float:
    '''
    2024 ISIC Challenge metric: pAUC

    Given a solution file and submission file, this function returns the
    partial area under the receiver operating characteristic (pAUC)
    above a given true positive rate (TPR) = 0.80.
    https://en.wikipedia.org/wiki/Partial_Area_Under_the_ROC_Curve.

    (c) 2024 Nicholas R Kurtansky, MSKCC

    Args:
        solution: ground truth pd.DataFrame of 1s and 0s, plus the row id column
        submission: pd.DataFrame of prediction scores ranging [0, 1], plus the
            row id column
        row_id_column_name: name of the id column; it is excluded from scoring.
            Neither input frame is modified.
        min_tpr: TPR above which the area is measured. Internally the curve is
            flipped and integrated up to max_fpr = abs(1 - min_tpr).

    Returns:
        Float value range [0, max_fpr]

    Raises:
        KeyError: if row_id_column_name is missing from either frame
        ParticipantVisibleError: if the submission values are not numeric
        ValueError: if min_tpr leads to a max_fpr outside (0, 1]
    '''

    # Drop the id column on copies: deleting it in place would strip the column
    # from the caller's frames and make a second call fail with KeyError.
    solution = solution.drop(columns=[row_id_column_name])
    submission = submission.drop(columns=[row_id_column_name])

    # check submission is numeric
    if not pandas.api.types.is_numeric_dtype(submission.values):
        raise ParticipantVisibleError('Submission target column must be numeric')

    # rescale the target. set 0s to 1s and 1s to 0s (since sklearn only has max_fpr)
    v_gt = abs(np.asarray(solution.values)-1)

    # flip the submissions to their complements
    v_pred = -1.0*np.asarray(submission.values)
    max_fpr = abs(1-min_tpr)

    # using sklearn.metric functions: (1) roc_curve and (2) auc
    fpr, tpr, _ = roc_curve(v_gt, v_pred, sample_weight=None)

    # max_fpr is always a float here, so only the full-curve case needs a check
    if max_fpr == 1:
        return auc(fpr, tpr)
    if max_fpr <= 0 or max_fpr > 1:
        raise ValueError("Expected min_tpr in range [0, 1), got: %r" % min_tpr)

    # Add a single point at max_fpr by linear interpolation
    stop = np.searchsorted(fpr, max_fpr, "right")
    x_interp = [fpr[stop - 1], fpr[stop]]
    y_interp = [tpr[stop - 1], tpr[stop]]
    tpr = np.append(tpr[:stop], np.interp(max_fpr, x_interp, y_interp))
    fpr = np.append(fpr[:stop], max_fpr)

    partial_auc = auc(fpr, tpr)

    return partial_auc

# solution = test_df[['isic_id', 'target']]
# predictions = results_df[['isic_id', 'target']]

# partial_auc = score(solution, predictions, "isic_id")
# partial_auc