In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import os
import cv2
import numpy as np
from PIL import Image
import albumentations as A
import h5py
import io

In [8]:
df_train_metadata = pd.read_csv('/Users/jimmyhe/Desktop/KaggleCompetitions/ISISCANCER/isic-2024-challenge/train-metadata.csv')
len(df_train_metadata)
# train_image_ds = "isic-2024-challenge/train-image.hdf5"

  df_train_metadata = pd.read_csv('/Users/jimmyhe/Desktop/KaggleCompetitions/ISISCANCER/isic-2024-challenge/train-metadata.csv')


401059

In [None]:
condition = df_train_metadata['target'] == 1
positive_isic_ids = df_train_metadata.loc[condition, 'isic_id'].values

In [None]:
positive_isic_ids[:10]

array(['ISIC_0082829', 'ISIC_0096034', 'ISIC_0104229', 'ISIC_0119495',
       'ISIC_0157834', 'ISIC_0190307', 'ISIC_0211092', 'ISIC_0220459',
       'ISIC_0238218', 'ISIC_0275647'], dtype=object)

In [None]:
positive_isic_ids[:10], len(positive_isic_ids)

(array(['ISIC_0082829', 'ISIC_0096034', 'ISIC_0104229', 'ISIC_0119495',
        'ISIC_0157834', 'ISIC_0190307', 'ISIC_0211092', 'ISIC_0220459',
        'ISIC_0238218', 'ISIC_0275647'], dtype=object),
 393)

In [6]:

def read_h5df_file(h5_file_path, image_ids):
    with h5py.File(h5_file_path, 'r') as h5_file:
        images = []
        for img_id in image_ids:
            img_data = h5_file[img_id][()]  # Access the dataset as a string
            img = Image.open(io.BytesIO(img_data))  # Decode the string into an image
            images.append(img)
    return images

# def get_transforms(image_size=224):
#Version 1. Caused significant drop in LB score, harmful
#     transforms_train = A.Compose([
#         A.Transpose(p=0.5),
#         A.VerticalFlip(p=0.5),
#         A.HorizontalFlip(p=0.5),
#         A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.75),
#         A.CLAHE(clip_limit=4.0, p=0.7),
#         A.HueSaturationValue(hue_shift_limit=10, sat_shift_limit=20, val_shift_limit=10, p=0.5),
#         A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.05, rotate_limit=5, border_mode=cv2.BORDER_REFLECT_101, p=0.7),
#     ])

#     return transforms_train


def get_transforms(image_size=224):
    transforms_train = A.Compose([
        A.HorizontalFlip(p=0.5),
        A.RandomBrightnessContrast(brightness_limit=0.05, contrast_limit=0.05, p=0.5),
        A.ShiftScaleRotate(shift_limit=0.02, scale_limit=0.02, rotate_limit=10, border_mode=cv2.BORDER_CONSTANT, p=0.5),
        A.GaussNoise(var_limit=(0.5, 2.0), p=0.2),
        A.GaussianBlur(blur_limit=3, p=0.1),
    ])
    return transforms_train

def augment_image(image, transforms):
    augmented = transforms(image=np.array(image))
    return augmented['image']

def preprocess_image(image, target_size=224):
    image_np = np.array(image)
    h, w = image_np.shape[:2]
    scale = target_size / max(h, w)
    new_h, new_w = int(h * scale), int(w * scale)
    resized = cv2.resize(image_np, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)

    preprocessed = np.zeros((target_size, target_size, 3), dtype=np.uint8)
    x_offset = (target_size - new_w) // 2
    y_offset = (target_size - new_h) // 2
    preprocessed[y_offset:y_offset+new_h, x_offset:x_offset+new_w] = resized

    return Image.fromarray(preprocessed)

def save_augmented_image(image, original_image_id, output_directory, augmentation_number):
    img = Image.fromarray(image)
    img.save(f"{output_directory}/{original_image_id}_aug{augmentation_number}.png")

# Main program to read, augment, and save images
def main(h5_file_path, image_ids, output_directory, image_size=224, num_augmentations=5):
    images = read_h5df_file(h5_file_path, image_ids)

    preprocessed_images = [preprocess_image(img, image_size) for img in images]

    transforms_train = get_transforms(image_size)

    # Apply augmentation and save multiple augmented images per original image
    for i, img in enumerate(preprocessed_images):
        for n in range(1, num_augmentations + 1):
            augmented_img = augment_image(img, transforms_train)
            save_augmented_image(augmented_img, image_ids[i], output_directory, n)




In [7]:
if __name__ == "__main__":
    h5_file_path = "isic-2024-challenge/train-image.hdf5"
    output_directory = "./augmented_pos_img_ver2"
    image_ids = positive_isic_ids
    if not os.path.exists(output_directory):
      os.makedirs(output_directory)
    main(h5_file_path, image_ids, output_directory)




In [9]:
df_train_metadata.columns

Index(['isic_id', 'target', 'patient_id', 'age_approx', 'sex',
       'anatom_site_general', 'clin_size_long_diam_mm', 'image_type',
       'tbp_tile_type', 'tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext',
       'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L',
       'tbp_lv_Lext', 'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio',
       'tbp_lv_color_std_mean', 'tbp_lv_deltaA', 'tbp_lv_deltaB',
       'tbp_lv_deltaL', 'tbp_lv_deltaLB', 'tbp_lv_deltaLBnorm',
       'tbp_lv_eccentricity', 'tbp_lv_location', 'tbp_lv_location_simple',
       'tbp_lv_minorAxisMM', 'tbp_lv_nevi_confidence', 'tbp_lv_norm_border',
       'tbp_lv_norm_color', 'tbp_lv_perimeterMM',
       'tbp_lv_radial_color_std_max', 'tbp_lv_stdL', 'tbp_lv_stdLExt',
       'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle', 'tbp_lv_x', 'tbp_lv_y',
       'tbp_lv_z', 'attribution', 'copyright_license', 'lesion_id',
       'iddx_full', 'iddx_1', 'iddx_2', 'iddx_3', 'iddx_4', 'iddx_5',
       'mel_mitotic_index', '

In [16]:
df_train_metadata['iddx_1'].value_counts()

iddx_1
Benign           400552
Malignant           393
Indeterminate       114
Name: count, dtype: int64

In [15]:
mali = df_train_metadata.loc[condition,'iddx_2']
mali.value_counts()
#these three are the ones we are trying to calculate

iddx_2
Malignant adnexal epithelial proliferations - Follicular    163
Malignant melanocytic proliferations (Melanoma)             157
Malignant epidermal proliferations                           73
Name: count, dtype: int64

In [19]:
filtered_rows = df_train_metadata[df_train_metadata['iddx_2'] == 'Malignant melanocytic proliferations (Melanoma)']
target_values = filtered_rows['target']
print(len(target_values))

157


In [None]:
mali_iddx3 = df_train_metadata.loc[condition,'iddx_3']
mali.value_counts()


Unnamed: 0_level_0,count
iddx_4,Unnamed: 1_level_1
"Basal cell carcinoma, Nodular",98
"Basal cell carcinoma, Superficial",48
"Melanoma Invasive, Superficial spreading",37
"Melanoma in situ, Lentigo maligna type",12
"Melanoma in situ, associated with a nevus",12
"Melanoma in situ, Superficial spreading",10
"Melanoma Invasive, Associated with a nevus",7
"Basal cell carcinoma, Infiltrating",6
"Squamous cell carcinoma, Invasive, Keratoacanthoma-type",5
"Melanoma Invasive, On chronically sun-exposed skin or lentigo maligna melanoma",5


In [None]:
path_list = [f"/content/drive/MyDrive/Colab_Notebooks/SkinCancer_ISIC/preprocessed_image/{id}.jpg" for id in df_train_metadata.isic_id]
path_list = path_list[:10]

In [None]:
def display_images_path(path_list,num_to_display=10):
  for i in range(num_to_display):
    image_path = path_list[i]
    img = mpimg.imread(image_path)
    plt.figure()
    plt.imshow(img)
    plt.title(image_path)
    plt.show()
display_images_path(path_list)

In [None]:
embed1 = np.load("/Users/jimmyhe/Desktop/KaggleCompetitions/ISISCANCER/facebook_deit_small_patch16_224_image_embeddings.npy")
embed2 = np.load("/Users/jimmyhe/Desktop/KaggleCompetitions/ISISCANCER/google_vit_base_patch16_224_image_embeddings.npy")
embed1.shape, embed2.shape

((401059, 384), (401059, 768))