In [None]:
# Notebook 1: preprocessing_and_splits.ipynb
# FIXED: correct dataset paths + remove duplicate images in part1/part2

import os, sys, math
import shutil
import zipfile

import numpy as np, pandas as pd
import cv2
import tensorflow as tf
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Report the TensorFlow version and the GPU devices it can see.
print("TF:", tf.__version__)
gpus = tf.config.list_physical_devices('GPU')
print("GPU:", gpus)

# Enable memory growth on every visible GPU.
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)



TF: 2.19.0
GPU: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
# ======================================================
# 1) Correct dataset paths
# ======================================================
# Archive on Google Drive and the folder whose presence marks "already extracted".
ZIP_PATH = "/content/drive/MyDrive/HAMDATASET.zip"
ROOT = "/content/HAMDATASET"
# Destination folder for the de-duplicated images.
IMAGE_FOLDER = "/content/drive/MyDrive/HAM10000_images"
# NOTE(review): the metadata-loading cell reads the CSV from Drive rather than
# from META_CSV — confirm which location is intended.
META_CSV = "/content/HAM10000_metadata.csv"

# Extract ZIP (if not already extracted). The archive is unpacked into
# /content/ using ZIP_PATH, so the constant above is the single source of truth.
if not os.path.exists(ROOT):
    if os.path.isfile(ZIP_PATH):
        with zipfile.ZipFile(ZIP_PATH) as archive:
            archive.extractall("/content/")
    else:
        # Keep going (the images may already be in place), but say clearly
        # why nothing was extracted instead of emitting a raw shell error.
        print(f"WARNING: archive not found, nothing extracted: {ZIP_PATH}")

# Create folder for unique images
os.makedirs(IMAGE_FOLDER, exist_ok=True)




unzip:  cannot find or open /content/drive/MyDrive/HAMDATASET.zip, /content/drive/MyDrive/HAMDATASET.zip.zip or /content/drive/MyDrive/HAMDATASET.zip.ZIP.


In [None]:
# ======================================================
# 2) Merge part1 + part2 WITHOUT duplicates
# ======================================================
# Copies every ".jpg" from the two part folders into IMAGE_FOLDER, keeping
# only the first file seen for each filename.
part1 = "/content/HAM10000_images_part_1"
part2 = "/content/HAM10000_images_part_2"

all_files = set()          # filenames that were copied successfully
duplicate_count = 0        # filenames seen again after a successful copy
moved = 0                  # files actually copied
failed = 0                 # copies that raised an OS-level error

for folder in [part1, part2]:
    # A missing part folder (e.g. extraction did not happen) is reported and
    # skipped instead of aborting the whole cell inside os.listdir.
    if not os.path.isdir(folder):
        print(f"WARNING: source folder not found, skipping: {folder}")
        continue

    for fname in os.listdir(folder):
        if not fname.endswith(".jpg"):
            continue

        if fname in all_files:
            duplicate_count += 1
            continue   # skip duplicates

        src = os.path.join(folder, fname)
        dst = os.path.join(IMAGE_FOLDER, fname)

        # shutil.copy avoids spawning a shell per file, is safe for filenames
        # containing quotes, and reports failures as exceptions so the
        # counters below reflect what really happened.
        try:
            shutil.copy(src, dst)
        except OSError as err:
            failed += 1
            print(f"WARNING: could not copy {src}: {err}")
            continue

        # Only mark the name as done after a successful copy, so a same-named
        # file in the next folder can still be used if this copy failed.
        all_files.add(fname)
        moved += 1

print(f"Unique images moved: {moved}")
print(f"Duplicates skipped: {duplicate_count}")
if failed:
    print(f"Copies failed: {failed}")



In [None]:
# Re-declares the image folder with the same value as the path-configuration
# cell; presumably so the cells below can run without re-executing the
# extraction/merge cells — TODO confirm and drop this cell if not needed.
IMAGE_FOLDER = "/content/drive/MyDrive/HAM10000_images"

In [None]:
# ======================================================
# 3) Load metadata and attach correct filepath
# ======================================================
df = pd.read_csv("/content/drive/MyDrive/HAM10000_metadata.csv")
print("Metadata loaded:", df.shape)

# Build each image's file name from its id, then its expected path inside IMAGE_FOLDER.
df = df.assign(filename=df['image_id'] + ".jpg")
df = df.assign(filepath=[os.path.join(IMAGE_FOLDER, name) for name in df['filename']])

# Keep only the metadata rows whose image file is present on disk,
# and renumber the index from zero.
has_image = df['filepath'].apply(os.path.exists)
df = df[has_image].reset_index(drop=True)
print("After matching images:", df.shape)



Metadata loaded: (10015, 7)
After matching images: (8896, 9)


In [None]:
# ======================================================
# 4) Label encoding (7 classes)
# ======================================================
# Normalise the diagnosis column to strings before encoding.
df['dx'] = df['dx'].astype(str)

# Fit the encoder on the diagnosis strings, then store each row's integer code.
le = LabelEncoder().fit(df['dx'])
df['label'] = le.transform(df['dx'])

# Show which integer code was assigned to each diagnosis string.
label_map = {name: code for name, code in zip(le.classes_, le.transform(le.classes_))}
print("Label map:", label_map)



Label map: {'akiec': np.int64(0), 'bcc': np.int64(1), 'bkl': np.int64(2), 'df': np.int64(3), 'mel': np.int64(4), 'nv': np.int64(5), 'vasc': np.int64(6)}


In [None]:
# ======================================================
# 5) Lesion-level split (NO leakage)
# ======================================================
# Group rows by lesion when that column is available, otherwise by image.
group_col = 'lesion_id' if 'lesion_id' in df.columns else 'image_id'


def _grouped_holdout(frame, holdout_fraction):
    """Split ``frame`` once with GroupShuffleSplit, grouping rows by ``group_col``.

    ``holdout_fraction`` is passed as ``test_size``; the seed is fixed at 42.
    Returns ``(kept, held_out)``, each with its index reset to start at zero.
    """
    splitter = GroupShuffleSplit(n_splits=1, test_size=holdout_fraction, random_state=42)
    kept_pos, held_pos = next(splitter.split(frame, groups=frame[group_col]))
    kept = frame.iloc[kept_pos].reset_index(drop=True)
    held_out = frame.iloc[held_pos].reset_index(drop=True)
    return kept, held_out


# First carve off the test set, then split the remainder into train / val.
df_trainval, df_test = _grouped_holdout(df, 0.20)
df_train, df_val = _grouped_holdout(df_trainval, 0.12)

print("Train size:", len(df_train))
print("Val size:", len(df_val))
print("Test size:", len(df_test))



Train size: 6255
Val size: 861
Test size: 1780


In [None]:
# ======================================================
# 6) Oversampling TRAIN ONLY (balanced)
# ======================================================
# Every class is padded up to the size of the largest class in df_train.
target = df_train['label'].value_counts().max()

# For each label: keep the group as-is if it already has `target` rows,
# otherwise append rows re-drawn from the group with replacement to fill the gap.
rows = [
    grp
    if len(grp) >= target
    else pd.concat(
        [grp, grp.sample(target - len(grp), replace=True, random_state=42)],
        axis=0,
    )
    for _, grp in df_train.groupby('label')
]

# Stack the per-class frames, shuffle the rows, and renumber the index.
df_train_bal = pd.concat(rows).sample(frac=1, random_state=42).reset_index(drop=True)
print("Balanced train counts:\n", df_train_bal['dx'].value_counts())



Balanced train counts:
 dx
bcc      4213
bkl      4213
df       4213
vasc     4213
nv       4213
mel      4213
akiec    4213
Name: count, dtype: int64


In [None]:
# ======================================================
# 7) Save splits for other notebooks
# ======================================================
os.makedirs("/content/drive/MyDrive/splits", exist_ok=True)

# Destination CSV -> frame to write. Note that df_train.csv receives the
# oversampled (balanced) training frame, not the raw training split.
split_outputs = (
    ("/content/drive/MyDrive/splits/df_train.csv", df_train_bal),
    ("/content/drive/MyDrive/splits/df_val.csv", df_val),
    ("/content/drive/MyDrive/splits/df_test.csv", df_test),
)
for csv_path, split_frame in split_outputs:
    split_frame.to_csv(csv_path, index=False)

print("Saved splits to /content/drive/MyDrive/splits/")

Saved splits to /content/drive/MyDrive/splits/
