In [91]:
import json, random
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks


In [82]:
# Cell 1 ── Imports & Paths
import json, pandas as pd, numpy as np, tensorflow as tf
from pathlib import Path
from sklearn.model_selection import train_test_split

# Where your allergy images live:
IMG_DIRS    = [
    Path("/mnt/ssd1/saumia/data/images/imgs_part_1"),
    Path("/mnt/ssd1/saumia/data/images/imgs_part_2"),
    Path("/mnt/ssd1/saumia/data/images/imgs_part_3"),
]


In [83]:
# Cell 2 ── Build & Clean DataFrame
meta_df = pd.read_csv("/mnt/ssd1/saumia/data/text/metadata.csv")

rows = []
for d in IMG_DIRS:
    for p in d.glob("*.png"):
        parts = p.stem.split("_")
        if len(parts) < 4: 
            continue
        # e.g. PAT_1516_1765_530.png → lesion_id = 1765
        try:
            lesion_id = int(parts[2])
        except:
            continue
        m = meta_df[meta_df["lesion_id"] == lesion_id]
        if m.empty:
            continue
        m = m.iloc[0]
        # base row
        row = {
            "path": str(p),
            "label": m["diagnostic"],
            "age": m["age"],
            "diameter_1": m["diameter_1"],
            "diameter_2": m["diameter_2"],
            "gender_M": 1.0 if str(m["gender"]).upper()=="MALE" else 0.0,
            "region": m["region"],
        }
        # the six boolean cols
        for col in ["itch","bleed","elevation","changed","hurt","grew"]:
            row[col] = 1.0 if bool(m.get(col)) else 0.0
        rows.append(row)

df = pd.DataFrame(rows)
assert not df.empty, "No images matched metadata!"

# 1) Drop the all-1 nuisance features
constant_cols = ["itch","bleed","elevation","changed","hurt","grew"]
df = df.drop(columns=constant_cols, errors="ignore")

# 2) Normalize the continuous cols to [0,1]
for col in ["age","diameter_1","diameter_2"]:
    if col in df:
        mn, mx = df[col].min(), df[col].max()
        df[col] = (df[col] - mn) / (mx - mn + 1e-8)

# 3) One‐hot the region
df = pd.get_dummies(df, columns=["region"], dtype="float32")

# 4) Label‐map & feature‐cols
classes    = sorted(df["label"].unique())
label_map  = {c:i for i,c in enumerate(classes)}
feat_cols  = [c for c in df.columns if c not in {"path","label"}]

df["label_id"] = df["label"].map(label_map)

# persist maps
with open("label_map_image.json","w") as f: json.dump(label_map, f, indent=2)
with open("feature_cols_image.json","w") as f: json.dump(feat_cols, f, indent=2)

# 5) Train/Val split & cast
train_df, val_df = train_test_split(
    df, test_size=0.2, stratify=df["label"], random_state=42
)

train_df[feat_cols] = train_df[feat_cols].astype("float32")
val_df[feat_cols]   = val_df[feat_cols].astype("float32")

print(f"{len(classes)} classes  |  {len(train_df)} train  /  {len(val_df)} val samples")


6 classes  |  1825 train  /  457 val samples


In [105]:
train_df.describe()

Unnamed: 0,age,diameter_1,diameter_2,gender_M,region_ABDOMEN,region_ARM,region_BACK,region_CHEST,region_EAR,region_FACE,region_FOOT,region_FOREARM,region_HAND,region_LIP,region_NECK,region_NOSE,region_SCALP,region_THIGH,label_id
count,1182.0,1182.0,1182.0,1182.0,1182.0,1182.0,1182.0,1182.0,1182.0,1182.0,1182.0,1182.0,1182.0,1182.0,1182.0,1182.0,1182.0,1182.0,1182.0
mean,0.651986,0.116382,0.12257,0.489848,0.00846,0.083756,0.107445,0.137056,0.038071,0.233503,0.010152,0.129442,0.050761,0.01692,0.059222,0.090525,0.003384,0.031303,1.463621
std,0.16288,0.080234,0.074348,0.500109,0.091628,0.277139,0.309809,0.344052,0.191449,0.423238,0.100288,0.33583,0.219603,0.129028,0.236139,0.287053,0.058099,0.174209,1.374725
min,0.079545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.556818,0.07,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.659091,0.1,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.772727,0.15,0.142857,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0


In [106]:
# Cell 3 ── tf.data Pipelines
IMG_SIZE   = (224,224)
BATCH_SIZE = 16
AUTOTUNE   = tf.data.AUTOTUNE

def preprocess_image(path):
    img = tf.io.read_file(path)
    img = tf.image.decode_png(img, channels=3)
    img = tf.image.resize(img, IMG_SIZE)
    return tf.keras.applications.efficientnet.preprocess_input(img)

def make_dataset(df, shuffle=True):
    paths  = df["path"].values
    metas  = df[feat_cols].values
    labels = df["label_id"].values
    ds = tf.data.Dataset.from_tensor_slices((paths, metas, labels))
    if shuffle:
        ds = ds.shuffle(len(df), seed=42)
    def _load(path, meta, label):
        return {"image": preprocess_image(path), "meta": meta}, label
    ds = ds.map(_load, num_parallel_calls=AUTOTUNE)
    return ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)

train_ds = make_dataset(train_df, shuffle=True)
val_ds   = make_dataset(val_df,   shuffle=False)
# Clean and coerce feature types
for col in feat_cols:
    train_df[col] = pd.to_numeric(train_df[col], errors="coerce")
    val_df[col]   = pd.to_numeric(val_df[col], errors="coerce")

# Drop rows with missing features
train_df = train_df.dropna(subset=feat_cols + ["label_id"]).reset_index(drop=True)
val_df   = val_df.dropna(subset=feat_cols + ["label_id"]).reset_index(drop=True)
print(f"✅ Dropped rows with missing features: now {len(train_df)} train / {len(val_df)} val")

# Ensure all features are float32
train_df[feat_cols] = train_df[feat_cols].astype("float32")
val_df[feat_cols]   = val_df[feat_cols].astype("float32")

# Optional: Clip to [0, 1] for safety
train_df[feat_cols] = train_df[feat_cols].clip(0.0, 1.0)
val_df[feat_cols]   = val_df[feat_cols].clip(0.0, 1.0)


# sanity check
for (batch_x, batch_y) in train_ds.take(1):
    print("Image batch shape:", batch_x["image"].shape)
    print("Meta batch shape:",  batch_x["meta"].shape)
    print("Labels:", batch_y.numpy()[:8])


✅ Dropped rows with missing features: now 1182 train / 299 val
Image batch shape: (16, 224, 224, 3)
Meta batch shape: (16, 18)
Labels: [5 0 1 1 0 1 1 0]


In [112]:
# Cell 4 ── Build & Compile the Model (Improved Version)
from tensorflow.keras import layers, Model
import tensorflow as tf

# Load EfficientNetB0 with pretrained ImageNet weights
base_img = tf.keras.applications.EfficientNetB0(
    include_top=False, pooling="avg", weights="imagenet")
base_img.trainable = False  # Freeze to prevent overfitting on small data

# Inputs
img_in  = layers.Input(shape=IMG_SIZE + (3,), name="image")
meta_in = layers.Input(shape=(len(feat_cols),), name="meta")

# Image path
x1 = base_img(img_in)

# Metadata path
x2 = layers.BatchNormalization()(meta_in)
x2 = layers.Dense(64, activation="relu")(x2)
x2 = layers.Dropout(0.3)(x2)

# Combine
x = layers.concatenate([x1, x2])
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.4)(x)
out = layers.Dense(len(classes), activation="softmax")(x)

# Build model
model = Model([img_in, meta_in], out, name="image_meta_model")

# Compile with gradient clipping and learning rate
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=LR, clipnorm=1.0),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()


In [108]:
print("✅ Any NaNs in final train data?", train_df[feat_cols].isnull().any().any())


✅ Any NaNs in final train data? False


In [114]:
# Cell 5 ── Train (Improved Version)
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(
        "image_meta_best.keras",   # Save only the best
        save_best_only=True,
        monitor="val_accuracy",
        verbose=1
    ),
    tf.keras.callbacks.EarlyStopping(
        monitor="val_accuracy",
        patience=5,
        restore_best_weights=True,
        verbose=1
    ),
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss",
        factor=0.5,
        patience=2,
        min_lr=1e-6,
        verbose=1
    )
]

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=33,
    callbacks=callbacks
)


Epoch 1/33
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 200ms/step - accuracy: 0.7376 - loss: 0.7547
Epoch 1: val_accuracy improved from -inf to 0.63545, saving model to image_meta_best.keras
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 261ms/step - accuracy: 0.7376 - loss: 0.7546 - val_accuracy: 0.6355 - val_loss: 1.0922 - learning_rate: 6.2500e-06
Epoch 2/33
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 196ms/step - accuracy: 0.7369 - loss: 0.7521
Epoch 2: val_accuracy did not improve from 0.63545
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 252ms/step - accuracy: 0.7369 - loss: 0.7519 - val_accuracy: 0.6321 - val_loss: 1.0926 - learning_rate: 6.2500e-06
Epoch 3/33
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 196ms/step - accuracy: 0.7364 - loss: 0.7090
Epoch 3: val_accuracy did not improve from 0.63545
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 253ms/step - accuracy:

In [116]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import os


In [119]:
data_dir = "/mnt/ssd1/saumia/data/images/IMG_CLASSES"
class_names = sorted(os.listdir(data_dir))
num_classes = len(class_names)



In [120]:
datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2,
    horizontal_flip=True,
    rotation_range=20,
    zoom_range=0.1,
    width_shift_range=0.1,
    height_shift_range=0.1,
)

train_gen_2 = datagen.flow_from_directory(
    data_dir,
    target_size=(224, 224),
    batch_size=32,
    class_mode="categorical",
    subset="training",
    shuffle=True,
    seed=42
)

val_gen_2 = datagen.flow_from_directory(
    data_dir,
    target_size=(224, 224),
    batch_size=32,
    class_mode="categorical",
    subset="validation",
    shuffle=False,
    seed=42
)

Found 21726 images belonging to 10 classes.
Found 5427 images belonging to 10 classes.


In [121]:
# --- 3. Compute class weights (handle imbalance) ---
labels = train_gen_2.classes
class_weights_2 = compute_class_weight(class_weight="balanced", classes=np.unique(labels), y=labels)
class_weights_dict_2 = {i: weight for i, weight in enumerate(class_weights_2)}


In [123]:
# --- 4. Model Definition (Model 2) ---
model_2 = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),
    MaxPooling2D(2, 2),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Flatten(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dense(num_classes, activation='softmax')
])
model_2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])



In [130]:
# --- 5. Training ---
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history_2 = model_2.fit(
    train_gen_2,
    validation_data=val_gen_2,
    epochs=35,
    class_weight=class_weights_dict_2,
    callbacks=[early_stop]
)


ImportError: Could not import PIL.Image. The use of `load_img` requires PIL.

In [131]:
pip install pillow


Note: you may need to restart the kernel to use updated packages.
