In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the updated training data
data_path = "/content/drive/My Drive/bttai-ajl-2025/updated_train.csv"
df = pd.read_csv(data_path)

In [4]:
# Display basic information about the dataset
print(df.head())

                            md5hash  fitzpatrick_scale  fitzpatrick_centaur  \
0  fd06d13de341cc75ad679916c5d7e6a6                  4                    4   
1  a4bb4e5206c4e89a303f470576fc5253                  1                    1   
2  c94ce27e389f96bda998e7c3fa5c4a2e                  5                    5   
3  ebcf2b50dd943c700d4e2b586fcd4425                  3                    3   
4  c77d6c895f05fea73a8f3704307036c0                  1                    1   

                              label nine_partition_label  \
0                 prurigo-nodularis     benign-epidermal   
1  basal-cell-carcinoma-morpheiform  malignant-epidermal   
2                            keloid         inflammatory   
3              basal-cell-carcinoma  malignant-epidermal   
4                 prurigo-nodularis     benign-epidermal   

  three_partition_label            qc  ddi_scale  
0                benign           NaN         34  
1             malignant           NaN         12  
2        no

In [5]:
train_dir = "/content/drive/My Drive/bttai-ajl-2025/train/train"
test_dir = "/content/drive/My Drive/bttai-ajl-2025/test/test"

In [6]:
# Add the full image paths to the dataframe, where labels are the subfolder names
df["image_path"] = df.apply(
    lambda row: os.path.join(train_dir, row['label'], f"{row['md5hash']}.jpg"),
    axis=1
)


In [7]:
# Verify the new column 'image_path' has been added
print(df.head())

                            md5hash  fitzpatrick_scale  fitzpatrick_centaur  \
0  fd06d13de341cc75ad679916c5d7e6a6                  4                    4   
1  a4bb4e5206c4e89a303f470576fc5253                  1                    1   
2  c94ce27e389f96bda998e7c3fa5c4a2e                  5                    5   
3  ebcf2b50dd943c700d4e2b586fcd4425                  3                    3   
4  c77d6c895f05fea73a8f3704307036c0                  1                    1   

                              label nine_partition_label  \
0                 prurigo-nodularis     benign-epidermal   
1  basal-cell-carcinoma-morpheiform  malignant-epidermal   
2                            keloid         inflammatory   
3              basal-cell-carcinoma  malignant-epidermal   
4                 prurigo-nodularis     benign-epidermal   

  three_partition_label            qc  ddi_scale  \
0                benign           NaN         34   
1             malignant           NaN         12   
2       

In [8]:
# Filter out rows where the image file is missing
df = df[df["image_path"].apply(lambda x: os.path.exists(x))]


In [9]:
# Define labels and prepare train-test split
X = df["image_path"]
y = df["label"]

In [10]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Create data generators for training and validation
train_datagen = ImageDataGenerator(rescale=1./255)
val_datagen = ImageDataGenerator(rescale=1./255)

In [12]:
train_generator = train_datagen.flow_from_dataframe(
    dataframe=pd.DataFrame({"filename": X_train, "class": y_train}),
    x_col="filename",
    y_col="class",
    target_size=(299, 299),
    batch_size=32,
    class_mode="categorical"
)

val_generator = val_datagen.flow_from_dataframe(
    dataframe=pd.DataFrame({"filename": X_val, "class": y_val}),
    x_col="filename",
    y_col="class",
    target_size=(299, 299),
    batch_size=32,
    class_mode="categorical"
)

Found 2210 validated image filenames belonging to 21 classes.
Found 553 validated image filenames belonging to 21 classes.


In [13]:
import os
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing import image

# Image size (ensure this matches the input size used during training)
img_size = (299, 299)  # Adjust to the target_size used in your ImageDataGenerator

# Load test images
test_images = sorted(os.listdir(test_dir))  # Sort for consistency
test_data = []

# Load and preprocess each image in the test directory
for img_name in test_images:
    img_path = os.path.join(test_dir, img_name)
    img = image.load_img(img_path, target_size=img_size)
    img_array = image.img_to_array(img) / 255.0  # Normalize the image
    test_data.append(img_array)

test_data = np.array(test_data)

In [14]:
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.layers import Dense, Flatten, GlobalAveragePooling2D
from tensorflow.keras.models import Model

# Load Pretrained Model with the modified input shape
base_model = InceptionV3(weights='imagenet', include_top=False, input_shape=(299, 299, 3))  # Change input shape

# Freeze base layers
for layer in base_model.layers:
    layer.trainable = False

# Add Custom Layers
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(512, activation='relu')(x)
x = Dense(len(train_generator.class_indices), activation='softmax')(x)  # Output classes

model = Model(inputs=base_model.input, outputs=x)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model (no change to the image size here)
model.fit(train_generator, validation_data=val_generator, epochs=10)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m87910968/87910968[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


  self._warn_if_super_not_called()


Epoch 1/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m824s[0m 12s/step - accuracy: 0.2319 - loss: 2.6938 - val_accuracy: 0.3562 - val_loss: 2.1283
Epoch 2/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m783s[0m 11s/step - accuracy: 0.4545 - loss: 1.7494 - val_accuracy: 0.4702 - val_loss: 1.8950
Epoch 3/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m724s[0m 10s/step - accuracy: 0.5716 - loss: 1.3762 - val_accuracy: 0.4430 - val_loss: 1.9012
Epoch 4/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m777s[0m 11s/step - accuracy: 0.6324 - loss: 1.2071 - val_accuracy: 0.4340 - val_loss: 1.8826
Epoch 5/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m817s[0m 12s/step - accuracy: 0.6742 - loss: 1.0372 - val_accuracy: 0.4539 - val_loss: 1.8815
Epoch 6/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m803s[0m 12s/step - accuracy: 0.7114 - loss: 0.9358 - val_accuracy: 0.4810 - val_loss: 1.8754
Epoch 7/10
[1m70/70[0m [3

<keras.src.callbacks.history.History at 0x7d6b021c60d0>

In [15]:
predictions = model.predict(test_data)
predicted_labels = [list(train_generator.class_indices.keys())[i] for i in predictions.argmax(axis=1)]

[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 8s/step


In [16]:
val_loss, val_acc = model.evaluate(val_generator)
print(f"Validation Accuracy: {val_acc:.4f}")

[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 8s/step - accuracy: 0.4492 - loss: 2.0910
Validation Accuracy: 0.4738


In [17]:
submission_df = pd.DataFrame({
    "md5hash": [name[:-4] for name in test_images],  # Remove ".jpg"
    "label": predicted_labels
})

submission_df.to_csv("submission.csv", index=False)