In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# --- Configuration: input locations (relative to the notebook's directory) ---
TRAIN_CSV = "../data/Training_set.csv"
TEST_CSV  = "../data/Testing_set.csv"
TRAIN_DIR = "../data/train"
TEST_DIR  = "../data/test"

# Seed the Python, NumPy and TensorFlow random generators in one call so that
# weight initialisation, batch shuffling and image augmentation repeat across
# "Restart Kernel -> Run All". Same value as the random_state used for the
# train/validation split.
# NOTE(review): some GPU kernels can remain non-deterministic even when seeded.
SEED = 123
tf.keras.utils.set_random_seed(SEED)

# --- Load the label tables ---
# train_df carries a filename and a label per image; test_df only filenames.
train_df = pd.read_csv(TRAIN_CSV)
test_df  = pd.read_csv(TEST_CSV)

# Quick sanity check: first rows of each table and the number of distinct labels.
print(train_df.head())
print(test_df.head())
print(train_df['label'].nunique(), "classes")

      filename                     label
0  Image_1.jpg          SOUTHERN DOGFACE
1  Image_2.jpg                    ADONIS
2  Image_3.jpg            BROWN SIPROETA
3  Image_4.jpg                   MONARCH
4  Image_5.jpg  GREEN CELLED CATTLEHEART
      filename
0  Image_1.jpg
1  Image_2.jpg
2  Image_3.jpg
3  Image_4.jpg
4  Image_5.jpg
75 classes


In [4]:
IMAGE_SIZE = (128, 128)   # (height, width) every image is resized to
BATCH_SIZE = 32

# Normalise column names to the ones the generators below are told to read:
# "filename" for the image path and "class" for the label. Keys that are not
# present in a frame are ignored by rename, so re-running this cell is safe.
train_df = train_df.rename(columns={"Image": "filename", "label": "class"})
test_df  = test_df.rename(columns={"Image": "filename"})

# One fixed, sorted label vocabulary taken from the FULL training table.
# Passing it explicitly to both generators guarantees that the training and
# validation iterators use the same class -> index mapping, even if one split
# happened to be missing a class (each generator would otherwise infer its own
# mapping from just the rows it is given).
class_names = sorted(train_df["class"].unique())

# Stratified 80/20 train/validation split so every class keeps its proportion.
train_df_split, val_df_split = train_test_split(
    train_df,
    test_size=0.2,
    stratify=train_df['class'],
    random_state=123
)

# Training images are rescaled to [0, 1] and lightly augmented.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=15,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True
)

# Validation images are only rescaled - no augmentation.
val_datagen = ImageDataGenerator(rescale=1./255)

train_gen = train_datagen.flow_from_dataframe(
    train_df_split,
    directory=TRAIN_DIR,
    x_col="filename",
    y_col="class",
    target_size=IMAGE_SIZE,
    classes=class_names,
    class_mode="categorical",
    batch_size=BATCH_SIZE,
    shuffle=True
)

val_gen = val_datagen.flow_from_dataframe(
    val_df_split,
    directory=TRAIN_DIR,
    x_col="filename",
    y_col="class",
    target_size=IMAGE_SIZE,
    classes=class_names,
    class_mode="categorical",
    batch_size=BATCH_SIZE,
    shuffle=False
)

# Size of the softmax layer: one unit per entry of the shared vocabulary.
num_classes = len(class_names)
print("Number of classes:", num_classes)

# Test generator – IMPORTANT: shuffle=False to preserve CSV order
test_datagen = ImageDataGenerator(rescale=1./255)

# No labels are available for the test set, hence y_col=None / class_mode=None.
test_gen = test_datagen.flow_from_dataframe(
    test_df,
    directory=TEST_DIR,
    x_col="filename",
    y_col=None,
    target_size=IMAGE_SIZE,
    class_mode=None,
    batch_size=BATCH_SIZE,
    shuffle=False
)

Found 5199 validated image filenames belonging to 75 classes.
Found 1300 validated image filenames belonging to 75 classes.
Number of classes: 75
Found 2786 validated image filenames.


In [5]:
# Small CNN classifier: three Conv2D -> MaxPooling2D stages with 32, 64 and 128
# filters, followed by a 256-unit dense layer with dropout and a softmax head
# that has one unit per class.
model = models.Sequential(
    # RGB input at the resolution the generators resize to.
    [layers.Input(shape=(*IMAGE_SIZE, 3))]
    # Feature extractor: the filter count doubles at every stage.
    + [
        stage_layer
        for filters in (32, 64, 128)
        for stage_layer in (
            layers.Conv2D(filters, (3, 3), activation="relu"),
            layers.MaxPooling2D((2, 2)),
        )
    ]
    # Classifier head.
    + [
        layers.Flatten(),
        layers.Dense(256, activation="relu"),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation="softmax"),
    ]
)

# Labels arrive one-hot encoded (class_mode="categorical"), hence
# categorical cross-entropy.
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

model.summary()

In [6]:
# Number of full passes over the training generator.
EPOCHS = 10

# Fit on the augmented training batches and evaluate on the validation batches
# after each epoch; the returned History object holds the per-epoch metrics.
history = model.fit(x=train_gen, epochs=EPOCHS, validation_data=val_gen)

Epoch 1/10


  self._warn_if_super_not_called()


[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 201ms/step - accuracy: 0.0350 - loss: 4.2356 - val_accuracy: 0.1777 - val_loss: 3.4018
Epoch 2/10
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 186ms/step - accuracy: 0.1374 - loss: 3.4284 - val_accuracy: 0.3100 - val_loss: 2.6320
Epoch 3/10
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 191ms/step - accuracy: 0.2620 - loss: 2.7725 - val_accuracy: 0.4277 - val_loss: 2.1542
Epoch 4/10
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 201ms/step - accuracy: 0.3345 - loss: 2.4192 - val_accuracy: 0.4854 - val_loss: 1.8593
Epoch 5/10
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 210ms/step - accuracy: 0.4033 - loss: 2.1618 - val_accuracy: 0.4908 - val_loss: 1.9067
Epoch 6/10
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 224ms/step - accuracy: 0.4440 - loss: 1.9620 - val_accuracy: 0.5254 - val_loss: 1.7958
Epoch 7/10
[1m163/16

In [10]:
# Predict class probabilities: one row per test image, one column per class.
pred_probs = model.predict(test_gen)

# Convert to class indices (position of the highest probability in each row).
pred_indices = np.argmax(pred_probs, axis=1)

# Map indices back to labels (strings) by inverting the training generator's
# label -> index dictionary.
index_to_class = {v: k for k, v in train_gen.class_indices.items()}
pred_labels = [index_to_class[i] for i in pred_indices]

# Predictions are matched to test_df purely by position (the test generator is
# built with shuffle=False). If the generator yielded a different number of
# images than test_df has rows, that alignment is broken, so fail with a
# readable message instead of an opaque pandas length error.
if len(pred_labels) != len(test_df):
    raise ValueError(
        f"Got {len(pred_labels)} predictions for {len(test_df)} rows in test_df; "
        "check for missing or unreadable files in the test directory."
    )

# Attach to test_df in the same order. assign() returns a new frame, so the
# cell can be re-run without mutating the original object in place.
test_df = test_df.assign(predicted_label=pred_labels)

# Save for the competition / your vignette. DataFrame.to_csv does not create
# missing folders, so make sure the results directory exists first.
results_path = Path("../results/butterfly_predictions.csv")
results_path.parent.mkdir(parents=True, exist_ok=True)
test_df.to_csv(results_path, index=False)
print(test_df.head())


[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 46ms/step
      filename         predicted_label
0  Image_1.jpg              PINE WHITE
1  Image_2.jpg           CRIMSON PATCH
2  Image_3.jpg      RED SPOTTED PURPLE
3  Image_4.jpg         IPHICLUS SISTER
4  Image_5.jpg  MILBERTS TORTOISESHELL
