In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("phucthaiv02/butterfly-image-classification")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/phucthaiv02/butterfly-image-classification?dataset_version_number=3...


100%|██████████| 226M/226M [00:01<00:00, 148MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/phucthaiv02/butterfly-image-classification/versions/3


In [4]:
base_path = "/root/.cache/kagglehub/datasets/phucthaiv02/butterfly-image-classification/versions/3"
train_dir = os.path.join(base_path, "train")
test_dir = os.path.join(base_path, "test")

train_df = pd.read_csv(os.path.join(base_path, "Training_set.csv"))
test_df = pd.read_csv(os.path.join(base_path, "Testing_set.csv"))

print("Training set:")
print(train_df.head())

Training set:
      filename                     label
0  Image_1.jpg          SOUTHERN DOGFACE
1  Image_2.jpg                    ADONIS
2  Image_3.jpg            BROWN SIPROETA
3  Image_4.jpg                   MONARCH
4  Image_5.jpg  GREEN CELLED CATTLEHEART


In [5]:
#Preprocess data
label_encoder = LabelEncoder()
train_df['label'] = label_encoder.fit_transform(train_df['label'])

print("Unique classes:", label_encoder.classes_)

# Ensure label column is string type
train_df["label"] = train_df["label"].astype(str)

Unique classes: ['ADONIS' 'AFRICAN GIANT SWALLOWTAIL' 'AMERICAN SNOOT' 'AN 88' 'APPOLLO'
 'ATALA' 'BANDED ORANGE HELICONIAN' 'BANDED PEACOCK' 'BECKERS WHITE'
 'BLACK HAIRSTREAK' 'BLUE MORPHO' 'BLUE SPOTTED CROW' 'BROWN SIPROETA'
 'CABBAGE WHITE' 'CAIRNS BIRDWING' 'CHECQUERED SKIPPER' 'CHESTNUT'
 'CLEOPATRA' 'CLODIUS PARNASSIAN' 'CLOUDED SULPHUR' 'COMMON BANDED AWL'
 'COMMON WOOD-NYMPH' 'COPPER TAIL' 'CRECENT' 'CRIMSON PATCH'
 'DANAID EGGFLY' 'EASTERN COMA' 'EASTERN DAPPLE WHITE'
 'EASTERN PINE ELFIN' 'ELBOWED PIERROT' 'GOLD BANDED' 'GREAT EGGFLY'
 'GREAT JAY' 'GREEN CELLED CATTLEHEART' 'GREY HAIRSTREAK' 'INDRA SWALLOW'
 'IPHICLUS SISTER' 'JULIA' 'LARGE MARBLE' 'MALACHITE' 'MANGROVE SKIPPER'
 'MESTRA' 'METALMARK' 'MILBERTS TORTOISESHELL' 'MONARCH' 'MOURNING CLOAK'
 'ORANGE OAKLEAF' 'ORANGE TIP' 'ORCHARD SWALLOW' 'PAINTED LADY'
 'PAPER KITE' 'PEACOCK' 'PINE WHITE' 'PIPEVINE SWALLOW' 'POPINJAY'
 'PURPLE HAIRSTREAK' 'PURPLISH COPPER' 'QUESTION MARK' 'RED ADMIRAL'
 'RED CRACKER' 'RED POSTMA

In [6]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Convert labels to strings (important for sparse mode)
train_df["label"] = train_df["label"].astype(str)

# Create ImageDataGenerator
train_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2
)

# Image size & batch
img_size = (128, 128)
batch_size = 32

# Create training and validation generators
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=train_dir,
    x_col="filename",
    y_col="label",
    target_size=img_size,
    batch_size=batch_size,
    class_mode="sparse",
    subset="training",
    shuffle=True,
    seed=42
)

val_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=train_dir,
    x_col="filename",
    y_col="label",
    target_size=img_size,
    batch_size=batch_size,
    class_mode="sparse",
    subset="validation",
    shuffle=True,
    seed=42
)

Found 5200 validated image filenames belonging to 75 classes.
Found 1299 validated image filenames belonging to 75 classes.


In [7]:
#build CNN
import tensorflow as tf
from tensorflow.keras import layers, models

model = models.Sequential([
    layers.Input(shape=(128, 128, 3)),

    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D(2, 2),

    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D(2, 2),

    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D(2, 2),

    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),

    layers.Dense(len(train_generator.class_indices), activation='softmax')
])

model.summary()

In [8]:
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

In [9]:
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=10
)

Epoch 1/10


  self._warn_if_super_not_called()


[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 1s/step - accuracy: 0.0245 - loss: 4.2829 - val_accuracy: 0.1093 - val_loss: 3.8029
Epoch 2/10
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 1s/step - accuracy: 0.1170 - loss: 3.7142 - val_accuracy: 0.2879 - val_loss: 2.9586
Epoch 3/10
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 1s/step - accuracy: 0.2217 - loss: 3.0264 - val_accuracy: 0.3903 - val_loss: 2.4299
Epoch 4/10
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 1s/step - accuracy: 0.3294 - loss: 2.5597 - val_accuracy: 0.4750 - val_loss: 2.0793
Epoch 5/10
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 1s/step - accuracy: 0.3991 - loss: 2.1844 - val_accuracy: 0.5235 - val_loss: 1.7972
Epoch 6/10
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 1s/step - accuracy: 0.4771 - loss: 1.8599 - val_accuracy: 0.5527 - val_loss: 1.7051
Epoch 7/10
[1m163/163[0m [32m━

In [10]:
loss, accuracy = model.evaluate(val_generator)
print(f"Validation Loss: {loss:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")

[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 338ms/step - accuracy: 0.5829 - loss: 1.5188
Validation Loss: 1.5642
Validation Accuracy: 0.5797


In [11]:
df_test = pd.read_csv(os.path.join(base_path, "Testing_set.csv"))
df_test["filename"] = df_test["filename"].astype(str)

test_datagen = ImageDataGenerator(rescale=1./255)

test_generator = test_datagen.flow_from_dataframe(
    dataframe=df_test,
    directory=test_dir,
    x_col="filename",
    y_col=None,
    target_size=img_size,
    batch_size=1,
    class_mode=None,
    shuffle=False
)

Found 2786 validated image filenames.


In [12]:
import numpy as np

preds = model.predict(test_generator)
predicted_class_indices = np.argmax(preds, axis=1)

# Map class indices back to labels
labels = (train_generator.class_indices)
labels = dict((v,k) for k,v in labels.items())
predicted_labels = [labels[k] for k in predicted_class_indices]

# Save predictions
df_test["predicted_label"] = predicted_labels
df_test.to_csv("test_predictions.csv", index=False)

print("Predictions saved to test_predictions.csv")

  self._warn_if_super_not_called()


[1m2786/2786[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 17ms/step
Predictions saved to test_predictions.csv
