In [1]:
import os
import kagglehub
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split

# Step 1: Download Dataset using kagglehub
path = kagglehub.dataset_download("paramaggarwal/fashion-product-images-small")
print("Dataset downloaded at:", path)

# Define dataset paths
image_folder = os.path.join(path, "images")
metadata_path = os.path.join(path, "styles.csv")

# Step 2: Load metadata
df = pd.read_csv(metadata_path, on_bad_lines='skip')

# Filter dataset (only use images present in folder)
df["image_path"] = df["id"].astype(str) + ".jpg"
df = df[df["image_path"].isin(os.listdir(image_folder))]

# Select a subset of classes for simplicity
df = df[df['masterCategory'].isin(['Apparel', 'Footwear', 'Accessories'])]

# Encode labels
label_map = {label: idx for idx, label in enumerate(df["masterCategory"].unique())}
df["category_id"] = df["masterCategory"].map(label_map)

# Split dataset
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["category_id"], random_state=42)

Downloading from https://www.kaggle.com/api/v1/datasets/download/paramaggarwal/fashion-product-images-small?dataset_version_number=1...


100%|██████████| 565M/565M [00:18<00:00, 31.8MB/s] 

Extracting files...





Dataset downloaded at: /Users/wuwenfei/.cache/kagglehub/datasets/paramaggarwal/fashion-product-images-small/versions/1


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


### Image Preprocessing

In [2]:
# Step 3: Image Preprocessing
image_size = (128, 128)
batch_size = 32

datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)

train_generator = datagen.flow_from_dataframe(
    train_df, directory=image_folder, x_col="image_path", y_col="masterCategory",
    target_size=image_size, batch_size=batch_size, class_mode="categorical", subset="training"
)

val_generator = datagen.flow_from_dataframe(
    train_df, directory=image_folder, x_col="image_path", y_col="masterCategory",
    target_size=image_size, batch_size=batch_size, class_mode="categorical", subset="validation"
)

Found 26807 validated image filenames belonging to 3 classes.
Found 6701 validated image filenames belonging to 3 classes.


### Train Model

In [3]:
# Step 4: Model Definition (EfficientNet)
base_model = EfficientNetB0(weights="imagenet", include_top=False, input_shape=(128, 128, 3))
base_model.trainable = False  # Freeze base model

model = models.Sequential([
    base_model,
    layers.GlobalAveragePooling2D(),
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.5),
    layers.Dense(len(label_map), activation="softmax")
])

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Train Model
history = model.fit(train_generator, validation_data=val_generator, epochs=10)

Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step
Epoch 1/10


  self._warn_if_super_not_called()


[1m838/838[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 198ms/step - accuracy: 0.4996 - loss: 1.0519 - val_accuracy: 0.5193 - val_loss: 1.0255
Epoch 2/10
[1m838/838[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 199ms/step - accuracy: 0.5100 - loss: 1.0332 - val_accuracy: 0.5193 - val_loss: 1.0226
Epoch 3/10
[1m838/838[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 176ms/step - accuracy: 0.5120 - loss: 1.0294 - val_accuracy: 0.5193 - val_loss: 1.0239
Epoch 4/10
[1m838/838[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 183ms/step - accuracy: 0.5081 - loss: 1.0315 - val_accuracy: 0.5193 - val_loss: 1.0236
Epoch 5/10
[1m838/838[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 184ms/step - accuracy: 0.5049 - loss: 1.0341 - val_accuracy: 0.5193 - val_loss: 1.0235
Epoch 6/10
[1m838/838[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 184ms/step - accuracy: 0.5099 - loss: 1.0310 - val_accuracy: 0.5193 - val_loss: 1.0233
Epoch 7/10
[1m

## Evaluate Model

In [4]:
# Step 5: Evaluate Model
test_datagen = ImageDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_dataframe(
    test_df, directory=image_folder, x_col="image_path", y_col="masterCategory",
    target_size=image_size, batch_size=batch_size, class_mode="categorical", shuffle=False
)

loss, accuracy = model.evaluate(test_generator)
print(f"Test Accuracy: {accuracy:.2f}")

# Save Model
model.save("fashion_classifier.h5")

Found 8377 validated image filenames belonging to 3 classes.


  self._warn_if_super_not_called()


[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 150ms/step - accuracy: 0.5104 - loss: 1.0284




Test Accuracy: 0.51
