# Imports and Helper functions

In [None]:
# Import necessary libraries
import os
import json
import pickle
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import DenseNet201, nasnet
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, average_precision_score
from sklearn.preprocessing import label_binarize, LabelEncoder

# Exploratory Data Analysis

In [None]:
# Load the dataset
file_path = '/kaggle/input/fashion-product-images-dataset/fashion-dataset/styles.csv'
eda_df = pd.read_csv(file_path, on_bad_lines='skip')
eda_df = eda_df[['id', 'gender', 'masterCategory', 'subCategory', 'articleType', 'baseColour', 'season', 'year', 'usage', 'productDisplayName']]

In [None]:
# Display basic information about the dataset
print("Dataset Information:")
print(eda_df.info())

In [None]:
print(eda_df.describe(include='all'))

In [None]:
print("\nFirst few rows of the dataset:")
print(eda_df.head())

In [None]:
# Number of unique values in each column
print("\nNumber of unique values in each column:")
print(eda_df.nunique())

In [None]:
# Count the occurrences of each articleType
article_counts = eda_df['articleType'].value_counts()

# Total number of classes before preprocessing
total_classes = len(article_counts)

print(f"Total number of classes in 'articleType' before preprocessing: {total_classes}")

In [None]:
# Calculate the cumulative percentage
cumulative_percentage = 100 * article_counts.cumsum() / article_counts.sum()

# Find the number of classes that represent 95% of the data
classes_95_percent = (cumulative_percentage <= 95).sum()

print(f"Number of classes in 'articleType' representing 95% of the data: {classes_95_percent}")

# Data Visualization

In [None]:
def create_pie_and_bar_chart(data, title, filename, top_n=20, cutoff=None):
    """Draw a pie chart of the largest categories and a bar chart of the next ones.

    Args:
        data: pandas Series of counts indexed by category name, sorted in
            descending order (e.g. the result of ``value_counts()``).
        title: Accepted for call compatibility; the charts use fixed titles and
            this value is not drawn.
        filename: Prefix for the saved images; ``_pie.png`` and ``_bar.png`` are
            appended.
        top_n: Number of leading categories shown individually in the pie chart;
            everything after them is summed into an ``Others`` slice.
        cutoff: Position in ``data`` at which the bar chart stops. When ``None``
            it is the number of leading categories whose cumulative share of
            ``data`` is at most 95%.

    Side effects:
        Shows both figures, writes two PNG files, and sets the global matplotlib
        default font size to 16.
    """
    if cutoff is None:
        # Derive the 95% boundary from the data that is actually being plotted
        # instead of relying on a global computed in another cell.
        cumulative_share = 100 * data.cumsum() / data.sum()
        cutoff = int((cumulative_share <= 95).sum())

    # Pie chart
    plt.figure(figsize=(12, 12))
    top_categories = data.head(top_n)
    others = pd.Series({'Others': data.iloc[top_n:].sum()})
    pie_data = pd.concat([top_categories, others])

    # Increase font size for pie chart labels and percentages.
    # This must happen before plt.pie(): text picks up the default font size at
    # the moment it is created. Note that this is a global matplotlib setting.
    plt.rcParams['font.size'] = 16

    colors = sns.color_palette("husl", len(pie_data))
    plt.pie(pie_data.values, labels=pie_data.index, autopct='%1.1f%%', startangle=90, colors=colors)
    plt.title(f'Top {top_n} Categories', fontsize=28)
    plt.axis('equal')

    plt.savefig(f'{filename}_pie.png', dpi=300, bbox_inches='tight')
    plt.show()
    plt.close()

    # Bar chart
    plt.figure(figsize=(15, 15))
    remaining_categories = data.iloc[top_n:cutoff]
    # Ascending order puts the largest bar at the top of a horizontal bar chart.
    remaining_categories = remaining_categories.sort_values(ascending=True)

    colors = sns.color_palette("YlOrBr", len(remaining_categories))
    bars = plt.barh(range(len(remaining_categories)), remaining_categories.values, color=colors)
    plt.yticks(range(len(remaining_categories)), remaining_categories.index, fontsize=16)
    plt.xlabel('Count', fontsize=18)
    plt.title('Remaining Categories up to 95% of Data', fontsize=26)

    # Add value labels to the end of each bar
    for bar in bars:
        width = bar.get_width()
        # get_width() returns a float; format without the trailing ".0".
        plt.text(width, bar.get_y() + bar.get_height() / 2, f'{width:.0f}',
                 ha='left', va='center', fontsize=16)

    plt.tight_layout()
    plt.savefig(f'{filename}_bar.png', dpi=300, bbox_inches='tight')
    plt.show()
    plt.close()

In [None]:
# Create charts for all classes up to 95%
create_pie_and_bar_chart(article_counts, 'Distribution of Article Types (95% of Data)', 'article_types_distribution')

print(f"Charts have been saved as 'article_types_distribution_pie.png' and 'article_types_distribution_bar.png'")

# Optional: Print the classes representing 95% of the data
classes = article_counts.index[:classes_95_percent].tolist()
print("\nClasses representing 95% of the data:")
for i, class_name in enumerate(classes, 1):
    print(f"{i}. {class_name}: {article_counts[class_name]}")

# Load and Preprocess Data

In [None]:
# Disable warnings
# Sets TensorFlow's C++ log-level environment variable to "2".
# NOTE(review): tensorflow is already imported earlier in this file; presumably
# this variable is only honoured when set before that import — confirm.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

# Set the style for the plots
# Makes seaborn's "pastel" palette the default colour cycle for later figures.
sns.set_palette("pastel")

def load_and_preprocess_data(data_dir, valid_samples=100):
    """Load the styles table, keep usable rows, and split into train/val/test.

    Args:
        data_dir: Directory containing ``styles.csv`` and an ``images`` folder
            with one ``<id>.jpg`` file per product.
        valid_samples: Minimum number of images an ``articleType`` must have to
            be kept.

    Returns:
        Tuple ``(train_df, val_df, test_df, total_images)``. Each frame has the
        columns ``image`` (file path) and ``articleType``; the split is 80/10/10,
        stratified by ``articleType``. ``total_images`` is the row count after
        filtering.
    """
    images_folder = os.path.join(data_dir, "images")
    styles_path = os.path.join(data_dir, "styles.csv")

    df = pd.read_csv(styles_path, on_bad_lines="skip")

    # Build every image path once, then keep only rows whose file exists.
    # Assigning the column before filtering (and copying afterwards) avoids
    # writing into a filtered view of the original frame.
    df["image"] = df["id"].apply(lambda x: os.path.join(images_folder, str(x) + ".jpg"))
    has_image = df["image"].apply(os.path.isfile).astype(bool)
    df = df.loc[has_image, ["image", "articleType"]].copy()

    # Filter categories with at least valid_samples samples
    category_counts = df["articleType"].value_counts()
    valid_categories = category_counts[category_counts >= valid_samples].index
    df = df[df["articleType"].isin(valid_categories)]

    # Split data into train, validation, and test sets:
    # 80% train, then the remaining 20% is halved into validation and test.
    train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df["articleType"], random_state=42)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["articleType"], random_state=42)

    # Calculate total number of images
    total_images = len(df)

    # Print the number of samples in each set
    print(f"Total number of images after filtering: {total_images}")
    print(f"Number of samples in train set: {len(train_df)}")
    print(f"Number of samples in validation set: {len(val_df)}")
    print(f"Number of samples in test set: {len(test_df)}")

    return train_df, val_df, test_df, total_images

In [None]:
# Resize with Padding
def resize_with_padding(image, target_height, target_width):
    """Resize an image to fit the target size, preserving aspect ratio, then zero-pad.

    Args:
        image: Image tensor whose first two dimensions are height and width.
        target_height: Height of the returned image.
        target_width: Width of the returned image.

    Returns:
        A float32 tensor of size ``target_height`` x ``target_width`` with the
        resized image centred and the remaining border filled with zeros. An
        input with zero height or width yields an all-zero 3-channel image.
    """
    image_shape = tf.shape(image)
    height, width = image_shape[0], image_shape[1]

    # Degenerate input: nothing to resize, return a blank image.
    if tf.logical_or(tf.equal(height, 0), tf.equal(width, 0)):
        return tf.zeros([target_height, target_width, 3], dtype=tf.float32)

    height_f = tf.cast(height, tf.float32)
    width_f = tf.cast(width, tf.float32)
    # The smaller of the two ratios makes the whole image fit inside the target.
    scale = tf.minimum(
        tf.cast(target_width, tf.float32) / width_f,
        tf.cast(target_height, tf.float32) / height_f,
    )

    # Round instead of truncating so float error cannot leave the fitted side one
    # pixel short, and clamp so a very elongated image never collapses to 0 pixels
    # or exceeds the target.
    new_height = tf.clip_by_value(tf.cast(tf.round(height_f * scale), tf.int32), 1, target_height)
    new_width = tf.clip_by_value(tf.cast(tf.round(width_f * scale), tf.int32), 1, target_width)

    resized_image = tf.image.resize(image, [new_height, new_width])
    # Centre the resized image inside the target canvas.
    padded_image = tf.image.pad_to_bounding_box(
        resized_image,
        (target_height - new_height) // 2,
        (target_width - new_width) // 2,
        target_height,
        target_width,
    )
    return padded_image

In [None]:
# Preprocess Image
def preprocess_image(image_path, target_size, preprocess_input_func):
    """Read a JPEG from disk and return it resized, padded, and model-preprocessed.

    Args:
        image_path: Path of the JPEG file to read.
        target_size: ``(height, width)`` of the output image.
        preprocess_input_func: Callable applied to the padded image as the last step.

    Returns:
        The result of ``preprocess_input_func`` on the padded image.
    """
    target_height, target_width = target_size[0], target_size[1]

    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)  # always decode to 3 channels
    padded = resize_with_padding(decoded, target_height, target_width)
    return preprocess_input_func(padded)

In [None]:
# Create Dataset
def create_dataset(df, batch_size, target_size, preprocess_input_func, is_training=False, class_names=None):
    """Create a batched ``tf.data.Dataset`` of (image, one-hot label) pairs.

    Args:
        df: DataFrame with an ``image`` column (file paths) and an
            ``articleType`` column (class names).
        batch_size: Number of samples per batch.
        target_size: ``(height, width)`` passed to ``preprocess_image``.
        preprocess_input_func: Model-specific preprocessing callable.
        is_training: When true, shuffle with a buffer covering the whole frame.
        class_names: Optional ordered sequence of class names defining the
            one-hot columns. When ``None`` the columns are the classes present
            in ``df``. Rows whose class is not in ``class_names`` get an
            all-zero label.

    Returns:
        A prefetched dataset yielding ``(images, labels)`` batches.
    """
    image_paths = df["image"].values

    # Request float32 explicitly: the default dtype of get_dummies differs
    # between pandas versions.
    one_hot = pd.get_dummies(df["articleType"], dtype=np.float32)
    if class_names is not None:
        # Pin column order/width so every split encodes classes identically,
        # even if a class happens to be absent from this frame.
        one_hot = one_hot.reindex(columns=list(class_names), fill_value=0.0)
    labels = one_hot.values

    dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels))

    if is_training:
        dataset = dataset.shuffle(buffer_size=len(df))

    def _load_example(image_path, label):
        # Decode and preprocess lazily inside the input pipeline.
        return preprocess_image(image_path, target_size, preprocess_input_func), label

    dataset = dataset.map(_load_example, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

# Build the Models

In [None]:
def build_model(model_name, input_shape, num_classes, trainable_layers=20):
    """Build and compile a classifier on top of an ImageNet-pretrained backbone.

    Args:
        model_name: ``"ResNet50"`` or ``"DenseNet201"``.
        input_shape: Input image shape, e.g. ``(224, 224, 3)``.
        num_classes: Size of the softmax output layer.
        trainable_layers: Number of trailing backbone layers left trainable; all
            earlier backbone layers are frozen. ``None`` leaves the whole
            backbone trainable.

    Returns:
        A compiled ``tf.keras.Sequential`` model (Adam, learning rate 1e-4,
        categorical cross-entropy, accuracy metric).

    Raises:
        ValueError: If ``model_name`` is not supported.
    """
    backbones = {
        "ResNet50": tf.keras.applications.ResNet50,
        "DenseNet201": tf.keras.applications.DenseNet201,
    }
    if model_name not in backbones:
        raise ValueError(f"Unsupported model: {model_name}")
    base_model = backbones[model_name](weights="imagenet", include_top=False, input_shape=input_shape)

    # Make only the last `trainable_layers` layers trainable.
    # Keras layers are trainable by default, so marking the tail as trainable is
    # a no-op on its own; the earlier layers have to be frozen explicitly.
    if trainable_layers is not None:
        if trainable_layers > 0:
            frozen_layers = base_model.layers[:-trainable_layers]
        else:
            frozen_layers = base_model.layers
        for layer in frozen_layers:
            layer.trainable = False

    model = tf.keras.Sequential([
        base_model,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(2048, activation="relu"),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(num_classes, activation="softmax"),
    ])

    model.compile(loss="categorical_crossentropy",
                  optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
                  metrics=["accuracy"])
    return model

### Model Training Function

In [None]:
def train_model(model, model_name, train_dataset, val_dataset, epochs=30):
    """Fit ``model`` with early stopping, LR reduction, and best-model checkpointing.

    Args:
        model: Compiled Keras model to train.
        model_name: Used to name the checkpoint file ``best_model_<model_name>.keras``.
        train_dataset: Dataset used for training.
        val_dataset: Dataset used for validation.
        epochs: Maximum number of epochs.

    Returns:
        The history object returned by ``model.fit``.
    """
    keras_callbacks = tf.keras.callbacks

    # Stop when validation loss stalls for 6 epochs and roll back to the best weights.
    stop_early = keras_callbacks.EarlyStopping(
        monitor="val_loss", patience=6, restore_best_weights=True
    )
    # Multiply the learning rate by 0.4 after 3 stalled epochs, down to 1e-6.
    shrink_lr = keras_callbacks.ReduceLROnPlateau(
        monitor="val_loss", factor=0.4, patience=3, min_lr=1e-6
    )
    # Keep on disk only the model with the highest validation accuracy so far.
    keep_best = keras_callbacks.ModelCheckpoint(
        f"best_model_{model_name}.keras",
        save_best_only=True,
        monitor="val_accuracy",
        mode="max",
        verbose=1,
    )

    return model.fit(
        train_dataset,
        epochs=epochs,
        validation_data=val_dataset,
        verbose=1,
        callbacks=[stop_early, shrink_lr, keep_best],
    )

### Confusion Matrix Plotting Function

In [None]:
def plot_confusion_matrix(cm, classes, title='Confusion Matrix'):
    """Render a confusion matrix as an annotated heat map.

    Args:
        cm: Square array of integer counts; rows are true labels, columns are
            predicted labels.
        classes: Class names used as tick labels on both axes.
        title: Figure title.
    """
    plt.figure(figsize=(30, 25))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = 'd'
    # Cells darker than half the maximum get white text so the count stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    # Lay out before showing; calling tight_layout() after show() no longer
    # affects the figure that was displayed.
    plt.tight_layout()
    plt.show()
    plt.close()

### Macro-Average Precision-Recall Curve Plotting Function

In [None]:
def plot_macro_average_prc_curve(true_labels, predicted_probs, class_labels, model_name):
    """Plot the macro-averaged precision-recall curve and return the macro AP.

    Args:
        true_labels: 1-D array of integer class indices.
        predicted_probs: 2-D array of per-class scores, one column per class in
            the order of ``class_labels``.
        class_labels: Sequence of class names; only its length is used.
        model_name: Used in the plot title and in the output file name
            ``macro_avg_prc_<model_name>.png``.

    Returns:
        The macro-averaged average-precision score.
    """
    plt.figure(figsize=(10, 8))

    n_classes = len(class_labels)

    # Binarize the labels
    true_labels_bin = label_binarize(true_labels, classes=range(n_classes))

    # Macro-average curve: average the per-class precision at common recall levels.
    # (Raveling all classes into one binary problem would give the *micro*
    # average, which does not match the macro AP reported in the legend.)
    recall_grid = np.linspace(0.0, 1.0, 101)
    precision_sum = np.zeros_like(recall_grid)
    for i in range(n_classes):
        class_precision, class_recall, _ = precision_recall_curve(
            true_labels_bin[:, i], predicted_probs[:, i]
        )
        # precision_recall_curve returns recall in decreasing order, while
        # np.interp needs increasing x values, so reverse both arrays.
        precision_sum += np.interp(recall_grid, class_recall[::-1], class_precision[::-1])
    macro_precision = precision_sum / n_classes

    macro_average_precision = average_precision_score(true_labels_bin, predicted_probs, average="macro")

    # Plot macro-average precision-recall curve
    plt.plot(recall_grid, macro_precision,
             label=f'Macro-average PRC (AP = {macro_average_precision:.2f})',
             color='navy', linestyle='-', linewidth=2)

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Macro-Average Precision-Recall Curve for {model_name}')
    plt.legend(loc="lower left")
    plt.savefig(f'macro_avg_prc_{model_name}.png', dpi=300, bbox_inches='tight')
    plt.show()
    plt.close()

    return macro_average_precision

# Model Configuration

In [None]:
# Configuration
data_dir = "/kaggle/input/fashion-product-images-dataset/fashion-dataset"
batch_size = 128
epochs = 30
models = ["ResNet50", "DenseNet201"]

model_configs = {
    "ResNet50": {"target_size": (224, 224), "preprocess_input": tf.keras.applications.resnet50.preprocess_input},
    "DenseNet201": {"target_size": (224, 224), "preprocess_input": tf.keras.applications.densenet.preprocess_input}
}

# Load and preprocess data
train_df, val_df, test_df, total_images = load_and_preprocess_data(data_dir)

## Data Split Distribution

In [None]:
# Create a pie chart of how many images ended up in each split
plt.figure(figsize=(14, 7))  # Increased figure size
sizes = [len(train_df), len(val_df), len(test_df)]
labels = ['Train', 'Validation', 'Test']
colors = sns.color_palette("pastel")[0:3]
total_split = sum(sizes)


def _format_slice(pct):
    # Convert the percentage back to a count. Round rather than truncate:
    # floating-point error can put the product just below the true integer,
    # and int() would then report one image too few.
    count = int(round(float(pct) / 100. * total_split))
    return f'{pct:.1f}%\n({count})'


plt.pie(sizes, labels=labels, colors=colors, autopct=_format_slice,
        startangle=90, textprops={'fontsize': 16}, explode=(0.1, 0, 0))
plt.title(f'Distribution of Images in Train, Validation, and Test Sets\nTotal Images: {total_images}', fontsize=22)
plt.axis('equal')

# Add a much larger legend
plt.legend(title="Datasets", loc="center left", bbox_to_anchor=(1, 0.5), fontsize=18, title_fontsize=20,
           labelspacing=1.5, handlelength=3, handletextpad=1.5)

# Adjust layout to make room for the legend
plt.tight_layout(rect=[0, 0, 0.85, 1])  # Adjust the right margin to accommodate the legend

# Save the pie chart
plt.savefig('data_split_distribution.png', dpi=300, bbox_inches='tight')
plt.show()
plt.close()

print("Pie chart has been saved as 'data_split_distribution.png'")

In [None]:
# Save test dataset
test_df.to_csv('test_dataset.csv', index=False)

# Save model configurations
with open('model_configs.json', 'w') as f:
    json.dump({k: {**v, "preprocess_input": v["preprocess_input"].__name__} for k, v in model_configs.items()}, f)

# Save class labels
class_labels = np.unique(train_df["articleType"])
np.save('class_labels.npy', class_labels)

# Preview the preprocessing of each model on a 3x3 grid of training images
for model_name, config in model_configs.items():
    print(f"\nDisplaying resized images for {model_name}:")

    # Get a random sample of 9 images
    sample_df = train_df.sample(n=9)

    fig, axes = plt.subplots(3, 3, figsize=(12, 12))
    for ax, (_, row) in zip(axes.flat, sample_df.iterrows()):
        img = preprocess_image(row['image'], config['target_size'], config['preprocess_input'])
        img = img.numpy()

        # Denormalize the image for display: rescale to [0, 1].
        # A constant image (e.g. an all-zero placeholder) has no value range,
        # so show it as black instead of dividing by zero.
        low, high = np.min(img), np.max(img)
        img = (img - low) / (high - low) if high > low else np.zeros_like(img)

        ax.imshow(img)
        ax.set_title(row['articleType'])
        ax.axis('off')

    plt.tight_layout()
    plt.show()
    # Release the figure, as the other plotting cells in this file do.
    plt.close(fig)

# Model Training

In [None]:
# Detect and connect to a TPU if available
# On success `strategy` is a TPUStrategy; if the resolver raises ValueError the
# current default distribution strategy is used instead.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print("Running on TPU ", tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:
    print("TPU is not available, using default strategy")
    strategy = tf.distribute.get_strategy()

# Train and evaluate models
# `histories` maps model name -> the object returned by train_model(); it is
# read again by the plotting cell further down.
histories = {}
for model_name in models:
    print(f"Training {model_name} model...")

    # Each model has its own input size and preprocessing function.
    config = model_configs[model_name]
    train_dataset = create_dataset(
        train_df,
        batch_size,
        config["target_size"],
        config["preprocess_input"],
        is_training=True,
    )
    val_dataset = create_dataset(
        val_df, batch_size, config["target_size"], config["preprocess_input"]
    )

    # Build and fit the model inside the strategy scope.
    with strategy.scope():
        model = build_model(model_name, (*config["target_size"], 3), len(class_labels))
        history = train_model(model, model_name, train_dataset, val_dataset, epochs)
        histories[model_name] = history

    # Save training history
    # Only the plain `history.history` dict is pickled, not the history object.
    with open(f'history_{model_name}.pkl', 'wb') as f:
        pickle.dump(history.history, f)

# Evaluation

In [None]:
# Evaluation
# Per-model results, keyed by model name:
#   test_results            -> {'loss': ..., 'accuracy': ...}
#   predicted_labels        -> argmax class index for every test sample
#   predicted_probabilities -> raw model.predict() output
test_results = {}
predicted_labels = {}
predicted_probabilities = {}

for model_name in models:
    print(f"Evaluating {model_name} model...")

    config = model_configs[model_name]
    # is_training is left at its default (False), so the test set is not
    # shuffled and predictions stay in the row order of test_df.
    test_dataset = create_dataset(
        test_df, batch_size, config["target_size"], config["preprocess_input"]
    )

    # Load the best model
    # This is the checkpoint file written during training by train_model().
    model = tf.keras.models.load_model(f"best_model_{model_name}.keras")
    test_loss, test_accuracy = model.evaluate(test_dataset, verbose=1)
    test_results[model_name] = {'loss': test_loss, 'accuracy': test_accuracy}

    predictions = model.predict(test_dataset)
    # Highest-scoring column per row is the predicted class index.
    predicted_classes = np.argmax(predictions, axis=1)
    predicted_labels[model_name] = predicted_classes
    predicted_probabilities[model_name] = predictions

    print(f"{model_name} Test Loss: {test_loss:.4f}")
    print(f"{model_name} Test Accuracy: {test_accuracy * 100:.2f}%")

In [None]:
# True labels
# Encode each test label as its position in class_labels, the same ordering that
# is passed as target_names below. Deriving indices from the classes present in
# test_df alone would shift them if any class were missing from the test split.
true_labels = pd.Categorical(test_df["articleType"], categories=class_labels).codes.astype(int)

# Print classification report for each model
for model_name in models:
    print(f"Classification Report for {model_name}:")
    print(classification_report(true_labels, predicted_labels[model_name], target_names=class_labels))

In [None]:
# Plot confusion matrix for each model
for model_name in models:
    cm = confusion_matrix(true_labels, predicted_labels[model_name])
    plt.figure(figsize=(30, 25))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
    plt.title(f'Confusion Matrix for {model_name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig(f'confusion_matrix_{model_name}.png', dpi=300, bbox_inches='tight')
    plt.show()
    plt.close()

In [None]:
# Compare model performance
print("Model Performance Comparison:")
for model_name, result in test_results.items():
    print(f"{model_name}: Test Loss = {result['loss']:.4f}, Test Accuracy = {result['accuracy'] * 100:.2f}%")

In [None]:
# Plot Precision-Recall curve and calculate AUPRC for each model
macro_auprcs = {}
for model_name in models:
    macro_auprcs[model_name] = plot_macro_average_prc_curve(true_labels, predicted_probabilities[model_name], class_labels, model_name)

# Print macro-average AUPRC scores for all models
print("\nMacro-Average AUPRC Scores:")
for model_name, auprc_score in macro_auprcs.items():
    print(f"{model_name}: {auprc_score:.4f}")

In [None]:
# Plot training history for all models
for model_name, history in histories.items():
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='train')
    plt.plot(history.history['val_accuracy'], label='validation')
    plt.title(f'{model_name} - Accuracy')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='validation')
    plt.title(f'{model_name} - Loss')
    plt.legend()

    plt.savefig(f'training_history_{model_name}.png')
    plt.show()
    plt.close()

print("Evaluation completed for all models.")
print(f"Class labels: {class_labels}")

# Best Performing Model

In [None]:
# Identify the best performing model
best_model = max(test_results, key=lambda x: test_results[x]['accuracy'])
print(f"\nBest performing model: {best_model}")
print(f"Best model accuracy: {test_results[best_model]['accuracy'] * 100:.2f}%")

# Misclassified Pairs

In [None]:
# Additional analysis for the best model
best_model_predictions = predicted_labels[best_model]

# Top misclassified classes
misclassified = true_labels != best_model_predictions
misclassified_df = pd.DataFrame({
    'True': [class_labels[i] for i in true_labels[misclassified]],
    'Predicted': [class_labels[i] for i in best_model_predictions[misclassified]]
})
top_misclassified = misclassified_df.groupby(['True', 'Predicted']).size().sort_values(ascending=False).head(10)

print("\nTop 10 misclassified pairs:")
print(top_misclassified)

In [None]:
# Save top misclassified pairs
top_misclassified.to_csv('top_misclassified.csv')

# Class-wise accuracy
class_accuracy = classification_report(true_labels, best_model_predictions, target_names=class_labels, output_dict=True)
class_accuracy_df = pd.DataFrame(class_accuracy).transpose()
# The report also contains aggregate rows; drop them so the chart shows one bar
# per class only, as its title says.
class_accuracy_df = class_accuracy_df.drop(
    index=["accuracy", "micro avg", "macro avg", "weighted avg"], errors="ignore"
)
class_accuracy_df = class_accuracy_df.sort_values('f1-score', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(x=class_accuracy_df.index, y=class_accuracy_df['f1-score'])
plt.title('F1-score by Class for Best Model')
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig('f1_score_by_class.png')
plt.show()
plt.close()

print("\nAnalysis completed. All results and visualizations have been saved.")

# Model Comparison

In [None]:
# Dictionary to store the training histories (model name -> history dict)
histories = {}

# Load the training histories from saved pickle files.
# These files are written by the training cell of this notebook; do not point
# this loader at pickle files from an untrusted source.
for model_name in models:
    file_path = f'history_{model_name}.pkl'  # Adjust path if saved elsewhere (e.g., '/kaggle/working/history_{model_name}.pkl')
    try:
        with open(file_path, 'rb') as f:
            histories[model_name] = pickle.load(f)
        print(f"Loaded history for {model_name}")
    except FileNotFoundError:
        print(f"Error: History file for {model_name} not found at {file_path}")

# Extract the final validation accuracy (converted to percentage).
# NOTE(review): this is the last epoch's value, which is not necessarily the
# best epoch that the checkpoint callback saved.
val_accuracies = {model: histories[model]['val_accuracy'][-1] * 100 for model in histories}

# Sort models by validation accuracy
sorted_models = sorted(val_accuracies.items(), key=lambda x: x[1], reverse=True)

# Prepare data for plotting
model_names = [model for model, _ in sorted_models]
accuracies = [acc for _, acc in sorted_models]

if not accuracies:
    # Nothing was loaded: avoid drawing and saving an empty chart.
    print("No training histories were loaded; skipping the model comparison plot.")
else:
    # Create the horizontal bar plot
    plt.figure(figsize=(10, 6))
    sns.barplot(x=accuracies, y=model_names, palette="viridis")
    plt.title('Model Comparison - Validation Accuracy (%)')
    plt.xlabel('Validation Accuracy (%)')
    plt.ylabel('Model')
    plt.xlim(0, 100)  # Accuracy in percentage (0 to 100)
    for i, v in enumerate(accuracies):
        plt.text(v + 1, i, f'{v:.1f}%', va='center')  # Add percentage labels
    plt.grid(axis='x')

    # Save the plot to a file
    plt.savefig('/kaggle/working/model_comparison.png', dpi=300)  # Adjust path as needed
    plt.show()
    plt.close()