# **Classify Images based on *master category* label from *styles* dataframe**

# **Load Libraries**

In [None]:
pip install keras

In [None]:
!pip install tensorflow

In [None]:
pip install pillow # https://pypi.org/project/pillow/

In [None]:
!pip install opencv-python

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.utils import image_dataset_from_directory
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization, GlobalAveragePooling2D
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import cv2  # OpenCV
from PIL import Image  # Pillow
from skimage.feature import hog, local_binary_pattern
import random
import os
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

# **EDA on Styles Data**

In [None]:
# Load labels of the images from styles.csv into a dataframe
df = pd.read_csv('data/styles.csv')

In [None]:
# 'id' in this dataframe represents the name of the image file (ex.49543.jpg; The 49543 is the id of the image in the dataframe)
df.sample(5)

In [None]:
df.info()

In [None]:
# Percentage of missing values in each column, largest share first
df_missing = (df.isnull().sum() / df.shape[0] * 100).sort_values(ascending=False)
df_missing

In [None]:
#Remove missing values even though the missing values percentage is very low and does not affect the classification
df = df.dropna(axis=0)

In [None]:
df.shape

In [None]:
# DataFrame of Master Category Data Distribution
df_master_category = df.groupby(['masterCategory'])['masterCategory'].count().reset_index(name='count')
df_master_category

In [None]:
# Bar chart of the row count per master category (reads df_master_category)
plt.figure(figsize=(15,6))
barplot = sns.barplot(data=df_master_category, x='masterCategory', y='count', hue='masterCategory')
plt.title('Master Category Data Distribution')
plt.xticks(rotation=90)

# Write each bar's integer count 5 points above its top, centred on the bar
for patch in barplot.patches:
    bar_top = patch.get_height()
    bar_centre = patch.get_x() + patch.get_width() / 2
    barplot.annotate(
        f'{int(bar_top)}',
        xy=(bar_centre, bar_top),
        xytext=(0, 5),
        textcoords="offset points",
        ha='center',
        va='bottom',
    )

plt.show()

In [None]:
# DataFrame of Sub Category Data Distribution
df_sub_category = df.groupby(['subCategory'])['subCategory'].count().reset_index(name='count')
df_sub_category

In [None]:
# Bar chart of the row count per sub category (reads df_sub_category)
plt.figure(figsize=(20,6))
barplot = sns.barplot(data=df_sub_category, x='subCategory', y='count', hue='subCategory')
plt.title('Sub Category Data Distribution')
plt.xticks(rotation=90)

# Write each bar's integer count 5 points above its top, centred on the bar
for patch in barplot.patches:
    bar_top = patch.get_height()
    bar_centre = patch.get_x() + patch.get_width() / 2
    barplot.annotate(
        f'{int(bar_top)}',
        xy=(bar_centre, bar_top),
        xytext=(0, 5),
        textcoords="offset points",
        ha='center',
        va='bottom',
    )

plt.show()

In [None]:
# DataFrame of Article Type Data Distribution
df_articleType = df.groupby(['articleType'])['articleType'].count().reset_index(name='count')
df_articleType

In [None]:
# Bar chart of the row count per article type (reads df_articleType)
plt.figure(figsize=(40,6))
barplot = sns.barplot(data=df_articleType, x='articleType', y='count', hue='articleType')
plt.title('Article Type Data Distribution')
plt.xticks(rotation=90)

# Write each bar's integer count 5 points above its top, centred on the bar
for patch in barplot.patches:
    bar_top = patch.get_height()
    bar_centre = patch.get_x() + patch.get_width() / 2
    barplot.annotate(
        f'{int(bar_top)}',
        xy=(bar_centre, bar_top),
        xytext=(0, 5),
        textcoords="offset points",
        ha='center',
        va='bottom',
    )

plt.show()

In [None]:
# Dataframe: colour Distribution by Gender
df_gender_colour = df.groupby(['gender','baseColour'])['gender'].count().reset_index(name='count')
df_gender_colour

In [None]:
# Line plot of log10(count) per base colour, one line per gender
# Rows are ordered by descending count before plotting
df_gender_colour_sorted = df_gender_colour.sort_values(by='count', ascending=False)
df_gender_colour_sorted['log_count'] = np.log10(df_gender_colour_sorted['count'])

plt.figure(figsize=(25, 6))
sns.lineplot(data=df_gender_colour_sorted, x='baseColour', y='log_count', hue='gender')
plt.title('Colour Distribution by Gender (Sorted by Preference)')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Images colour distribution by Master Category
df_master_category = df.groupby(['masterCategory','baseColour'])['masterCategory'].count().reset_index(name='count')
df_master_category

In [None]:
# Grouped bar chart of log10(count) per base colour, one bar group per master category
# Rows are ordered by descending count before plotting
df_master_category_sorted = df_master_category.sort_values(by='count', ascending=False)
df_master_category_sorted['log_count'] = np.log10(df_master_category_sorted['count'])

plt.figure(figsize=(25, 6))
sns.barplot(data=df_master_category_sorted, x='baseColour', y='log_count', hue='masterCategory')
plt.title('Colour Distribution by Master Category (Sorted by Preference)')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Images colour distribution by usage
df_usage = df.groupby(['usage','baseColour'])['usage'].count().reset_index(name='count')
df_usage

In [None]:
# Heatmap of log10(count) for every usage / base-colour combination

# Log-scale the counts, then spread them into a usage x baseColour grid
df_usage['log_count'] = np.log10(df_usage['count'])
pivot_table = df_usage.pivot(index='usage', columns='baseColour', values='log_count')

plt.figure(figsize=(35, 6))
# Cell annotations are the log10 values formatted without decimals
sns.heatmap(data=pivot_table, annot=True, fmt=".0f", cmap="YlGnBu")
plt.title('Colour Distribution Heatmap by Usage')
plt.xticks(rotation=90)
plt.show()

# **EDA on Images Data**

In [None]:
# Display sample pictures
import matplotlib.pyplot as plt
import os
from keras.preprocessing.image import load_img, img_to_array

# Show the first few files of a folder side by side
def display_sample_images(image_folder, num_samples=5):
    """Display up to ``num_samples`` images from ``image_folder`` in one row.

    The files are taken from the start of ``os.listdir(image_folder)``, so
    the selection follows whatever order the directory listing returns.

    Args:
        image_folder: Directory containing the image files.
        num_samples: Number of grid columns and maximum number of images shown.
    """
    plt.figure(figsize=(10, 5))
    chosen_files = os.listdir(image_folder)[:num_samples]
    for position, name in enumerate(chosen_files, start=1):
        picture = load_img(os.path.join(image_folder, name))
        plt.subplot(1, num_samples, position)
        plt.imshow(picture)
        plt.axis('off')
    plt.show()

# Display sample images
display_sample_images('images', num_samples=10)


In [None]:
# Plot colour distribution of a sample image
def plot_color_distribution(image_folder, sample_image_filename):
    """Show one image followed by a histogram of its three colour channels.

    Args:
        image_folder: Directory that contains the image.
        sample_image_filename: File name of the image inside ``image_folder``.
    """
    pixels = img_to_array(load_img(os.path.join(image_folder, sample_image_filename)))

    # First figure: the picture itself, divided by 255 for display
    plt.figure(figsize=(5, 5))
    plt.imshow(pixels/255.)
    plt.axis('off')
    plt.title(f"Sample Image: {sample_image_filename}")
    plt.show()

    # Second figure: overlaid 20-bin histograms of channels 0, 1, 2 drawn in r, g, b
    plt.figure(figsize=(8, 6))
    for channel, colour in enumerate('rgb'):
        channel_values = pixels[:, :, channel].ravel()
        plt.hist(channel_values, bins=20, color=colour, alpha=0.5, label=colour.upper())
    plt.title('Color Distribution')
    plt.xlabel('Pixel Value')
    plt.ylabel('Frequency')
    plt.legend()
    plt.show()

# Plot color distribution for a sample image
image_files = os.listdir('images')
sample_image_filename = image_files[3]  # Take the fourth entry of the directory listing (index 3) as the sample
plot_color_distribution('images', sample_image_filename)  # Pass the filename to the function

# **Pre-processing of data: Images and Labels**

In [None]:
import numpy as np
import pandas as pd
import os
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import image_dataset_from_directory
# Load images from the images folder
def load_images_from_folder(folder, img_size=(224, 224), df=None):  # Pass the DataFrame to the function
    """Load every labelled image under ``folder`` together with its label.

    The image id is the part of the file name before the first dot
    (e.g. ``49543.jpg`` -> 49543) and is matched against ``df['id']`` to
    obtain the ``masterCategory`` label. Files whose name is not numeric,
    whose id has no row in ``df``, or that fail to load are reported with a
    warning and skipped.

    Args:
        folder: Root directory that is walked recursively for image files.
        img_size: Target size passed to ``load_img`` for every image.
        df: DataFrame with ``id`` and ``masterCategory`` columns. Required.

    Returns:
        Tuple ``(images, labels)`` of NumPy arrays in matching order.

    Raises:
        ValueError: If ``df`` is not provided.
    """
    if df is None:
        raise ValueError("df with 'id' and 'masterCategory' columns is required to label the images")

    # Build the id -> label lookup once instead of scanning the DataFrame for
    # every file. Keeping the first row per id matches a first-match scan.
    label_by_id = df.drop_duplicates(subset='id').set_index('id')['masterCategory'].to_dict()

    images = []
    labels = []

    for subdir, _dirs, files in os.walk(folder):
        for file in files:
            try:
                image_id = int(file.split('.')[0])
            except ValueError:
                print(f"Warning: Unable to process file {file}. Is the file in the correct format? Skipping this image.")
                continue

            if image_id not in label_by_id:
                print(f"Warning: No label found for image ID {image_id} in file {file}. Skipping this image.")
                continue

            img_path = os.path.join(subdir, file)
            try:
                img = load_img(img_path, target_size=img_size)
                img_array = img_to_array(img)
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole load
                print(f"Warning: Error loading or processing {file}. Error: {e}. Skipping this image.")
                continue

            images.append(img_array)
            labels.append(label_by_id[image_id])
    return np.array(images), np.array(labels)

# Load the dataframe (make sure this matches where the actual file is located)
# NOTE: this re-reads the CSV and rebinds `df`, so the rows removed with dropna() in the EDA section are present again
df = pd.read_csv('data/styles.csv')

# Load images, passing the DataFrame to get labels
folder = "images"
X, y = load_images_from_folder(folder, df=df)  # Pass the DataFrame here

# Perform train-test split using the new 'y'
# 80/20 split with a fixed seed; stratify=y keeps the class proportions the same in both parts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

# Normalize image data
# Every pixel value is divided by 255 (presumably 8-bit input, giving values in [0, 1] -- confirm)
X_train = X_train / 255.0
X_test = X_test / 255.0

print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

In [None]:
# 4. Show the first training images in one row, each captioned with its label
plt.figure(figsize=(10, 10))
# Never index past the end of a small training set
sample_count = min(10, len(X_train))
for i in range(sample_count):
    plt.subplot(1, sample_count, i + 1)
    # An empty tick list hides the axis ticks; calling xticks()/yticks() without arguments only reads them
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    # No cmap: matplotlib ignores colormaps for RGB image data
    plt.imshow(X_train[i])
    plt.xlabel(y_train[i])
plt.show()

# **Modeling**

In [None]:
# Evaluate_model function:
def evaluate_model(model, X_test, y_test, model_name):
    """Print accuracy, classification report and confusion matrix for a model.

    Args:
        model: Fitted estimator whose ``predict`` returns class labels.
        X_test: Test features passed to ``model.predict``.
        y_test: True labels for ``X_test``.
        model_name: Name used as a prefix in the printed output.
    """
    predictions = model.predict(X_test)

    print(f"{model_name} Accuracy: {accuracy_score(y_test, predictions)}")
    print(classification_report(y_test, predictions))

    # Label rows and columns with every class seen in the true or predicted labels
    observed_classes = np.unique(np.concatenate((y_test, predictions)))
    matrix_frame = pd.DataFrame(
        confusion_matrix(y_test, predictions),
        index=observed_classes,
        columns=observed_classes,
    )
    print(f"{model_name} Confusion Matrix:")
    print(matrix_frame)

In [None]:
def evaluate_model1(model, X_test, y_test, model_name, label_encoder=None):
    """Print accuracy, classification report and confusion matrix for a
    model whose ``predict`` returns per-class probabilities.

    The predicted class is the arg-max of each probability row. For a
    ``keras.Sequential`` model the class indices are mapped back to the
    original labels with a label encoder.

    Args:
        model: Fitted model; ``predict`` must return one probability row per sample.
        X_test: Test inputs passed to ``model.predict``.
        y_test: True labels, in the original (un-encoded) form.
        model_name: Name used as a prefix in the printed output.
        label_encoder: Optional fitted encoder providing ``inverse_transform``.
            When omitted, a module-level ``label_encoder`` is used if one
            exists, which keeps existing four-argument calls working.
    """
    y_pred_probs = model.predict(X_test)
    y_pred = np.argmax(y_pred_probs, axis=1)  # Index of the highest probability per row

    if isinstance(model, keras.Sequential):
        if label_encoder is None:
            # Backward-compatible fallback to the notebook-level encoder
            label_encoder = globals().get('label_encoder')
        if label_encoder is not None:
            y_pred = label_encoder.inverse_transform(y_pred)  # Convert back to original labels

    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred))

    cm = confusion_matrix(y_test, y_pred)

    # Label rows and columns with every class seen in the true or predicted labels
    class_names_unique = np.unique(np.concatenate((y_test, y_pred)))

    cm_df = pd.DataFrame(cm, index=class_names_unique, columns=class_names_unique)
    print(f"{model_name} Confusion Matrix:")
    print(cm_df)

In [None]:
# Model 1: RBF-kernel SVM trained directly on the raw pixel values
# Each image is flattened into one row so the classifier receives a 2-D matrix
x_train_reshaped = X_train.reshape(len(X_train), -1)
x_test_reshaped = X_test.reshape(len(X_test), -1)

svm_images = SVC(C=1.0, kernel='rbf', gamma='scale')
svm_images.fit(x_train_reshaped, y_train)

In [None]:
# Evaluate model1 (SVM performance)

evaluate_model(svm_images, x_test_reshaped, y_test, "SVM with Raw Images")

In [None]:
# Get predictions from the trained model
y_pred = svm_images.predict(x_test_reshaped)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.preprocessing import LabelEncoder
# The encoder is fitted on the training labels only to obtain the sorted class list
label_encoder = LabelEncoder()
label_encoder.fit(y_train)

# Pass the class list explicitly so the matrix rows/columns always match the display labels
confusionMatrix = confusion_matrix(y_test, y_pred, labels=label_encoder.classes_)
cmd = ConfusionMatrixDisplay(confusion_matrix=confusionMatrix, display_labels=label_encoder.classes_)
# Draw on an explicitly sized axes: cmd.plot() without `ax` opens its own
# figure and would leave a separately created 20x20 figure empty
_, ax = plt.subplots(figsize=(20, 20))
cmd.plot(ax=ax, xticks_rotation=90)
ax.set_title("SVC Confusion Matrix")
ax.tick_params(axis='y', labelrotation=0)
plt.show()

In [None]:
# Model2: KNN Model
# KNN classifier
# Train a KNN model
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(x_train_reshaped, y_train)

In [None]:
# Evalauate model2 (KNN performance)
evaluate_model(knn_model, x_test_reshaped, y_test, "KNN model evaluation")

In [None]:
# Get predictions from the trained model
y_pred = knn_model.predict(x_test_reshaped)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.preprocessing import LabelEncoder
# The encoder is fitted on the training labels only to obtain the sorted class list
label_encoder = LabelEncoder()
label_encoder.fit(y_train)

# Pass the class list explicitly so the matrix rows/columns always match the display labels
confusionMatrix = confusion_matrix(y_test, y_pred, labels=label_encoder.classes_)
cmd = ConfusionMatrixDisplay(confusion_matrix=confusionMatrix, display_labels=label_encoder.classes_)
# Draw on an explicitly sized axes: cmd.plot() without `ax` opens its own
# figure and would leave a separately created 20x20 figure empty
_, ax = plt.subplots(figsize=(20, 20))
cmd.plot(ax=ax, xticks_rotation=90)
ax.set_title("KNN Confusion Matrix")
ax.tick_params(axis='y', labelrotation=0)
plt.show()

In [None]:
# Model3: RandomForest model
# RandomForest classifier
# Train a RandomForest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(x_train_reshaped, y_train)

In [None]:
# Evalauate model3 (RandomForest performance)
# Evalauate RandomForest model
evaluate_model(rf_model, x_test_reshaped, y_test, "RandomForest model evaluation")

In [None]:
# Get predictions from the trained model
y_pred = rf_model.predict(x_test_reshaped)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.preprocessing import LabelEncoder
# The encoder is fitted on the training labels only to obtain the sorted class list
label_encoder = LabelEncoder()
label_encoder.fit(y_train)

# Pass the class list explicitly so the matrix rows/columns always match the display labels
confusionMatrix = confusion_matrix(y_test, y_pred, labels=label_encoder.classes_)
cmd = ConfusionMatrixDisplay(confusion_matrix=confusionMatrix, display_labels=label_encoder.classes_)
# Draw on an explicitly sized axes: cmd.plot() without `ax` opens its own
# figure and would leave a separately created 20x20 figure empty
_, ax = plt.subplots(figsize=(20, 20))
cmd.plot(ax=ax, xticks_rotation=90)
ax.set_title("RandomForest Confusion Matrix")
ax.tick_params(axis='y', labelrotation=0)
plt.show()

# **Hyperparameter Tuning with GridSearchCV**

**This step takes a long time to run (at least 20 minutes for 500+ images), so plan accordingly before executing it.**

In [None]:
# Only SVC, KNN and RandomForest are tuned. GradientBoosting was excluded because it takes much longer to train on the images.
# Hyperparameter grid for each model, keyed by the same names as `models` below
param_grid = {
    'SVC': {
        'kernel': ['linear', 'rbf', 'poly'],
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto']
    },
    'KNN': {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'RandomForest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    }
}

# Base estimators to tune. RandomForest gets the same fixed seed used elsewhere
# in this notebook so the search gives the same result on every run.
models = {
    'SVC': SVC(),
    'KNN': KNeighborsClassifier(),
    'RandomForest': RandomForestClassifier(random_state=42)
}

# Best estimator, parameters and cross-validation score per model name
best_models = {}

# 5-fold cross-validated grid search on the flattened training images
for model_name, model in models.items():
    grid_search = GridSearchCV(model, param_grid[model_name], cv=5, scoring='accuracy')
    grid_search.fit(x_train_reshaped, y_train)

    best_models[model_name] = {
        'model': grid_search.best_estimator_,
        'params': grid_search.best_params_,
        'score': grid_search.best_score_
    }

# Print the best parameters and cross-validation score for each algorithm
for model_name, results in best_models.items():
    print(f"Best {model_name}:")
    print(f"  Parameters: {results['params']}")
    print(f"  Score: {results['score']}")


# Evaluate the best models on the test set
for model_name, results in best_models.items():
    evaluate_model(results['model'], x_test_reshaped, y_test, f"Best {model_name}")


# **OpenCV** - Model performance evaluation using OpenCV, HOG, LBP via features extraction and SVC

In [None]:
# 6. Feature Extraction (OpenCV histogram + scikit-image texture descriptors)
def extract_features(images):
    """Build a hand-crafted feature vector for every image.

    Each image is converted to 8-bit grayscale and described by three
    descriptors, concatenated in this order:

    1. a 256-bin grayscale intensity histogram (``cv2.calcHist``),
    2. HOG features (8 orientations, 4x4-pixel cells, 2x2-cell blocks),
    3. a normalised histogram of uniform LBP codes (radius 1, 8 points).

    Args:
        images: Iterable of image arrays, either 2-D grayscale or 3-D with
            three channels last in RGB order. Floating-point images whose
            maximum is at most 1.0 are treated as [0, 1]-normalised and are
            rescaled to [0, 255]; casting them to uint8 directly would turn
            almost every pixel into 0.

    Returns:
        NumPy array with one feature row per image (empty for no images).
    """
    radius = 1
    n_points = 8 * radius
    # 'uniform' LBP produces n_points + 2 distinct codes (0 .. n_points + 1);
    # n_points + 3 bin edges give every code its own bin.
    lbp_bin_edges = np.arange(0, n_points + 3)

    features = []
    for image in images:
        image = np.asarray(image)
        is_float = np.issubdtype(image.dtype, np.floating)
        if is_float:
            # cvtColor accepts 32-bit but not 64-bit floating-point input
            image = image.astype(np.float32)

        if image.ndim == 3:  # Colour image: reduce to a single gray channel
            image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

        if is_float:
            if image.size and image.max() <= 1.0:
                image = image * 255.0  # Undo [0, 1] normalisation
            image = np.clip(np.rint(image), 0, 255)
        image = image.astype(np.uint8)

        # Grayscale intensity histogram
        hist = cv2.calcHist([image], [0], None, [256], [0, 256]).flatten()

        # HOG (Histogram of Oriented Gradients); the visualisation image is
        # not requested because it was never used
        hog_features = hog(image, orientations=8, pixels_per_cell=(4, 4),
                           cells_per_block=(2, 2))

        # LBP (Local Binary Patterns) histogram, normalised to sum to ~1
        lbp = local_binary_pattern(image, n_points, radius, method='uniform')
        lbp_hist, _ = np.histogram(lbp.ravel(), bins=lbp_bin_edges)
        lbp_hist = lbp_hist.astype("float")
        lbp_hist /= (lbp_hist.sum() + 1e-7)  # Epsilon guards against division by zero

        features.append(np.concatenate([hist, hog_features, lbp_hist]))

    return np.array(features)

In [None]:
# Add feature for the Xtrain and Xtest

x_train_features = extract_features(X_train)
x_test_features = extract_features(X_test)

In [None]:
# 7. Feature Scaling (NumPy):
scaler = StandardScaler()
x_train_features_scaled = scaler.fit_transform(x_train_features)  # Input and output are NumPy arrays
x_test_features_scaled = scaler.transform(x_test_features)

In [None]:
# OpenCV : SVM with Extracted Features
svm_features = SVC(kernel='linear', C=0.1, gamma='auto') # Updated parameters as per the GridSearchCV in previous steps
svm_features.fit(x_train_features_scaled, y_train)

In [None]:
# Evaluate OpenCV with SVM model

evaluate_model(svm_features, x_test_features_scaled, y_test, "SVM with Features")

In [None]:
# Get predictions from the trained model
y_pred = svm_features.predict(x_test_features_scaled)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.preprocessing import LabelEncoder
# The encoder is fitted on the training labels only to obtain the sorted class list
label_encoder = LabelEncoder()
label_encoder.fit(y_train)

# Pass the class list explicitly so the matrix rows/columns always match the display labels
confusionMatrix = confusion_matrix(y_test, y_pred, labels=label_encoder.classes_)
cmd = ConfusionMatrixDisplay(confusion_matrix=confusionMatrix, display_labels=label_encoder.classes_)
# Draw on an explicitly sized axes: cmd.plot() without `ax` opens its own
# figure and would leave a separately created 20x20 figure empty
_, ax = plt.subplots(figsize=(20, 20))
cmd.plot(ax=ax, xticks_rotation=90)
ax.set_title("OpenCV Confusion Matrix")
ax.tick_params(axis='y', labelrotation=0)
plt.show()

# **Pillow** - Model performance evaluation using Pillow and RandomForest

In [None]:
# Assuming you have a directory named 'images' with subdirectories for each class.

def load_images_from_folder(folder, img_size=(224, 224), df=None): # Pass the DataFrame to the function
    """Load labelled images with Pillow as flattened 64x64 RGB pixel vectors.

    The image id is the part of the file name before the first dot
    (e.g. ``49543.jpg`` -> 49543) and is matched against ``df['id']`` to
    obtain the ``masterCategory`` label. Files whose name is not numeric,
    whose id has no row in ``df``, or that cannot be opened are reported
    and skipped instead of aborting the whole load.

    Args:
        folder: Root directory that is walked recursively for image files.
        img_size: Accepted for signature compatibility but NOT used; images
            are always resized to 64x64.
        df: DataFrame with ``id`` and ``masterCategory`` columns. Required.

    Returns:
        Tuple ``(images, labels)`` of NumPy arrays in matching order; each
        image row is the flattened 64x64 RGB pixel array.

    Raises:
        ValueError: If ``df`` is not provided.
    """
    if df is None:
        raise ValueError("df with 'id' and 'masterCategory' columns is required to label the images")

    # Build the id -> label lookup once instead of scanning the DataFrame for
    # every file. Keeping the first row per id matches a first-match scan.
    label_by_id = df.drop_duplicates(subset='id').set_index('id')['masterCategory'].to_dict()

    images = []
    labels = []
    for subdir, _dirs, files in os.walk(folder):
        for file in files:
            image_path = os.path.join(subdir, file)
            try:
                image_id = int(file.split('.')[0])
            except ValueError:
                print(f"Error loading image {image_path}: file name does not start with a numeric id")
                continue

            if image_id not in label_by_id:
                print(f"Error loading image {image_path}: no label found for image ID {image_id}")
                continue

            try:
                # The context manager closes the file handle once the pixels are read
                with Image.open(image_path) as img:
                    resized = img.convert('RGB').resize((64, 64))
                    img_array = np.array(resized).flatten()
            except Exception as e:
                print(f"Error loading image {image_path}: {e}")
                continue

            images.append(img_array)
            labels.append(label_by_id[image_id])
    return np.array(images), np.array(labels)

# Load images, passing the DataFrame to get labels
# NOTE: this rebinds X, y and the train/test variables used by the earlier sections
folder = "images"
X, y = load_images_from_folder(folder, df=df) # Pass the DataFrame here

# Perform train-test split using the new 'y'
# NOTE(review): unlike the earlier split, no stratify=y is passed here -- confirm this is intended
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a RandomForest classifier on the flattened pixel vectors
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=None, min_samples_split=2, n_estimators=50, random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


In [None]:
# Evalauate Pillow model
evaluate_model(clf, X_test, y_test, "RandomForest with Pillow")

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
# Derive the class list from the current training labels rather than reusing
# an encoder that was fitted on an earlier train/test split
class_labels = np.unique(y_train)

# Pass the class list explicitly so the matrix rows/columns always match the display labels
confusionMatrix = confusion_matrix(y_test, y_pred, labels=class_labels)
cmd = ConfusionMatrixDisplay(confusion_matrix=confusionMatrix, display_labels=class_labels)
# Draw on an explicitly sized axes: cmd.plot() without `ax` opens its own
# figure and would leave a separately created 20x20 figure empty
_, ax = plt.subplots(figsize=(20, 20))
cmd.plot(ax=ax, xticks_rotation=90)
ax.set_title("Pillow Confusion Matrix")
ax.tick_params(axis='y', labelrotation=0)
plt.show()

# **CNN** - Model performance evaluation using a Convolutional Neural Network (CNN)

In [None]:
# Adding CNN model to the problem
# This cell encodes the labels, builds and trains a small CNN, then reports its test metrics.
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder instance
# (rebinds the notebook-level `label_encoder`, which evaluate_model1 looks up through globals())
label_encoder = LabelEncoder()

# Fit the encoder on the training labels and transform both training and testing labels
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Define the CNN model
# Three Conv2D + MaxPooling2D stages (32, 64, 128 filters), then a 128-unit dense layer and a softmax output
model = keras.Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 3)), # Updated input shape
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(len(np.unique(y_train_encoded)), activation='softmax') # Output layer with softmax for multi-class classification
])

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',  # Use sparse categorical crossentropy for integer labels
              metrics=['accuracy'])

# NOTE(review): X_train/X_test are presumably the flattened 64x64 RGB vectors from the Pillow loader,
# with pixel values left unscaled -- confirm, and consider dividing by 255 before training
X_train = X_train.reshape(-1, 64, 64, 3)  # Reshape to match the input shape of the first Conv2D layer
X_test = X_test.reshape(-1, 64, 64, 3)  # Reshape to match the input shape of the first Conv2D layer


# Train the model using the encoded labels
# NOTE: the test split doubles as validation data here, so the per-epoch validation metrics are test metrics
model.fit(X_train, y_train_encoded, epochs=9, validation_data=(X_test, y_test_encoded)) # Adjust epochs as needed

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test_encoded, verbose=0)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# Make predictions using the encoded labels
y_pred_encoded = np.argmax(model.predict(X_test), axis=1)

# Inverse transform the predictions to get the original labels
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Evaluate the model
# (same predictions scored again with scikit-learn, against the original string labels)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Evaluate the CNN model with the shared helper (it calls model.predict again internally)
evaluate_model1(model, X_test, y_test, "CNN Model")

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
# Pass the encoder's class list explicitly so the matrix rows/columns always match the display labels
confusionMatrix = confusion_matrix(y_test, y_pred, labels=label_encoder.classes_)
cmd = ConfusionMatrixDisplay(confusion_matrix=confusionMatrix, display_labels=label_encoder.classes_)
# Draw on an explicitly sized axes: cmd.plot() without `ax` opens its own
# figure and would leave a separately created 20x20 figure empty
_, ax = plt.subplots(figsize=(20, 20))
cmd.plot(ax=ax, xticks_rotation=90)
ax.set_title("CNN Confusion Matrix")
ax.tick_params(axis='y', labelrotation=0)
plt.show()

## **The RandomForest model trained on Pillow-loaded images gives near-perfect predictions, with few or no false positives. The CNN model also classifies the images well. See the confusion matrix of each model.**

# **Update DataFrame with Predicted Label for *master category* for comparison**

In [None]:
# Create a new column in your DataFrame for model predictions
df['model_predicted_masterCategory'] = None  # Initialize with None

# Function to predict masterCategory for images in a directory
def predict_masterCategory(image_dir, model, df):
    """Predict a master category for every image file and store it in ``df``.

    Each ``.jpg``/``.jpeg``/``.png`` file directly inside ``image_dir`` is
    loaded, converted to RGB, resized to 64x64 and flattened before being
    passed to ``model.predict``. The prediction is written to the
    ``model_predicted_masterCategory`` column of the rows whose ``id``
    equals the numeric prefix of the file name. Files that cannot be
    processed (including non-numeric names) are reported and skipped.

    Args:
        image_dir: Directory containing the image files (not searched recursively).
        model: Fitted estimator whose ``predict`` accepts a 2-D array of flattened pixels.
        df: DataFrame with an ``id`` column; modified in place.
    """
    for filename in os.listdir(image_dir):
        if not filename.endswith(('.jpg', '.jpeg', '.png')):  # Only image files
            continue
        image_path = os.path.join(image_dir, filename)

        try:
            # Parsed inside the try so one badly named file cannot abort the whole run
            image_id = int(filename.split('.')[0])

            # The context manager closes the file handle once the pixels are read
            with Image.open(image_path) as img:
                img_array = np.array(img.convert('RGB').resize((64, 64))).flatten()
            prediction = model.predict(img_array.reshape(1, -1))  # Single-sample batch

            # Update the DataFrame with predictions
            df.loc[df['id'] == image_id, 'model_predicted_masterCategory'] = prediction[0]

        except Exception as e:
            print(f"Error processing image {filename}: {e}")

# Call the function to predict and update the DataFrame
predict_masterCategory("images", clf, df)


In [None]:
# Show only df with updated rows only
df = df.dropna(subset=['model_predicted_masterCategory'])
# Print the updated DataFrame to verify
print(df.sample(10))

# **Plotting the results to show model accuracy. Actual vs Predicted values of Master Category**

In [None]:
# Frequency of each actual and each predicted master category,
# both tables keyed by a shared 'Category' column so they can be merged
master_category_counts = (
    df['masterCategory'].value_counts().rename_axis('Category').reset_index(name='Actual Count')
)
predicted_master_category_counts = (
    df['model_predicted_masterCategory'].value_counts().rename_axis('Category').reset_index(name='Predicted Count')
)

# Outer merge keeps categories that appear on only one side; their missing count becomes 0
plot_df = master_category_counts.merge(predicted_master_category_counts, on='Category', how='outer').fillna(0)

# Long format: one row per (Category, Type) pair, as seaborn expects for grouped bars
plot_df_melted = plot_df.melt(
    id_vars=['Category'],
    value_vars=['Actual Count', 'Predicted Count'],
    var_name='Type',
    value_name='Count',
)

# Grouped bar chart: actual vs predicted count per category
plt.figure(figsize=(12, 8))
barplot = sns.barplot(data=plot_df_melted, x='Category', y='Count', hue='Type')
plt.xticks(rotation=45, ha='right')
plt.xlabel('Master Category')
plt.ylabel('Count')
plt.title('Comparison of Actual and Predicted Master Category Counts')
plt.tight_layout()

# Write each bar's integer count 5 points above its top, centred on the bar
for patch in barplot.patches:
    bar_top = patch.get_height()
    bar_centre = patch.get_x() + patch.get_width() / 2
    barplot.annotate(
        f'{int(bar_top)}',
        xy=(bar_centre, bar_top),
        xytext=(0, 5),
        textcoords='offset points',
        ha='center',
        va='bottom',
    )

plt.show()


# **Next Steps and Future Enhancements...**
## **Now that we have built several models and identified the two best ones (RandomForest on Pillow-loaded images, and the CNN), which label images correctly with ~99% accuracy, the next steps are as follows:**

## 1) Enhance the models to classify and predict labels at a detailed level like 'sub-category' or 'product name'.

## 2) Create an endpoint to leverage it for use in real world application

## 3) Additionally, enhance the models to extract further features such as color, style, shape and text during image classification.
