# **Fashion MNIST: Feature Engineering**

***
***

In [1]:
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.decomposition import PCA

2025-04-28 14:24:12.699821: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745850252.712526   34706 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745850252.716240   34706 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745850252.727362   34706 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745850252.727381   34706 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745850252.727382   34706 computation_placer.cc:177] computation placer alr

In [2]:
# 1. Load Fashion MNIST through the Keras dataset helper, which returns a
#    (train, test) pair of (images, labels) tuples.
train_split, test_split = tf.keras.datasets.fashion_mnist.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

# 2. Human-readable class names; the list position is looked up by the integer label
class_names = [
    'T-shirt/top',
    'Trouser',
    'Pullover',
    'Dress',
    'Coat',
    'Sandal',
    'Shirt',
    'Sneaker',
    'Bag',
    'Ankle boot',
]

In [3]:
# 3. Stack train and test (train rows first) so every image goes through the
#    same feature pipeline; split_labels remembers which split each row came from.
x_all = np.concatenate((x_train, x_test))
y_all = np.concatenate((y_train, y_test))
split_labels = ['train' for _ in range(len(x_train))] + ['test' for _ in range(len(x_test))]

In [4]:
# 4. Calculate PCA components (for dimension reduction)
# Flatten each image into one row so PCA receives a 2-D (samples x pixels) matrix
x_flat = x_all.reshape(x_all.shape[0], -1)

# Fixed seed: without it the fitting subset (and therefore the PCA columns
# produced below) changes on every Restart & Run All.
PCA_SEED = 42
pca_rng = np.random.default_rng(PCA_SEED)

# Fit PCA on a subset to save memory. The size is clamped because sampling
# without replacement raises when more rows are requested than exist.
sample_size = min(10000, len(x_flat))
random_indices = pca_rng.choice(len(x_flat), size=sample_size, replace=False)

# random_state also pins the result when scikit-learn picks a randomized SVD solver
pca = PCA(n_components=10, random_state=PCA_SEED)
pca.fit(x_flat[random_indices])

# Transform all data
pca_results = pca.transform(x_flat)

In [5]:
# 5. Feature extraction function
def extract_features(image):
    """Summarise a single image as a flat dict of scalar descriptors.

    The descriptors are the mean and standard deviation of the pixel values,
    the mean Sobel gradient magnitude (3x3 kernels, float64 output) and a
    5-bin histogram over the fixed range 0-255, expressed as fractions of the
    total count.

    Args:
        image: array accepted by ``cv2.Sobel`` and ``np.histogram``.

    Returns:
        dict with keys ``mean_brightness``, ``std_deviation``,
        ``edge_density`` and ``hist_bin1`` .. ``hist_bin5``, in that order.
    """
    # Horizontal and vertical first derivatives, combined into a per-pixel magnitude
    grad_x = cv2.Sobel(image, cv2.CV_64F, 1, 0, ksize=3)
    grad_y = cv2.Sobel(image, cv2.CV_64F, 0, 1, ksize=3)
    gradient_norm = np.sqrt(grad_x**2 + grad_y**2)

    # Five equal-width intensity bins, converted from counts to fractions
    bin_counts, _edges = np.histogram(image, bins=5, range=(0, 255))
    bin_fractions = bin_counts / bin_counts.sum()

    summary = {
        'mean_brightness': np.mean(image),
        'std_deviation': np.std(image),
        'edge_density': np.mean(gradient_norm),
    }
    for position, fraction in enumerate(bin_fractions, start=1):
        summary[f'hist_bin{position}'] = fraction
    return summary

In [6]:
# 6. Process all images
# Two parallel record lists are built, both keyed by the same generated image_id:
#   image_features    - numeric descriptors plus the first five PCA components
#   metadata_features - label, class name and originating split
image_features = []
metadata_features = []
total_images = len(x_all)

for idx, (pixels, label, components, split) in enumerate(
        zip(x_all, y_all, pca_results, split_labels)):
    # Unique ID derived from the row position in the combined array
    image_id = f"img_{idx}"
    stats = extract_features(pixels)

    # Insertion order fixes the column order of the resulting DataFrame
    image_row = {'image_id': image_id}
    for stat_key in ('mean_brightness', 'std_deviation', 'edge_density'):
        image_row[stat_key] = stats[stat_key]
    for k in range(1, 6):
        image_row[f'pca_component_{k}'] = components[k - 1]
    for k in range(1, 6):
        image_row[f'hist_bin{k}'] = stats[f'hist_bin{k}']
    image_features.append(image_row)

    metadata_features.append(dict(
        image_id=image_id,
        class_id=int(label),
        class_name=class_names[label],
        data_split=split,
    ))

    # Progress report every 10,000 images
    if idx % 10000 == 0:
        print(f"Processed {idx}/{total_images} images")

Processed 0/70000 images
Processed 10000/70000 images
Processed 20000/70000 images
Processed 30000/70000 images
Processed 40000/70000 images
Processed 50000/70000 images
Processed 60000/70000 images


In [7]:
# 7. Convert to dataframes (one row per image, columns in record-key order)
image_features_df = pd.DataFrame(image_features)
metadata_features_df = pd.DataFrame(metadata_features)

# 8. Save to CSV
# to_csv does not create missing parent directories, and no visible cell
# creates ./features, so make sure it exists before writing.
os.makedirs('./features', exist_ok=True)
image_features_df.to_csv('./features/image_features.csv', index=False)
metadata_features_df.to_csv('./features/metadata_features.csv', index=False)

In [8]:
# 9. Display sample rows
print("\nImage Features (first 5 rows):")
print(image_features_df.head())

print("\nMetadata Features (first 5 rows):")
print(metadata_features_df.head())


Image Features (first 5 rows):
  image_id  mean_brightness  std_deviation  edge_density  pca_component_1  \
0    img_0        97.253827     101.792346    192.181315      -133.960372   
1    img_1       107.905612     100.831448    225.351588      1420.510590   
2    img_2        36.558673      49.698752     83.301051      -692.044930   
3    img_3        59.501276      64.849295    136.712344        60.543575   
4    img_4        78.044643     103.843248    190.411598       838.742679   

   pca_component_2  pca_component_3  pca_component_4  pca_component_5  \
0      1634.674432     -1180.050450      -351.047373         9.991461   
1      -425.358405      -224.046959      -361.054253       290.012937   
2     -1123.716759       107.366663      -201.745258       -94.288999   
3      -990.493523       218.350244      -360.377514        43.654988   
4     -1185.264650      -771.009579       227.522258       398.429202   

   hist_bin1  hist_bin2  hist_bin3  hist_bin4  hist_bin5  
0   0.5

In [9]:
# 10. Display dataset statistics
print("\nImage Features Statistics:")
print(image_features_df.describe())


Image Features Statistics:
       mean_brightness  std_deviation  edge_density  pca_component_1  \
count     70000.000000   70000.000000  70000.000000     70000.000000   
mean         72.969811      81.649481    176.495030        17.998457   
std          32.134516      20.019305     43.877835      1134.817214   
min           4.943878      16.525658     32.577617     -2026.518314   
25%          47.405293      66.923739    147.085311      -945.189249   
50%          69.336735      84.541055    173.466071         0.600875   
75%          97.354911      98.158086    203.357451       914.400001   
max         191.820153     121.286206    475.178819      2798.851306   

       pca_component_2  pca_component_3  pca_component_4  pca_component_5  \
count     70000.000000     70000.000000     70000.000000     70000.000000   
mean         -6.440443         4.424498        -5.566805         4.030693   
std         886.547650       516.046603       468.677947       412.592766   
min       -1705

In [10]:
# 11. Count classes in metadata
print("\nClass Distribution:")
print(metadata_features_df['class_name'].value_counts())


Class Distribution:
class_name
Ankle boot     7000
T-shirt/top    7000
Dress          7000
Pullover       7000
Sneaker        7000
Sandal         7000
Trouser        7000
Shirt          7000
Coat           7000
Bag            7000
Name: count, dtype: int64


In [11]:
# 12. Verify file sizes
# `os` is imported in the notebook's import cell rather than here, so every
# dependency is declared in one place.
BYTES_PER_MB = 1024 * 1024

print("\nCSV File Sizes:")
for csv_name in ('image_features.csv', 'metadata_features.csv'):
    # Size on disk of the file written by the save step, converted to mebibytes
    csv_size_mb = os.path.getsize(f'./features/{csv_name}') / BYTES_PER_MB
    print(f"{csv_name}: {csv_size_mb:.2f} MB")


CSV File Sizes:
image_features.csv: 17.20 MB
metadata_features.csv: 1.69 MB


***
***