# **Fashion MNIST: Raw Data Analysis**

***
***

### **Import Libraries and Data**

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.datasets import fashion_mnist
import os
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
import cv2
import json
from datetime import datetime

2025-04-27 10:00:15.026358: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745748015.038239   17397 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745748015.041383   17397 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745748015.050863   17397 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745748015.050875   17397 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745748015.050877   17397 computation_placer.cc:177] computation placer alr

In [2]:
# Create directory for analysis results
os.makedirs('./analysis_results', exist_ok=True)

# Set visualization style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [3]:
# Load Fashion MNIST dataset
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()

# Define class names
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

Training data shape: (60000, 28, 28)
Test data shape: (10000, 28, 28)


### **1. Dataset Overview**

In [4]:
# 1. Dataset Overview
def analyze_dataset_overview():
    # Class distribution
    train_dist = np.bincount(y_train)
    test_dist = np.bincount(y_test)
    
    # Create distribution visualization
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
    
    ax1.bar(class_names, train_dist)
    ax1.set_title('Training Set Class Distribution')
    ax1.set_xlabel('Class')
    ax1.set_ylabel('Count')
    ax1.tick_params(axis='x', rotation=45)
    
    ax2.bar(class_names, test_dist)
    ax2.set_title('Test Set Class Distribution')
    ax2.set_xlabel('Class')
    ax2.set_ylabel('Count')
    ax2.tick_params(axis='x', rotation=45)
    
    plt.tight_layout()
    plt.savefig('./analysis_results/class_distribution.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # Save distribution data
    distribution_data = {
        'train_distribution': dict(zip(class_names, train_dist.tolist())),
        'test_distribution': dict(zip(class_names, test_dist.tolist())),
        'image_shape': X_train[0].shape,
        'total_train_samples': len(X_train),
        'total_test_samples': len(X_test)
    }
    
    with open('./analysis_results/dataset_overview.json', 'w') as f:
        json.dump(distribution_data, f, indent=4)
    
    return distribution_data

overview_data = analyze_dataset_overview()

### **2. Statistical Analysis**

In [5]:
# 2. Statistical Analysis
def statistical_analysis():
    # Global statistics
    global_mean = np.mean(X_train)
    global_std = np.std(X_train)
    
    # Class-specific statistics
    class_stats = {}
    for i in range(10):
        class_data = X_train[y_train == i]
        class_stats[class_names[i]] = {
            'mean': float(np.mean(class_data)),
            'std': float(np.std(class_data)),
            'min': float(np.min(class_data)),
            'max': float(np.max(class_data)),
            'median': float(np.median(class_data))
        }
    
    # Pixel intensity distribution
    plt.figure(figsize=(10, 6))
    plt.hist(X_train.flatten(), bins=50, alpha=0.7)
    plt.title('Pixel Intensity Distribution')
    plt.xlabel('Pixel Value')
    plt.ylabel('Frequency')
    plt.savefig('./analysis_results/pixel_intensity_distribution.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # Class-wise brightness analysis
    fig, ax = plt.subplots(figsize=(12, 6))
    brightness_data = []
    for i in range(10):
        class_data = X_train[y_train == i]
        brightness_data.append(np.mean(class_data, axis=(1, 2)))
    
    ax.boxplot(brightness_data, labels=class_names)
    ax.set_title('Class-wise Brightness Distribution')
    ax.set_ylabel('Mean Brightness')
    ax.tick_params(axis='x', rotation=45)
    plt.tight_layout()
    plt.savefig('./analysis_results/class_brightness_distribution.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # Save statistical data
    stats_data = {
        'global_statistics': {
            'mean': float(global_mean),
            'std': float(global_std)
        },
        'class_statistics': class_stats
    }
    
    with open('./analysis_results/statistical_analysis.json', 'w') as f:
        json.dump(stats_data, f, indent=4)
    
    return stats_data

stats_data = statistical_analysis()

  ax.boxplot(brightness_data, labels=class_names)


### **3. Visual Pattern Analysis**

In [6]:
# 3. Visual Pattern Analysis
def visual_pattern_analysis():
    # Sample images for each class
    fig, axes = plt.subplots(10, 10, figsize=(15, 15))
    for i in range(10):
        class_indices = np.where(y_train == i)[0][:10]
        for j in range(10):
            axes[i, j].imshow(X_train[class_indices[j]], cmap='gray')
            axes[i, j].axis('off')
            if j == 0:
                axes[i, j].set_ylabel(class_names[i], rotation=90, size='large')
    
    plt.tight_layout()
    plt.savefig('./analysis_results/class_samples.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # Edge detection analysis
    fig, axes = plt.subplots(2, 5, figsize=(15, 6))
    for i in range(10):
        class_example = X_train[y_train == i][0]
        edges = cv2.Canny(class_example, 100, 200)
        
        ax = axes[i // 5, i % 5]
        ax.imshow(edges, cmap='gray')
        ax.set_title(class_names[i])
        ax.axis('off')
    
    plt.tight_layout()
    plt.savefig('./analysis_results/edge_detection_analysis.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # Average image per class
    fig, axes = plt.subplots(2, 5, figsize=(15, 6))
    average_images = []
    for i in range(10):
        class_images = X_train[y_train == i]
        avg_img = np.mean(class_images, axis=0)
        average_images.append(avg_img)
        
        ax = axes[i // 5, i % 5]
        ax.imshow(avg_img, cmap='gray')
        ax.set_title(f'Average {class_names[i]}')
        ax.axis('off')
    
    plt.tight_layout()
    plt.savefig('./analysis_results/average_class_images.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    return average_images

average_images = visual_pattern_analysis()

### **4. Dimensionality Analysis**

In [7]:
# 4. Dimensionality Analysis
def dimensionality_analysis():
    # Reshape data for analysis
    X_train_flat = X_train.reshape(X_train.shape[0], -1)
    
    # PCA Analysis
    pca = PCA(n_components=50)
    X_pca = pca.fit_transform(X_train_flat)
    
    # Plot explained variance
    plt.figure(figsize=(10, 6))
    plt.plot(np.cumsum(pca.explained_variance_ratio_))
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance Ratio')
    plt.title('PCA: Explained Variance vs Components')
    plt.grid(True)
    plt.savefig('./analysis_results/pca_explained_variance.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # t-SNE visualization (using a subset for efficiency)
    np.random.seed(42)
    subset_indices = np.random.choice(len(X_train_flat), 5000, replace=False)
    X_subset = X_train_flat[subset_indices]
    y_subset = y_train[subset_indices]
    
    tsne = TSNE(n_components=2, random_state=42)
    X_tsne = tsne.fit_transform(X_subset)
    
    plt.figure(figsize=(12, 10))
    scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_subset, cmap='tab10', alpha=0.6)
    plt.colorbar(scatter)
    plt.title('t-SNE Visualization of Fashion MNIST')
    
    # Add class labels
    for i in range(10):
        plt.annotate(class_names[i], 
                    xy=(np.mean(X_tsne[y_subset == i, 0]), 
                        np.mean(X_tsne[y_subset == i, 1])),
                    xytext=(5, 5), textcoords='offset points',
                    bbox=dict(boxstyle='round,pad=0.3', fc='yellow', alpha=0.7))
    
    plt.savefig('./analysis_results/tsne_visualization.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # Save dimensionality data
    dim_data = {
        'pca_explained_variance_ratio': pca.explained_variance_ratio_.tolist(),
        'pca_components_needed_95': int(np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.95) + 1)
    }
    
    with open('./analysis_results/dimensionality_analysis.json', 'w') as f:
        json.dump(dim_data, f, indent=4)
    
    return dim_data

dim_data = dimensionality_analysis()

### **5. Class Relationship Analysis**

In [8]:
# 5. Class Relationship Analysis
def class_relationship_analysis():
    # Calculate class centroids
    centroids = []
    for i in range(10):
        class_images = X_train[y_train == i]
        centroid = np.mean(class_images, axis=0)
        centroids.append(centroid.flatten())
    
    # Calculate similarity matrix
    similarity_matrix = cosine_similarity(centroids)
    
    # Plot similarity heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(similarity_matrix, annot=True, cmap='coolwarm', 
                xticklabels=class_names, yticklabels=class_names)
    plt.title('Class Similarity Matrix (Cosine Similarity)')
    plt.savefig('./analysis_results/class_similarity_matrix.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # Identify most similar class pairs
    similar_pairs = []
    for i in range(10):
        for j in range(i + 1, 10):
            similar_pairs.append({
                'class1': class_names[i],
                'class2': class_names[j],
                'similarity': float(similarity_matrix[i, j])
            })
    
    similar_pairs.sort(key=lambda x: x['similarity'], reverse=True)
    
    # Save relationship data
    relationship_data = {
        'similarity_matrix': similarity_matrix.tolist(),
        'most_similar_pairs': similar_pairs[:5],
        'least_similar_pairs': similar_pairs[-5:]
    }
    
    with open('./analysis_results/class_relationships.json', 'w') as f:
        json.dump(relationship_data, f, indent=4)
    
    return relationship_data

relationship_data = class_relationship_analysis()

### **6. Data Quality Assessment**

In [9]:
# 6. Data Quality Assessment
def data_quality_assessment():
    # Check for outliers using z-score
    X_train_flat = X_train.reshape(X_train.shape[0], -1)
    mean_intensity = np.mean(X_train_flat, axis=1)
    z_scores = (mean_intensity - np.mean(mean_intensity)) / np.std(mean_intensity)
    
    outlier_indices = np.where(np.abs(z_scores) > 3)[0]
    
    # Plot potential outliers
    fig, axes = plt.subplots(2, 5, figsize=(15, 6))
    axes = axes.ravel()
    for i, idx in enumerate(outlier_indices[:10]):
        axes[i].imshow(X_train[idx], cmap='gray')
        axes[i].set_title(f'Potential Outlier\n{class_names[y_train[idx]]}')
        axes[i].axis('off')
    
    plt.tight_layout()
    plt.savefig('./analysis_results/potential_outliers.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # Check data consistency
    quality_data = {
        'outlier_count': len(outlier_indices),
        'outlier_percentage': float(len(outlier_indices) / len(X_train) * 100),
        'data_range': {
            'min': float(np.min(X_train)),
            'max': float(np.max(X_train))
        },
        'null_values': bool(np.isnan(X_train).any()),
        'data_type': str(X_train.dtype)
    }
    
    with open('./analysis_results/data_quality_assessment.json', 'w') as f:
        json.dump(quality_data, f, indent=4)
    
    return quality_data

quality_data = data_quality_assessment()

### **7. Preprocessing Insights**

In [10]:
# 7. Preprocessing Insights
def preprocessing_insights():
    # Compare normalization techniques
    sample_image = X_train[0]
    
    # Different normalization methods
    normalizations = {
        'Original': sample_image,
        'Min-Max [0,1]': sample_image / 255.0,
        'Mean Normalization': (sample_image - np.mean(sample_image)) / np.std(sample_image),
        'Standard Scaling': (sample_image - np.mean(X_train)) / np.std(X_train)
    }
    
    fig, axes = plt.subplots(2, 2, figsize=(10, 10))
    axes = axes.ravel()
    
    for i, (name, img) in enumerate(normalizations.items()):
        axes[i].imshow(img, cmap='gray')
        axes[i].set_title(name)
        axes[i].axis('off')
    
    plt.tight_layout()
    plt.savefig('./analysis_results/normalization_comparison.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # Analyze brightness and contrast effects
    augmentation_examples = {
        'Original': sample_image,
        'Brightness +50': np.clip(sample_image + 50, 0, 255),
        'Brightness -50': np.clip(sample_image - 50, 0, 255),
        'Contrast x1.5': np.clip(sample_image * 1.5, 0, 255)
    }
    
    fig, axes = plt.subplots(2, 2, figsize=(10, 10))
    axes = axes.ravel()
    
    for i, (name, img) in enumerate(augmentation_examples.items()):
        axes[i].imshow(img, cmap='gray')
        axes[i].set_title(name)
        axes[i].axis('off')
    
    plt.tight_layout()
    plt.savefig('./analysis_results/augmentation_examples.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    preprocessing_recommendations = {
        'recommended_normalization': 'Min-Max [0,1]',
        'reason': 'Maintains relative pixel intensities while scaling to standard range',
        'augmentation_suggestions': [
            'Random rotation (±10 degrees)',
            'Random horizontal flip',
            'Random brightness adjustment (±10%)',
            'Random zoom (0.9-1.1x)'
        ]
    }
    
    with open('./analysis_results/preprocessing_insights.json', 'w') as f:
        json.dump(preprocessing_recommendations, f, indent=4)
    
    return preprocessing_recommendations

preprocessing_data = preprocessing_insights()

### **8. Performance Predictions**

In [11]:
# 8. Performance Predictions
def performance_predictions():
    # Based on analysis, predict difficulty of classification
    class_difficulty = []
    
    for i in range(10):
        class_images = X_train[y_train == i]
        
        # Calculate intra-class variance
        variance = np.mean(np.var(class_images, axis=0))
        
        # Get similarity to other classes
        similar_classes = []
        for j in range(10):
            if i != j:
                similarity = relationship_data['similarity_matrix'][i][j]
                similar_classes.append((class_names[j], similarity))
        
        similar_classes.sort(key=lambda x: x[1], reverse=True)
        
        class_difficulty.append({
            'class': class_names[i],
            'variance': float(variance),
            'most_similar_to': similar_classes[0][0],
            'similarity_score': float(similar_classes[0][1])
        })
    
    # Sort by difficulty (higher variance and similarity = more difficult)
    class_difficulty.sort(key=lambda x: x['variance'] * x['similarity_score'], reverse=True)
    
    # Create visualization
    fig, ax = plt.subplots(figsize=(12, 6))
    
    classes = [item['class'] for item in class_difficulty]
    difficulty_scores = [item['variance'] * item['similarity_score'] for item in class_difficulty]
    
    bars = ax.bar(classes, difficulty_scores)
    ax.set_ylabel('Difficulty Score')
    ax.set_title('Predicted Classification Difficulty by Class')
    ax.tick_params(axis='x', rotation=45)
    
    plt.tight_layout()
    plt.savefig('./analysis_results/classification_difficulty_prediction.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    performance_data = {
        'class_difficulty_ranking': class_difficulty,
        'expected_confusion_pairs': [
            {'pair': [item['class'], item['most_similar_to']], 
             'similarity': item['similarity_score']} 
            for item in class_difficulty[:5]
        ],
        'estimated_accuracy_range': {
            'simple_model': '85-90%',
            'complex_model': '92-95%',
            'ensemble': '94-97%'
        }
    }
    
    with open('./analysis_results/performance_predictions.json', 'w') as f:
        json.dump(performance_data, f, indent=4)
    
    return performance_data

performance_data = performance_predictions()

### **9. Comparative Analysis**

In [12]:
# 9. Comparative Analysis with Original MNIST
def comparative_analysis():
    # Load original MNIST for comparison
    from tensorflow.keras.datasets import mnist
    (mnist_train, _), _ = mnist.load_data()
    
    # Compare complexity
    fashion_complexity = np.mean(np.std(X_train, axis=(1, 2)))
    mnist_complexity = np.mean(np.std(mnist_train, axis=(1, 2)))
    
    # Compare edge density
    fashion_edges = np.mean([np.mean(cv2.Canny(img, 100, 200) > 0) for img in X_train[:1000]])
    mnist_edges = np.mean([np.mean(cv2.Canny(img, 100, 200) > 0) for img in mnist_train[:1000]])
    
    # Visual comparison
    fig, axes = plt.subplots(2, 10, figsize=(15, 4))
    
    for i in range(10):
        axes[0, i].imshow(X_train[i], cmap='gray')
        axes[0, i].axis('off')
        if i == 0:
            axes[0, 0].set_ylabel('Fashion\nMNIST', rotation=0, labelpad=40)
        
        axes[1, i].imshow(mnist_train[i], cmap='gray')
        axes[1, i].axis('off')
        if i == 0:
            axes[1, 0].set_ylabel('Original\nMNIST', rotation=0, labelpad=40)
    
    plt.tight_layout()
    plt.savefig('./analysis_results/mnist_comparison.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    comparison_data = {
        'complexity_comparison': {
            'fashion_mnist': float(fashion_complexity),
            'original_mnist': float(mnist_complexity),
            'complexity_ratio': float(fashion_complexity / mnist_complexity)
        },
        'edge_density_comparison': {
            'fashion_mnist': float(fashion_edges),
            'original_mnist': float(mnist_edges),
            'edge_ratio': float(fashion_edges / mnist_edges)
        },
        'key_differences': [
            'Fashion MNIST has more complex patterns and textures',
            'Higher intra-class variability in Fashion MNIST',
            'More challenging due to visual similarity between classes',
            'Requires more sophisticated feature extraction'
        ],
        'architecture_recommendations': [
            'Use deeper CNNs for Fashion MNIST',
            'Consider attention mechanisms',
            'Implement data augmentation',
            'Use ensemble methods for best performance'
        ]
    }
    
    with open('./analysis_results/comparative_analysis.json', 'w') as f:
        json.dump(comparison_data, f, indent=4)
    
    return comparison_data

comparison_data = comparative_analysis()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step


### **Executive Summary Generation**

In [13]:
# Generate Executive Summary
def generate_executive_summary():
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    
    summary = {
        'analysis_timestamp': timestamp,
        'dataset_summary': {
            'total_samples': overview_data['total_train_samples'] + overview_data['total_test_samples'],
            'image_dimensions': overview_data['image_shape'],
            'number_of_classes': 10,
            'class_balance': 'Balanced (6,000 samples per class)'
        },
        'key_findings': {
            'data_quality': f"{quality_data['outlier_percentage']:.2f}% potential outliers detected",
            'dimensionality': f"{dim_data['pca_components_needed_95']} components explain 95% variance",
            'class_relationships': f"Most similar classes: {relationship_data['most_similar_pairs'][0]['class1']} and {relationship_data['most_similar_pairs'][0]['class2']}",
            'complexity': f"{comparison_data['complexity_comparison']['complexity_ratio']:.2f}x more complex than original MNIST"
        },
        'recommendations': {
            'preprocessing': preprocessing_data['recommended_normalization'],
            'architecture': comparison_data['architecture_recommendations'][0],
            'expected_performance': performance_data['estimated_accuracy_range']['complex_model']
        }
    }
    
    with open('./analysis_results/executive_summary.json', 'w') as f:
        json.dump(summary, f, indent=4)
    
    # Create a visual summary
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))
    
    # Class distribution
    train_dist = np.bincount(y_train)
    ax1.bar(class_names, train_dist)
    ax1.set_title('Class Distribution')
    ax1.tick_params(axis='x', rotation=45)
    
    # PCA variance
    pca_variance = dim_data['pca_explained_variance_ratio'][:20]
    ax2.plot(range(1, 21), np.cumsum(pca_variance))
    ax2.set_title('PCA Explained Variance')
    ax2.set_xlabel('Components')
    ax2.set_ylabel('Cumulative Variance')
    ax2.grid(True)
    
    # Class similarity
    im = ax3.imshow(relationship_data['similarity_matrix'], cmap='coolwarm')
    ax3.set_xticks(range(10))
    ax3.set_yticks(range(10))
    ax3.set_xticklabels(class_names, rotation=45)
    ax3.set_yticklabels(class_names)
    ax3.set_title('Class Similarity Matrix')
    plt.colorbar(im, ax=ax3)
    
    # Difficulty prediction
    classes = [item['class'] for item in performance_data['class_difficulty_ranking']][:5]
    scores = [item['variance'] * item['similarity_score'] for item in performance_data['class_difficulty_ranking']][:5]
    ax4.bar(classes, scores)
    ax4.set_title('Top 5 Most Difficult Classes')
    ax4.tick_params(axis='x', rotation=45)
    
    plt.tight_layout()
    plt.savefig('./analysis_results/executive_summary_visualization.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    print("Analysis complete! All results saved to ./analysis_results/")
    return summary

executive_summary = generate_executive_summary()

Analysis complete! All results saved to ./analysis_results/


In [14]:
def run_complete_analysis():
    print("Starting Fashion MNIST Analysis...")
    
    print("1. Dataset Overview...")
    analyze_dataset_overview()
    
    print("2. Statistical Analysis...")
    statistical_analysis()
    
    print("3. Visual Pattern Analysis...")
    visual_pattern_analysis()
    
    print("4. Dimensionality Analysis...")
    dimensionality_analysis()
    
    print("5. Class Relationship Analysis...")
    class_relationship_analysis()
    
    print("6. Data Quality Assessment...")
    data_quality_assessment()
    
    print("7. Preprocessing Insights...")
    preprocessing_insights()
    
    print("8. Performance Predictions...")
    performance_predictions()
    
    print("9. Comparative Analysis...")
    comparative_analysis()
    
    print("10. Generating Executive Summary...")
    generate_executive_summary()
    
    print("\nAnalysis Complete! Check ./analysis_results/ for all outputs.")

# Run the complete analysis
if __name__ == "__main__":
    run_complete_analysis()

Starting Fashion MNIST Analysis...
1. Dataset Overview...
2. Statistical Analysis...


  ax.boxplot(brightness_data, labels=class_names)


3. Visual Pattern Analysis...
4. Dimensionality Analysis...
5. Class Relationship Analysis...
6. Data Quality Assessment...
7. Preprocessing Insights...
8. Performance Predictions...
9. Comparative Analysis...
10. Generating Executive Summary...
Analysis complete! All results saved to ./analysis_results/

Analysis Complete! Check ./analysis_results/ for all outputs.


***
***