In [None]:
%pip install torch

In [None]:
import numpy as np

# Path to the unlabelled image array (.npy).
# NOTE(review): nothing else in the visible code reads this name;
# processImages() hard-codes the same path instead.
image_dataset = '../datasets/unlabelled_train_data_images.npy'


In [None]:
import numpy as np
import torch
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import os

class MNISTPreprocessor:
    """
    Preprocess and pseudo-label an unlabeled MNIST-like dataset.

    The class loads a ``.npy`` array of images, scales it, clusters the
    flattened (optionally PCA-reduced) pixels with one of several scikit-learn
    algorithms, and records quality metrics and visualizations along the way.
    All random sampling done by the class is driven by ``random_state`` so
    that repeated runs pick the same subsets and example images.
    """

    def __init__(self, data_path, n_clusters=10, random_state=42):
        """
        Initialize the preprocessor.

        Args:
            data_path (str): Path to the .npy file containing the unlabeled images.
            n_clusters (int): Number of clusters for clustering algorithms (default: 10 for digits 0-9).
            random_state (int): Random seed for reproducibility.
        """
        self.data_path = data_path
        self.n_clusters = n_clusters
        self.random_state = random_state
        self.images = None               # raw array as loaded from disk
        self.preprocessed_images = None  # float32 copy scaled by 1/255
        self.labels = None               # labels from the most recent clustering run
        self.metrics = {}                # every metric computed so far, keyed by name

    def load_data(self):
        """
        Load the dataset from the .npy file and compute initial data metrics.

        Returns:
            numpy.ndarray: Loaded images with shape (n_samples, 1, 28, 28).

        Raises:
            ValueError: If the array does not have shape (n_samples, 1, 28, 28)
                or contains NaN / infinite values.
        """
        self.images = np.load(self.data_path)
        print(f"Loaded data shape: {self.images.shape}")

        # Validate data
        if self.images.shape[1:] != (1, 28, 28):
            raise ValueError("Expected images with shape (n_samples, 1, 28, 28)")
        if np.any(np.isnan(self.images)) or np.any(np.isinf(self.images)):
            raise ValueError("Data contains NaN or infinite values")

        # Compute initial pixel statistics
        self.metrics['raw_pixel_stats'] = {
            'mean': np.mean(self.images),
            'std': np.std(self.images),
            'min': np.min(self.images),
            'max': np.max(self.images)
        }
        print("Raw pixel statistics:", self.metrics['raw_pixel_stats'])

        return self.images

    def preprocess_images(self):
        """
        Scale the images and return them as a PyTorch tensor.

        Loads the data first if that has not happened yet. The scaled array is
        kept on ``self.preprocessed_images`` and its pixel statistics are
        stored under ``metrics['preprocessed_pixel_stats']``.

        Returns:
            torch.Tensor: Preprocessed images with shape (n_samples, 1, 28, 28).
        """
        if self.images is None:
            self.load_data()

        # Divide by 255 -- assumes raw pixel values are in [0, 255] (TODO confirm
        # against the dataset; the raw min/max are printed by load_data()).
        self.preprocessed_images = self.images.astype(np.float32) / 255.0
        print(f"Preprocessed data shape: {self.preprocessed_images.shape}, range: [{self.preprocessed_images.min()}, {self.preprocessed_images.max()}]")

        # Compute preprocessed pixel statistics
        self.metrics['preprocessed_pixel_stats'] = {
            'mean': np.mean(self.preprocessed_images),
            'std': np.std(self.preprocessed_images),
            'min': np.min(self.preprocessed_images),
            'max': np.max(self.preprocessed_images)
        }
        print("Preprocessed pixel statistics:", self.metrics['preprocessed_pixel_stats'])

        # Convert to PyTorch tensor (shares memory with the NumPy array)
        tensor_images = torch.from_numpy(self.preprocessed_images)
        return tensor_images

    def flatten_for_clustering(self, apply_pca=False, n_components=50):
        """
        Flatten images for clustering and optionally apply PCA, compute PCA metrics.

        Args:
            apply_pca (bool): Whether to apply PCA for dimensionality reduction.
            n_components (int): Number of PCA components if apply_pca is True.

        Returns:
            numpy.ndarray: Flattened or PCA-transformed images with shape (n_samples, n_features).
        """
        if self.preprocessed_images is None:
            self.preprocess_images()

        # Flatten images: (n_samples, 1, 28, 28) -> (n_samples, 784)
        flattened = self.preprocessed_images.reshape(self.preprocessed_images.shape[0], -1)
        print(f"Flattened data shape: {flattened.shape}")

        if apply_pca:
            pca = PCA(n_components=n_components, random_state=self.random_state)
            flattened = pca.fit_transform(flattened)
            print(f"PCA-transformed data shape: {flattened.shape}")

            # Store PCA metrics
            self.metrics['pca_metrics'] = {
                'n_components': n_components,
                'explained_variance_ratio': np.sum(pca.explained_variance_ratio_),
                'individual_explained_variance': pca.explained_variance_ratio_.tolist()
            }
            print(f"PCA explained variance ratio: {self.metrics['pca_metrics']['explained_variance_ratio']:.4f}")

        return flattened

    def compute_clustering_metrics(self, flattened_images, labels, algorithm, subset_ratio=0.1):
        """
        Compute clustering quality metrics: silhouette score, Davies-Bouldin index, cluster sizes,
        and algorithm-specific metrics.

        The silhouette and Davies-Bouldin scores are computed on a random
        subset drawn with ``random_state``. They are reported as ``None`` when
        there are fewer than two clusters or when the sampled subset does not
        contain a usable number of distinct labels.

        Args:
            flattened_images (numpy.ndarray): Flattened or PCA-transformed images.
            labels (numpy.ndarray): Integer cluster labels (-1 marks DBSCAN noise).
            algorithm (str): Name of the clustering algorithm ('kmeans', 'agglomerative', 'dbscan', 'gmm').
            subset_ratio (float): Fraction of data to use for silhouette and DB scores (default: 0.1).

        Returns:
            dict: Clustering metrics.
        """
        # Draw the scoring subset from a seeded generator so results are repeatable;
        # clamp the size so an out-of-range ratio cannot request more rows than exist.
        n_total = flattened_images.shape[0]
        n_subset = min(n_total, max(0, int(n_total * subset_ratio)))
        rng = np.random.default_rng(self.random_state)
        subset_indices = rng.choice(n_total, size=n_subset, replace=False)
        subset_images = flattened_images[subset_indices]
        subset_labels = labels[subset_indices]

        clustering_metrics = {}

        # Cluster sizes (noise points are counted separately, not as a cluster)
        unique_labels = np.unique(labels)
        has_noise = -1 in unique_labels
        if has_noise:
            cluster_sizes = np.bincount(labels[labels != -1], minlength=len(unique_labels) - 1)
            clustering_metrics['n_noise_points'] = int(np.sum(labels == -1))
        else:
            cluster_sizes = np.bincount(labels, minlength=self.n_clusters)
        clustering_metrics['cluster_sizes'] = cluster_sizes.tolist()
        clustering_metrics['n_clusters'] = len(unique_labels) - (1 if has_noise else 0)

        # Both scores need between 2 and n_samples - 1 distinct labels in the
        # data they are given, which is the sampled subset here.
        n_subset_labels = len(np.unique(subset_labels))
        if clustering_metrics['n_clusters'] < 2:
            skip_reason = "fewer than 2 clusters"
        elif not 2 <= n_subset_labels < len(subset_labels):
            skip_reason = "sampled subset does not contain a valid number of distinct labels"
        else:
            skip_reason = None

        if skip_reason is None:
            clustering_metrics['silhouette_score'] = silhouette_score(subset_images, subset_labels)
            clustering_metrics['davies_bouldin_score'] = davies_bouldin_score(subset_images, subset_labels)
        else:
            clustering_metrics['silhouette_score'] = None
            print(f"Silhouette score not computed ({skip_reason})")
            clustering_metrics['davies_bouldin_score'] = None
            print(f"Davies-Bouldin index not computed ({skip_reason})")

        # Algorithm-specific metrics
        if algorithm == 'kmeans' or algorithm == 'agglomerative':
            # Inertia: within-cluster sum of squared distances to each cluster mean
            inertia = 0.0
            for label in unique_labels:
                if label == -1:  # Noise is not a cluster
                    continue
                cluster_points = flattened_images[labels == label]
                centroid = np.mean(cluster_points, axis=0)
                inertia += np.sum((cluster_points - centroid) ** 2)
            clustering_metrics['inertia'] = inertia
        elif algorithm == 'gmm':
            # Log-likelihood and BIC are stored by label_with_clustering(); they
            # are None when this method is called without a prior GMM fit.
            clustering_metrics['log_likelihood'] = self.metrics.get('gmm_log_likelihood', None)
            clustering_metrics['bic'] = self.metrics.get('gmm_bic', None)

        print(f"Clustering metrics for {algorithm} (subset {subset_ratio*100}%):")
        if clustering_metrics['silhouette_score'] is not None:
            print(f"  Silhouette Score: {clustering_metrics['silhouette_score']:.4f}")
        if clustering_metrics['davies_bouldin_score'] is not None:
            print(f"  Davies-Bouldin Index: {clustering_metrics['davies_bouldin_score']:.4f}")
        print(f"  Number of Clusters: {clustering_metrics['n_clusters']}")
        print(f"  Cluster Sizes: {clustering_metrics['cluster_sizes']}")
        if 'inertia' in clustering_metrics:
            print(f"  Inertia: {clustering_metrics['inertia']:.2f}")
        if 'n_noise_points' in clustering_metrics:
            print(f"  Noise Points: {clustering_metrics['n_noise_points']}")
        # Guard on the value, not the key: formatting None with ':.2f' raises TypeError.
        if clustering_metrics.get('log_likelihood') is not None:
            print(f"  Log-Likelihood: {clustering_metrics['log_likelihood']:.2f}")
        if clustering_metrics.get('bic') is not None:
            print(f"  BIC: {clustering_metrics['bic']:.2f}")

        return clustering_metrics

    def label_with_clustering(self, algorithm='kmeans', apply_pca=False, n_components=50, dbscan_eps=0.5, dbscan_min_samples=5):
        """
        Label the dataset using the specified clustering algorithm and compute metrics.

        The resulting labels replace ``self.labels``; the metrics are stored
        under ``metrics['<algorithm>_clustering_metrics']``.

        Args:
            algorithm (str): Clustering algorithm ('kmeans', 'agglomerative', 'dbscan', 'gmm').
            apply_pca (bool): Whether to apply PCA before clustering.
            n_components (int): Number of PCA components if apply_pca is True.
            dbscan_eps (float): DBSCAN epsilon parameter (distance threshold).
            dbscan_min_samples (int): DBSCAN minimum samples for a core point.

        Returns:
            numpy.ndarray: Cluster labels with shape (n_samples,).

        Raises:
            ValueError: If ``algorithm`` is not one of the supported names.
        """
        # Reject bad input before doing the (potentially expensive) PCA work.
        if algorithm not in ('kmeans', 'agglomerative', 'dbscan', 'gmm'):
            raise ValueError("Unsupported algorithm. Choose 'kmeans', 'agglomerative', 'dbscan', or 'gmm'.")

        flattened_images = self.flatten_for_clustering(apply_pca, n_components)

        if algorithm == 'kmeans':
            model = KMeans(n_clusters=self.n_clusters, random_state=self.random_state, n_init=10)
            self.labels = model.fit_predict(flattened_images)
            self.metrics['inertia'] = model.inertia_
            print(f"Generated {self.n_clusters} clusters with K-Means")
            print(f"K-Means Inertia: {self.metrics['inertia']:.2f}")

        elif algorithm == 'agglomerative':
            model = AgglomerativeClustering(n_clusters=self.n_clusters, linkage='ward')
            self.labels = model.fit_predict(flattened_images)
            print(f"Generated {self.n_clusters} clusters with Agglomerative Clustering")

        elif algorithm == 'dbscan':
            model = DBSCAN(eps=dbscan_eps, min_samples=dbscan_min_samples, n_jobs=-1)
            self.labels = model.fit_predict(flattened_images)
            # DBSCAN chooses the cluster count itself; -1 is noise, not a cluster.
            n_clusters = len(np.unique(self.labels)) - (1 if -1 in self.labels else 0)
            print(f"Generated {n_clusters} clusters with DBSCAN (eps={dbscan_eps}, min_samples={dbscan_min_samples})")

        else:  # 'gmm'
            model = GaussianMixture(n_components=self.n_clusters, covariance_type='full', random_state=self.random_state)
            self.labels = model.fit_predict(flattened_images)
            # score() is the per-sample average, so scale by n for the total log-likelihood
            self.metrics['gmm_log_likelihood'] = model.score(flattened_images) * flattened_images.shape[0]
            self.metrics['gmm_bic'] = model.bic(flattened_images)
            print(f"Generated {self.n_clusters} clusters with GMM")
            print(f"GMM Log-Likelihood: {self.metrics['gmm_log_likelihood']:.2f}")
            print(f"GMM BIC: {self.metrics['gmm_bic']:.2f}")

        # Compute clustering metrics
        self.metrics[f'{algorithm}_clustering_metrics'] = self.compute_clustering_metrics(flattened_images, self.labels, algorithm)

        return self.labels

    def visualize_clusters(self, save_path=None):
        """
        Visualize up to five example images from each cluster, one row per cluster.

        DBSCAN noise (label -1) is not shown. Example images are drawn with
        ``random_state`` so the same figure is produced on every call.

        Args:
            save_path (str, optional): Path to save the visualization plot.

        Raises:
            ValueError: If no labels have been generated yet.
        """
        if self.labels is None:
            raise ValueError("Run label_with_clustering() first to generate labels")

        # Drop the noise label up front so row indices line up with real clusters.
        cluster_ids = [label for label in np.unique(self.labels) if label != -1]
        n_clusters = len(cluster_ids)
        if n_clusters == 0:
            print("No clusters to visualize (all points are noise)")
            return

        # squeeze=False always yields a 2-D axes array, even for a single row.
        fig, axes = plt.subplots(n_clusters, 5, figsize=(15, 2 * n_clusters), squeeze=False)
        for ax in axes.ravel():
            ax.axis('off')  # also blanks panels of clusters with fewer than 5 members

        rng = np.random.default_rng(self.random_state)
        for row, cluster in enumerate(cluster_ids):
            cluster_indices = np.where(self.labels == cluster)[0]
            selected_indices = rng.choice(cluster_indices, size=min(5, len(cluster_indices)), replace=False)
            for col, idx in enumerate(selected_indices):
                axes[row, col].imshow(self.preprocessed_images[idx, 0], cmap='gray')
                if col == 0:
                    axes[row, col].set_title(f"Cluster {cluster}")

        plt.tight_layout()
        if save_path:
            plt.savefig(save_path)
            print(f"Cluster visualization saved to {save_path}")
        plt.show()

    def visualize_tsne(self, subset_ratio=0.2, apply_pca=True, n_components=50, save_path=None):
        """
        Apply t-SNE to visualize a random subset of the dataset in 2D, colored by cluster labels.

        Args:
            subset_ratio (float): Fraction of the dataset to use (default: 0.2 for 20%).
            apply_pca (bool): Whether to apply PCA before t-SNE.
            n_components (int): Number of PCA components if apply_pca is True
                (capped at the subset's sample and feature counts).
            save_path (str, optional): Path to save the t-SNE visualization plot.

        Returns:
            numpy.ndarray: 2D t-SNE embeddings.

        Raises:
            ValueError: If no labels have been generated yet, or if
                ``subset_ratio`` selects no samples.
        """
        if self.labels is None:
            raise ValueError("Run label_with_clustering() first to generate labels")

        n_total = self.preprocessed_images.shape[0]
        n_samples = min(n_total, int(n_total * subset_ratio))
        if n_samples <= 0:
            raise ValueError("subset_ratio selects no samples for t-SNE")

        rng = np.random.default_rng(self.random_state)
        indices = rng.choice(n_total, size=n_samples, replace=False)
        subset_images = self.preprocessed_images[indices]
        subset_labels = self.labels[indices]
        print(f"Selected {n_samples} samples for t-SNE visualization")

        flattened = subset_images.reshape(n_samples, -1)
        if apply_pca:
            # PCA cannot produce more components than min(n_samples, n_features).
            n_components = min(n_components, flattened.shape[0], flattened.shape[1])
            pca = PCA(n_components=n_components, random_state=self.random_state)
            flattened = pca.fit_transform(flattened)
            self.metrics['tsne_pca_metrics'] = {
                'n_components': n_components,
                'explained_variance_ratio': np.sum(pca.explained_variance_ratio_)
            }
            print(f"t-SNE PCA explained variance ratio: {self.metrics['tsne_pca_metrics']['explained_variance_ratio']:.4f}")

        tsne = TSNE(n_components=2, random_state=self.random_state, n_jobs=-1)
        tsne_embeddings = tsne.fit_transform(flattened)
        print(f"t-SNE embeddings shape: {tsne_embeddings.shape}")

        plt.figure(figsize=(10, 8))
        scatter = plt.scatter(tsne_embeddings[:, 0], tsne_embeddings[:, 1], c=subset_labels, cmap='tab10', alpha=0.6)
        plt.colorbar(scatter, label='Cluster Label')
        # Report the fraction actually plotted instead of a fixed "20%".
        plt.title(f't-SNE Visualization of {subset_ratio * 100:g}% MNIST Dataset')
        plt.xlabel('t-SNE Component 1')
        plt.ylabel('t-SNE Component 2')

        if save_path:
            plt.savefig(save_path)
            print(f"t-SNE visualization saved to {save_path}")
        plt.show()

        return tsne_embeddings

    def save_labeled_dataset(self, output_path):
        """
        Save the preprocessed images and current labels to a .npz file.

        The archive contains two arrays, ``images`` and ``labels``; the
        collected metrics are not written.

        Args:
            output_path (str): Path to save the .npz file.

        Raises:
            ValueError: If images have not been preprocessed or labels generated.
        """
        if self.preprocessed_images is None or self.labels is None:
            raise ValueError("Preprocess images and generate labels first")

        np.savez(output_path, images=self.preprocessed_images, labels=self.labels)
        print(f"Labeled dataset saved to {output_path}")

    def get_metrics_summary(self):
        """
        Return a summary of all computed metrics.

        Returns:
            dict: The live metrics dictionary (not a copy).
        """
        return self.metrics



In [None]:

def processImages():
    """
    Run the clustering pipeline end to end on the unlabelled image file.

    For each algorithm in turn (K-Means, agglomerative, GMM) the images are
    PCA-reduced and clustered, the cluster and t-SNE figures are written with
    the algorithm name appended to the file name, and the relevant metrics are
    printed. Because each run overwrites the preprocessor's labels, the saved
    .npz file holds the labels of the last algorithm in the list.
    """
    source_npy = '../datasets/unlabelled_train_data_images.npy'
    labeled_npz = '../datasets/labeled_train_data.npz'
    cluster_png = '../datasets/cluster_visualization.png'
    tsne_png = '../datasets/tsne_visualization.png'

    pipeline = MNISTPreprocessor(source_npy, n_clusters=10, random_state=42)

    # Load and scale once; every clustering run reuses the same arrays.
    pipeline.load_data()
    pipeline.preprocess_images()

    # DBSCAN is deliberately left out of the comparison:
    # ('kmeans', 'agglomerative', 'dbscan', 'gmm')
    shared_metric_markers = ('pixel_stats', 'pca_metrics')
    for algo in ('kmeans', 'agglomerative', 'gmm'):
        file_suffix = f'_{algo}.png'

        print(f"\nRunning {algo} clustering...")
        pipeline.label_with_clustering(
            algorithm=algo,
            apply_pca=True,
            n_components=50,
            dbscan_eps=0.5,  # only used by DBSCAN; tune by experiment
            dbscan_min_samples=5,
        )

        # Example images per cluster
        pipeline.visualize_clusters(save_path=cluster_png.replace('.png', file_suffix))

        # 2-D embedding of a 20% sample
        pipeline.visualize_tsne(
            subset_ratio=0.2,
            apply_pca=True,
            n_components=50,
            save_path=tsne_png.replace('.png', file_suffix),
        )

        # Show this algorithm's metrics plus the algorithm-independent ones
        print(f"\nMetrics Summary for {algo}:")
        for key, value in pipeline.get_metrics_summary().items():
            if algo in key or any(marker in key for marker in shared_metric_markers):
                print(f"{key}: {value}")

    # Persist images + labels (labels come from the last algorithm run)
    pipeline.save_labeled_dataset(labeled_npz)

    # Read the archive back to confirm what was written
    with np.load(labeled_npz) as archive:
        print(f"Saved data: images shape {archive['images'].shape}, labels shape {archive['labels'].shape}")

# Execute the clustering pipeline when this cell runs.
processImages()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os

class ImageLabelModifier:
    """
    Manually relabel a clustered MNIST-like dataset stored as a .npz file.

    The archive is expected to hold an ``images`` array and a ``labels``
    array. Cluster IDs can be remapped to digit labels either in one call
    (``map_cluster_to_digit``) or interactively (``iterative_update_labels``).
    """

    def __init__(self, npz_path, random_state=42):
        """
        Initialize the label modifier.

        Args:
            npz_path (str): Path to the .npz file containing images and labels.
            random_state (int): Random seed for reproducibility in visualizations.
        """
        self.npz_path = npz_path
        self.random_state = random_state
        self.images = None
        self.labels = None
        self.n_clusters = 10  # Assuming 10 digit classes

    def load_npz_data(self):
        """
        Load images and labels from the .npz file.

        Returns:
            tuple: (images, labels) as numpy arrays.

        Raises:
            ValueError: If the images are not (n_samples, 1, 28, 28), the
                labels are not one per image, or a label lies outside [0, 9].
        """
        with np.load(self.npz_path) as data:
            self.images = data['images']
            self.labels = data['labels']

        print(f"Loaded data: images shape {self.images.shape}, labels shape {self.labels.shape}")

        # Validate data
        if self.images.shape[1:] != (1, 28, 28):
            raise ValueError("Expected images with shape (n_samples, 1, 28, 28)")
        if self.labels.shape != (self.images.shape[0],):
            raise ValueError("Labels shape does not match images")
        if not np.all(np.isin(self.labels, np.arange(self.n_clusters))):
            raise ValueError("Labels contain values outside expected range [0, 9]")

        return self.images, self.labels

    def visualize_current_labels(self, samples_per_cluster=5, save_path=None):
        """
        Visualize a few images from each cluster to inspect current labels.

        One row is drawn per label value; labels with no images leave their
        row blank. Samples are drawn from a generator seeded with
        ``random_state``, so the global NumPy random state is left untouched.

        Args:
            samples_per_cluster (int): Number of images to show per cluster (>= 1).
            save_path (str, optional): Path to save the visualization plot.

        Raises:
            ValueError: If ``samples_per_cluster`` is less than 1.
        """
        if samples_per_cluster < 1:
            raise ValueError("samples_per_cluster must be at least 1")
        if self.labels is None:
            self.load_npz_data()

        rng = np.random.default_rng(self.random_state)
        # squeeze=False keeps axes 2-D even when samples_per_cluster == 1.
        fig, axes = plt.subplots(
            self.n_clusters,
            samples_per_cluster,
            figsize=(3 * samples_per_cluster, 2 * self.n_clusters),
            squeeze=False,
        )
        for ax in axes.ravel():
            ax.axis('off')  # also blanks panels that receive no image

        for cluster in range(self.n_clusters):
            # Get indices of images in this cluster
            cluster_indices = np.where(self.labels == cluster)[0]
            if cluster_indices.size == 0:
                continue
            # Select up to samples_per_cluster random images
            selected_indices = rng.choice(
                cluster_indices,
                size=min(samples_per_cluster, len(cluster_indices)),
                replace=False,
            )
            for i, idx in enumerate(selected_indices):
                axes[cluster, i].imshow(self.images[idx, 0], cmap='gray')
                if i == 0:
                    axes[cluster, i].set_title(f"Cluster {cluster}")

        plt.tight_layout()
        if save_path:
            plt.savefig(save_path)
            print(f"Current labels visualization saved to {save_path}")
        plt.show()

    def map_cluster_to_digit(self, cluster_to_digit_map):
        """
        Apply a manual mapping from cluster IDs to true digit labels.

        All replacements are evaluated against the labels as they were before
        the call, so swaps such as ``{0: 1, 1: 0}`` behave as expected.
        Clusters missing from the mapping keep their current label.

        Args:
            cluster_to_digit_map (dict): Mapping from cluster ID to digit (e.g., {0: 5, 1: 2, ...}).

        Returns:
            numpy.ndarray: Updated labels with shape (n_samples,).

        Raises:
            ValueError: If a cluster ID or digit is outside [0, 9], or two
                clusters are mapped to the same digit.
        """
        if self.labels is None:
            self.load_npz_data()

        # Validate mapping
        if not all(k in range(self.n_clusters) for k in cluster_to_digit_map):
            raise ValueError("Cluster IDs in mapping must be in range [0, 9]")
        if not all(v in range(10) for v in cluster_to_digit_map.values()):
            raise ValueError("Digit values in mapping must be in range [0, 9]")
        if len(set(cluster_to_digit_map.values())) != len(cluster_to_digit_map):
            raise ValueError("Mapping must assign unique digits to clusters")

        # Write into a copy while masking on the untouched original labels
        new_labels = np.copy(self.labels)
        for cluster, digit in cluster_to_digit_map.items():
            new_labels[self.labels == cluster] = digit

        self.labels = new_labels
        print("Applied cluster-to-digit mapping")
        return self.labels

    def get_user_mapping(self):
        """
        Prompt the user to input a cluster-to-digit mapping.

        Returns:
            dict: Mapping from cluster ID to digit, or None if the user skips
            or the input cannot be parsed into a complete, one-to-one mapping.
        """
        print("\nEnter cluster-to-digit mapping (e.g., '0:5, 1:2, ...') or press Enter to skip.")
        print("Format: 'cluster:digit' pairs separated by commas, covering all clusters 0-9.")
        user_input = input("Mapping: ").strip()

        if not user_input:
            print("No mapping provided, keeping current labels")
            return None

        try:
            # Parse input (e.g., "0:5, 1:2, 2:1, ..."); blank items such as a
            # trailing comma are ignored.
            mapping = {}
            for pair in user_input.split(','):
                pair = pair.strip()
                if not pair:
                    continue
                # int() and a wrong number of ':' parts both raise ValueError
                cluster, digit = map(int, pair.split(':'))
                if cluster not in range(self.n_clusters) or digit not in range(10):
                    raise ValueError("Invalid cluster or digit value")
                mapping[cluster] = digit

            # Validate completeness and uniqueness
            if len(mapping) != self.n_clusters:
                raise ValueError(f"Mapping must include all {self.n_clusters} clusters")
            if len(set(mapping.values())) != self.n_clusters:
                raise ValueError("Each cluster must map to a unique digit")

            return mapping
        except ValueError as e:
            print(f"Error parsing mapping: {e}. Please try again.")
            return None

    def iterative_update_labels(self, samples_per_cluster=5, vis_path_prefix='labels_visualization', output_dir='../datasets'):
        """
        Iteratively visualize clusters, prompt for mapping, and update labels.

        Each round saves a figure of the current labels, asks for a mapping,
        applies it if one was given, and then asks whether to go another round.

        Args:
            samples_per_cluster (int): Number of images to show per cluster.
            vis_path_prefix (str): Prefix for visualization save paths.
            output_dir (str): Directory the visualization files are written to.

        Returns:
            numpy.ndarray: Final updated labels.
        """
        if self.labels is None:
            self.load_npz_data()

        iteration = 0
        while True:
            # Visualize current labels
            vis_path = f"{output_dir}/{vis_path_prefix}_iter{iteration}.png"
            self.visualize_current_labels(samples_per_cluster=samples_per_cluster, save_path=vis_path)

            # Get user mapping
            mapping = self.get_user_mapping()
            if mapping:
                self.map_cluster_to_digit(mapping)
            else:
                print("No changes made to labels")

            # Anything other than an explicit yes ends the session
            response = input("\nDo you want to refine the mapping further? (yes/no): ").strip().lower()
            if response not in ('yes', 'y'):
                break

            iteration += 1

        # Final visualization
        final_vis_path = f"{output_dir}/{vis_path_prefix}_final.png"
        self.visualize_current_labels(samples_per_cluster=samples_per_cluster, save_path=final_vis_path)
        print("Final labels visualization completed")
        return self.labels

    def save_updated_dataset(self, output_path):
        """
        Save the images and updated labels to a new .npz file.

        Args:
            output_path (str): Path to save the updated .npz file.

        Raises:
            ValueError: If no data has been loaded.
        """
        if self.images is None or self.labels is None:
            raise ValueError("Load data and modify labels first")

        np.savez(output_path, images=self.images, labels=self.labels)
        print(f"Updated dataset saved to {output_path}")





In [None]:


# def processImages():
#     data_path = '../datasets/unlabelled_train_data_images.npy'
#     output_path = '../datasets/labeled_train_data.npz'
#     cluster_vis_path = '../datasets/cluster_visualization.png'
#     tsne_vis_path = '../datasets/tsne_visualization.png'
    
#     preprocessor = MNISTPreprocessor(data_path, n_clusters=10, random_state=42)
    
#     # Load and preprocess
#     preprocessor.load_data()
#     tensor_images = preprocessor.preprocess_images()
    
#     # Label using K-Means with PCA
#     labels = preprocessor.label_with_kmeans(apply_pca=True, n_components=50)
    
#     # Visualize clusters
#     preprocessor.visualize_clusters(save_path=cluster_vis_path)
    
#     # Visualize t-SNE on 20% of the data
#     tsne_embeddings = preprocessor.visualize_tsne(subset_ratio=0.2, apply_pca=True, n_components=50, save_path=tsne_vis_path)
    
#     # Save the labeled dataset
#     preprocessor.save_labeled_dataset(output_path)
    
#     # Print metrics summary
#     metrics_summary = preprocessor.get_metrics_summary()
#     print("\nMetrics Summary:")
#     for key, value in metrics_summary.items():
#         print(f"{key}: {value}")
    
#     # Verify saved data
#     with np.load(output_path) as data:
#         saved_images = data['images']
#         saved_labels = data['labels']
#         print(f"Saved data: images shape {saved_images.shape}, labels shape {saved_labels.shape}")

# processImages()

In [None]:

def postProcessImages():
    """
    Relabel the clustered dataset in one shot using a hard-coded mapping.

    Loads the labeled .npz file, plots the current labels, applies the
    cluster-to-digit mapping below, plots the result, saves the updated
    archive and reads it back to report the stored shapes.

    NOTE(review): a second ``postProcessImages`` is defined further down in
    this cell and replaces this one, so this version is never the one called.
    NOTE(review): the mapping sends two clusters to digit 1 and two to digit 0;
    ``map_cluster_to_digit`` raises ValueError for non-unique digits, so this
    function cannot complete with the mapping as written.
    """
    source_npz = '../datasets/labeled_train_data.npz'
    target_npz = '../datasets/updated_labeled_train_data.npz'
    before_png = '../datasets/current_labels_visualization.png'

    modifier = ImageLabelModifier(source_npz, random_state=42)

    # Read images and labels from disk
    modifier.load_npz_data()

    # Plot the labels as they currently stand
    modifier.visualize_current_labels(samples_per_cluster=5, save_path=before_png)

    # Placeholder mapping; replace with the real one after inspecting the plot
    cluster_to_digit_map = {
        0: 1,  # cluster 0 -> digit 1
        1: 6,  # cluster 1 -> digit 6
        2: 2,  # cluster 2 -> digit 2
        3: 0,  # cluster 3 -> digit 0
        4: 3,  # cluster 4 -> digit 3
        5: 8,  # cluster 5 -> digit 8
        6: 9,  # cluster 6 -> digit 9
        7: 4,  # cluster 7 -> digit 4
        8: 1,  # cluster 8 -> digit 1 (duplicate of cluster 0)
        9: 0,  # cluster 9 -> digit 0 (duplicate of cluster 3)
    }

    modifier.map_cluster_to_digit(cluster_to_digit_map)

    # Optional: Manually correct a few samples
    # updated_labels = modifier.manually_correct_samples(n_samples=10)

    # Plot again so the effect of the mapping can be checked
    after_png = before_png.replace('current', 'updated')
    modifier.visualize_current_labels(samples_per_cluster=5, save_path=after_png)

    modifier.save_updated_dataset(target_npz)

    # Read the archive back to confirm what was written
    with np.load(target_npz) as archive:
        print(f"Saved data: images shape {archive['images'].shape}, labels shape {archive['labels'].shape}")


def postProcessImages():
    """
    Relabel the clustered dataset interactively and save the result.

    Runs the modifier's visualize / prompt / apply loop on the labeled .npz
    file, writes the updated archive, then reads it back to report the stored
    array shapes. Requires console input.
    """
    source_npz = '../datasets/labeled_train_data.npz'
    target_npz = '../datasets/updated_labeled_train_data.npz'

    modifier = ImageLabelModifier(source_npz, random_state=42)

    # Interactive loop: the labels end up on the modifier itself
    modifier.iterative_update_labels(samples_per_cluster=5, vis_path_prefix='labels_visualization')

    modifier.save_updated_dataset(target_npz)

    # Read the archive back to confirm what was written
    with np.load(target_npz) as archive:
        print(f"Saved data: images shape {archive['images'].shape}, labels shape {archive['labels'].shape}")

# Start the relabelling session. Two functions with this name are defined in
# this cell; the call resolves to the later (interactive) definition.
postProcessImages()