In [None]:
import numpy as np
import pandas as pd
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
from collections import defaultdict

# Load the MNIST dataset    
    """
    Loads the MNIST dataset from a CSV file.

    Parameters:
    - fileName (str): The name of the CSV file.
    - labeled (bool): Whether the dataset includes labels.

    Returns:
    - data (numpy.ndarray): The dataset.
    - label (numpy.ndarray): The labels (if labeled is True).
    """

In [None]:
def loadData(fileName, labeled=True):
    """Load an MNIST-style CSV file into NumPy arrays.

    Parameters:
    - fileName (str): Path (or file-like object) handed to ``pandas.read_csv``.
    - labeled (bool): Whether the file contains a ``"label"`` column.

    Returns:
    - (data, label) when ``labeled`` is True: ``label`` holds the values of
      the ``"label"`` column and ``data`` holds every other column, in file
      order.
    - data alone when ``labeled`` is False: every column except the first.

    Raises:
    - KeyError: if ``labeled`` is True and there is no ``"label"`` column.
    """
    mnist_data = pd.read_csv(fileName)

    if not labeled:
        # NOTE(review): the first column is dropped even though there are no
        # labels; presumably it is an id/index column -- confirm against the
        # unlabeled file actually used.
        return np.array(mnist_data.iloc[:, 1:])

    label = np.array(mnist_data["label"])
    # Remove the label column by name rather than by position, so the labels
    # never leak into the features (and no pixel column is lost) when
    # "label" is not the first column of the file.
    data = np.array(mnist_data.drop(columns="label"))
    return data, label

# Perform PCA
    """
    Perform Principal Component Analysis (PCA) on the dataset.

    Parameters:
    - X (numpy.ndarray): The dataset.
    - num_components (int): The number of PCA components.

    Returns:
    - reduced_data (numpy.ndarray): The reduced dataset.
    """

In [None]:
def pca(X, num_components):
    """Project X onto its leading principal components.

    Parameters:
    - X (numpy.ndarray): Samples in rows, features in columns.
    - num_components (int): How many principal directions to keep.

    Returns:
    - numpy.ndarray: The mean-centered data expressed in the basis of the
      ``num_components`` covariance eigenvectors with the largest eigenvalues.
    """
    # Remove the per-feature mean so the projection is taken about the origin.
    shifted = X - np.mean(X, axis=0)

    # eigh handles the symmetric covariance matrix; the [::-1] below turns
    # argsort's ascending order into largest-eigenvalue-first.
    variances, directions = np.linalg.eigh(np.cov(shifted, rowvar=False))
    ranking = np.argsort(variances)[::-1]

    # Keep only the columns (eigenvectors) belonging to the leading eigenvalues.
    basis = directions[:, ranking[:num_components]]

    return np.dot(shifted, basis)

# Perform GMM clustering
    """
    Perform Gaussian Mixture Model (GMM) clustering on the dataset.

    Parameters:
    - data (numpy.ndarray): The dataset.
    - num_clusters (int): The number of clusters.

    Returns:
    - labels (numpy.ndarray): Cluster labels.
    """

In [None]:
def gmm_clustering(data, num_clusters):
    """Cluster the samples with a Gaussian Mixture Model.

    Parameters:
    - data (numpy.ndarray): Samples to fit the mixture on and then assign.
    - num_clusters (int): Number of mixture components.

    Returns:
    - numpy.ndarray: The component index predicted for each sample.
    """
    # random_state is pinned to 0 so repeated runs use the same seed.
    mixture = GaussianMixture(n_components=num_clusters, random_state=0)
    # fit() returns the fitted estimator, so the prediction can be chained.
    return mixture.fit(data).predict(data)

# Show clustered images in a grid
    """
    Display images in a grid.

    Parameters:
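    - digitsData (numpy.ndarray): Image data, one flattened 28x28 image per entry.
    - digitsLabel (numpy.ndarray, optional): Labels shown as the title of each image.
    - num_rows (int, optional): Number of rows in the grid; derived from the number of images when not given.
    - num_cols (int, optional): Number of columns in the grid; derived from the number of images when not given.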
    """

In [None]:
def showDigitInGrid(digitsData, digitsLabel=None, num_rows=None, num_cols=None):
    """Draw each image in its own cell of a matplotlib subplot grid.

    Parameters:
    - digitsData (numpy.ndarray): Images; each entry is reshaped to 28x28,
      so it must hold 784 values.
    - digitsLabel (numpy.ndarray, optional): Per-image labels used as subplot
      titles. When omitted the titles are empty.
    - num_rows (int, optional): Number of rows in the grid.
    - num_cols (int, optional): Number of columns in the grid.

    If both grid dimensions are omitted, a roughly square grid is chosen.
    If only one is given, the other is derived so that every image fits.

    Raises:
    - ValueError: if a grid dimension is smaller than 1, or the grid has
      fewer cells than there are images.
    """
    num_images = len(digitsData)

    # With no images there is nothing to lay out; skipping this block also
    # avoids a division by zero in the automatic layout.
    if num_images > 0:
        if num_rows is None and num_cols is None:
            num_rows = int(np.sqrt(num_images))
            num_cols = int(np.ceil(num_images / num_rows))
        elif num_rows is None:
            if num_cols < 1:
                raise ValueError("num_cols must be at least 1")
            # Respect the caller's column count and derive the rows.
            num_rows = int(np.ceil(num_images / num_cols))
        elif num_cols is None:
            if num_rows < 1:
                raise ValueError("num_rows must be at least 1")
            # Respect the caller's row count and derive the columns.
            num_cols = int(np.ceil(num_images / num_rows))

        if num_rows < 1 or num_cols < 1:
            raise ValueError("num_rows and num_cols must be at least 1")
        if num_rows * num_cols < num_images:
            # Fail before any drawing instead of part-way through the loop.
            raise ValueError(
                f"a {num_rows}x{num_cols} grid cannot hold {num_images} images"
            )

    plt.figure(figsize=(12, 8))
    for i, image in enumerate(digitsData):
        # Subplot positions are 1-based.
        plt.subplot(num_rows, num_cols, i + 1)
        plt.title(str(digitsLabel[i]) if digitsLabel is not None else '')
        plt.imshow(image.reshape(28, 28), interpolation='bicubic', cmap='Greys')
        # Hide the axis ticks; they carry no meaning for image cells.
        plt.xticks([])
        plt.yticks([])

    plt.tight_layout()

# Summary
## Usage

    Load the MNIST Dataset:
        The loadData function loads the MNIST dataset from a CSV file.

    Perform PCA:
        The pca function performs Principal Component Analysis on the dataset.
        It centers the data, calculates the covariance matrix, and selects the top 'num_components' eigenvectors.

    Perform GMM Clustering:
        The gmm_clustering function applies Gaussian Mixture Model clustering on the reduced dataset.

    Show Clustered Images:
        The showDigitInGrid function displays clustered images in a grid.
        It can be used to visualize the results of clustering.

    Main Function:
        The main function defines combinations of PCA components and GMM clusters.
        It calls the functions to perform PCA and clustering for each combination.

    Run the Script:
        Execute the script to perform PCA, GMM clustering, and image visualization.
        Images of clustered digits will be saved in files with appropriate names.

## Example Combinations

    PCA Components: [32, 64, 128]
    GMM Clusters: [10, 7, 4]

## Output

The script generates images that visualize the clustered digits for various combinations of PCA components and GMM clusters. Each image is saved with a filename indicating the specific combination.