In [2]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras import layers, models
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Load the data
digits = np.load('unlabelled_train_data_images.npy')

# Preprocess the data
X = digits.reshape(digits.shape[0], -1)  # Flatten images to 1D arrays
X = X / 255.0  # Normalize pixel values to [0, 1]

# 1. PCA Embeddings
def create_pca_embeddings(X, n_components=50):
    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Apply PCA
    pca = PCA(n_components=n_components)
    embeddings = pca.fit_transform(X_scaled)
    
    print(f"PCA embedding shape: {embeddings.shape}")
    print(f"Explained variance ratio: {sum(pca.explained_variance_ratio_):.4f}")
    
    return embeddings, pca

pca_embeddings, pca_model = create_pca_embeddings(X)

# 2. Autoencoder Embeddings
def create_autoencoder_embeddings(X, encoding_dim=32):
    # Reshape for CNN
    input_shape = (28, 28, 1)
    X_reshaped = X.reshape(-1, 28, 28, 1)
    
    # Define the encoder
    input_img = tf.keras.Input(shape=input_shape)
    x = layers.Conv2D(32, (3, 3), activation='relu', padding='same')(input_img)
    x = layers.MaxPooling2D((2, 2), padding='same')(x)
    x = layers.Conv2D(16, (3, 3), activation='relu', padding='same')(x)
    x = layers.MaxPooling2D((2, 2), padding='same')(x)
    x = layers.Conv2D(8, (3, 3), activation='relu', padding='same')(x)
    x = layers.MaxPooling2D((2, 2), padding='same')(x)
    # Encoder output shape is (4, 4, 8) = 128 dimensions
    x = layers.Flatten()(x)
    encoded = layers.Dense(encoding_dim, activation='relu')(x)
    
    # Define the decoder
    x = layers.Dense(128, activation='relu')(encoded)
    x = layers.Reshape((4, 4, 8))(x)
    x = layers.Conv2D(8, (3, 3), activation='relu', padding='same')(x)
    x = layers.UpSampling2D((2, 2))(x)
    x = layers.Conv2D(16, (3, 3), activation='relu', padding='same')(x)
    x = layers.UpSampling2D((2, 2))(x)
    x = layers.Conv2D(32, (3, 3), activation='relu', padding='same')(x)
    x = layers.UpSampling2D((2, 2))(x)
    decoded = layers.Conv2D(1, (3, 3), activation='sigmoid', padding='same')(x)
    
    # Create models
    autoencoder = models.Model(input_img, decoded)
    encoder = models.Model(input_img, encoded)
    
    # Compile and train
    autoencoder.compile(optimizer='adam', loss='mse')
    
    # Train the autoencoder
    history = autoencoder.fit(X_reshaped, X_reshaped, 
                             epochs=10, 
                             batch_size=256, 
                             shuffle=True,
                             validation_split=0.1,
                             verbose=1)
    
    # Create embeddings using the encoder
    embeddings = encoder.predict(X_reshaped)
    
    print(f"Autoencoder embedding shape: {embeddings.shape}")
    
    # Plot training history
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Autoencoder Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper right')
    
    # Display sample reconstructions
    plt.subplot(1, 2, 2)
    n = 10
    reconstructed = autoencoder.predict(X_reshaped[:n])
    plt.figure(figsize=(20, 4))
    for i in range(n):
        # Original
        ax = plt.subplot(2, n, i+1)
        plt.imshow(X_reshaped[i].reshape(28, 28), cmap='gray')
        plt.axis('off')
        
        # Reconstructed
        ax = plt.subplot(2, n, i+n+1)
        plt.imshow(reconstructed[i].reshape(28, 28), cmap='gray')
        plt.axis('off')
    
    plt.tight_layout()
    plt.show()
    
    return embeddings, encoder

autoencoder_embeddings, encoder_model = create_autoencoder_embeddings(X)

# Attempt to cluster the embeddings
def cluster_and_visualize(embeddings, n_clusters=10):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(embeddings)
    
    # Visualize using t-SNE for dimensionality reduction
    from sklearn.manifold import TSNE
    
    tsne = TSNE(n_components=2, random_state=42)
    embeddings_2d = tsne.fit_transform(embeddings)
    
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=labels, cmap='tab10', alpha=0.6)
    plt.colorbar(scatter)
    plt.title('t-SNE visualization of embeddings with KMeans labels')
    plt.show()
    
    # Display some images from each cluster
    plt.figure(figsize=(15, 10))
    for i in range(n_clusters):
        cluster_indices = np.where(labels == i)[0][:5]  # Get first 5 images from cluster
        for j, idx in enumerate(cluster_indices):
            if j < 5:  # Show up to 5 images per cluster
                plt.subplot(n_clusters, 5, i*5 + j + 1)
                plt.imshow(digits[idx].squeeze(), cmap='gray')
                plt.axis('off')
                if j == 0:
                    plt.title(f'Cluster {i}')
    
    plt.tight_layout()
    plt.show()
    
    return labels

# Cluster and visualize using autoencoder embeddings
autoencoder_labels = cluster_and_visualize(autoencoder_embeddings)

# Save the labels
np.save('digit_labels_autoencoder.npy', autoencoder_labels)

# You can also try clustering the PCA embeddings
pca_labels = cluster_and_visualize(pca_embeddings)
np.save('digit_labels_pca.npy', pca_labels)

# Function to evaluate clustering quality (if you have ground truth labels)
def evaluate_clusters(true_labels, predicted_labels):
    from sklearn import metrics
    
    # Various cluster evaluation metrics
    ari = metrics.adjusted_rand_score(true_labels, predicted_labels)
    nmi = metrics.normalized_mutual_info_score(true_labels, predicted_labels)
    
    print(f"Adjusted Rand Index: {ari:.4f}")
    print(f"Normalized Mutual Information: {nmi:.4f}")

# Note: Since we don't have ground truth labels, you'll need to visually inspect the clusters
# to determine how well they match the expected digit classes

PCA embedding shape: (60000, 50)
Explained variance ratio: 0.5511
Epoch 1/10
Epoch 1/10


ValueError: Dimensions must be equal, but are 28 and 32 for '{{node compile_loss/mse/sub}} = Sub[T=DT_FLOAT](data_1, functional_2_1/conv2d_13_1/Sigmoid)' with input shapes: [?,28,28,1], [?,32,32,1].