In [1]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from scipy.stats import mode
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import umap.umap_ as umap

import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): a blanket "ignore" presumably also hides scikit-learn
# convergence warnings from the LogisticRegression fits further down —
# consider narrowing the filter to specific categories; verify.
warnings.filterwarnings("ignore")

In [2]:
# Load the input array from a path relative to the current working directory.
# NOTE(review): the file name suggests unlabelled training images; the functions
# below treat axis 0 as the sample axis and flatten every other axis — confirm
# the stored array is laid out as (n_samples, ...).
X = np.load("datasets/unlabelled_train_data_images.npy")

In [3]:
def generate_pseudo_labels(X, max_iterations=5, n_clusters=10, n_components=50,
                           ch_scale=1000.0, random_state=42):
    """Return the best of several KMeans clusterings of PCA-reduced data.

    The samples are flattened to one row each, standardised and projected
    with PCA. KMeans is then run ``max_iterations`` times (restart ``i`` is
    seeded with ``i``). Every clustering is scored and the labels of the
    highest-scoring restart are returned.

    The combined score of a restart is::

        0.5 * (silhouette + calinski_harabasz / ch_scale) + 0.5 * accuracy

    where ``accuracy`` is the held-out accuracy of a logistic regression
    trained to predict the cluster labels from the flattened, *unscaled*
    features.

    NOTE(review): the Calinski-Harabasz index has no upper bound, so dividing
    it by ``ch_scale`` only rescales it. With the default scale it can
    outweigh the silhouette and accuracy terms and effectively decide which
    restart wins — confirm this weighting is intended.

    Parameters
    ----------
    X : array-like
        Input samples; axis 0 indexes samples, all other axes are flattened.
    max_iterations : int, default 5
        Number of seeded KMeans restarts to evaluate. Must be at least 1.
    n_clusters : int, default 10
        Number of KMeans clusters, i.e. number of distinct pseudo-labels.
    n_components : default 50
        Forwarded unchanged to ``PCA(n_components=...)``.
    ch_scale : float, default 1000.0
        Divisor applied to the Calinski-Harabasz index before it is added to
        the silhouette score.
    random_state : int, default 42
        Seed for PCA and for the train/test partition used by the classifier.
        It does not affect the KMeans seeds.

    Returns
    -------
    numpy.ndarray
        Cluster label of every sample for the best-scoring restart.

    Raises
    ------
    ValueError
        If ``max_iterations`` is smaller than 1 (no clustering would be run).
    """
    # Fail early: with zero restarts the original silently returned None.
    if max_iterations < 1:
        raise ValueError(
            f"max_iterations must be at least 1, got {max_iterations!r}"
        )

    # Preprocess: one flat feature row per sample, standardised, then PCA.
    X = np.asarray(X)
    X_flat = X.reshape(X.shape[0], -1)
    X_scaled = StandardScaler().fit_transform(X_flat)
    # Seed PCA: with svd_solver='auto' scikit-learn may choose a randomized
    # solver, whose output otherwise differs from run to run.
    pca = PCA(n_components=n_components, random_state=random_state)
    X_pca = pca.fit_transform(X_scaled)

    # Report the variance actually retained instead of a hard-coded claim.
    explained = pca.explained_variance_ratio_.sum()
    print(
        f"PCA reduced to {pca.n_components_} components, "
        f"preserving {explained:.1%} of the variance."
    )

    # The train/test partition depends only on the sample count and the seed,
    # so draw it once instead of re-splitting the full array on every restart.
    train_idx, test_idx = train_test_split(
        np.arange(X_flat.shape[0]), test_size=0.2, random_state=random_state
    )
    X_train, X_test = X_flat[train_idx], X_flat[test_idx]

    best_score = -np.inf
    best_labels = None

    for iteration in range(max_iterations):
        print(f"\n--- Iteration {iteration+1} ---")

        # Step 1: Clustering (the restart index doubles as the KMeans seed)
        kmeans = KMeans(n_clusters=n_clusters, random_state=iteration)
        cluster_labels = kmeans.fit_predict(X_pca)

        # Step 2: Structure-based score, measured in the PCA space
        sil_score = silhouette_score(X_pca, cluster_labels)
        ch_score = calinski_harabasz_score(X_pca, cluster_labels)
        structure_score = sil_score + ch_score / ch_scale  # rescale CH

        # Step 3: How well a linear classifier can reproduce the pseudo-labels
        clf = LogisticRegression(max_iter=1000)
        clf.fit(X_train, cluster_labels[train_idx])
        clf_score = accuracy_score(cluster_labels[test_idx], clf.predict(X_test))

        # Step 4: Combined score
        final_score = 0.5 * structure_score + 0.5 * clf_score
        print(f"Silhouette: {sil_score:.4f}, CH: {ch_score:.1f}, Classifier Acc: {clf_score:.4f}, Combined: {final_score:.4f}")

        if final_score > best_score:
            best_score = final_score
            best_labels = cluster_labels.copy()

    print(f"\nBest Combined Score: {best_score:.4f}")
    return best_labels

In [9]:
def generate_psuedo_labels_umap_GMM(X, max_iterations=5, n_clusters=10,
                                    n_components=10, ch_scale=1000.0,
                                    random_state=42):
    """Return the best of several Gaussian-mixture clusterings of a UMAP embedding.

    The samples are flattened to one row each, standardised and embedded with
    UMAP (``n_neighbors=30``, ``min_dist=0.1``). A full-covariance
    ``GaussianMixture`` is then fitted ``max_iterations`` times (restart ``i``
    is seeded with ``i``). Every clustering is scored and the labels of the
    highest-scoring restart are returned.

    The combined score of a restart is::

        0.5 * (silhouette + calinski_harabasz / ch_scale) + 0.5 * accuracy

    where ``accuracy`` is the held-out accuracy of a logistic regression
    trained to predict the cluster labels from the flattened, *unscaled*
    features.

    NOTE(review): the Calinski-Harabasz index has no upper bound, so dividing
    it by ``ch_scale`` only rescales it. With the default scale it can
    outweigh the silhouette and accuracy terms and effectively decide which
    restart wins — confirm this weighting is intended.

    Parameters
    ----------
    X : array-like
        Input samples; axis 0 indexes samples, all other axes are flattened.
    max_iterations : int, default 5
        Number of seeded mixture fits to evaluate. Must be at least 1.
    n_clusters : int, default 10
        Number of mixture components, i.e. number of distinct pseudo-labels.
    n_components : int, default 10
        Dimensionality of the UMAP embedding.
    ch_scale : float, default 1000.0
        Divisor applied to the Calinski-Harabasz index before it is added to
        the silhouette score.
    random_state : int, default 42
        Seed for UMAP and for the train/test partition used by the
        classifier. It does not affect the mixture seeds.

    Returns
    -------
    numpy.ndarray
        Cluster label of every sample for the best-scoring restart.

    Raises
    ------
    ValueError
        If ``max_iterations`` is smaller than 1 (no clustering would be run).
    """
    # Fail early: with zero restarts the original silently returned None.
    if max_iterations < 1:
        raise ValueError(
            f"max_iterations must be at least 1, got {max_iterations!r}"
        )

    # Preprocess: one flat feature row per sample, standardised, then UMAP.
    X = np.asarray(X)
    X_flat = X.reshape(X.shape[0], -1)
    X_scaled = StandardScaler().fit_transform(X_flat)

    reducer = umap.UMAP(
        n_components=n_components,
        n_neighbors=30,
        min_dist=0.1,
        random_state=random_state,
    )
    X_umap = reducer.fit_transform(X_scaled)
    print(f"UMAP reduced to {X_umap.shape[1]} dimensions.")

    # The train/test partition depends only on the sample count and the seed,
    # so draw it once instead of re-splitting the full array on every restart.
    train_idx, test_idx = train_test_split(
        np.arange(X_flat.shape[0]), test_size=0.2, random_state=random_state
    )
    X_train, X_test = X_flat[train_idx], X_flat[test_idx]

    best_score = -np.inf
    best_labels = None

    for iteration in range(max_iterations):
        print(f"\n--- Iteration {iteration+1} ---")

        # Step 1: Clustering (the restart index doubles as the mixture seed)
        gmm = GaussianMixture(
            n_components=n_clusters,
            init_params='kmeans',
            max_iter=500,
            covariance_type='full',
            random_state=iteration,
        )
        cluster_labels = gmm.fit_predict(X_umap)

        # Step 2: Structure-based score, measured in the UMAP space
        sil_score = silhouette_score(X_umap, cluster_labels)
        ch_score = calinski_harabasz_score(X_umap, cluster_labels)
        structure_score = sil_score + ch_score / ch_scale  # rescale CH

        # Step 3: How well a linear classifier can reproduce the pseudo-labels
        clf = LogisticRegression(max_iter=1000)
        clf.fit(X_train, cluster_labels[train_idx])
        clf_score = accuracy_score(cluster_labels[test_idx], clf.predict(X_test))

        # Step 4: Combined Score
        final_score = 0.5 * structure_score + 0.5 * clf_score
        print(f"Silhouette: {sil_score:.4f}, CH: {ch_score:.1f}, Classifier Acc: {clf_score:.4f}, Combined: {final_score:.4f}")

        if final_score > best_score:
            best_score = final_score
            best_labels = cluster_labels.copy()

    print(f"\nBest Combined Score: {best_score:.4f}")
    return best_labels

In [7]:
# Pseudo-label X with the PCA + KMeans pipeline, keeping the best of 5 seeded restarts.
y = generate_pseudo_labels(X, max_iterations = 5)

PCA reduced to 50 components to preserve 90% variance.

--- Iteration 1 ---
Silhouette: 0.0741, CH: 2582.4, Classifier Acc: 0.9701, Combined: 1.8133

--- Iteration 2 ---
Silhouette: 0.0742, CH: 2582.2, Classifier Acc: 0.9701, Combined: 1.8133

--- Iteration 3 ---
Silhouette: 0.0732, CH: 2582.5, Classifier Acc: 0.9723, Combined: 1.8140

--- Iteration 4 ---
Silhouette: 0.0740, CH: 2582.4, Classifier Acc: 0.9708, Combined: 1.8137

--- Iteration 5 ---
Silhouette: 0.0739, CH: 2582.3, Classifier Acc: 0.9722, Combined: 1.8142

Best Combined Score: 1.8142


In [10]:
%%time
# Pseudo-label X with the UMAP + Gaussian-mixture pipeline, keeping the best of
# 10 seeded restarts; the result is stored separately from `y` above.
y1 = generate_psuedo_labels_umap_GMM(X, max_iterations=10)
print("Complete")

UMAP reduced to 10 dimensions.

--- Iteration 1 ---
Silhouette: 0.4668, CH: 62191.5, Classifier Acc: 0.9339, Combined: 31.7961

--- Iteration 2 ---
Silhouette: 0.4534, CH: 64107.1, Classifier Acc: 0.9281, Combined: 32.7443

--- Iteration 3 ---
Silhouette: 0.4593, CH: 66695.5, Classifier Acc: 0.9367, Combined: 34.0458

--- Iteration 4 ---
Silhouette: 0.4833, CH: 61064.9, Classifier Acc: 0.9347, Combined: 31.2415

--- Iteration 5 ---
Silhouette: 0.4668, CH: 62181.4, Classifier Acc: 0.9337, Combined: 31.7909

--- Iteration 6 ---
Silhouette: 0.4619, CH: 64731.2, Classifier Acc: 0.9263, Combined: 33.0596

--- Iteration 7 ---
Silhouette: 0.4820, CH: 65901.4, Classifier Acc: 0.9358, Combined: 33.6596

--- Iteration 8 ---
Silhouette: 0.4830, CH: 61069.7, Classifier Acc: 0.9348, Combined: 31.2438

--- Iteration 9 ---
Silhouette: 0.4387, CH: 67217.4, Classifier Acc: 0.9286, Combined: 34.2923

--- Iteration 10 ---
Silhouette: 0.4387, CH: 67218.0, Classifier Acc: 0.9283, Combined: 34.2925

Best Co

236