In [3]:
pip install opencv-python numpy matplotlib scikit-image scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [3]:
# Import necessary libraries
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import json
from skimage.segmentation import slic
from skimage.color import rgb2lab
from skimage.measure import regionprops
from tqdm import tqdm  # For progress bar
# Define dataset path
dataset_path = r"C:\Users\Admin\Downloads\dataverse_files\ZT111_4_A\ZT111_4_A"
output_folder = os.path.join(dataset_path, "processed_data")
# Create output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
# Check if the path exists
if not os.path.exists(dataset_path):
    print(f"Error: The path '{dataset_path}' does not exist.")
else:
    print(f"Path exists: {dataset_path}")
    # List available files
    all_files = os.listdir(dataset_path)
    print("Files in dataset folder:", all_files)
    # Filter only image files
    image_files = [f for f in all_files if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    if not image_files:
        print("No image files found in the directory!")
    else:
        dataset_info = {}
        # Loop through each image
        for image_file in tqdm(image_files, desc="Processing Images"):
            image_path = os.path.join(dataset_path, image_file)
            # Load and convert image
            image = cv2.imread(image_path)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            # Generate superpixels using SLIC algorithm
            num_segments = 1000  # You can adjust this number
            segments = slic(image, n_segments=num_segments, compactness=30, start_label=1)
            # Store information for each superpixel
            superpixel_info = {
                "image_name": image_file,
                "superpixels": []
            }
            # Create a copy for the superpixel image
            superpixel_image = np.zeros_like(image)
            for region in regionprops(segments, intensity_image=rgb2lab(image)):
                centroid = region.centroid
                coords = region.coords  # Pixel coordinates in superpixel
                # Extract pixel values
                pixel_values = image[coords[:, 0], coords[:, 1]]
                # Calculate color statistics
                mean_color = np.mean(pixel_values, axis=0).astype(int).tolist()
                std_color = np.std(pixel_values, axis=0).tolist()
                cov_color = np.cov(pixel_values.T).tolist()
                # Fill superpixel region with mean color
                for coord in coords:
                    superpixel_image[coord[0], coord[1]] = mean_color
                superpixel_info["superpixels"].append({
                    "centroid": [int(centroid[0]), int(centroid[1])],
                    "mean_color": mean_color,
                    "std_color": std_color,
                    "covariance_color": cov_color
                })
            # Add label map (superpixel segmentation mask)
            superpixel_info["label_map"] = segments.tolist()
            # Save the superpixel information into a JSON file
            json_filename = os.path.splitext(image_file)[0] + "_superpixel_info.json"
            json_output_path = os.path.join(output_folder, json_filename)
            with open(json_output_path, 'w') as json_file:
                json.dump(superpixel_info, json_file, indent=4)
            # Save the superpixel image (each superpixel filled with mean color)
            superpixel_image_path = os.path.join(output_folder, os.path.splitext(image_file)[0] + "_superpixel.png")
            plt.imsave(superpixel_image_path, superpixel_image)
        print(f"Superpixel images and data saved to {output_folder}")

Path exists: C:\Users\Admin\Downloads\dataverse_files\ZT111_4_A\ZT111_4_A
Files in dataset folder: ['clustered_data', 'processed_data', 'ZT111_4_A_1_12.jpg', 'ZT111_4_A_1_13.jpg', 'ZT111_4_A_1_14.jpg', 'ZT111_4_A_1_2.jpg', 'ZT111_4_A_1_5.jpg', 'ZT111_4_A_1_6.jpg', 'ZT111_4_A_1_7.jpg', 'ZT111_4_A_1_8.jpg', 'ZT111_4_A_1_9.jpg', 'ZT111_4_A_2_1.jpg', 'ZT111_4_A_2_10.jpg', 'ZT111_4_A_2_13.jpg', 'ZT111_4_A_2_14.jpg', 'ZT111_4_A_2_2.jpg', 'ZT111_4_A_2_8.jpg', 'ZT111_4_A_3_1.jpg', 'ZT111_4_A_3_10.jpg', 'ZT111_4_A_3_11.jpg', 'ZT111_4_A_3_12.jpg', 'ZT111_4_A_3_13.jpg', 'ZT111_4_A_3_2.jpg', 'ZT111_4_A_3_4.jpg', 'ZT111_4_A_3_5.jpg', 'ZT111_4_A_3_7.jpg', 'ZT111_4_A_3_8.jpg', 'ZT111_4_A_3_9.jpg', 'ZT111_4_A_4_11.jpg', 'ZT111_4_A_4_13.jpg', 'ZT111_4_A_4_14.jpg', 'ZT111_4_A_4_2.jpg', 'ZT111_4_A_4_4.jpg', 'ZT111_4_A_4_5.jpg', 'ZT111_4_A_4_6.jpg', 'ZT111_4_A_4_8.jpg', 'ZT111_4_A_4_9.jpg', 'ZT111_4_A_5_10.jpg', 'ZT111_4_A_5_11.jpg', 'ZT111_4_A_5_12.jpg', 'ZT111_4_A_5_13.jpg', 'ZT111_4_A_5_14.jpg', 'ZT111

Processing Images: 100%|██████████| 77/77 [2:06:54<00:00, 98.89s/it]  

Superpixel images and data saved to C:\Users\Admin\Downloads\dataverse_files\ZT111_4_A\ZT111_4_A\processed_data





In [7]:
pip install opencv-python scikit-image scikit-learn matplotlib tqdm





In [None]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import MiniBatchKMeans, MeanShift, estimate_bandwidth
from sklearn.ensemble import RandomForestClassifier
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from tqdm import tqdm  # Progress bar
 
# Define dataset paths
dataset_path = r"C:\Users\Admin\Downloads\dataverse_files\ZT111_4_A\ZT111_4_A"
processed_folder = os.path.join(dataset_path, "processed_data")
clustered_folder = os.path.join(dataset_path, "clustered_data")
 
# Create output folder if it doesn't exist
os.makedirs(clustered_folder, exist_ok=True)
 
# List available images
all_files = os.listdir(processed_folder)
superpixel_images = [f for f in all_files if f.endswith("_superpixel.png")]
 
if not superpixel_images:
    print("No superpixel images found in the directory!")
    exit()
 
print(f"Found {len(superpixel_images)} superpixel images. Starting clustering...")
 
# Parameters
N_CLUSTERS = 3         # Number of clusters for KMeans & GMM
N_TREES = 50           # Random Forest trees
TEST_SIZE = 0.005      # Use 0.5% pixels for RF training
BATCH_SIZE = 5000      # MiniBatchKMeans batch size
 
# Process each image
for image_file in tqdm(superpixel_images, desc="Clustering Images"):
    image_path = os.path.join(processed_folder, image_file)
 
    try:
        # Load image with reduced resolution (2x smaller)
        image = cv2.imread(image_path, cv2.IMREAD_REDUCED_COLOR_2)
 
        if image is None:
            print(f"Skipping {image_file} (could not load).")
            continue
 
        # Convert to float and normalize
        pixels = image.reshape(-1, 3).astype(np.float32) / 255.0
 
        # --- Step 1: Apply MiniBatch KMeans ---
        kmeans = MiniBatchKMeans(n_clusters=N_CLUSTERS, batch_size=BATCH_SIZE, random_state=42)
        kmeans_labels = kmeans.fit_predict(pixels)
        clustered_kmeans = kmeans_labels.reshape(image.shape[:2])
 
        # --- Step 2: Train Random Forest on KMeans labels ---
        X_train, _, y_train, _ = train_test_split(pixels, kmeans_labels, test_size=TEST_SIZE, random_state=42)
        rf = RandomForestClassifier(n_estimators=N_TREES, random_state=42, n_jobs=-1)
        rf.fit(X_train, y_train)
        rf_labels = rf.predict(pixels)
        clustered_rf = rf_labels.reshape(image.shape[:2])
 
        # --- Step 3: Apply Gaussian Mixture Model (GMM) ---
        gmm = GaussianMixture(n_components=N_CLUSTERS, covariance_type="full", random_state=42)
        gmm_labels = gmm.fit_predict(pixels)
        clustered_gmm = gmm_labels.reshape(image.shape[:2])
 
        # --- Step 4: Apply Mean Shift ---
        print(f"Applying Mean Shift to {image_file}...")
 
        # Estimate bandwidth automatically (adaptive clustering)
        bandwidth = estimate_bandwidth(pixels, quantile=0.1, n_samples=5000)
        if bandwidth is None or bandwidth <= 0:
            print(f"Skipping {image_file}: Bandwidth estimation failed.")
            continue
 
        # Mean Shift clustering
        mean_shift = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=-1)
        mean_shift_labels = mean_shift.fit_predict(pixels)
        clustered_meanshift = mean_shift_labels.reshape(image.shape[:2])
 
        # --- Step 5: Save Clustered Images ---
        output_kmeans = os.path.join(clustered_folder, f"clustered_kmeans_{image_file}")
        output_rf = os.path.join(clustered_folder, f"clustered_rf_{image_file}")
        output_gmm = os.path.join(clustered_folder, f"clustered_gmm_{image_file}")
        output_meanshift = os.path.join(clustered_folder, f"clustered_meanshift_{image_file}")
 
        plt.imsave(output_kmeans, clustered_kmeans, cmap='viridis')
        plt.imsave(output_rf, clustered_rf, cmap='viridis')
        plt.imsave(output_gmm, clustered_gmm, cmap='viridis')
        plt.imsave(output_meanshift, clustered_meanshift, cmap='viridis')
 
    except Exception as e:
        print(f"Error processing {image_file}: {e}")
 
print(f"✅ Clustering completed! Results saved in {clustered_folder}")

Found 77 superpixel images. Starting clustering...


Clustering Images:   0%|                                                                                                                        | 0/77 [00:00<?, ?it/s]

Applying Mean Shift to ZT111_4_A_1_12_superpixel.png...


Clustering Images:   1%|█▍                                                                                                           | 1/77 [03:21<4:14:41, 201.07s/it]

Applying Mean Shift to ZT111_4_A_1_13_superpixel.png...


Clustering Images:   3%|██▊                                                                                                          | 2/77 [07:09<4:31:17, 217.03s/it]

Applying Mean Shift to ZT111_4_A_1_14_superpixel.png...


Clustering Images:   4%|████▏                                                                                                        | 3/77 [10:32<4:19:51, 210.70s/it]

Applying Mean Shift to ZT111_4_A_1_2_superpixel.png...


Clustering Images:   5%|█████▋                                                                                                       | 4/77 [13:46<4:08:32, 204.29s/it]

Applying Mean Shift to ZT111_4_A_1_5_superpixel.png...


Clustering Images:   6%|███████                                                                                                      | 5/77 [17:21<4:09:48, 208.17s/it]

Applying Mean Shift to ZT111_4_A_1_6_superpixel.png...


Clustering Images:   8%|████████▍                                                                                                    | 6/77 [20:24<3:56:11, 199.60s/it]

Applying Mean Shift to ZT111_4_A_1_7_superpixel.png...


Clustering Images:   9%|█████████▉                                                                                                   | 7/77 [23:57<3:57:52, 203.89s/it]

In [1]:
import pandas as pd
import os
import numpy as np
import cv2
 
# Define the clustered image directory
clustered_folder = r"C:\Users\Admin\Downloads\dataverse_files\ZT111_4_A\ZT111_4_A\clustered_data"
 
# Ensure the directory exists before proceeding
if not os.path.exists(clustered_folder):
    print(f"❌ Directory not found: {clustered_folder}")
else:
    # List all clustered images
    all_files = os.listdir(clustered_folder)
    unique_images = set(f.split("_", 2)[-1] for f in all_files if f.endswith(".png"))  # Extract unique base image names
 
    # Prepare results storage
    results = []
 
    # Iterate over each unique image and compute standard deviation for clustering methods
    for image_name in unique_images:
        image_paths = {
            "KMeans": os.path.join(clustered_folder, f"clustered_kmeans_{image_name}"),
            "RandomForest": os.path.join(clustered_folder, f"clustered_rf_{image_name}"),
            "GMM": os.path.join(clustered_folder, f"clustered_gmm_{image_name}"),
            "MeanShift": os.path.join(clustered_folder, f"clustered_meanshift_{image_name}")  # Updated Mean Shift path
        }
 
        std_values = {}
 
        for method, path in image_paths.items():
            if os.path.exists(path):
                clustered_image = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
                if clustered_image is not None:
                    std_values[method] = np.std(clustered_image)
                else:
                    std_values[method] = None
            else:
                std_values[method] = None
 
        # Find the best clustering method for this image
        valid_methods = {k: v for k, v in std_values.items() if v is not None}
 
        if valid_methods:
            best_method = min(valid_methods, key=valid_methods.get)
            best_std = valid_methods[best_method]
        else:
            best_method = None
            best_std = None
 
        # Store results
        results.append({
            "Image": image_name,
            "KMeans Std": std_values.get("KMeans"),
            "RandomForest Std": std_values.get("RandomForest"),
            "GMM Std": std_values.get("GMM"),
            "MeanShift Std": std_values.get("MeanShift"),
            "Best Method": best_method,
            "Best Std Dev": best_std
        })
 
    # Convert to DataFrame and display results
    df_results = pd.DataFrame(results)
    print(df_results)  # Print the DataFrame for console viewing
    df_results.to_csv("clustering_results.csv", index=False)  # Save results as CSV for easier access
 
print("✅ Clustering standard deviation analysis completed! Results saved in clustering_results.csv")

                            Image  KMeans Std  RandomForest Std    GMM Std  \
0    ZT111_4_A_7_6_superpixel.png   60.528326         60.528326  63.714327   
1    ZT111_4_A_5_7_superpixel.png   62.023179         62.023179  63.853873   
2    ZT111_4_A_1_5_superpixel.png   61.797490         61.797490  73.754928   
3    ZT111_4_A_3_1_superpixel.png   79.401160         79.401160  56.202391   
4    ZT111_4_A_6_8_superpixel.png   74.369709         74.369709  75.746186   
..                            ...         ...               ...        ...   
72   ZT111_4_A_3_8_superpixel.png   76.914896         76.914896  77.777664   
73   ZT111_4_A_4_6_superpixel.png   76.532258         76.532258  74.663435   
74   ZT111_4_A_1_8_superpixel.png   81.746193         81.746193  67.150645   
75  ZT111_4_A_5_11_superpixel.png   74.026812         74.026812  75.221828   
76   ZT111_4_A_7_5_superpixel.png   81.971927         81.971927  68.311263   

    MeanShift Std Best Method  Best Std Dev  
0       44.425900