In [None]:
import os
import pickle
import numpy as np
import tensorflow as tf
import cv2
from tensorflow.keras.applications import EfficientNetB7
from tensorflow.keras.applications.efficientnet import preprocess_input
from sklearn.cluster import DBSCAN
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

# Size every image is resized to (via cv2.resize) before it is fed to EfficientNetB7
IMG_SIZE = (600, 600)  # Adjust if needed
# Pickle file where extract_train_data() stores the (features, image_paths) tuple
FEATURES_PATH = "path/to/save/features.pkl" # Placeholder: replace with the path where the features should be saved

# Load Pretrained EfficientNetB7
def load_cnn_model():
    """Build the EfficientNetB7 network used as a feature extractor.

    The network is created with ImageNet weights, without its top layers
    (``include_top=False``) and with ``pooling="avg"``.

    Returns:
        The constructed EfficientNetB7 model.
    """
    return EfficientNetB7(
        weights="imagenet",
        include_top=False,
        pooling="avg",
    )

# Extract Features from an Image
def extract_features(img_path, model):
    """Compute a 1-D feature vector for a single image.

    The image is read with OpenCV, resized to ``IMG_SIZE``, converted from
    BGR to RGB, given a leading batch axis, run through the EfficientNet
    ``preprocess_input`` function and finally through ``model.predict``.

    Args:
        img_path: Path of the image file to read.
        model: Object exposing ``predict(batch, verbose=...)``, e.g. the model
            returned by ``load_cnn_model()``.

    Returns:
        The prediction flattened to a 1-D numpy array.

    Raises:
        ValueError: If OpenCV cannot read ``img_path``.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with the offending path rather than inside cv2.resize.
        raise ValueError(f"Could not read image file: {img_path}")

    img = cv2.resize(img, IMG_SIZE)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # OpenCV loads images as BGR

    batch = np.expand_dims(img, axis=0)  # add the batch axis expected by predict
    batch = preprocess_input(batch)  # EfficientNet preprocessing

    # verbose=0: avoid printing one progress bar per image when called in a loop
    features = model.predict(batch, verbose=0)
    return features.flatten()  # Convert to 1D vector

# Process All Images in a Folder
def process_images(folder_path, model):
    """Extract features for every image file directly inside ``folder_path``.

    Only files whose name ends in ``.png``, ``.jpg`` or ``.jpeg`` (case
    insensitive) are considered. Files are visited in sorted name order:
    ``os.listdir`` returns entries in arbitrary order, and downstream steps
    that pick the first image of a cluster would otherwise not be reproducible.

    Images that fail to process are reported with ``print`` and skipped, so
    one bad file does not abort the whole run.

    Args:
        folder_path: Directory to scan (not recursive).
        model: Model forwarded to ``extract_features``.

    Returns:
        A tuple ``(features, image_paths)`` where ``features`` is a numpy
        array built from the per-image feature vectors and ``image_paths`` is
        the list of corresponding file paths, in the same order.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    features = []
    image_paths = []

    for img_name in sorted(os.listdir(folder_path)):
        if not img_name.lower().endswith(image_extensions):
            continue

        img_path = os.path.join(folder_path, img_name)
        try:
            feat = extract_features(img_path, model)
        except Exception as e:
            # Deliberate best-effort: report the failure and keep going.
            print(f"Error processing {img_name}: {e}")
            continue

        features.append(feat)
        image_paths.append(img_path)

    return np.array(features), image_paths

# Extract Features for Training Images
def extract_train_data(folder_path):
    """Extract features for all images in ``folder_path`` and pickle them.

    Loads the CNN via ``load_cnn_model()``, runs ``process_images`` on the
    folder and writes the ``(features, image_paths)`` tuple to
    ``FEATURES_PATH``.

    Args:
        folder_path: Directory containing the images to process.

    Returns:
        The ``(features, image_paths)`` tuple returned by ``process_images``.
    """
    # Create the output directory before the (slow) extraction so the run
    # cannot be lost to a missing-directory error at save time.
    output_dir = os.path.dirname(FEATURES_PATH)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    model = load_cnn_model()
    print("Extracting features...")
    features, image_paths = process_images(folder_path, model)

    # Save extracted features for future use
    with open(FEATURES_PATH, "wb") as f:
        pickle.dump((features, image_paths), f)

    # Report the real destination rather than a hard-coded file name.
    print(f"Feature extraction complete. Saved to '{FEATURES_PATH}'.")
    return features, image_paths

In [None]:
# Folder containing the images to analyse; only .png/.jpg/.jpeg files in it are processed
folder_path = "path/for/folder/containing/images"  # Replace with your actual image folder path

# Extract features from the images in the folder; the result is also pickled to FEATURES_PATH
features, image_paths = extract_train_data(folder_path)

In [None]:
# Load extracted features from the file extract_train_data() wrote, so this
# cell always reads what the extraction step saved.
# NOTE(security): pickle.load can execute arbitrary code -- only open pickle
# files you created yourself.
with open(FEATURES_PATH, "rb") as f:
    features, image_paths = pickle.load(f)

if len(image_paths) == 0:
    raise ValueError("No features were loaded; run feature extraction on a folder that contains images first.")

# Apply DBSCAN.
# eps=0.26 and min_samples=2 work well for the current building data and a
# sample size of 200. Users are encouraged to fine-tune these parameters for
# their own data and sample size.
dbscan = DBSCAN(eps=0.26, min_samples=2, metric="cosine")
cluster_labels = dbscan.fit_predict(features)

# Assign DBSCAN Clusters to Images (label -1 collects the noise points)
cluster_dict = {}
for cluster_id, img_path in zip(cluster_labels, image_paths):
    cluster_dict.setdefault(cluster_id, []).append(img_path)

# Display Cluster Information and Images
MAX_PREVIEW = 5  # number of images shown per cluster

print("\n--- Clustering Results ---")
for cluster_id, img_list in cluster_dict.items():
    print(f"\nCluster {cluster_id}: {len(img_list)} images")

    if cluster_id == -1:
        print("Noise points detected (not assigned to any cluster). Skipping visualization.")
        continue

    preview = img_list[:MAX_PREVIEW]
    # squeeze=False always returns a 2-D array of axes, so a single-image
    # cluster needs no special casing.
    fig, axes = plt.subplots(1, len(preview), figsize=(15, 5), squeeze=False)

    for ax, img_path in zip(axes[0], preview):
        ax.axis("off")
        ax.set_title(f"Cluster {cluster_id}")
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread returns None when the file is unreadable or has been
            # moved since extraction; leave the panel blank instead of crashing.
            print(f"Could not read {img_path}; skipping preview.")
            continue
        ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

    plt.show()
    plt.close(fig)  # release the figure; one is created per cluster

In [5]:
import os
import shutil
import pickle

# Define paths
FEATURES_PATH = "path/for/saved/features.pkl"
ORIGINAL_FOLDER = "path to save the original image"
DUPLICATE_FOLDER = "path to save the duplicate images"

# Clustering parameters. These match the values used in the clustering/preview
# cell; change them in both places if you tune them.
DBSCAN_EPS = 0.26
DBSCAN_MIN_SAMPLES = 2

# Load extracted features and image paths
# NOTE(security): pickle.load can execute arbitrary code -- only open pickle
# files you created yourself.
with open(FEATURES_PATH, "rb") as f:
    features, image_paths = pickle.load(f)

if len(image_paths) == 0:
    raise ValueError("No features were loaded; there are no images to sort.")

# Apply DBSCAN
from sklearn.cluster import DBSCAN

# Recompute the labels from the features loaded above. Reusing a
# `cluster_labels` variable left over from another cell would fail on a fresh
# kernel and could be misaligned with `image_paths` if a different file was loaded.
dbscan = DBSCAN(eps=DBSCAN_EPS, min_samples=DBSCAN_MIN_SAMPLES, metric="cosine")
cluster_labels = dbscan.fit_predict(features)

# Create cluster dictionary
cluster_dict = {}
for cluster_id, img_path in zip(cluster_labels, image_paths):
    cluster_dict.setdefault(cluster_id, []).append(img_path)

# Create directories
os.makedirs(ORIGINAL_FOLDER, exist_ok=True)
os.makedirs(DUPLICATE_FOLDER, exist_ok=True)


def move_image(src, dest_dir):
    """Move ``src`` into ``dest_dir`` under its own file name.

    Files that are no longer at their recorded path (for example because this
    cell already ran once) are reported and skipped instead of raising.
    """
    if not os.path.isfile(src):
        print(f"Skipping missing file: {src}")
        return
    shutil.move(src, os.path.join(dest_dir, os.path.basename(src)))


# Move images based on clustering
for cluster_id, img_list in cluster_dict.items():
    if cluster_id == -1:
        # Noise images are not duplicates of anything: keep them all.
        originals, duplicates = img_list, []
    else:
        # Clustered images: the first one is kept as the original,
        # the rest are treated as duplicates.
        originals, duplicates = img_list[:1], img_list[1:]

    for img in originals:
        move_image(img, ORIGINAL_FOLDER)
    for img in duplicates:
        move_image(img, DUPLICATE_FOLDER)

print("Image sorting complete!")

Image sorting complete!
