In [15]:
import os
from PIL import Image
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import random
import shutil
import imagehash

In [2]:
# Main process
directory = '/mnt/nis_lab_research/data/class_data/neg/far_shah_b1-b5_b8_train_neg_cln/Accept Button'

In [20]:
# Load images
def load_images(directory):
    images = []
    filenames = []
    for filename in os.listdir(directory):
        if filename.endswith('.png'):  # Assuming images are in JPG format
            img = Image.open(os.path.join(directory, filename))
            img = img.resize((224, 224))  # Resize images
            img = img.convert('RGB')  # Convert to RGB
            images.append(img)
            filenames.append(os.path.join(directory, filename))
    return images, filenames

In [4]:
# Compute average cosine similarity with random images
def compute_average_similarity(images, num_random_images=5):
    features = []
    for i, image in enumerate(images):
        similarities = []
        for _ in range(num_random_images):
            random_index = random.randint(0, len(images) - 1)
            while random_index == i:  # Ensure the random image is not the same as the current image
                random_index = random.randint(0, len(images) - 1)
            similarity = cosine_similarity(image.reshape(1, -1), images[random_index].reshape(1, -1))[0][0]
            similarities.append(similarity)
        average_similarity = np.mean(similarities)
        features.append([average_similarity])  # Append the average similarity as a feature
    return np.array(features)

In [12]:
# Compute average pHash similarity with random images
def compute_average_phash_similarity(images, num_random_images=5):
    features = []
    for i, image in enumerate(images):
        similarities = []
        hash1 = imagehash.phash(image)
        for _ in range(num_random_images):
            random_index = random.randint(0, len(images) - 1)
            while random_index == i:  # Ensure the random image is not the same as the current image
                random_index = random.randint(0, len(images) - 1)
            hash2 = imagehash.phash(images[random_index])
            similarity = 1 - (hash1 - hash2) / len(hash1.hash) ** 2  # Normalize Hamming distance
            similarities.append(similarity)
        average_similarity = np.mean(similarities)
        features.append([average_similarity])  # Append the average similarity as a feature
    return np.array(features)

In [21]:
images, filenames = load_images(directory)

In [22]:
# Extract features based on average cosine similarity
features = compute_average_phash_similarity(images)

In [23]:
# Apply K-means clustering
kmeans = KMeans(n_clusters=2)
clusters = kmeans.fit_predict(features)

In [24]:
# Create a directory for clusters
cluster_directory = 'clusters'
if not os.path.exists(cluster_directory):
    os.makedirs(cluster_directory)

In [25]:
# Create directories for clusters and copy images
for i in range(2):  # Assuming 2 clusters
    cluster_dir = os.path.join(cluster_directory, f'cluster_{i}')
    os.makedirs(cluster_dir, exist_ok=True)
    
    cluster_indices = np.where(clusters == i)[0]
    for index in cluster_indices:
        image_path = filenames[index]
        shutil.copy(image_path, cluster_dir)
        print(f'Copied {image_path} to {cluster_dir}')

Copied /mnt/nis_lab_research/data/class_data/neg/far_shah_b1-b5_b8_train_neg_cln/Accept Button/SJWFJ3ohKzVg5mwC-kaskus_ss-128349.png to clusters/cluster_0
Copied /mnt/nis_lab_research/data/class_data/neg/far_shah_b1-b5_b8_train_neg_cln/Accept Button/B5z1A4pK0uX6L5js-microwebapp_ss-30962.png to clusters/cluster_0
Copied /mnt/nis_lab_research/data/class_data/neg/far_shah_b1-b5_b8_train_neg_cln/Accept Button/PGHHq5dguh8h8Vtx-nsone_ss-12630.png to clusters/cluster_0
Copied /mnt/nis_lab_research/data/class_data/neg/far_shah_b1-b5_b8_train_neg_cln/Accept Button/vrQ8KjP2msAeO0zp-akipharma_ss-108593.png to clusters/cluster_0
Copied /mnt/nis_lab_research/data/class_data/neg/far_shah_b1-b5_b8_train_neg_cln/Accept Button/NpUCqckNtCVxSjWQ-emaze_ss-7550.png to clusters/cluster_0
Copied /mnt/nis_lab_research/data/class_data/neg/far_shah_b1-b5_b8_train_neg_cln/Accept Button/ceVOTZ0RRVRteAg5-gwdocs_ss-64504.png to clusters/cluster_0
Copied /mnt/nis_lab_research/data/class_data/neg/far_shah_b1-b5_b8_tr