In [3]:

from google.colab import drive
drive.mount("/FAISS/")

Mounted at /FAISS/


In [2]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m63.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [4]:

import os
import shutil
import numpy as np
import faiss
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array, load_img

# Load the VGG16 model without the classification layer
model = VGG16(weights='imagenet', include_top=False)

def extract_features(image_path, model):
    """Extract features for an image using a pretrained model."""
    img = load_img(image_path, target_size=(224, 224))
    img_array = img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    img_array = preprocess_input(img_array)
    features = model.predict(img_array)
    return features.flatten()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m58889256/58889256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [5]:


def store_images_by_similarity(image_folder, output_folder, cluster_threshold=0.5):
    """
    Store images in subfolders based on similarity using FAISS.

    Args:
    - image_folder (str): Path to the folder containing images.
    - output_folder (str): Path to the folder where results will be stored.
    - cluster_threshold (float): Distance threshold for clustering similar images.
    """
    # Ensure the output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Extract features for all images
    image_paths = []
    feature_vectors = []
    for file_name in os.listdir(image_folder):
        file_path = os.path.join(image_folder, file_name)
        if os.path.isfile(file_path):
            try:
                features = extract_features(file_path, model)
                feature_vectors.append(features)
                image_paths.append(file_path)
            except Exception as e:
                print(f"Error processing {file_name}: {e}")

    # Convert feature vectors to a numpy array
    feature_vectors = np.array(feature_vectors, dtype='float32')

    # Create a FAISS index for L2 distance (Euclidean)
    dimension = feature_vectors.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(feature_vectors)

    # Group images based on similarity
    visited = set()
    cluster_id = 0

    for i in range(len(feature_vectors)):
        if i in visited:
            continue

        # Search for similar images
        distances, indices = index.search(feature_vectors[i].reshape(1, -1), len(feature_vectors))
        similar_images = [idx for idx, dist in zip(indices[0], distances[0]) if dist < cluster_threshold]

        # Mark these images as visited
        visited.update(similar_images)

        # Create a subfolder for the current cluster
        cluster_folder = os.path.join(output_folder, f"cluster_{cluster_id}")
        if not os.path.exists(cluster_folder):
            os.makedirs(cluster_folder)

        # Copy similar images to the cluster folder
        for idx in similar_images:
            src_path = image_paths[idx]
            dst_path = os.path.join(cluster_folder, os.path.basename(src_path))
            shutil.copyfile(src_path, dst_path)

        print(f"Cluster {cluster_id}: {len(similar_images)} images")
        cluster_id += 1




In [7]:
# Example usage
image_folder = "/FAISS/MyDrive/Find_similarity/ascending_cluster/cluster_0"  # Folder containing images
output_folder = "/FAISS/MyDrive/Find_similarity/faiss"  # Folder to save clustered images
cluster_threshold = 200.0  # Adjust this based on your dataset and use case

store_images_by_similarity(image_folder, output_folder, cluster_threshold)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 557ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 647ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 986ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 972ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 963ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 546ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 562ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 532ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 563ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 529ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 572ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s