In [3]:
!pip install -q opencv-python matplotlib


In [36]:
import os, cv2, csv
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from cv2 import GaussianBlur
from image_hclustering import clusterMatrixMain

In [37]:
# Generates the histogram of the Laplacian (edge) response of a single image
def plot_rgb_histogram(image_path):
    """Return the 256-bin histogram of an image's absolute Laplacian response.

    Despite its name, this function neither plots anything nor works on RGB
    channels: the image is loaded as grayscale, smoothed with a Gaussian
    blur, filtered with a Laplacian, converted to 8-bit absolute values, and
    the histogram of that result is returned.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The array produced by ``cv2.calcHist`` using 256 bins over the
        range [0, 256).

    Raises:
        ValueError: If the image cannot be read from ``image_path``.
    """
    ddepth = cv2.CV_16S  # signed 16-bit depth for the Laplacian output
    kernel_size = 3

    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)  # Load an image
    if image is None:
        # cv2.imread signals failure by returning None rather than raising,
        # so fail here with a message that names the offending file.
        raise ValueError(f"Could not read image: {image_path}")

    # Apply Gaussian Blur - reduce noise
    image = cv2.GaussianBlur(image, (kernel_size, kernel_size), 0)

    # Apply Laplace function
    dst = cv2.Laplacian(image, ddepth, ksize=kernel_size)

    # Converting back to uint8 (absolute value of the response)
    abs_dst = cv2.convertScaleAbs(dst)

    hist_gray = cv2.calcHist([abs_dst], [0], None, [256], [0, 256])
    return hist_gray

# Creating distance matrix between the histograms contained in the directory
def create_distance_matrix(histograms_list, image_files, folder_path):
    """Build the pairwise Bhattacharyya distance matrix of a set of histograms.

    Every pair of entries in ``histograms_list`` (including each entry with
    itself) is compared with ``cv2.compareHist`` using
    ``cv2.HISTCMP_BHATTACHARYYA``. The result is stored symmetrically in a
    DataFrame, and the pairs whose distance is strictly positive are also
    written to ``distances.csv`` inside ``folder_path`` (no header, no index).

    Args:
        histograms_list: Sequence of ``(image_name, histogram)`` tuples.
        image_files: Image names used as row and column labels of the matrix.
        folder_path: Directory where ``distances.csv`` is written.

    Returns:
        A float64 ``pandas.DataFrame`` indexed by ``image_files`` on both
        axes, holding the distance of each compared pair and 0 elsewhere.
    """
    distance_matrix = pd.DataFrame(0.0, index=image_files, columns=image_files, dtype='float64')
    pair_rows = []

    # Calculating distance between images histograms (upper triangle,
    # diagonal included; the value is mirrored to keep the matrix symmetric)
    for i, (image1, hist1) in enumerate(histograms_list):
        for image2, hist2 in histograms_list[i:]:
            # Distance = 0 means the two histograms are equal
            distance = cv2.compareHist(hist1, hist2, cv2.HISTCMP_BHATTACHARYYA)

            distance_matrix.loc[image1, image2] = distance
            distance_matrix.loc[image2, image1] = distance

            pair_rows.append([image1, image2, distance])

    df = pd.DataFrame(pair_rows, columns=["Image1", "Image2", "Distance"])

    # Removing 0.0 distance rows (self-comparisons and identical histograms)
    df = df[df["Distance"] > 0.0]
    df.to_csv(os.path.join(folder_path, "distances.csv"), index=False, header=False)

    return distance_matrix

# Process the images in a directory creating histogram and distance matrix
def process_images_from_folder(folder_path):
    """Compute the histogram distance matrix for the images in a folder.

    Files whose name ends with one of the supported image extensions
    (case-insensitive) are processed in sorted order so that the resulting
    matrix has a reproducible row/column order.

    Args:
        folder_path: Directory containing the images.

    Returns:
        The DataFrame returned by ``create_distance_matrix``, or ``None``
        when ``folder_path`` is not a directory or contains no images.
    """
    # isdir (not exists): a regular file would pass `exists` and then make
    # os.listdir raise.
    if not os.path.isdir(folder_path):
        print("Error: Specified folder does not exists.")
        return None

    # Leading dots so that only real extensions match (e.g. not "notes_png")
    image_extensions = ('.png', '.jpg', '.jpeg', '.webp', '.tiff')
    image_files = sorted(
        f for f in os.listdir(folder_path) if f.lower().endswith(image_extensions)
    )
    if not image_files:
        print("No images found in the folder.")
        return None

    histograms_list = [
        (image_file, plot_rgb_histogram(os.path.join(folder_path, image_file)))
        for image_file in image_files
    ]

    return create_distance_matrix(histograms_list, image_files, folder_path)

def main(folder_path="datasetSuddiviso"):
    """Build per-subfolder distance matrices, merge them and run clustering.

    For every subdirectory of ``folder_path`` the image distance matrix is
    computed and saved as ``distance_matrix.csv`` inside that subdirectory.
    All matrices are then stacked into ``total_distance_matrix.csv`` in
    ``folder_path`` and that file is passed to ``clusterMatrixMain``.

    Args:
        folder_path: Root directory whose subdirectories contain the images.
            Defaults to ``"datasetSuddiviso"``.
    """
    # Sorted so the order of rows in the merged matrix is reproducible
    subfolders = sorted(
        f for f in os.listdir(folder_path) if os.path.isdir(os.path.join(folder_path, f))
    )
    total_distance_matrix = None

    for subfolder in subfolders:
        subfolder_path = os.path.join(folder_path, subfolder)
        # Distance matrix between images in the subfolder
        distance_matrix = process_images_from_folder(subfolder_path)
        if distance_matrix is None:
            continue

        distance_matrix.to_csv(os.path.join(subfolder_path, "distance_matrix.csv"))

        # Total matrix between all the images
        if total_distance_matrix is None:
            total_distance_matrix = distance_matrix
        else:
            # NOTE(review): images from different subfolders are never
            # compared; the outer join leaves those cells empty and
            # fillna(0) sets them to 0, the same value used for identical
            # histograms. Confirm this is what the clustering step expects.
            total_distance_matrix = pd.concat(
                [total_distance_matrix, distance_matrix], axis=0, join="outer"
            ).fillna(0)

    if total_distance_matrix is None:
        # Nothing was written, so there is no file for the clustering step
        print("No distance matrix was produced; skipping clustering.")
        return

    total_distance_matrix_path = os.path.join(folder_path, "total_distance_matrix.csv")
    total_distance_matrix.to_csv(total_distance_matrix_path)

    clusterMatrixMain(total_distance_matrix_path)  # Hierarchical Clustering

# Run the pipeline only when executed directly (script or notebook cell),
# not when this file is imported as a module.
if __name__ == "__main__":
    main()