In [1]:
!pip install -q opencv-python matplotlib


You should consider upgrading via the '/Users/filipporeucci/oilSpotRecognition/myvenv/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
import os, cv2, csv
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from cv2 import GaussianBlur
#from image_hclustering import clusterMatrixMain

In [3]:
def plot_rgb_histogram(image_path):    
    ddepth = cv2.CV_16S
    kernel_size = 3
    
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE) # Load an image

    # Apply Gaussian Blur - reduce noise
    image = cv2.GaussianBlur(image, (3, 3), 0)
    
    # Apply Laplace function
    dst = cv2.Laplacian(image, ddepth, ksize=kernel_size) 
    
    # converting back to uint8
    abs_dst = cv2.convertScaleAbs(dst)
    
    hist_gray = cv2.calcHist([abs_dst], [0], None, [256], [0, 256])
    return hist_gray

In [4]:
#Creating distance matrix between the histograms contained in the directory
def create_distance_matrix(histograms_list, image_files, folder_path):
    distance_matrix = pd.DataFrame(0, index=image_files, columns=image_files, dtype='float64')
    image_files = str(image_files)
    distancesFile = []

    # Calculating distance between images histograms
    for i in range(len(histograms_list)):
        for j in range(i, len(histograms_list)):
            image1, hist1 = histograms_list[i]
            image2, hist2 = histograms_list[j]
            
            total_distance = 0
            total_distance +=  cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL) 
            #total_distance +=  cv2.compareHist(hist1, hist2, cv2.HISTCMP_BHATTACHARYYA) # Distance between histograms
            total_distance = 1 - total_distance    # Distance = 0 means images are equals

            distance_matrix.loc[image1, image2] = total_distance
            distance_matrix.loc[image2, image1] = total_distance

            distancesFile.append([image1, image2, total_distance])

    df = pd.DataFrame(distancesFile, columns=["Image1", "Image2", "Distance"])

    df = df[df["Distance"] > 0.0] #removing 0.0 distances rows
    df.to_csv(os.path.join(folder_path, "distances.csv"), index=False, header=False)
    
    return distance_matrix


In [5]:
#Process the images in a directory creating histogram and distance matrix
def process_images_from_folder(folder_path):
    if not os.path.exists(folder_path):
        print("Error: Specified folder does not exists.")
        return None
    
    image_files = [f for f in os.listdir(folder_path) if f.lower().endswith(('png', 'jpg', 'jpeg', 'webp', 'tiff'))]
    if not image_files:
        print("No images found in the folder.")
        return None
    
    histograms_list = []
    label = []
    for image_file in image_files:
        image_path = os.path.join(folder_path, image_file)
        hist = plot_rgb_histogram(image_path)  
        histograms_list.append((image_file, hist))
        label.append([image_file, image_file.split("_")[1].split(".")[0]]) # Creating label record for labels.csv file
    
    #labeldf = pd.DataFrame(label, columns=["File", "Label"])
    #print(labeldf)
    return create_distance_matrix(histograms_list, image_files, folder_path), label


In [6]:
import sys
import pandas as pd
import numpy as np
from sklearn.metrics import *
from sklearn.cluster import *
from sklearn import metrics
import os


def clusterMatrixMain(total_distance_matrix_path,total_labels_path):
    # Path of the total distance matrix
    distance_matrix = pd.read_csv(total_distance_matrix_path, sep=",", header=None, index_col=None)
    labels = pd.read_csv(total_labels_path, sep=",")

    distance_matrix = distance_matrix.drop(index=distance_matrix.index[0], columns=distance_matrix.columns[0])
    n_clusters = 3
    
    # Read the true lables assigned to every Id 
    labels_true = list(labels["Label"])
    
    # Execute clustering with single linkage and determines the predicted labels for each molecule
    model = AgglomerativeClustering(n_clusters=n_clusters, linkage ='single').fit(distance_matrix)

    labels_pred = model.fit_predict(distance_matrix)

    # Compute the metrics and print the evaluations
    print("Method: single")
    print("Rand_score", metrics.rand_score(labels_true, labels_pred))
    print("Homogeneity_score", metrics.homogeneity_score(labels_true, labels_pred))
    print("Completeness_score", metrics.completeness_score(labels_true, labels_pred))
        
    # Execute clustering with complete linkage and determines the predicted labels for each molecule

    model = AgglomerativeClustering(n_clusters=n_clusters, linkage ='complete').fit(distance_matrix)
    labels_pred = model.fit_predict(distance_matrix)

    # Compute the metrics and print the evaluations
    print("Method: complete")
    print("Rand_score", metrics.rand_score(labels_true, labels_pred))
    print("Homogeneity_score", metrics.homogeneity_score(labels_true, labels_pred))
    print("Completeness_score", metrics.completeness_score(labels_true, labels_pred))


    # Execute clustering with average linkage and determines the predicted labels for each molecule

    model = AgglomerativeClustering(n_clusters=n_clusters, linkage ='average').fit(distance_matrix)
    labels_pred = model.fit_predict(distance_matrix)

    # Compute the metrics and print the evaluations
    print("Method: average")
    print("Rand_score", metrics.rand_score(labels_true, labels_pred))
    print("Homogeneity_score", metrics.homogeneity_score(labels_true, labels_pred))
    print("Completeness_score", metrics.completeness_score(labels_true, labels_pred))


In [7]:
folder_path = "datasetSuddiviso"
subfolders = [f for f in os.listdir(folder_path) if os.path.isdir(os.path.join(folder_path, f))]
total_distance_matrix = None
total_labels = []

for subfolder in subfolders: 
    subfolder_path = os.path.join(folder_path, subfolder)
    distance_matrix, label = process_images_from_folder(subfolder_path) # Distance matrix between images in the subfolder

    if distance_matrix is not None:
        distance_matrix.to_csv(os.path.join(subfolder_path, "distance_matrix.csv"))

        if total_distance_matrix is None: # Total matrix between all the images
            total_distance_matrix = distance_matrix
        else:
            total_distance_matrix = pd.concat([total_distance_matrix, distance_matrix], axis=0, join="outer").fillna(0)   
        
    total_labels.extend(label)

total_distance_matrix_path = os.path.join(folder_path, "total_distance_matrix.csv")
total_labels_path = os.path.join(folder_path, "labels.csv")

pd.DataFrame(total_labels, columns=["File", "Label"]).to_csv(total_labels_path,index=False)

if total_distance_matrix is not None:
    total_distance_matrix.to_csv(total_distance_matrix_path)



In [8]:
clusterMatrixMain(total_distance_matrix_path=total_distance_matrix_path,total_labels_path=total_labels_path)

Method: single
Rand_score 0.3421923937360179
Homogeneity_score 0.011587603285713899
Completeness_score 0.1149259971743168
Method: complete
Rand_score 0.3421923937360179
Homogeneity_score 0.011587603285713899
Completeness_score 0.1149259971743168
Method: average
Rand_score 0.3421923937360179
Homogeneity_score 0.011587603285713899
Completeness_score 0.1149259971743168


  out = hierarchy.linkage(X, method=linkage, metric=affinity)
  out = hierarchy.linkage(X, method=linkage, metric=affinity)
  out = hierarchy.linkage(X, method=linkage, metric=affinity)
  out = hierarchy.linkage(X, method=linkage, metric=affinity)


In [9]:
'''# Generates a RGB histogram for a single image
def plot_rgb_histogram(image_path):    
    ddepth = cv2.CV_16S
    kernel_size = 3
    
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE) # Load an image

    # Apply Gaussian Blur - reduce noise
    image = cv2.GaussianBlur(image, (3, 3), 0)
    
    # Apply Laplace function
    dst = cv2.Laplacian(image, ddepth, ksize=kernel_size) 
    
    # converting back to uint8
    abs_dst = cv2.convertScaleAbs(dst)
    
    hist_gray = cv2.calcHist([abs_dst], [0], None, [256], [0, 256])
    return hist_gray'''

'''#Creating distance matrix between the histograms contained in the directory
def create_distance_matrix(histograms_list, image_files, folder_path):
    distance_matrix = pd.DataFrame(0, index=image_files, columns=image_files, dtype='float64')
    image_files = str(image_files)
    distancesFile = []

    # Calculating distance between images histograms
    for i in range(len(histograms_list)):
        for j in range(i, len(histograms_list)):
            image1, hist1 = histograms_list[i]
            image2, hist2 = histograms_list[j]
            
            total_distance = 0
            #total_distance +=  cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL) 
            total_distance +=  cv2.compareHist(hist1, hist2, cv2.HISTCMP_BHATTACHARYYA) # Distance between histograms
            #total_distance = 1 - total_distance    # Distance = 0 means images are equals

            distance_matrix.loc[image1, image2] = total_distance
            distance_matrix.loc[image2, image1] = total_distance

            distancesFile.append([image1, image2, total_distance])

    df = pd.DataFrame(distancesFile, columns=["Image1", "Image2", "Distance"])

    df = df[df["Distance"] > 0.0] #removing 0.0 distances rows
    df.to_csv(os.path.join(folder_path, "distances.csv"), index=False, header=False)
    
    return distance_matrix
'''

'''
#Process the images in a directory creating histogram and distance matrix
def process_images_from_folder(folder_path):
    if not os.path.exists(folder_path):
        print("Error: Specified folder does not exists.")
        return None
    
    image_files = [f for f in os.listdir(folder_path) if f.lower().endswith(('png', 'jpg', 'jpeg', 'webp', 'tiff'))]
    if not image_files:
        print("No images found in the folder.")
        return None
    
    histograms_list = []
    label = []
    for image_file in image_files:
        image_path = os.path.join(folder_path, image_file)
        hist = plot_rgb_histogram(image_path)  
        histograms_list.append((image_file, hist))
        label.append([image_file, image_file.split("_")[1].split(".")[0]]) # Creating label record for labels.csv file
    
    #labeldf = pd.DataFrame(label, columns=["File", "Label"])
    #print(labeldf)
    return create_distance_matrix(histograms_list, image_files, folder_path), label

def main():
    folder_path = "datasetSuddiviso"
    subfolders = [f for f in os.listdir(folder_path) if os.path.isdir(os.path.join(folder_path, f))]
    total_distance_matrix = None
    total_labels = []

    for subfolder in subfolders: 
        subfolder_path = os.path.join(folder_path, subfolder)
        distance_matrix, label = process_images_from_folder(subfolder_path) # Distance matrix between images in the subfolder

        if distance_matrix is not None:
            distance_matrix.to_csv(os.path.join(subfolder_path, "distance_matrix.csv"))

            if total_distance_matrix is None: # Total matrix between all the images
                total_distance_matrix = distance_matrix
            else:
                total_distance_matrix = pd.concat([total_distance_matrix, distance_matrix], axis=0, join="outer").fillna(0)   
            
        total_labels.extend(label)

    total_distance_matrix_path = os.path.join(folder_path, "total_distance_matrix.csv")
    total_labels_path = os.path.join(folder_path, "labels.csv")

    pd.DataFrame(total_labels, columns=["File", "Label"]).to_csv(total_labels_path,index=False)

    if total_distance_matrix is not None:
        total_distance_matrix.to_csv(total_distance_matrix_path)
    
    clusterMatrixMain(total_distance_matrix_path=total_distance_matrix_path,total_labels_path=total_labels_path)

main()'''

'\n#Process the images in a directory creating histogram and distance matrix\ndef process_images_from_folder(folder_path):\n    if not os.path.exists(folder_path):\n        print("Error: Specified folder does not exists.")\n        return None\n    \n    image_files = [f for f in os.listdir(folder_path) if f.lower().endswith((\'png\', \'jpg\', \'jpeg\', \'webp\', \'tiff\'))]\n    if not image_files:\n        print("No images found in the folder.")\n        return None\n    \n    histograms_list = []\n    label = []\n    for image_file in image_files:\n        image_path = os.path.join(folder_path, image_file)\n        hist = plot_rgb_histogram(image_path)  \n        histograms_list.append((image_file, hist))\n        label.append([image_file, image_file.split("_")[1].split(".")[0]]) # Creating label record for labels.csv file\n    \n    #labeldf = pd.DataFrame(label, columns=["File", "Label"])\n    #print(labeldf)\n    return create_distance_matrix(histograms_list, image_files, folder_