In [6]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import os
import glob

from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabasz_score

from scipy.spatial.distance import cdist

import pandas as pd
from pyexcel_ods3 import save_data

def features_to_cluster(db_name):
    """Cluster the features of one representation DB with k-means and store the best result.

    K-means is run once per candidate cluster count and every clustering is
    scored with the Calinski-Harabasz index. The (k, score) table is written
    to an ``.ods`` sheet, and for the best-scoring k the cluster label and the
    distance to the assigned cluster centre of every image are saved through
    the model returned by ``db_cluster``.

    Relies on ``MAX_K``, ``get_features_2d_array`` and ``db_cluster`` being
    defined elsewhere in the notebook.

    Args:
        db_name: File name (not a path) of the features DB inside ``./input``,
            e.g. ``"semantic_representation-all-MiniLM-L12-v2.db"``.

    Returns:
        None. Returns immediately, without computing anything, when the
        output cluster DB already exists in ``./input``.

    Raises:
        ValueError: If the DB holds fewer than 3 samples, so that not even
            k=2 can be clustered and scored.
    """
    # Everything after the first "-" names the feature set; it becomes the
    # suffix of both output files ("" when the name contains no dash).
    _, _, suffix = db_name.partition("-")

    kind = "semantic" if db_name.startswith("semantic_representation-") else "visual"

    # Only strip the extension when it really is ".db", instead of blindly
    # chopping the last three characters.
    sheet_stem = suffix[:-3] if suffix.endswith(".db") else suffix

    output_db_name = f"{kind}_cluster-{suffix}"
    output_ods_name = f"{kind}_cluster_validation-{sheet_stem}.ods"

    # Already clustered on a previous run: nothing to do.
    if os.path.isfile(f"./input/{output_db_name}"):
        return

    # ================

    image_list, features2DArray = get_features_2d_array(db_name)
    n_samples = len(features2DArray)

    # ================

    # NOTE(review): np.arange excludes its stop value, so MAX_K itself is never
    # tried (MAX_K = 3 yields only k = 2). Kept as-is; confirm whether MAX_K is
    # meant to be inclusive.
    requested_ks = np.arange(2, MAX_K) if MAX_K > 2 else [2]

    # The Calinski-Harabasz score needs at most n_samples - 1 clusters, so drop
    # the k values a small DB cannot support instead of failing on them.
    candidate_ks = [int(k) for k in requested_ks if k < n_samples]
    if not candidate_ks:
        raise ValueError(
            f"{db_name}: need at least 3 samples to cluster, got {n_samples}"
        )

    score_rows = []
    best_score = 0
    best_labels = None
    best_distances = None

    for n_clusters in candidate_ks:

        kmeans = KMeans(
            n_clusters=n_clusters,
            init='k-means++',
            random_state=0
        )

        kmeans.fit(features2DArray)
        labels = kmeans.labels_
        score = calinski_harabasz_score(features2DArray, labels)

        # Plain Python numbers keep k an integer in the sheet; going through
        # DataFrame.values would upcast the whole table to float (k -> 2.0).
        score_rows.append([n_clusters, float(score)])

        # Strictly-greater keeps the smallest k on ties. The first clustering
        # is also taken as a fallback so a best result always exists, even if
        # no score is greater than zero.
        if score > best_score or best_labels is None:
            if score > best_score:
                best_score = score
            best_labels = labels
            # Distance of every sample to the centre of its own cluster:
            # pick column labels[i] from row i of the sample-to-centre matrix.
            # Only computed for a new best, since only the best is persisted.
            centre_distances = kmeans.transform(features2DArray)
            best_distances = centre_distances[np.arange(len(labels)), labels]

    # ================

    save_data(f"./input/{output_ods_name}", {
        "result": [["k", "Calinski-Harabasz Score"]] + score_rows
    })

    # ================

    Cluster_DB = db_cluster(output_db_name)

    for index, image_path in enumerate(image_list):
        # Hand plain Python numbers to the model rather than NumPy scalars,
        # which a database driver may not know how to bind.
        Cluster_DB(
            image=image_path,
            cluster=int(best_labels[index]),
            distance=float(best_distances[index])
        ).save()

In [7]:
def db_feature_to_cluster():
    """Run ``features_to_cluster`` on every representation DB found in ``./input``.

    A representation DB is any file matching ``*_representation-*.db``; only
    its file name (without the directory) is passed on.
    """
    representation_paths = glob.glob("./input/*_representation-*.db")
    for file_name in map(os.path.basename, representation_paths):
        features_to_cluster(file_name)

In [8]:
# features_to_cluster("semantic_representation-all-MiniLM-L12-v2.db")
# db_feature_to_cluster()