In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from scipy.spatial.distance import pdist
from sklearn.cluster import KMeans

In [2]:
# optimized partitional models
clara_opt = pd.read_csv('../2_models/with_raw_data/solar/results/partitional_opt/csv_labels_raw/clara.csv')
fuzzy_opt = pd.read_csv('../2_models/with_raw_data/solar/results/partitional_opt/csv_labels_raw/fuzzy_c_means.csv')
kmeans_opt = pd.read_csv('../2_models/with_raw_data/solar/results/partitional_opt/csv_labels_raw/kmeans.csv')

# partitionalal models
clara = pd.read_csv('../2_models/with_raw_data/solar/results/partitional/csv_labels_raw/clara.csv')
fuzzy = pd.read_csv('../2_models/with_raw_data/solar/results/partitional/csv_labels_raw/fuzzy_c_means.csv')
kmeans = pd.read_csv('../2_models/with_raw_data/solar/results/partitional/csv_labels_raw/kmeans.csv')

# model based
bgmm = pd.read_csv('../2_models/with_raw_data/solar/results/model_based/csv_labels_cut_raw/bgmm.csv')
dpmm = pd.read_csv('../2_models/with_raw_data/solar/results/model_based/csv_labels_cut_raw/dpmm.csv')
gaussian_mix = pd.read_csv('../2_models/with_raw_data/solar/results/model_based/csv_labels_cut_raw/gaussian_mix.csv')

# model based
bgmm_opt = pd.read_csv('../2_models/with_raw_data/solar/results/model_based_opt/csv_labels_cut_raw/bgmm.csv')
gaussian_mix_opt = pd.read_csv('../2_models/with_raw_data/solar/results/model_based_opt/csv_labels_cut_raw/gaussian_mix.csv')

# optimized model based
bgmm_opt = pd.read_csv('../2_models/with_raw_data/solar/results/model_based_opt/csv_labels_cut_raw/bgmm.csv')
gaussian_mix_opt = pd.read_csv('../2_models/with_raw_data/solar/results/model_based_opt/csv_labels_cut_raw/gaussian_mix.csv')

# optimized hierarchical models
average_link_opt = pd.read_csv('../2_models/with_raw_data/solar/results/hierarchical_opt/csv_labels_cut_raw/average_link.csv')
birch_opt = pd.read_csv('../2_models/with_raw_data/solar/results/hierarchical_opt/csv_labels_cut_raw/birch.csv')
single_link_opt = pd.read_csv('../2_models/with_raw_data/solar/results/hierarchical_opt/csv_labels_cut_raw/single_link.csv')
centroid_link_opt = pd.read_csv('../2_models/with_raw_data/solar/results/hierarchical_opt/csv_labels_cut_raw/centroid_link.csv')
ward_link_opt = pd.read_csv('../2_models/with_raw_data/solar/results/hierarchical_opt/csv_labels_cut_raw/ward_link.csv')

# hierarchical models
average_link = pd.read_csv('../2_models/with_raw_data/solar/results/hierarchical/csv_labels_cut_raw/average_link.csv')
birch = pd.read_csv('../2_models/with_raw_data/solar/results/hierarchical/csv_labels_cut_raw/birch.csv')
single_link = pd.read_csv('../2_models/with_raw_data/solar/results/hierarchical/csv_labels_cut_raw/single_link.csv')
centroid_link = pd.read_csv('../2_models/with_raw_data/solar/results/hierarchical/csv_labels_cut_raw/centroid_link.csv')
ward_link = pd.read_csv('../2_models/with_raw_data/solar/results/hierarchical/csv_labels_cut_raw/ward_link.csv')

# density based models
dbscan = pd.read_csv('../2_models/with_raw_data/solar/results/density_based/csv_labels_cut_raw/DBSCAN.csv')
optics = pd.read_csv('../2_models/with_raw_data/solar/results/density_based/csv_labels_cut_raw/OPTICS.csv')

In [3]:
def dunn_index(df):
    min_intercluster_distances = []
    max_intracluster_distances = []
    for label in df['cluster_label'].unique():
        cluster_points = df[df['cluster_label'] == label][['PC1', 'PC2']]
        cluster_distances = pdist(cluster_points)
        num_clusters = min(len(df['cluster_label'].unique()), len(cluster_points))
        kmeans = KMeans(n_clusters=num_clusters)
        kmeans.fit(cluster_points)
        min_intercluster_distances.append(min([np.linalg.norm(cluster_points.values - centroid) for centroid in kmeans.cluster_centers_]))
        if len(cluster_distances) > 0:  # Check if cluster_distances has any elements
            max_intracluster_distances.append(max(cluster_distances))
        else:
            max_intracluster_distances.append(0)  # Append a default value
    return min(min_intercluster_distances) / max(max_intracluster_distances)

In [4]:
def xie_beni_index(df):
    # Compute the total scatter
    total_scatter = np.sum(pdist(df[['PC1', 'PC2']]) ** 2) / (2 * len(df))
    
    # Compute the intra-cluster scatter
    cluster_labels = df['cluster_label'].unique()
    intra_cluster_scatter = 0
    for label in cluster_labels:
        cluster_points = df[df['cluster_label'] == label][['PC1', 'PC2']]
        cluster_center = np.mean(cluster_points, axis=0)
        intra_cluster_scatter += np.sum(np.linalg.norm(cluster_points - cluster_center, axis=1) ** 2)
    intra_cluster_scatter /= len(df)
    return intra_cluster_scatter / total_scatter

In [5]:
# Define a function to compute evaluation metrics
def compute_metrics(df):
    metrics = {}
    
    if df['cluster_label'].nunique() > 1:
        metrics['Silhouette Score'] = silhouette_score(df[['PC1', 'PC2']], df['cluster_label'])
        metrics['Davies-Bouldin Index'] = davies_bouldin_score(df[['PC1', 'PC2']], df['cluster_label'])
        metrics['Calinski-Harabasz Index'] = calinski_harabasz_score(df[['PC1', 'PC2']], df['cluster_label'])
        metrics['Dunn Index'] = dunn_index(df)
        metrics['Xie-Beni Index'] = xie_beni_index(df)
    else:
        metrics['Silhouette Score'] = 0
        metrics['Davies-Bouldin Index'] = 0
        metrics['Calinski-Harabasz Index'] = 0
        metrics['Dunn Index'] = 0
        metrics['Xie-Beni Index'] = 0

    return metrics


In [6]:
# Create a dictionary with dataframes and their names
dataframes = {
    'clara_opt': clara_opt, 'fuzzy_opt': fuzzy_opt, 'kmeans_opt': kmeans_opt,
    'clara': clara, 'fuzzy': fuzzy, 'kmeans': kmeans,
    'bgmm': bgmm, 'dpmm': dpmm, 'gaussian_mix': gaussian_mix,
    'average_link_opt': average_link_opt, 'birch_opt': birch_opt,
    'single_link_opt': single_link_opt, 'centroid_link_opt': centroid_link_opt,
    'ward_link_opt': ward_link_opt,
    'average_link': average_link, 'birch': birch, 'single_link': single_link,
    'centroid_link': centroid_link, 'ward_link': ward_link,
    'dbscan': dbscan, 'optics': optics,
    'bgmm_opt': bgmm_opt, 'gaussian_mix_opt': gaussian_mix_opt
 }

In [7]:
# Iterate over each dataframe and print the name of the ones without a 'cluster_label' column
for model_name, df in dataframes.items():
    if 'cluster_label' not in df.columns:
        print(f"The dataframe '{model_name}' does not have a column named 'cluster_label'.")
        print(df.columns)

The dataframe 'fuzzy_opt' does not have a column named 'cluster_label'.
Index(['PC1', 'PC2', 'cluster', 'asset_id'], dtype='object')
The dataframe 'kmeans_opt' does not have a column named 'cluster_label'.
Index(['PC1', 'PC2', 'cluster', 'asset_id'], dtype='object')
The dataframe 'clara' does not have a column named 'cluster_label'.
Index(['PC1', 'PC2', 'cluster', 'asset_id'], dtype='object')
The dataframe 'fuzzy' does not have a column named 'cluster_label'.
Index(['PC1', 'PC2', 'cluster', 'asset_id'], dtype='object')
The dataframe 'kmeans' does not have a column named 'cluster_label'.
Index(['PC1', 'PC2', 'cluster', 'asset_id'], dtype='object')


In [8]:
# Loop through each dataframe and rename the 'cluster' column to 'cluster_label'
for model_name, df in dataframes.items():
    if 'cluster' in df.columns:
        df.rename(columns={'cluster': 'cluster_label'}, inplace=True)

In [9]:
# Initialize an empty list to store the results
results = []

# Compute and store the evaluation metrics for each dataframe
for model_name, df in dataframes.items():
    print(f'Evaluating {model_name}')
    metrics = compute_metrics(df)
    results.append({'Model': model_name, **metrics})

# Create a DataFrame from the results list
results_df = pd.DataFrame(results)

Evaluating clara_opt


Evaluating fuzzy_opt
Evaluating kmeans_opt
Evaluating clara
Evaluating fuzzy
Evaluating kmeans
Evaluating bgmm
Evaluating dpmm
Evaluating gaussian_mix
Evaluating average_link_opt
Evaluating birch_opt
Evaluating single_link_opt
Evaluating centroid_link_opt
Evaluating ward_link_opt
Evaluating average_link
Evaluating birch
Evaluating single_link
Evaluating centroid_link
Evaluating ward_link
Evaluating dbscan
Evaluating optics
Evaluating bgmm_opt
Evaluating gaussian_mix_opt


In [10]:
# Display the results
results_df

Unnamed: 0,Model,Silhouette Score,Davies-Bouldin Index,Calinski-Harabasz Index,Dunn Index,Xie-Beni Index
0,clara_opt,0.404663,0.974011,29073.554411,19.71588,2.9e-05
1,fuzzy_opt,0.058591,1.228405,14600.475391,0.27819,2.9e-05
2,kmeans_opt,0.432954,0.699864,38599.725526,8.056062,1e-05
3,clara,0.413417,0.722929,35983.682794,6.954973,1.1e-05
4,fuzzy,0.327391,1.06303,24270.127669,8.014751,1.5e-05
5,kmeans,0.432954,0.699864,38599.725526,8.113263,1e-05
6,bgmm,0.391356,1.008046,22842.25375,5.356896,1.8e-05
7,dpmm,0.126792,1.208988,15332.37052,0.011412,5e-06
8,gaussian_mix,0.416887,0.888387,26242.259563,5.580253,1.7e-05
9,average_link_opt,0.648253,0.332897,24.194469,0.113092,5e-05


In [11]:
results_df.to_csv('results/raw_solar.csv', index=False)

In [12]:
# Define the weights for each metric
weights = {
    'Silhouette Score': 0.3,
    'Davies-Bouldin Index': 0.1,
    'Calinski-Harabasz Index': 0.3,
    'Dunn Index': 0.2,
    'Xie-Beni Index': 0.1
}

# Convert columns to numeric data types
normalized_df = results_df.copy()
for col in normalized_df.columns[1:]:
    normalized_df[col] = pd.to_numeric(normalized_df[col], errors='coerce')

# Drop rows with missing or non-numeric values
normalized_df = normalized_df.dropna()

# Define ideal and anti-ideal solutions for each metric
ideal_solution = normalized_df.copy()
anti_ideal_solution = normalized_df.copy()

# Define metrics where higher values are better
higher_is_better = ['Silhouette Score', 'Dunn Index', 'Calinski-Harabasz Index']

# Define metrics where lower values are better
lower_is_better = ['Davies-Bouldin Index', 'Xie-Beni Index']

# Set ideal and anti-ideal solutions for metrics where higher values are better
for metric in higher_is_better:
    ideal_solution[metric] = normalized_df[metric].max()
    anti_ideal_solution[metric] = normalized_df[metric].min()

# Set ideal and anti-ideal solutions for metrics where lower values are better
for metric in lower_is_better:
    ideal_solution[metric] = normalized_df[metric].min()
    anti_ideal_solution[metric] = normalized_df[metric].max()

# Calculate the distance from each alternative to the ideal and anti-ideal solutions
dist_to_ideal = ((normalized_df.iloc[:,1:] - ideal_solution) ** 2).sum(axis=1) ** 0.5
dist_to_anti_ideal = ((normalized_df.iloc[:,1:] - anti_ideal_solution) ** 2).sum(axis=1) ** 0.5

# Calculate the relative closeness to the ideal solution
normalized_df['Closeness'] = dist_to_anti_ideal / (dist_to_ideal + dist_to_anti_ideal)

# Sort the models based on their relative closeness to the ideal solution
ranked_models = normalized_df.sort_values(by='Closeness', ascending=False)

# Print the ranked models
ranked_models[['Model', 'Closeness']]

Unnamed: 0,Model,Closeness
5,kmeans,0.999699
2,kmeans_opt,0.999698
3,clara,0.932187
18,ward_link,0.817618
13,ward_link_opt,0.782413
0,clara_opt,0.753065
22,gaussian_mix_opt,0.721307
8,gaussian_mix,0.679673
4,fuzzy,0.628552
21,bgmm_opt,0.591539


3 best models: kmeans, clara, ward

(not including kmeans_opt since the best model is kmeans)

In [14]:
results_df.iloc[[5,3,18]]

Unnamed: 0,Model,Silhouette Score,Davies-Bouldin Index,Calinski-Harabasz Index,Dunn Index,Xie-Beni Index
5,kmeans,0.432954,0.699864,38599.725526,8.113263,1e-05
3,clara,0.413417,0.722929,35983.682794,6.954973,1.1e-05
18,ward_link,0.467547,0.7403,31563.854623,10.02719,1.5e-05
