In [58]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from scipy.spatial.distance import pdist
from sklearn.cluster import KMeans

In [59]:
# optimized partitional models
clara_opt = pd.read_csv('../2_models/with_clean_data/wind/results/partitional_opt/csv_labels_raw/clara.csv')
fuzzy_opt = pd.read_csv('../2_models/with_clean_data/wind/results/partitional_opt/csv_labels_raw/fuzzy_c_means.csv')
kmeans_opt = pd.read_csv('../2_models/with_clean_data/wind/results/partitional_opt/csv_labels_raw/kmeans.csv')

# partitional models
clara = pd.read_csv('../2_models/with_clean_data/wind/results/partitional/csv_labels_raw/clara.csv')
fuzzy = pd.read_csv('../2_models/with_clean_data/wind/results/partitional/csv_labels_raw/fuzzy_c_means.csv')
kmeans = pd.read_csv('../2_models/with_clean_data/wind/results/partitional/csv_labels_raw/kmeans.csv')

# model based
bgmm = pd.read_csv('../2_models/with_clean_data/wind/results/model_based/csv_labels_cut_raw/bgmm.csv')
# dpmm = pd.read_csv('../2_models/with_clean_data/wind/results/model_based/csv_labels_cut_raw/dpmm.csv')
gaussian_mix = pd.read_csv('../2_models/with_clean_data/wind/results/model_based/csv_labels_cut_raw/gaussian_mix.csv')

# optimized model based
bgmm_opt = pd.read_csv('../2_models/with_clean_data/wind/results/model_based_opt/csv_labels_cut_raw/bgmm.csv')
gaussian_mix_opt = pd.read_csv('../2_models/with_clean_data/wind/results/model_based_opt/csv_labels_cut_raw/gaussian_mix.csv')

# optimized hierarchical models
average_link_opt = pd.read_csv('../2_models/with_clean_data/wind/results/hierarchical_opt/csv_labels_cut_raw/average_link.csv')
birch_opt = pd.read_csv('../2_models/with_clean_data/wind/results/hierarchical_opt/csv_labels_cut_raw/birch.csv')
single_link_opt = pd.read_csv('../2_models/with_clean_data/wind/results/hierarchical_opt/csv_labels_cut_raw/single_link.csv')
centroid_link_opt = pd.read_csv('../2_models/with_clean_data/wind/results/hierarchical_opt/csv_labels_cut_raw/centroid_link.csv')
ward_link_opt = pd.read_csv('../2_models/with_clean_data/wind/results/hierarchical_opt/csv_labels_cut_raw/ward_link.csv')

# hierarchical models
average_link = pd.read_csv('../2_models/with_clean_data/wind/results/hierarchical/csv_labels_cut_raw/average_link.csv')
birch = pd.read_csv('../2_models/with_clean_data/wind/results/hierarchical/csv_labels_cut_raw/birch.csv')
single_link = pd.read_csv('../2_models/with_clean_data/wind/results/hierarchical/csv_labels_cut_raw/single_link.csv')
centroid_link = pd.read_csv('../2_models/with_clean_data/wind/results/hierarchical/csv_labels_cut_raw/centroid_link.csv')
ward_link = pd.read_csv('../2_models/with_clean_data/wind/results/hierarchical/csv_labels_cut_raw/ward_link.csv')

# density based models
dbscan = pd.read_csv('../2_models/with_clean_data/wind/results/density_based/csv_labels_cut_raw/DBSCAN.csv')
optics = pd.read_csv('../2_models/with_clean_data/wind/results/density_based/csv_labels_cut_raw/OPTICS.csv')

In [60]:
def dunn_index(df):
    min_intercluster_distances = []
    max_intracluster_distances = []
    for label in df['cluster_label'].unique():
        cluster_points = df[df['cluster_label'] == label][['0', '1', '2']]
        cluster_distances = pdist(cluster_points)
        num_clusters = min(len(df['cluster_label'].unique()), len(cluster_points))
        kmeans = KMeans(n_clusters=num_clusters)
        kmeans.fit(cluster_points)
        min_intercluster_distances.append(min([np.linalg.norm(cluster_points.values - centroid) for centroid in kmeans.cluster_centers_]))
        if len(cluster_distances) > 0:  # Check if cluster_distances has any elements
            max_intracluster_distances.append(max(cluster_distances))
        else:
            max_intracluster_distances.append(0)  # Append a default value
    return min(min_intercluster_distances) / max(max_intracluster_distances)

In [61]:
def xie_beni_index(df):
    # Compute the total scatter
    total_scatter = np.sum(pdist(df[['0', '1', '2']]) ** 2) / (2 * len(df))
    
    # Compute the intra-cluster scatter
    cluster_labels = df['cluster_label'].unique()
    intra_cluster_scatter = 0
    for label in cluster_labels:
        cluster_points = df[df['cluster_label'] == label][['0', '1', '2']]
        cluster_center = np.mean(cluster_points, axis=0)
        intra_cluster_scatter += np.sum(np.linalg.norm(cluster_points - cluster_center, axis=1) ** 2)
    intra_cluster_scatter /= len(df)
    return intra_cluster_scatter / total_scatter

In [62]:
# Define a function to compute evaluation metrics
def compute_metrics(df):
    metrics = {}
    
    if df['cluster_label'].nunique() > 1:
        metrics['Silhouette Score'] = silhouette_score(df[['0', '1', '2']], df['cluster_label'])
        metrics['Davies-Bouldin Index'] = davies_bouldin_score(df[['0', '1', '2']], df['cluster_label'])
        metrics['Calinski-Harabasz Index'] = calinski_harabasz_score(df[['0', '1', '2']], df['cluster_label'])
        metrics['Dunn Index'] = dunn_index(df)
        metrics['Xie-Beni Index'] = xie_beni_index(df)
    else:
        metrics['Silhouette Score'] = 0
        metrics['Davies-Bouldin Index'] = 0
        metrics['Calinski-Harabasz Index'] = 0
        metrics['Dunn Index'] = 0
        metrics['Xie-Beni Index'] = 0

    return metrics


In [63]:
# Create a dictionary with dataframes and their names
dataframes = {
    'clara_opt': clara_opt, 'fuzzy_opt': fuzzy_opt, 'kmeans_opt': kmeans_opt,
    'clara': clara, 'fuzzy': fuzzy, 'kmeans': kmeans,
    'bgmm': bgmm, 'gaussian_mix': gaussian_mix,
    'average_link_opt': average_link_opt, 'birch_opt': birch_opt,
    'single_link_opt': single_link_opt, 'centroid_link_opt': centroid_link_opt,
    'ward_link_opt': ward_link_opt,
    'average_link': average_link, 'birch': birch, 'single_link': single_link,
    'centroid_link': centroid_link, 'ward_link': ward_link,
    'dbscan': dbscan, 'optics': optics, 
    'bgmm_opt': bgmm_opt, 'gaussian_mix_opt': gaussian_mix_opt
 }


In [64]:
results_df = pd.read_csv('results/clean_wind.csv')

In [65]:
results_df = results_df[results_df['Model'] != 'bgmm']


In [66]:
results_df.reset_index(drop=True)

Unnamed: 0,Model,Silhouette Score,Davies-Bouldin Index,Calinski-Harabasz Index,Dunn Index,Xie-Beni Index
0,clara_opt,0.29457,0.947067,26265.6238,9.827205,1.6e-05
1,fuzzy_opt,0.322685,1.275782,21230.563071,24.489767,3e-05
2,kmeans_opt,0.382494,0.873168,34122.210623,19.477919,1.8e-05
3,clara,0.29457,0.947067,26265.6238,9.870511,1.6e-05
4,kmeans,0.353255,0.89305,32743.503392,14.387601,1.4e-05
5,dpmm,0.067576,1.328895,13950.770286,0.0,5e-06
6,gaussian_mix,0.355617,0.817321,34746.580918,6.210563,7e-06
7,average_link_opt,0.678148,0.227147,16.004912,0.0,4.5e-05
8,birch_opt,0.308445,1.246492,24247.086838,28.018507,2.9e-05
9,single_link_opt,0.678148,0.227147,16.004912,0.0,4.5e-05


In [67]:
# Iterate over each dataframe and print the name of the ones without a 'cluster_label' column
for model_name, df in dataframes.items():
    # if 'cluster_label' not in df.columns:
        # print(f"The dataframe '{model_name}' does not have a column named 'cluster_label'.")
        print(model_name)
        print(df.columns)

clara_opt
Index(['0', '1', '2', 'cluster_label', 'asset_id'], dtype='object')
fuzzy_opt
Index(['0', '1', '2', 'cluster', 'asset_id'], dtype='object')
kmeans_opt
Index(['0', '1', '2', 'cluster', 'asset_id'], dtype='object')
clara
Index(['0', '1', '2', 'cluster', 'asset_id'], dtype='object')
fuzzy
Index(['0', '1', '2', 'cluster', 'asset_id'], dtype='object')
kmeans
Index(['0', '1', '2', 'cluster', 'asset_id'], dtype='object')
bgmm
Index(['0', '1', '2', 'cluster_label', 'asset_id'], dtype='object')
gaussian_mix
Index(['0', '1', '2', 'cluster_label', 'asset_id'], dtype='object')
average_link_opt
Index(['0', '1', '2', 'cluster_label', 'asset_id'], dtype='object')
birch_opt
Index(['0', '1', '2', 'cluster_label', 'asset_id'], dtype='object')
single_link_opt
Index(['0', '1', '2', 'cluster_label', 'asset_id'], dtype='object')
centroid_link_opt
Index(['0', '1', '2', 'cluster_label', 'asset_id'], dtype='object')
ward_link_opt
Index(['0', '1', '2', 'cluster_label', 'asset_id'], dtype='object')
ave

In [68]:
# Loop through each dataframe and rename the 'cluster' column to 'cluster_label'
for model_name, df in dataframes.items():
    if 'cluster' in df.columns:
        df.rename(columns={'cluster': 'cluster_label'}, inplace=True)

In [70]:
# Initialize an empty list to store the results
results = []

# Compute and store the evaluation metrics for each dataframe
for model_name, df in dataframes.items():
    print(f'Evaluating {model_name}')
    metrics = compute_metrics(df)
    results.append({'Model': model_name, **metrics})

# Create a DataFrame from the results list
# results_df = pd.DataFrame(results)
results_df_to_append = pd.DataFrame(results)

Evaluating clara_opt


Evaluating fuzzy_opt
Evaluating kmeans_opt
Evaluating clara
Evaluating fuzzy
Evaluating kmeans
Evaluating bgmm
Evaluating gaussian_mix
Evaluating average_link_opt
Evaluating birch_opt
Evaluating single_link_opt
Evaluating centroid_link_opt
Evaluating ward_link_opt
Evaluating average_link
Evaluating birch
Evaluating single_link
Evaluating centroid_link
Evaluating ward_link
Evaluating dbscan
Evaluating optics
Evaluating bgmm_opt
Evaluating gaussian_mix_opt


In [71]:
results_df = pd.concat([results_df, results_df_to_append], axis=0)

In [72]:
# Display the results
results_df.reset_index(drop=True)

Unnamed: 0,Model,Silhouette Score,Davies-Bouldin Index,Calinski-Harabasz Index,Dunn Index,Xie-Beni Index
0,clara_opt,0.29457,0.947067,26265.6238,9.827205,1.6e-05
1,fuzzy_opt,0.322685,1.275782,21230.563071,24.489767,3e-05
2,kmeans_opt,0.382494,0.873168,34122.210623,19.477919,1.8e-05
3,clara,0.29457,0.947067,26265.6238,9.870511,1.6e-05
4,kmeans,0.353255,0.89305,32743.503392,14.387601,1.4e-05
5,dpmm,0.067576,1.328895,13950.770286,0.0,5e-06
6,gaussian_mix,0.355617,0.817321,34746.580918,6.210563,7e-06
7,average_link_opt,0.678148,0.227147,16.004912,0.0,4.5e-05
8,birch_opt,0.308445,1.246492,24247.086838,28.018507,2.9e-05
9,single_link_opt,0.678148,0.227147,16.004912,0.0,4.5e-05


In [73]:
results_df.to_csv('results/clean_wind.csv', index=False)

In [74]:
# Convert columns to numeric data types
normalized_df = results_df.copy()
for col in normalized_df.columns[1:]:
    normalized_df[col] = pd.to_numeric(normalized_df[col], errors='coerce')

# Drop rows with missing or non-numeric values
normalized_df = normalized_df.dropna()

# Define ideal and anti-ideal solutions for each metric
ideal_solution = normalized_df.copy()
anti_ideal_solution = normalized_df.copy()

# Define metrics where higher values are better
higher_is_better = ['Silhouette Score', 'Dunn Index', 'Calinski-Harabasz Index']

# Define metrics where lower values are better
lower_is_better = ['Davies-Bouldin Index', 'Xie-Beni Index']

# Set ideal and anti-ideal solutions for metrics where higher values are better
for metric in higher_is_better:
    ideal_solution[metric] = normalized_df[metric].max()
    anti_ideal_solution[metric] = normalized_df[metric].min()

# Set ideal and anti-ideal solutions for metrics where lower values are better
for metric in lower_is_better:
    ideal_solution[metric] = normalized_df[metric].min()
    anti_ideal_solution[metric] = normalized_df[metric].max()

# Calculate the distance from each alternative to the ideal and anti-ideal solutions
dist_to_ideal = ((normalized_df.iloc[:,1:] - ideal_solution) ** 2).sum(axis=1) ** 0.5
dist_to_anti_ideal = ((normalized_df.iloc[:,1:] - anti_ideal_solution) ** 2).sum(axis=1) ** 0.5

# Calculate the relative closeness to the ideal solution
normalized_df['Closeness'] = dist_to_anti_ideal / (dist_to_ideal + dist_to_anti_ideal)

# Sort the models based on their relative closeness to the ideal solution
ranked_models = normalized_df.sort_values(by='Closeness', ascending=False)

# Print the ranked models
ranked_models[['Model', 'Closeness']]

Unnamed: 0,Model,Closeness
6,gaussian_mix,0.999372
2,kmeans_opt,0.982024
4,kmeans,0.942333
21,fuzzy,0.932699
20,ward_link,0.830046
11,ward_link_opt,0.803346
3,clara,0.755844
0,clara_opt,0.755844
17,birch,0.741712
15,gaussian_mix_opt,0.725543


3 best models: gaussian_mix, kmeans_opt, fuzzy

(not including kmeans since the 2nd best model is kmeans_opt)

In [76]:
results_df.iloc[[6,2,4]] 

Unnamed: 0,Model,Silhouette Score,Davies-Bouldin Index,Calinski-Harabasz Index,Dunn Index,Xie-Beni Index
6,gaussian_mix,0.355617,0.817321,34746.580918,6.210563,7e-06
2,kmeans_opt,0.382494,0.873168,34122.210623,19.477919,1.8e-05
4,kmeans,0.353255,0.89305,32743.503392,14.387601,1.4e-05
