In [1]:
import pandas as pd

In [2]:
clean_wind = pd.read_csv('results/clean_wind.csv')
non_clean_wind = pd.read_csv('results/raw_wind.csv')

In [3]:
clean_wind['Cleaned'] = 'yes'
non_clean_wind['Cleaned'] = 'no'

In [4]:
wind_results = pd.concat([clean_wind, non_clean_wind], ignore_index=True)
wind_results

Unnamed: 0,Model,Silhouette Score,Davies-Bouldin Index,Calinski-Harabasz Index,Dunn Index,Xie-Beni Index,Cleaned
0,clara_opt,0.29457,0.947067,26265.6238,9.827205,1.6e-05,yes
1,fuzzy_opt,0.322685,1.275782,21230.563071,24.489767,3e-05,yes
2,kmeans_opt,0.382494,0.873168,34122.210623,19.477919,1.8e-05,yes
3,clara,0.29457,0.947067,26265.6238,9.870511,1.6e-05,yes
4,kmeans,0.353255,0.89305,32743.503392,14.387601,1.4e-05,yes
5,dpmm,0.067576,1.328895,13950.770286,0.0,5e-06,yes
6,gaussian_mix,0.355617,0.817321,34746.580918,6.210563,7e-06,yes
7,average_link_opt,0.678148,0.227147,16.004912,0.0,4.5e-05,yes
8,birch_opt,0.308445,1.246492,24247.086838,28.018507,2.9e-05,yes
9,single_link_opt,0.678148,0.227147,16.004912,0.0,4.5e-05,yes


In [6]:
# Convert columns to numeric data types
normalized_df = wind_results.iloc[:,:-1].copy()
for col in normalized_df.columns[1:]:
    normalized_df[col] = pd.to_numeric(normalized_df[col], errors='coerce')

# Drop rows with missing or non-numeric values
normalized_df = normalized_df.dropna()

# Define ideal and anti-ideal solutions for each metric
ideal_solution = normalized_df.copy()
anti_ideal_solution = normalized_df.copy()

# Define metrics where higher values are better
higher_is_better = ['Silhouette Score', 'Dunn Index', 'Calinski-Harabasz Index']

# Define metrics where lower values are better
lower_is_better = ['Davies-Bouldin Index', 'Xie-Beni Index']

# Set ideal and anti-ideal solutions for metrics where higher values are better
for metric in higher_is_better:
    ideal_solution[metric] = normalized_df[metric].max()
    anti_ideal_solution[metric] = normalized_df[metric].min()

# Set ideal and anti-ideal solutions for metrics where lower values are better
for metric in lower_is_better:
    ideal_solution[metric] = normalized_df[metric].min()
    anti_ideal_solution[metric] = normalized_df[metric].max()

# Calculate the distance from each alternative to the ideal and anti-ideal solutions
dist_to_ideal = ((normalized_df.iloc[:,1:] - ideal_solution) ** 2).sum(axis=1) ** 0.5
dist_to_anti_ideal = ((normalized_df.iloc[:,1:] - anti_ideal_solution) ** 2).sum(axis=1) ** 0.5

# Calculate the relative closeness to the ideal solution
normalized_df['Closeness'] = dist_to_anti_ideal / (dist_to_ideal + dist_to_anti_ideal)

# Sort the models based on their relative closeness to the ideal solution
ranked_models = normalized_df.sort_values(by='Closeness', ascending=False)

# Print the ranked models
ranked_models[['Model', 'Closeness']]

Unnamed: 0,Model,Closeness
28,kmeans,0.999755
25,kmeans_opt,0.999754
27,fuzzy,0.998215
6,gaussian_mix,0.978707
45,gaussian_mix,0.967396
2,kmeans_opt,0.961131
36,birch,0.944672
4,kmeans,0.922296
21,fuzzy,0.912871
39,ward_link,0.860607


In [8]:
merged_df = pd.merge(ranked_models[['Model', 'Closeness']], wind_results['Cleaned'], left_index=True, right_index=True)
merged_df

Unnamed: 0,Model,Closeness,Cleaned
28,kmeans,0.999755,no
25,kmeans_opt,0.999754,no
27,fuzzy,0.998215,no
6,gaussian_mix,0.978707,yes
45,gaussian_mix,0.967396,no
2,kmeans_opt,0.961131,yes
36,birch,0.944672,no
4,kmeans,0.922296,yes
21,fuzzy,0.912871,yes
39,ward_link,0.860607,no
