# Evaluation of clustering

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sb
import os

# Import
- features
- diagnoses
- features of consultations with at least one of the top 10 diagnoses
- features after dimensionality reduction
- results of all clustering approaches

In [None]:
# Load every input of the evaluation from disk.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling, so only
# load files that were produced by this project's own pipeline.

# import features
features = pd.read_pickle("data/rwanda/features.pickle")

# import diagnoses
diagnoses = pd.read_pickle("data/rwanda/diagnoses.pickle")

# import features in 2D after dimensionality reduction
features_after_dimensionality_reduction = pd.read_pickle("data/rwanda/features_after_dimensionality_reduction.pickle")

# import features of consultations with at least one top 10 diagnosis
# (this path carries no ".pickle" suffix, unlike the ones above -- TODO confirm it matches the file on disk)
features_consultations_with_at_least_one_top_10_diagnosis = pd.read_pickle("data/rwanda/features_consultations_with_at_least_one_top_10_diagnosis")

# import diagnoses of consultations with at least one top 10 diagnosis
# (same remark about the missing ".pickle" suffix -- TODO confirm)
diagnoses_consultations_with_at_least_one_top_10_diagnosis = pd.read_pickle("data/rwanda/diagnoses_consultations_with_at_least_one_top_10_diagnosis")

# import clustering results
clustering_results_dir = "data/rwanda/clustering_results"

# os.listdir returns entries in arbitrary order and also lists hidden files and
# sub-directories (e.g. ".ipynb_checkpoints"). Sort for a reproducible plot
# order and keep only regular, non-hidden files so read_pickle never receives
# something that is not a result file.
clustering_result_files = sorted(
    entry
    for entry in os.listdir(clustering_results_dir)
    if not entry.startswith(".")
    and os.path.isfile(os.path.join(clustering_results_dir, entry))
)

# One dict per clustering approach:
#   "method": display name derived from the file name
#             (".pickle" removed, "k_" -> "K-", remaining "_" -> " ")
#   "result": the unpickled clustering result
clustering_results = []
for file in clustering_result_files:
    clustering_results.append({
        "method": file.replace(".pickle", "").replace("k_", "K-").replace("_", " "),
        "result": pd.read_pickle(os.path.join(clustering_results_dir, file)),
    })

# Basic comparison of clustering results

In [None]:
# Draw one bar chart per clustering approach showing how many consultations
# (rows of the result) fall into each cluster.
for clustering in clustering_results:
    # rows per cluster label, as a two-column frame: cluster / number_consultations
    cluster_sizes = (
        clustering["result"]
        .groupby("cluster")
        .size()
        .reset_index(name="number_consultations")
    )
    # human-readable axis captions
    axis_labels = {
        "cluster": "Cluster",
        "number_consultations": "Number consultations",
    }
    chart_title = f"Number of consultations per cluster <br><sup>for clustering approach: '{clustering['method']}'</sup>"
    fig = px.bar(
        cluster_sizes,
        x="cluster",
        y="number_consultations",
        labels=axis_labels,
        title=chart_title,
    )
    fig.show()

In [None]:
# which consultations switched their clusters

In [None]:
# comparison silhouette score
# Computes one silhouette score per clustering approach and shows them side by
# side in a grouped bar chart. Approaches whose score cannot be computed are
# kept in the table with NaN.
from sklearn.metrics import silhouette_score

methods = []
silhouette_score_results = []
for clustering in clustering_results:
    data = clustering["result"]
    methods.append(clustering["method"])
    try:
        # every column except "cluster" is used as the feature space
        score = silhouette_score(data.drop(["cluster"], axis=1), data["cluster"], metric="euclidean")
    except Exception as error:
        # Best effort: keep going with NaN, but say why. silhouette_score
        # rejects some inputs (e.g. a labelling with fewer than two clusters).
        # Catching Exception instead of a bare `except:` lets a kernel
        # interrupt (KeyboardInterrupt) stop the cell as expected.
        print(f"Silhouette score could not be computed for '{clustering['method']}': {error}")
        score = np.nan
    silhouette_score_results.append(score)

silhouette_score_per_clustering_method = pd.DataFrame({"Clustering approach": methods, "Silhouette score": silhouette_score_results})
# A constant blank x value places all bars in one group; the approaches are
# told apart by colour only. (Alternative that was tried: one x tick per
# approach, blanked out via fig.update_xaxes(labelalias=...).)
silhouette_score_per_clustering_method[" "] = ""
fig_silhouette_scores = px.bar(
    silhouette_score_per_clustering_method,
    x=" ",
    y="Silhouette score",
    title="Silhouette score of clustering approaches",
    color="Clustering approach",
    barmode="group"
)
fig_silhouette_scores.show()

# Visualisation after dimensionality reduction 

In [None]:
# Disabled draft, kept for reference only. It was wrapped in four quote
# characters on each side, which is a SyntaxError (the string closes after
# three quotes and the fourth opens a new, unterminated string), so it is
# commented out instead. It uses names that no visible cell of this notebook
# defines (dim_reduced_features, consultations_with_top10_diagnoses,
# top10_diagnoses) and would need adapting before being re-enabled.
#
# dim_reduced_features_with_labels = dim_reduced_features.merge(consultations_with_top10_diagnoses[top10_diagnoses["diagnosis"]].reset_index(), how="left", left_index=True, right_index=True)
# dim_reduced_features_with_labels = pd.melt(dim_reduced_features_with_labels,  id_vars=["Dimension 1","Dimension 2"], value_vars=top10_diagnoses["diagnosis"], var_name="Diagnosis", value_name="Given")
# dim_reduced_features_with_labels = dim_reduced_features_with_labels[dim_reduced_features_with_labels["Given"] == 1]

In [None]:
# Dimensions of the diagnoses data for consultations with at least one top 10 diagnosis.
diagnoses_consultations_with_at_least_one_top_10_diagnosis.shape

In [None]:
# Number of distinct index values; compare with the row count (.shape[0]) to see whether index values repeat.
diagnoses_consultations_with_at_least_one_top_10_diagnosis.index.nunique()

In [None]:
# Preview: left-join the diagnoses of the top 10 subset onto the reduced
# features by index; cells left empty by the join (and any other missing
# values) are shown as "Not clustered". Nothing is stored, the result is only displayed.
pd.merge(
    features_after_dimensionality_reduction,
    diagnoses_consultations_with_at_least_one_top_10_diagnosis,
    how="left",
    left_index=True,
    right_index=True,
).fillna("Not clustered")

In [None]:
# Display the diagnoses of consultations with at least one top 10 diagnosis (as loaded from disk).
diagnoses_consultations_with_at_least_one_top_10_diagnosis

In [None]:
# Display the features of consultations with at least one top 10 diagnosis (as loaded from disk).
features_consultations_with_at_least_one_top_10_diagnosis

In [None]:
# Display the features after dimensionality reduction (as loaded from disk).
features_after_dimensionality_reduction

In [None]:
# Scatter plot of the dimensionality-reduced features, one figure per
# clustering approach, coloured by the cluster label of each row.
for clustering in clustering_results:
    # Attach only the "cluster" column of the result (pandas aligns it on the
    # index); assigning the whole result DataFrame to one column is an error
    # when it has several columns. `.assign` returns a new frame, so the shared
    # `features_after_dimensionality_reduction` is not modified and the cell
    # can be re-run safely. Rows without a label in this result get NaN.
    data = features_after_dimensionality_reduction.assign(
        Cluster=clustering["result"]["cluster"]
    )
    fig = px.scatter(
        data,
        x="Dimension 1",
        y="Dimension 2",
        color="Cluster",
        title=f"Clustering result after UMAP dimensionality reduction <br><sup>for clustering approach:'{clustering['method']}'</sup>"
    )
    fig.show()

# Descriptive statistics
- feature distribution per cluster --> which feature decides the clustering result?
- diagnoses distribution per cluster --> which cluster corresponds to which diagnosis?

# Supervised learning

In [None]:
# function for building classifier

# function for computing and visualising feature importance through shap

## Feature based classifier for cluster labels
- Accuracy through cv f1 score --> how well did the cluster approach cluster?
- feature importance through SHAP value analysis --> how many and which features determine which cluster?

## Diagnosis based classifier for cluster labels
- Accuracy through cv f1 score --> how well do diagnoses explain clustering?
- feature importance through SHAP value analysis --> how many and which diagnoses determine which cluster?

## Missingness based classifier for cluster labels
- Accuracy through cv f1 score --> how well does missingness of features explain clustering?
- feature importance through SHAP value analysis --> how many and which features' missingness determine which cluster?