# Clustering

#### Dependencies

In [None]:
import pm4py
import numpy as np
import pandas as pd
from sklearn.metrics import silhouette_score
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

#### Feature Sets Import

##### Categorical

In [None]:
# Load the pickled categorical feature table and drop its
# 'case:concept:name' column in one step.
non_conforming_categorical_features = pd.read_pickle(
    "non_conforming_categorical_features.pkl"
).drop(columns=['case:concept:name'])

##### Numerical

In [None]:
# Load the pickled numerical feature table.
non_conforming_numerical_features = pd.read_pickle("non_conforming_numerical_features.pkl")

# Set aside an independent one-column frame holding 'case:concept:name'
# before that column is removed from the feature table; it is used further
# down to attach cluster labels to cases.
non_conf_idx = non_conforming_numerical_features.loc[:, ["case:concept:name"]].copy()

non_conforming_numerical_features = non_conforming_numerical_features.drop(
    columns=['case:concept:name']
)

#### Log Import

In [None]:
# Load the pickled event log. It is merged on 'case:concept:name' further
# down, so it is presumably a DataFrame with one row per event.
# NOTE(review): the file name reads "non-conforminlog" -- possibly a typo for
# "non-conforming-log"; confirm against the file on disk before changing it.
non_conforming_log = pd.read_pickle("filtered-non-conforminlog.pkl")

## Modeling

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the numerical features, then apply it to the same table.
# The fitted scaler stays available as `scaler`.
scaler = StandardScaler().fit(non_conforming_numerical_features)
X_num_scaled = scaler.transform(non_conforming_numerical_features)

### KMeans

In [None]:
from sklearn.cluster import KMeans

# Fit one KMeans model per candidate cluster count (3, 4, 5) on the scaled
# numerical features, all with the same random seed.
kmeans_3, kmeans_4, kmeans_5 = (
    KMeans(n_clusters=k, random_state=42).fit(X_num_scaled) for k in (3, 4, 5)
)

# Cluster index assigned to each row by the corresponding fitted model.
kmeans_3_labels = kmeans_3.labels_
kmeans_4_labels = kmeans_4.labels_
kmeans_5_labels = kmeans_5.labels_

### DBSCAN

In [None]:
from sklearn.cluster import DBSCAN

# Density-based clustering of the scaled numerical features.
dbscan = DBSCAN(eps=0.5, min_samples=5).fit(X_num_scaled)

# Per-row cluster label as stored on the fitted estimator.
dbscan_labels = dbscan.labels_

### Louvain (numerical)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from networkx.algorithms.community import louvain_communities

# Pairwise cosine similarity between the standardised feature rows.
similarity_matrix = cosine_similarity(X_num_scaled)

# Cosine similarity lies in [-1, 1], and standardised (mean-centred) features
# produce many negative values. Louvain optimises modularity, whose null model
# is built from node strengths and is only meaningful for non-negative edge
# weights. Negative similarities are therefore clipped to zero; zero entries
# produce no edge in the graph. The raw matrix stays in `similarity_matrix`.
# NOTE(review): the diagonal (self-similarity) still becomes a self-loop per
# node, as before -- confirm whether that is intended.
non_negative_similarity = np.clip(similarity_matrix, 0, None)

# Weighted graph: node i is row i of X_num_scaled.
G_num = nx.from_numpy_array(non_negative_similarity)

# Seeded so the partition is reproducible across runs.
louvain_num_clusters = louvain_communities(G_num, seed=42)

### Louvain (categorical)

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import pairwise_distances

# One-hot encode the categorical features and densify the encoder output.
encoder = OneHotEncoder()
encoded_categories = encoder.fit_transform(non_conforming_categorical_features)
X_cat_encoded = encoded_categories.toarray()

# Jaccard similarity is the complement of the pairwise Jaccard distance.
jaccard_distance = pairwise_distances(X_cat_encoded, metric='jaccard')
jaccard_sim = 1 - jaccard_distance

# Weighted graph (node i is row i of X_cat_encoded), partitioned with a
# seeded Louvain run.
G_cat = nx.from_numpy_array(jaccard_sim)
louvain_cat_clusters = louvain_communities(G_cat, seed=42)

## Evaluation

In [None]:
def evaluate_clustering(name, labels, data=None, is_graph=False, graph=None):
    """Summarise one clustering result as a row for the comparison table.

    Args:
        name: Display name of the method; stored under ``'Method'``.
        labels: One cluster label per sample (or per graph node), in order.
        data: Feature matrix used for the silhouette score. Only needed when
            ``is_graph`` is False.
        is_graph: If True, score the partition by modularity on ``graph``;
            otherwise by silhouette on ``data``.
        graph: networkx graph whose node ``i`` corresponds to position ``i``
            in ``labels``. Required when ``is_graph`` is True.

    Returns:
        Dict with keys ``'Method'``, ``'Clusters'``, ``'Silhouette'``,
        ``'Modularity'`` and ``'Cluster Sizes'``. The score that does not
        apply is ``None``. ``'Silhouette'`` is also ``None`` when the
        number of distinct labels is outside ``2 .. n_samples - 1``.

    Raises:
        ValueError: If ``is_graph`` is True but ``graph`` is None, or if a
            silhouette score is computable but ``data`` is None.

    Note:
        Every distinct label counts as a cluster. That includes a noise
        label such as DBSCAN's ``-1``, which is also passed to the
        silhouette computation like any other cluster.
    """
    labels = np.array(labels)
    unique_labels = np.unique(labels)
    n_clusters = len(unique_labels)

    sil_score = None
    mod_score = None

    if is_graph:
        if graph is None:
            raise ValueError("graph must be provided when is_graph=True")
        # Imported lazily so the non-graph path does not need it.
        from networkx.algorithms.community.quality import modularity

        # Rebuild the partition as sets of node indices, one set per label.
        communities = [
            set(np.flatnonzero(labels == label).tolist()) for label in unique_labels
        ]
        mod_score = modularity(graph, communities)
    elif 1 < n_clusters < len(labels):
        # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1;
        # outside that range it raises, so the score is left as None instead.
        if data is None:
            raise ValueError("data must be provided when is_graph=False")
        sil_score = silhouette_score(data, labels)

    cluster_sizes = pd.Series(labels).value_counts().to_dict()

    return {
        'Method': name,
        'Clusters': n_clusters,
        'Silhouette': sil_score,
        'Modularity': mod_score,
        'Cluster Sizes': cluster_sizes
    }


In [None]:
def _labels_from_communities(communities, n_nodes):
    """Return an int array of length n_nodes giving each node's community index."""
    node_to_community = np.zeros(n_nodes, dtype=int)
    for community_id, members in enumerate(communities):
        for member in members:
            node_to_community[member] = community_id
    return node_to_community


# Methods that label rows of the scaled numerical features are all scored
# against that same matrix.
results = [
    evaluate_clustering(method_name, method_labels, data=X_num_scaled)
    for method_name, method_labels in (
        ('KMeans_3', kmeans_3_labels),
        ('KMeans_4', kmeans_4_labels),
        ('KMeans_5', kmeans_5_labels),
        ('DBSCAN', dbscan_labels),
    )
]

# The Louvain partitions come back as collections of node sets. Flatten each
# into a per-node label array and score it on its own graph.
louvain_num_labels = _labels_from_communities(louvain_num_clusters, len(X_num_scaled))
results.append(
    evaluate_clustering('Louvain (Numerical)', louvain_num_labels, is_graph=True, graph=G_num)
)

louvain_cat_labels = _labels_from_communities(louvain_cat_clusters, len(X_cat_encoded))
results.append(
    evaluate_clustering('Louvain (Categorical)', louvain_cat_labels, is_graph=True, graph=G_cat)
)

# One row per method.
comparison_df = pd.DataFrame(results)

comparison_df

# Graph Visualization

In [None]:
# Map every node of the numerical similarity graph to the index of the
# Louvain community that contains it.
node_labels = {}
for community_id, members in enumerate(louvain_num_clusters):
    node_labels.update(dict.fromkeys(members, community_id))

# Community index per node, in G_num's node iteration order (used as colours).
node_colors = [node_labels[node] for node in G_num]

In [None]:
# Map every node of the categorical similarity graph to the index of the
# Louvain community that contains it.
node_cat_labels = {}
for community_id, members in enumerate(louvain_cat_clusters):
    node_cat_labels.update(dict.fromkeys(members, community_id))

# Community index per node, in G_cat's node iteration order (used as colours).
node_cat_colors = [node_cat_labels[node] for node in G_cat]

In [None]:
# Draw the numerical similarity graph with nodes coloured by Louvain community.
plt.figure(figsize=(12, 10), facecolor='white')

# Seeded spring layout so the picture is the same on every run.
pos = nx.spring_layout(G_num, seed=42)

node_style = dict(node_color=node_colors, cmap=plt.cm.tab10, node_size=40, alpha=0.8)
# Edges are drawn thin and nearly transparent.
edge_style = dict(alpha=0.05, width=0.5)

nx.draw_networkx_nodes(G_num, pos, **node_style)
nx.draw_networkx_edges(G_num, pos, **edge_style)

plt.title("Louvain Graph Clustering")
plt.axis("off")
plt.show()

In [None]:
# Louvain (numerical) community index per node, ordered by node id.
cluster_label_series = pd.Series(node_labels).sort_index()

# Attach the labels positionally to both feature tables.
# NOTE(review): this assumes row i of each table corresponds to node i of
# G_num, and in particular that the categorical table has the same row order
# as the numerical one -- confirm against how the pickles were produced.
for feature_table in (
    non_conforming_numerical_features,
    non_conforming_categorical_features,
):
    feature_table['cluster'] = cluster_label_series.values

## Export Clusters

In [None]:
# Persist both feature tables (now carrying the 'cluster' column) as pickles.
for clustered_table, pickle_path in (
    (non_conforming_numerical_features, "clustered_non_conforming_numerical2.pkl"),
    (non_conforming_categorical_features, "clustered_non_conforming_categorical2.pkl"),
):
    clustered_table.to_pickle(pickle_path)

## Create & Export Sub-Event Logs From Clusters

In [None]:
import warnings

# Attach the Louvain (numerical) community index to the case identifiers.
# The assignment is positional: row i of non_conf_idx receives the label of
# node i.
non_conf_idx["cluster"] = cluster_label_series.values

# Tag every event with the cluster of its case. The inner join drops events
# whose 'case:concept:name' does not appear in non_conf_idx.
non_conforming_log_with_clusters = non_conforming_log.merge(
    non_conf_idx,
    on='case:concept:name',
    how='inner'
)

# Louvain does not produce a fixed number of communities, yet only clusters
# 0, 1 and 2 are split out below. Warn instead of silently dropping the rest.
_unextracted_clusters = sorted(
    set(non_conf_idx["cluster"].unique().tolist()) - {0, 1, 2}
)
if _unextracted_clusters:
    warnings.warn(
        f"Clusters {_unextracted_clusters} exist but are not extracted into "
        "sub-logs; only clusters 0, 1 and 2 are handled."
    )

# One sub-log per handled cluster (still carrying the 'cluster' column).
_event_cluster = non_conforming_log_with_clusters['cluster']
cluster_0_log = non_conforming_log_with_clusters[_event_cluster == 0]
cluster_1_log = non_conforming_log_with_clusters[_event_cluster == 1]
cluster_2_log = non_conforming_log_with_clusters[_event_cluster == 2]

# Same sub-logs without the helper column.
non_conforming_cluster_0_log = cluster_0_log.drop(columns='cluster')
non_conforming_cluster_1_log = cluster_1_log.drop(columns='cluster')
non_conforming_cluster_2_log = cluster_2_log.drop(columns='cluster')

In [None]:
# Write each per-cluster sub-log to its own pickle file.
cluster_log_exports = {
    "non_conforming_cluster_0_log.pkl": non_conforming_cluster_0_log,
    "non_conforming_cluster_1_log.pkl": non_conforming_cluster_1_log,
    "non_conforming_cluster_2_log.pkl": non_conforming_cluster_2_log,
}
for pickle_path, cluster_log in cluster_log_exports.items():
    cluster_log.to_pickle(pickle_path)