In [14]:
import math
import time

import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples, silhouette_score

# Install KSil from GitHub: !pip install git+https://github.com/semoglou/ksil.git
from ksil import KSil

### Silhouette Class for Clustering Evaluation
The `Silhouette` class provides methods for computing different types of silhouette scores to evaluate clustering quality.

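*   **Micro-averaged silhouette score**: Mean of the individual silhouette scores of all samples.
*   **Macro-averaged silhouette score**: Mean of the cluster-level silhouette scores (the mean silhouette of each cluster, averaged over clusters).
*   **Convex combination**: `alpha * micro + (1 - alpha) * macro`, a weighted mix of the micro and macro silhouette scores (default `alpha = 0.5`).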


In [2]:
class Silhouette:
    """Silhouette-based scores for evaluating a clustering.

    All methods are static; the per-sample silhouette values come from
    scikit-learn's ``silhouette_samples`` / ``silhouette_score``.
    """

    @staticmethod
    def _macro_from_samples(sample_scores, labels):
        """Average the per-cluster means of already-computed silhouette values."""
        # Work on an ndarray so that `labels == lbl` is always an element-wise
        # boolean mask, whatever container the caller passed in.
        labels = np.asarray(labels)
        cluster_means = [
            np.mean(sample_scores[labels == lbl]) for lbl in np.unique(labels)
        ]
        return np.mean(cluster_means) if cluster_means else 0

    @staticmethod
    def Micro(X, labels):
        """
        Compute the micro-averaged silhouette score for all samples.
        Parameters:
        - X: array-like of shape (n_samples, n_features). Input data points.
        - labels: array-like of shape (n_samples,). Cluster labels for each sample.
        Returns:
        - float: The micro-averaged silhouette score.
        """
        return silhouette_score(X, labels)

    @staticmethod
    def Macro(X, labels):
        """
        Compute the macro-averaged silhouette score.
        Each cluster contributes the mean silhouette of its own samples, and
        those cluster means are then averaged.
        Parameters:
        - X: array-like of shape (n_samples, n_features). Input data points.
        - labels: array-like of shape (n_samples,). Cluster labels for each sample.
        Returns:
        - float: The macro-averaged silhouette score.
        """
        return Silhouette._macro_from_samples(silhouette_samples(X, labels), labels)

    @staticmethod
    def Convex(X, labels, alpha=0.5):
        """
        Compute a convex combination of micro-averaged and macro-averaged silhouette scores.
        Parameters:
        - X: array-like of shape (n_samples, n_features). Input data points.
        - labels: array-like of shape (n_samples,). Cluster labels for each sample.
        - alpha: float in [0, 1]. Weight of the micro-averaged score; the
          macro-averaged score gets weight (1 - alpha). Defaults to 0.5.
        Returns:
        - float: alpha * micro + (1 - alpha) * macro.
        Raises:
        - ValueError: if alpha is outside [0, 1] (the result would no longer be
          a convex combination).
        """
        if not 0 <= alpha <= 1:
            raise ValueError(f"alpha must be in [0, 1], got {alpha!r}")
        # Compute the per-sample silhouettes once and derive both averages from
        # them, instead of running the pairwise computation twice.
        sample_scores = silhouette_samples(X, labels)
        S_micro = np.mean(sample_scores)
        S_macro = Silhouette._macro_from_samples(sample_scores, labels)
        return alpha * S_micro + (1 - alpha) * S_macro

---

### Basic Usage of K-Sil Clustering
We demonstrate the core functionality of K-Sil clustering, using a simple synthetic dataset.
We apply K-Sil to test the following operations:
- `fit` – Train the model on data
- `labels_` & `cluster_centers_` – Retrieve assigned cluster labels and final centroids
- `predict` – Assign new samples to clusters
- `transform` – Compute distances of points to each cluster center (centroid distance representation)
- `fit_predict` – Train the model and get cluster assignments in one step
- `fit_transform` – Train and transform data to centroid-distance space

In [3]:
# Toy dataset: eight 2-D points.
X = [
    [1, 2], [3, 4], [5, 6], [7, 8],
    [9, 10], [10, 11], [12, 13], [14, 15],
]
print(f"Data (X): {X}")

# Alternatives to the settings used below:
#   silhouette_objective='micro' -> prioritize the micro-averaged silhouette score
#   sensitivity=<int>            -> fixed weight-sensitivity value instead of the default "auto"
#   weighting='power'            -> power weighting scheme
#   approximation=True           -> enable approximation of the silhouette scores
#   sample_size=x                -> sample x% of the data, or exactly x points when x > 1
#                                   (instead of the default sample_size=-1)
ksil_options = {
    "n_clusters": 4,
    "silhouette_objective": 'macro',
    "weighting": 'exponential',
    "approximation": False,
}
model = KSil(**ksil_options).fit(X)

print(f"\nNumber of iterations during fitting: {model.n_iter_}")

Data (X): [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [10, 11], [12, 13], [14, 15]]

Number of iterations during fitting: 4


In [4]:
# Labels stored on the model that was fitted above.
print(f"\nAssigned Cluster Labels: {model.labels_}")

# One-step alternative: build a new estimator with the same settings and let
# fit_predict train it and hand back the labels directly.
fit_predict_options = dict(
    n_clusters=4,
    silhouette_objective='macro',
    weighting='exponential',
    approximation=False,
)
model_labels = KSil(**fit_predict_options).fit_predict(X)
print(f"\nLabels: {model_labels}")


Assigned Cluster Labels: [0 0 3 3 2 2 1 1]

Labels: [0 0 3 3 2 2 1 1]


In [5]:
# Per the label and the recorded output, `cluster_centers_` prints as a pandas
# Series (dtype: object) holding one coordinate list per cluster label.
print(f"\nFinal Cluster Centroids (as pd.Series):\n\n {model.cluster_centers_}")
# list(...) turns it into a plain Python list of centroid coordinates.
print(f"\nFinal Cluster Centroids (as array):\n\n {list(model.cluster_centers_)}")


Final Cluster Centroids (as pd.Series):

 0      [2.817830592990843, 3.817830592990844]
1    [13.244918662403707, 14.244918662403709]
2     [9.622459331201854, 10.622459331201854]
3                                  [7.0, 8.0]
dtype: object

Final Cluster Centroids (as array):

 [[2.817830592990843, 3.817830592990844], [13.244918662403707, 14.244918662403709], [9.622459331201854, 10.622459331201854], [7.0, 8.0]]


In [6]:
# New points to classify; several lie well outside the range of the data the
# model was fitted on.
y = [[-1,-3], [0,0], [100,100], [32,33], [10.5,10], [11,11], [7,7], [6,6]]
print(f"vector y: {y}")
# predict() uses the already-fitted model and yields one cluster label per row of y.
print(f"\nPredicted labels for y: {model.predict(y)}")

vector y: [[-1, -3], [0, 0], [100, 100], [32, 33], [10.5, 10], [11, 11], [7, 7], [6, 6]]

Predicted labels for y: [0 0 1 1 2 2 3 3]


In [7]:
# transform() maps each row of y to its distances from the fitted cluster
# centers (one column per cluster, so 4 columns here).
print(f"\nTransformation of y:\n\n {model.transform(y)}")

# or we could get the centroid-distance representation of X
# (note: this fits a fresh KSil with only n_clusters set, not the `model` used above)
print(f"\n\nTransformation of X:\n\n {KSil(n_clusters=4).fit_transform(X)}")


Transformation of y:

 [[  7.8140031   22.36749712  17.27449103  13.60147051]
 [  4.74510271  19.45110737  14.33277244  10.63014581]
 [136.73106363 121.9851553  127.10800381 130.8166656 ]
 [ 41.26981975  26.52369039  31.64662151  35.35533906]
 [  9.86077813   5.05508782   1.07588719   4.03112887]
 [ 10.88721514   3.9457771    1.42834003   5.        ]
 [  5.25516347   9.56492841   4.47208056   1.        ]
 [  3.8585056   10.97577014   5.87276271   2.23606798]]


Transformation of X:

 [[1.43422155e+00 1.83807820e+01 1.27279221e+01 8.48837637e+00]
 [1.39420558e+00 1.55523548e+01 9.89949494e+00 5.65994924e+00]
 [4.22263270e+00 1.27239277e+01 7.07106781e+00 2.83152212e+00]
 [7.05105983e+00 9.89550059e+00 4.24264069e+00 3.09499442e-03]
 [9.87948695e+00 7.06707346e+00 1.41421356e+00 2.82533213e+00]
 [1.12937005e+01 5.65285990e+00 0.00000000e+00 4.23954569e+00]
 [1.41221276e+01 2.82443278e+00 2.82842712e+00 7.06797282e+00]
 [1.69505548e+01 3.99434716e-03 5.65685425e+00 9.89639994e+00]]


---

### K-Sil Clustering: Approximation and Sampling Effect

We evaluate the effect of **silhouette approximation** and **sampling** on the performance of K-Sil clustering. We generate a synthetic dataset of 10,000 data points drawn from 5 blobs with different cluster **standard deviations**, fit K-Sil with `n_clusters=3`, and assess how each configuration affects **convergence speed** and **silhouette scores**.

#### **Configurations**
We test **four different scenarios** by varying the use of **silhouette approximation** and **sampling size**:

1. **With Silhouette Approximation & Sampling** (`approximation=True, sample_size=0.7`)  

2. **With Silhouette Approximation & No Sampling** (`approximation=True, sample_size=-1`)   

3. **Without Silhouette Approximation & Sampling** (`approximation=False, sample_size=0.7`)   

4. **Without Silhouette Approximation & No Sampling** (`approximation=False, sample_size=-1`)  

Each configuration is **timed**, and we measure:  
- **Execution time** (seconds)  
- **Number of iterations** until convergence  
- **Macro-Silhouette Score**  

This helps us compare how different **sampling** and **approximation** settings impact the **speed** and **clustering quality** of K-Sil.


In [12]:
# Synthetic benchmark data: 10000 points from 5 blobs with different spreads.
# (Generated once; `make_blobs` is imported in the notebook's import cell.)
X, _ = make_blobs(
    n_samples=10000,
    centers=5,
    cluster_std=[1.0, 2.5, 0.5, 1.5, 3.0],
    random_state=42,
)

# The four approximation / sampling combinations being compared.
# sample_size=0.7 samples a fraction of the data; sample_size=-1 disables sampling.
configs = [
    {"name": "Sampling - Approximation", "approximation": True, "sample_size": 0.7},
    {"name": "No Sampling - Approximation", "approximation": True, "sample_size": -1},
    {"name": "Sampling - No Approximation", "approximation": False, "sample_size": 0.7},
    {"name": "No Sampling - No Approximation", "approximation": False, "sample_size": -1},
]


def run_experiment(config, sensitivity=2):
    """Fit KSil on the global ``X`` for one configuration and return its metrics.

    Parameters:
    - config: dict with keys "name", "approximation" and "sample_size".
    - sensitivity: value forwarded to KSil's ``sensitivity`` argument
      (fixed at 2 by default for this experiment).

    Returns:
    - tuple: (configuration name, elapsed seconds for constructing and fitting
      the model, number of iterations, macro-averaged silhouette of the labels).
    """
    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # elapsed intervals; only the fit is timed, not the silhouette scoring below.
    start_time = time.perf_counter()
    # NOTE(review): the data has 5 blobs but K-Sil is run with n_clusters=3 — confirm intended.
    model = KSil(n_clusters=3,
                 silhouette_objective='macro',
                 weighting='exponential',
                 approximation=config["approximation"],
                 sample_size=config["sample_size"],
                 sensitivity=sensitivity).fit(X)
    elapsed_time = time.perf_counter() - start_time
    macro_silhouette = Silhouette.Macro(X, model.labels_)

    return (config["name"], elapsed_time, model.n_iter_, macro_silhouette)


# One result row per configuration, in the order listed in `configs`.
results = [run_experiment(config) for config in configs]

df_results = pd.DataFrame(results, columns=["Configuration", "Time (s)", "Iterations", "Macro-Silhouette"])

print("\nSummary (K-Sil with fixed weight-sensitivity value):\n")
display(df_results)


Summary (K-Sil with fixed weight-sensitivity value):



Unnamed: 0,Configuration,Time (s),Iterations,Macro-Silhouette
0,Sampling - Approximation,0.903975,21,0.667524
1,No Sampling - Approximation,0.979118,20,0.670666
2,Sampling - No Approximation,3.313883,6,0.670155
3,No Sampling - No Approximation,6.962808,7,0.673877


In [13]:
def run_experiment(config, sensitivity="auto"):
    """Fit KSil on the global ``X`` for one configuration and return its metrics.

    Parameters:
    - config: dict with keys "name", "approximation" and "sample_size".
    - sensitivity: value forwarded to KSil's ``sensitivity`` argument
      ("auto" by default for this experiment; the fixed-sensitivity
      experiment uses 2 instead).

    Returns:
    - tuple: (configuration name, elapsed seconds for constructing and fitting
      the model, number of iterations, macro-averaged silhouette of the labels).
    """
    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # elapsed intervals; only the fit is timed, not the silhouette scoring below.
    start_time = time.perf_counter()
    model = KSil(n_clusters=3,
                 silhouette_objective='macro',
                 weighting='exponential',
                 approximation=config["approximation"],
                 sample_size=config["sample_size"],
                 sensitivity=sensitivity).fit(X)
    elapsed_time = time.perf_counter() - start_time
    macro_silhouette = Silhouette.Macro(X, model.labels_)

    return (config["name"], elapsed_time, model.n_iter_, macro_silhouette)


# One result row per configuration, in the order listed in `configs`.
results = [run_experiment(config) for config in configs]

df_results = pd.DataFrame(results, columns=["Configuration", "Time (s)", "Iterations", "Macro-Silhouette"])

print("\nSummary (K-Sil with auto-tuned weight-sensitivity value):\n")
display(df_results)


Summary (K-Sil with auto-tuned weight-sensitivity value):



Unnamed: 0,Configuration,Time (s),Iterations,Macro-Silhouette
0,Sampling - Approximation,1.342736,12,0.671907
1,No Sampling - Approximation,1.891218,12,0.674738
2,Sampling - No Approximation,12.175815,7,0.672271
3,No Sampling - No Approximation,26.675359,9,0.674977
