Installing Ucimlrepo & Pycaret

In [1]:
!pip install ucimlrepo &> null
print("installed successfully")

!pip install pycaret &> null
print("installed successfully")

installed successfully
installed successfully


Importing the Libraries

In [2]:
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

List available datasets

In [3]:
from ucimlrepo import list_available_datasets

list_available_datasets()

-------------------------------------
The following datasets are available:
-------------------------------------
Dataset Name                                                                            ID    
------------                                                                            --    
Abalone                                                                                 1     
Adult                                                                                   2     
Auto MPG                                                                                9     
Automobile                                                                              10    
Balance Scale                                                                           12    
Breast Cancer                                                                           14    
Breast Cancer Wisconsin (Original)                                                      15    
Breast Cancer Wisconsin (Progno

Fetching The Dataset

In [6]:
from ucimlrepo import fetch_ucirepo

Heart_Disease = fetch_ucirepo(id=45)

X = Heart_Disease.data.features
Y = Heart_Disease.data.targets

print(Heart_Disease.metadata)
print(Heart_Disease.variables)

{'uci_id': 45, 'name': 'Heart Disease', 'repository_url': 'https://archive.ics.uci.edu/dataset/45/heart+disease', 'data_url': 'https://archive.ics.uci.edu/static/public/45/data.csv', 'abstract': '4 databases: Cleveland, Hungary, Switzerland, and the VA Long Beach', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 303, 'num_features': 13, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['num'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1989, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C52P4X', 'creators': ['Andras Janosi', 'William Steinbrunn', 'Matthias Pfisterer', 'Robert Detrano'], 'intro_paper': {'title': 'International application of a new probability algorithm for the diagnosis of coronary artery disease.', 'authors': 'R. Detrano, A. Jánosi, W. Steinbrunn, M. Pfisterer, J. Schmid, S. Sa

This imports all functions and classes from the PyCaret clustering module.

In [7]:
from pycaret.clustering import *
model = setup(X, verbose = False)

Retrieve a list of available clustering models in PyCaret using the models() function.

In [8]:
models_list = model.models().Name.index
print(models_list)

Index(['kmeans', 'ap', 'meanshift', 'sc', 'hclust', 'dbscan', 'optics',
       'birch'],
      dtype='object', name='ID')


Selecting specific clustering models from the list you obtained.

The model.models() invokes a method on the model object, offering details on all clustering algorithms supported by the system.

In [9]:
models_list = models_list[[0,2,4,5]]
print('Clustering Models Taken: ', models_list)

Clustering Models Taken:  Index(['kmeans', 'meanshift', 'hclust', 'dbscan'], dtype='object', name='ID')


In [10]:
parameters ={
    'No Data Processing': {'transformation': False, 'normalize': False, 'pca': False},
    'Using Normalisation': {'transformation': False, 'normalize': True, 'pca': False},
    'Using Transform': {'transformation': True, 'normalize': False, 'pca': False},
    'Using PCA': {'transformation': False, 'normalize': False, 'pca': True},
    'T+N': {'transformation': True, 'normalize': True, 'pca': False},
    'T+N+PCA': {'transformation': True, 'normalize': True, 'pca': True},
}

In [11]:
results = []

for model in models_list:
    model_results = pd.DataFrame()

    for cluster_size in range(3, 6):
        for name, args in parameters.items():
            exp = setup(X, verbose=False, **args)
            create_model(model, num_clusters=cluster_size, verbose=False)
            temp = exp.pull()

            temp['name'] = name
            temp['cluster_size'] = cluster_size

            model_results = pd.concat([model_results, temp], ignore_index=True)

    model_results.set_index(['name', 'cluster_size'], inplace=True)

    model_results_transposed = model_results.sort_index().T

    model_results_transposed.iloc[:3, :].to_csv(model + '.csv')

    print(model)
    display(model_results_transposed.iloc[:3, :])


kmeans


name,No Data Processing,No Data Processing,No Data Processing,T+N,T+N,T+N,T+N+PCA,T+N+PCA,T+N+PCA,Using Normalisation,Using Normalisation,Using Normalisation,Using PCA,Using PCA,Using PCA,Using Transform,Using Transform,Using Transform
cluster_size,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5
Silhouette,0.2821,0.2761,0.2745,0.1353,0.1203,0.1186,0.1344,0.1344,0.1195,0.1296,0.1313,0.1132,0.2812,0.2799,0.2762,0.5602,0.5377,0.5399
Calinski-Harabasz,195.5082,176.8977,169.2776,48.7304,40.4522,37.2899,48.7812,41.6978,37.1214,47.4832,40.5616,35.8809,195.4521,177.0044,169.2445,801.8385,925.1518,1016.9573
Davies-Bouldin,1.1533,1.0698,1.0775,2.3002,2.2947,2.1873,2.268,2.1532,2.1932,2.269,2.1463,2.2025,1.1562,1.0647,1.0572,0.5415,0.5238,0.5313


meanshift


name,No Data Processing,No Data Processing,No Data Processing,T+N,T+N,T+N,T+N+PCA,T+N+PCA,T+N+PCA,Using Normalisation,Using Normalisation,Using Normalisation,Using PCA,Using PCA,Using PCA,Using Transform,Using Transform,Using Transform
cluster_size,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5
Silhouette,0.6378,0.6378,0.6378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6378,0.6378,0.6378,0.0,0.0,0.0
Calinski-Harabasz,54.463,54.463,54.463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,54.463,54.463,54.463,0.0,0.0,0.0
Davies-Bouldin,0.5276,0.5276,0.5276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5276,0.5276,0.5276,0.0,0.0,0.0


hclust


name,No Data Processing,No Data Processing,No Data Processing,T+N,T+N,T+N,T+N+PCA,T+N+PCA,T+N+PCA,Using Normalisation,Using Normalisation,Using Normalisation,Using PCA,Using PCA,Using PCA,Using Transform,Using Transform,Using Transform
cluster_size,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5
Silhouette,0.2443,0.2537,0.1995,0.1426,0.1123,0.0921,0.1426,0.1123,0.0921,0.1624,0.1193,0.1062,0.2443,0.2537,0.1995,0.5585,0.5403,0.5543
Calinski-Harabasz,174.0719,154.4552,145.7324,39.0832,35.3045,31.181,39.0832,35.3045,31.181,40.7073,37.7609,32.7531,174.0719,154.4552,145.7324,781.6421,877.0004,951.6959
Davies-Bouldin,1.2725,1.1425,1.2998,2.1858,2.231,2.4554,2.1858,2.231,2.4554,2.0277,2.1313,2.285,1.2725,1.1425,1.2998,0.5343,0.5204,0.5235


dbscan


name,No Data Processing,No Data Processing,No Data Processing,T+N,T+N,T+N,T+N+PCA,T+N+PCA,T+N+PCA,Using Normalisation,Using Normalisation,Using Normalisation,Using PCA,Using PCA,Using PCA,Using Transform,Using Transform,Using Transform
cluster_size,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5
Silhouette,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Calinski-Harabasz,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Davies-Bouldin,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0




In [12]:
for model in models_list:
    model_results = pd.DataFrame()

    for cluster_size in range(3, 6):
        for name, args in parameters.items():
            exp = setup(X, verbose=False, **args)
            create_model(model, num_clusters=cluster_size, verbose=False)
            temp = exp.pull()

            temp['name'] = name
            temp['cluster_size'] = cluster_size

            model_results = pd.concat([model_results, temp], ignore_index=True)

    model_results.set_index(['name', 'cluster_size'], inplace=True)

    model_results_transposed = model_results.sort_index().T

    model_results_transposed.iloc[:3, :].to_csv(model + '.csv')

    print(model)
    display(model_results_transposed.iloc[:3, :])

# Save the CSV file of the last result
model_results_transposed.iloc[:3, :].to_csv('last_result.csv')


kmeans


name,No Data Processing,No Data Processing,No Data Processing,T+N,T+N,T+N,T+N+PCA,T+N+PCA,T+N+PCA,Using Normalisation,Using Normalisation,Using Normalisation,Using PCA,Using PCA,Using PCA,Using Transform,Using Transform,Using Transform
cluster_size,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5
Silhouette,0.2821,0.2799,0.2778,0.1341,0.1363,0.1211,0.1363,0.1358,0.118,0.1296,0.1332,0.1092,0.2815,0.2755,0.2779,0.5602,0.5419,0.5449
Calinski-Harabasz,195.5082,177.0044,169.3746,48.7839,41.7599,36.2326,48.7626,41.7209,37.2119,47.4723,40.9284,35.6691,195.452,176.2881,169.366,801.8385,926.1497,1011.1151
Davies-Bouldin,1.1533,1.0647,1.0674,2.2874,2.0929,2.1817,2.26,2.1014,2.1798,2.2612,2.1005,2.2028,1.155,1.0679,1.0666,0.5415,0.5273,0.5281


meanshift


name,No Data Processing,No Data Processing,No Data Processing,T+N,T+N,T+N,T+N+PCA,T+N+PCA,T+N+PCA,Using Normalisation,Using Normalisation,Using Normalisation,Using PCA,Using PCA,Using PCA,Using Transform,Using Transform,Using Transform
cluster_size,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5
Silhouette,0.6378,0.6378,0.6378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6378,0.6378,0.6378,0.0,0.0,0.0
Calinski-Harabasz,54.463,54.463,54.463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,54.463,54.463,54.463,0.0,0.0,0.0
Davies-Bouldin,0.5276,0.5276,0.5276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5276,0.5276,0.5276,0.0,0.0,0.0


hclust


name,No Data Processing,No Data Processing,No Data Processing,T+N,T+N,T+N,T+N+PCA,T+N+PCA,T+N+PCA,Using Normalisation,Using Normalisation,Using Normalisation,Using PCA,Using PCA,Using PCA,Using Transform,Using Transform,Using Transform
cluster_size,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5
Silhouette,0.2443,0.2537,0.1995,0.1426,0.1123,0.0921,0.1426,0.1123,0.0921,0.1624,0.1193,0.1062,0.2443,0.2537,0.1995,0.5585,0.5403,0.5543
Calinski-Harabasz,174.0719,154.4552,145.7324,39.0832,35.3045,31.181,39.0832,35.3045,31.181,40.7073,37.7609,32.7531,174.0719,154.4552,145.7324,781.6421,877.0004,951.6959
Davies-Bouldin,1.2725,1.1425,1.2998,2.1858,2.231,2.4554,2.1858,2.231,2.4554,2.0277,2.1313,2.285,1.2725,1.1425,1.2998,0.5343,0.5204,0.5235


dbscan


name,No Data Processing,No Data Processing,No Data Processing,T+N,T+N,T+N,T+N+PCA,T+N+PCA,T+N+PCA,Using Normalisation,Using Normalisation,Using Normalisation,Using PCA,Using PCA,Using PCA,Using Transform,Using Transform,Using Transform
cluster_size,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5
Silhouette,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Calinski-Harabasz,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Davies-Bouldin,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [13]:
num_rows, num_features = X.shape
print(f"Number of Rows in the Dataset: {num_rows}")
print(f"Number of Features in the Dataset: {num_features}")

Number of Rows in the Dataset: 303
Number of Features in the Dataset: 13


In [14]:
print("Clustering Algorithms Used:", ', '.join(models_list))

Clustering Algorithms Used: kmeans, meanshift, hclust, dbscan


In [23]:
from sklearn.metrics import silhouette_score
from sklearn.datasets import make_blobs
from pycaret.clustering import *

# ... (rest of your code)

# Generate synthetic labels for silhouette score calculation
synthetic_labels = make_blobs(n_samples=num_rows, random_state=42, centers=3)[1]

best_algorithm = None
best_num_clusters = None
best_silhouette_score = -1  # Initialize with a low value

for model in models_list:
    for cluster_size in range(3, 6):
        for name, args in parameters.items():
            exp = setup(X, verbose=False, **args)
            create_model(model, num_clusters=cluster_size, verbose=False)
            temp = exp.pull()

            temp['name'] = name
            temp['cluster_size'] = cluster_size

            model_results = pd.concat([model_results, temp], ignore_index=True)

        model_results.set_index(['name', 'cluster_size'], inplace=True)

        model_results_transposed = model_results.sort_index().T

        # Update synthetic labels to match the number of samples
        synthetic_labels = make_blobs(n_samples=model_results_transposed.shape[0], random_state=42, centers=3)[1]

        silhouette_score_value = silhouette_score(model_results_transposed.values, synthetic_labels, metric='euclidean')

        # Update best values if the current configuration has a higher silhouette score
        if silhouette_score_value > best_silhouette_score:
            best_algorithm = model
            best_num_clusters = cluster_size
            best_silhouette_score = silhouette_score_value

# Display the best results
print(f"\nBest Clustering Algorithm: {best_algorithm}")
print(f"Best Number of Clusters: {best_num_clusters}")
print(f"Best Silhouette Score: {best_silhouette_score}")




Best Clustering Algorithm: meanshift
Best Number of Clusters: 5
Best Silhouette Score: 0.00017286201501372686
