In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install pycaret &> /dev/null


In [19]:
import numpy as np
from pycaret.datasets import get_data
from pycaret.clustering import *
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Load a sample dataset
data = get_data('wholesale')
# data = data.drop(columns=['species'])

# Define the preprocessing methods
preprocessing_methods = ['None', 'Normalize', 'PCA', 'Transform', 'PCA+Transform', 'PCA+Transform+Normalize']

# Define the clustering techniques
clustering_techniques = ['kmeans', 'hclust', 'meanshift']  # Updated clustering techniques

# Define the number of clusters
num_clusters = [3, 4, 5]

# Initialize dictionaries to store the results matrices for each clustering technique
results_matrices = {}

# Iterate over each clustering technique
for cluster_method in clustering_techniques:
    # Initialize an empty DataFrame to store results for this clustering method
    results_matrix = np.zeros((3, len(preprocessing_methods)), dtype=float)

    # Iterate over each preprocessing method
    for i, prep_method in enumerate(preprocessing_methods):
        # Apply the preprocessing method
        if 'Normalize' in prep_method:
            exp_clf_setup = setup(data, normalize=True, verbose=False)
        elif 'PCA' in prep_method:
            exp_clf_setup = setup(data, pca=True, pca_components=2, verbose=False)
        elif 'Transform' in prep_method:
            exp_clf_setup = setup(data, transformation=True, transformation_method='yeo-johnson', verbose=False)
        else:
            exp_clf_setup = setup(data, verbose=False)
        
        preprocessed_data = get_config('X')
        
        # Initialize lists to store the scores for each number of clusters
        silhouette_scores = []
        calinski_scores = []
        davies_scores = []
        
        # Iterate over each number of clusters
        for n_clusters in num_clusters:
            # Initialize the clustering model
            if cluster_method == 'kmeans':
                model = create_model('kmeans', num_clusters=n_clusters)
            elif cluster_method == 'hclust':
                model = create_model('hclust', linkage='ward', num_clusters=n_clusters)
            elif cluster_method == 'meanshift':
                model = create_model('meanshift')
            
            # Fit the model
            model.fit(preprocessed_data)
            save_model(model, f'{cluster_method}_{prep_method}_model')

            # Get cluster labels
            labels = model.labels_
            
            # Calculate silhouette score
            silhouette = silhouette_score(preprocessed_data, labels)
            silhouette_scores.append(silhouette)
            
            # Calculate calinski harabasz score
            calinski = calinski_harabasz_score(preprocessed_data, labels)
            calinski_scores.append(calinski)
            
            # Calculate davies bouldin score
            davies = davies_bouldin_score(preprocessed_data, labels)
            davies_scores.append(davies)
        
        # Store the scores in the results matrix
        results_matrix[0, i] = np.mean(silhouette_scores)
        results_matrix[1, i] = np.mean(calinski_scores)
        results_matrix[2, i] = np.mean(davies_scores)
    
    # Store the results matrix in the dictionary
    results_matrices[cluster_method] = results_matrix

# Writing each matrix to a CSV file
for algorithm, matrix in results_matrices.items():
    np.savetxt(f'{algorithm}_results.csv', matrix, delimiter=',', header=','.join(preprocessing_methods), comments='')


Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4784,210.1526,0.8895,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3956,207.8186,0.904,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3878,214.2765,0.8835,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3568,139.3494,1.1736,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3683,131.5821,1.1454,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3517,138.6787,1.1397,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5233,285.4804,0.751,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4682,306.9175,0.7219,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4519,353.7661,0.704,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.8554,13039.7633,0.3389,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5181,15162.6315,0.6203,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.462,14221.6229,0.7017,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.525,285.5124,0.7489,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4636,307.0293,0.7281,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4519,353.7661,0.704,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3568,139.3494,1.1736,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3414,130.689,1.2204,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3503,139.6243,1.1441,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.375,183.3597,0.9783,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3483,185.2559,0.9701,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3522,196.954,0.9106,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.36,126.2475,1.1551,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3623,120.7561,0.8255,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3178,123.5061,0.896,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5253,245.3486,0.7324,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4307,288.667,0.7928,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4316,290.1487,0.7395,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.8554,13039.7633,0.3389,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5109,14291.9587,0.6054,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.453,14101.6445,0.7125,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5253,245.3486,0.7324,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4307,288.667,0.7928,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4316,290.1487,0.7395,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.36,126.2475,1.1551,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3623,120.7561,0.8255,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3178,123.5061,0.896,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3644,54.7541,0.5765,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3644,54.7541,0.5765,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3644,54.7541,0.5765,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4054,30.8599,0.4229,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4054,30.8599,0.4229,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4054,30.8599,0.4229,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4897,100.6653,0.4861,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4897,100.6653,0.4861,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4897,100.6653,0.4861,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.9076,14114.2429,0.1625,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.9076,14114.2429,0.1625,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.9076,14114.2429,0.1625,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4897,100.6653,0.4861,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4897,100.6653,0.4861,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4897,100.6653,0.4861,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4054,30.8599,0.4229,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4054,30.8599,0.4229,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4054,30.8599,0.4229,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved


In [38]:
kmeans = pd.read_csv('/kaggle/working/kmeans_results.csv')
kmeans

Unnamed: 0,None,Normalize,PCA,Transform,PCA+Transform,PCA+Transform+Normalize
0,0.42058,0.417179,0.417139,0.425333,0.415965,0.416093
1,210.749213,210.723848,210.289695,210.555651,211.009439,211.010024
2,0.892355,0.910384,0.907081,0.885442,0.914935,0.915299


In [39]:
hclust = pd.read_csv('/kaggle/working/hclust_results.csv')
hclust

Unnamed: 0,None,Normalize,PCA,Transform,PCA+Transform,PCA+Transform+Normalize
0,0.358504,0.358504,0.358504,0.358504,0.358504,0.358504
1,188.523179,188.523179,188.523179,188.523179,188.523179,188.523179
2,0.952983,0.952983,0.952983,0.952983,0.952983,0.952983


In [40]:
meanshift = pd.read_csv('/kaggle/working/meanshift_results.csv')
meanshift

Unnamed: 0,None,Normalize,PCA,Transform,PCA+Transform,PCA+Transform+Normalize
0,0.364403,0.364403,0.364403,0.364403,0.364403,0.364403
1,54.754134,54.754134,54.754134,54.754134,54.754134,54.754134
2,0.576453,0.576453,0.576453,0.576453,0.576453,0.576453


In [None]:
# After writing the CSV files, apply plot_model
for cluster_method in clustering_techniques:
    for prep_method in preprocessing_methods:
        # Load the model
        try:
            model = load_model(f'{cluster_method}_{prep_method}_model')
            
            # Plot elbow plot
#             plot_elbow = plot_model(model, plot='elbow')
            
            # Plot distribution plot
#             plot_distribution = plot_model(model, plot='distribution')
            plot_distribution = plot_model(model, plot = 'tsne')
            # Display or save the plots as needed
#             if plot_elbow is not None:
#                 plot_elbow.show()
#             else:
#                 print(f'Elbow plot not available for {cluster_method}_{prep_method}.')
            
            if plot_distribution is not None:
                plot_distribution.show()
            else:
                print(f'Distribution plot for {cluster_method}_{prep_method}.')
                
        except FileNotFoundError:
            print(f'Model {cluster_method}_{prep_method}_model not found.')


Transformation Pipeline and Model Successfully Loaded


Distribution plot for kmeans_None.
Transformation Pipeline and Model Successfully Loaded


Distribution plot for kmeans_Normalize.
Transformation Pipeline and Model Successfully Loaded


Distribution plot for kmeans_PCA.
Transformation Pipeline and Model Successfully Loaded


Distribution plot for kmeans_Transform.
Transformation Pipeline and Model Successfully Loaded


Distribution plot for kmeans_PCA+Transform.
Transformation Pipeline and Model Successfully Loaded


Distribution plot for kmeans_PCA+Transform+Normalize.
Transformation Pipeline and Model Successfully Loaded


Distribution plot for hclust_None.
Transformation Pipeline and Model Successfully Loaded


Distribution plot for hclust_Normalize.
Transformation Pipeline and Model Successfully Loaded


Distribution plot for hclust_PCA.
Transformation Pipeline and Model Successfully Loaded


Distribution plot for hclust_Transform.
Transformation Pipeline and Model Successfully Loaded


Distribution plot for hclust_PCA+Transform.
Transformation Pipeline and Model Successfully Loaded


Distribution plot for hclust_PCA+Transform+Normalize.
Transformation Pipeline and Model Successfully Loaded


Distribution plot for meanshift_None.
Transformation Pipeline and Model Successfully Loaded


Distribution plot for meanshift_PCA.
Transformation Pipeline and Model Successfully Loaded
