<a href="https://colab.research.google.com/github/rhy004/Clustering/blob/main/Clustering_SalesTransactions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# INSTALLING LIBRARIES

In [1]:
!pip install pycaret &> /dev/null
print("pycaret installed")

pycaret installed


In [2]:
!pip install pyarrow  &> /dev/null

In [3]:
import pandas as pd
import numpy as np

In [38]:
df = pd.read_csv("/content/Sales_Transactions_Dataset_Weekly.csv")

In [39]:
from pycaret.utils import version
version()

'3.2.0'

# SETUP AND FUNCTIONS

In [40]:
from pycaret.clustering import *
Clustering = setup(df)

Unnamed: 0,Description,Value
0,Session id,2929
1,Original data shape,"(811, 107)"
2,Transformed data shape,"(811, 917)"
3,Numeric features,106
4,Categorical features,1
5,Preprocess,True
6,Imputation type,simple
7,Numeric imputation,mean
8,Categorical imputation,mode
9,Maximum one-hot encoding,-1


In [41]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score

def eval(df,labels):
    silhouette = silhouette_score(df, labels)
    print(f"Silhouette Score: {silhouette:.2f}")
    ch_index = calinski_harabasz_score(df, labels)
    print(f"Calinski-Harabasz Index: {ch_index:.2f}")
    db_index = davies_bouldin_score(df, labels)
    print(f"Davies-Bouldin Index: {db_index:.2f}")
    return [silhouette,ch_index,db_index]

In [42]:
def model1(df,method,count):
    n_cluster = [3,4,5]
    result = []
    for i in n_cluster:
        kmeans = create_model('kmeans', num_clusters=i)
        kmeans_df = assign_model(kmeans)
        j,k,l =eval(df,kmeans_df['Cluster'])
        result.append({
            'Method': method,
            'Model': 'KMeans',
            'Num_Clusters': i,
            'Silhouette': j,
            'CH_Index': k,
            'DB_Index': l
        })
        hclust = create_model('hclust', num_clusters=i)
        hclust_df = assign_model(hclust)
        eval(df,hclust_df['Cluster'])
    results_df = pd.DataFrame(result)
    results_df.to_csv(f'kmeans{count}.csv', index=False)

def model2(df,method,count):
    n_cluster = [3,4,5]
    result = []
    for i in n_cluster:
        birch = create_model('birch', num_clusters=i)
        birch_df = assign_model(birch)
        j,k,l =eval(df,birch_df['Cluster'])
        result.append({
            'Method': method,
            'Model': 'birch',
            'Num_Clusters': i,
            'Silhouette': j,
            'CH_Index': k,
            'DB_Index': l
        })
    results_df = pd.DataFrame(result)
    results_df.to_csv(f'birch{count}.csv', index=False)

def model3(df,method,count):
    n_cluster = [3,4,5]
    result = []
    for i in n_cluster:
        hclust = create_model('birch', num_clusters=i)
        hclust_df = assign_model(hclust)
        j,k,l =eval(df,hclust_df['Cluster'])
        result.append({
            'Method': method,
            'Model': 'hclust',
            'Num_Clusters': i,
            'Silhouette': j,
            'CH_Index': k,
            'DB_Index': l
        })
    results_df = pd.DataFrame(result)
    results_df.to_csv(f'hclust{count}.csv', index=False)

In [43]:
X_transformed = get_config('X_transformed')

In [44]:
X_transformed

Unnamed: 0,Product_Code_P1,Product_Code_P2,Product_Code_P3,Product_Code_P4,Product_Code_P5,Product_Code_P6,Product_Code_P7,Product_Code_P8,Product_Code_P9,Product_Code_P10,...,Normalized 42,Normalized 43,Normalized 44,Normalized 45,Normalized 46,Normalized 47,Normalized 48,Normalized 49,Normalized 50,Normalized 51
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.06,0.22,0.28,0.39,0.50,0.00,0.22,0.17,0.11,0.39
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.20,0.40,0.50,0.10,0.10,0.40,0.50,0.10,0.60,0.00
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.27,1.00,0.18,0.18,0.36,0.45,1.00,0.45,0.45,0.36
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.41,0.47,0.06,0.12,0.24,0.35,0.71,0.35,0.29,0.35
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.27,0.53,0.27,0.60,0.20,0.20,0.13,0.53,0.33,0.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
806,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.33,0.33,0.00,0.00,0.33,0.00,0.00,0.67,0.00
807,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.43,0.43,0.57,0.29,0.57,0.71,0.71,0.71,0.86,0.71
808,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.50,0.00,0.00,0.50,0.50,0.00,0.00,0.00,1.00,0.75
809,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.00,0.00,0.50,0.50,0.00,0.00,0.00,1.00,0.00


In [45]:
model1(X_transformed,'No Data Processing',1)
model2(X_transformed,'No Data Processing',1)
model3(X_transformed,'No Data Processing',1)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.6141,3188.1254,0.6225,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.61
Calinski-Harabasz Index: 3188.13
Davies-Bouldin Index: 0.62


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.6123,3091.1022,0.6278,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.61
Calinski-Harabasz Index: 3091.10
Davies-Bouldin Index: 0.63


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5666,2601.2288,0.8528,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.57
Calinski-Harabasz Index: 2601.23
Davies-Bouldin Index: 0.85


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5675,2581.5766,0.8658,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.57
Calinski-Harabasz Index: 2581.58
Davies-Bouldin Index: 0.87


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4538,2236.8582,0.9098,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.45
Calinski-Harabasz Index: 2236.86
Davies-Bouldin Index: 0.91


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4533,2232.213,0.914,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.45
Calinski-Harabasz Index: 2232.21
Davies-Bouldin Index: 0.91


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.6123,3091.1022,0.6278,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.61
Calinski-Harabasz Index: 3091.10
Davies-Bouldin Index: 0.63


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5675,2581.5766,0.8658,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.57
Calinski-Harabasz Index: 2581.58
Davies-Bouldin Index: 0.87


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4533,2232.213,0.914,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.45
Calinski-Harabasz Index: 2232.21
Davies-Bouldin Index: 0.91


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.6123,3091.1022,0.6278,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.61
Calinski-Harabasz Index: 3091.10
Davies-Bouldin Index: 0.63


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5675,2581.5766,0.8658,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.57
Calinski-Harabasz Index: 2581.58
Davies-Bouldin Index: 0.87


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4533,2232.213,0.914,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.45
Calinski-Harabasz Index: 2232.21
Davies-Bouldin Index: 0.91


# PARAMETER EVALUATION FOR DIFFERENT PRE PROCESSING

In [46]:
Clustering = setup(df,normalize = True,normalize_method = 'zscore')

Unnamed: 0,Description,Value
0,Session id,4876
1,Original data shape,"(811, 107)"
2,Transformed data shape,"(811, 917)"
3,Numeric features,106
4,Categorical features,1
5,Preprocess,True
6,Imputation type,simple
7,Numeric imputation,mean
8,Categorical imputation,mode
9,Maximum one-hot encoding,-1


In [47]:
X_transformed = get_config('X_transformed')

In [48]:
X_transformed

Unnamed: 0,Product_Code_P1,Product_Code_P2,Product_Code_P3,Product_Code_P4,Product_Code_P5,Product_Code_P6,Product_Code_P7,Product_Code_P8,Product_Code_P9,Product_Code_P10,...,Normalized 42,Normalized 43,Normalized 44,Normalized 45,Normalized 46,Normalized 47,Normalized 48,Normalized 49,Normalized 50,Normalized 51
0,28.460499,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,...,-0.896267,-0.263463,-0.094387,0.282308,0.600801,-1.183443,-0.428828,-0.659373,-0.891511,-0.110890
1,-0.035136,28.460499,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,...,-0.371585,0.438369,0.741373,-0.824292,-0.853567,0.321079,0.587434,-0.903710,0.769424,-1.250744
2,-0.035136,-0.035136,28.460499,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,...,-0.109243,2.777809,-0.474278,-0.519023,0.091772,0.509144,2.402188,0.317979,0.260975,-0.198571
3,-0.035136,-0.035136,-0.035136,28.460499,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,...,0.415439,0.711303,-0.930148,-0.747975,-0.344538,0.133014,1.349630,-0.031075,-0.281371,-0.227798
4,-0.035136,-0.035136,-0.035136,-0.035136,28.460499,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,...,-0.109243,0.945247,-0.132376,1.083639,-0.489975,-0.431182,-0.755483,0.597223,-0.145785,-0.081663
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
806,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,...,-1.121131,0.165434,0.095558,-1.205879,-1.217159,0.057788,-1.227319,-1.252765,1.006701,-1.250744
807,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,...,0.490394,0.555341,1.007297,-0.099278,0.855316,1.487083,1.349630,1.225520,1.650737,0.824375
808,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,...,0.752735,-1.121258,-1.158082,0.702053,0.600801,-1.183443,-1.227319,-1.252765,2.125289,0.941284
809,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,...,-1.121131,-1.121258,-1.158082,0.702053,0.600801,-1.183443,-1.227319,-1.252765,2.125289,-1.250744


In [49]:
model1(X_transformed,'Normalized',2)
model2(X_transformed,'Normalized',2)
model3(X_transformed,'Normalized',2)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0258,29.2131,5.7314,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.03
Calinski-Harabasz Index: 29.21
Davies-Bouldin Index: 5.73


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0258,28.9372,5.6755,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.03
Calinski-Harabasz Index: 28.94
Davies-Bouldin Index: 5.68


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0153,20.8878,7.7836,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.02
Calinski-Harabasz Index: 20.89
Davies-Bouldin Index: 7.78


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0149,20.7535,7.9009,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.01
Calinski-Harabasz Index: 20.75
Davies-Bouldin Index: 7.90


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0117,16.0406,9.1345,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.01
Calinski-Harabasz Index: 16.04
Davies-Bouldin Index: 9.13


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0109,16.1339,8.7452,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.01
Calinski-Harabasz Index: 16.13
Davies-Bouldin Index: 8.75


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0258,28.9372,5.6755,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.03
Calinski-Harabasz Index: 28.94
Davies-Bouldin Index: 5.68


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0149,20.7535,7.9009,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.01
Calinski-Harabasz Index: 20.75
Davies-Bouldin Index: 7.90


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0109,16.1339,8.7452,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.01
Calinski-Harabasz Index: 16.13
Davies-Bouldin Index: 8.75


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0258,28.9372,5.6755,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.03
Calinski-Harabasz Index: 28.94
Davies-Bouldin Index: 5.68


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0149,20.7535,7.9009,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.01
Calinski-Harabasz Index: 20.75
Davies-Bouldin Index: 7.90


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0109,16.1339,8.7452,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.01
Calinski-Harabasz Index: 16.13
Davies-Bouldin Index: 8.75


In [50]:
Clustering = setup(df, transformation = True,transformation_method = 'yeo-johnson')

Unnamed: 0,Description,Value
0,Session id,2245
1,Original data shape,"(811, 107)"
2,Transformed data shape,"(811, 917)"
3,Numeric features,106
4,Categorical features,1
5,Preprocess,True
6,Imputation type,simple
7,Numeric imputation,mean
8,Categorical imputation,mode
9,Maximum one-hot encoding,-1


In [51]:
X_transformed = get_config('X_transformed')

In [52]:
X_transformed

Unnamed: 0,Product_Code_P1,Product_Code_P2,Product_Code_P3,Product_Code_P4,Product_Code_P5,Product_Code_P6,Product_Code_P7,Product_Code_P8,Product_Code_P9,Product_Code_P10,...,Normalized 42,Normalized 43,Normalized 44,Normalized 45,Normalized 46,Normalized 47,Normalized 48,Normalized 49,Normalized 50,Normalized 51
0,0.000855,-0.000000,-0.000000,-0.000000,-0.000000,-0.0,-0.0,-0.0,-0.0,-0.0,...,0.056770,0.182661,0.227034,0.302831,0.363977,-0.000000,0.189358,0.153456,0.103431,0.332109
1,-0.000000,0.000855,-0.000000,-0.000000,-0.000000,-0.0,-0.0,-0.0,-0.0,-0.0,...,0.168177,0.291856,0.353821,0.093003,0.092889,0.310455,0.367289,0.093995,0.451545,0.000000
2,-0.000000,-0.000000,0.000855,-0.000000,-0.000000,-0.0,-0.0,-0.0,-0.0,-0.0,...,0.215103,0.521073,0.156438,0.158640,0.283212,0.340011,0.586538,0.352097,0.359963,0.309930
3,-0.000000,-0.000000,-0.000000,0.000855,-0.000000,-0.0,-0.0,-0.0,-0.0,-0.0,...,0.295633,0.327547,0.057116,0.110077,0.203037,0.279289,0.471036,0.287318,0.249158,0.302434
4,-0.000000,-0.000000,-0.000000,-0.000000,0.000855,-0.0,-0.0,-0.0,-0.0,-0.0,...,0.215103,0.355705,0.220398,0.417322,0.173597,0.174494,0.118586,0.399897,0.278312,0.339402
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
806,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000,-0.0,-0.0,-0.0,-0.0,-0.0,...,-0.000000,0.252672,0.258945,-0.000000,-0.000000,0.266338,-0.000000,-0.000000,0.490902,0.000000
807,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000,-0.0,-0.0,-0.0,-0.0,-0.0,...,0.305913,0.307552,0.387793,0.238613,0.400134,0.472382,0.471036,0.496542,0.588668,0.543967
808,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000,-0.0,-0.0,-0.0,-0.0,-0.0,...,0.339839,-0.000000,-0.000000,0.365837,0.363977,-0.000000,-0.000000,-0.000000,0.653502,0.567751
809,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000,-0.0,-0.0,-0.0,-0.0,-0.0,...,-0.000000,-0.000000,-0.000000,0.365837,0.363977,-0.000000,-0.000000,-0.000000,0.653502,0.000000


In [53]:
model1(X_transformed,'transformed',3)
model2(X_transformed,'transformed',3)
model3(X_transformed,'transformed',3)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4764,1877.8974,0.8078,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.48
Calinski-Harabasz Index: 1877.90
Davies-Bouldin Index: 0.81


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4745,1864.9927,0.8096,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.47
Calinski-Harabasz Index: 1864.99
Davies-Bouldin Index: 0.81


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4566,1845.174,0.8721,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.46
Calinski-Harabasz Index: 1845.17
Davies-Bouldin Index: 0.87


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4605,1822.7091,0.8394,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.46
Calinski-Harabasz Index: 1822.71
Davies-Bouldin Index: 0.84


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3645,1533.1167,1.2781,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.36
Calinski-Harabasz Index: 1533.12
Davies-Bouldin Index: 1.28


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.363,1507.7948,1.2777,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.36
Calinski-Harabasz Index: 1507.79
Davies-Bouldin Index: 1.28


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4745,1864.9927,0.8096,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.47
Calinski-Harabasz Index: 1864.99
Davies-Bouldin Index: 0.81


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4605,1822.7091,0.8394,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.46
Calinski-Harabasz Index: 1822.71
Davies-Bouldin Index: 0.84


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3613,1509.6428,1.2827,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.36
Calinski-Harabasz Index: 1509.64
Davies-Bouldin Index: 1.28


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4745,1864.9927,0.8096,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.47
Calinski-Harabasz Index: 1864.99
Davies-Bouldin Index: 0.81


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4605,1822.7091,0.8394,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.46
Calinski-Harabasz Index: 1822.71
Davies-Bouldin Index: 0.84


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3613,1509.6428,1.2827,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.36
Calinski-Harabasz Index: 1509.64
Davies-Bouldin Index: 1.28


In [54]:
Clustering = setup(df, pca = True,pca_method = 'linear')

Unnamed: 0,Description,Value
0,Session id,1258
1,Original data shape,"(811, 107)"
2,Transformed data shape,"(811, 811)"
3,Numeric features,106
4,Categorical features,1
5,Preprocess,True
6,Imputation type,simple
7,Numeric imputation,mean
8,Categorical imputation,mode
9,Maximum one-hot encoding,-1


In [55]:
X_transformed = get_config('X_transformed')

In [56]:
X_transformed

Unnamed: 0,pca0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,...,pca801,pca802,pca803,pca804,pca805,pca806,pca807,pca808,pca809,pca810
0,6.081114,-7.301664,2.004908,-5.036443,-1.453142,-0.928395,7.604436,-7.479716,3.373972,0.122868,...,1.446536e-15,1.633351e-16,5.866211e-16,-1.515010e-15,1.378021e-15,-4.692037e-16,8.280865e-16,0.908925,-9.311399e-16,9.367507e-16
1,-36.476562,-4.090887,2.749835,-1.398390,-4.114317,5.801501,-3.889281,4.628092,-1.550255,4.180193,...,-1.812988e-02,2.020045e-02,-1.608906e-02,9.102253e-04,1.896615e-02,1.271094e-02,-1.960465e-02,-0.244765,4.570205e-01,-7.244205e-14
2,-2.313286,-2.180751,-2.690020,1.808793,4.108286,-3.111408,-1.963624,-0.778193,5.866588,-0.042862,...,3.812188e-02,-3.922983e-02,-4.265385e-02,1.392937e-02,-4.997969e-02,-6.809321e-03,-2.775010e-01,0.003376,-1.775332e-02,3.286087e-14
3,-4.173271,-2.604199,-2.158100,-1.500562,0.989142,-1.863381,1.347841,7.485402,-0.971854,3.490470,...,2.110794e-01,-1.723388e-02,3.456113e-03,-2.971424e-02,-6.193950e-03,1.652900e-02,-1.015606e-01,0.050780,2.618523e-01,-2.484644e-14
4,-2.600648,1.781717,7.035220,1.161173,8.380258,-0.488953,0.135902,-6.787866,-1.778162,-5.968254,...,3.904193e-01,1.940956e-02,1.673885e-01,1.128033e-01,3.700908e-02,1.590731e-01,-3.953143e-02,0.023795,1.200289e-01,1.846526e-14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
806,-62.678741,-3.966666,1.075144,-0.649190,-0.758838,1.056675,1.077057,0.790108,0.669175,-1.036548,...,7.253659e-03,-1.413066e-04,8.228362e-03,4.988595e-03,-3.711414e-02,-1.706445e-02,-1.119059e-02,-0.010699,-2.253585e-02,1.212832e-14
807,-46.744610,1.707994,-2.491156,-0.540245,-1.830241,0.289093,2.968794,1.589330,6.689853,1.922429,...,1.510136e-02,2.385480e-02,-1.553696e-03,1.604152e-02,-1.179481e-03,-2.173711e-02,1.392005e-03,0.000709,2.111936e-02,1.784944e-14
808,-62.280597,-1.363657,-0.287264,-1.128674,0.389991,-0.580639,-1.167904,0.789483,-0.154224,1.280052,...,-2.317446e-03,-1.144673e-02,-3.813935e-02,4.270877e-02,4.662455e-03,-9.022509e-03,1.868250e-02,0.000956,-4.632736e-03,1.470091e-14
809,-63.875200,-2.697456,-0.671587,0.310977,0.111430,-0.011938,-1.018957,0.167919,0.132791,-0.656650,...,1.381504e-02,3.178917e-02,-1.292502e-02,5.833922e-04,9.183179e-03,-9.652774e-03,-1.800157e-02,0.007050,2.166932e-02,1.212658e-14


In [57]:
model1(X_transformed,'Pca',4)
model2(X_transformed,'Pca',4)
model3(X_transformed,'Pca',4)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.6141,3188.1254,0.6225,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.61
Calinski-Harabasz Index: 3188.13
Davies-Bouldin Index: 0.62


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.6123,3091.1022,0.6278,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.61
Calinski-Harabasz Index: 3091.10
Davies-Bouldin Index: 0.63


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5666,2601.2288,0.8528,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.57
Calinski-Harabasz Index: 2601.23
Davies-Bouldin Index: 0.85


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5675,2581.5766,0.8658,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.57
Calinski-Harabasz Index: 2581.58
Davies-Bouldin Index: 0.87


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4538,2236.8582,0.9098,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.45
Calinski-Harabasz Index: 2236.86
Davies-Bouldin Index: 0.91


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4533,2232.213,0.914,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.45
Calinski-Harabasz Index: 2232.21
Davies-Bouldin Index: 0.91


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.6123,3091.1022,0.6278,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.61
Calinski-Harabasz Index: 3091.10
Davies-Bouldin Index: 0.63


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5675,2581.5766,0.8658,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.57
Calinski-Harabasz Index: 2581.58
Davies-Bouldin Index: 0.87


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4533,2232.213,0.914,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.45
Calinski-Harabasz Index: 2232.21
Davies-Bouldin Index: 0.91


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.6123,3091.1022,0.6278,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.61
Calinski-Harabasz Index: 3091.10
Davies-Bouldin Index: 0.63


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5675,2581.5766,0.8658,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.57
Calinski-Harabasz Index: 2581.58
Davies-Bouldin Index: 0.87


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4533,2232.213,0.914,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.45
Calinski-Harabasz Index: 2232.21
Davies-Bouldin Index: 0.91


In [58]:
Clustering = setup(df, transformation = True, normalize = True,
      normalize_method = 'zscore', transformation_method = 'yeo-johnson')

Unnamed: 0,Description,Value
0,Session id,3648
1,Original data shape,"(811, 107)"
2,Transformed data shape,"(811, 917)"
3,Numeric features,106
4,Categorical features,1
5,Preprocess,True
6,Imputation type,simple
7,Numeric imputation,mean
8,Categorical imputation,mode
9,Maximum one-hot encoding,-1


In [59]:
X_transformed = get_config('X_transformed')

In [60]:
X_transformed

Unnamed: 0,Product_Code_P1,Product_Code_P2,Product_Code_P3,Product_Code_P4,Product_Code_P5,Product_Code_P6,Product_Code_P7,Product_Code_P8,Product_Code_P9,Product_Code_P10,...,Normalized 42,Normalized 43,Normalized 44,Normalized 45,Normalized 46,Normalized 47,Normalized 48,Normalized 49,Normalized 50,Normalized 51
0,28.460499,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,...,-0.928946,-0.101854,0.069828,0.423802,0.714784,-1.314426,-0.314562,-0.595485,-0.884492,0.002414
1,-0.035136,28.460499,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,...,-0.221845,0.598691,0.839412,-0.808168,-0.849915,0.458308,0.699359,-0.906020,0.829182,-1.348042
2,-0.035136,-0.035136,28.460499,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,...,0.076003,2.069227,-0.358684,-0.422794,0.248617,0.627076,1.948722,0.441921,0.378348,-0.087774
3,-0.035136,-0.035136,-0.035136,28.460499,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,...,0.587128,0.827661,-0.961557,-0.707922,-0.214149,0.280344,1.290544,0.103614,-0.167118,-0.118257
4,-0.035136,-0.035136,-0.035136,-0.035136,28.460499,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,...,0.076003,1.008313,0.029549,1.096024,-0.384074,-0.318046,-0.717844,0.691561,-0.023599,0.032067
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
806,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,...,-1.289272,0.347303,0.263524,-1.354221,-1.386060,0.206393,-1.393594,-1.396909,1.022923,-1.348042
807,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,...,0.652375,0.699382,1.045616,0.046758,0.923480,1.382927,1.290544,1.196288,1.504203,0.863889
808,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,...,0.867710,-1.273710,-1.308248,0.793734,0.714784,-1.314426,-1.393594,-1.396909,1.823361,0.960601
809,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,-0.035136,...,-1.289272,-1.273710,-1.308248,0.793734,0.714784,-1.314426,-1.393594,-1.396909,1.823361,-1.348042


In [61]:
model1(X_transformed,'T+N',5)
model2(X_transformed,'T+N',5)
model3(X_transformed,'T+N',5)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0232,30.9249,6.0734,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.02
Calinski-Harabasz Index: 30.92
Davies-Bouldin Index: 6.07


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0227,30.7747,6.1811,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.02
Calinski-Harabasz Index: 30.77
Davies-Bouldin Index: 6.18


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.015,22.0239,7.6816,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.01
Calinski-Harabasz Index: 22.02
Davies-Bouldin Index: 7.68


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0155,21.9195,7.6878,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.02
Calinski-Harabasz Index: 21.92
Davies-Bouldin Index: 7.69


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0087,17.1442,8.6271,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.01
Calinski-Harabasz Index: 17.14
Davies-Bouldin Index: 8.63


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0074,17.0919,9.3205,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.01
Calinski-Harabasz Index: 17.09
Davies-Bouldin Index: 9.32


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0227,30.7747,6.1811,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.02
Calinski-Harabasz Index: 30.77
Davies-Bouldin Index: 6.18


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0155,21.9195,7.6878,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.02
Calinski-Harabasz Index: 21.92
Davies-Bouldin Index: 7.69


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0074,17.0919,9.3205,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.01
Calinski-Harabasz Index: 17.09
Davies-Bouldin Index: 9.32


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0227,30.7747,6.1811,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.02
Calinski-Harabasz Index: 30.77
Davies-Bouldin Index: 6.18


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0155,21.9195,7.6878,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.02
Calinski-Harabasz Index: 21.92
Davies-Bouldin Index: 7.69


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0074,17.0919,9.3205,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.01
Calinski-Harabasz Index: 17.09
Davies-Bouldin Index: 9.32


In [62]:
Clustering = setup(df, transformation = True, normalize = True, pca = True,
      normalize_method = 'zscore', transformation_method = 'yeo-johnson',
      pca_method = 'linear')

Unnamed: 0,Description,Value
0,Session id,4655
1,Original data shape,"(811, 107)"
2,Transformed data shape,"(811, 811)"
3,Numeric features,106
4,Categorical features,1
5,Preprocess,True
6,Imputation type,simple
7,Numeric imputation,mean
8,Categorical imputation,mode
9,Maximum one-hot encoding,-1


In [63]:
X_transformed = get_config('X_transformed')

In [64]:
X_transformed

Unnamed: 0,pca0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,...,pca801,pca802,pca803,pca804,pca805,pca806,pca807,pca808,pca809,pca810
0,5.333364,0.843624,-2.574572,-0.229633,-1.993705,0.381294,0.565564,-0.513103,0.473962,0.712658,...,-3.816392e-17,-6.158268e-17,1.040834e-17,1.214306e-17,1.040834e-17,-1.652324e-16,2.949030e-17,2.272488e-16,1.621966e-16,6.800116e-16
1,1.494063,-2.504333,-3.386638,-1.585637,-0.253755,-2.562377,-0.980901,0.058536,1.531251,-0.636686,...,-3.759685e-02,-1.158868e+00,-5.126286e-01,9.911401e-01,-7.333946e-01,-1.083782e+00,-2.961829e-01,1.289592e+00,1.250646e+00,-4.218847e-15
2,6.432341,-2.279906,-1.222827,-0.738929,1.314368,-0.443442,1.726971,1.272649,0.919640,0.133728,...,-5.188415e-01,3.639022e-01,1.101604e+00,-4.941095e-01,1.383324e-01,-1.782420e-01,1.150148e+00,-3.681608e-01,-1.032149e+00,-4.718448e-16
3,4.508143,0.187961,-1.040095,0.155557,0.914495,-3.115398,0.938130,0.031946,1.181350,-0.704634,...,2.882959e-01,-1.745973e+00,-6.418822e-01,-2.969674e-01,-3.169139e-01,-4.019503e-01,-1.311574e+00,5.707259e-01,1.272181e+00,-2.609024e-15
4,4.628421,0.712472,-0.056058,-1.244120,-0.703036,1.440619,0.274216,-1.109489,0.779837,-2.979894,...,-6.315487e-01,1.358599e+00,-1.156280e+00,6.871090e-03,1.532245e-01,3.808283e-02,3.868214e-01,2.599780e-01,-1.259331e+00,-1.887379e-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
806,-8.772829,0.730120,-1.041336,0.505964,-3.927144,-0.062240,-1.219538,0.245985,-0.429116,-0.583500,...,-2.896441e-01,-6.765290e-01,-5.542259e-01,1.141736e+00,-8.888251e-01,3.030105e-01,1.029306e-01,-5.309622e-01,-8.261971e-01,2.220446e-16
807,-1.004194,-5.055784,2.097023,4.533619,-0.565913,-2.607162,0.050112,0.176702,1.294711,0.763948,...,-2.397189e-01,-1.410174e-02,1.845471e-01,1.640901e+00,8.004090e-01,-3.419444e-01,1.093200e+00,6.217876e-01,-1.608132e-01,1.776357e-15
808,-9.228123,0.705497,0.887545,0.169643,0.413638,-0.718536,-1.229109,-0.910258,2.303784,0.674792,...,4.743054e-01,-7.602208e-01,-6.131925e-01,-1.596323e+00,6.067482e-01,-2.038578e+00,5.040367e-01,1.673439e-01,-2.192193e-01,4.440892e-16
809,-9.184513,-0.771373,0.424933,-1.009026,-0.184098,0.089288,-1.581329,-1.675947,-0.383383,6.027573,...,4.201333e-01,5.765506e-01,-7.128517e-01,3.906792e-01,-4.187286e-01,2.614925e-01,-3.312398e-01,-6.471671e-02,6.393304e-01,3.330669e-16


In [65]:
model1(X_transformed,'T+N+Pca',6)
model2(X_transformed,'T+N+Pca',6)
model3(X_transformed,'T+N+Pca',6)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0232,30.9283,6.0833,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.02
Calinski-Harabasz Index: 30.93
Davies-Bouldin Index: 6.08


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0227,30.7747,6.1811,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.02
Calinski-Harabasz Index: 30.77
Davies-Bouldin Index: 6.18


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0153,22.0383,7.7249,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.02
Calinski-Harabasz Index: 22.04
Davies-Bouldin Index: 7.72


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0155,21.9195,7.6878,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.02
Calinski-Harabasz Index: 21.92
Davies-Bouldin Index: 7.69


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0084,17.0883,8.84,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.01
Calinski-Harabasz Index: 17.09
Davies-Bouldin Index: 8.84


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0074,17.0919,9.3205,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.01
Calinski-Harabasz Index: 17.09
Davies-Bouldin Index: 9.32


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0227,30.7747,6.1811,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.02
Calinski-Harabasz Index: 30.77
Davies-Bouldin Index: 6.18


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0155,21.9195,7.6878,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.02
Calinski-Harabasz Index: 21.92
Davies-Bouldin Index: 7.69


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0074,17.0919,9.3205,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.01
Calinski-Harabasz Index: 17.09
Davies-Bouldin Index: 9.32


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0227,30.7747,6.1811,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.02
Calinski-Harabasz Index: 30.77
Davies-Bouldin Index: 6.18


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0155,21.9195,7.6878,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.02
Calinski-Harabasz Index: 21.92
Davies-Bouldin Index: 7.69


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.0074,17.0919,9.3205,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.01
Calinski-Harabasz Index: 17.09
Davies-Bouldin Index: 9.32


# FINAL CSV FILE

In [66]:
import pandas as pd
import glob
csv_files = glob.glob('*.csv')
dfs = []
for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)
combined_df = pd.concat(dfs, ignore_index=True)
combined_df.to_csv('Result.csv',index = False)

In [67]:
combined_df

Unnamed: 0,Method,Model,Num_Clusters,Silhouette,CH_Index,DB_Index,Channel,Region,Fresh,Milk,...,Normalized 42,Normalized 43,Normalized 44,Normalized 45,Normalized 46,Normalized 47,Normalized 48,Normalized 49,Normalized 50,Normalized 51
0,T+N,hclust,3.0,0.022669,30.774659,6.181133,,,,,...,,,,,,,,,,
1,T+N,hclust,4.0,0.015539,21.919547,7.687783,,,,,...,,,,,,,,,,
2,T+N,hclust,5.0,0.007437,17.091905,9.320497,,,,,...,,,,,,,,,,
3,Pca,hclust,3.0,0.612335,3091.102169,0.627823,,,,,...,,,,,,,,,,
4,Pca,hclust,4.0,0.567460,2581.576593,0.865811,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1408,T+N+Pca,,4.0,0.247214,136.334858,1.420790,,,,,...,,,,,,,,,,
1409,T+N+Pca,,5.0,0.217894,121.225102,1.584888,,,,,...,,,,,,,,,,
1410,Normalized,hclust,3.0,0.025850,28.937191,5.675465,,,,,...,,,,,,,,,,
1411,Normalized,hclust,4.0,0.014947,20.753456,7.900886,,,,,...,,,,,,,,,,


In [68]:
kmeans = combined_df[combined_df['Model']=='KMeans'].copy()
kmeans.drop(columns=['Model'],inplace = True)
print(kmeans)
kmeans.to_csv('KMEANS.csv',index = False)

                  Method  Num_Clusters  Silhouette      CH_Index  DB_Index  \
9                    Pca           3.0    0.614108   3188.125359  0.622521   
10                   Pca           4.0    0.566608   2601.228839  0.852833   
11                   Pca           5.0    0.453794   2236.858163  0.909815   
12    No Data Processing           3.0    0.614108   3188.125359  0.622521   
13    No Data Processing           4.0    0.566608   2601.228839  0.852833   
14    No Data Processing           5.0    0.453794   2236.858163  0.909815   
24            Normalized           3.0    0.025839     29.213127  5.731366   
25            Normalized           4.0    0.015277     20.887817  7.783615   
26            Normalized           5.0    0.011712     16.040634  9.134480   
476                  Pca           3.0    0.478351    210.152559  0.889509   
477                  Pca           4.0    0.386631    207.819678  0.954347   
478                  Pca           5.0    0.369021    213.907067

In [69]:
hclust = combined_df[combined_df['Model']=='hclust'].copy()
hclust.drop(columns=['Model'],inplace = True)
print(hclust)
hclust.to_csv('HCLUST.csv',index = False)

                  Method  Num_Clusters  Silhouette      CH_Index  DB_Index  \
0                    T+N           3.0    0.022669     30.774659  6.181133   
1                    T+N           4.0    0.015539     21.919547  7.687783   
2                    T+N           5.0    0.007437     17.091905  9.320497   
3                    Pca           3.0    0.612335   3091.102169  0.627823   
4                    Pca           4.0    0.567460   2581.576593  0.865811   
5                    Pca           5.0    0.453257   2232.213027  0.914041   
18               T+N+Pca           3.0    0.022669     30.774659  6.181133   
19               T+N+Pca           4.0    0.015539     21.919547  7.687783   
20               T+N+Pca           5.0    0.007437     17.091905  9.320497   
27                   T+N           3.0    0.211211    121.174683  1.632864   
28                   T+N           4.0    0.193459    111.794140  1.680607   
29                   T+N           5.0    0.178556    104.131563

In [70]:
birch = combined_df[combined_df['Model']=='birch'].copy()
birch.drop(columns=['Model'],inplace = True)
print(birch)
birch.to_csv('BIRCH.csv',index = False)

                  Method  Num_Clusters  Silhouette      CH_Index  DB_Index  \
6            transformed           3.0    0.474476   1864.992659  0.809623   
7            transformed           4.0    0.460455   1822.709089  0.839450   
8            transformed           5.0    0.361305   1509.642797  1.282670   
15                   Pca           3.0    0.612335   3091.102169  0.627823   
16                   Pca           4.0    0.567460   2581.576593  0.865811   
17                   Pca           5.0    0.453257   2232.213027  0.914041   
21               T+N+Pca           3.0    0.022669     30.774659  6.181133   
22               T+N+Pca           4.0    0.015539     21.919547  7.687783   
23               T+N+Pca           5.0    0.007437     17.091905  9.320497   
473          transformed           3.0    0.855442  13039.763275  0.338898   
474          transformed           4.0    0.510934  14291.958744  0.605433   
475          transformed           5.0    0.443546  13978.086706