<a href="https://colab.research.google.com/github/rhy004/Clustering/blob/main/Clustering_TravelReviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# INSTALLING LIBRARIES

In [1]:
!pip install pycaret &> /dev/null
print("pycaret installed")

pycaret installed


In [2]:
!pip install pyarrow  &> /dev/null

In [3]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv("/content/Wholesale customers data.csv")

In [5]:
from pycaret.utils import version
version()

'3.2.0'

# SETUP AND FUNCTIONS

In [6]:
from pycaret.clustering import *
Clustering = setup(df)

Unnamed: 0,Description,Value
0,Session id,896
1,Original data shape,"(440, 8)"
2,Transformed data shape,"(440, 8)"
3,Numeric features,8
4,Preprocess,True
5,Imputation type,simple
6,Numeric imputation,mean
7,Categorical imputation,mode
8,CPU Jobs,-1
9,Use GPU,False


In [7]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score

def eval(df,labels):
    silhouette = silhouette_score(df, labels)
    print(f"Silhouette Score: {silhouette:.2f}")
    ch_index = calinski_harabasz_score(df, labels)
    print(f"Calinski-Harabasz Index: {ch_index:.2f}")
    db_index = davies_bouldin_score(df, labels)
    print(f"Davies-Bouldin Index: {db_index:.2f}")
    return [silhouette,ch_index,db_index]

In [8]:
def model1(df,method,count):
    n_cluster = [3,4,5]
    result = []
    for i in n_cluster:
        kmeans = create_model('kmeans', num_clusters=i)
        kmeans_df = assign_model(kmeans)
        j,k,l =eval(df,kmeans_df['Cluster'])
        result.append({
            'Method': method,
            'Model': 'KMeans',
            'Num_Clusters': i,
            'Silhouette': j,
            'CH_Index': k,
            'DB_Index': l
        })
        hclust = create_model('hclust', num_clusters=i)
        hclust_df = assign_model(hclust)
        eval(df,hclust_df['Cluster'])
    results_df = pd.DataFrame(result)
    results_df.to_csv(f'kmeans{count}.csv', index=False)

def model2(df,method,count):
    n_cluster = [3,4,5]
    result = []
    for i in n_cluster:
        birch = create_model('birch', num_clusters=i)
        birch_df = assign_model(birch)
        j,k,l =eval(df,birch_df['Cluster'])
        result.append({
            'Method': method,
            'Model': 'birch',
            'Num_Clusters': i,
            'Silhouette': j,
            'CH_Index': k,
            'DB_Index': l
        })
    results_df = pd.DataFrame(result)
    results_df.to_csv(f'birch{count}.csv', index=False)

def model3(df,method,count):
    n_cluster = [3,4,5]
    result = []
    for i in n_cluster:
        hclust = create_model('birch', num_clusters=i)
        hclust_df = assign_model(hclust)
        j,k,l =eval(df,hclust_df['Cluster'])
        result.append({
            'Method': method,
            'Model': 'hclust',
            'Num_Clusters': i,
            'Silhouette': j,
            'CH_Index': k,
            'DB_Index': l
        })
    results_df = pd.DataFrame(result)
    results_df.to_csv(f'hclust{count}.csv', index=False)

In [9]:
X_transformed = get_config('X_transformed')

In [10]:
X_transformed

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,2.0,3.0,12669.0,9656.0,7561.0,214.0,2674.0,1338.0
1,2.0,3.0,7057.0,9810.0,9568.0,1762.0,3293.0,1776.0
2,2.0,3.0,6353.0,8808.0,7684.0,2405.0,3516.0,7844.0
3,1.0,3.0,13265.0,1196.0,4221.0,6404.0,507.0,1788.0
4,2.0,3.0,22615.0,5410.0,7198.0,3915.0,1777.0,5185.0
...,...,...,...,...,...,...,...,...
435,1.0,3.0,29703.0,12051.0,16027.0,13135.0,182.0,2204.0
436,1.0,3.0,39228.0,1431.0,764.0,4510.0,93.0,2346.0
437,2.0,3.0,14531.0,15488.0,30243.0,437.0,14841.0,1867.0
438,1.0,3.0,10290.0,1981.0,2232.0,1038.0,168.0,2125.0


In [11]:
model1(X_transformed,'No Data Processing',1)
model2(X_transformed,'No Data Processing',1)
model3(X_transformed,'No Data Processing',1)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4784,210.1526,0.8895,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.48
Calinski-Harabasz Index: 210.15
Davies-Bouldin Index: 0.89


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.375,183.3597,0.9783,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.38
Calinski-Harabasz Index: 183.36
Davies-Bouldin Index: 0.98


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3866,207.8197,0.9543,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.39
Calinski-Harabasz Index: 207.82
Davies-Bouldin Index: 0.95


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3483,185.2559,0.9701,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.35
Calinski-Harabasz Index: 185.26
Davies-Bouldin Index: 0.97


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3833,215.0578,0.902,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.38
Calinski-Harabasz Index: 215.06
Davies-Bouldin Index: 0.90


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3522,196.954,0.9106,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.35
Calinski-Harabasz Index: 196.95
Davies-Bouldin Index: 0.91


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.375,183.3597,0.9783,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.38
Calinski-Harabasz Index: 183.36
Davies-Bouldin Index: 0.98


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3483,185.2559,0.9701,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.35
Calinski-Harabasz Index: 185.26
Davies-Bouldin Index: 0.97


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3522,196.954,0.9106,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.35
Calinski-Harabasz Index: 196.95
Davies-Bouldin Index: 0.91


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.375,183.3597,0.9783,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.38
Calinski-Harabasz Index: 183.36
Davies-Bouldin Index: 0.98


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3483,185.2559,0.9701,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.35
Calinski-Harabasz Index: 185.26
Davies-Bouldin Index: 0.97


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3522,196.954,0.9106,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.35
Calinski-Harabasz Index: 196.95
Davies-Bouldin Index: 0.91


# PARAMETER EVALUATION FOR DIFFERENT PRE PROCESSING

In [12]:
Clustering = setup(df,normalize = True,normalize_method = 'zscore')

Unnamed: 0,Description,Value
0,Session id,6398
1,Original data shape,"(440, 8)"
2,Transformed data shape,"(440, 8)"
3,Numeric features,8
4,Preprocess,True
5,Imputation type,simple
6,Numeric imputation,mean
7,Categorical imputation,mode
8,Normalize,True
9,Normalize method,zscore


In [13]:
X_transformed = get_config('X_transformed')

In [14]:
X_transformed

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,1.448652,0.590668,0.052933,0.523568,-0.041115,-0.589367,-0.043569,-0.066339
1,1.448652,0.590668,-0.391302,0.544458,0.170318,-0.270136,0.086407,0.089151
2,1.448652,0.590668,-0.447029,0.408538,-0.028157,-0.137536,0.133232,2.243293
3,-0.690297,0.590668,0.100111,-0.624020,-0.392977,0.687144,-0.498588,0.093411
4,1.448652,0.590668,0.840239,-0.052396,-0.079356,0.173859,-0.231918,1.299347
...,...,...,...,...,...,...,...,...
435,-0.690297,0.590668,1.401312,0.848446,0.850760,2.075222,-0.566831,0.241091
436,-0.690297,0.590668,2.155293,-0.592142,-0.757165,0.296561,-0.585519,0.291501
437,1.448652,0.590668,0.200326,1.314671,2.348386,-0.543380,2.511218,0.121456
438,-0.690297,0.590668,-0.135384,-0.517536,-0.602514,-0.419441,-0.569770,0.213046


In [15]:
model1(X_transformed,'Normalized',2)
model2(X_transformed,'Normalized',2)
model3(X_transformed,'Normalized',2)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3568,139.3494,1.1736,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.36
Calinski-Harabasz Index: 139.35
Davies-Bouldin Index: 1.17


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.36,126.2475,1.1551,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.36
Calinski-Harabasz Index: 126.25
Davies-Bouldin Index: 1.16


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3478,130.93,1.2377,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.35
Calinski-Harabasz Index: 130.93
Davies-Bouldin Index: 1.24


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3623,120.7561,0.8255,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.36
Calinski-Harabasz Index: 120.76
Davies-Bouldin Index: 0.83


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3529,137.9523,1.1481,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.35
Calinski-Harabasz Index: 137.95
Davies-Bouldin Index: 1.15


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3178,123.5061,0.896,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.32
Calinski-Harabasz Index: 123.51
Davies-Bouldin Index: 0.90


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.6356,79.9203,0.5729,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.64
Calinski-Harabasz Index: 79.92
Davies-Bouldin Index: 0.57


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3552,127.0898,0.8597,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.36
Calinski-Harabasz Index: 127.09
Davies-Bouldin Index: 0.86


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3632,115.7714,0.7945,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.36
Calinski-Harabasz Index: 115.77
Davies-Bouldin Index: 0.79


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.6356,79.9203,0.5729,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.64
Calinski-Harabasz Index: 79.92
Davies-Bouldin Index: 0.57


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3552,127.0898,0.8597,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.36
Calinski-Harabasz Index: 127.09
Davies-Bouldin Index: 0.86


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3632,115.7714,0.7945,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.36
Calinski-Harabasz Index: 115.77
Davies-Bouldin Index: 0.79


In [16]:
Clustering = setup(df, transformation = True,transformation_method = 'yeo-johnson')

Unnamed: 0,Description,Value
0,Session id,5818
1,Original data shape,"(440, 8)"
2,Transformed data shape,"(440, 8)"
3,Numeric features,8
4,Preprocess,True
5,Imputation type,simple
6,Numeric imputation,mean
7,Categorical imputation,mode
8,Transformation,True
9,Transformation method,yeo-johnson


In [17]:
X_transformed = get_config('X_transformed')

In [18]:
X_transformed

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,0.175221,334.331131,51.417803,12.627966,16.321114,6.865346,9.898261,15.099508
1,0.175221,334.331131,42.744825,12.657043,17.042362,10.573007,10.222421,16.203525
2,0.175221,334.331131,41.339338,12.459656,16.369883,11.181489,10.325210,23.048894
3,0.172171,334.331131,52.164618,9.046366,14.623169,13.209915,7.440912,16.230491
4,0.175221,334.331131,61.612680,11.584136,16.173020,12.168787,9.272923,20.947768
...,...,...,...,...,...,...,...,...
435,0.172171,334.331131,67.045314,13.037918,18.698220,14.813103,6.039063,17.085180
436,0.172171,334.331131,73.055859,9.334791,10.297295,12.463501,5.166199,17.346804
437,0.175221,334.331131,53.675704,13.509508,20.887195,8.043217,12.681387,16.404436
438,0.172171,334.331131,48.162513,9.866602,12.902328,9.575766,5.933083,16.933631


In [19]:
model1(X_transformed,'transformed',3)
model2(X_transformed,'transformed',3)
model3(X_transformed,'transformed',3)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.8554,13039.7633,0.3389,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.86
Calinski-Harabasz Index: 13039.76
Davies-Bouldin Index: 0.34


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.8554,13039.7633,0.3389,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.86
Calinski-Harabasz Index: 13039.76
Davies-Bouldin Index: 0.34


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5181,15162.6315,0.6203,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.52
Calinski-Harabasz Index: 15162.63
Davies-Bouldin Index: 0.62


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5109,14291.9587,0.6054,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.51
Calinski-Harabasz Index: 14291.96
Davies-Bouldin Index: 0.61


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4622,14223.6948,0.6969,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.46
Calinski-Harabasz Index: 14223.69
Davies-Bouldin Index: 0.70


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.453,14101.6445,0.7125,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.45
Calinski-Harabasz Index: 14101.64
Davies-Bouldin Index: 0.71


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.8554,13039.7633,0.3389,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.86
Calinski-Harabasz Index: 13039.76
Davies-Bouldin Index: 0.34


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5109,14291.9587,0.6054,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.51
Calinski-Harabasz Index: 14291.96
Davies-Bouldin Index: 0.61


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4435,13978.0867,0.7249,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.44
Calinski-Harabasz Index: 13978.09
Davies-Bouldin Index: 0.72


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.8554,13039.7633,0.3389,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.86
Calinski-Harabasz Index: 13039.76
Davies-Bouldin Index: 0.34


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5109,14291.9587,0.6054,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.51
Calinski-Harabasz Index: 14291.96
Davies-Bouldin Index: 0.61


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4435,13978.0867,0.7249,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.44
Calinski-Harabasz Index: 13978.09
Davies-Bouldin Index: 0.72


In [20]:
Clustering = setup(df, pca = True,pca_method = 'linear')

Unnamed: 0,Description,Value
0,Session id,532
1,Original data shape,"(440, 8)"
2,Transformed data shape,"(440, 8)"
3,Numeric features,8
4,Preprocess,True
5,Imputation type,simple
6,Numeric imputation,mean
7,Categorical imputation,mode
8,PCA,True
9,PCA method,linear


In [21]:
X_transformed = get_config('X_transformed')

In [22]:
X_transformed

Unnamed: 0,pca0,pca1,pca2,pca3,pca4,pca5,pca6,pca7
0,650.022120,1585.519107,-95.390654,4540.780488,-356.637113,226.711829,-0.436434,-0.612153
1,-4426.804984,4042.451521,1534.804735,2567.655666,-44.394277,468.937999,-0.470410,-0.566833
2,-4841.998710,2578.762188,3801.384780,2273.494345,5245.385443,-2141.123348,-0.409387,-0.555028
3,990.346444,-6279.806002,1396.959816,-3310.006069,318.152343,247.816339,-0.485615,0.149400
4,10657.998731,-2159.725789,-20.042994,172.715394,3188.717127,-510.145118,-0.412244,-0.739625
...,...,...,...,...,...,...,...,...
435,20140.848491,6306.877234,5732.321561,-4339.060113,-1291.338631,5287.106826,-0.453961,0.152358
436,25912.249956,-11757.647089,-3275.758886,524.227272,316.064131,-1336.409531,-0.360522,0.033987
437,4555.114978,26201.758616,-5887.432918,-2082.906872,-29.795798,-1030.682166,-0.436917,0.067545
438,-2734.370912,-7070.775343,-790.703024,1344.547892,1448.412729,-219.126151,-0.425550,0.204710


In [23]:
model1(X_transformed,'Pca',4)
model2(X_transformed,'Pca',4)
model3(X_transformed,'Pca',4)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4784,210.1526,0.8895,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.48
Calinski-Harabasz Index: 210.15
Davies-Bouldin Index: 0.89


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.375,183.3597,0.9783,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.38
Calinski-Harabasz Index: 183.36
Davies-Bouldin Index: 0.98


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3866,207.8197,0.9543,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.39
Calinski-Harabasz Index: 207.82
Davies-Bouldin Index: 0.95


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3483,185.2559,0.9701,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.35
Calinski-Harabasz Index: 185.26
Davies-Bouldin Index: 0.97


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.369,213.9071,0.8906,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.37
Calinski-Harabasz Index: 213.91
Davies-Bouldin Index: 0.89


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3522,196.954,0.9106,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.35
Calinski-Harabasz Index: 196.95
Davies-Bouldin Index: 0.91


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.375,183.3597,0.9783,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.38
Calinski-Harabasz Index: 183.36
Davies-Bouldin Index: 0.98


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3483,185.2559,0.9701,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.35
Calinski-Harabasz Index: 185.26
Davies-Bouldin Index: 0.97


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3522,196.954,0.9106,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.35
Calinski-Harabasz Index: 196.95
Davies-Bouldin Index: 0.91


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.375,183.3597,0.9783,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.38
Calinski-Harabasz Index: 183.36
Davies-Bouldin Index: 0.98


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3483,185.2559,0.9701,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.35
Calinski-Harabasz Index: 185.26
Davies-Bouldin Index: 0.97


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3522,196.954,0.9106,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.35
Calinski-Harabasz Index: 196.95
Davies-Bouldin Index: 0.91


In [24]:
Clustering = setup(df, transformation = True, normalize = True,
      normalize_method = 'zscore', transformation_method = 'yeo-johnson')

Unnamed: 0,Description,Value
0,Session id,8539
1,Original data shape,"(440, 8)"
2,Transformed data shape,"(440, 8)"
3,Numeric features,8
4,Preprocess,True
5,Imputation type,simple
6,Numeric imputation,mean
7,Categorical imputation,mode
8,Transformation,True
9,Transformation method,yeo-johnson


In [25]:
X_transformed = get_config('X_transformed')

In [26]:
X_transformed

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,1.448652,0.622074,0.413012,0.979454,0.394901,-1.457973,0.620530,0.338074
1,1.448652,0.622074,-0.110911,0.995243,0.628376,0.080081,0.751556,0.602222
2,1.448652,0.622074,-0.195814,0.888063,0.410688,0.332499,0.793104,2.240051
3,-0.690297,0.622074,0.458126,-0.965336,-0.154740,1.173953,-0.372736,0.608674
4,1.448652,0.622074,1.028870,0.412660,0.346961,0.742061,0.367767,1.737333
...,...,...,...,...,...,...,...,...
435,-0.690297,0.622074,1.357048,1.202056,1.164393,1.839006,-0.939366,0.813167
436,-0.690297,0.622074,1.720136,-0.808722,-1.555067,0.864317,-1.292179,0.875764
437,1.448652,0.622074,0.549408,1.458127,1.872985,-0.969355,1.745475,0.650292
438,-0.690297,0.622074,0.216364,-0.519952,-0.711793,-0.333606,-0.982203,0.776907


In [27]:
model1(X_transformed,'T+N',5)
model2(X_transformed,'T+N',5)
model3(X_transformed,'T+N',5)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2228,153.5484,1.6406,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.22
Calinski-Harabasz Index: 153.55
Davies-Bouldin Index: 1.64


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2288,138.5574,1.5189,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.23
Calinski-Harabasz Index: 138.56
Davies-Bouldin Index: 1.52


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2469,136.3423,1.4233,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.25
Calinski-Harabasz Index: 136.34
Davies-Bouldin Index: 1.42


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2425,127.846,1.4576,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.24
Calinski-Harabasz Index: 127.85
Davies-Bouldin Index: 1.46


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2168,121.1435,1.5866,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.22
Calinski-Harabasz Index: 121.14
Davies-Bouldin Index: 1.59


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2073,112.2108,1.493,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.21
Calinski-Harabasz Index: 112.21
Davies-Bouldin Index: 1.49


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2112,121.1747,1.6329,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.21
Calinski-Harabasz Index: 121.17
Davies-Bouldin Index: 1.63


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1935,111.7941,1.6806,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.19
Calinski-Harabasz Index: 111.79
Davies-Bouldin Index: 1.68


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1786,104.1316,1.5252,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.18
Calinski-Harabasz Index: 104.13
Davies-Bouldin Index: 1.53


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2112,121.1747,1.6329,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.21
Calinski-Harabasz Index: 121.17
Davies-Bouldin Index: 1.63


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1935,111.7941,1.6806,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.19
Calinski-Harabasz Index: 111.79
Davies-Bouldin Index: 1.68


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1786,104.1316,1.5252,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.18
Calinski-Harabasz Index: 104.13
Davies-Bouldin Index: 1.53


In [28]:
Clustering = setup(df, transformation = True, normalize = True, pca = True,
      normalize_method = 'zscore', transformation_method = 'yeo-johnson',
      pca_method = 'linear')

Unnamed: 0,Description,Value
0,Session id,2819
1,Original data shape,"(440, 8)"
2,Transformed data shape,"(440, 8)"
3,Numeric features,8
4,Preprocess,True
5,Imputation type,simple
6,Numeric imputation,mean
7,Categorical imputation,mode
8,Transformation,True
9,Transformation method,yeo-johnson


In [29]:
X_transformed = get_config('X_transformed')

In [30]:
X_transformed

Unnamed: 0,pca0,pca1,pca2,pca3,pca4,pca5,pca6,pca7
0,1.867977,-0.399413,-0.870010,0.273038,-1.129105,0.356084,-0.671648,-0.251674
1,1.947076,0.340402,-0.585761,-0.084543,0.261604,0.509164,-0.470332,-0.104946
2,2.070192,1.294152,-0.592123,-1.275821,-0.038995,1.066407,-0.280525,-0.286942
3,-1.128021,1.153129,-0.529434,-0.249043,0.488296,0.339359,0.709219,0.340170
4,1.326875,1.886215,-0.712467,-0.158218,-0.206842,1.240681,-0.323456,0.036063
...,...,...,...,...,...,...,...,...
435,0.115335,2.651520,-0.570216,0.145512,0.615728,-1.183592,-0.419890,1.041588
436,-2.267865,1.833323,-0.864458,0.057388,-0.556515,0.566372,-0.333775,-0.285650
437,3.373709,0.246154,-0.728390,0.482113,-0.744117,-0.088705,0.234669,0.093254
438,-1.240739,0.316193,-0.752327,-0.820105,-0.679749,0.076944,-0.103408,0.101224


In [31]:
model1(X_transformed,'T+N+Pca',6)
model2(X_transformed,'T+N+Pca',6)
model3(X_transformed,'T+N+Pca',6)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2219,153.5307,1.6415,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.22
Calinski-Harabasz Index: 153.53
Davies-Bouldin Index: 1.64


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2288,138.5574,1.5189,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.23
Calinski-Harabasz Index: 138.56
Davies-Bouldin Index: 1.52


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2472,136.3349,1.4208,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.25
Calinski-Harabasz Index: 136.33
Davies-Bouldin Index: 1.42


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2425,127.846,1.4576,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.24
Calinski-Harabasz Index: 127.85
Davies-Bouldin Index: 1.46


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2179,121.2251,1.5849,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.22
Calinski-Harabasz Index: 121.23
Davies-Bouldin Index: 1.58


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2073,112.2108,1.493,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.21
Calinski-Harabasz Index: 112.21
Davies-Bouldin Index: 1.49


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2112,121.1747,1.6329,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.21
Calinski-Harabasz Index: 121.17
Davies-Bouldin Index: 1.63


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1935,111.7941,1.6806,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.19
Calinski-Harabasz Index: 111.79
Davies-Bouldin Index: 1.68


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1786,104.1316,1.5252,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.18
Calinski-Harabasz Index: 104.13
Davies-Bouldin Index: 1.53


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2112,121.1747,1.6329,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.21
Calinski-Harabasz Index: 121.17
Davies-Bouldin Index: 1.63


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1935,111.7941,1.6806,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.19
Calinski-Harabasz Index: 111.79
Davies-Bouldin Index: 1.68


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.1786,104.1316,1.5252,0,0,0


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Silhouette Score: 0.18
Calinski-Harabasz Index: 104.13
Davies-Bouldin Index: 1.53


# FINAL CSV FILE

In [32]:
import pandas as pd
import glob
csv_files = glob.glob('*.csv')
dfs = []
for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)
combined_df = pd.concat(dfs, ignore_index=True)
combined_df.to_csv('Result.csv',index = False)

In [33]:
combined_df

Unnamed: 0,Method,Model,Num_Clusters,Silhouette,CH_Index,DB_Index,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,T+N,hclust,3.0,0.211211,121.174683,1.632864,,,,,,,,
1,T+N,hclust,4.0,0.193459,111.794140,1.680607,,,,,,,,
2,T+N,hclust,5.0,0.178556,104.131563,1.525158,,,,,,,,
3,Pca,hclust,3.0,0.375016,183.359667,0.978255,,,,,,,,
4,Pca,hclust,4.0,0.348269,185.255916,0.970075,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
489,T+N+Pca,KMeans,4.0,0.247214,136.334858,1.420790,,,,,,,,
490,T+N+Pca,KMeans,5.0,0.217894,121.225102,1.584888,,,,,,,,
491,Normalized,hclust,3.0,0.635610,79.920337,0.572897,,,,,,,,
492,Normalized,hclust,4.0,0.355197,127.089763,0.859689,,,,,,,,


In [34]:
kmeans = combined_df[combined_df['Model']=='KMeans'].copy()
kmeans.drop(columns=['Model'],inplace = True)
print(kmeans)
kmeans.to_csv('KMEANS.csv',index = False)

                 Method  Num_Clusters  Silhouette      CH_Index  DB_Index  \
449                 Pca           3.0    0.478351    210.152559  0.889509   
450                 Pca           4.0    0.386631    207.819678  0.954347   
451                 Pca           5.0    0.369021    213.907067  0.890608   
452  No Data Processing           3.0    0.478351    210.152559  0.889509   
453  No Data Processing           4.0    0.386631    207.819678  0.954347   
454  No Data Processing           5.0    0.383296    215.057837  0.902040   
464          Normalized           3.0    0.356769    139.349412  1.173637   
465          Normalized           4.0    0.347824    130.930033  1.237737   
466          Normalized           5.0    0.352924    137.952305  1.148050   
470         transformed           3.0    0.855442  13039.763275  0.338898   
471         transformed           4.0    0.518134  15162.631487  0.620288   
472         transformed           5.0    0.462171  14223.694793  0.696933   

In [35]:
hclust = combined_df[combined_df['Model']=='hclust'].copy()
hclust.drop(columns=['Model'],inplace = True)
print(hclust)
hclust.to_csv('HCLUST.csv',index = False)

                 Method  Num_Clusters  Silhouette      CH_Index  DB_Index  \
0                   T+N           3.0    0.211211    121.174683  1.632864   
1                   T+N           4.0    0.193459    111.794140  1.680607   
2                   T+N           5.0    0.178556    104.131563  1.525158   
3                   Pca           3.0    0.375016    183.359667  0.978255   
4                   Pca           4.0    0.348269    185.255916  0.970075   
5                   Pca           5.0    0.352228    196.953956  0.910621   
458             T+N+Pca           3.0    0.211211    121.174683  1.632864   
459             T+N+Pca           4.0    0.193459    111.794140  1.680607   
460             T+N+Pca           5.0    0.178556    104.131563  1.525158   
476  No Data Processing           3.0    0.375016    183.359667  0.978255   
477  No Data Processing           4.0    0.348269    185.255916  0.970075   
478  No Data Processing           5.0    0.352228    196.953956  0.910621   

In [36]:
birch = combined_df[combined_df['Model']=='birch'].copy()
birch.drop(columns=['Model'],inplace = True)
print(birch)
birch.to_csv('BIRCH.csv',index = False)

                 Method  Num_Clusters  Silhouette      CH_Index  DB_Index  \
446         transformed           3.0    0.855442  13039.763275  0.338898   
447         transformed           4.0    0.510934  14291.958744  0.605433   
448         transformed           5.0    0.443546  13978.086706  0.724891   
455                 Pca           3.0    0.375016    183.359667  0.978255   
456                 Pca           4.0    0.348269    185.255916  0.970075   
457                 Pca           5.0    0.352228    196.953956  0.910621   
461             T+N+Pca           3.0    0.211211    121.174683  1.632864   
462             T+N+Pca           4.0    0.193459    111.794140  1.680607   
463             T+N+Pca           5.0    0.178556    104.131563  1.525158   
467                 T+N           3.0    0.211211    121.174683  1.632864   
468                 T+N           4.0    0.193459    111.794140  1.680607   
469                 T+N           5.0    0.178556    104.131563  1.525158   