In [104]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
import hdbscan
from sklearn.manifold import TSNE
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.utils import plot_model
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples


### DATA PREPARATION

In [105]:
# Load the SKU attribute table (v3 upload file).
# v3 differs from v2 in that the specific regions were replaced with general regions.
cmd_data = pd.read_csv(
    '/data2/home/prasannaiyer/Projects/SKU_Cluster_Local/SKU_Clusters/Data/cmd_attributes_v3_upload.csv',
    encoding = 'latin-1',
    decimal = '.',
    thousands = ',',
)

In [106]:
# Fold the 'AW' and 'CW' p_bu codes into 'AG' and 'CE' respectively; other codes are untouched.
bu_code_aliases = {'AW': 'AG', 'CW': 'CE'}
cmd_data['p_bu'] = cmd_data['p_bu'].replace(bu_code_aliases)

In [107]:
# Number of rows in the attribute table; used as the denominator for percentage labels in later plots.
sku_count = len(cmd_data)

In [108]:
# Columns fed to the clustering / autoencoder experiments, grouped by attribute family.
attr_input = [
    'Attr1_Str_qu',
    'Attr2_OpPr_qu',
    'Attr3_Costamount_norm',
    'Attr4_RdDi_norm',
    'Attr5_BrDi_norm',
    'Attr5_Str_Type1', 'Attr5_Str_Type2', 'Attr5_Str_Type3',
    'Attr6_Major_Type1', 'Attr6_Major_Type2',
    'Attr9_Frctn0', 'Attr9_Frctn1',
    'Attr10_Snsr0', 'Attr10_Snsr1',
]

In [109]:
# Same columns as the full input list but without 'Attr3_Costamount_norm'.
attr_input_cost_model = [
    'Attr1_Str_qu',
    'Attr2_OpPr_qu',
    'Attr4_RdDi_norm',
    'Attr5_BrDi_norm',
    'Attr5_Str_Type1', 'Attr5_Str_Type2', 'Attr5_Str_Type3',
    'Attr6_Major_Type1', 'Attr6_Major_Type2',
    'Attr9_Frctn0', 'Attr9_Frctn1',
    'Attr10_Snsr0', 'Attr10_Snsr1',
]

# Reduced attribute set: no Attr5_Str_Type*, Attr6_Major_Type* or Attr10_Snsr* columns.
attr_input_subset = [
    'Attr1_Str_qu',
    'Attr2_OpPr_qu',
    'Attr3_Costamount_norm',
    'Attr4_RdDi_norm',
    'Attr5_BrDi_norm',
    'Attr9_Frctn0', 'Attr9_Frctn1',
]

# NOTE(review): currently identical to attr_input_subset - confirm whether this list was meant to differ.
attr_input_subset_cost = [
    'Attr1_Str_qu',
    'Attr2_OpPr_qu',
    'Attr3_Costamount_norm',
    'Attr4_RdDi_norm',
    'Attr5_BrDi_norm',
    'Attr9_Frctn0', 'Attr9_Frctn1',
]

In [110]:
class auto_encoder:
    """Dense autoencoder whose encoder widths are given by a list of layer sizes.

    The encoder stacks one Dense + LeakyReLU pair per entry of ``ae_layers``; the
    last entry is the bottleneck. The decoder mirrors the encoder by walking
    ``ae_layers`` backwards from the second-to-last entry, and a final linear
    Dense layer maps back to ``output_data.shape[1]`` columns. The model is
    compiled with the Adam optimizer and MSE loss.

    Parameters
    ----------
    input_data : 2-D array-like with a ``shape`` attribute
        Training inputs; ``shape[1]`` fixes the width of the input layer.
    output_data : 2-D array-like with a ``shape`` attribute
        Training targets; ``shape[1]`` fixes the width of the output layer.
    ae_layers : sequence of int
        Encoder layer widths, bottleneck last.
    alpha_ae : float, default 0.2
        Value passed as ``alpha`` to every LeakyReLU layer.
        NOTE(review): newer Keras releases name this argument ``negative_slope`` -
        confirm against the installed version.

    Attributes
    ----------
    encoder_model : Model mapping the input to the bottleneck activations.
    ae_model : compiled Model mapping the input to the reconstruction.
    autoencoder : same object as ``ae_model``.
    ae_model_hist, training_loss : set by ``autoencoder_fit``.
    embeddings : set by ``predict_embeddings``.
    """

    def __init__(self, input_data, output_data, ae_layers, alpha_ae = 0.2):
        self.input_data = input_data
        self.output_data = output_data
        self.ae_layers = ae_layers
        self.alpha_ae = alpha_ae
        self.autoencoder = self.create_autoencoder()

    def create_autoencoder(self):
        """Build the encoder and the full autoencoder, compile the latter and return it."""
        encoder_input = Input(shape=(self.input_data.shape[1],))
        nn_layer = encoder_input
        # Encoder: one Dense + LeakyReLU pair per configured width
        for i, units in enumerate(self.ae_layers):
            nn_layer = Dense(units, name = f'encoder_layer_{i + 1}')(nn_layer)
            nn_layer = LeakyReLU(alpha=self.alpha_ae)(nn_layer)
        # The encoder model ends at the bottleneck activations
        self.encoder_model = Model(encoder_input, nn_layer)
        # Decoder: mirror the encoder widths, skipping the bottleneck itself
        for i in range(len(self.ae_layers) - 2, -1, -1):
            nn_layer = Dense(self.ae_layers[i], name = f'decoder_layer_{i + 1}')(nn_layer)
            nn_layer = LeakyReLU(alpha=self.alpha_ae)(nn_layer)
        # Linear output layer sized to the target data
        autoencoder_output = Dense(self.output_data.shape[1], activation = 'linear')(nn_layer)
        self.ae_model = Model(encoder_input, autoencoder_output)
        self.ae_model.compile(optimizer='adam', loss='mse')
        # Returned so that __init__ stores the model in self.autoencoder instead of None
        return self.ae_model

    def autoencoder_fit(self, epochs = 500, batch_size = 32, verbose = 1):
        """Train the autoencoder on the stored data.

        Stores the Keras history in ``ae_model_hist`` and the loss of the last
        epoch in ``training_loss``.
        """
        self.ae_model_hist = self.ae_model.fit(self.input_data, self.output_data, epochs = epochs, batch_size = batch_size, verbose = verbose)
        self.training_loss = self.ae_model_hist.history['loss'][-1]

    def predict_embeddings(self, input_data):
        """Encode ``input_data`` with the encoder model, store the result in ``embeddings`` and return it."""
        self.embeddings = self.encoder_model.predict(input_data)
        return self.embeddings

    def plot_training_loss(self):
        """Plot the per-epoch training loss recorded by ``autoencoder_fit`` (epochs numbered from 1)."""
        losses = self.ae_model_hist.history['loss']
        plt.figure(figsize = (10, 6))
        ax = sns.lineplot(x = range(1, len(losses) + 1), y = losses)
        ax.set_xlabel('Epochs')
        ax.set_ylabel('Training Loss')
        ax.set_title('Training Loss vs Epochs')
        ax.grid(linestyle='-', linewidth='0.5', color='red')

    def print_loss(self, interval = 500):
        """Print the training loss every ``interval`` epochs, at the last epoch, and the minimum.

        Epochs are reported 1-based throughout, matching ``plot_training_loss``.

        Raises
        ------
        ValueError
            If ``interval`` is smaller than 1.
        """
        if interval < 1:
            raise ValueError(f'interval must be a positive integer, got {interval}')
        losses = self.ae_model_hist.history['loss']
        epoch_count = len(losses)
        if epoch_count == 0:
            print('No training loss recorded')
            return
        for i in range(0, epoch_count, interval):
            print(f'Loss at epoch {i + 1}: {losses[i]:.4f}')
        # Report the last epoch unless the loop above already printed it
        if (epoch_count - 1) % interval != 0:
            print(f'Loss at epoch {epoch_count}: {losses[-1]:.4f}')
        # Minimum loss and the (1-based) epoch at which it occurs
        print(f'Minimum loss: {min(losses):.4f} at epoch {np.argmin(losses) + 1}')
    
    


In [111]:
def create_clusters(input_data, cluster_count = 6, cluster_algorithm = 'k-means'):
    """Cluster ``input_data`` and return one integer label per row.

    Parameters
    ----------
    input_data : 2-D array-like
        Rows to cluster.
    cluster_count : int, default 6
        Number of clusters for k-means. Ignored for hdbscan, which does not
        take a cluster count.
    cluster_algorithm : str, default 'k-means'
        'k-means' (also accepted: 'kmeans') or 'hdbscan'; matching ignores
        case, '-' and '_'.

    Returns
    -------
    Array of cluster labels as produced by the estimator's ``fit_predict``.

    Raises
    ------
    ValueError
        If ``cluster_algorithm`` is not one of the supported names.
    """
    # Validate the algorithm name before any estimator is built
    algorithm_key = str(cluster_algorithm).lower().replace('-', '').replace('_', '')
    if algorithm_key not in ('kmeans', 'hdbscan'):
        raise ValueError(
            f"Unsupported cluster_algorithm {cluster_algorithm!r}; expected 'k-means' or 'hdbscan'")
    if algorithm_key == 'hdbscan':
        # Library defaults - TODO confirm the intended HDBSCAN hyper-parameters
        return hdbscan.HDBSCAN().fit_predict(input_data)
    # Fixed random_state keeps the k-means labels reproducible across runs
    kmeans = KMeans(cluster_count, random_state = 42)
    return kmeans.fit_predict(input_data)
   


In [112]:
def run_experiments(experiment_details, input_data, cluster_attr, experiment_results,
                    epochs = 1000, batch_size = 32, verbose = 0):
    """Train one autoencoder per experiment, cluster its embeddings and score the clusters.

    For every experiment an ``auto_encoder`` is trained to reconstruct
    ``input_data[cluster_attr]``, the encoder output is clustered with
    ``create_clusters`` and the silhouette score of the resulting labels is
    computed on the original (un-encoded) attributes.

    Parameters
    ----------
    experiment_details : dict
        Maps experiment name -> ``[ae_layers, cluster_count]``.
    input_data : pandas.DataFrame
        Source table. It is modified in place: a ``'<experiment>_labels'``
        column is added for every experiment.
    cluster_attr : list of str
        Columns of ``input_data`` used as model input and for scoring.
    experiment_results : pandas.DataFrame
        Table with three columns (experiment, training loss, silhouette score);
        one row is appended per experiment, in place.
    epochs, batch_size, verbose :
        Passed to ``auto_encoder.autoencoder_fit``; defaults are 1000, 32 and 0.

    Returns
    -------
    tuple
        ``(experiment_results, input_data)`` - the same objects that were passed in.
    """
    print(len(experiment_details))
    for experiment, (ae_layers, cluster_count) in experiment_details.items():
        print(f'Running experiment: {experiment} with layers: {ae_layers} and cluster count: {cluster_count}')
        # Train an autoencoder to reconstruct the selected attributes
        ae = auto_encoder(input_data[cluster_attr], input_data[cluster_attr], ae_layers)
        ae.autoencoder_fit(epochs = epochs, batch_size = batch_size, verbose = verbose)
        training_loss = ae.training_loss
        # Encode the attributes and cluster in the embedding space
        ae.predict_embeddings(input_data[cluster_attr])
        encoded_data = ae.embeddings
        label_str = str(experiment) + '_labels'
        input_data[label_str] = create_clusters(encoded_data, cluster_count = cluster_count)
        # The silhouette score is evaluated on the original attributes, not on the embeddings
        silh_score_experiment = silhouette_score(input_data[cluster_attr], input_data[label_str])
        experiment_results.loc[len(experiment_results)] = [experiment, training_loss, silh_score_experiment]
    return experiment_results, input_data
        

        
        

In [113]:
# Experiment grid: Experiment_<m>.<n> pairs bottleneck size (m = 2, 3, 4 -> 4, 6, 8 units)
# with k-means cluster count (n = 1, 2, 3 -> 4, 6, 8 clusters).
# Each value is [encoder layer widths, cluster count].
input_size = len(attr_input)
experiment_details = {
    f'Experiment_{major}.{minor}': [[input_size*2, 32, 16, bottleneck], clusters]
    for major, bottleneck in enumerate((4, 6, 8), start = 2)
    for minor, clusters in enumerate((4, 6, 8), start = 1)
}
# Two-experiment subset for quick trial runs
experiment_test = {
    'Experiment_2.1': [[input_size*2, 32, 16, 4], 4],
    'Experiment_2.2': [[input_size*2, 32, 16, 4], 6],
}
# Empty results table; one row is appended per experiment
experiment_results = pd.DataFrame(columns = ['Experiment', 'Training Loss', 'Silhouette Score'])

In [114]:
# Experiment 1.x - baseline: k-means directly on the input attributes (no autoencoder)
kmeans_plain_cluster_count = [4, 6, 8]
for experiment_count, cluster_count in enumerate(kmeans_plain_cluster_count, start = 1):
    experiment_name = f'Experiment_1.{experiment_count}'
    # The label column is overwritten on every pass, so it ends up holding the last cluster count
    cmd_data['kmeans_plain_labels'] = create_clusters(cmd_data[attr_input], cluster_count)
    silh_score_experiment = silhouette_score(cmd_data[attr_input], cmd_data['kmeans_plain_labels'])
    # Training loss is recorded as 0 because no model is trained for the baseline
    experiment_results.loc[len(experiment_results)] = [experiment_name, 0, silh_score_experiment]


In [115]:
# Run the full autoencoder grid; pass experiment_test instead of experiment_details
# for a quick two-experiment run.
# run_experiments returns the frame it was given, so cmd_data_output is cmd_data itself.
experiment_results, cmd_data_output = run_experiments(
    experiment_details = experiment_details,
    input_data = cmd_data,
    cluster_attr = attr_input,
    experiment_results = experiment_results,
)

9
Running experiment: Experiment_2.1 with layers: [28, 32, 16, 4] and cluster count: 4
Running experiment: Experiment_2.2 with layers: [28, 32, 16, 4] and cluster count: 6
Running experiment: Experiment_2.3 with layers: [28, 32, 16, 4] and cluster count: 8
Running experiment: Experiment_3.1 with layers: [28, 32, 16, 6] and cluster count: 4
Running experiment: Experiment_3.2 with layers: [28, 32, 16, 6] and cluster count: 6
Running experiment: Experiment_3.3 with layers: [28, 32, 16, 6] and cluster count: 8
Running experiment: Experiment_4.1 with layers: [28, 32, 16, 8] and cluster count: 4
Running experiment: Experiment_4.2 with layers: [28, 32, 16, 8] and cluster count: 6
Running experiment: Experiment_4.3 with layers: [28, 32, 16, 8] and cluster count: 8


In [116]:
# Display the accumulated results table: experiment name, training loss and silhouette score per row
experiment_results

Unnamed: 0,Experiment,Training Loss,Silhouette Score
0,Experiment_1.1,0.0,0.283616
1,Experiment_1.2,0.0,0.32342
2,Experiment_1.3,0.0,0.332018
3,Experiment_2.1,0.001897,0.273954
4,Experiment_2.2,0.001697,0.224756
5,Experiment_2.3,0.0022,0.274777
6,Experiment_3.1,0.00025,0.306873
7,Experiment_3.2,0.000194,0.292352
8,Experiment_3.3,0.000306,0.280056
9,Experiment_4.1,0.000111,0.248976


In [None]:
# Standalone autoencoder on the full attribute list: encoder widths 2x input -> 32 -> 16 -> 8 (bottleneck)
input_size = len(attr_input)
ae_layers = [input_size*2, 32, 16, 8]
ae_model = auto_encoder(
    input_data = cmd_data[attr_input],
    output_data = cmd_data[attr_input],
    ae_layers = ae_layers,
)

In [None]:
# Train the standalone autoencoder; the history and final loss are stored on ae_model
ae_model.autoencoder_fit(
    epochs = 500,
    batch_size = 32,
    verbose = 1,
)

In [None]:
# Summarise the recorded training loss using print_loss' default reporting interval
ae_model.print_loss()

In [None]:
# k-means labels on the input attributes.
# Uses create_clusters' default cluster_count - pass cluster_count explicitly if a different number is intended.
cmd_data['kmeans_labels'] = create_clusters(cmd_data[attr_input])

In [None]:
# HDBSCAN labels on the input attributes (the library uses -1 for points it treats as noise).
# Library defaults are used - TODO confirm the intended HDBSCAN hyper-parameters.
cmd_data['hdbscan_labels'] = hdbscan.HDBSCAN().fit_predict(cmd_data[attr_input])

In [None]:
# Inspect the column names, including the label columns added by the cells above
cmd_data.columns

In [None]:
# Row count per p_bu value; used as the per-BU denominator for percentage labels
rows_by_bu = cmd_data.groupby('p_bu')
sku_count_bu = rows_by_bu['p_bu'].count()

In [None]:
def _descending_count_order(values):
    """Return the distinct values of a Series ordered by descending frequency."""
    return values.value_counts().sort_values(ascending = False).index


def _style_count_axis(axis, title, xlabel, ylabel = 'Count'):
    """Apply the title, axis labels and red grid shared by every panel."""
    axis.set_title(title)
    axis.set_xlabel(xlabel)
    axis.set_ylabel(ylabel)
    axis.grid(linestyle='-', linewidth='0.5', color='red')


def _label_count_bars(axis, denominator, first_container_only = False):
    """Annotate bars with '<count> (<share>%)' where share = count / denominator * 100."""
    containers = axis.containers[:1] if first_container_only else axis.containers
    for container in containers:
        counts = container.datavalues
        axis.bar_label(container, labels = \
                       [f'{x} ({y:.2f}%)' for x, y in zip(counts, counts/denominator*100)])


def plot_cluster_details(cmd_data, cluster_col_name, sku_count_bu):
    """Draw four two-panel figures describing how a cluster labelling splits the SKUs.

    Figures, in order:
      1. cluster sizes; cluster sizes within each p_bu
      2. cluster sizes with p_bu as hue; cluster sizes with p_region as hue
      3. cluster sizes for p_bu = 'AG'; cluster sizes for p_bu = 'CE'
      4. cluster sizes by p_region for p_bu = 'AG'; the same for p_bu = 'CE'

    Every bar is annotated with its count and a percentage.

    Parameters
    ----------
    cmd_data : pandas.DataFrame
        Must contain 'p_bu', 'p_region' and the ``cluster_col_name`` column.
    cluster_col_name : str
        Name of the column holding the cluster labels.
    sku_count_bu : pandas.Series
        Row counts indexed by p_bu; must contain the keys 'AG' and 'CE'.
        Its sum is the percentage denominator for the all-BU panels with hue.
    """
    cluster_labels = cmd_data[cluster_col_name]
    cluster_order = _descending_count_order(cluster_labels)
    total_sku_count = sku_count_bu.sum()
    business_units = ('AG', 'CE')
    bu_frames = {bu: cmd_data[cmd_data['p_bu'] == bu] for bu in business_units}

    ######## CLUSTER COUNT BY CLUSTER AND CLUSTER COUNT BY p_bu ########
    fig, ax = plt.subplots(1, 2, figsize = (20, 6))
    sns.countplot(x = cluster_labels, ax = ax[0], order = cluster_order)
    _style_count_axis(ax[0], 'Cluster Count', 'Cluster')
    # Percentages here are relative to the number of labelled rows
    _label_count_bars(ax[0], cluster_labels.value_counts().sum(), first_container_only = True)
    sns.countplot(hue = cluster_labels, x = cmd_data['p_bu'], ax = ax[1])
    _label_count_bars(ax[1], total_sku_count)
    # x axis holds p_bu on this panel, so it is labelled accordingly
    _style_count_axis(ax[1], 'Cluster Count by p_bu', 'p_bu')

    ######## PLOT OF CLUSTER COUNT WITH p_bu AND p_region AS HUE #########
    fig, ax = plt.subplots(1, 2, figsize = (20, 6))
    for axis, hue_col in zip(ax, ('p_bu', 'p_region')):
        sns.countplot(hue = cmd_data[hue_col], x = cluster_labels, ax = axis, order = cluster_order)
        _style_count_axis(axis, f'Cluster Count with {hue_col} as hue', 'Cluster')
        _label_count_bars(axis, total_sku_count)

    ######## PLOT OF CLUSTER COUNT FOR p_bu = AG AND p_bu = CE #########
    fig, ax = plt.subplots(1, 2, figsize = (20, 6))
    for axis, bu in zip(ax, business_units):
        bu_labels = bu_frames[bu][cluster_col_name]
        sns.countplot(x = bu_labels, ax = axis, order = _descending_count_order(bu_labels))
        _style_count_axis(axis, f'Cluster Count for p_bu = {bu}', 'Cluster')
        # Percentages are relative to the SKU count of this business unit
        _label_count_bars(axis, sku_count_bu[bu], first_container_only = True)

    ######## PLOT OF CLUSTER COUNT FOR EACH p_bu BY REGION #########
    fig, ax = plt.subplots(1, 2, figsize = (20, 6))
    for axis, bu in zip(ax, business_units):
        bu_frame = bu_frames[bu]
        sns.countplot(hue = bu_frame[cluster_col_name], x = bu_frame['p_region'], ax = axis, \
            hue_order = _descending_count_order(bu_frame[cluster_col_name]), \
            order = _descending_count_order(bu_frame['p_region']))
        # x axis holds p_region on these panels, so it is labelled accordingly
        _style_count_axis(axis, f'Cluster Count for p_bu = {bu} by region', 'p_region')
        _label_count_bars(axis, sku_count_bu[bu])



In [None]:
# Cluster breakdown figures for the k-means labelling
plot_cluster_details(
    cmd_data = cmd_data,
    cluster_col_name = 'kmeans_labels',
    sku_count_bu = sku_count_bu,
)

In [None]:
fig = plt.figure(figsize = (20, 6))

# Left panel: size of each k-means cluster, largest first, labelled with count and share of all SKUs
kmeans_cluster_sizes = cmd_data.groupby(by = 'kmeans_labels')['p_nm'].count().sort_values(ascending = False)
ax = kmeans_cluster_sizes.plot(kind = 'bar', ax = fig.add_subplot(121))
size_bars = ax.containers[0]
ax.bar_label(size_bars, labels = [f'{x} ({(x/sku_count)*100:,.0f}%)' for x in size_bars.datavalues])
ax.set(xlabel = 'Cluster', ylabel = 'Count', title = 'Count of KMeans Clusters')
ax.grid(linestyle='-', linewidth='0.5', color='red')

# Right panel: k-means cluster sizes within each BU
ax = sns.countplot(x = 'p_bu', hue = 'kmeans_labels', data = cmd_data, ax = fig.add_subplot(122))
ax.set(xlabel = 'BU', ylabel = 'Count by Cluster', title = 'Count of KMeans Clusters by BU')
ax.grid(linestyle='-', linewidth='0.5', color='red')


In [None]:
# k-means cluster sizes for each BU separately (AG on the left, CE on the right),
# bars ordered by descending size and labelled with count and share of that BU's SKUs
fig = plt.figure(figsize = (20, 6))
for subplot_position, bu in zip((121, 122), ('AG', 'CE')):
    bu_data = cmd_data[cmd_data['p_bu'] == bu]
    cluster_order = bu_data.groupby('kmeans_labels')['kmeans_labels'].count().sort_values(ascending = False).index
    # There is no hue on this plot, so the ordering must go through `order`
    ax = sns.countplot(x = 'kmeans_labels', data = bu_data, order = cluster_order, \
                       ax = fig.add_subplot(subplot_position))
    ax.set_xlabel('Cluster')
    ax.set_ylabel('Count by Cluster')
    ax.set_title(f'Count of KMeans Clusters for {bu}')
    ax.grid(linestyle='-', linewidth='0.5', color='red')
    for bar in ax.containers:
        ax.bar_label(bar, labels = [f'{x} ({(x/sku_count_bu[bu])*100:,.0f}%)' for x in bar.datavalues])


In [None]:
# k-means cluster sizes by region for each BU separately (AG on the left, CE on the right).
# Regions and clusters are both ordered by descending size within the BU;
# bar labels give the count and its share of that BU's SKUs.
fig = plt.figure(figsize = (20, 6))
for subplot_position, bu in zip((121, 122), ('AG', 'CE')):
    bu_data = cmd_data[cmd_data['p_bu'] == bu]
    hue_order = bu_data.groupby('kmeans_labels')['kmeans_labels'].count().sort_values(ascending = False).index
    region_order = bu_data.groupby('p_region')['p_region'].count().sort_values(ascending = False).index
    ax = sns.countplot(x = 'p_region', hue = 'kmeans_labels', data = bu_data, \
                       order = region_order, hue_order = hue_order, \
                       ax = fig.add_subplot(subplot_position))
    # x axis holds p_region here, not the BU
    ax.set_xlabel('Region')
    ax.set_ylabel('Count by Cluster')
    ax.set_title(f'Count of KMeans Clusters for {bu} by Region')
    ax.grid(linestyle='-', linewidth='0.5', color='red')
    for bar in ax.containers:
        ax.bar_label(bar, labels = [f'{x} ({(x/sku_count_bu[bu])*100:,.0f}%)' for x in bar.datavalues])


In [None]:
# seaborn plot of the count of the clusters for p_bu = 'AG' by region


In [None]:
fig = plt.figure(figsize = (20, 6))

# Left panel: size of each hdbscan cluster, largest first, labelled with count and share of all SKUs
ax = cmd_data.groupby(by = 'hdbscan_labels')['p_nm'].count().sort_values(ascending = False).plot(kind = 'bar', ax = fig.add_subplot(121))
ax.bar_label(ax.containers[0], labels = [f'{x} ({(x/sku_count)*100:,.0f}%)' for x in ax.containers[0].datavalues])
ax.set_xlabel('Cluster')
ax.set_ylabel('Count')
ax.set_title('Count of hdbscan Clusters')
ax.grid(linestyle='-', linewidth='0.5', color='red')

# Right panel: hdbscan cluster sizes within each BU, labelled like the k-means figure
ax = sns.countplot(x = 'p_bu', hue = 'hdbscan_labels', data = cmd_data, ax = fig.add_subplot(122))
ax.set_xlabel('BU')
ax.set_ylabel('Count by Cluster')
ax.set_title('Count of hdbscan Clusters by BU')
ax.grid(linestyle='-', linewidth='0.5', color='red')

In [None]:
# Overall silhouette score of each labelling, evaluated on the input attributes
for algorithm_name, label_column in (('k-means', 'kmeans_labels'), ('hdbscan', 'hdbscan_labels')):
    algorithm_score = silhouette_score(cmd_data[attr_input], cmd_data[label_column])
    print(f'Silhouette score for {algorithm_name}: {algorithm_score:.4f}')

In [None]:
# Mean silhouette value per k-means cluster, in ascending label order.
# Iterates over the labels actually present rather than assuming a fixed number of clusters.
sample_silhouette_values = silhouette_samples(cmd_data[attr_input], cmd_data['kmeans_labels'])
kmeans_label_values = cmd_data['kmeans_labels'].to_numpy()
silh_score_per_cluster = [
    sample_silhouette_values[kmeans_label_values == cluster_id].mean()
    for cluster_id in np.unique(kmeans_label_values)
]

silh_score_per_cluster


In [None]:
# Hand-built autoencoder ("m5") with fixed layer sizes:
# input -> 2x input -> 32 -> 16 -> 8 (bottleneck) -> 16 -> 32 -> 2x input -> input.
# Every hidden Dense layer is followed by a LeakyReLU with the layer's default slope;
# the output layer is linear.
input_size = cmd_data[attr_input].shape[1]
l1_size = input_size*2
l2_size = 32
l3_size = 16
bottleneck_size = 8
encoder_input = Input(shape = (input_size, ))
# encoder layer 1 (2x the input width)
encoder_m5_l1 = Dense(l1_size)(encoder_input)
encoder_m5_l1 = LeakyReLU()(encoder_m5_l1)
# encoder layer 2
encoder_m5_l2 = Dense(l2_size)(encoder_m5_l1)
encoder_m5_l2 = LeakyReLU()(encoder_m5_l2)
# encoder layer 3
encoder_m5_l3 = Dense(l3_size)(encoder_m5_l2)
encoder_m5_l3 = LeakyReLU()(encoder_m5_l3)
# encoder bottleneck layer - its activations are the embedding
encoder_m5_output = Dense(bottleneck_size)(encoder_m5_l3)
encoder_m5_output = LeakyReLU()(encoder_m5_output)
# decoder layer 1 (mirrors encoder layer 3)
decoder_m5_l1 = Dense(l3_size)(encoder_m5_output)
decoder_m5_l1 = LeakyReLU()(decoder_m5_l1)
# decoder layer 2 (mirrors encoder layer 2)
decoder_m5_l2 = Dense(l2_size)(decoder_m5_l1)
decoder_m5_l2 = LeakyReLU()(decoder_m5_l2)
# decoder layer 3 (mirrors encoder layer 1)
decoder_m5_l3 = Dense(l1_size)(decoder_m5_l2)
decoder_m5_l3 = LeakyReLU()(decoder_m5_l3)
# decoder output layer: linear reconstruction with the same width as the input
decoder_m5_output = Dense(input_size, activation = 'linear')(decoder_m5_l3)
# define the autoencoder model (input -> reconstruction); it is not compiled in this cell
ae_fixed_m5 = Model(encoder_input, decoder_m5_output)

In [None]:
# Print the layer-by-layer summary of the hand-built autoencoder
ae_fixed_m5.summary()

In [None]:
# Compile with Adam + MSE, then train the model to reconstruct its own input
ae_fixed_m5.compile(loss = 'mse', optimizer = 'adam')
ae_fixed_m5_hist = ae_fixed_m5.fit(
    cmd_data[attr_input],
    cmd_data[attr_input],
    epochs = 500,
    batch_size = 32,
    verbose = 1,
)

In [None]:
# Report the training loss after epochs 100 and 500 and the minimum over the run (4 decimal places)
m5_losses = ae_fixed_m5_hist.history["loss"]
for reported_epoch in (100, 500):
    # history is 0-indexed, so epoch k lives at position k - 1
    print(f'Loss at epoch {reported_epoch}: {m5_losses[reported_epoch - 1]:.4f}')
print(f'Min loss: {min(m5_losses):.4f}')
