# Optimizing k-Means Clustering - Code Notebook Exercise
**Author**: Dr. Yves Staudt

CAS: Machine Learning - Unsupervised Learning

## Loading Packages

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px

## Loading Data

In [2]:
# NOTE: adapt this path to wherever the prepared LEGO dataset is stored.
csv_path = 'lego_dataset_encoded_prepared_selected_feature.csv'
df = pd.read_csv(csv_path)

## Scaling Data

In [3]:
# Set up a min-max scaler (MinMaxScaler with its default arguments).
scaler = MinMaxScaler()

# TODO(exercise): replace `xxx` with the data the scaler should be fitted on;
# fitting lets the scaler learn its scaling parameters.
# NOTE(review): the original comment mentioned a "train set", but no
# train/test split is visible in this notebook — presumably the features
# loaded into `df` are meant here; confirm.
scaler.fit( xxx )

# TODO(exercise): replace `xxx` with the scaled data
# (presumably the output of `scaler.transform(...)` — confirm).
df_scaled = xxx

In [4]:
# Convert the scaled result back into a DataFrame for the rest of the
# notebook (later cells pass `df_scaled` to KMeans.fit and silhouette_score).
# TODO(exercise): replace `xxx`; presumably the scaler returned a NumPy array
# that should be wrapped with the original column names — confirm.

df_scaled = xxx

## Optimizing K-Means

In [5]:
# Collect one result record (a dict) per candidate cluster count.
results_list = []

# Try every k from 2 to 10 (the upper bound of range() is exclusive).
for k in range(2, 11):
    # TODO(exercise): replace `xxx` with the KMeans arguments
    # (presumably `n_clusters=k` among them — confirm).
    km = KMeans(
        xxx
    )    
    km.fit(df_scaled)

    # inertia_ of the fitted model; stored below under the 'distortions' key.
    orig_disp = km.inertia_

    # Record the cluster count, the inertia, and the silhouette score
    # (Euclidean metric) of the fitted labels on the scaled data.
    results_list.append({
        'clusterCount': k,
        'distortions': orig_disp,
        'silhouetteScore': silhouette_score(df_scaled, km.labels_, metric='euclidean')
    })

# Build a DataFrame with one row per tested k.
resultsdf = pd.DataFrame(results_list)

## Representing Results from Optimization

In [None]:
# Display the per-k results table (clusterCount, distortions, silhouetteScore).
resultsdf

In [None]:
# Two panels side by side: distortion per k (left) and silhouette score
# per k (right), both read from `resultsdf`.
panels = [
    ('distortions', 'Within Variance'),
    ('silhouetteScore', 'Silhouette Score'),
]

fig, axs = plt.subplots(1, 2, figsize=(12, 4))
for ax, (metric, ylabel) in zip(axs, panels):
    sns.lineplot(
        x='clusterCount',
        y=metric,
        data=resultsdf,
        ax=ax,
        linestyle='--',
        marker='o',
        color='b',
    )
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel(ylabel)

# Adjust the spacing between the subplots, then render the figure.
plt.tight_layout()
plt.show()

## k-Means Clustering for Optimal k

In [None]:
# TODO(exercise): replace `xxx` with the KMeans arguments for the final model
# (presumably the cluster count chosen from the optimization results — confirm).
km = KMeans(
    xxx
)
# TODO(exercise): replace `xxx` with the data to cluster
# (presumably `df_scaled`, as in the optimization loop — confirm).
km.fit(xxx)

## Cluster Centers

In [None]:
# TODO(exercise): replace `xxx` with the centers of the fitted model
# (presumably derived from `km.cluster_centers_` — confirm).
cluster_centers = xxx
# Display the cluster centers.
cluster_centers

## Interpreting Results

In [10]:
# Attach each observation's cluster label to a *new* DataFrame.
# `DataFrame.assign` returns a new object, so the loaded `df` stays untouched.
# A plain `df_pred = df` would only alias the same object: the
# 'cluster_predict' column would then be written into `df` as well, and
# re-running earlier cells in the same kernel would see the label column
# as if it were a feature.
df_pred = df.assign(cluster_predict=km.labels_)

In [None]:
# Count how many observations fall into each predicted cluster.
cluster_column = df_pred[['cluster_predict']]
cluster_column.groupby('cluster_predict').value_counts()

In [None]:
# Summary statistics (describe) computed separately for each predicted cluster.
grouped_by_cluster = df_pred.groupby('cluster_predict')
grouped_by_cluster.describe()

## Figures

In [13]:
def my_boxplot_fct(data,variable):
    """Box-plot helper for the exercise (body still to be written).

    TODO(exercise): replace the `xxx` placeholder below with the plotting code.

    NOTE(review): the intended behavior is not visible here. Judging by the
    name, this presumably draws a box plot of `variable` taken from `data`
    (possibly split by the 'cluster_predict' column) — confirm.

    Parameters
    ----------
    data
        Presumably the DataFrame holding the observations — confirm.
    variable
        Presumably the name of the column to plot — confirm.
    """
    xxx


In [None]:
# List the column names available in `df_pred`.
df_pred.columns

In [None]:
# TODO(exercise): replace `xxx` with the call arguments; the function is
# declared with two parameters (data, variable).
my_boxplot_fct(xxx)

In [None]:
# TODO(exercise): replace `xxx` with the call arguments; the function is
# declared with two parameters (data, variable).
my_boxplot_fct(xxx)