# k-Means Clustering - Code Notebook Solution
**Author**: Dr. Yves Staudt

CAS: Machine Learning - Unsupervised Learning

## Loading Packages

In [11]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

import plotly.express as px

## Loading Data

In [2]:
# NOTE: adapt this path to wherever the prepared data set is stored locally.
DATA_PATH = 'lego_dataset_encoded_prepared_selected_feature.csv'
df = pd.read_csv(DATA_PATH)

## Scaling Data

In [3]:
# Create the min-max scaler and let it learn its parameters from the complete
# data set (this notebook does not split the data into train and test sets).
scaler = MinMaxScaler().fit(df)

# Apply the learned scaling to every column; the result is a NumPy array.
df_scaled = scaler.transform(df)

In [4]:
# The scaler returned a plain NumPy array. Wrap it in a DataFrame again and
# restore the original column names so later cells can work with labelled
# columns.
feature_names = df.columns
df_scaled = pd.DataFrame(data=df_scaled, columns=feature_names)

## K-Means

In [5]:
# Hyper-parameters gathered in one place so they are easy to inspect and tune.
kmeans_params = {
    'n_clusters': 3,      # number of clusters to form
    'init': 'random',     # random initial centroids
    'n_init': 10,         # number of differently seeded runs
    'max_iter': 300,      # iteration cap for a single run
    'tol': 1e-04,         # convergence tolerance
    'random_state': 0,    # fixed seed so the result is reproducible
}
km = KMeans(**kmeans_params)

# Fit on the scaled features; left as the last expression so the notebook
# still displays the fitted estimator.
km.fit(df_scaled)

## Cluster Centers

In [6]:
# One row per cluster, one column per (scaled) feature: the centroid
# coordinates found by the fitted k-means model.
cluster_centers = pd.DataFrame(
    data=km.cluster_centers_,
    columns=df_scaled.columns,
)
cluster_centers

Unnamed: 0,Empfohlenes Alter in Jahren (mind.),Empfohlenes Alter in Jahren (max.),Verpackungsbreite,Verpackungstiefe,Verpackungshöhe,Paketgewicht,kein_nachhaltigkeitszertifikat,Produktfarbe_Rare,Ursprungsland_ Ungarn,Ursprungsland_ China,...,LegoCategory_LEGO Architecture,LegoCategory_LEGO Classic,LegoCategory_LEGO Ideas,LegoCategory_LEGO Super Mario,LegoCategory_LEGO Friends,LegoCategory_LEGO Creator 3in1,LegoCategory_LEGO Minecraft,LegoCategory_LEGO Spider,LegoCategory_LEGO Jurassic World,LegoCategory_LEGO Dots
0,0.215292,1.0,0.378691,0.432321,0.148085,0.049638,0.853535,8.673617e-18,1.665335e-16,1.0,...,8.673617e-18,-3.122502e-17,0.01515152,0.035354,0.156566,0.005050505,0.005051,0.015152,0.055556,0.010101
1,0.347816,0.995485,0.485872,0.545018,0.167986,0.091136,0.82439,8.673617e-18,0.8780488,1.665335e-16,...,0.0195122,0.07317073,3.469447e-18,0.009756,0.209756,3.469447e-18,0.009756,0.009756,0.004878,0.068293
2,0.321952,0.984185,0.438383,0.453024,0.16203,0.067123,0.813142,0.01848049,3.885781e-16,-5.828671e-16,...,0.01026694,0.01026694,0.02258727,0.078029,0.014374,0.04106776,0.043121,0.010267,0.002053,0.030801


## Interpreting Results

In [7]:
# Attach the predicted cluster label to the *unscaled* data for interpretation.
# `assign` returns a new DataFrame, so `df` itself is not modified. A plain
# `df_pred = df` would only alias the frame, the label column would leak into
# `df`, and re-running the scaling / clustering cells would then treat the
# label as a feature.
df_pred = df.assign(cluster_predict=km.predict(df_scaled))

In [8]:
# Number of rows assigned to each cluster, ordered by cluster label.
df_pred['cluster_predict'].value_counts().sort_index()

cluster_predict
0    198
1    205
2    487
Name: count, dtype: int64

In [15]:
# Summary statistics of every column, computed separately for each cluster.
rows_by_cluster = df_pred.groupby('cluster_predict')
rows_by_cluster.describe()

Unnamed: 0_level_0,Empfohlenes Alter in Jahren (mind.),Empfohlenes Alter in Jahren (mind.),Empfohlenes Alter in Jahren (mind.),Empfohlenes Alter in Jahren (mind.),Empfohlenes Alter in Jahren (mind.),Empfohlenes Alter in Jahren (mind.),Empfohlenes Alter in Jahren (mind.),Empfohlenes Alter in Jahren (mind.),Empfohlenes Alter in Jahren (max.),Empfohlenes Alter in Jahren (max.),...,LegoCategory_LEGO Jurassic World,LegoCategory_LEGO Jurassic World,LegoCategory_LEGO Dots,LegoCategory_LEGO Dots,LegoCategory_LEGO Dots,LegoCategory_LEGO Dots,LegoCategory_LEGO Dots,LegoCategory_LEGO Dots,LegoCategory_LEGO Dots,LegoCategory_LEGO Dots
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
cluster_predict,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,198.0,6.128788,3.20886,1.5,4.0,6.0,7.0,18.0,198.0,99.0,...,0.0,1.0,198.0,0.010101,0.100248,0.0,0.0,0.0,0.0,1.0
1,205.0,8.978049,5.484895,1.5,6.0,7.0,12.0,18.0,205.0,98.57561,...,0.0,1.0,205.0,0.068293,0.252865,0.0,0.0,0.0,0.0,1.0
2,487.0,8.421971,3.954743,1.5,7.0,7.0,8.0,23.0,487.0,97.513347,...,0.0,1.0,487.0,0.030801,0.172955,0.0,0.0,0.0,0.0,1.0


## Figures

In [12]:
def my_boxplot_fct(data,variable):
    """Show a box plot of one column, split by predicted cluster.

    Parameters
    ----------
    data
        Table passed to ``px.box``; it must contain a ``cluster_predict``
        column, which is used for the x-axis, as well as the column named by
        ``variable``.
    variable
        Name of the column drawn on the y-axis.

    Returns
    -------
    None
        The figure is displayed via ``show()`` and not returned.
    """
    px.box(data, x="cluster_predict", y=variable).show()


In [10]:
# List the available column names; any of them can be passed as `variable`
# to my_boxplot_fct.
df_pred.columns

Index(['Empfohlenes Alter in Jahren (mind.)',
       'Empfohlenes Alter in Jahren (max.)', 'Verpackungsbreite',
       'Verpackungstiefe', 'Verpackungshöhe', 'Paketgewicht',
       'kein_nachhaltigkeitszertifikat', 'Produktfarbe_Rare',
       'Ursprungsland_ Ungarn ', 'Ursprungsland_ China ',
       'Ursprungsland_ Tschechische Republik ', 'Ursprungsland_Rare',
       'Ursprungsland_ Dänemark ',
       'EU TSD Warnung_ Keine Warnung zutreffend, Nicht für Kinder unter 36 Monaten geeignet ',
       'EU TSD Warnung_ Nicht für Kinder unter 36 Monaten geeignet ',
       'EU TSD Warnung_ Nicht für Kinder unter 18 Monaten geeignet ',
       'Verpackungsart_ Box ', 'Verpackungsart_ Polybag ', 'LegoCategory_Rare',
       'LegoCategory_LEGO Disney ', 'LegoCategory_LEGO Technic ',
       'LegoCategory_LEGO City ', 'LegoCategory_LEGO Duplo ',
       'LegoCategory_LEGO Star Wars ', 'LegoCategory_LEGO Ninjago ',
       'LegoCategory_LEGO Icons ', 'LegoCategory_LEGO Marvel ',
       'LegoCategory_LEG

In [13]:
# Distribution of the minimum recommended age (in years) per cluster.
my_boxplot_fct(df_pred, "Empfohlenes Alter in Jahren (mind.)")

In [14]:
# Distribution of the packaging depth ("Verpackungstiefe") per cluster.
my_boxplot_fct(df_pred, "Verpackungstiefe")