# t-SNE - Code Notebook Solution
**Author**: Dr. Yves Staudt

CAS: Machine Learning - Unsupervised Learning

## Loading Packages

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

from sklearn.manifold import TSNE

import plotly.express as px

## Loading Data

In [2]:
# NOTE: adjust the path below so that it points to your copy of the CSV file.
df = pd.read_csv(
    filepath_or_buffer='lego_dataset_encoded_prepared_selected_feature.csv'
)

## Scaling Data

In [3]:
# Create the min-max scaler and let it learn the per-column parameters from
# the complete data frame (this notebook makes no train/test split).
scaler = MinMaxScaler().fit(df)

# Rescale every column with the learned parameters; the result is a NumPy
# array, so the column names are lost at this point.
df_scaled = scaler.transform(df)

In [4]:
# The scaler returned a plain NumPy array; wrap it in a DataFrame again so the
# original column names are available for the rest of the notebook.
df_scaled = pd.DataFrame(data=df_scaled, columns=df.columns)

## k-Means Clustering with the Optimal k (k = 4)

In [5]:
# k-means with 4 clusters; the fixed random_state keeps the random
# initialisation, and therefore the resulting clustering, reproducible.
km = KMeans(
    n_clusters=4,
    init='random',
    n_init=10,
    max_iter=300,
    tol=1e-4,
    random_state=0,
)
# Kept as the last expression of the cell so the fitted estimator is displayed.
km.fit(df_scaled)

## Cluster Centers

In [6]:
# One row per cluster centre, one column per scaled feature.
cluster_centers = pd.DataFrame(
    data=km.cluster_centers_,
    columns=df_scaled.columns,
)
cluster_centers

Unnamed: 0,Empfohlenes Alter in Jahren (mind.),Empfohlenes Alter in Jahren (max.),Verpackungsbreite,Verpackungstiefe,Verpackungshöhe,Paketgewicht,kein_nachhaltigkeitszertifikat,Produktfarbe_Rare,Ursprungsland_ Ungarn,Ursprungsland_ China,...,LegoCategory_LEGO Architecture,LegoCategory_LEGO Classic,LegoCategory_LEGO Ideas,LegoCategory_LEGO Super Mario,LegoCategory_LEGO Friends,LegoCategory_LEGO Creator 3in1,LegoCategory_LEGO Minecraft,LegoCategory_LEGO Spider,LegoCategory_LEGO Jurassic World,LegoCategory_LEGO Dots
0,0.350687,0.968448,0.450695,0.467618,0.183426,0.086588,0.477273,0.06818182,-2.775558e-17,1.387779e-16,...,0.05681818,0.1022727,0.03409091,0.068182,0.045455,0.1022727,-1.0408340000000001e-17,-3.469447e-18,-1.734723e-18,0.022727
1,0.314499,0.985553,0.437644,0.449296,0.1557,0.062332,0.88642,0.007407407,4.718448e-16,-3.885781e-16,...,-5.2041700000000004e-18,-3.8163920000000003e-17,0.01975309,0.079012,0.009877,0.02716049,0.05185185,0.01234568,0.002469136,0.032099
2,0.351058,1.0,0.483364,0.548925,0.171586,0.093004,0.824121,6.938894e-18,0.9045226,1.94289e-16,...,0.0201005,0.05527638,3.469447e-18,0.01005,0.211055,6.938894e-18,0.01005025,0.01005025,0.005025126,0.070352
3,0.215292,1.0,0.378691,0.432321,0.148085,0.049638,0.853535,8.673617e-18,1.665335e-16,1.0,...,8.673617e-18,-3.122502e-17,0.01515152,0.035354,0.156566,0.005050505,0.005050505,0.01515152,0.05555556,0.010101


## t-SNE - Dimensionality Reduction

In [7]:
# Execute t-SNE; the random state is fixed so that you obtain the same
# embedding as shown in this notebook.
tsne = TSNE(random_state=42)
# TSNE has no separate transform method, so fitting and embedding have to
# happen in a single fit_transform call.
df_tsne = tsne.fit_transform(X=df_scaled)

Save the obtained data

In [8]:
# Store the embedding as a pandas DataFrame with one named column per
# t-SNE dimension.
df_tsne = pd.DataFrame(
    data=df_tsne,
    columns=['TSNE_DIM1', 'TSNE_DIM2'],
)
# Summary statistics of the two embedding dimensions.
df_tsne.describe()

Unnamed: 0,TSNE_DIM1,TSNE_DIM2
count,890.0,890.0
mean,-1.538481,-0.064183
std,20.826809,14.651512
min,-35.98481,-26.089354
25%,-20.223335,-11.928159
50%,-2.703539,-2.300425
75%,16.978871,11.809465
max,32.146832,27.876345


In [9]:
# Attach the k-means label of every row as a column of dtype 'category', so
# the labels are handled as discrete groups rather than as numbers.
df_tsne['cluster'] = pd.Series(km.labels_, index=df_tsne.index).astype('category')

## Visualisation

In [10]:
# Scatter plot of the two t-SNE dimensions, coloured by the k-means cluster
# assigned to each point.
fig = px.scatter(
    data_frame=df_tsne,
    x="TSNE_DIM1",
    y="TSNE_DIM2",
    color="cluster",
    title="Representation of the dimension reduction by t-SNE",
    labels={
        "TSNE_DIM1": "Dimension 1",
        "TSNE_DIM2": "Dimension 2",
        "cluster": "Number of associated cluster",
    },
)
fig.show()

  # NOTE(review): `required_grouper` is not defined anywhere in the visible
  # notebook, and this indented line has no enclosing block here; it looks like
  # a stray fragment. Confirm against the full file before relying on it.
  grouped = df.groupby(required_grouper, sort=False)  # skip one_group groupers
