# PCA - Code Notebook Solution
**Author**: Dr. Yves Staudt

CAS: Machine Learning - Unsupervised Learning

## Loading Packages

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

from sklearn import decomposition

import plotly.express as px

## Loading Data

In [2]:
# NOTE: the file is read relative to the notebook's working directory --
# adapt the path if the CSV is stored somewhere else.
df = pd.read_csv(
    filepath_or_buffer='lego_dataset_encoded_prepared_selected_feature.csv'
)

## Scaling Data

In [3]:
# Create the min-max scaler and let it learn each column's minimum and
# maximum. There is no train/test split in this notebook: the scaler is
# fitted on the complete dataset.
scaler = MinMaxScaler().fit(df)

# Apply the learned scaling to the same data.
df_scaled = scaler.transform(df)

In [4]:
# The scaler returned a NumPy array; wrap it in a DataFrame again, reusing
# the original column names, so the following cells can address features
# by name.
df_scaled = pd.DataFrame(data=df_scaled, columns=df.columns)

## k-Means Clustering for Optimal k

In [5]:
# k-means with k = 4 and randomly initialised centroids. The algorithm is
# run n_init times and random_state is fixed so that the clustering is
# reproducible between runs.
km = KMeans(
    n_clusters=4,
    init='random',
    n_init=10,
    max_iter=300,
    tol=1e-04,
    random_state=0,
)
km.fit(df_scaled)

## Cluster Centers

In [6]:
# One row per cluster, one column per scaled feature.
cluster_centers = pd.DataFrame(
    data=km.cluster_centers_,
    columns=df_scaled.columns,
)
cluster_centers

Unnamed: 0,Empfohlenes Alter in Jahren (mind.),Empfohlenes Alter in Jahren (max.),Verpackungsbreite,Verpackungstiefe,Verpackungshöhe,Paketgewicht,kein_nachhaltigkeitszertifikat,Produktfarbe_Rare,Ursprungsland_ Ungarn,Ursprungsland_ China,...,LegoCategory_LEGO Architecture,LegoCategory_LEGO Classic,LegoCategory_LEGO Ideas,LegoCategory_LEGO Super Mario,LegoCategory_LEGO Friends,LegoCategory_LEGO Creator 3in1,LegoCategory_LEGO Minecraft,LegoCategory_LEGO Spider,LegoCategory_LEGO Jurassic World,LegoCategory_LEGO Dots
0,0.350687,0.968448,0.450695,0.467618,0.183426,0.086588,0.477273,0.06818182,-2.775558e-17,1.387779e-16,...,0.05681818,0.1022727,0.03409091,0.068182,0.045455,0.1022727,-1.0408340000000001e-17,-3.469447e-18,-1.734723e-18,0.022727
1,0.314499,0.985553,0.437644,0.449296,0.1557,0.062332,0.88642,0.007407407,4.996004e-16,-3.885781e-16,...,-5.2041700000000004e-18,-4.1633360000000003e-17,0.01975309,0.079012,0.009877,0.02716049,0.05185185,0.01234568,0.002469136,0.032099
2,0.351058,1.0,0.483364,0.548925,0.171586,0.093004,0.824121,6.938894e-18,0.9045226,1.94289e-16,...,0.0201005,0.05527638,3.469447e-18,0.01005,0.211055,6.938894e-18,0.01005025,0.01005025,0.005025126,0.070352
3,0.215292,1.0,0.378691,0.432321,0.148085,0.049638,0.853535,8.673617e-18,1.665335e-16,1.0,...,8.673617e-18,-3.122502e-17,0.01515152,0.035354,0.156566,0.005050505,0.005050505,0.01515152,0.05555556,0.010101


## PCA - Dimension Reduction

In [7]:
# Reduce the scaled feature space to its first two principal components.
pca = decomposition.PCA(n_components=2)
# Fit the PCA model on the scaled data. fit() returns the fitted estimator
# itself, so survey_pca refers to the same object as pca.
survey_pca = pca.fit(df_scaled)

In [8]:
# Fraction of the total variance explained by each retained component.
pca.explained_variance_ratio_

array([0.19375395, 0.10919315])

Transform data

In [9]:
# Project the scaled observations onto the fitted principal directions.
df_pca = pca.transform(df_scaled)
# Check the shapes: the number of rows must be unchanged, only the number
# of columns is reduced.
print(f"Orignial shape: {df_scaled.shape}")
print(f"Reduced shape: {df_pca.shape}")

Orignial shape: (890, 39)
Reduced shape: (890, 2)


In [10]:
# Store the projected coordinates in a DataFrame with one named column per
# principal component, then print a structural summary.
# (df_pca.describe() would give summary statistics instead.)
df_pca = pd.DataFrame(data=df_pca, columns=['Pc1', 'Pc2'])
df_pca.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 890 entries, 0 to 889
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pc1     890 non-null    float64
 1   Pc2     890 non-null    float64
dtypes: float64(2)
memory usage: 14.0 KB


In [11]:
# Attach the k-means cluster label of every observation to its PCA
# coordinates (rows of df_pca and km.labels_ are in the same order because
# both come from df_scaled).
df_pca['cluster'] = km.labels_
# Convert 'cluster' to an ordered categorical variable. The category list is
# derived from the fitted model instead of being hard-coded: a label that is
# missing from `categories` would silently become NaN, which would happen as
# soon as n_clusters is changed in the k-means cell.
order = list(range(km.n_clusters))
df_pca['cluster'] = pd.Categorical(df_pca['cluster'], categories=order, ordered=True)


## Visualisation

In [15]:
# Scatter plot of the observations in the plane spanned by the first two
# principal components, coloured by their k-means cluster.
fig = px.scatter(
    data_frame=df_pca,
    x="Pc1",
    y="Pc2",
    color="cluster",
    labels={
        "Pc1": "Principal Component 1",
        "Pc2": "Principal Component 2",
        "cluster": "Number of associated cluster",
    },
    title="Representation of the two first principal components of PCA",
)
fig.show()