In [1]:
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn import preprocessing

# Read the sample table, using the 'Sample' column as the row index,
# then preview the first ten rows.
X_df = pd.read_csv(filepath_or_buffer='objects.csv', index_col='Sample')
X_df.head(n=10)

Unnamed: 0_level_0,Size,Weight,Intensity,Value
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,25,249,43,80
2,32,320,82,81
3,10,102,61,79
4,64,650,69,80
5,88,873,73,82
6,12,121,48,78
7,66,651,42,79
8,37,380,27,80
9,54,549,92,81
10,77,764,55,79


In [2]:
# Baseline: run PCA directly on the table as loaded (no rescaling applied).
n_comp = 3
pca = PCA(n_components=n_comp)
pca.fit(X_df)
X_pca = pca.transform(X_df)
# Fraction of total variance attributed to each retained component.
pca.explained_variance_ratio_.round(3)

array([0.995, 0.005, 0.   ])

In [3]:
# Loadings table: one row per principal component (PC1..PCn),
# one column per original feature.
load_df = pd.DataFrame(
    data=pca.components_,
    index=[f'PC{k + 1}' for k in range(n_comp)],
    columns=X_df.columns,
)
load_df

Unnamed: 0,Size,Weight,Intensity,Value
PC1,0.100301,0.994791,0.018074,0.002005
PC2,-0.009425,-0.017267,0.999364,0.029738
PC3,-0.25768,0.024594,-0.030733,0.965428


### Min-Max Scaling

In [4]:
# Rescale each feature column with a default-configured MinMaxScaler;
# the scaler is fitted on X_df and then applied to the same data.
min_max_scaler = preprocessing.MinMaxScaler()
X_df_minmax = min_max_scaler.fit(X_df).transform(X_df)
X_df_minmax

array([[0.19230769, 0.19066148, 0.24615385, 0.5       ],
       [0.28205128, 0.28274968, 0.84615385, 0.75      ],
       [0.        , 0.        , 0.52307692, 0.25      ],
       [0.69230769, 0.71076524, 0.64615385, 0.5       ],
       [1.        , 1.        , 0.70769231, 1.        ],
       [0.02564103, 0.02464332, 0.32307692, 0.        ],
       [0.71794872, 0.71206226, 0.23076923, 0.25      ],
       [0.34615385, 0.36057069, 0.        , 0.5       ],
       [0.56410256, 0.57976654, 1.        , 0.75      ],
       [0.85897436, 0.85862516, 0.43076923, 0.25      ]])

In [5]:
# Repeat the PCA, this time on the min-max scaled features.
n_comp = 3
pca = PCA(n_components=n_comp)
pca.fit(X_df_minmax)
X_pca = pca.transform(X_df_minmax)
# Fraction of total variance attributed to each retained component.
pca.explained_variance_ratio_.round(3)

array([0.663, 0.254, 0.083])

Two (2) principal components are required to achieve at least 90% variance coverage: PC1 and PC2, which together cover ~92% (0.663 + 0.254).

In [6]:
# Loadings for the PCA fitted on min-max scaled data:
# rows are components (PC1..PCn), columns are the original feature names.
load_df = pd.DataFrame(
    data=pca.components_,
    index=[f'PC{k + 1}' for k in range(n_comp)],
    columns=X_df.columns,
)
load_df

Unnamed: 0,Size,Weight,Intensity,Value
PC1,0.622737,0.626086,0.277921,0.378119
PC2,-0.319363,-0.313407,0.741583,0.499838
PC3,-0.102031,-0.09807,-0.610583,0.779205


The original features that contribute most to the variance are Weight in PC1 (with Size nearly equal: 0.626 vs. 0.623) and Intensity in PC2.

### Standard Scaling

In [7]:
# Standardize each feature column with a default-configured StandardScaler;
# the scaler is fitted on X_df and then applied to the same data.
std_scaler = preprocessing.StandardScaler()
std_scaler.fit(X_df)
X_df_std = std_scaler.transform(X_df)
X_df_std

array([[-0.83182884, -0.84637182, -0.85103004,  0.08804509],
       [-0.56100085, -0.56932065,  1.19774598,  0.968496  ],
       [-1.41217455, -1.41998482,  0.09455889, -0.79240582],
       [ 0.67706999,  0.71838199,  0.51482064,  0.08804509],
       [ 1.60562312,  1.5885568 ,  0.72495151,  1.8489469 ],
       [-1.33479512, -1.34584436, -0.58836645, -1.67285672],
       [ 0.75444942,  0.72228412, -0.90356276, -0.79240582],
       [-0.36755228, -0.3351929 , -1.69155353,  0.08804509],
       [ 0.29017285,  0.32426694,  1.72307316,  0.968496  ],
       [ 1.18003627,  1.16322472, -0.22063742, -0.79240582]])

In [8]:
# Repeat the PCA, this time on the standardized features.
n_comp = 3
pca = PCA(n_components=n_comp)
pca.fit(X_df_std)
X_pca = pca.transform(X_df_std)
# Fraction of total variance attributed to each retained component.
pca.explained_variance_ratio_.round(3)

array([0.632, 0.272, 0.096])

In [9]:
# Loadings for the PCA fitted on standardized data:
# rows are components (PC1..PCn), columns are the original feature names.
load_df = pd.DataFrame(
    data=pca.components_,
    index=[f'PC{k + 1}' for k in range(n_comp)],
    columns=X_df.columns,
)
load_df

Unnamed: 0,Size,Weight,Intensity,Value
PC1,0.568171,0.570098,0.358882,0.472625
PC2,-0.405053,-0.399104,0.694283,0.441158
PC3,-0.121866,-0.11828,-0.623837,0.76288


### Observations

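1. After scaling, two (2) principal components are required to achieve at least 90% variance coverage: PC1 and PC2 together cover ~92% with min-max scaling and ~90% with standard scaling. (Without scaling, PC1 alone covers ~99.5%.) The original features that contribute most to the variance are Weight (closely followed by Size) in PC1 and Intensity in PC2.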
2. The Size and Weight features are similar/proportional, but PCA "alone" (without scaling) shows Weight contributing the most variance, because Weight has by far the largest raw numeric range. After scaling (min-max and standard), Size and Weight show similar contributions.
3. The Intensity and Value features contribute the least variance in the PCA-only (unscaled) run, but after normalization they show a significant contribution to the variance.