In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA

### Load Data

In [3]:
data = pd.read_csv("resources/b5.csv")
data

Unnamed: 0,E1,E2,E3,E4,E5,E6,E7,E8,E9,E10,...,O1,O2,O3,O4,O5,O6,O7,O8,O9,O10
0,4,2,5,2,5,1,4,3,5,1,...,4,1,3,1,5,1,4,2,5,5
1,2,2,3,3,3,3,1,5,1,5,...,3,3,3,3,2,3,3,1,3,2
2,5,1,1,4,5,1,1,5,5,1,...,4,5,5,1,5,1,5,5,5,5
3,2,5,2,4,3,4,3,4,4,5,...,4,3,5,2,4,2,5,2,5,5
4,3,1,3,3,3,1,3,1,3,5,...,3,1,1,1,3,1,3,1,5,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18925,1,4,3,5,4,3,1,2,1,5,...,1,3,5,3,4,1,4,2,5,3
18926,2,3,2,3,2,3,2,4,4,4,...,1,2,3,2,3,3,4,2,3,3
18927,2,5,4,5,5,5,1,2,1,5,...,5,3,1,3,4,1,1,5,5,5
18928,1,4,2,3,2,4,1,3,4,5,...,3,2,5,3,4,1,5,3,5,5


### Convert data to numpy array

In [5]:
X = data.values
X

array([[4, 2, 5, ..., 2, 5, 5],
       [2, 2, 3, ..., 1, 3, 2],
       [5, 1, 1, ..., 5, 5, 5],
       ...,
       [2, 5, 4, ..., 5, 5, 5],
       [1, 4, 2, ..., 3, 5, 5],
       [2, 3, 1, ..., 5, 5, 5]], dtype=int64)

### Scale the data

In [7]:
X = scale(X)
X

array([[ 1.112632  , -0.57855181,  1.28086553, ..., -0.96448433,
         0.88177594,  1.0099427 ],
       [-0.51209309, -0.57855181, -0.33817724, ..., -1.76030372,
        -1.15794039, -2.03840414],
       [ 1.92499454, -1.34069418, -1.95722   , ...,  1.42297383,
         0.88177594,  1.0099427 ],
       ...,
       [-0.51209309,  1.70787529,  0.47134415, ...,  1.42297383,
         0.88177594,  1.0099427 ],
       [-1.32445564,  0.94573292, -1.14769862, ..., -0.16866494,
         0.88177594,  1.0099427 ],
       [-0.51209309,  0.18359055, -1.95722   , ...,  1.42297383,
         0.88177594,  1.0099427 ]])

## Analysis

### PCA with 50 components

In [10]:
pca = PCA()
pca.fit(X)
print(pca.explained_variance_ratio_)

[0.16101911 0.09246263 0.07527279 0.07100381 0.05526279 0.0313044
 0.02660753 0.02099101 0.01937029 0.01849593 0.01794133 0.01735232
 0.01689316 0.01604282 0.01576013 0.01470117 0.01459846 0.01438242
 0.01362569 0.01322527 0.0131285  0.01268631 0.01223362 0.01180048
 0.01164071 0.01136677 0.01108761 0.0106515  0.01045028 0.0102118
 0.01003675 0.00983507 0.00974897 0.00963948 0.0089566  0.00894
 0.00868373 0.00840808 0.00831106 0.00808253 0.00793945 0.00760062
 0.00748126 0.00727417 0.00694749 0.00689884 0.00650197 0.00647554
 0.0062404  0.00442735]


### PCA with 5 components

In [14]:
pca5 = PCA(n_components=5)
pca5.fit(X)
print(pca5.explained_variance_ratio_)

[0.16101911 0.09246263 0.07527276 0.07100377 0.05526257]


### Contributuions of variables to components

In [16]:
print(pca5.components_)

[[-0.18391014  0.17969564 -0.25064867  0.2061368  -0.23167008  0.1976802
  -0.22070165  0.13297265 -0.1646324   0.20183617  0.15465885 -0.12978284
   0.12271811 -0.10236724  0.14227503  0.16681247  0.1627777   0.17394509
   0.17756882  0.19476315  0.09255979 -0.17766442  0.09908258 -0.11467331
   0.13341505 -0.05636248  0.18786018 -0.13286202 -0.11972951 -0.18783946
  -0.11091497  0.05023994 -0.07324244  0.15189875 -0.1087401   0.09267628
  -0.04455464  0.13905637 -0.0913073  -0.09522001 -0.06943505  0.09889309
  -0.03539256  0.07185537 -0.13054232  0.08698146 -0.11181571 -0.01241829
   0.00436534 -0.11687661]
 [-0.12862187  0.16741212 -0.07309202  0.09547424 -0.14847108  0.12807602
  -0.1365875   0.10835604 -0.11570091  0.07576808 -0.22728293  0.14366688
  -0.21599991  0.11922738 -0.18825587 -0.25221019 -0.24452754 -0.24841223
  -0.19628024 -0.18517723  0.08481653 -0.17630336 -0.05190624 -0.19236417
   0.17355537 -0.17870151  0.17028722 -0.14026611 -0.2073608  -0.09458076
   0.1241632