<font color=skyblue>A simple example of Principal Component Analysis</font>

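The data is not standardized: the covariance matrix is computed from the raw values, and the raw (uncentered) X is projected onto the eigenvectors.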

<font color=yellow>Use numpy</font>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# PCA "by hand" with numpy on a toy data set:
# 2 variables (rows) observed on 5 samples (columns).
X = np.array([[1, 2, 3, 4, 5], [2, 1, 4, 5, 4]])
print("X:\n{}".format(X))

# np.cov (default rowvar=True) treats each row as a variable and each column
# as an observation, which matches the layout of X.
Scov = np.cov(X)  # sample covariance matrix
print("Scov:\n{}".format(Scov))

# A covariance matrix is symmetric, so use eigh: it is the solver intended
# for symmetric/Hermitian input and always returns real eigenvalues
# (in ascending order).
eigenvalues, eigenvectors = np.linalg.eigh(Scov)
# Reorder so the largest eigenvalue (first principal component) comes first.
idx = eigenvalues.argsort()[::-1]
eigenvalues = eigenvalues[idx]
eigenvectors = eigenvectors[:, idx]  # eigenvectors are stored column-wise

print("eigenvalues:\n{}".format(eigenvalues))
# Verify the claim printed below: the variances on the diagonal of Scov and
# the eigenvalues have the same sum (trace of Scov).
assert np.isclose(np.trace(Scov), eigenvalues.sum())
print(f"(sigma_x1, sigma_x2) = ({Scov[0, 0]}, {Scov[1, 1]}), 與 eigenvalues: {eigenvalues} 相加相等")
print("eigenvectors:\n{}".format(eigenvectors))

# The sign of an eigenvector returned by the solver is arbitrary, so a
# hard-coded flip is fragile. Pick the orientation deterministically instead:
#   * v1: its largest-magnitude component is made positive;
#   * v2: chosen so that (v1, v2) is a right-handed basis (determinant > 0).
v1 = eigenvectors[:, 0]
if v1[np.argmax(np.abs(v1))] < 0:
    v1 = -v1
v2 = eigenvectors[:, 1]
if np.linalg.det(np.column_stack((v1, v2))) < 0:
    v2 = -v2
print("v1:\n{}".format(v1))
print("v2:\n{}".format(v2))

# Transform the data: each row of Z holds the coordinates of the samples
# along one principal axis. X is projected as-is (it is not centered here).
Z1 = np.dot(v1, X)  # project X onto v1 (matrix product)
Z2 = np.dot(v2, X)  # project X onto v2
Z = np.vstack((Z1, Z2))
print("Z:\n{}".format(Z))


X:
[[1 2 3 4 5]
 [2 1 4 5 4]]
Scov:
[[2.5 2. ]
 [2.  2.7]]
eigenvalues:
[4.60249844 0.59750156]
(sigma_x1, sigma_x2) = (2.5, 2.7), 與 eigenvalues: [4.60249844 0.59750156] 相加相等
eigenvectors:
[[-0.68922507 -0.72454731]
 [-0.72454731  0.68922507]]
v1:
[0.68922507 0.72454731]
v2:
[-0.72454731  0.68922507]
Z:
[[ 2.13831969  2.10299744  4.96586445  6.37963683  6.34431458]
 [ 0.65390282 -0.75986956  0.58325833  0.54793608 -0.8658363 ]]


<font color=yellow>Use sklearn PCA</font> 

注意：PCA.fit_transform 轉成新的座標軸前，會先將 X 置中

In [None]:
from sklearn.decomposition import PCA

# Fit a 2-component PCA with scikit-learn. The estimator expects one sample
# per row, so the (variables x samples) array X is transposed first.
pca = PCA(n_components=2)
Z = pca.fit_transform(X.T)
print("Z:\n{}".format(Z))
print("explained_variance:\n{}".format(pca.explained_variance_))
print("explained_variance_ratio:\n{}".format(pca.explained_variance_ratio_))
print("components:\n{}".format(pca.components_))

# Recompute the scores by hand (so the eigenvector directions can be controlled).
# Each row of components_ is one principal axis: row 0 is the first
# principal component, row 1 the second.
v1, v2 = pca.components_
print("v1:\n{}".format(v1))
print("v2:\n{}".format(v2))

# Center every variable (row) on its own mean, as fit_transform does before
# projecting.
# NOTE(review): this rebinds X to the centered data, so any later cell sees
# the centered values rather than the raw ones.
row_means = X.mean(axis=1, keepdims=True)
X = X - row_means

# Project the centered data onto each principal axis and stack the two score
# rows; the result is the transpose of the fit_transform output above.
Z1 = np.dot(v1, X)
Z2 = np.dot(v2, X)
Z = np.vstack([Z1, Z2])
print("Z:\n{}".format(Z))

Z:
[[-2.24790691 -0.62202455]
 [-2.28322915  0.79174783]
 [ 0.57963785 -0.55138005]
 [ 1.99341023 -0.51605781]
 [ 1.95808798  0.89771457]]
explained_variance:
[4.60249844 0.59750156]
explained_variance_ratio:
[0.88509585 0.11490415]
components:
[[ 0.68922507  0.72454731]
 [ 0.72454731 -0.68922507]]
v1:
[0.68922507 0.72454731]
v2:
[ 0.72454731 -0.68922507]
Z:
[[-2.24790691 -2.28322915  0.57963785  1.99341023  1.95808798]
 [-0.62202455  0.79174783 -0.55138005 -0.51605781  0.89771457]]


<font color=yellow>Use standardized data and Draw Scatter Plot for X and Z</font>