# Principal Component Analysis from Scratch

In [1]:
import numpy as np

In [2]:
# Toy data set: each inner list holds one feature, and the transpose turns
# the array into 3 rows (samples) by 2 columns (features).
X = np.transpose(np.array([
    [1, 0, -1],
    [-1, 1, 0],
]))

In [3]:
#X = np.array([
#    [90,90,60,60,30],
#    [60,90,60,60,30],
#    [90,30,60,90,30]
#]).T

# Step_1: Normalize the Data (center each column on its mean, then scale)

In [4]:
# Column-wise mean (one value per feature), then display it.
X_Mean = X.mean(axis=0)
X_Mean

array([0., 0.])

In [5]:
# Subtract the column means and divide by (number of rows - 1).
# NOTE(review): X is rebound to the scaled result below, so running this
# cell a second time scales the data again; run it once per fresh kernel.
S = np.subtract(X, X_Mean) / (X.shape[0] - 1)
X = S
X

array([[ 0.5, -0.5],
       [ 0. ,  0.5],
       [-0.5,  0. ]])

In [6]:
# Number of rows (samples) in X.
N = X.shape[0]
N

3

# Step_2: Find or Calculate the Covariance Matrix

### It is important to build a square matrix: eigendecomposition (factoring a matrix into its eigenvalues and eigenvectors) is only defined for square matrices.

In [7]:
# Covariance with every ROW of X treated as a variable, which yields a
# 3 x 3 matrix for the 3-row X. X_Cov is reassigned in the following cell.
X_Cov = np.cov(X, rowvar=True)
X_Cov

array([[ 0.5  , -0.25 , -0.25 ],
       [-0.25 ,  0.125,  0.125],
       [-0.25 ,  0.125,  0.125]])

In [8]:
# Covariance with every COLUMN of X treated as a variable, which yields the
# 2 x 2 feature-by-feature matrix.
X_Cov = np.cov(X, rowvar=False)
X_Cov

array([[ 0.25 , -0.125],
       [-0.125,  0.25 ]])

In [9]:
# Hand-rolled 2 x 2 matrix X^T X. A true matrix product is needed here:
# the element-wise form X.T * X cannot broadcast shapes (2, 3) and (3, 2).
C_My = X.T @ X
C_My

array([[ 0.5 , -0.25],
       [-0.25,  0.5 ]])

# Step_3: Find the Eigenvalues and Eigenvectors of the Covariance Matrix C

### $C\,v = \lambda\,v \;\Rightarrow\; (C - \lambda I)\,v = 0$

In [10]:
from numpy import linalg as la

### Calculate the eigenvalues

In [11]:
# eig returns (eigenvalues, eigenvectors); they are unpacked into u and v,
# where the eigenvectors are the columns of v.
# NOTE(review): C_My = X^T X is symmetric, so np.linalg.eigh would also apply.
u, v = np.linalg.eig(C_My)
u

array([0.75, 0.25])

### Eigenvectors v (one per column), each normalized to unit length

In [12]:
# Display the eigenvector matrix; column v[:, i] pairs with eigenvalue u[i].
v

array([[ 0.70710678,  0.70710678],
       [-0.70710678,  0.70710678]])

# Step_4: Sort the eigenvalues from largest to smallest (top to bottom)

In [13]:
# Indices that put the eigenvalues in descending order; the same permutation
# is applied to the eigenvector columns so each pair stays aligned.
idx = np.argsort(u)[::-1]
u, v = u[idx], v[:, idx]

In [14]:
# Eigenvalues after sorting (largest first).
u

array([0.75, 0.25])

In [15]:
# Eigenvector columns after applying the same ordering as u.
v

array([[ 0.70710678,  0.70710678],
       [-0.70710678,  0.70710678]])

In [16]:
# First column of v (the eigenvector of the largest eigenvalue), copied out
# as a 1 x 2 row vector.
PC1 = np.array(v[:, 0], ndmin=2)
PC1

array([[ 0.70710678, -0.70710678]])

### The eigenvector above is the top principal component, i.e. the one associated with the largest eigenvalue

# Step_5: Project X onto the top principal component PC1 to obtain the dimensionally reduced X_d

In [17]:
#X_d = PC1*X.T
#X_d

In [18]:
# Project the data onto PC1: (1 x 2) @ (2 x n_samples) -> (1 x n_samples).
# X was already mean-centred when it was rebound in the normalisation step,
# and X_Mean is the mean of the *raw* data, so subtracting X_Mean again here
# would shift the data a second time whenever that mean is non-zero.
X_d = np.matmul(PC1, X.T)
X_d

array([[ 0.70710678, -0.35355339, -0.35355339]])

# Step_6: Compare the dimensions of X and X_d

In [19]:
# Shape of the reduced data with one row per sample.
X_d.T.shape

(3, 1)

In [20]:
# Shape of the data before reduction, for comparison.
X.shape

(3, 2)

### Calculate the mean squared error between the original and the reconstructed data

In [21]:
#mse = np.sum((X-X_d)**2)

# Apply Inverse PCA and then compare

In [22]:
# Map the 1-D scores back into the 2-D feature space:
# (2 x 1) @ (1 x n_samples) -> (2 x n_samples), then transpose so that each
# row is one reconstructed sample.
X_IPCA = np.transpose(PC1.T @ X_d)
X_IPCA

array([[ 0.5 , -0.5 ],
       [-0.25,  0.25],
       [-0.25,  0.25]])

In [23]:
# Mean squared reconstruction error: the squared residuals averaged over
# every entry (samples x features). The previous form divided the summed
# squared error by the number of features only, which is not a mean.
mse = np.mean((X - X_IPCA) ** 2)
mse

0.041666666666666664

In [24]:
# Show the centred/scaled data again, next to the reconstruction above.
X

array([[ 0.5, -0.5],
       [ 0. ,  0.5],
       [-0.5,  0. ]])

# Apply PCA using sklearn API

In [25]:
from sklearn.decomposition import PCA

In [26]:
# Request both components (X has two feature columns).
pca = PCA(n_components=2)

In [27]:
# Fit on the same centred/scaled X that the manual steps used.
pca.fit(X)

PCA(n_components=2)

In [28]:
# Share of the total variance captured by each component; compare with the
# sorted eigenvalues u from the manual computation.
print(pca.explained_variance_ratio_)

[0.75 0.25]


In [29]:
# Singular values reported by sklearn; in this run they equal the square
# roots of the manual eigenvalues u (sqrt(0.75), sqrt(0.25)).
print(pca.singular_values_)

[0.8660254 0.5      ]


In [30]:
# Project X onto sklearn's components; the first column can be compared
# with the manual projection X_d.
X_D = pca.transform(X)
X_D

array([[ 7.07106781e-01, -1.66533454e-16],
       [-3.53553391e-01,  3.53553391e-01],
       [-3.53553391e-01, -3.53553391e-01]])

In [31]:
# Principal axes found by sklearn, one per row; compare row 0 with PC1.
pca.components_

array([[ 0.70710678, -0.70710678],
       [ 0.70710678,  0.70710678]])

# Task: Apply PCA to 

In [32]:
# Exercise data: each inner list is one feature, and the transpose gives
# 5 rows (samples) by 3 columns (features).
X_Task = np.transpose(np.array([
    [90, 90, 60, 60, 30],
    [60, 90, 60, 60, 30],
    [90, 30, 60, 90, 30],
]))

# And reduce the dimensionality to 2. Reconstruct the data and calculate the MSE. Report your findings and insights. Use sklearn to verify your output.