# What is PCA and SVD and how are they related

# Write your own function for PCA. Note that in the following the data is not normalized and is not adjusted for zero mean. We need to add that. Otherwise the values will not match with the library function.

In [166]:
def pca(X):
  # Add a functionality of adjusting X for zero mean for each column and normalize each column using variance.
  n, m = X.shape
  # Compute covariance matrix
  C = np.dot(X.T, X) / (n-1)
  # Eigen decomposition
  eigen_vals, eigen_vecs = np.linalg.eig(C)
  # Project X onto PC space
  X_pca = np.dot(X, eigen_vecs)
  return eigen_vecs,eigen_vals,X_pca

In [167]:
import numpy as np

In [189]:
A=np.array([[1,2,3],
   [4,3,7],
   [7,6,6]])

In [190]:
A.shape

(3, 3)

In [191]:
w1=A[:,0]
m1=np.mean(w1)
w2=A[:,1]
m2=np.mean(w2)
w3=A[:,2]
m3=np.mean(w3)
A[:,0]=w1
A[:,1]=w2
A[:,2]=w3
print("mean for column 1:",m1)
print("mean for column 2:",m2)
print("mean for column 3:",m3)

mean for column 1: 4.0
mean for column 2: 3.6666666666666665
mean for column 3: 5.333333333333333


# Adjust for mean and recreate A

In [192]:
w1=w1-m1
w2=w2-m2
w3=w3-m3
print(w1)
print(w2)
print(w3)

[-3.  0.  3.]
[-1.66666667 -0.66666667  2.33333333]
[-2.33333333  1.66666667  0.66666667]


In [172]:
A=np.array([w1, w2, w3])
print(A)

[[-7.         -4.         -1.        ]
 [-5.33333333 -4.33333333 -1.33333333]
 [-7.66666667 -3.66666667 -4.66666667]]


In [173]:
pca(A)

(array([[ 0.82225552,  0.56435218, -0.07350154],
        [ 0.48009863, -0.75718483, -0.44291808],
        [ 0.30561603, -0.32890385,  0.89354412]]),
 array([100.51060475,   0.42774277,   3.56165248]),
 array([[-7.98179922, -0.59282207,  1.39263896],
        [-6.87327824,  0.70979446,  1.11992772],
        [-9.49052881, -0.01547101, -1.98232783]]))

# Now let us use sklearn libeary PCA function for PCA

In [174]:
from sklearn.decomposition import PCA

In [193]:
pca = PCA(n_components=3)

In [194]:
A=np.array([[1,2,3],
   [4,3,7],
   [7,6,6]])

In [195]:
pca.fit(A)

PCA(n_components=3)

# Let us determine the 'importance' of each PCA component i.e. $\lambda_1 / (\lambda_1 + \lambda_2 + \lambda_3)$. This represents the amount of information can be represented by each PCA or the 'weight' of a PCA.

In [196]:
print(pca.explained_variance_ratio_)

[8.67682552e-01 1.32317448e-01 4.64293612e-33]


In [197]:
print(pca.singular_values_) # this represents the singular values sigmas

[5.53697723e+00 2.16222489e+00 4.05031369e-16]


In [198]:
print(pca.components_) # this represents the transformed data

[[-0.7639027  -0.49776505 -0.4107099 ]
 [-0.15306667 -0.47851243  0.8646366 ]
 [-0.62691567  0.72336423  0.28934569]]


In [186]:
pca = PCA(n_components=3, svd_solver='auto')
pca.fit(A)
PCA(n_components=3, svd_solver='auto')

PCA(n_components=3)

In [187]:
print(pca.explained_variance_ratio_)

[8.67682552e-01 1.32317448e-01 4.64293612e-33]


In [188]:
print(pca.singular_values_)

[5.53697723e+00 2.16222489e+00 4.05031369e-16]
