In [486]:
import pandas as pd
import numpy as np
from scipy.linalg import diagsvd
import matplotlib.pyplot as plt
import os
os.chdir('/Users/sosuke/Downloads')
from pca import pca
from scipy.linalg import fractional_matrix_power
from sklearn.cross_decomposition import CCA
from scipy.spatial import distance_matrix

In [354]:
col_5 = ["Wind", "Radiation", "CO", "NO", "NO2", "O3", "HC"]
df5 = pd.read_table('p5.txt', delimiter = '  ', names = col_5)

  


In [355]:
df5 = df5.drop(columns=["CO", "NO", "HC"])

We define a function that finds the loading matrix and the specific variances given a covariance or correlation matrix of a data matrix and the number of factors m, using a PC solution.

In [356]:
def ff(A, m):
    '''
    Given a covariance or a correlation matrix and the number of factors m, 
    this will calculate the loading matrix and the specific variances.
    '''
    eigval, eigvec = np.linalg.eigh(A)
    eigval = np.flip(eigval)[:m]
    eigvec = np.flip(eigvec, axis=1)[:, :m]
    L = np.dot(eigvec, np.power(np.diag(eigval), 0.5))
    
    return L, np.diagonal(A-np.dot(L, np.transpose(L)))

The loading matrix and the specific variances when $m=1$ and $m=2$ are given below. (The first matrix is the loading matrix and the second vector is the specific variances.)

In [372]:
cov = df5.cov()
l_cov_1, var_cov_1 = ff(cov, 1)
l_cov_1, var_cov_1

(array([[  0.19096945],
        [-17.02940737],
        [ -0.36960253],
        [ -1.92290987]]),
 array([ 2.85377457,  0.3686227 , 11.22692475, 27.28093099]))

In [373]:
l_cov_2, var_cov_2 = ff(cov, 2)
l_cov_2, var_cov_2

(array([[  0.19096945,  -0.26926066],
        [-17.02940737,  -0.60686095],
        [ -0.36960253,   0.77045188],
        [ -1.92290987,   5.19956805]]),
 array([2.78127327e+00, 3.42491328e-04, 1.06333286e+01, 2.45423055e-01]))

The proportions of the variability when $m = 1$ and $m =2$ are given as

In [374]:
eigval = np.flip(np.linalg.eigvalsh(cov))
sum(eigval[:2])/sum(eigval), sum(eigval[:3])/sum(eigval)

(0.9592958841693189, 0.9917026265481877)

which gives us 95.93% and 99.17%

The loading matrix and the specific variances when $m=1$ and $m=2$ are given below. (The first matrix is the loading matrix and the second vector is the specific variances.)

In [375]:
cor = df5.corr()
l_cor_1, var_cor_1 = ff(cor, 1)
l_cor_1, var_cor_1

(array([[-0.480817  ],
        [ 0.68131137],
        [ 0.47120959],
        [ 0.75764328]]),
 array([0.76881501, 0.53581482, 0.77796152, 0.42597666]))

In [376]:
l_cor_2, var_cor_2 = ff(cor, 2)
l_cor_2, var_cor_2

(array([[-0.480817  ,  0.55417418],
        [ 0.68131137, -0.10645863],
        [ 0.47120959,  0.77727746],
        [ 0.75764328, -0.03599683]]),
 array([0.46170599, 0.52448138, 0.17380126, 0.42468088]))

The proportions of the variability when $m = 1$ and $m =2$ are given as

In [367]:
eigval = np.flip(np.linalg.eigvalsh(cov))
sum(eigval[:2])/sum(eigval), sum(eigval[:3])/sum(eigval)

(0.9592958841693189, 0.9917026265481877)

which gives us 95.93% and 99.17%, which is the same as in the covariance case

The angle between the two vectors - the first factor loading using covariance matrix and the first factor loading using correlation matrix - is given by finding the arccos of the dot product of the two vectors divided by the magnitudes of both vectors. Thus, the angle is

In [52]:
np.arccos(np.dot(l_cov_1[0], l_cor_1[0])/(np.linalg.norm(l_cov_1[0])*np.linalg.norm(l_cor_1[0])))

3.141592653589793

i.e. 180 degrees.