# ISLP - Chapter 12 - Exercise 8
### Author: pzuehlke

In [2]:
import numpy as np
import pandas as pd
from statsmodels.datasets import get_rdataset
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [3]:
# Fetch USArrests through statsmodels and keep only its DataFrame payload.
arrests = get_rdataset("USArrests").data
# n = number of rows (observations), p = number of columns (features).
n, p = len(arrests.index), len(arrests.columns)
arrests.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50 entries, Alabama to Wyoming
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Murder    50 non-null     float64
 1   Assault   50 non-null     int64  
 2   UrbanPop  50 non-null     int64  
 3   Rape      50 non-null     float64
dtypes: float64(2), int64(2)
memory usage: 2.0+ KB


In [4]:
# Peek at the first rows to check the column names and value scales.
arrests.head()

Unnamed: 0_level_0,Murder,Assault,UrbanPop,Rape
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alabama,13.2,236,58,21.2
Alaska,10.0,263,48,44.5
Arizona,8.1,294,80,31.0
Arkansas,8.8,190,50,19.5
California,9.0,276,91,40.6


In [5]:
# Standardize the columns before PCA: learn the per-column statistics first,
# then apply the fitted scaler to the very same data.
scaler = StandardScaler().fit(arrests)
arrests_scaled = scaler.transform(arrests)

__8 (a):__ We only need to perform PCA on the data and then compute the PVE
using `explained_variance_ratio_` as suggested in the statement:

In [6]:
# Keep all p components so that the individual PVEs add up to the total.
pca = PCA(n_components=p)
# fit_transform fits the model and returns the scores of the training data;
# row i of pca_results holds the scores of the i-th state.
pca_results = pca.fit_transform(arrests_scaled)

In [7]:
# Method (a): read the proportion of variance explained off the fitted model
# and list it component by component (1-based labels).
pve_method_a = pca.explained_variance_ratio_
for k, ratio in enumerate(pve_method_a, start=1):
    print(f"PC{k}: {ratio:.4f}")
print(f"Total PVE: {sum(pve_method_a):.4f}")

PC1: 0.6201
PC2: 0.2474
PC3: 0.0891
PC4: 0.0434
Total PVE: 1.0000


__8 (b):__ We begin by computing the loading vectors of each principal component:

In [8]:
# scikit-learn stores one principal component per row: row j of components_
# is the loading vector of PC(j + 1), with one entry per original feature.
loadings = pca.components_
print(loadings)

[[ 0.53589947  0.58318363  0.27819087  0.54343209]
 [-0.41818087 -0.1879856   0.87280619  0.16731864]
 [-0.34123273 -0.26814843 -0.37801579  0.81777791]
 [-0.6492278   0.74340748 -0.13387773 -0.08902432]]


Note that `pca_results` is a $ 50 \times 4 $ array whose $ (i, j) $-th element
is the principal component score of the $ i $-th state on the $ j $-th
component, or, in the notation of this chapter:
$$
    z_{ij} = \sum_{k=1}^{p}\phi_{kj}\,x_{ik}\,.
$$

In [9]:
# Sanity check: one row of scores per observation, one column per component.
print(pca_results.shape)

(50, 4)


Because the scaled data has column means equal to zero, so do the score vectors.
The variance explained by the $ j $-th component is thus the variance of the
$ j $-th column of `pca_results` (for $ j = 1, \dots, 4 $):
$$
    \text{variance explained by $ j $-th component}
    = \frac{1}{n} \sum_{i=1}^n z_{ij}^2
    = \frac{1}{n}\sum_{i=1}^{n}\left(\sum_{k=1}^{p}\phi_{kj}\,x_{ik}\right)^2\,.
$$
Equivalently, this is expression (12.9) on p. 511, with $ m $ replaced by $ j $.
Let's compute these variances:

In [10]:
# Column-wise variance of the scores. NumPy's default ddof=0 gives the 1/n
# normalization used in the formula above.
pcs_variance = pca_results.var(axis=0)
print(pcs_variance)

[2.48024158 0.98976515 0.35656318 0.17343009]


Now we compute the total variance of the scaled data, namely
$$
    \text{total variance} = \sum_{j=1}^p\frac{1}{n}\sum_{i=1}^{n}x_{ij}^2\,.
$$

In [None]:
# Total variance = sum over features of each column's (1/n) variance.
total_variance = np.var(arrests_scaled, axis=0).sum()

Now to obtain the proportion of variance explained by the $ j $-th component, $ \text{PVE}_j $,
we just need to divide these two:
$$
    \text{PVE}_j = \frac{\text{variance explained by the $ j $-th component}}{\text{total variance}}\,.
$$

In [13]:
# Method (b): each component's variance as a share of the total variance,
# listed component by component (1-based labels).
pve_method_b = np.divide(pcs_variance, total_variance)
for k, ratio in enumerate(pve_method_b, start=1):
    print(f"PC{k}: {ratio:.4f}")
print(f"Sum of PVE: {sum(pve_method_b):.4f}")

PC1: 0.6201
PC2: 0.2474
PC3: 0.0891
PC4: 0.0434
Sum of PVE: 1.0000


Finally, we compare the answers we have arrived at:

In [14]:
# Show the absolute gap between the two PVE estimates for every component.
for j in range(p):
    print(f"PC{j + 1}: {abs(pve_method_a[j] - pve_method_b[j]):.8f}")

# The printout is rounded to 8 decimals, so back the conclusion with an
# explicit numerical check: this raises AssertionError on a fresh
# Restart & Run All if the two methods ever disagree beyond round-off.
np.testing.assert_allclose(pve_method_a, pve_method_b)

PC1: 0.00000000
PC2: 0.00000000
PC3: 0.00000000
PC4: 0.00000000


Conclusion: the two methods yield the same PVEs, as expected.