## Preprocessing

In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# NOTE(review): absolute path on the author's machine — this cell fails anywhere else;
# point it at a local copy of USArrests.csv before running.
url = '/Users/arpanganguli/Documents/Professional/Analysis/ISLR/Datasets/USArrests.csv'
# The first CSV column has no header (pandas names it 'Unnamed: 0') and holds the
# state names; use it as the row index.
USArrests = pd.read_csv(url, index_col='Unnamed: 0')
USArrests.head()

Unnamed: 0,Murder,Assault,UrbanPop,Rape
Alabama,13.2,236,58,21.2
Alaska,10.0,263,48,44.5
Arizona,8.1,294,80,31.0
Arkansas,8.8,190,50,19.5
California,9.0,276,91,40.6


***

## 8.a. Calculating the proportion of variance explained (PVE) from the variances of the PCA scores (equivalent to `pca.explained_variance_ratio_`)

In [7]:
import warnings
# Silences every warning category for the rest of the session, deprecation notices included.
warnings.filterwarnings('ignore')

from sklearn.preprocessing import scale
from sklearn.decomposition import PCA

In [8]:
# Standardise each column (zero mean, unit population variance) and rebuild a
# labelled frame that keeps the original state index and column names.
df = pd.DataFrame(
    scale(USArrests),
    index=USArrests.index,
    columns=USArrests.columns,
)
df.head()

Unnamed: 0,Murder,Assault,UrbanPop,Rape
Alabama,1.255179,0.790787,-0.526195,-0.003451
Alaska,0.513019,1.11806,-1.224067,2.509424
Arizona,0.072361,1.493817,1.009122,1.053466
Arkansas,0.234708,0.233212,-1.084492,-0.186794
California,0.281093,1.275635,1.776781,2.088814


In [9]:
# Sanity check of the standardisation: column means are 0. The reported std is 1.0102
# rather than 1 because describe() uses the sample std (ddof=1) while scale()
# normalised with the population std; sqrt(50/49) ~= 1.0102.
df.describe().round(4)

Unnamed: 0,Murder,Assault,UrbanPop,Rape
count,50.0,50.0,50.0,50.0
mean,-0.0,0.0,-0.0,0.0
std,1.0102,1.0102,1.0102,1.0102
min,-1.6207,-1.5244,-2.3407,-1.5025
25%,-0.8611,-0.7486,-0.7705,-0.664
50%,-0.1248,-0.1425,0.0321,-0.1221
75%,0.8029,0.9484,0.8521,0.5331
max,2.2293,2.015,1.7768,2.6712


In [66]:
# Fit a PCA that keeps every component: one per standardised feature column.
# The component count and the PC labels are derived from the data instead of being
# hard-coded, so the cell stays correct if the set of columns changes.
n_components = df.shape[1]
pca = PCA(n_components=n_components)
pca_data = pca.fit_transform(df)

# Scores table: one row per observation (same order as df, default integer index),
# one column per principal component.
pc_labels = ['PC{}'.format(k) for k in range(1, n_components + 1)]
principaldf = pd.DataFrame(data=pca_data, columns=pc_labels)
principaldf.head()

Unnamed: 0,PC1,PC2,PC3,PC4
0,0.985566,1.133392,-0.444269,0.156267
1,1.950138,1.073213,2.040003,-0.438583
2,1.763164,-0.745957,0.054781,-0.834653
3,-0.14142,1.119797,0.114574,-0.182811
4,2.52398,-1.542934,0.598557,-0.341996


In [67]:
# Confirm the score table's row count and dtypes, and that no component has missing values.
principaldf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
PC1    50 non-null float64
PC2    50 non-null float64
PC3    50 non-null float64
PC4    50 non-null float64
dtypes: float64(4)
memory usage: 1.6 KB


In [132]:
# Sample variance (ddof=1) of each principal-component score column.
PVAR = principaldf.var(axis=0, ddof=1)
PVAR

PC1    2.530859
PC2    1.009964
PC3    0.363840
PC4    0.176969
dtype: float64

In [134]:
# Total variance across all components; this is the denominator of the PVE ratio.
PSUM = PVAR.sum()
PSUM

4.081632653061227

In [139]:
# PVE per component, read directly from the fitted PCA object. The attribute is
# ordered like the components, so it lines up with the PC columns of principaldf.
PVE_method = pd.DataFrame(
    pca.explained_variance_ratio_,
    index=principaldf.columns,
    columns=['explained variance ratio'],
)

# Cross-check against the manual computation from the previous cells
# (variance of each score column divided by the total variance).
np.testing.assert_allclose(
    PVE_method['explained variance ratio'].values,
    (PVAR / PSUM).values,
)
PVE_method

Unnamed: 0,explained variance ratio
PC1,0.62006
PC2,0.247441
PC3,0.089141
PC4,0.043358


***

## 8.b. Calculating the proportion of variance explained (PVE) directly from the formula: $\frac{\sum_{i=1}^n(\sum_{j=1}^p\phi_{jm}x_{ij})^2}{\sum_{j=1}^p\sum_{i=1}^nx_{ij}^2}$

In [68]:
# pca.components_ stores one component per row; transpose so that each column is a
# component's loading vector and each row is an original feature.
loadings = np.transpose(pca.components_)
loadings_df = pd.DataFrame(
    data=loadings,
    index=df.columns,
    columns=principaldf.columns,
)
loadings_df

Unnamed: 0,PC1,PC2,PC3,PC4
Murder,0.535899,0.418181,-0.341233,0.649228
Assault,0.583184,0.187986,-0.268148,-0.743407
UrbanPop,0.278191,-0.872806,-0.378016,0.133878
Rape,0.543432,-0.167319,0.817778,0.089024


In [115]:
# PC1
# Numerator: sum over observations of the squared PC1 score, i.e. the squared
# projection of each standardised row onto the PC1 loading vector.
num = np.sum(np.dot(df, loadings_df.PC1) ** 2)

# Denominator: total sum of squares of the standardised data. Built in one vectorised
# pass (per-row sums of squares, then their total) rather than growing a frame with
# DataFrame.append, which was removed in pandas 2.0, over a hard-coded 50 rows.
denomdf = (df ** 2).sum(axis=1).to_frame('sums').reset_index(drop=True)
denom = denomdf.sum()

# denom is a one-element Series labelled 'sums', so the ratio is one as well.
PVE_PC1 = num / denom
PVE_PC1

sums    0.62006
dtype: float64

In [119]:
# PC2
# Numerator: sum over observations of the squared PC2 score, i.e. the squared
# projection of each standardised row onto the PC2 loading vector.
num = np.sum(np.dot(df, loadings_df.PC2) ** 2)

# Denominator: total sum of squares of the standardised data. Built in one vectorised
# pass (per-row sums of squares, then their total) rather than growing a frame with
# DataFrame.append, which was removed in pandas 2.0, over a hard-coded 50 rows.
denomdf = (df ** 2).sum(axis=1).to_frame('sums').reset_index(drop=True)
denom = denomdf.sum()

# denom is a one-element Series labelled 'sums', so the ratio is one as well.
PVE_PC2 = num / denom
PVE_PC2

sums    0.247441
dtype: float64

In [120]:
# PC3
# Numerator: sum over observations of the squared PC3 score, i.e. the squared
# projection of each standardised row onto the PC3 loading vector.
num = np.sum(np.dot(df, loadings_df.PC3) ** 2)

# Denominator: total sum of squares of the standardised data. Built in one vectorised
# pass (per-row sums of squares, then their total) rather than growing a frame with
# DataFrame.append, which was removed in pandas 2.0, over a hard-coded 50 rows.
denomdf = (df ** 2).sum(axis=1).to_frame('sums').reset_index(drop=True)
denom = denomdf.sum()

# denom is a one-element Series labelled 'sums', so the ratio is one as well.
PVE_PC3 = num / denom
PVE_PC3

sums    0.089141
dtype: float64

In [121]:
# PC4
# Numerator: sum over observations of the squared PC4 score, i.e. the squared
# projection of each standardised row onto the PC4 loading vector.
num = np.sum(np.dot(df, loadings_df.PC4) ** 2)

# Denominator: total sum of squares of the standardised data. Built in one vectorised
# pass (per-row sums of squares, then their total) rather than growing a frame with
# DataFrame.append, which was removed in pandas 2.0, over a hard-coded 50 rows.
denomdf = (df ** 2).sum(axis=1).to_frame('sums').reset_index(drop=True)
denom = denomdf.sum()

# denom is a one-element Series labelled 'sums', so the ratio is one as well.
PVE_PC4 = num / denom
PVE_PC4

sums    0.043358
dtype: float64

In [129]:
# Stack the four single-element PVE results into one column, labelled by component.
PVE_formula = pd.DataFrame(
    np.concatenate([PVE_PC1.values, PVE_PC2.values, PVE_PC3.values, PVE_PC4.values]),
    index=principaldf.columns,
    columns=['explained variance ratio'],
)
PVE_formula

Unnamed: 0,explained variance ratio
PC1,0.62006
PC2,0.247441
PC3,0.089141
PC4,0.043358


**Therefore, both approaches give the same PVE for every principal component.**