# Introduction to Dimensionality Reduction with PCA
Principal Component Analysis

In [None]:
%pylab inline

import platform

import IPython
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import sklearn
from matplotlib.ticker import MaxNLocator
from sklearn import datasets
from sklearn.decomposition import PCA
# Load the Iris dataset bundled with scikit-learn.
iris = datasets.load_iris()

# Set to True to drop the setosa class (target 0) and keep the other classes.
remove_setosa = False

if remove_setosa:
    # Boolean mask: True for every non-setosa sample, False otherwise.
    indices_no_setosa = iris.target != 0
    X_iris = iris.data[indices_no_setosa]
    y_iris = iris.target[indices_no_setosa]
else:
    X_iris = iris.data
    y_iris = iris.target

# Use the explicit `np` alias imported at the top of this cell; the bare
# `numpy` name is only in scope as a side effect of `%pylab inline`.
np.set_printoptions(precision=4)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
# Hold out a random 10% of the samples as the test set (test_size=0.10).
# random_state pins the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_iris, y_iris, test_size=0.10, random_state=33
)

colormarkers = [['red', 's'], ['greenyellow', 'o'], ['blue', 'x']]

# Scale the features. The scaler is fitted on the training split only and the
# fitted transform is then re-used on the test split.
scaler = preprocessing.MinMaxScaler()
# Alternative: scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Iris feature order is (sepal length, sepal width, petal length, petal width),
# so column 0 is sepal length and column 2 is petal length.
px, py = X_train[:, 0], X_train[:, 2]

# Create exactly one figure; the original called plt.figure() twice, which
# left an extra, empty figure in the output.
fig, ax = plt.subplots()
ax.axis("equal")

ax.scatter(px, py, alpha=0.4, c=y_train)
# Labels follow the plotted columns: x is sepal length, y is petal length.
ax.set_xlabel('S.length')
ax.set_ylabel('P.length')
plt.show()

In [None]:
# Fit a two-component PCA on columns 0 and 2 of the scaled training data and
# keep the transformed coordinates for the plots that follow.
pca_input_columns = [0, 2]
pca = PCA(n_components=2)
txd = pca.fit_transform(X_train[:, pca_input_columns])

def draw_vector(j, v0, v1, ax=None):
    """Draw an arrow from ``v0`` to ``v1`` and label its tip ``PC<j+1>``.

    Parameters
    ----------
    j : int
        Zero-based component index; the label printed at the tip is ``j + 1``.
    v0 : sequence of float
        Point where the arrow starts.
    v1 : sequence of float
        Point where the arrow ends; the text label is anchored here.
    ax : optional
        Axes-like object providing ``annotate``. When omitted, the current
        pyplot axes (``plt.gca()``) are used.
    """
    # Compare against None explicitly: `ax or plt.gca()` would silently
    # discard any axes-like object that happens to evaluate as falsy.
    if ax is None:
        ax = plt.gca()
    arrowprops = dict(arrowstyle='->', linewidth=2, shrinkA=0, shrinkB=0)
    # Empty text with v0 as the text position draws only the arrow v0 -> v1.
    ax.annotate('', v1, v0, arrowprops=arrowprops)
    ax.annotate("PC{}".format(1 + j), v1)

# Plot the two original (scaled) features and overlay the principal components.
px, py = X_train[:, 0], X_train[:, 2]
fig, ax = plt.subplots(figsize=(8, 8))
ax.axis("equal")
ax.scatter(px, py, alpha=0.4, c=y_train)

print("Our PCA scale factors and direction vectors are:")
for i, (var, direction_vec) in enumerate(zip(pca.explained_variance_, pca.components_)):
    print(var, direction_vec)
    # Arrow length is the square root of the explained variance, i.e. the
    # standard deviation of the data along this component.
    std_dev = np.sqrt(var)
    # Arrows start at the data mean; pass the axes explicitly instead of
    # relying on pyplot's "current axes" state.
    draw_vector(i, pca.mean_, pca.mean_ + direction_vec * std_dev, ax=ax)
plt.show()

In [None]:
# Toggle: when True, the points are additionally drawn collapsed onto the
# horizontal (PC1) and vertical (PC2) axes.
overlay_pcs = False
print("Replot, with rotated X,y values,")
print("And project points onto PC1:")

# The columns of the PCA-transformed data are the PC1 and PC2 coordinates.
px = txd[:, 0]
py = txd[:, 1]

fig = plt.figure(figsize=(8, 8))
ax = fig.gca()
ax.axis("equal")

# Fade the main cloud when the projections are drawn on top of it.
if overlay_pcs:
    bg_alpha = 0.2
else:
    bg_alpha = 0.5
ax.scatter(px, py, alpha=bg_alpha, c=y_train)

if overlay_pcs:
    ax.scatter(px, [0] * len(px), alpha=0.5, c=y_train)
    ax.scatter([0] * len(py), py, alpha=0.5, c=y_train)

ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
plt.show()

# Report the share of total variance carried by each component.
for pc_number, ratio in enumerate(pca.explained_variance_ratio_, start=1):
    print("PC{} explains {:.2f}% of variance".format(pc_number, ratio * 100))

## Singular Value Decomposition of the Covariance Matrix
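Underneath the sklearn wrapper, PCA performs an SVD of the mean-centred feature matrix, which yields the same principal directions as decomposing the covariance matrix of our features. Here we do the equivalent steps ourselves by running SVD on the covariance matrix directly. Compare the U, S, V values to the scale and direction values in the section above. They should match, although the sign of a direction vector may be flipped.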

In [None]:
# The same two feature columns (0 and 2) that were fed to the 2-D PCA above.
M = X_train[:, [0, 2]]
print(M.shape)

# np.cov treats each row as a variable, hence the transpose. Use the explicit
# `np` alias: the bare `numpy` name only exists because of `%pylab inline`.
covariance_mx = np.cov(M.T)
print("Numpy cov:\n", covariance_mx)

# Do not confuse the covariance matrix with the correlation matrix (which is
# normalised)! np.corrcoef(M.T) would give the latter.

fig, ax = plt.subplots()
im = ax.imshow(covariance_mx)

# Show one tick per row / column of the matrix.
n_rows, n_cols = covariance_mx.shape
ax.set_xticks(np.arange(n_cols))
ax.set_yticks(np.arange(n_rows))

# Rotate the tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")

# Write each matrix entry into its cell (x is the column, y is the row).
for i in range(n_rows):
    for j in range(n_cols):
        ax.text(j, i, round(covariance_mx[i, j], 3),
                ha="center", va="center", color="grey")

print()
print("Diagonals of the covariance matrix are the variances of individual features")
print("Off-diagonals show covariances: how much two features vary together")

ax.set_title("Covariance matrix")
plt.show()

In [None]:
# Factorise the covariance matrix as U @ diag(S) @ V_T.
U, S, V_T = sp.linalg.svd(covariance_mx)

# Each print emits its heading followed by the values on the next line.
print("U matrix (PC directions)", U, sep="\n")
print(
    "\nS(igma) (scale values)",
    "This list is taken from the diagonal of the Sigma matrix, ordered by descending magnitude",
    S,
    sep="\n",
)
print("\nV^T matrix (same as U)", V_T, sep="\n")

Compare with values taken from original PCA operation above:

    0.1336 [0.5949 0.8038]
    0.0087 [-0.8038  0.5949]

## Look at how the PCs explain the data, with a Scree plot
Now we compress the full set of Iris data (4 components) using PCA.
We use the output to create a scree plot which can be used to select an appropriate dimensionality reduction.

In [None]:
# Fit PCA keeping all four components of the scaled Iris training data.
pca4d = PCA(n_components=4)
pca4d.fit(X_train)

cmps = pca4d.components_
print(cmps)

# Fraction of the total variance explained by each component.
variance = pca4d.explained_variance_ratio_

# Cumulative percentage of variance explained by the first n components.
# Accumulate the exact ratios: rounding each ratio before summing would let
# the rounding errors pile up in the running total.
var = np.cumsum(variance) * 100

fig, ax = plt.subplots(figsize=(8, 6))
ax.set_ylabel('% Variance Explained')
ax.set_xlabel('# of Components')
ax.set_title('Iris data, PCA Scree Plot')
ax.set_ylim(70, 100.5)

# Component counts are integers, so only place ticks on whole numbers.
ax.xaxis.set_major_locator(MaxNLocator(integer=True))

# Derive the x values from the data instead of hard-coding [1, 2, 3, 4].
ax.plot(np.arange(1, len(var) + 1), var)
plt.show()

## Higher dimensioned datasets
Now we do some PCA and scree plots with the wine (13D) and Boston (14D if we include price) datasets.

In [None]:
# Load the wine dataset and standardise its features before PCA.
wine = sklearn.datasets.load_wine()
scaler = sklearn.preprocessing.StandardScaler()
X = wine.data
y = wine.target

X = scaler.fit_transform(X)

print(X.shape, y.shape)

# Keep every component so the scree plot below covers the full spectrum.
k = X.shape[1]
pca_kd = PCA(n_components=k)
txd = pca_kd.fit_transform(X)

# Scatter of the first two principal components, coloured by class label.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(txd[:, 0], txd[:, 1], c=y)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_title("Wine, reduced from 13D")
plt.show()

# Fraction of the total variance explained by each component.
variance = pca_kd.explained_variance_ratio_
# Cumulative percentage explained by the first n components. Accumulate the
# exact ratios: rounding each one first lets rounding errors pile up.
var = np.cumsum(variance) * 100

fig, ax = plt.subplots(figsize=(8, 6))
ax.set_ylabel('% Variance Explained')
ax.set_xlabel('# of Components')
ax.set_title('Wine Scree Plot')
ax.set_ylim(0, 100.5)
# Component counts are integers, so only place ticks on whole numbers.
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.plot(np.arange(1, k + 1), var)
plt.show()

In [None]:
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on scikit-learn < 1.2. Replace the dataset (or
# pin the library version) before relying on this cell.
boston = sklearn.datasets.load_boston()
X = boston.data
y = boston.target
# Despite the variable name, this is a StandardScaler, not a MinMaxScaler.
mms = sklearn.preprocessing.StandardScaler()
# Append the price as an extra column before scaling. Use the explicit `np`
# alias: the bare `numpy` name only exists because of `%pylab inline`.
X = mms.fit_transform(np.c_[X, y])
print(X.shape, y.shape)

# Keep every component so the scree plot below covers the full spectrum.
k = X.shape[1]
pca_kd = PCA(n_components=k)
txd = pca_kd.fit_transform(X)

# Axes3D is only needed for the optional 3-D projection mentioned below.
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(8, 8))
# For a 3-D view use: ax = fig.add_subplot(111, projection='3d')
ax = fig.add_subplot(111)

# Scatter of the first two principal components, coloured by price.
ax.scatter(txd[:, 0], txd[:, 1], c=y)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_title("Boston, reduced from 14D")
plt.show()

# Fraction of the total variance explained by each component.
variance = pca_kd.explained_variance_ratio_
# Cumulative percentage explained by the first n components. Accumulate the
# exact ratios: rounding each one first lets rounding errors pile up.
var = np.cumsum(variance) * 100

fig, ax = plt.subplots(figsize=(8, 6))
ax.set_ylabel('% Variance Explained')
ax.set_xlabel('# of Components')
ax.set_title('Boston Scree Plot')
ax.set_ylim(0, 100.5)
# Component counts are integers, so only place ticks on whole numbers.
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.plot(np.arange(1, k + 1), var)
plt.show()

## Summary
In this notebook, we:
 - Tested PCA on a boring 2d dataset that we extracted from the Iris data
 - Plotted the PCs on the original dataset, got their directional and length values
 - Transformed the dataset and projected its points onto the PCs
 - Performed SVD outside of the PCA routine and verified that we get the same PCs
 - Used a scree plot to see how successive PCs add to the cumulative variance explained
 - Did PCA on a pair of higher dimensional datasets and made scree plots for them