In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from seaborn import heatmap
# Apply matplotlib's 'ggplot' style sheet to the figures drawn below.
plt.style.use('ggplot')
# IPython magic: render figures inline in the notebook output.
%matplotlib inline

## Load the Iris dataset

In [None]:
# Load the Iris bunch and pull out its feature array and target array.
iris = load_iris()
X, y = iris.data, iris.target

# Tabulate the features under their names and preview the first rows.
df = pd.DataFrame(data=X, columns=iris.feature_names)
df.head()

## Normalize the dataset

In [None]:
# Standardise each column: remove its mean, then divide by its standard deviation.
x_centered = X - np.mean(X, axis=0)
x_scaled = x_centered / np.std(x_centered, axis=0)

# Sanity check: print the column means and variances of the scaled data.
print("Column-wise mean", np.mean(x_scaled, axis=0))
print("Column-wise variance", np.var(x_scaled, axis=0))

## Find the Covariance matrix

In [None]:
# IPython help lookup: show the docstring of np.cov.
np.cov?

In [None]:
# Covariance between the standardised features; rowvar=False treats each
# column of x_scaled as one variable (equivalent to passing the transpose).
C = np.cov(x_scaled, rowvar=False)
C

In [None]:
# Label the rows and columns of the covariance matrix with the feature names.
cdf = pd.DataFrame(data=C, columns=iris.feature_names, index=iris.feature_names)
cdf

In [None]:
# Annotated heatmap of the labelled covariance matrix, drawn with the Blues colour map.
heatmap(data=cdf, cmap=plt.cm.Blues, annot=True)

## Get the eigenvalues & eigenvectors of the covariance matrix

In [None]:
# IPython help lookup: show the docstring of np.linalg.eig.
np.linalg.eig?

In [None]:
# C is a covariance matrix and therefore symmetric, so use the symmetric
# solver: eigh returns real eigenvalues (in ascending order) and orthonormal
# eigenvectors, whereas the general-purpose eig makes neither guarantee.
# w holds the eigenvalues; column v[:, i] is the eigenvector paired with w[i].
w, v = np.linalg.eigh(C)
w

In [None]:
# Rank the eigenvalues from largest to smallest, then reorder the eigenvector
# columns so that they follow the same ranking.
ix = np.flip(np.argsort(w))
v_sorted = v[:, ix]

## Select the number of components to keep

In [None]:
# Number of leading principal components kept when the projected data is sliced below.
N = 2

## Project the data onto the principal components

In [None]:
# Pseudo-inverse of the sorted eigenvector matrix; it is used below to express
# the data in the eigenvector basis.
v_inv = np.linalg.pinv(v_sorted)

In [None]:
# Express every sample in the eigenvector basis (one row per sample) ...
x_lr = (v_inv @ x_scaled.T).T
# ... and keep only the first N coordinates.
x_lr_reduced = x_lr[:, :N]

In [None]:
# Scatter the samples on the first two retained components, coloured by y.
plt.scatter(x=x_lr_reduced[:, 0], y=x_lr_reduced[:, 1], c=y)

# Automate PCA with sklearn

In [None]:
# Fit on the standardised data (x_scaled), i.e. the same input the manual
# eigen-decomposition above used. sklearn's PCA centres the data but does not
# rescale it, so fitting on the raw X would decompose a different covariance
# matrix and would not reproduce the hand-computed projection.
pca = PCA()
x_lr = pca.fit_transform(x_scaled)

In [None]:
# Scatter the samples on the first two sklearn components, coloured by y.
plt.scatter(x=x_lr[:, 0], y=x_lr[:, 1], c=y)

## Variance of each component - Scree Plots

In [None]:
# Scree plot: one bar per principal component, showing its explained variance.
pd.Series(pca.explained_variance_).plot.bar()
plt.xlabel('PCs')
plt.ylabel('Variance')

In [None]:
# Scree plot: one bar per principal component, showing its share of the total variance.
pd.Series(pca.explained_variance_ratio_).plot.bar()
plt.xlabel('PCs')
plt.ylabel('Ratio of variance')

In [None]:
# Cumulative share of variance explained by the first k components.
cum_ratio = np.cumsum(pca.explained_variance_ratio_)
# Plot against k = 1..n rather than the default 0-based index, so that the
# x-axis really is the "Number of PCs" its label promises.
n_pcs = np.arange(1, len(cum_ratio) + 1)
plt.plot(n_pcs, cum_ratio, '-o')
plt.xticks(n_pcs)
plt.xlabel('Number of PCs')
plt.ylabel('Cumulative Variance Ratio')

## Biplot

In [None]:
# Biplot: the samples in the plane of the first two components, plus one red
# arrow per original feature drawn from the origin to that feature's
# coefficients in the first two rows of pca.components_.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(x_lr[:, 0], x_lr[:, 1], c=y)
components = pca.components_
for i, feat in enumerate(iris.feature_names):
    # Column i of the first two rows: feature i's coefficient on each of the two components.
    xx, yy = components[:2, i]
    ax.arrow(0, 0, xx, yy, color='r', width=0.02, length_includes_head=True, head_width=0.1)
    ax.text(xx, yy, feat)
# Label the axes so the figure can be read on its own.
ax.set(xlabel='PC1', ylabel='PC2');

# Exercise: Try PCA on the Breast Cancer dataset (given below):
## 1. Find the number of PCs required to explain more than 99% of the variance
## 2. Plot the top 2 PCs

In [None]:
# Load the breast-cancer data for the exercise (load_breast_cancer is imported
# with the other dependencies at the top of the notebook).
# NOTE: this rebinds X, y and df, which held the Iris data until now, so any
# earlier cell re-run after this one will operate on the breast-cancer data.
bc = load_breast_cancer()
X = bc.data
y = bc.target
df = pd.DataFrame(X, columns=bc.feature_names)
df.head()

In [None]:
# enter code here