In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_digits
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA

%matplotlib inline

### 1. Replication of exercise from slides

In [None]:
# Build 200 correlated 2-D points by pushing standard-normal samples through
# a random 2x2 mixing matrix; the fixed seed keeps the cloud reproducible.
rng = np.random.RandomState(42)
mixing = rng.rand(2, 2)
latent = rng.randn(2, 200)
X = (mixing @ latent).T

# Equal axis scaling so the elongation of the cloud is not distorted.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1])
ax.axis('equal');

In [None]:
# Fit a two-component PCA to the 2-D point cloud X built in the previous cell.
pca = PCA(n_components=2)
pca.fit(X)  # last expression of the cell, so the notebook displays what fit() returns

In [None]:
# Show the component vectors of the fitted PCA (one row per component).
print(pca.components_)

In [None]:
# Show the variance of the data along each fitted component.
print(pca.explained_variance_)

In [None]:
def draw_vector(v0, v1, ax=None):
    """Draw an arrow from point ``v0`` to point ``v1``.

    Parameters
    ----------
    v0 : sequence of two floats
        Tail of the arrow.
    v1 : sequence of two floats
        Head of the arrow.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted, the current axes (``plt.gca()``)
        is used.

    Returns
    -------
    None
    """
    # Compare against None explicitly: ``ax or plt.gca()`` would silently
    # discard a caller-supplied axes-like object that evaluates as falsy.
    if ax is None:
        ax = plt.gca()
    arrowprops = dict(arrowstyle='->', linewidth=2, shrinkA=0, shrinkB=0)
    # With empty text only the arrow is rendered; it runs from ``xytext``
    # (the tail) to ``xy`` (the head).
    ax.annotate('', xy=v1, xytext=v0, arrowprops=arrowprops)

In [None]:
# Scatter the data and overlay one arrow per principal axis, anchored at the
# data mean and scaled to three standard deviations along that axis.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], alpha=0.5)
for variance, direction in zip(pca.explained_variance_, pca.components_):
    offset = direction * 3 * np.sqrt(variance)
    draw_vector(pca.mean_, pca.mean_ + offset, ax=ax)
ax.axis('equal');

### 2. PCA for Dimensionality Reduction

In [None]:
# Refit keeping a single component. This rebinds `pca`, replacing the
# two-component model fitted in section 1.
pca = PCA(n_components=1)
pca.fit(X)  # last expression of the cell, so the notebook displays what fit() returns

In [None]:
# Project X onto the single retained component.
X_pca = pca.transform(X)

In [None]:
# Compare the array shapes before and after the projection.
X.shape, X_pca.shape

In [None]:
# Map the reduced data back into the original feature space.
X_new = pca.inverse_transform(X_pca)

In [None]:
# Original points (faint) against their one-component reconstruction (grey).
fig, ax = plt.subplots()
orig_x, orig_y = X[:, 0], X[:, 1]
recon_x, recon_y = X_new[:, 0], X_new[:, 1]
ax.scatter(orig_x, orig_y, alpha=0.2)
ax.scatter(recon_x, recon_y, alpha=0.8, color='grey')
ax.axis('equal');

### 3. PCA for Visualization

In [None]:
# Load the digits dataset from scikit-learn and show the shape of its data matrix.
digits = load_digits()
digits.data.shape

In [None]:
# 5x5 grid of the first 25 digit images, each tagged with its target label.
fig = plt.figure(figsize=(4, 4))
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)

first_25 = zip(digits.images[:25], digits.target[:25])
for slot, (image, label) in enumerate(first_25, start=1):
    # Subplot slots are 1-based; ticks are removed so only the image shows.
    panel = fig.add_subplot(5, 5, slot, xticks=[], yticks=[])
    panel.imshow(image, cmap=plt.cm.binary, interpolation='nearest')
    panel.text(0, 4, str(label))

In [None]:
# Reduce the digits data to two components and report the shapes before and
# after. Note that this rebinds `pca` once more.
pca = PCA(n_components=2)
projected = pca.fit_transform(digits.data)
print(digits.data.shape, projected.shape, sep='\n')

In [None]:
# Digits in the plane of the first two components, coloured by target label.
fig, ax = plt.subplots(figsize=(8, 8))
points = ax.scatter(
    projected[:, 0], projected[:, 1],
    c=digits.target, cmap='Paired', alpha=0.5, edgecolor='none',
)
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
fig.colorbar(points, ax=ax);

### 4. PCA for "Clustering"

In [None]:
# Synthetic data for the clustering demo: 100 samples with 10 features drawn
# around 4 centers; random_state pins the draw so the cell is reproducible.
# seaborn, used by the plotting cells below, is imported with the other
# dependencies in the notebook's first cell rather than here.
X1, Y1 = make_blobs(
    n_samples=100,
    n_features=10,
    centers=4,
    cluster_std=2,
    random_state=4,
)
X1.shape

In [None]:
# Fresh, not-yet-fitted two-component PCA for the blob data; rebinds `pca` again.
pca=PCA(n_components=2)

In [None]:
# Fit the PCA on X1 and keep X1's coordinates in the two-component space.
pc = pca.fit_transform(X1)

In [None]:
# Check the shape of the projected blob data.
pc.shape

In [None]:
# Share of the total variance attributed to each retained component.
pca.explained_variance_ratio_

In [None]:
# Tabulate the two component scores next to the blob label of each sample.
pc_df = pd.DataFrame(pc, columns=['PC1', 'PC2']).assign(Cluster=Y1)
pc_df.head()

In [None]:
# Bar chart of the explained-variance ratio per principal component.
variance_by_pc = {
    'var': pca.explained_variance_ratio_,
    'PC': ['PC1', 'PC2'],
}
df = pd.DataFrame(variance_by_pc)
sns.barplot(data=df, x='PC', y='var', color='c');

In [None]:
# Scatter of the samples in PC space, coloured by blob label; the regression
# fit is switched off so lmplot acts as a plain grouped scatter plot.
sns.lmplot(
    data=pc_df,
    x='PC1',
    y='PC2',
    hue='Cluster',
    fit_reg=False,
    legend=True,
    scatter_kws={'s': 80},
)