# Principal Component Analysis

## Variance explained by PCs
The first PC captures ~69% of the variance in the data.  
The second PC captures ~19% of the variance.  
The first two PCs capture > 88% of the variance in the data. 

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Load the penguin measurements; the path is relative to the notebook's working directory.
penguins_all = pd.read_csv('data/penguins_af.csv')

In [None]:
# List the column names of the loaded penguins table.
penguins_all.columns

In [None]:
# Display the loaded penguins table as this cell's output.
penguins_all

In [None]:
# Species labels, in the order they are paired with plot colours.
species_names = ['Adelie', 'Chinstrap', 'Gentoo']

# Feature matrix: the four measurement columns; target: the species label.
feature_cols = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
X = penguins_all[feature_cols]
y = penguins_all['species']

# Standardise every feature to zero mean and unit variance before PCA.
scaler = StandardScaler()
X_scal = scaler.fit_transform(X)

# Fit a 4-component PCA on the standardised features, then project them.
pca = PCA(n_components=4)
pca.fit(X_scal)
X_r = pca.transform(X_scal)

# Proportion of variance explained by each component
pca.explained_variance_ratio_

In [None]:
# One row per principal component, holding its share of the total variance.
pc_names = ['PC1', 'PC2', 'PC3', 'PC4']
df = pd.DataFrame({'var': pca.explained_variance_ratio_}, index=pc_names)

# Bar chart of the per-component variance share, with the y-axis fixed to [0, 0.8].
pl = df.plot.bar(figsize=(5, 4))
pl.set_ylabel("Variance Explained")
pl.set_ylim([0, 0.8])

## Plotting transformed data
Plot the Penguins data using the first two PCs.  
That is, the plot shows the Penguins data projected into a 2D space that preserves more than 88% of the variance in the data.

In [None]:
# Scatter the penguins in the plane of the first two principal components,
# one colour per species.
fig, ax = plt.subplots()
colors = ['navy', 'turquoise', 'darkorange']
lw = 2

for color, species in zip(colors, species_names):
    # Boolean row mask as a plain array so it can index the NumPy array X_r.
    mask = np.asarray(y == species)
    ax.scatter(X_r[mask, 0], X_r[mask, 1], color=color, alpha=.8, lw=lw,
               label=species)
ax.legend(loc='best', shadow=False, scatterpoints=1)

# Take the axis percentages from the fitted PCA rather than hard-coding them,
# so the labels stay correct if the data or the preprocessing changes.
pc1_share, pc2_share = pca.explained_variance_ratio_[:2]
ax.set_xlabel(f'PC1 ({pc1_share:.0%})')
ax.set_ylabel(f'PC2 ({pc2_share:.0%})')
ax.set_title('PCA of Penguins dataset')

plt.show()

## Top Trumps
`HarryPotterTT.csv` contains data on Top Trumps cards.  
This is an insightful example of clustering because some of the clusters are meaningful and some are not.   

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
# Load the Harry Potter Top Trumps card data.
TT_df = pd.read_csv('data/HarryPotterTT.csv')
# Keep an untouched copy of the full table; TT_df itself loses its 'Name'
# column in a later cell.
TT_df_dash = TT_df.copy()
# Preview the first rows.
TT_df.head()

In [None]:
# Split the card names from the remaining attribute columns.
# Both are built from the untouched copy and TT_df is rebound (instead of
# popping the column in place), so re-running this cell does not raise
# KeyError: 'Name'. TT_df still ends up without the 'Name' column.
y = TT_df_dash['Name'].values
TT_df = TT_df_dash.drop(columns='Name')
X = TT_df.values

# Standardise every attribute to zero mean and unit variance before PCA.
X_scal = StandardScaler().fit_transform(X)
X.shape

In [None]:
# Fit a 4-component PCA on the standardised card attributes, then project them.
pcaHP = PCA(n_components=4)
pcaHP.fit(X_scal)
X_r = pcaHP.transform(X_scal)

# Share of the total variance carried by each component
pcaHP.explained_variance_ratio_

In [None]:
# Show the principal axes (`components_`) of the fitted Top Trumps PCA.
pcaHP.components_

In [None]:
# One row per principal component, holding its share of the total variance.
pc_names = ['PC1', 'PC2', 'PC3', 'PC4']
df = pd.DataFrame({'var': pcaHP.explained_variance_ratio_}, index=pc_names)

# Red bar chart of the per-component variance share, with the y-axis fixed to [0, 0.8].
pl = df.plot.bar(color='red', figsize=(5, 4))
pl.set_ylabel("Variance Explained")
pl.set_ylim([0, 0.8])

In [None]:
# Scatter the Top Trumps cards in the plane of the first two principal
# components and annotate every point.
fig, ax = plt.subplots(figsize=(8, 6))

# Short display names for selected cards, keyed by row position. Any row
# without an entry is annotated with its row number. Using a lookup (rather
# than assigning into fixed list positions) avoids an IndexError when the
# table has fewer rows than the highest hard-coded position.
short_names = {
    0: 'Harry',
    1: 'Hermione',
    2: 'Ron',
    3: 'Prof D',
    4: 'Snape',
    5: 'Prof McG',
    6: 'Prof Moody',
    7: 'Hagrid',
    8: 'Fred',
    10: 'Arthur',
    11: 'Crabbe',
    12: 'Draco Malfoy',
    13: 'Goyle',
    16: 'Fleur',
    18: 'Cedric D',
    19: 'Viktor K',
    21: 'Lucius Malfoy',
}
labels = [short_names.get(i, i) for i in range(len(y))]

ax.scatter(X_r[:, 0], X_r[:, 1])

# Place each label just up and to the left of its point.
for label, xi, yi in zip(labels, X_r[:, 0], X_r[:, 1]):
    ax.annotate(
        label,
        xy=(xi, yi), xytext=(-3, 3),
        textcoords='offset points', ha='right', va='bottom')

# Take the axis percentages from the fitted PCA rather than hard-coding them,
# so the labels stay correct if the data or the preprocessing changes.
pc1_share, pc2_share = pcaHP.explained_variance_ratio_[:2]
ax.set_xlabel(f'PC1 ({pc1_share:.0%})')
ax.set_ylabel(f'PC2 ({pc2_share:.0%})')
ax.set_title('PCA of HP dataset')

plt.show()

In [None]:
# Display the untouched copy of the Top Trumps table (still including 'Name').
TT_df_dash