# PCA-Beispiel

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import scipy
# Print NumPy arrays with two decimals and suppress scientific notation.
np.set_printoptions(precision=2,suppress=True)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

**Aufgabe**
- Visualisiere Datensätze
- Führe eine PCA durch und diskutiere die Eigenwerte der Kovarianzmatrix
- Projiziere den Datensatz auf die zwei grössten Hauptkomponenten


## Einfacher Datensatz
(2-dimensional)

In [None]:
# PCA data: eight sample points in two dimensions (one row per point).
XY = np.array([[1,1], [1, 2], [2,1], [2, 2], [3, 4], [4, 5], [5,4], [7, 6]])

In [None]:
# Scatter the raw points: rows of XY.T are the x- and y-coordinates.
plt.scatter(*XY.T)
# Equal aspect ratio so distances look the same along both axes.
plt.gca().set_aspect(1)

### PCA von Hand

In [None]:
# Center the data: subtract the per-column mean so each feature has mean zero.
XY_cent = XY - np.mean(XY, axis=0)

In [None]:
# Scatter matrix of the centered data (X_c^T X_c). Despite the name `cov`,
# it is NOT divided by n - 1, i.e. it is (n - 1) times the sample covariance.
cov = np.matmul(XY_cent.T, XY_cent)
cov

In [None]:
# Cross-check against NumPy: np.cov normalizes by n - 1 by default, so
# multiplying by n - 1 should reproduce the hand-computed scatter matrix.
# n is taken from the data instead of hard-coding 7.
np.cov(XY_cent.T) * (len(XY_cent) - 1)

In [None]:
# Same check on the uncentered data: np.cov subtracts the mean itself, so
# the result should be identical. n is taken from the data instead of
# hard-coding 7.
np.cov(XY.T) * (len(XY) - 1)

In [None]:
# `cov` is real and symmetric, so use the symmetric solver `eigh` instead of
# the general `eig`: it returns real eigenvalues and orthonormal eigenvectors
# (column i of `trans` belongs to eig[i]).
eig, trans = scipy.linalg.eigh(cov)
# eigh sorts eigenvalues in ascending order; reverse both outputs so that the
# direction with the largest variance comes first (the usual PCA ordering).
eig, trans = eig[::-1], trans[:, ::-1]
print(eig)
print(trans)

In [None]:
# Keep only the real part of the eigenvalues; for a real symmetric matrix
# the imaginary parts are zero.
eig = eig.real
eig

In [None]:
# Project the centered points onto the eigenvector basis (columns of `trans`).
XY_trans = np.matmul(XY_cent, trans)
XY_trans

In [None]:
# Sanity check: if the eigenvectors are orthonormal, this product is
# (numerically) the identity matrix.
trans.T @ trans

In [None]:
# Scatter the points in the eigenvector basis: rows of XY_trans.T are the
# coordinates along the first and second eigenvector.
plt.scatter(*XY_trans.T)
# Equal aspect ratio so the spread along both axes is comparable.
plt.gca().set_aspect(1)

### Mit scikit-learn

In [None]:
# PCA estimator with default arguments (n_components is left unset).
pca = PCA()

In [None]:
# Fit the PCA on the raw data and transform it in one step. Compare with the
# hand-computed XY_trans; the sign of individual components may differ.
XY_t = pca.fit_transform(XY)
XY_t

In [None]:
# Show every attribute stored on the fitted estimator (its instance __dict__).
vars(pca)

In [None]:
# Squared singular values; compare with the eigenvalues of the hand-computed
# scatter matrix `cov`.
np.square(pca.singular_values_)

In [None]:
# Fraction of the total variance attributed to each principal component.
pca.explained_variance_ratio_

In [None]:
# Variance of the data along each principal component.
pca.explained_variance_

In [None]:
# Normalize the explained variances so they sum to 1; with all components
# kept this should match pca.explained_variance_ratio_.
# The attribute is referenced explicitly instead of IPython's `_` (last cell
# output), which silently changes meaning when cells run in a different order.
pca.explained_variance_ / pca.explained_variance_.sum()

## Spiraldatensatz

In [None]:
# Load the spiral dataset from a CSV file (path is relative to the working
# directory) and show its (rows, columns) shape.
df = pd.read_csv('spiraldatensatz.csv')
df.shape

In [None]:
# Peek at the first rows to see the column names and value ranges.
df.head()

In [None]:
# Pairwise 2D views of the features x1, x2, x3, stacked vertically in one figure.
plt.figure(figsize=(20, 20))
ax1, ax2, ax3 = (plt.subplot(3, 1, row) for row in (1, 2, 3))

feature_pairs = (("x1", "x2"), ("x1", "x3"), ("x2", "x3"))
for axis, (horizontal, vertical) in zip((ax1, ax2, ax3), feature_pairs):
    # Small markers and an equal aspect ratio keep the geometry undistorted.
    axis.scatter(df[horizontal], df[vertical], s=1)
    axis.set_aspect(1)

In [None]:
# 3D scatter plot of the three features x1, x2, x3.
fig = plt.figure(figsize=(20, 20))
ax = fig.add_subplot(projection="3d")
# df_seg is currently just an alias for the full DataFrame — presumably a
# hook for plotting only a subset of the rows; TODO confirm.
df_seg = df
ax.scatter3D(df_seg.x1, df_seg.x2, df_seg.x3)

In [None]:
# Sample covariance matrix of the features; rowvar=False tells np.cov that
# columns are variables and rows are observations.
np.cov(df.to_numpy(), rowvar=False)

In [None]:
# PCA estimator with default arguments (n_components is left unset).
pca3 = PCA()

In [None]:
# Fit the PCA on the spiral data and transform it; df_t is indexed as a 2D
# array below (rows = samples, columns = principal components).
df_t = pca3.fit_transform(df)

In [None]:
# Shape of the transformed data: (samples, components).
df_t.shape

In [None]:
# Plot the data in the plane of the first two principal components; the
# third component is encoded as the marker colour.
plt.figure(figsize=(10,10))
plt.scatter(df_t[:,0], df_t[:,1], s=1, c=df_t[:,2], cmap='cool')
# Equal aspect ratio so both component axes share the same scale.
plt.gca().set_aspect(1)
# Colour scale for the third component.
plt.colorbar()

In [None]:
# Show every attribute stored on the fitted estimator (its instance __dict__).
vars(pca3)

In [None]:
# Histogram of the transformed data along each of the first three
# principal components, stacked vertically in one figure.
plt.figure(figsize=(20, 20))
ax1, ax2, ax3 = (plt.subplot(3, 1, row) for row in (1, 2, 3))

# Ending the cell with a loop (a statement, not an expression) keeps the
# notebook from echoing the return value of the last hist() call.
for component, axis in enumerate((ax1, ax2, ax3)):
    axis.hist(df_t[:, component])

### Reduktion auf zwei Dimensionen

In [None]:
# Keep only the first two principal components and fit on the spiral data.
pca2 = PCA(n_components=2)
pca2.fit(df)

In [None]:
# Principal axes of the 2-component model (one row per component).
pca2.components_

In [None]:
# Principal axes of the full model, for comparison with pca2.components_.
pca3.components_

In [None]:
# Singular values belonging to the two retained components.
pca2.singular_values_

In [None]:
# Variance of the data along each of the two retained components.
pca2.explained_variance_

In [None]:
# Fraction of the total variance captured by each retained component; the
# remainder up to 1 is what the 2D reduction discards.
pca2.explained_variance_ratio_