# Семинар 8. Снижение размерности

Подключение библиотек. 
$
\newcommand{\R}{\mathbb{R}}
\newcommand{\X}{\mathbb{X}}
\newcommand{\norm}[1]{\lVert #1 \rVert}
\newcommand{\abs}[1]{\left| #1 \right|}
\newcommand{\E}{\mathbb{E}}
\newcommand{\D}{\mathbb{D}}
\renewcommand{\Prob}{\mathbb{P}}
\renewcommand{\le}{\leqslant}
\renewcommand{\ge}{\geqslant}
\newcommand{\eps}{\varepsilon}
\newcommand{\Normal}{\mathcal{N}}
\DeclareMathOperator{\TP}{TP}
\DeclareMathOperator{\FP}{FP}
\DeclareMathOperator{\TN}{TN}
\DeclareMathOperator{\FN}{FN}
\DeclareMathOperator{\Accuracy}{Accuracy}
\DeclareMathOperator{\Precision}{Precision}
\DeclareMathOperator{\Recall}{Recall}
\DeclareMathOperator{\Fscore}{F_1}
\DeclareMathOperator{\MSE}{MSE}
\DeclareMathOperator{\RMSE}{RMSE}
\DeclareMathOperator{\MAE}{MAE}
\DeclareMathOperator{\MAPE}{MAPE}
\DeclareMathOperator{\Rsqured}{R^2}
$

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as wg

from matplotlib.colors import ListedColormap
from IPython.display import Markdown

from sklearn.metrics import classification_report
from sklearn import set_config

import warnings
# Ignore warnings whose message starts with this text (the second positional
# argument of filterwarnings is a regex matched against the warning message).
warnings.filterwarnings("ignore", 'This pattern has match groups')

# Show scikit-learn estimators as diagrams when a cell displays them.
set_config(display='diagram')

# Shared plotting defaults for the notebook.
cm_bright = ListedColormap(['red', 'blue'])
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in Matplotlib 3.7
# and removed in 3.9.
cm = plt.get_cmap('RdBu')
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12, 8)

In [None]:
def draw_image(img, ax=None):
    """Show a flattened square image with the 'Greys' colormap.

    The side length is the integer square root of ``img.shape[-1]``; the data
    is reshaped to rows of that length and flipped as ``1 - img`` before it is
    passed to ``imshow``.

    Args:
        img: Array exposing ``.shape`` whose last dimension holds the pixels.
        ax: Axes to draw on. When omitted, a new 4x4 figure (number 0) is
            created and shown immediately.

    Returns:
        None.
    """
    side = int(np.sqrt(img.shape[-1]))
    inverted = 1 - np.array(img).reshape(-1, side)

    if ax is not None:
        # Caller owns the figure: draw only, do not show.
        ax.imshow(inverted, cmap='Greys')
        ax.set_axis_off()
        return

    plt.figure(0, (4, 4))
    plt.imshow(inverted, cmap='Greys')
    plt.axis('off')
    plt.show()

In [None]:
from sklearn.datasets import fetch_olivetti_faces


# Load the Olivetti faces dataset and render its bundled description
# (the trailing expression is what the cell displays).
olivetti = fetch_olivetti_faces()
Markdown(olivetti.DESCR)

In [None]:
# Notebook-wide globals: X is the dataset's data array (each row is reshaped to
# a square by draw_image), y is the target array.
X = olivetti.data
y = olivetti.target

In [None]:
# Preview: the first k rows of X drawn side by side in a single row.
k = 10
_, axes = plt.subplots(nrows=1, ncols=k, figsize=(2 * k, 2))

for ax, img in zip(axes, X):
    draw_image(img, ax)
plt.show()

# 1. Метод главных компонент

Применим метод главных компонент для снижения размерности.

Основные свойства:
* информативность компонент (доля объяснённой дисперсии) убывает с ростом номера компоненты;
* признаки некоррелированы;
* преобразование обратимо;
* максимальное число компонент: `min(N, d)`;
* можно использовать как препроцессинг данных.

In [None]:
from sklearn.decomposition import PCA


# Number of principal components to keep for the variance analysis below.
# NOTE(review): presumably 400 equals min(N, d) for this dataset — confirm
# against X.shape.
n_components = 400

# Fit PCA on the faces; the trailing fit call also displays the estimator.
pca = PCA(n_components)
pca.fit(X)

In [None]:
# Share of the total variance explained by each individual component.
explained = pca.explained_variance_ratio_
component_index = np.arange(n_components)

plt.figure(num=0, figsize=(12, 4))
plt.step(component_index, explained, color='blue')
plt.show()

In [None]:
plt.figure(0, (12, 4))
# explained[i] is the cumulative variance share kept by the first i + 1 components.
explained = pca.explained_variance_ratio_.cumsum()

level = 0.9
# Smallest number of components whose cumulative share reaches `level`:
# one more than the count of prefixes that are still below it. Capped at
# n_components in case rounding keeps the last cumulative value under `level`.
n_components_opt = min(int((explained < level).sum()) + 1, n_components)

# The x axis is the component COUNT (1..n_components), so the vertical line
# and its label refer to the same quantity.
plt.step(np.arange(1, n_components + 1), explained, color='blue')
plt.axvline(n_components_opt, color='purple', ls='--', label=f'n_components={n_components_opt}')
plt.legend()
plt.show()

Посмотрим, как выглядят лица после реконструкции:

In [None]:
# Rebind `pca` to a model that keeps only n_components_opt components;
# later cells use this refitted model.
pca = PCA(n_components_opt)
pca.fit(X)

In [None]:
# Take the first sample as the running example.
img = X[0]

# Project it by hand: subtract the fitted mean, then take dot products with
# the principal axes (rows of pca.components_).
img_pca = (X[0] - pca.mean_) @ pca.components_.T
print(img_pca)

In [None]:
# Map the component coordinates back to pixel space and restore the mean.
img_reconstructed = img_pca @ pca.components_ + pca.mean_

# Original on the left, reconstruction on the right.
_, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(8, 4))
draw_image(img, ax=ax1)
draw_image(img_reconstructed, ax=ax2)
plt.show()

Стандартные методы PCA:

In [None]:
# The same projection and reconstruction through the estimator's own API.
X_pca = pca.transform(X)
X_rec = pca.inverse_transform(X_pca)

k = 10
_, axes = plt.subplots(nrows=2, ncols=k, figsize=(3 * k, 6))
# Top row: originals; bottom row: reconstructions. zip stops after k columns.
for img, img_rec, ax1, ax2 in zip(X, X_rec, axes[0], axes[1]):
    draw_image(img, ax1)
    draw_image(img_rec, ax2)

Посмотрим на качество реконструкции в зависимости от числа компонент:

In [None]:
plot_output = wg.Output()


def viz(n_comp):
    """Refit PCA with ``n_comp`` components and redraw the first ten faces.

    The top row shows the originals and the bottom row their reconstructions;
    the figure is rendered into ``plot_output``, replacing its previous content.
    """
    k = 10
    faces = olivetti.data

    model = PCA(n_comp, random_state=42)
    model.fit(faces)
    restored = model.inverse_transform(model.transform(faces[:k]))

    with plot_output:
        plot_output.clear_output(wait=True)
        _, axes = plt.subplots(nrows=2, ncols=k, figsize=(2 * k, 4))
        for j in range(k):
            draw_image(faces[j], axes[0, j])
            draw_image(restored[j], axes[1, j])
        plt.show()


wg.interact(viz, n_comp=wg.IntSlider(min=1, max=400, value=65))
display(plot_output)

Посмотрим на собственные векторы:

In [None]:
# Draw the first k principal axes of the current `pca` model as images.
k = 10
_, axes = plt.subplots(nrows=1, ncols=k, figsize=(2 * k, 2))

for ax, img in zip(axes, pca.components_):
    draw_image(img, ax)
plt.show()

In [None]:
# Dedicated output area for this cell (previously the figure was written into
# the earlier cell's `plot_output`, leaving `output` unused).
output = wg.Output()

pca_10 = PCA(10, random_state=42)
pca_10.fit(X)


def viz(k0, k1, k2, k3, k4, k5, k6, k7, k8, k9):
    """Draw the image decoded from ten principal-component coordinates.

    Each argument is the coordinate along one of the ten axes of ``pca_10``;
    the vector is mapped back to pixel space with ``inverse_transform`` and
    rendered into ``output``, replacing the previous picture.
    """
    img_pca = np.array([k0, k1, k2, k3, k4, k5, k6, k7, k8, k9])
    img = pca_10.inverse_transform(img_pca.reshape(1, -1))

    with output:
        output.clear_output(wait=True)
        # Called without axes, draw_image creates and shows its own figure,
        # so no extra plt.show() is needed here.
        draw_image(img)


# Slider range for each coordinate: +-3 standard deviations, measured on the
# projection produced by pca_10 itself rather than on a leftover X_pca from a
# different model.
comps = {
    f'k{i}': wg.FloatSlider(min=-3 * s, max=3 * s, value=0, step=0.01)
    for i, s in enumerate(pca_10.transform(X).std(axis=0))
}

wg.interact(
    viz,
    **comps
)
display(output)

__Вывод:__ PCA раскладывает каждое изображение в линейную комбинацию "базисных" лиц (главных компонент).

# 2. TSNE

Основные свойства:
* ищет нелинейные зависимости в данных;
* вычислительно сложный, поэтому число компонент должно быть небольшим;
* подходит для визуализации данных.

In [None]:
from sklearn.datasets import load_digits


# Load the digits dataset and render its bundled description
# (the trailing expression is what the cell displays).
digits = load_digits()
Markdown(digits.DESCR)

In [None]:
# Rebind the notebook-wide X / y to the digits data; the cells below no longer
# see the Olivetti arrays under these names.
X = digits.data
y = digits.target

In [None]:
# Preview: a 2 x k grid filled row by row with the first 2 * k rows of X.
k = 10
_, axes = plt.subplots(nrows=2, ncols=k, figsize=(1 * k, 2))
axes = axes.flatten()

for ax, img in zip(axes, X):
    draw_image(img, ax)
plt.show()

In [None]:
# Two-component PCA of the digits, used for the 2-D scatter plot below.
# Rebinds the global `pca` that earlier cells used for the faces model.
pca = PCA(n_components=2)
pca.fit(X)

# Rebinds the global X_pca as well: it now holds the 2-D digit coordinates.
X_pca = pca.transform(X)

In [None]:
# Scatter of the 2-D PCA coordinates; each class is drawn with its own digit
# as the marker glyph.
_, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 12))

for c in range(10):
    Xl = X_pca[y == c]
    first, second = Xl[:, 0], Xl[:, 1]
    ax.scatter(first, second, marker=f'${c}$', label=f'{c}')

ax.legend()
plt.show()

Применим теперь TSNE:

In [None]:
from sklearn.manifold import TSNE


# Two-dimensional t-SNE embedding of the digits. No random_state is set, so
# the embedding may differ between runs.
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)

In [None]:
# Scatter of the t-SNE embedding; each class is drawn with its own digit as
# the marker glyph.
_, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 12))

for c in range(10):
    Xl = X_tsne[y == c]
    first, second = Xl[:, 0], Xl[:, 1]
    ax.scatter(first, second, marker=f'${c}$', label=f'{c}')

ax.legend()
plt.show()

Посмотрим, как параметры t-SNE влияют на результат:

In [None]:
output = wg.Output()


def viz(perplexity, early_exaggeration, angle, metric='euclidean'):
    """Recompute the t-SNE embedding of the digits and redraw the scatter plot.

    Args:
        perplexity: Passed to ``TSNE(perplexity=...)``.
        early_exaggeration: Passed to ``TSNE(early_exaggeration=...)``.
        angle: Passed to ``TSNE(angle=...)``.
        metric: Distance metric name passed to ``TSNE(metric=...)``; defaults
            to ``'euclidean'``, the value that used to be hard-coded.

    The figure is rendered into ``output``, replacing its previous content.
    """
    tsne = TSNE(perplexity=perplexity,
                early_exaggeration=early_exaggeration,
                metric=metric,
                angle=angle,
                n_jobs=-1)
    X_tsne = tsne.fit_transform(X)

    with output:
        output.clear_output(wait=True)

        _, ax = plt.subplots(1, 1, figsize=(8, 8))
        for c in range(10):
            Xl = X_tsne[y == c]
            ax.scatter(Xl[:, 0], Xl[:, 1], marker=f'${c}$', label=f'{c}')

        ax.legend()
        plt.show()


# The metric selector previously had no options and viz had no `metric`
# parameter, so it could not influence the embedding; it is now wired through.
wg.interact(viz,
            perplexity=wg.FloatSlider(min=5., max=50., value=30., step=1),
            early_exaggeration=wg.FloatSlider(min=1., max=50., value=12., step=1),
            metric=wg.Select(options=['euclidean', 'manhattan', 'chebyshev', 'cosine'],
                             value='euclidean'),
            angle=wg.FloatSlider(min=0.2, max=0.8, value=0.5, step=0.05))
display(output)