# Dimensionality Reduction

In this exercise, we will learn several dimensionality reduction methods using the Penguin dataset.

## Preprocessing


In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
from scipy import sparse

# Load the penguins dataset
data_table = sns.load_dataset("penguins")

# Impute the missing values: every column, numeric or categorical, is filled
# with its most frequent value. `Series.mode()` ignores NaNs by default and
# returns the modes sorted, so `[0]` is the smallest mode when there is a tie.
for column in data_table.columns:
    most_frequent = data_table[column].mode()[0]
    data_table[column] = data_table[column].fillna(most_frequent)

# One-hot encode the categorical predictors. With drop_first=True a variable
# with k levels is represented by k-1 indicator columns.
categorical_cols = ["island", "sex"]
prep_data_table = pd.get_dummies(data_table, columns=categorical_cols, drop_first=True)

# Separate the predictors from the target ("species").
feature_table = prep_data_table.drop(columns=["species"])
feature_cols = feature_table.columns
# A single cast is enough: it turns the indicator columns into 0.0 / 1.0.
X = feature_table.values.astype(float)
y = prep_data_table["species"].values

# ylabel: the sorted unique species names.
# yids: for each sample, the index of its species within `ylabel`.
ylabel, yids = np.unique(y, return_inverse=True)

# Preview the imputed table. A bare expression is only rendered when it is the
# last statement of the cell, so the preview lives here.
data_table.head()

# Multi Dimensional Scaling (MDS)


Let's apply MDS using the scikit-learn package: https://scikit-learn.org/stable/modules/generated/sklearn.manifold.MDS.html


In [None]:
from sklearn.manifold import MDS
import matplotlib.pyplot as plt

# TODO: See the documentation for the MDS class and use it to project the data into 2D.
# Store the projection in `Xproj`: the plotting code below reads its first two
# columns (Xproj[:, 0] and Xproj[:, 1]), one row per sample.
# Tip: MDS accepts `random_state`; fix it to get the same layout on every run.


# Visualize the results
# Style and font scale are set in a single call: `sns.set()` resets the style,
# so a `set_style()` issued before it would have no effect.
sns.set(style="ticks", font_scale=1.2)
fig, ax = plt.subplots(figsize=(7, 5))

# Draw explicitly on the axes created above; points are coloured by the labels in `y`.
sns.scatterplot(x=Xproj[:, 0], y=Xproj[:, 1], hue=y, ax=ax)

# Label your axis.
ax.set_xlabel("")
ax.set_ylabel("");  # trailing ';' hides the Text(...) repr in the cell output

### Why was MDS not useful for this dataset? Can you think of any way to improve the results 🤔?


In [None]:
# TODO: Find a way to show the structure of this dataset.

# Isomap

https://scikit-learn.org/stable/modules/generated/sklearn.manifold.Isomap.html


In [None]:
from sklearn.manifold import Isomap

# TODO: See the documentation for the Isomap class and use it to project the data into 2D.
# Store the projection in `Xproj`: the plotting code below reads its first two
# columns (Xproj[:, 0] and Xproj[:, 1]), one row per sample.


# Visualize the results
# Style and font scale are set in a single call: `sns.set()` resets the style,
# so a `set_style()` issued before it would have no effect.
sns.set(style="ticks", font_scale=1.2)
fig, ax = plt.subplots(figsize=(7, 5))

# Draw explicitly on the axes created above; points are coloured by the labels in `y`.
sns.scatterplot(x=Xproj[:, 0], y=Xproj[:, 1], hue=y, ax=ax)

# Label your axis.
ax.set_xlabel("")
ax.set_ylabel("");  # trailing ';' hides the Text(...) repr in the cell output

### Play with the number of neighbors and think about its effect. Can you also explain why the effect is observed 🤔?


# t-SNE


https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html


In [None]:
from sklearn.manifold import TSNE

# TODO: See the documentation for the TSNE class and use it to project the data into 2D.
# Store the projection in `Xproj`: the plotting code below reads its first two
# columns (Xproj[:, 0] and Xproj[:, 1]), one row per sample.
# Tip: TSNE accepts `random_state`; fix it to get the same layout on every run.


# Visualize the results
# Style and font scale are set in a single call: `sns.set()` resets the style,
# so a `set_style()` issued before it would have no effect.
sns.set(style="ticks", font_scale=1.2)
fig, ax = plt.subplots(figsize=(7, 5))

# Draw explicitly on the axes created above; points are coloured by the labels in `y`.
sns.scatterplot(x=Xproj[:, 0], y=Xproj[:, 1], hue=y, ax=ax)

# Label your axis.
ax.set_xlabel("")
ax.set_ylabel("");  # trailing ';' hides the Text(...) repr in the cell output

# UMAP (Optional)
* https://umap-learn.readthedocs.io/en/latest/api.html


In [None]:
import umap


# TODO: See the documentation for the umap.UMAP class and use it to project the data into 2D.
# Store the projection in `Xproj`: the plotting code below reads its first two
# columns (Xproj[:, 0] and Xproj[:, 1]), one row per sample.
# Tip: UMAP accepts `random_state`; fix it to get the same layout on every run.


# Visualize the results
# Style and font scale are set in a single call: `sns.set()` resets the style,
# so a `set_style()` issued before it would have no effect.
sns.set(style="ticks", font_scale=1.2)
fig, ax = plt.subplots(figsize=(7, 5))

# Draw explicitly on the axes created above; points are coloured by the labels in `y`.
sns.scatterplot(x=Xproj[:, 0], y=Xproj[:, 1], hue=y, ax=ax)

# Label your axis.
ax.set_xlabel("")
ax.set_ylabel("");  # trailing ';' hides the Text(...) repr in the cell output