# NDL Lab 4: PCA & t-SNE

## Download Dataset

In [None]:
import kagglehub

# Ask kagglehub for the latest version of the Palmer Penguins dataset and keep
# the path it returns; the loading cell reads the CSV from that location.
dataset_handle = "satyajeetrai/palmer-penguins-dataset-for-eda"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from mpl_toolkits.mplot3d import Axes3D

# Seed passed as `random_state` to the PCA and TSNE estimators in this notebook
# so that every stochastic step is seeded with the same value.
RANDOM_STATE = 42

## Load DataFrame

In [None]:
# Read the penguins table from the downloaded dataset folder and discard every
# row that has a missing value in any column.
csv_path = path + "/penguins.csv"
df = pd.read_csv(csv_path).dropna()

# Preview the first rows of the cleaned frame.
df.head()

## Subset Dataset to Features

Unsupervised learning: Keep only the 4 body measurements as features

Remove species/sex labels - the goal is to see if we can discover these natural groupings from the data alone, without being told the categories

In [None]:
# The four numeric body measurements used as inputs; label columns such as
# species and sex are deliberately left out of the feature matrix.
measurement_columns = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]
features_df = df[measurement_columns]

## Exploratory Data Analysis (EDA)
Pairplot visualization showing distributions and pairwise relationships between features.

Main diagonal shows univariate distributions (histograms), off-diagonal shows bivariate scatter plots

In [None]:
# Grid of pairwise scatter plots for the four measurements, with each
# feature's own distribution on the diagonal.
sns.pairplot(features_df)
# Render explicitly, like the other plotting cells, so the cell does not end
# on a bare expression and echo the PairGrid object's repr above the figure.
plt.show()

## Principal Component Analysis (PCA)

Cumulative explained variance: determine how many principal components are needed to capture the most information

The "elbow" point indicates where additional components provide diminishing value

In [None]:
# Standardise the features before PCA so that measurements recorded on very
# different scales (millimetres vs. grams) are comparable.
scaler = StandardScaler()
Xs = scaler.fit_transform(features_df)

# Fit PCA without limiting the number of components so the full variance
# breakdown can be inspected.
pca = PCA(random_state=RANDOM_STATE)
X_pca = pca.fit_transform(Xs)

explained = pca.explained_variance_ratio_
cum_explained = np.cumsum(explained)
print('Explained variance ratio (first 4):', np.round(explained[:4], 4))
print('Cumulative explained (first 4):', np.round(cum_explained[:4], 4))

# Cumulative-variance curve with a dashed reference line at 95%.
component_counts = range(1, len(cum_explained) + 1)
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(component_counts, cum_explained, marker='o')
ax.axhline(0.95, color='r', linestyle='--', label='95%')
ax.set_xlabel('Number of components')
ax.set_ylabel('Cumulative explained variance')
ax.set_title('PCA cumulative explained variance')
ax.set_xticks(component_counts)
ax.legend()
fig.tight_layout()
plt.show()

### Principal Components = 2

In [None]:
# Project the standardised features onto the first two principal components.
pca_2d = PCA(n_components=2, random_state=RANDOM_STATE)
X_pca_2d = pca_2d.fit_transform(Xs)
explained_2d = pca_2d.explained_variance_ratio_

# Unlabelled scatter of the 2-D projection; each axis label reports the share
# of variance carried by that component.
pc1_scores, pc2_scores = X_pca_2d[:, 0], X_pca_2d[:, 1]
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(pc1_scores, pc2_scores, alpha=0.7)
ax.set_xlabel(f'PC1 ({explained_2d[0]:.2%} variance)')
ax.set_ylabel(f'PC2 ({explained_2d[1]:.2%} variance)')
ax.set_title(f'Data projected onto first 2 Principal Components\nTotal variance explained: {sum(explained_2d):.2%}')
ax.grid(True, alpha=0.3)
plt.show()

### Evaluation: 2 Component PCA Plot with Labels

In [None]:
# Pull the held-out labels for exactly the rows that went into the feature matrix.
species = df.loc[features_df.index, 'species']
sex = df.loc[features_df.index, 'sex']

# Combine the 2-D PCA scores with the labels; `.values` attaches the labels by
# position because the PCA output carries no pandas index.
plot_df_pca = pd.DataFrame(X_pca_2d[:, :2], columns=['PC1', 'PC2']).assign(
    species=species.values,
    sex=sex.values,
)

# Colour encodes species, marker shape encodes sex.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=plot_df_pca,
    x='PC1',
    y='PC2',
    hue='species',
    style='sex',
    palette='Set1',
    s=80,
    alpha=0.7,
    ax=ax,
)
ax.set_xlabel(f'PC1 ({explained_2d[0]:.2%} variance)')
ax.set_ylabel(f'PC2 ({explained_2d[1]:.2%} variance)')
ax.set_title(f'PCA Visualization - Species (color) and Sex (shape)\nTotal variance explained: {sum(explained_2d):.2%}',
             fontsize=14)
# Anchor the legend just outside the right edge of the axes.
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

### Principal Components = 3

In [None]:
# Project the standardised features onto the first three principal components.
pca_3d = PCA(n_components=3, random_state=RANDOM_STATE)
X_pca_3d = pca_3d.fit_transform(Xs)
explained_3d = pca_3d.explained_variance_ratio_

# Unlabelled 3-D scatter of the projection.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

pc1_scores, pc2_scores, pc3_scores = X_pca_3d[:, 0], X_pca_3d[:, 1], X_pca_3d[:, 2]
scatter = ax.scatter(pc1_scores, pc2_scores, pc3_scores, alpha=0.6, s=30)

# Each axis label reports the share of variance carried by that component.
ax.set_xlabel(f'PC1 ({explained_3d[0]:.2%} variance)')
ax.set_ylabel(f'PC2 ({explained_3d[1]:.2%} variance)')
ax.set_zlabel(f'PC3 ({explained_3d[2]:.2%} variance)')
ax.set_title(f'Data projected onto first 3 Principal Components\nTotal variance explained: {sum(explained_3d):.2%}')

plt.show()

### Evaluation: 3 Component PCA Plot with Labels

In [None]:
# Pull the held-out labels for exactly the rows that went into the feature matrix.
species = df.loc[features_df.index, 'species']
sex = df.loc[features_df.index, 'sex']

# Combine the 3-D PCA scores with the labels (attached by position via `.values`).
plot_df_pca = pd.DataFrame(X_pca_3d[:, :3], columns=['PC1', 'PC2', 'PC3']).assign(
    species=species.values,
    sex=sex.values,
)

fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection='3d')

# One Set1 colour per species and one marker per sex value.
species_list = plot_df_pca['species'].unique()
sex_list = plot_df_pca['sex'].unique()
colors = plt.cm.Set1(range(len(species_list)))
markers = ['o', '^', 's']

# Draw one scatter call per (species, sex) pair so each pair gets its own
# legend entry.
for sp, sp_color in zip(species_list, colors):
    is_species = plot_df_pca['species'] == sp
    for j, sx in enumerate(sex_list):
        group = plot_df_pca[is_species & (plot_df_pca['sex'] == sx)]
        ax.scatter(
            group['PC1'],
            group['PC2'],
            group['PC3'],
            c=[sp_color],
            marker=markers[j],
            label=f'{sp} - {sx}',
            alpha=0.6,
            s=50,
        )

ax.set_xlabel(f'PC1 ({explained_3d[0]:.2%} variance)')
ax.set_ylabel(f'PC2 ({explained_3d[1]:.2%} variance)')
ax.set_zlabel(f'PC3 ({explained_3d[2]:.2%} variance)')
ax.set_title(f'3D PCA - Species (color) and Sex (shape)\nTotal variance explained: {sum(explained_3d):.2%}')
# Anchor the legend to the right of the 3-D axes.
ax.legend(bbox_to_anchor=(1.15, 1), loc='upper left')
plt.tight_layout()
plt.show()

## t-SNE

t-SNE (t-Distributed Stochastic Neighbor Embedding)

Non-linear alternative to PCA - better at preserving local structure and revealing clusters

### Perplexity Parameter Optimization
Perplexity parameter: controls the number of nearby points each point considers as "neighbors"

Low perplexity: focus on local structure -> small, tight clusters

High perplexity: focus more globally -> large, more spread out clusters

A good starting point is $\sqrt{n}$, where $n$ is the number of samples in our dataset

In [None]:
# Embed the data into 2-D with t-SNE.
#
# The iteration count is intentionally left at scikit-learn's default (1000)
# rather than passed explicitly: the `n_iter` keyword was renamed to
# `max_iter` in scikit-learn 1.5 and removed in 1.7, so spelling out either
# name raises a TypeError on some installed version. Relying on the default
# keeps the same 1000 iterations on old and new releases alike.
tsne = TSNE(n_components=2,
            random_state=RANDOM_STATE,
            perplexity=18)

# Note: t-SNE is fit on the 3-component PCA scores (X_pca_3d), not on the
# full standardised feature matrix.
X_tsne = tsne.fit_transform(X_pca_3d)

# Unlabelled scatter of the embedding.
plt.figure(figsize=(10, 8))
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], alpha=0.6)
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
plt.title('t-SNE Visualization')
plt.grid(True, alpha=0.3)
plt.show()

### Evaluation: t-SNE plot with labels

In [None]:
# Pull the held-out labels for exactly the rows that went into the feature matrix.
species = df.loc[features_df.index, 'species']
sex = df.loc[features_df.index, 'sex']

# Combine the t-SNE coordinates with the labels (attached by position via `.values`).
plot_df = pd.DataFrame(X_tsne[:, :2], columns=['t-SNE 1', 't-SNE 2']).assign(
    species=species.values,
    sex=sex.values,
)

# Colour encodes species, marker shape encodes sex.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=plot_df,
    x='t-SNE 1',
    y='t-SNE 2',
    hue='species',
    style='sex',
    palette='Set1',
    s=80,
    alpha=0.7,
    ax=ax,
)

ax.set_title('t-SNE Visualization - Species (color) and Sex (shape)', fontsize=14)
# Anchor the legend just outside the right edge of the axes.
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()