## Dimensionality reduction - PCA

In [14]:
from __future__ import annotations
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Relative path of the embeddings CSV passed to load_embeddings, which expects
# an "id" column alongside the feature columns.
INPUT_EMBEDDINGS = "../data/data_science_job_posts_2025_embeddings.csv"
# Destinations for the 2-D projection; the driver below writes the same
# DataFrame to both paths.
OUTPUT_PCA = "../data/data_science_job_posts_2025_embeddings_2d_pca.csv"
OUTPUT_MAIN = "../data/data_science_job_posts_2025_embeddings_2d.csv"

def load_embeddings(path: str) -> tuple[pd.Series, np.ndarray]:
    """
    Read the embeddings CSV at `path` and split identifiers from features.

    Returns a pair: the "id" column as a Series, and a 2-D array holding
    every remaining column, row-aligned with the ids.
    """
    table = pd.read_csv(path)
    # Look up "id" before dropping it so a missing column fails at the lookup.
    id_column = table["id"]
    return id_column, table.drop(columns=["id"]).values


def standardize_features(X: np.ndarray) -> np.ndarray:
    """
    Rescale every column of X to zero mean and unit variance.

    Putting all features on a common scale keeps PCA from being driven by
    whichever columns happen to have the largest raw magnitudes.
    """
    return StandardScaler().fit_transform(X)


def compute_pca_2d(X: np.ndarray, random_state: int = 42) -> np.ndarray:
    """
    Project X onto its first two principal components.

    `random_state` is forwarded to PCA. The explained-variance ratio of the
    two components is printed as a side effect, and the projected
    coordinates are returned.
    """
    reducer = PCA(n_components=2, random_state=random_state)
    projection = reducer.fit_transform(X)
    print("PCA explained variance ratio:", reducer.explained_variance_ratio_)
    return projection


# Driver: load the embeddings, standardize them, project to 2-D with PCA and
# write the resulting (id, x, y) table to both output CSVs.
if __name__ == "__main__":
    print(f"Loading embeddings from: {INPUT_EMBEDDINGS}")
    ids, X = load_embeddings(INPUT_EMBEDDINGS)
    print("Embeddings shape:", X.shape)

    print("Standardizing features...")
    X_scaled = standardize_features(X)

    print("Computing PCA 2D projection...")
    X_pca = compute_pca_2d(X_scaled)

    # One row per input row: its id plus the first (x) and second (y)
    # PCA component.
    df_pca = pd.DataFrame(
        {
            "id": ids,
            "x": X_pca[:, 0],
            "y": X_pca[:, 1],
        }
    )

    # The same frame is written twice, once under each output name.
    df_pca.to_csv(OUTPUT_PCA, index=False)
    df_pca.to_csv(OUTPUT_MAIN, index=False)

    print(f"PCA projection saved to: {OUTPUT_PCA}")
    print(f"Main 2D embedding saved to: {OUTPUT_MAIN}")

Loading embeddings from: ../data/data_science_job_posts_2025_embeddings.csv
Embeddings shape: (941, 105)
Standardizing features...
Computing PCA 2D projection...
PCA explained variance ratio: [0.05913694 0.04308239]
PCA projection saved to: ../data/data_science_job_posts_2025_embeddings_2d_pca.csv
Main 2D embedding saved to: ../data/data_science_job_posts_2025_embeddings_2d.csv


## Validation

In [15]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Independent re-computation of the explained-variance ratios, read straight
# from the embeddings CSV.
embeddings = pd.read_csv("../data/data_science_job_posts_2025_embeddings.csv")

# Keep only the feature columns; errors="ignore" tolerates a file without "id".
X = embeddings.drop(columns=["id"], errors="ignore")
X_scaled = StandardScaler().fit_transform(X)

# Fit PCA with a fixed seed. For a matrix of this size PCA may select a
# randomized SVD solver, so an unseeded fit can differ in the last digits
# between runs. 42 matches the default seed of compute_pca_2d.
pca = PCA(n_components=2, random_state=42)
pca.fit(X_scaled)

print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total explained:", sum(pca.explained_variance_ratio_))


Explained variance ratio: [0.05913694 0.04308242]
Total explained: 0.10221936081321611
