## EDA

In [8]:
import pandas as pd
import re
import numpy as np

from __future__ import annotations
from typing import List, Optional, Dict
from sklearn.decomposition import PCA

# Load dataset
df = pd.read_csv("../data/data_science_job_posts_2025_clean.csv")

# Show all column names
print("Columns:", df.columns.tolist())

# helps with mapping
print("\nValue counts:")
print(df["seniority_level"].value_counts(dropna=False))
print(df["status"].dropna().unique())
print(df["industry"].dropna().unique())
print(df["location"].head(10))
print(df["skills"].head(10))
print(df["salary"].head(10))
print(df["company_size"].head(10))

Columns: ['job_title', 'seniority_level', 'status', 'company', 'location', 'post_date', 'headquarter', 'industry', 'ownership', 'company_size', 'revenue', 'salary', 'skills', 'state', 'fips', 'fips_int', 'salary_mid', 'seniority_level_norm', 'skills_list', 'skills_clean']

Value counts:
seniority_level
senior      629
lead        116
midlevel    112
NaN          60
junior       24
Name: count, dtype: int64
['hybrid' 'on-site' 'remote']
['Retail' 'Manufacturing' 'Technology' 'Finance' 'Education' 'Healthcare'
 'Energy' 'Logistics']
0                               Grapevine, TX . Hybrid
1                              Fort Worth, TX . Hybrid
2    Austin, TX . Toronto, Ontario, Canada . Kirkla...
3    Chicago, IL . Scottsdale, AZ . Austin, TX . Hy...
4                                              On-site
5                                         New York, NY
6                                         Berkeley, CA
7                                       Menlo Park, CA
8                      

## Generate Embeddings

In [9]:
# File paths
INPUT_PATH = "../data/data_science_job_posts_2025_clean.csv"

# High-dimensional numeric embedding (all engineered features)
OUTPUT_EMBEDDINGS_FULL = "../data/data_science_job_posts_2025_embeddings_full.csv"

# 2D projection for visualization (id, x, y)
OUTPUT_EMBEDDINGS_2D = "../data/data_science_job_posts_2025_embeddings_2d.csv"

# merged cleaned data + x,y
OUTPUT_MERGED_WITH_2D = "../data/data_science_job_posts_2025_with_embeddings_2d.csv"

# lookup tables for encoding categories
SENIORITY_MAP: Dict[str, int] = {
    "junior": 1,
    "midlevel": 2,
    "senior": 3,
    "lead": 4,
}

STATUS_VALUES = ["hybrid", "on-site", "remote"]

INDUSTRY_VALUES = [
    "Retail",
    "Manufacturing",
    "Technology",
    "Finance",
    "Education",
    "Healthcare",
    "Energy",
    "Logistics",
]

# Helper functions

# Parse skills_clean into a list of lowercase skill tokens.
def parse_skills_clean(skills_str: str | float) -> List[str]:
    if pd.isna(skills_str):
        return []
    parts = str(skills_str).split("|")
    return [p.strip().lower() for p in parts if p.strip()]


# Build a vocabulary of top-N most frequent skills from the cleaned skills_clean column.
def build_skill_vocab(df: pd.DataFrame, column: str = "skills_clean", top_n: int = 50) -> List[str]:
    from collections import Counter

    counter = Counter()
    for s in df[column]:
        counter.update(parse_skills_clean(s))

    vocab = [skill for skill, _ in counter.most_common(top_n)]
    return vocab


# Parse '17 days ago', '1 day ago', '30+ days ago' -> numeric days.
def parse_post_age_days(s: str | float) -> Optional[float]:
    if pd.isna(s):
        return None
    text = str(s).lower()
    m = re.search(r"(\d+)", text)
    if not m:
        return None
    try:
        return float(m.group(1))
    except ValueError:
        return None


# Map normalized seniority labels to small integers.
def map_seniority_level(s: str | float) -> Optional[int]:
    if pd.isna(s):
        return None
    text = str(s).strip().lower()
    return SENIORITY_MAP.get(text)


# Min-max scale a numeric series to [0, 1].
def min_max_scale(series: pd.Series) -> pd.Series:
    s = series.astype(float)
    mn = s.min()
    mx = s.max()
    if pd.isna(mn) or pd.isna(mx) or mn == mx:
        return pd.Series(0.0, index=s.index)
    return (s - mn) / (mx - mn)


#  < 1,000  -> 'small'
#  < 10,000 -> 'medium'
#  >=10,000 -> 'large'
def bucket_company_size(size_str: str | float) -> str:
    if pd.isna(size_str):
        return "unknown"

    text = str(size_str)
    if "€" in text or "b" in text.lower() or "m" in text.lower():
        return "very_large"

    m = re.search(r"[\d,]+(?:\.\d+)?", text)
    if not m:
        return "unknown"

    try:
        n = float(m.group(0).replace(",", ""))
    except ValueError:
        return "unknown"

    if n < 1000:
        return "small"
    elif n < 10000:
        return "medium"
    else:
        return "large"


# -----------------------------
# Main embedding builder
# -----------------------------
# Take the cleaned jobs DataFrame and build a numeric embedding matrix.
# Each row corresponds to one job; columns are simple numeric features.
def build_embeddings(df_raw: pd.DataFrame) -> pd.DataFrame:
    df = df_raw.copy()

    # Add a simple integer ID if not present (for joining later)
    if "id" not in df.columns:
        df["id"] = np.arange(len(df))

    # 1) Skills: build multi-hot columns from skills_clean
    print("Building skill vocabulary from skills_clean...")
    skill_vocab = build_skill_vocab(df, column="skills_clean", top_n=50)
    print(f"Top skills: {skill_vocab}")
    df["_skills_list"] = df["skills_clean"].apply(parse_skills_clean)

    for sk in skill_vocab:
        col = f"skill_{sk.replace(' ', '_')}"
        df[col] = df["_skills_list"].apply(lambda lst, sk=sk: int(sk in lst))

    # 2) Salary: use the cleaned salary_mid column
    print("Using salary_mid from cleaned schema...")
    df["salary_mid"] = pd.to_numeric(df["salary_mid"], errors="coerce")
    df["salary_mid_scaled"] = min_max_scale(df["salary_mid"])

    # 3) Seniority: use seniority_level_norm -> numeric level
    print("Encoding seniority_level_norm...")
    df["seniority_level_num"] = df["seniority_level_norm"].apply(map_seniority_level)
    median_level = df["seniority_level_num"].median()
    df["seniority_level_num"] = df["seniority_level_num"].fillna(median_level)

    # 4) Status (hybrid / on-site / remote): simple one-hot encoding
    print("Encoding status (hybrid, on-site, remote)...")
    df["status_clean"] = df["status"].astype(str).str.strip().str.lower()
    for val in STATUS_VALUES:
        col = f"status_{val.replace('-', '_')}"
        df[col] = (df["status_clean"] == val).astype(int)

    # 5) Industry: one-hot only for a small fixed set of industries
    print("Encoding industry...")
    df["industry_clean"] = df["industry"].astype(str).str.strip()
    for val in INDUSTRY_VALUES:
        col = f"industry_{val.lower()}"
        df[col] = (df["industry_clean"] == val).astype(int)

    # 6) Ownership: one-hot for all observed values
    print("Encoding ownership...")
    df["ownership_clean"] = df["ownership"].astype(str).str.strip()
    ownership_dummies = pd.get_dummies(df["ownership_clean"], prefix="ownership", dtype=int)
    df = pd.concat([df, ownership_dummies], axis=1)

    # 7) State: use the cleaned 'state' column directly
    print("Encoding state from cleaned column...")
    df["state_clean"] = df["state"].astype(str).str.strip().str.upper()
    state_dummies = pd.get_dummies(df["state_clean"], prefix="state", dtype=int)
    df = pd.concat([df, state_dummies], axis=1)

    # 8) Company size: bucket into small / medium / large / very_large
    print("Bucketing company_size...")
    df["company_size_bucket"] = df["company_size"].apply(bucket_company_size)
    size_dummies = pd.get_dummies(df["company_size_bucket"], prefix="company_size", dtype=int)
    df = pd.concat([df, size_dummies], axis=1)

    # 9) Post date: convert '17 days ago' style strings -> recency score
    print("Parsing post_date as days-ago...")
    df["post_age_days"] = df["post_date"].apply(parse_post_age_days)
    df["post_age_days"] = df["post_age_days"].fillna(df["post_age_days"].max())
    df["post_age_scaled"] = min_max_scale(df["post_age_days"])

    # 10) FIPS: use the integer FIPS code as a simple location feature
    print("Scaling fips_int as a simple region feature...")
    df["fips_int"] = pd.to_numeric(df["fips_int"], errors="coerce")
    df["fips_int_scaled"] = min_max_scale(df["fips_int"])

    # 11) Job title flags: simple text-based job family indicators
    print("Encoding job_title families...")

    def job_flags(title: str | float) -> Dict[str, int]:
        t = "" if pd.isna(title) else str(title).lower()
        return {
            "job_is_data_scientist": int("data scientist" in t),
            "job_is_data_engineer": int("data engineer" in t),
            "job_is_ml_engineer": int("machine learning engineer" in t or "ml engineer" in t),
            "job_is_analyst": int("analyst" in t),
        }

    job_flags_df = pd.DataFrame(list(df["job_title"].apply(job_flags)))
    df = pd.concat([df, job_flags_df], axis=1)

    # 12) Select the final embedding columns in a fixed order
    embed_cols: List[str] = ["id"]

    # skills
    embed_cols += [c for c in df.columns if c.startswith("skill_")]

    # salary
    embed_cols.append("salary_mid_scaled")

    # seniority
    embed_cols.append("seniority_level_num")

    # status, industry, ownership, state, company_size
    embed_cols += [c for c in df.columns if c.startswith("status_")]
    embed_cols += [c for c in df.columns if c.startswith("industry_")]
    embed_cols += [c for c in df.columns if c.startswith("ownership_")]
    embed_cols += [c for c in df.columns if c.startswith("state_")]
    embed_cols += [c for c in df.columns if c.startswith("company_size_")]

    # post age + FIPS
    embed_cols.append("post_age_scaled")
    embed_cols.append("fips_int_scaled")

    # job flags
    embed_cols += [
        "job_is_data_scientist",
        "job_is_data_engineer",
        "job_is_ml_engineer",
        "job_is_analyst",
    ]

    # Build numeric embedding DataFrame and fill any remaining NaNs with 0.0
    embeddings = df[embed_cols].copy()
    embeddings = embeddings.apply(pd.to_numeric, errors="coerce").fillna(0.0)

    print("Final embedding shape:", embeddings.shape)
    return embeddings


# 2D Projection with PCA
def project_embeddings_to_2d(embeddings: pd.DataFrame) -> pd.DataFrame:
    """
    Take the high-dimensional embeddings DataFrame (including 'id'),
    run PCA to project into 2D, and return a DataFrame with columns: id, x, y.
    """
    ids = embeddings["id"].values
    features = embeddings.drop(columns=["id"]).values

    print("Running PCA to project embeddings to 2D...")
    pca = PCA(n_components=2, random_state=42)
    coords = pca.fit_transform(features)

    emb2d = pd.DataFrame(
        {
            "id": ids,
            "x": coords[:, 0],
            "y": coords[:, 1],
        }
    )
    print("2D embeddings shape:", emb2d.shape)
    return emb2d


if __name__ == "__main__":
    print(f"Reading data from: {INPUT_PATH}")
    raw = pd.read_csv(INPUT_PATH)

    if "id" not in raw.columns:
        raw["id"] = np.arange(len(raw))

    # High-dimensional embeddings
    embeddings_full = build_embeddings(raw)

    print(f"Writing full embeddings to: {OUTPUT_EMBEDDINGS_FULL}")
    embeddings_full.to_csv(OUTPUT_EMBEDDINGS_FULL, index=False)

    # 2D projection
    embeddings_2d = project_embeddings_to_2d(embeddings_full)

    print(f"Writing 2D embeddings to: {OUTPUT_EMBEDDINGS_2D}")
    embeddings_2d.to_csv(OUTPUT_EMBEDDINGS_2D, index=False)

    merged_with_2d = raw.merge(embeddings_2d, on="id", how="left")
    print(f"Writing merged cleaned data + 2D embeddings to: {OUTPUT_MERGED_WITH_2D}")
    merged_with_2d.to_csv(OUTPUT_MERGED_WITH_2D, index=False)

Reading data from: ../data/data_science_job_posts_2025_clean.csv
Building skill vocabulary from skills_clean...
Top skills: ['python', 'machine learning', 'sql', 'r', 'aws', 'deep learning', 'tensorflow', 'spark', 'azure', 'pytorch', 'tableau', 'gcp', 'scikit-learn', 'scala', 'database', 'pandas', 'java', 'hadoop', 'git', 'numpy', 'docker', 'amazon', 'kubernetes', 'matplotlib', 'keras', 'powerbi', 'airflow', 'linux', 'neural network', 'scipy', 'bash', 'sklearn', 'opencv']
Using salary_mid from cleaned schema...
Encoding seniority_level_norm...
Encoding status (hybrid, on-site, remote)...
Encoding industry...
Encoding ownership...
Encoding state from cleaned column...
Bucketing company_size...
Parsing post_date as days-ago...
Scaling fips_int as a simple region feature...
Encoding job_title families...
Final embedding shape: (941, 106)
Writing full embeddings to: ../data/data_science_job_posts_2025_embeddings_full.csv
Running PCA to project embeddings to 2D...
2D embeddings shape: (941,