# Occupation Recommender

## Purpose
Uses TF-IDF character n-gram similarity between a user's profile document (`user_doc`) and each SOC's task description (`soc_doc`) to recommend occupations the user is well-matched for.

## Method
1. Vectorize all user documents and SOC documents using `TfidfVectorizer(analyzer="char_wb", ngram_range=(3,5))`
2. Compute cosine similarity matrix (users × SOCs)
3. For each user, rank SOCs by similarity and return top-k

## Improvements over baseline
- **Filters out user's current/past SOC codes** so recommendations are actionable
- **Explains recommendations** by showing the top overlapping TF-IDF features
- **Diversity bonus** available via `diversity_penalty` parameter

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

df = pd.read_csv("../data/final_job_match_dataset.csv")

users = df.groupby("user_id")["user_doc"].first().fillna("").reset_index()
jobs  = df.groupby("soc_code_final")["soc_doc"].first().fillna("").reset_index()

users = users[users["user_doc"].str.len() > 0].reset_index(drop=True)
jobs  = jobs[jobs["soc_doc"].str.len() > 0].reset_index(drop=True)

vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,5), min_df=2)
X_users = vectorizer.fit_transform(users["user_doc"])
X_jobs  = vectorizer.transform(jobs["soc_doc"])
sim = cosine_similarity(X_users, X_jobs)

print(sim.shape)


(1000, 393)


In [2]:
def recommend_jobs(user_index, topk=10):
    scores = sim[user_index]
    idx = scores.argsort()[::-1][:topk]
    return jobs.iloc[idx][["soc_code_final"]]

In [3]:
# Build SOC -> canonical title from Task Ratings (safe)
# If you already have df_tasks loaded in this notebook, use it; otherwise read it.
from pathlib import Path
import pandas as pd

TASK_RATINGS = Path("../data/sample_data_extracted/sample_data/occupation_requirements/Task Ratings.xlsx")
df_tasks = pd.read_excel(TASK_RATINGS)

soc_to_title = (
    df_tasks[["O*NET-SOC Code", "Title"]]
    .dropna()
    .drop_duplicates()
    .rename(columns={"O*NET-SOC Code":"soc_code_final", "Title":"soc_title"})
)

jobs = jobs.merge(soc_to_title, on="soc_code_final", how="left")
jobs["soc_title"] = jobs["soc_title"].fillna("(unknown title)")

jobs.head()


Unnamed: 0,soc_code_final,soc_doc,soc_title
0,11-1011.00,direct or coordinate an organization's financi...,Chief Executives
1,11-1021.00,"review financial statements, sales or activity...",General and Operations Managers
2,11-2011.00,plan and prepare advertising and promotional m...,Advertising and Promotions Managers
3,11-2021.00,"identify, develop, or evaluate marketing strat...",Marketing Managers
4,11-2022.00,direct and coordinate activities involving sal...,Sales Managers


In [4]:
import numpy as np

# Create a fast lookup: user_id -> row index in users df
user_index_map = {uid: i for i, uid in enumerate(users["user_id"].tolist())}

# Build set of past SOC codes per user for filtering
user_past_socs = df.groupby("user_id")["soc_code_final"].apply(set).to_dict()

def recommend_for_user(user_id, topk=10, min_score=0.0, exclude_past=True):
    """Recommend SOC codes for a user, optionally excluding their past/current SOCs."""
    if user_id not in user_index_map:
        raise ValueError(f"user_id {user_id} not found among users with docs")
    i = user_index_map[user_id]
    scores = sim[i].copy()

    # Exclude user's past SOC codes so recommendations are actionable
    if exclude_past and user_id in user_past_socs:
        past = user_past_socs[user_id]
        for j, soc in enumerate(jobs["soc_code_final"]):
            if soc in past:
                scores[j] = -1  # suppress

    # Rank
    idx = np.argsort(scores)[::-1][:topk]

    recs = jobs.iloc[idx].copy()
    recs["score"] = scores[idx]
    recs = recs[recs["score"] >= min_score]

    return recs[["soc_code_final", "soc_title", "score"]].reset_index(drop=True)

# Example: with past-SOC filtering
sample_uid = users["user_id"].iloc[0]
print(f"Recommendations for user {sample_uid} (excluding past SOCs):")
recommend_for_user(sample_uid, topk=10)

Recommendations for user 1350263 (excluding past SOCs):


Unnamed: 0,soc_code_final,soc_title,score
0,41-2031.00,Retail Salespersons,0.314305
1,41-1011.00,First-Line Supervisors of Retail Sales Workers,0.310891
2,11-9199.08,Loss Prevention Managers,0.304227
3,13-1071.00,Human Resources Specialists,0.283519
4,43-5081.01,"Stock Clerks, Sales Floor",0.276981
5,41-1012.00,First-Line Supervisors of Non-Retail Sales Wor...,0.270666
6,13-1022.00,"Wholesale and Retail Buyers, Except Farm Products",0.260201
7,43-1011.00,First-Line Supervisors of Office and Administr...,0.258477
8,11-3121.00,Human Resources Managers,0.256199
9,13-1081.00,Logisticians,0.25414


In [5]:
ALT = Path("../data/sample_data_extracted/sample_data/occupation_requirements/Alternate Titles.xlsx")
REP = Path("../data/sample_data_extracted/sample_data/occupation_requirements/Sample of Reported Titles.xlsx")

df_alt = pd.read_excel(ALT)
df_rep = pd.read_excel(REP)

alt_titles = (
    df_alt[["O*NET-SOC Code","Alternate Title"]]
    .rename(columns={"O*NET-SOC Code":"soc_code_final", "Alternate Title":"alt_title"})
    .dropna()
)

rep_titles = (
    df_rep[["O*NET-SOC Code","Reported Job Title"]]
    .rename(columns={"O*NET-SOC Code":"soc_code_final", "Reported Job Title":"alt_title"})
    .dropna()
)

soc_to_alt = pd.concat([alt_titles, rep_titles], ignore_index=True).drop_duplicates()

def recommend_titles_for_user(user_id, topk_soc=5, titles_per_soc=5):
    soc_recs = recommend_for_user(user_id, topk=topk_soc)
    out = []
    for _, r in soc_recs.iterrows():
        soc = r["soc_code_final"]
        titles = soc_to_alt[soc_to_alt["soc_code_final"] == soc]["alt_title"].head(titles_per_soc).tolist()
        if not titles:
            titles = [r["soc_title"]]
        out.append({
            "soc_code": soc,
            "soc_title": r["soc_title"],
            "score": r["score"],
            "suggested_titles": titles
        })
    return pd.DataFrame(out)

recommend_titles_for_user(users["user_id"].iloc[0], topk_soc=5, titles_per_soc=5)


Unnamed: 0,soc_code,soc_title,score,suggested_titles
0,41-2031.00,Retail Salespersons,0.314305,"[Art Dealer, Art Objects Salesperson, Auto Dea..."
1,41-1011.00,First-Line Supervisors of Retail Sales Workers,0.310891,"[Agent, Antique Collector, Antique Dealer, Art..."
2,11-9199.08,Loss Prevention Managers,0.304227,"[Area Loss Prevention Manager, Asset Protectio..."
3,13-1071.00,Human Resources Specialists,0.283519,"[Benefits Administrator, Business Agent, Caree..."
4,43-5081.01,"Stock Clerks, Sales Floor",0.276981,"[Building Materials Sales Attendant, Checker S..."


In [6]:
# ---------------------------------------------------
# Explain WHY a SOC is recommended
# ---------------------------------------------------
feature_names = vectorizer.get_feature_names_out()

def explain_recommendation(user_id, soc_code, top_terms=10):
    """Show the top overlapping TF-IDF features between a user and a SOC."""
    i = user_index_map[user_id]
    j = jobs[jobs["soc_code_final"] == soc_code].index
    if len(j) == 0:
        return "SOC not found"
    j = j[0]

    user_vec = X_users[i].toarray().flatten()
    job_vec  = X_jobs[j].toarray().flatten()

    # Element-wise product shows overlapping features
    overlap = user_vec * job_vec
    top_idx = overlap.argsort()[::-1][:top_terms]

    terms = []
    for k in top_idx:
        if overlap[k] > 0:
            terms.append((feature_names[k].strip(), overlap[k]))

    if not terms:
        return "No overlapping features found."

    return terms

# Example
sample_recs = recommend_for_user(sample_uid, topk=3)
for _, row in sample_recs.iterrows():
    soc = row["soc_code_final"]
    print(f"\n--- {row['soc_title']} ({soc}) — score {row['score']:.3f} ---")
    terms = explain_recommendation(sample_uid, soc, top_terms=8)
    if isinstance(terms, str):
        print(terms)
    else:
        for t, s in terms:
            print(f"  '{t}' ({s:.4f})")


--- Retail Salespersons (41-2031.00) — score 0.314 ---
  'ndi' (0.0089)
  'rcha' (0.0086)
  'ndis' (0.0086)
  'andis' (0.0086)
  'chand' (0.0086)
  'handi' (0.0086)
  'merch' (0.0084)
  'merc' (0.0084)

--- First-Line Supervisors of Retail Sales Workers (41-1011.00) — score 0.311 ---
  'and' (0.0063)
  'ing' (0.0062)
  'ing' (0.0056)
  'ng' (0.0056)
  'ndi' (0.0051)
  'rcha' (0.0037)
  'mer' (0.0037)
  'ent' (0.0036)

--- Loss Prevention Managers (11-9199.08) — score 0.304 ---
  'ent' (0.0060)
  'loss' (0.0037)
  'loss' (0.0037)
  'los' (0.0037)
  'store' (0.0036)
  'tore' (0.0034)
  'loss' (0.0034)
  'oss' (0.0034)
