In [4]:
from datetime import datetime
import pandas as pd
import requests

# NOTE(review): machine-specific absolute path; consider making it configurable.
COMPANIES_PATH = '/Users/simoncha/Desktop/projects/jobs/valid_companies.txt'

# One company name per line. Blank lines are skipped so they do not turn into
# empty company names (and therefore empty board slugs) in the fetch step.
with open(COMPANIES_PATH, "r", encoding="utf-8") as f:
    companies = [name for name in (line.strip() for line in f) if name]

In [10]:
# Collect one flat record per posting from each company's Greenhouse job board.
all_jobs = []

# NOTE(review): the [2:3] slice limits the run to a single company, presumably
# a debugging leftover; widen it to fetch every board.
for company in companies[2:3]:
    # Derive the board slug by lowercasing and stripping spaces, hyphens and dots.
    company_slug = company.lower().replace(" ", "").replace("-", "").replace(".", "")
    url = f"https://boards-api.greenhouse.io/v1/boards/{company_slug}/jobs"

    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            print(f"‚ùå Failed for: {company} ({response.status_code})")
            continue

        data = response.json()
        # `or []` also covers an explicit `"jobs": null` in the payload.
        jobs = data.get("jobs") or []
        for job in jobs:
            # `location` may be missing or null; previously a null value raised
            # AttributeError and silently dropped the rest of this company's jobs.
            location = job.get("location") or {}
            all_jobs.append({
                "company": company,
                "title": job.get("title"),
                "location": location.get("name"),
                "url": job.get("absolute_url"),
                "posted": job.get("first_published")
            })

        print(f"‚úÖ Retrieved {len(jobs)} jobs for {company}")

    except Exception as e:
        # Best effort: report the failure and move on to the next company.
        print(f"‚ö†Ô∏è Error for {company}: {e}")

# Optional: Print summary
print(f"\nTotal jobs collected: {len(all_jobs)}")


‚úÖ Retrieved 52 jobs for Duolingo

Total jobs collected: 52


In [11]:
import time
import re
import numpy as np
import ollama
from sentence_transformers import SentenceTransformer

# =========================================================
# 1. LOCATION FILTER (US ONLY)
# =========================================================

us_states = {
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL',
    'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT',
    'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI',
    'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'
}

us_keywords = ['united states', 'us', 'usa']

location_exclusions = [
    "australia", "emea", "united kingdom", "india", "ind",
    "deu", "dusseldorf", "toronto", "mexico"
]


def _whole_word_pattern(terms, flags=0):
    """Compile a regex that matches any of `terms` as a whole word or phrase."""
    longest_first = sorted(terms, key=len, reverse=True)
    alternatives = "|".join(re.escape(term) for term in longest_first)
    return re.compile(rf"\b(?:{alternatives})\b", flags)


# Whole-word matching replaces the earlier substring tests, which let 'us'
# match "Brussels" and let the "ind" exclusion reject "Indianapolis".
_LOCATION_EXCLUSION_RE = _whole_word_pattern(location_exclusions, re.IGNORECASE)
_US_KEYWORD_RE = _whole_word_pattern(us_keywords, re.IGNORECASE)
# State codes stay case-sensitive so that ordinary lowercase words such as
# "in" or "or" are not read as Indiana or Oregon.
_US_STATE_RE = _whole_word_pattern(us_states)


def is_us_location(location):
    """Return True when a location string looks like a U.S. or remote posting.

    An empty/None location, or one containing an excluded term, is rejected.
    Otherwise the location is accepted when it contains a U.S. keyword, a
    standalone upper-case state code, or the word "remote".
    """
    if not location:
        return False
    if _LOCATION_EXCLUSION_RE.search(location):
        return False
    return bool(
        _US_KEYWORD_RE.search(location)
        or _US_STATE_RE.search(location)
        # NOTE(review): any "remote" posting that is not explicitly excluded
        # counts as U.S. here; confirm that this is intended.
        or "remote" in location.lower()
    )


us_jobs = [job for job in all_jobs if is_us_location(job.get("location", ""))]

print(f"üá∫üá∏ Found {len(us_jobs)} U.S. or U.S.-Remote jobs.")

# =========================================================
# 2. LEXICAL SENIORITY FILTER (IMMEDIATELY AFTER LOCATION)
# =========================================================

# Seniority / function terms that disqualify a title outright.
EXCLUDE_TERMS = [
    "vp", "vice president", "svp", "evp",
    "director", "sr director", "senior director",
    "head of", "principal", "staff",
    "manager", "senior manager", "area manager",
    "executive", "recruiter", "human resources", "hr", "senior"
]

def normalize_title(title):
    """Lowercase a title, turn punctuation into spaces and collapse whitespace.

    An empty or None title normalizes to the empty string.
    """
    if not title:
        return ""
    title = title.lower()
    title = re.sub(r"[^\w\s]", " ", title)
    title = re.sub(r"\s+", " ", title).strip()
    return title

def passes_lexical_filter(title):
    """Return True when the normalized title contains none of EXCLUDE_TERMS.

    Terms are matched as whole words/phrases. Plain substring matching used to
    reject unrelated titles, e.g. "hr" inside "Threat" or "vp" inside "MVP".
    """
    norm = normalize_title(title)
    return not any(
        re.search(rf"\b{re.escape(term)}\b", norm)
        for term in EXCLUDE_TERMS
    )

# Split the U.S. postings by title: postings without a title are dropped,
# the rest land in the pass or reject bucket according to the lexical filter.
lexical_pass_jobs = []
lexical_rejected_jobs = []

for posting in us_jobs:
    posting_title = posting.get("title", "")
    if posting_title:
        bucket = (
            lexical_pass_jobs
            if passes_lexical_filter(posting_title)
            else lexical_rejected_jobs
        )
        bucket.append(posting)

print(f"üö´ Rejected by lexical filter: {len(lexical_rejected_jobs)}")
print(f"‚úÖ Passed lexical filter: {len(lexical_pass_jobs)}")

# =========================================================
# 3. EMBEDDING SIMILARITY + AUTO YES / NO ROUTING
# =========================================================

# Free-text description of the target role; each title is scored against it.
TARGET_ROLE_DESCRIPTION = """
Entry level or new graduate role in data science, analytics,
computer science, software engineering, data engineering,
applied statistics, or machine learning, based in the United States.
"""

EMBEDDING_FLOOR = 0.19     # auto-NO
AUTO_YES_SCORE = 0.45     # auto-YES

embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Keep only postings that have a title; `titles` and `job_refs` stay aligned
# index-for-index.
job_refs = [job for job in lexical_pass_jobs if job.get("title", "")]
titles = [job.get("title", "") for job in job_refs]

# Embed target role
target_embedding = embedder.encode(
    [TARGET_ROLE_DESCRIPTION],
    normalize_embeddings=True
)[0]

# Batch embed titles. The encoder is skipped when nothing survived the lexical
# filter, so the similarity step always receives a 2-D (n, dim) array instead
# of failing on an empty batch.
if titles:
    title_embeddings = embedder.encode(
        titles,
        normalize_embeddings=True
    )
else:
    title_embeddings = np.empty((0, target_embedding.shape[0]))

# Cosine similarity (both sides are requested with normalize_embeddings=True,
# so the dot product is used directly).
similarities = np.dot(title_embeddings, target_embedding)

# Routing buckets
auto_yes_jobs = []
llm_candidate_jobs = []
embedding_rejected_jobs = []

for score, job in zip(similarities, job_refs):
    # The score is stored on the job dict itself (in-place mutation).
    job["embedding_score"] = float(score)

    if score < EMBEDDING_FLOOR:
        embedding_rejected_jobs.append(job)

    elif score >= AUTO_YES_SCORE:
        auto_yes_jobs.append(job)

    else:
        # Uncertain band between the two thresholds.
        llm_candidate_jobs.append(job)

print(f"üìâ Auto-NO (embedding < {EMBEDDING_FLOOR}): {len(embedding_rejected_jobs)}")
print(f"‚ö° Auto-YES (embedding ‚â• {AUTO_YES_SCORE}): {len(auto_yes_jobs)}")
print(f"ü§î Sent to LLM: {len(llm_candidate_jobs)}")

# =========================================================
# 4. LLM ARBITRATION (ONLY UNCERTAIN BAND)
# =========================================================

def should_apply(title):
    """Ask the `mistral` model (via `ollama.chat`) whether to apply for a role.

    Args:
        title: Job title to judge.

    Returns:
        "YES" or "NO". The verdict is the first standalone YES/NO word in the
        model's reply; a reply containing neither is treated as "NO".
    """
    prompt = f"""
You are helping an upcoming graduate in Statistics decide whether to apply for jobs.

CRITERIA:
- Target roles related to data science, analytics, computer science, software engineering, data engineering, or applied statistics
- Entry-level, new grad, or roles that do NOT require extensive experience
- Role must plausibly exist in the United States

Respond with ONLY one word:
YES or NO

Role title:
{title}
"""
    response = ollama.chat(
        model="mistral",
        messages=[{"role": "user", "content": prompt}],
        options={"temperature": 0.0}
    )

    answer = response["message"]["content"].strip().upper()
    # Take the first whole-word verdict. A plain `"YES" in answer` check turned
    # replies such as "NO. A YES would require ..." into YES.
    verdict = re.search(r"\b(YES|NO)\b", answer)
    return verdict.group(1) if verdict else "NO"


# Final buckets: the auto-YES postings are accepted up front, and the LLM
# decides the uncertain band one title at a time.
llm_no_titles = set()            # titles the LLM has already rejected
yes_jobs = list(auto_yes_jobs)
no_jobs = []

for i, job in enumerate(llm_candidate_jobs, start=1):
    title = job.get("title", "")

    if title and title in llm_no_titles:
        # Same title was rejected earlier in this loop: reuse that verdict.
        print(f"[LLM SKIP] Previously rejected ‚Üí {title}")
        no_jobs.append(job)

    elif title:
        decision = should_apply(title)
        print(f"[LLM {i}/{len(llm_candidate_jobs)}] {decision} ‚Üí {title}")

        if decision != "YES":
            no_jobs.append(job)
            llm_no_titles.add(title)   # remember the rejection
        else:
            yes_jobs.append(job)

        # Pause between model calls.
        time.sleep(0.5)


# =========================================================
# 5. SUMMARY
# =========================================================

# Share of all fetched postings that ended up in the YES bucket
# (0 when nothing was fetched).
total = len(all_jobs)
if total > 0:
    percent_yes = len(yes_jobs) / total * 100
else:
    percent_yes = 0

# Note: `yes_jobs` holds the auto-YES postings as well as the LLM approvals.
summary_lines = [
    "\n" + "=" * 70,
    "FINAL SUMMARY",
    f"Total original jobs: {total}",
    f"US-filtered jobs: {len(us_jobs)}",
    f"Lexical rejected: {len(lexical_rejected_jobs)}",
    f"Embedding rejected: {len(embedding_rejected_jobs)}",
    f"LLM YES (apply): {len(yes_jobs)}",
    f"LLM NO (skip): {len(no_jobs)}",
    f"Percent YES overall: {percent_yes:.2f}%",
]
print("\n".join(summary_lines))

üá∫üá∏ Found 48 U.S. or U.S.-Remote jobs.
üö´ Rejected by lexical filter: 40
‚úÖ Passed lexical filter: 8
üìâ Auto-NO (embedding < 0.19): 1
‚ö° Auto-YES (embedding ‚â• 0.45): 2
ü§î Sent to LLM: 5
[LLM 1/5] NO ‚Üí Business Development Lead, Japan & Korea
[LLM 2/5] NO ‚Üí Influencer Marketing, Intern
[LLM 3/5] NO ‚Üí Internal Audit, Intern
[LLM 4/5] YES ‚Üí Learning Scientist, Efficacy Research
[LLM 5/5] NO ‚Üí Social Content Creator, Intern

FINAL SUMMARY
Total original jobs: 52
US-filtered jobs: 48
Lexical rejected: 40
Embedding rejected: 1
LLM YES (apply): 3
LLM NO (skip): 4
Percent YES overall: 5.77%


In [None]:
import json
import os
from datetime import datetime

# Columns you want to persist
# Fields written to the run file, in output order.
COLUMNS = ["company", "title", "location", "url", "posted", "applied", "applied_at"]

# Build the export rows. The application-tracking fields are added to the
# job dicts themselves (only when absent) before the projection is taken.
filtered_jobs = []
for job in yes_jobs:
    job.setdefault("applied", False)
    job.setdefault("applied_at", None)
    filtered_jobs.append({k: job.get(k) for k in COLUMNS})

# Each run goes to its own timestamped file under runs/.
os.makedirs("runs", exist_ok=True)
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
filename = "runs/upgrade_" + timestamp + ".json"

with open(filename, "w", encoding="utf-8") as f:
    f.write(json.dumps(filtered_jobs, indent=2, ensure_ascii=False))

print(f"‚úÖ Saved {len(filtered_jobs)} jobs to {filename}")


‚úÖ Saved 3 jobs to runs/upgrade_20260114_011445.json


In [13]:
from git import Repo
from datetime import datetime

# Commit the run output and push it to the `origin` remote.
repo = Repo(".")

# Add ALL changes, including untracked files
# NOTE(review): this stages everything in the working tree, not only the new
# run file; confirm that is intended.
repo.git.add(all=True)

repo.index.commit(f"Add job run ({timestamp})")

# Remote.push() reports rejected or failed pushes through its return value
# rather than by raising, so inspect it before announcing success.
push_results = repo.remote(name="origin").push()
failed_pushes = [info for info in push_results if info.flags & info.ERROR]
if len(push_results) == 0 or failed_pushes:
    details = "; ".join(str(info.summary).strip() for info in failed_pushes)
    raise RuntimeError(f"Push to origin failed: {details or 'no refs were pushed'}")

print("üöÄ Run data committed and pushed to GitHub")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

üöÄ Run data committed and pushed to GitHub
