In [12]:
import pandas as pd
import numpy as np

def load_data(path="../data/final_job_match_dataset.csv"):
    return pd.read_csv(path)

def add_tenure_features(df):
    df["tenure_months"] = (
        pd.to_datetime(df["enddate"], errors="coerce")
        - pd.to_datetime(df["startdate"], errors="coerce")
    ).dt.days / 30.4
    return df

def add_mobility_features(df):
    # sort by user and startdate so "next job" makes sense
    df = df.copy()
    if "startdate" in df.columns:
        df["startdate"] = pd.to_datetime(df["startdate"], errors="coerce")
    df = df.sort_values(["user_id", "startdate", "position_id"])

    # job_switch = 1 if user has a next position after this one
    df["job_switch"] = df.groupby("user_id")["position_id"].shift(-1).notna().astype(int)
    return df

def add_wage_proxy(df):
    df = df.copy()

    seniority_map = {
        "intern":0, "internship":0,
        "entry":1, "junior":1, "jr":1, "jr.":1,
        "associate":2, "mid":2,
        "senior":3, "sr":3, "sr.":3, "staff":3,
        "lead":4, "manager":4,
        "principal":5, "director":5,
        "vp":6, "vice president":6,
        "cxo":7, "chief":7, "executive":7, "ceo":7, "cto":7, "cfo":7, "coo":7
    }

    # force string safely
    s = df["seniority"].astype("string").str.lower().str.strip()
    df["seniority_score"] = s.map(seniority_map)

    return df



In [13]:
df = load_data()
df = add_mobility_features(df)
df = add_wage_proxy(df)

df["seniority_score"] = df["seniority_score"].fillna(df["seniority_score"].median()).fillna(0)

print(df["seniority"].head(10))
print(df["seniority_score"].describe())


3249    5
3250    1
3251    2
3252    2
3253    5
3254    5
3255    1
456     1
457     1
2598    2
Name: seniority, dtype: int64
count    3346.0
mean        0.0
std         0.0
min         0.0
25%         0.0
50%         0.0
75%         0.0
max         0.0
Name: seniority_score, dtype: float64


In [14]:
print("Share mapped seniority_score:", df["seniority_score"].notna().mean())
print(df["seniority"].astype("string").str.lower().value_counts().head(20))


Share mapped seniority_score: 1.0
seniority
1    1237
2    1054
5     361
4     346
3     286
6      51
7      11
Name: count, dtype: int64[pyarrow]


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

df = load_data()
df = add_mobility_features(df)
df = add_wage_proxy(df)

# Fill unmapped seniority values
df["seniority_score"] = df["seniority_score"].fillna(df["seniority_score"].median()).fillna(0)

features = [
    "match_score_final",
    "edu_match_score",
    "exp_match_score",
    "train_match_score",
    "seniority_score"
]

X = df[features].fillna(0)
y = df["job_switch"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model = RandomForestClassifier(n_estimators=300, random_state=0)
model.fit(X_train, y_train)

print("Test accuracy:", model.score(X_test, y_test))


Test accuracy: 0.6582089552238806


In [17]:
from sklearn.metrics import classification_report, roc_auc_score

pred = model.predict(X_test)
proba = model.predict_proba(X_test)[:, 1]

print("Base rate (job_switch=1):", y_test.mean())
print(classification_report(y_test, pred))
print("ROC AUC:", roc_auc_score(y_test, proba))


Base rate (job_switch=1): 0.7014925373134329
              precision    recall  f1-score   support

           0       0.41      0.33      0.36       200
           1       0.74      0.80      0.77       470

    accuracy                           0.66       670
   macro avg       0.57      0.56      0.56       670
weighted avg       0.64      0.66      0.65       670

ROC AUC: 0.5614202127659574


In [18]:
df = load_data()
df = add_mobility_features(df)

df = df.sort_values(["user_id","startdate"])

df["next_match"] = df.groupby("user_id")["match_score_final"].shift(-1)
df["match_change"] = df["next_match"] - df["match_score_final"]

print("Average match change:", df["match_change"].mean())
print(df["match_change"].describe())


Average match change: 1.3336528881422322e-05
count    2175.000000
mean        0.000013
std         0.102265
min        -0.459688
25%        -0.035847
50%         0.000000
75%         0.037500
max         0.393257
Name: match_change, dtype: float64


In [19]:
df = load_data()
df = add_wage_proxy(df)
df = add_mobility_features(df)

df = df.sort_values(["user_id","startdate"])
df["next_seniority"] = df.groupby("user_id")["seniority_score"].shift(-1)

print(df[["match_score_final","next_seniority"]].corr())


                   match_score_final  next_seniority
match_score_final                1.0             NaN
next_seniority                   NaN             NaN


In [20]:
within = df.groupby("soc_code_final")["match_score_final"].std().mean()
overall = df["match_score_final"].std()

print("Within-SOC std:", within)
print("Overall std:", overall)


Within-SOC std: 0.05312690277801538
Overall std: 0.09700706085221868


In [21]:
import numpy as np

shuffled = df.copy()
shuffled["match_shuffled"] = np.random.permutation(shuffled["match_score_final"])

print("Real mean:", df["match_score_final"].mean())
print("Shuffled mean:", shuffled["match_shuffled"].mean())


Real mean: 0.4123682590960156
Shuffled mean: 0.4123682590960156


In [22]:
df = load_data()
df = add_mobility_features(df)

df = df.sort_values(["user_id","startdate"])
df["next_soc"] = df.groupby("user_id")["soc_code_final"].shift(-1)

df["occ_switch"] = (df["soc_code_final"] != df["next_soc"]).astype(int)

print(df["occ_switch"].mean())


0.8523610280932457


In [24]:
from sklearn.metrics import roc_auc_score

df = load_data()
df = add_mobility_features(df)
df = df.sort_values(["user_id","startdate"])

df["next_soc"] = df.groupby("user_id")["soc_code_final"].shift(-1)
df["occ_switch"] = (df["soc_code_final"] != df["next_soc"]).astype(int)

valid = df.dropna(subset=["next_soc", "soc_code_final", "match_score_final"]).copy()

y = valid["occ_switch"]
x = valid["match_score_final"]

print("N:", len(valid))
print("Occ switch base rate:", y.mean())
print("AUC (lower score => switch):", roc_auc_score(y, -x))
print("AUC (higher score => stay):", roc_auc_score(1 - y, x))  # same info, different framing


N: 2175
Occ switch base rate: 0.7728735632183908
AUC (lower score => switch): 0.5553061484994232
AUC (higher score => stay): 0.5553061484994233


In [25]:
valid["score_bin"] = pd.qcut(valid["match_score_final"], 10, duplicates="drop")
switch_rate = valid.groupby("score_bin")["occ_switch"].mean()
print(switch_rate)


score_bin
(0.0942, 0.294]    0.848624
(0.294, 0.346]     0.774194
(0.346, 0.378]     0.788991
(0.378, 0.41]      0.751152
(0.41, 0.435]      0.779817
(0.435, 0.45]      0.820276
(0.45, 0.466]      0.797235
(0.466, 0.49]      0.775229
(0.49, 0.524]      0.774194
(0.524, 0.718]     0.619266
Name: occ_switch, dtype: float64


In [27]:
from sklearn.metrics import roc_auc_score

df = load_data()
df = add_mobility_features(df)
df = df.sort_values(["user_id","startdate"])

df["next_match"] = df.groupby("user_id")["match_score_final"].shift(-1)

# define improvement BEFORE subsetting
df["improved_match"] = (df["next_match"] > df["match_score_final"]).astype(int)

valid = df.dropna(subset=["next_match","match_score_final","improved_match"]).copy()

print("N:", len(valid))
print("Improve base rate:", valid["improved_match"].mean())

print("AUC (low match => improvement next):",
      roc_auc_score(valid["improved_match"], -valid["match_score_final"]))


N: 2175
Improve base rate: 0.43310344827586206
AUC (low match => improvement next): 0.705678329312622


In [28]:
valid["score_bin"] = pd.qcut(valid["match_score_final"], 10, duplicates="drop")
print(valid.groupby("score_bin")["improved_match"].mean())


score_bin
(0.0942, 0.294]    0.839450
(0.294, 0.346]     0.576037
(0.346, 0.378]     0.472477
(0.378, 0.41]      0.442396
(0.41, 0.435]      0.513761
(0.435, 0.45]      0.437788
(0.45, 0.466]      0.382488
(0.466, 0.49]      0.298165
(0.49, 0.524]      0.285714
(0.524, 0.718]     0.082569
Name: improved_match, dtype: float64
