In [None]:
#cell 1: Setup 
import warnings
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn import __version__ as sklearn_version
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Reproducibility: a single seed, applied to NumPy's global RNG here and
# reused as random_state by the train/test split in cell 7.
SEED = 42
np.random.seed(SEED)

# Mute every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# Cleaned dataset consumed by this notebook (path is relative to the
# current working directory).
DATA_PATH = Path("artifacts/cleaned/employee_full_clean.csv")

# Fail fast, with the absolute path in the message, when the input is absent.
assert DATA_PATH.exists(), f"Missing file: {DATA_PATH.resolve()}"
print("Setup complete. Using:", DATA_PATH.resolve(), "| sklearn:", sklearn_version)


Setup complete. Using: /Users/yongryan/Downloads/bigdataryan /artifacts/cleaned/employee_full_clean.csv | sklearn: 1.6.1


In [None]:
#cell 2: Load Cleaned Data
# Read the whole cleaned CSV into memory (low_memory=False is passed straight
# through to pandas' CSV reader).
df = pd.read_csv(DATA_PATH, low_memory=False)

# Row/column counts, then a preview of the first five rows as the cell output.
print(f"Loaded: {df.shape[0]:,} rows × {df.shape[1]} cols")
df.head()



Loaded: 999,895 rows × 12 cols


Unnamed: 0,job_id,company_id,job_role,education,major,industry,years_experience,distance_from_cbd,is_outlier_years_experience,is_outlier_distance_from_cbd,salary_in_thousands,is_outlier_salary
0,JOB1362684407687,COMP37,CFO,MASTER,MATH,HEALTH,10.0,83.0,False,False,130.0,False
1,JOB1362684407688,COMP19,CEO,HIGH_SCHOOL,NONE,WEB,3.0,73.0,False,False,101.0,False
2,JOB1362684407697,COMP56,JANITOR,HIGH_SCHOOL,NONE,HEALTH,24.0,30.0,False,False,102.0,False
3,JOB1362684407698,COMP7,CEO,MASTER,PHYSICS,EDUCATION,7.0,79.0,False,False,144.0,False
4,JOB1362684407699,COMP4,JUNIOR,NONE,NONE,OIL,8.0,29.0,False,False,79.0,False


In [None]:
#cell 3: Sanity Checks (post-cleaning) + Normalize flag dtypes
# Overview first: the 20 columns with the most missing values, then all dtypes.
na_per_column = df.isna().sum()
print("Missing values per column:\n", na_per_column.sort_values(ascending=False).head(20))
print("\nDtypes:\n", df.dtypes)

# Columns the rest of the notebook relies on; abort if any of them is absent.
required_cols = [
    "job_id","company_id","job_role","education","major","industry",
    "years_experience","distance_from_cbd","salary_in_thousands"
]
missing_required = []
for required in required_cols:
    if required not in df.columns:
        missing_required.append(required)
assert not missing_required, f"Missing expected columns: {missing_required}"

# job_id check as a warning (some datasets use a different format):
# id_ok is the share of rows whose job_id is "JOB" followed only by digits.
job_id_text = df["job_id"].astype(str)
id_ok = job_id_text.str.match(r"^JOB\d+$", na=False).mean()
if id_ok < 0.95:
    print(f"WARNING: job_id pattern 'JOB###' found in only {100*id_ok:.1f}% of rows. Proceeding.")

# normalise outlier flags to clean booleans (even if unused as features):
# real booleans and their "True"/"False" spellings are mapped; anything the
# lookup does not know (including missing values) ends up as False.
flag_lookup = {True: True, False: False, "True": True, "False": False}
for c in ["is_outlier_years_experience", "is_outlier_distance_from_cbd", "is_outlier_salary"]:
    if c not in df.columns:
        continue
    df[c] = df[c].map(flag_lookup).fillna(False).astype(bool)


Missing values per column:
 is_outlier_salary               225
salary_in_thousands             225
job_id                            0
company_id                        0
education                         0
job_role                          0
major                             0
industry                          0
distance_from_cbd                 0
years_experience                  0
is_outlier_distance_from_cbd      0
is_outlier_years_experience       0
dtype: int64

Dtypes:
 job_id                           object
company_id                       object
job_role                         object
education                        object
major                            object
industry                         object
years_experience                float64
distance_from_cbd               float64
is_outlier_years_experience        bool
is_outlier_distance_from_cbd       bool
salary_in_thousands             float64
is_outlier_salary                object
dtype: object


In [None]:
#cell 4: define feature groups
# Column the model will predict.
target = "salary_in_thousands"

# Raw input columns, grouped by how they are preprocessed later on.
numeric_cols = ["years_experience", "distance_from_cbd"]
categorical_cols = ["company_id", "job_role", "education", "major", "industry"]

# outlier flags created during cleaning (only those present in the frame)
outlier_cols = []
for flag in ["is_outlier_years_experience", "is_outlier_distance_from_cbd", "is_outlier_salary"]:
    if flag in df.columns:
        outlier_cols.append(flag)

# Echo the three groups for the record.
for label, group in (
    ("Categorical:", categorical_cols),
    ("Numeric:", numeric_cols),
    ("Outlier flags:", outlier_cols),
):
    print(label, group)


Categorical: ['company_id', 'job_role', 'education', 'major', 'industry']
Numeric: ['years_experience', 'distance_from_cbd']
Outlier flags: ['is_outlier_years_experience', 'is_outlier_distance_from_cbd', 'is_outlier_salary']


In [None]:
#cell 5: feature Engineering (core)
# assign() returns a new frame, so the frame loaded in cell 2 is not modified.
df = df.assign(
    # 1) log1p of the target (for diagnostics/option later; not used as X)
    log_salary=np.log1p(df[target]),
    # 2) experience buckets: (-1, 5] -> junior, (5, 15] -> mid, above 15 -> senior
    exp_level=pd.cut(
        df["years_experience"],
        bins=[-1, 5, 15, np.inf],
        labels=["junior", "mid", "senior"],
    ),
    # 3) cbd proximity flag: 1 when distance_from_cbd < 10, else 0
    #    (binary engineered numeric; passed through unscaled later)
    near_cbd=(df["distance_from_cbd"] < 10).astype(int),
)

# lock categorical dtypes for stable OHE
category_cols = [
    c
    for c in ["company_id","job_role","education","major","industry","exp_level"]
    if c in df.columns
]
df = df.astype({c: "category" for c in category_cols})

print(df[["years_experience","exp_level","distance_from_cbd","near_cbd"]].head())


   years_experience exp_level  distance_from_cbd  near_cbd
0              10.0       mid               83.0         0
1               3.0    junior               73.0         0
2              24.0    senior               30.0         0
3               7.0       mid               79.0         0
4               8.0       mid               29.0         0


In [None]:
#cell 6: (Optional) Interactions
ADD_INTERACTIONS = True  # set false to get minimal features

if ADD_INTERACTIONS:
    # new column -> the pair of source columns it combines
    interaction_specs = {
        "edu_industry": ("education", "industry"),
        "role_exp": ("job_role", "exp_level"),
    }
    for new_col, (left, right) in interaction_specs.items():
        # An interaction is built only when both of its parents are present.
        if left in df.columns and right in df.columns:
            combined = df[left].astype(str) + "__" + df[right].astype(str)
            df[new_col] = combined.astype("category")
    print("Added interactions:", [c for c in ["edu_industry","role_exp"] if c in df.columns])


Added interactions: ['edu_industry', 'role_exp']


In [None]:
#cell 7: define final X columns & Split
# Interaction columns exist only when cell 6 created them.
extra_cat = [c for c in ["edu_industry","role_exp"] if c in df.columns]
categorical_model_cols = categorical_cols + extra_cat

X_base_cols = categorical_model_cols + numeric_cols + ["exp_level","near_cbd"]

#outlier flags as weak signals:
# NOTE: the ColumnTransformer in cell 9 is built with remainder="drop", so
# flags added here only reach the model if they are also listed in one of its
# transformers.
USE_OUTLIER_FLAGS = False
if USE_OUTLIER_FLAGS:
    X_base_cols += outlier_cols

#remove target & anything derived from it if they slipped in.
# `is_outlier_salary` flags outliers of the target column (presumably computed
# from salary itself during cleaning -- its NaN count matches the target's),
# so using it as a feature would leak the label. It is excluded even when
# USE_OUTLIER_FLAGS is switched on.
target_derived_cols = [target, "log_salary", "is_outlier_salary"]
X_base_cols = [c for c in X_base_cols if c not in target_derived_cols]

#drop rows with missing target
keep_mask = df[target].notna()
df_use = df.loc[keep_mask].reset_index(drop=True)

X = df_use[X_base_cols].copy()
y = df_use[target].copy()

# 80/20 split; random_state=SEED keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=SEED
)

print("Train shapes:", X_train.shape, y_train.shape)
print("Test  shapes:", X_test.shape,  y_test.shape)


Train shapes: (799736, 11) (799736,)
Test  shapes: (199934, 11) (199934,)


In [None]:
#cell 8: Rare-category bucketing (train-driven)
APPLY_RARE_BUCKETING = True
RARE_THRESH = 10  # categories with < RARE_THRESH occurrences in train -> "__OTHER__"

# {col_name: [rare_levels_as_str]} -- levels judged rare on TRAIN only.
# Stays empty when bucketing is switched off or nothing is rare.
rare_map = {}

if APPLY_RARE_BUCKETING:
    for c in categorical_model_cols + ["exp_level"]:
        if c not in X_train.columns:
            continue

        # Frequencies are counted on the string form of the TRAIN column.
        train_str = X_train[c].astype(str)
        level_counts = train_str.value_counts()
        rare = list(level_counts[level_counts < RARE_THRESH].index)
        if not rare:
            continue
        rare_map[c] = rare

        # Apply the same train-derived rare list to both splits, then
        # restore a categorical dtype.
        test_str = X_test[c].astype(str)
        X_train[c] = train_str.mask(train_str.isin(rare), "__OTHER__").astype("category")
        X_test[c] = test_str.mask(test_str.isin(rare), "__OTHER__").astype("category")

    print(f"Rare-category bucketing applied where needed (threshold={RARE_THRESH}). Columns: {list(rare_map.keys())}")


Rare-category bucketing applied where needed (threshold=10). Columns: ['job_role', 'industry', 'edu_industry', 'role_exp']


In [None]:
#cell 9: preprocessing(OHE)
def _ohe(handle_unknown="ignore", make_sparse=True, drop=None):
    """Build a OneHotEncoder whose sparsity keyword fits the installed sklearn.

    Parameters
    ----------
    handle_unknown : forwarded to OneHotEncoder unchanged.
    make_sparse : value given to ``sparse_output`` when the installed version
        is 1.2 or newer, otherwise to ``sparse``.
    drop : forwarded as ``drop`` only when not None
        (eg "if_binary" for linear models' full-rank design).

    Returns
    -------
    An unfitted OneHotEncoder.
    """
    # Only the first two components of the version string are compared.
    major, minor = map(int, sklearn_version.split(".")[:2])
    sparse_kw = "sparse_output" if (major, minor) >= (1, 2) else "sparse"

    kwargs = {"handle_unknown": handle_unknown}
    if drop is not None:
        kwargs["drop"] = drop
    kwargs[sparse_kw] = make_sparse
    return OneHotEncoder(**kwargs)

# Column routing: exp_level joins the one-hot group, near_cbd is passed
# through untouched (no scaling), the raw numerics are standardised.
cat_for_ohe = [c for c in categorical_model_cols + ["exp_level"] if c in X_train.columns]
num_for_scale = [c for c in numeric_cols if c in X_train.columns]
bin_passthrough = [c for c in ["near_cbd"] if c in X_train.columns]


# Optional `drop` setting forwarded to the encoder; None leaves it unset.
DROP_IF_BINARY = False
drop_mode = None
if DROP_IF_BINARY:
    drop_mode = "if_binary"

# Step order here is the column-block order that cell 10 mirrors when it
# assembles feature names.
numeric_step = ("num", StandardScaler(with_mean=True, with_std=True), num_for_scale)
categorical_step = ("cat", _ohe(make_sparse=True, drop=drop_mode), cat_for_ohe)
binary_step = ("bin", "passthrough", bin_passthrough)

preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step, binary_step],
    remainder="drop",  # columns not listed in any step are discarded
)

for label, value in (
    ("Numerics to scale:", num_for_scale),
    ("Categoricals to OHE:", cat_for_ohe),
    ("Binary passthrough:", bin_passthrough),
    ("OHE drop mode:", drop_mode),
):
    print(label, value)


Numerics to scale: ['years_experience', 'distance_from_cbd']
Categoricals to OHE: ['company_id', 'job_role', 'education', 'major', 'industry', 'edu_industry', 'role_exp', 'exp_level']
Binary passthrough: ['near_cbd']
OHE drop mode: None


In [None]:
#cell 10: Fit & Build Stable Feature Names
# Fit on the training split only.
preprocessor.fit(X_train)

# Names produced by the fitted one-hot encoder for its input columns.
ohe = preprocessor.named_transformers_["cat"]
ohe_names = list(ohe.get_feature_names_out(cat_for_ohe))

# Passthrough (binary flag) columns as recorded on the fitted transformer;
# empty when no step named "bin" is found.
bin_cols = next(
    (list(cols) for name, _, cols in preprocessor.transformers_ if name == "bin"),
    [],
)

# Same block order as the transformer list: scaled numerics, OHE, passthrough.
feature_names = [*num_for_scale, *ohe_names, *bin_cols]

print("Total transformed features:", len(feature_names))
print("First 20 feature names:", feature_names[:20])


Total transformed features: 162
First 20 feature names: ['years_experience', 'distance_from_cbd', 'company_id_COMP0', 'company_id_COMP1', 'company_id_COMP10', 'company_id_COMP11', 'company_id_COMP12', 'company_id_COMP13', 'company_id_COMP14', 'company_id_COMP15', 'company_id_COMP16', 'company_id_COMP17', 'company_id_COMP18', 'company_id_COMP19', 'company_id_COMP2', 'company_id_COMP20', 'company_id_COMP21', 'company_id_COMP22', 'company_id_COMP23', 'company_id_COMP24']


In [None]:
#cell 11: Transform Datasets
# Apply the preprocessor fitted on train (cell 10) to both splits.
X_train_t = preprocessor.transform(X_train)
X_test_t  = preprocessor.transform(X_test)

# feature_names was assembled by hand in cell 10; verify it lines up
# one-to-one with the transformer's output columns before it is persisted
# as metadata, so a silent misalignment cannot reach downstream consumers.
assert X_train_t.shape[1] == X_test_t.shape[1] == len(feature_names), (
    f"Feature-name/column mismatch: train={X_train_t.shape[1]}, "
    f"test={X_test_t.shape[1]}, names={len(feature_names)}"
)

print("Transformed shapes (X_train_t, X_test_t):", X_train_t.shape, X_test_t.shape)


Transformed shapes (X_train_t, X_test_t): (799736, 162) (199934, 162)


In [None]:
#cell 12: diagnostics / sanity Checks
# (`sparse` is imported in the notebook's top import cell.)

# 1) no leakage in raw X
assert target not in X_train.columns and "log_salary" not in X_train.columns, "Target/leakage in X!"

# 2) sparse/dense type
print("X_train_t is sparse:", sparse.issparse(X_train_t))

# 3) quick business sanity: avg salary by exp_level (monotonic-ish expectation)
# `observed` is passed explicitly because exp_level is categorical and the
# pandas default for it is changing; the reindex restores any level that has
# no rows, so the printed table is the same either way.
# NOTE(review): the saved output shows "senior" below "mid" -- worth a look.
if "exp_level" in df_use.columns:
    g = (
        df_use.groupby("exp_level", observed=True)[target]
        .mean()
        .reindex(["junior","mid","senior"])
    )
    print("\nAvg salary by exp_level:\n", g)

# 4) distance effect: near_cbd vs salary
if "near_cbd" in df_use.columns:
    print("\nAvg salary near_cbd=0/1:\n", df_use.groupby("near_cbd")[target].mean())


X_train_t is sparse: True

Avg salary by exp_level:
 exp_level
junior     96.960881
mid       138.044550
senior    132.193247
Name: salary_in_thousands, dtype: float64

Avg salary near_cbd=0/1:
 near_cbd
0    125.158025
1    134.227061
Name: salary_in_thousands, dtype: float64


In [None]:
#cell 13: zero-variance check (on transformed train)
# (`np` and `sparse` are imported in the notebook's top import cell.)
#
# A column has zero variance exactly when its minimum equals its maximum.
# Counting stored entries in a sparse matrix only finds all-zero columns and
# misses columns that are constant at a non-zero value (e.g. a one-hot level
# present in every row), and comparing a floating-point std to 0 is fragile,
# so both paths use the exact min == max test.
if sparse.issparse(X_train_t):
    X_train_csc = X_train_t.tocsc()
    # Sparse min/max along axis 0 account for implicit zeros and return a
    # 1 x n_cols sparse result, flattened here to a 1-D array.
    col_min = X_train_csc.min(axis=0).toarray().ravel()
    col_max = X_train_csc.max(axis=0).toarray().ravel()
else:
    X_train_dense = np.asarray(X_train_t)
    col_min = X_train_dense.min(axis=0)
    col_max = X_train_dense.max(axis=0)

# Positional indices of constant columns in the transformed matrix.
zero_var_cols = np.where(col_min == col_max)[0]

if len(zero_var_cols):
    print("WARNING: zero-variance columns in transformed X_train_t:", len(zero_var_cols))
else:
    print("Zero-variance check passed: none found.")


Zero-variance check passed: none found.


In [None]:
#cell 14: persist Artifacts (preprocessor, splits, meta)
ART_DIR = Path("artifacts/prepared")
ART_DIR.mkdir(parents=True, exist_ok=True)

# X_train_t / X_test_t are the matrices produced in cell 11; they are not
# recomputed here.

# 1) the fitted ColumnTransformer
joblib.dump(preprocessor, ART_DIR / "preprocessor.joblib")

# 2) transformed feature matrices together with their targets
dataset_splits = {"X_train_t": X_train_t, "X_test_t": X_test_t, "y_train": y_train, "y_test": y_test}
joblib.dump(dataset_splits, ART_DIR / "dataset_splits.joblib")

# 3) metadata describing how the matrices were built
rare_bucketing_meta = {
    "applied": bool(rare_map),
    "threshold": RARE_THRESH if APPLY_RARE_BUCKETING else None,
    "rare_map": rare_map,  # exact rare levels per column learned on TRAIN
}
used_interactions = [c for c in ["edu_industry","role_exp"] if c in X_train.columns]

feature_meta = {
    "feature_names": feature_names,
    "ohe_feature_names": ohe_names,
    "cat_for_ohe": cat_for_ohe,
    "num_for_scale": num_for_scale,
    "bin_cols": bin_cols,
    "categorical_model_cols": categorical_model_cols,
    "used_interactions": used_interactions,
    "rare_bucketing": rare_bucketing_meta,
    "ohe_drop_mode": drop_mode,
}
joblib.dump(feature_meta, ART_DIR / "feature_meta.joblib")

print("Saved artifacts to:", ART_DIR.resolve())


Saved artifacts to: /Users/yongryan/Downloads/bigdataryan /artifacts/prepared
