In [2]:
!pip install -q mlflow==2.16.0 openpyxl scikit-learn==1.5.1 joblib databricks-sdk==0.36.0

import pandas as pd
import numpy as np
import os
import joblib
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt

# Modeling + preprocessing libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

# Metrics
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)

print("‚úî All libraries imported successfully.")

‚úî All libraries imported successfully.


In [3]:
# =================================================================================
# CONFIGURATION: Connect to Databricks MLflow
# ===================================================================================
# NOTE FOR SECURITY: Avoid hardcoding secrets.
# Please use Colab's "Secrets" feature (key icon üîë on the left) to store
# your host and token with the names 'DATABRICKS_HOST' and 'DATABRICKS_TOKEN'.
# The code below will then securely access them.

from google.colab import userdata

# Get the host and token from Colab's secret manager
databricks_host = userdata.get('DATABRICKS_HOST')
databricks_token = userdata.get('DATABRICKS_TOKEN')

# Set environment variables for the MLflow client to use
os.environ['DATABRICKS_HOST'] = databricks_host
os.environ['DATABRICKS_TOKEN'] = databricks_token

# Set the MLflow tracking URI to "databricks" to connect to your workspace
mlflow.set_tracking_uri("databricks")

# Set the name of the experiment. If it doesn't exist, MLflow will create it.
experiment_name = "/Users/shukhany@asu.edu/Perm_Eb2_Approval Rate"
mlflow.set_experiment(experiment_name)


<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/3214210273023929', creation_time=1764396522689, experiment_id='3214210273023929', last_update_time=1764397352398, lifecycle_stage='active', name='/Users/shukhany@asu.edu/Perm_Eb2_Approval Rate', tags={'mlflow.experiment.sourceName': '/Users/shukhany@asu.edu/Perm_Eb2_Approval '
                                 'Rate',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'shukhany@asu.edu',
 'mlflow.ownerId': '72440934139385'}>

In [4]:
from google.colab import files

# Upload PERM_2024.xlsx from your local machine
print("üìÇ Please upload PERM_2024.xlsx")
uploaded = files.upload()  # choose the file in the dialog

# Load Excel into a DataFrame
perm_raw = pd.read_excel("PERM_2024.xlsx", engine="openpyxl")

print("‚úÖ PERM 2024 loaded.")
print("Shape:", perm_raw.shape)
print("First 5 columns:", perm_raw.columns[:5].tolist())

üìÇ Please upload PERM_2024.xlsx


Saving PERM_2024.xlsx to PERM_2024 (1).xlsx
‚úÖ PERM 2024 loaded.
Shape: (92258, 155)
First 5 columns: ['CASE_NUMBER', 'CASE_STATUS', 'RECEIVED_DATE', 'DECISION_DATE', 'REFILE']


In [5]:
# ---------- Build modeling dataframe for FY 2024 ----------

# 1) Target: Certified vs everything else
perm = perm_raw[perm_raw["CASE_STATUS"].notna()].copy()
perm["TARGET_APPROVED"] = np.where(perm["CASE_STATUS"] == "Certified", 1, 0)

# Single-year flag
perm["FISCAL_YEAR"] = 2024

# 2) Columns we need for the model
cols_needed = [
    "TARGET_APPROVED",
    "PW_SOC_CODE",
    "NAICS_CODE",
    "PW_WAGE",
    "PW_UNIT_OF_PAY",
    "WAGE_OFFER_FROM",
    "WAGE_OFFER_TO",
    "WAGE_OFFER_UNIT_OF_PAY",
    "MINIMUM_EDUCATION",
    "WORKSITE_STATE",
    "FW_OWNERSHIP_INTEREST",
    "FISCAL_YEAR",
]

missing = [c for c in cols_needed if c not in perm.columns]
print("Missing columns:", missing)  # should be []

perm_model = perm[cols_needed].copy()

# 3) Wage normalization to annual
def to_annual(wage_series, unit_series):
    unit_series = unit_series.astype(str)
    factor_map = {
        "Hour": 2080,
        "Week": 52,
        "Bi-Weekly": 26,
        "Month": 12,
        "Year": 1,
    }
    factors = unit_series.map(factor_map)
    return wage_series * factors

# Prevailing wage annualized
perm_model["PW_WAGE_ANNUAL"] = to_annual(
    perm_model["PW_WAGE"], perm_model["PW_UNIT_OF_PAY"]
)

# Offered wage (midpoint of from/to) annualized
offer_mid = perm_model[["WAGE_OFFER_FROM", "WAGE_OFFER_TO"]].mean(axis=1)
perm_model["OFFER_WAGE_ANNUAL"] = to_annual(
    offer_mid, perm_model["WAGE_OFFER_UNIT_OF_PAY"]
)

# Wage ratio: how generous the offer is vs prevailing wage
perm_model["WAGE_RATIO"] = (
    perm_model["OFFER_WAGE_ANNUAL"] / perm_model["PW_WAGE_ANNUAL"]
)

# 4) Clean nonsense values
perm_model = perm_model.replace([np.inf, -np.inf], np.nan)

# Require valid wage fields + education
perm_model = perm_model.dropna(
    subset=["PW_WAGE_ANNUAL", "OFFER_WAGE_ANNUAL", "WAGE_RATIO", "MINIMUM_EDUCATION"]
)

print("Final modeling shape:", perm_model.shape)
print("Approval rate:", perm_model["TARGET_APPROVED"].mean())

Missing columns: []
Final modeling shape: (74247, 15)
Approval rate: 0.38358452193354614


In [6]:
# ---------- Train/test split + preprocessing ----------

# Target & features
y = perm_model["TARGET_APPROVED"].astype(int)
X = perm_model.drop(columns=["TARGET_APPROVED"])

numeric_features = ["PW_WAGE_ANNUAL", "OFFER_WAGE_ANNUAL", "WAGE_RATIO"]
categorical_features = [c for c in X.columns if c not in numeric_features]

print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)

# Ensure categoricals are strings (avoids OneHotEncoder type errors)
for c in categorical_features:
    X[c] = X[c].astype(str)

numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

categorical_transformer = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=False
)

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
print("Overall approval rate:", y.mean())

Numeric features: ['PW_WAGE_ANNUAL', 'OFFER_WAGE_ANNUAL', 'WAGE_RATIO']
Categorical features: ['PW_SOC_CODE', 'NAICS_CODE', 'PW_WAGE', 'PW_UNIT_OF_PAY', 'WAGE_OFFER_FROM', 'WAGE_OFFER_TO', 'WAGE_OFFER_UNIT_OF_PAY', 'MINIMUM_EDUCATION', 'WORKSITE_STATE', 'FW_OWNERSHIP_INTEREST', 'FISCAL_YEAR']
Train shape: (59397, 14) Test shape: (14850, 14)
Overall approval rate: 0.38358452193354614


In [7]:
from collections import OrderedDict

models = OrderedDict()

# 1. Logistic Regression (correct baseline for binary outcome)
models["LogisticRegression"] = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
    n_jobs=-1
)

# 2. Decision Tree (interpretable)
models["DecisionTree"] = DecisionTreeClassifier(
    max_depth=8,
    min_samples_leaf=200,
    random_state=42
)

# 3. Naive Bayes (very fast baseline)
models["GaussianNB"] = GaussianNB()

# 4. Random Forest (strong, non-linear, handles interactions)
models["RandomForest"] = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=100,
    n_jobs=-1,
    class_weight="balanced_subsample",
    random_state=42
)

# 5. Neural Network (MLP) ‚Äì moderate size for runtime
models["NeuralNet_MLP"] = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    activation="relu",
    max_iter=30,
    random_state=42
)

print("Models defined:", list(models.keys()))

Models defined: ['LogisticRegression', 'DecisionTree', 'GaussianNB', 'RandomForest', 'NeuralNet_MLP']


In [8]:
# ---------- Cell 7: Training Loop with Classification Metrics ----------

results = []

for name, base_model in models.items():
    print(f"\n=== Training {name} ===")

    clf = Pipeline(steps=[
        ("preprocess", preprocess),
        ("clf", base_model),
    ])

    with mlflow.start_run(run_name=name):

        # ---- Fit model ----
        clf.fit(X_train, y_train)

        # ---- Get probability scores ----
        # All remaining models either support predict_proba or decision_function
        if hasattr(clf["clf"], "predict_proba"):
            y_proba = clf.predict_proba(X_test)[:, 1]

        elif hasattr(clf["clf"], "decision_function"):
            # Normalize decision function scores into [0, 1]
            scores = clf.decision_function(X_test)
            smin, smax = scores.min(), scores.max()
            y_proba = (scores - smin) / (smax - smin + 1e-9)

        else:
            # Fallback (should NOT occur for your current model list)
            y_proba = clf.predict(X_test)
            y_proba = np.clip(y_proba, 0, 1)

        # ---- Convert probs ‚Üí binary predictions (threshold = 0.5) ----
        y_pred = (y_proba >= 0.5).astype(int)

        # ---- Classification metrics ----
        test_accuracy  = accuracy_score(y_test, y_pred)
        test_f1        = f1_score(y_test, y_pred, zero_division=0)
        test_precision = precision_score(y_test, y_pred, zero_division=0)
        test_recall    = recall_score(y_test, y_pred, zero_division=0)
        roc_auc        = roc_auc_score(y_test, y_proba)

        # ---- Log metrics to MLflow ----
        mlflow.log_metric("test_accuracy",  test_accuracy)
        mlflow.log_metric("test_f1",        test_f1)
        mlflow.log_metric("test_precision", test_precision)
        mlflow.log_metric("test_recall",    test_recall)
        mlflow.log_metric("roc_auc",        roc_auc)

        # ---- Print on screen ----
        print(
            f"{name} | AUC: {roc_auc:.4f} | Acc: {test_accuracy:.4f} | "
            f"F1: {test_f1:.4f} | Prec: {test_precision:.4f} | Rec: {test_recall:.4f}"
        )

        # ---- Save fitted model pipeline ----
        joblib.dump(clf, f"model_{name}.pkl")

        # ---- Append to results table ----
        results.append({
            "model": name,
            "roc_auc": roc_auc,
            "accuracy": test_accuracy,
            "f1": test_f1,
            "precision": test_precision,
            "recall": test_recall,
        })

# ---- Convert to DataFrame ----
results_df = pd.DataFrame(results).sort_values("roc_auc", ascending=False)
results_df


=== Training LogisticRegression ===
LogisticRegression | AUC: 0.8102 | Acc: 0.7154 | F1: 0.6700 | Prec: 0.6033 | Rec: 0.7533


2025/11/29 06:33:14 INFO mlflow.tracking._tracking_service.client: üèÉ View run LogisticRegression at: https://dbc-ca0a3ce7-da46.cloud.databricks.com/ml/experiments/3214210273023929/runs/7ead78b0c7ec415c9d2617a964e02bad.
2025/11/29 06:33:14 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: https://dbc-ca0a3ce7-da46.cloud.databricks.com/ml/experiments/3214210273023929.



=== Training DecisionTree ===
DecisionTree | AUC: 0.6495 | Acc: 0.6228 | F1: 0.0325 | Prec: 1.0000 | Rec: 0.0165


2025/11/29 06:34:58 INFO mlflow.tracking._tracking_service.client: üèÉ View run DecisionTree at: https://dbc-ca0a3ce7-da46.cloud.databricks.com/ml/experiments/3214210273023929/runs/dd046f720f94423ab5fdc247c46b1ab6.
2025/11/29 06:34:58 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: https://dbc-ca0a3ce7-da46.cloud.databricks.com/ml/experiments/3214210273023929.



=== Training GaussianNB ===
GaussianNB | AUC: 0.5807 | Acc: 0.5083 | F1: 0.5817 | Prec: 0.4317 | Rec: 0.8915


2025/11/29 06:35:20 INFO mlflow.tracking._tracking_service.client: üèÉ View run GaussianNB at: https://dbc-ca0a3ce7-da46.cloud.databricks.com/ml/experiments/3214210273023929/runs/b429d07d79cb41b19f55941df020b503.
2025/11/29 06:35:20 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: https://dbc-ca0a3ce7-da46.cloud.databricks.com/ml/experiments/3214210273023929.



=== Training RandomForest ===
RandomForest | AUC: 0.7065 | Acc: 0.6216 | F1: 0.6077 | Prec: 0.5045 | Rec: 0.7642


2025/11/29 06:35:47 INFO mlflow.tracking._tracking_service.client: üèÉ View run RandomForest at: https://dbc-ca0a3ce7-da46.cloud.databricks.com/ml/experiments/3214210273023929/runs/f18c487a422a4c59802fbfddbc1eaeab.
2025/11/29 06:35:47 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: https://dbc-ca0a3ce7-da46.cloud.databricks.com/ml/experiments/3214210273023929.



=== Training NeuralNet_MLP ===




NeuralNet_MLP | AUC: 0.7889 | Acc: 0.7248 | F1: 0.6220 | Prec: 0.6572 | Rec: 0.5904


2025/11/29 06:44:09 INFO mlflow.tracking._tracking_service.client: üèÉ View run NeuralNet_MLP at: https://dbc-ca0a3ce7-da46.cloud.databricks.com/ml/experiments/3214210273023929/runs/15229794de3d40ffaf19cd7dce6cc681.
2025/11/29 06:44:09 INFO mlflow.tracking._tracking_service.client: üß™ View experiment at: https://dbc-ca0a3ce7-da46.cloud.databricks.com/ml/experiments/3214210273023929.


Unnamed: 0,model,roc_auc,accuracy,f1,precision,recall
0,LogisticRegression,0.810231,0.715421,0.67005,0.603346,0.753336
4,NeuralNet_MLP,0.788942,0.724781,0.622029,0.657221,0.590414
3,RandomForest,0.706511,0.621616,0.607749,0.504462,0.764221
1,DecisionTree,0.649459,0.622761,0.03247,1.0,0.016503
2,GaussianNB,0.580653,0.508283,0.581739,0.431729,0.891503


In [9]:
# ---------- Pick best model by ROC-AUC and save for deployment ----------

best_row = results_df.iloc[0]
best_model_name = best_row["model"]
print("üèÜ Best model by ROC-AUC:", best_model_name)
print(best_row)

# Load the saved pipeline for that model
best_model = joblib.load(f"model_{best_model_name}.pkl")

# Save under a standard name for Streamlit
joblib.dump(best_model, "model_perm_best.pkl")
print("‚úÖ Saved best model as model_perm_best.pkl")

# Optional: download to your local machine (for Streamlit app)
# from google.colab import files
# files.download("model_perm_best.pkl")

üèÜ Best model by ROC-AUC: LogisticRegression
model        LogisticRegression
roc_auc                0.810231
accuracy               0.715421
f1                      0.67005
precision              0.603346
recall                 0.753336
Name: 0, dtype: object
‚úÖ Saved best model as model_perm_best.pkl


In [10]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import joblib

@st.cache_resource
def load_model():
    return joblib.load("model_perm_best.pkl")

model = load_model()

st.title("EB-2 PERM Approval Probability Estimator (FY2024)")

st.markdown("This tool estimates the probability that a PERM case will be certified using FY2024 data.")

col1, col2 = st.columns(2)

with col1:
    pw_soc_code = st.text_input("Prevailing Wage SOC Code", "15-1252")
    naics_code = st.text_input("Employer NAICS Code", "5415")
    minimum_education = st.selectbox(
        "Minimum Education Required",
        ["High School", "Associate", "Bachelor's", "Master's", "Doctorate"],
        index=2
    )
    worksite_state = st.text_input("Worksite State", "AZ")

with col2:
    pw_wage = st.number_input("Prevailing Wage Amount", min_value=0.0, value=90000.0)
    pw_unit = st.selectbox("Prevailing Wage Unit", ["Year", "Month", "Week", "Bi-Weekly", "Hour"])
    wage_offer_from = st.number_input("Wage Offer From", min_value=0.0, value=95000.0)
    wage_offer_to   = st.number_input("Wage Offer To", min_value=0.0, value=105000.0)
    wage_offer_unit = st.selectbox("Wage Offer Unit", ["Year", "Month", "Week", "Bi-Weekly", "Hour"])

ownership_interest = st.selectbox(
    "Ownership Interest?",
    ["N", "Y"],
    index=0
)

if st.button("Estimate Approval Probability"):

    input_row = {
        "PW_SOC_CODE": [pw_soc_code],
        "NAICS_CODE": [naics_code],
        "PW_WAGE": [pw_wage],
        "PW_UNIT_OF_PAY": [pw_unit],
        "WAGE_OFFER_FROM": [wage_offer_from],
        "WAGE_OFFER_TO": [wage_offer_to],
        "WAGE_OFFER_UNIT_OF_PAY": [wage_offer_unit],
        "MINIMUM_EDUCATION": [minimum_education],
        "WORKSITE_STATE": [worksite_state],
        "FW_OWNERSHIP_INTEREST": [ownership_interest],
        "FISCAL_YEAR": [2024],
    }

    df = pd.DataFrame(input_row)
    prob = model.predict_proba(df)[:, 1][0] * 100

    st.subheader(f"Approval Probability: {prob:.1f}%")

Writing app.py


In [11]:
from google.colab import files
files.download("app.py")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
%%writefile requirements.txt
streamlit
pandas
numpy
scikit-learn==1.5.1
joblib

Writing requirements.txt


In [13]:
from google.colab import files
files.download("requirements.txt")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>