In [1]:
import numpy as np, pandas as pd, onnxruntime as ort
from pathlib import Path
from sklearn.metrics import accuracy_score, roc_auc_score

In [2]:
DATA_PATH  = 'data/synth_data_for_training.csv'
MODEL_PATH = 'model/model_1.onnx'   # change to model_2.onnx to test the other
THRESHOLD  = 0.50
MAX_ROWS   = 20000

In [3]:
DATA_PATH, MODEL_PATH = Path(DATA_PATH), Path(MODEL_PATH)
assert DATA_PATH.exists(), f"Missing data at {DATA_PATH}"
assert MODEL_PATH.exists(), f"Missing model at {MODEL_PATH}"

df = pd.read_csv(DATA_PATH)
label_col = next((c for c in ['checked','label','target','y','is_fraud','Outcome'] if c in df.columns), None)

# pick columns (gender/age) robustly by name patterns
gender_col = next((c for c in df.columns if c.lower() == 'persoon_geslacht_vrouw'), None)
age_col = next((c for c in df.columns if c.lower() == 'persoon_leeftijd_bij_onderzoek'), None)

print('Data shape:', df.shape)
print('Label column:', label_col)
print('Gender column:', gender_col)
print('Age column:', age_col)

# Separate X / y
y = df[label_col].astype(int).to_numpy() if label_col else None
Xdf = df.drop(columns=[label_col], errors='ignore').copy()

# Coerce to numeric matrix for ONNX
for c in Xdf.columns:
    Xdf[c] = pd.to_numeric(Xdf[c], errors='coerce')
Xdf = Xdf.fillna(0.0)

# Optional sampling
if len(Xdf) > MAX_ROWS:
    Xdf = Xdf.sample(MAX_ROWS, random_state=0)
    if y is not None:
        y = y[Xdf.index]

print('X shape:', Xdf.shape)

Data shape: (12645, 316)
Label column: checked
Gender column: persoon_geslacht_vrouw
Age column: persoon_leeftijd_bij_onderzoek
X shape: (12645, 315)


In [4]:
def load_onnx(model_path: Path):
    sess = ort.InferenceSession(str(model_path), providers=['CPUExecutionProvider'])
    in_name  = sess.get_inputs()[0].name
    out_name = sess.get_outputs()[0].name
    return sess, in_name, out_name

def predict_proba(sess, in_name, out_name, Xdf: pd.DataFrame):
    X = Xdf.to_numpy().astype(np.float32)
    out = sess.run([out_name], {in_name: X})[0]
    if out.ndim == 1: 
        return out
    if out.shape[1] == 1: 
        return out.ravel()
    return out[:, -1]   # positive class

sess, in_name, out_name = load_onnx(MODEL_PATH)
probs_original = predict_proba(sess, in_name, out_name, Xdf)
pred_original  = (probs_original >= THRESHOLD).astype(int)

print('ONNX input:', in_name, '| output:', out_name, '| preds:', probs_original.shape)

# Baseline accuracy (if available)
if y is not None:
    acc = accuracy_score(y, pred_original)
    try:
        auc = roc_auc_score(y, probs_original)
    except Exception:
        auc = None
    print({'baseline_accuracy': acc, 'baseline_auc': auc})
else:
    print('No label column; skipping baseline accuracy.')

ONNX input: X | output: output_label | preds: (12645,)
{'baseline_accuracy': 0.9680506128904706, 'baseline_auc': 0.8782587856095918}


In [5]:
assert gender_col is not None, 'Could not find the gender column (persoon_geslacht_vrouw).'
X_gender = Xdf.copy()
# Flip 0 <-> 1; if any values are not {0,1}, we round and then flip safely.
g = X_gender[gender_col].round().clip(0,1).astype(int)
X_gender[gender_col] = 1 - g

probs_gender = predict_proba(sess, in_name, out_name, X_gender)
pred_gender  = (probs_gender >= THRESHOLD).astype(int)

changed_preds_gender = int(np.sum(pred_original != pred_gender))
pct_changed_gender   = float(np.mean(pred_original != pred_gender))

acc_gender = accuracy_score(y, pred_gender) if y is not None else None

print({'changed_predictions': changed_preds_gender, 
       'pct_changed': pct_changed_gender, 
       'accuracy_after': acc_gender})

{'changed_predictions': 157, 'pct_changed': 0.012415974693554765, 'accuracy_after': 0.9654408857255833}


In [6]:
assert age_col is not None, 'Could not find the age column (persoon_leeftijd_*).'
X_age = Xdf.copy()
X_age[age_col] = np.clip(X_age[age_col] + 1, 0, 130)  # keep ages reasonable

probs_age = predict_proba(sess, in_name, out_name, X_age)
pred_age  = (probs_age >= THRESHOLD).astype(int)

changed_preds_age = int(np.sum(pred_original != pred_age))
pct_changed_age   = float(np.mean(pred_original != pred_age))

acc_age = accuracy_score(y, pred_age) if y is not None else None

print({'changed_predictions': changed_preds_age, 
       'pct_changed': pct_changed_age, 
       'accuracy_after': acc_age})

{'changed_predictions': 122, 'pct_changed': 0.009648082245947015, 'accuracy_after': 0.962514827995255}
