In [5]:
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import GroupShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance

# 1) Daten laden â€“ Pfad relativ zum Notebook
excel_path = Path("..") / "data" / "raw" / "PSP_Jan_Feb_2019.xlsx"
df = pd.read_excel(excel_path)

print("Shape:", df.shape)
print(df.head(3))

# 2) Feature Engineering

# Zeitspalten
df["tmsp"] = pd.to_datetime(df["tmsp"], errors="coerce")
df["tmsp_min"] = df["tmsp"].dt.floor("T")
df["hour"] = df["tmsp"].dt.hour
df["dow"] = df["tmsp"].dt.dayofweek

# Kaufversuch-Identifikation 
if "attempt_id" not in df.columns:
    df["attempt_id"] = (
        df["tmsp_min"].astype(str)
        + "|" + df["country"].astype(str)
        + "|" + df["amount"].astype(str)
    )
if "attempt_no" not in df.columns:
    df["attempt_no"] = df.groupby("attempt_id").cumcount() + 1
if "is_retry" not in df.columns:
    df["is_retry"] = (df["attempt_no"] > 1).astype(int)

# Sicherheitstypen korrekt als int
df["3D_secured"] = df["3D_secured"].astype(int)
df["is_retry"] = df["is_retry"].astype(int)

print(df.dtypes)


Shape: (50410, 8)
   Unnamed: 0                tmsp  country  amount  success      PSP  \
0           0 2019-01-01 00:01:11  Germany      89        0  UK_Card   
1           1 2019-01-01 00:01:17  Germany      89        1  UK_Card   
2           2 2019-01-01 00:02:49  Germany     238        0  UK_Card   

   3D_secured    card  
0           0    Visa  
1           0    Visa  
2           1  Diners  


  df["tmsp_min"] = df["tmsp"].dt.floor("T")


Unnamed: 0             int64
tmsp          datetime64[ns]
country               object
amount                 int64
success                int64
PSP                   object
3D_secured             int64
card                  object
tmsp_min      datetime64[ns]
hour                   int32
dow                    int32
attempt_id            object
attempt_no             int64
is_retry               int64
dtype: object


In [None]:
# 3) GebÃ¼hrenmatrix
FEES = {
    "Moneycard":  {"success": 5.0,  "fail": 2.0},
    "Goldcard":   {"success": 10.0, "fail": 5.0},
    "UK_Card":    {"success": 3.0,  "fail": 1.0},
    "Simplecard": {"success": 1.0,  "fail": 0.5},
}

# 4) Faire Train/Test-Trennung 
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(df, groups=df["attempt_id"]))
train, test = df.iloc[train_idx].copy(), df.iloc[test_idx].copy()

print("Train/Test sizes:", train.shape, test.shape)

# 5) BASELINE â€“ Kosten-optimiert aus Historie
p_success_psp = train.groupby("PSP")["success"].mean()

def expected_cost(psp, p_success):
    return p_success * FEES[psp]["success"] + (1 - p_success) * FEES[psp]["fail"]

exp_cost_psp = {psp: expected_cost(psp, p) for psp, p in p_success_psp.items()}
baseline_global_psp = min(exp_cost_psp, key=exp_cost_psp.get)

SEG = ["country", "card", "3D_secured"]
seg_rates = train.groupby(SEG + ["PSP"])["success"].mean().reset_index()

rows = []
for keys, part in seg_rates.groupby(SEG):
    costs = {row["PSP"]: expected_cost(row["PSP"], row["success"]) for _, row in part.iterrows()}
    best_psp = min(costs, key=costs.get)
    rows.append((*keys, best_psp))
seg_best = pd.DataFrame(rows, columns=SEG + ["best_psp"])

def baseline_segmented(row):
    m = seg_best.merge(pd.DataFrame([row[SEG].to_dict()]), on=SEG, how="inner")
    return m["best_psp"].iloc[0] if len(m) else baseline_global_psp

test["psp_baseline_global"] = baseline_global_psp
test["psp_baseline_seg"] = test.apply(baseline_segmented, axis=1)

def realized_cost(row):
    psp = row["PSP"]
    return FEES[psp]["success"] if row["success"] == 1 else FEES[psp]["fail"]

test["realized_cost_observed"] = test.apply(realized_cost, axis=1)

def realized_cost_for_policy(row, policy_col):
    chosen = row[policy_col]
    if chosen != row["PSP"]:
        return np.nan
    return FEES[chosen]["success"] if row["success"] == 1 else FEES[chosen]["fail"]

for col in ["psp_baseline_global", "psp_baseline_seg"]:
    test[f"real_cost_{col}"] = test.apply(realized_cost_for_policy, axis=1, policy_col=col)

def summarize_policy(col):
    sub = test[~test[f"real_cost_{col}"].isna()]
    return {
        "policy": col,
        "coverage": len(sub) / len(test),
        "mean_cost": sub[f"real_cost_{col}"].mean(),
        "success_rate": sub["success"].mean()
    }

baseline_eval = pd.DataFrame([
    summarize_policy("psp_baseline_global"),
    summarize_policy("psp_baseline_seg")
])
print("\nBaseline (Agreement-Subset):")
print(baseline_eval)

# 6) PrÃ¤zises Vorhersagemodell
CAT = ["country", "card", "3D_secured", "is_retry", "dow", "hour"]
NUM = ["amount"]
FEATURES = CAT + NUM

for c in ["3D_secured", "is_retry", "dow", "hour"]:
    train[c] = train[c].astype(int)
    test[c]  = test[c].astype(int)

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), CAT),
        ("num", StandardScaler(), NUM),
    ]
)

base_clf = LogisticRegression(max_iter=1000, class_weight="balanced")

models = {}
for psp in FEES.keys():
    tr_psp = train[train["PSP"] == psp]
    if tr_psp.empty:
        continue
    X = tr_psp[FEATURES]
    y = tr_psp["success"]
    pipe = Pipeline(steps=[("pre", preprocess), ("clf", base_clf)])
    pipe.fit(X, y)
    models[psp] = pipe

def expected_cost_row(row):
    Xrow = pd.DataFrame([row[FEATURES].to_dict()])
    costs = {}
    for psp, model in models.items():
        p = float(model.predict_proba(Xrow)[:, 1])
        c = p * FEES[psp]["success"] + (1 - p) * FEES[psp]["fail"]
        costs[psp] = (p, c)
    best_psp = min(costs, key=lambda k: costs[k][1])
    out = {"psp_model_choice": best_psp}
    out.update({f"p_{k}": v[0] for k, v in costs.items()})
    out.update({f"c_{k}": v[1] for k, v in costs.items()})
    return pd.Series(out)

pred = test.apply(expected_cost_row, axis=1)
test = pd.concat([test, pred], axis=1)

test["real_cost_psp_model_choice"] = test.apply(
    realized_cost_for_policy, axis=1, policy_col="psp_model_choice"
)

sub = test[~test["real_cost_psp_model_choice"].isna()]
policy_eval = pd.DataFrame([{
    "policy": "model_router",
    "coverage": len(sub)/len(test),
    "mean_cost": sub["real_cost_psp_model_choice"].mean(),
    "success_rate": sub["success"].mean()
}])

print("\nModel-Router (Agreement-Subset):")
print(policy_eval)

# Modellbasierte erwartete Kosten
test["exp_cost_model_choice"] = test[[c for c in test.columns if c.startswith("c_")]].min(axis=1)

def exp_cost_of_policy_choice(row, policy_col):
    chosen = row[policy_col]
    return row[f"c_{chosen}"]

test["exp_cost_baseline_global"] = test.apply(exp_cost_of_policy_choice, axis=1, policy_col="psp_baseline_global")
test["exp_cost_baseline_seg"]    = test.apply(exp_cost_of_policy_choice, axis=1, policy_col="psp_baseline_seg")

model_based_eval = pd.DataFrame({
    "policy": ["model_router", "baseline_global", "baseline_seg"],
    "mean_expected_cost": [
        test["exp_cost_model_choice"].mean(),
        test["exp_cost_baseline_global"].mean(),
        test["exp_cost_baseline_seg"].mean()
    ]
})
print("\nModel-based (expected) costs â€“ full coverage:")
print(model_based_eval)

# 7) Interpretierbarkeit
try:
    psp = "UK_Card" if "UK_Card" in models else list(models.keys())[0]
    pipe = models[psp]
    X_test = test[FEATURES]
    y_test = test["success"]
    res = permutation_importance(pipe, X_test, y_test, n_repeats=5, random_state=42, scoring="roc_auc")
    try:
        feat_names = pipe.named_steps["pre"].get_feature_names_out()
    except Exception:
        feat_names = [f"f{i}" for i in range(len(res.importances_mean))]
    imp = (
        pd.DataFrame({"feature": feat_names, "importance": res.importances_mean})
        .sort_values("importance", ascending=False)
        .head(15)
    )
    print(f"\nPermutation Importance â€“ Top-Features fÃ¼r PSP '{psp}':")
    print(imp)
except Exception as e:
    print("\nPermutation Importance konnte nicht berechnet werden:", e)

# 8) Segment-Auswertung
def cost_from_psp_and_success(psp, success):
    return FEES[psp]["success"] if success == 1 else FEES[psp]["fail"]

test["cost_observed"] = test.apply(lambda r: cost_from_psp_and_success(r["PSP"], r["success"]), axis=1)
test["cost_model_expected"] = test["exp_cost_model_choice"]

seg_cols = ["country", "card"]
seg_report = (
    test.groupby(seg_cols)
    .agg(
        n=("success", "size"),
        success_rate=("success", "mean"),
        cost_observed=("cost_observed", "mean"),
        cost_model_expected=("cost_model_expected", "mean"),
    )
    .reset_index()
    .sort_values("n", ascending=False)
    .head(12)
)

print("\nSegment-Report (Top nach Volumen):")
print(seg_report)


Train/Test sizes: (40345, 14) (10065, 14)

Baseline (Agreement-Subset):
                policy  coverage  mean_cost  success_rate
0  psp_baseline_global  0.252658   0.578844      0.157688
1     psp_baseline_seg  0.252658   0.578844      0.157688


  p = float(model.predict_proba(Xrow)[:, 1])
  p = float(model.predict_proba(Xrow)[:, 1])
  p = float(model.predict_proba(Xrow)[:, 1])
  p = float(model.predict_proba(Xrow)[:, 1])
  p = float(model.predict_proba(Xrow)[:, 1])
  p = float(model.predict_proba(Xrow)[:, 1])
  p = float(model.predict_proba(Xrow)[:, 1])
  p = float(model.predict_proba(Xrow)[:, 1])
  p = float(model.predict_proba(Xrow)[:, 1])
  p = float(model.predict_proba(Xrow)[:, 1])
  p = float(model.predict_proba(Xrow)[:, 1])
  p = float(model.predict_proba(Xrow)[:, 1])
  p = float(model.predict_proba(Xrow)[:, 1])
  p = float(model.predict_proba(Xrow)[:, 1])
  p = float(model.predict_proba(Xrow)[:, 1])
  p = float(model.predict_proba(Xrow)[:, 1])
  p = float(model.predict_proba(Xrow)[:, 1])
  p = float(model.predict_proba(Xrow)[:, 1])
  p = float(model.predict_proba(Xrow)[:, 1])
  p = float(model.predict_proba(Xrow)[:, 1])
  p = float(model.predict_proba(Xrow)[:, 1])
  p = float(model.predict_proba(Xrow)[:, 1])
  p = floa