In [1]:
import pandas as pd
import numpy as np


In [2]:
# Input CSV that every later cell reads its columns from.
FILE_PATH = "txn_behavior_clean_data.csv"

# Load the whole file into the working frame used throughout the notebook.
df = pd.read_csv(filepath_or_buffer=FILE_PATH)

In [3]:
# Weights applied by calculate_risk to the normalised transaction count,
# the normalised transaction amount, and the high-risk-country column.
w_txn, w_amt, w_country = 0.4, 0.4, 0.2

In [4]:
# Score cut-offs: at or above threshold_review -> "Review",
# at or above threshold_medium (but below review) -> "Monitor".
threshold_review, threshold_medium = 0.45, 0.20

In [5]:
# Columns the input frame must provide; the check below fails fast if any
# are absent.
required_columns = [
    "customer_id",
    "txn_frequency",
    "avg_txn_value",
    "high_risk_country",
    "prior_alert_flag",
]

# Anything listed above that the loaded frame does not contain.
missing_cols = set(required_columns).difference(df.columns)
if len(missing_cols) > 0:
    raise ValueError(f"Missing required columns: {missing_cols}")

In [6]:
def min_max_normalize(series: pd.Series) -> pd.Series:
    """Rescale a numeric Series linearly onto the [0, 1] interval.

    Parameters
    ----------
    series : pd.Series
        Numeric values to rescale.

    Returns
    -------
    pd.Series
        ``(series - min) / (max - min)``, carrying the same index as
        ``series``. When every value is identical (zero range) the result is
        all 0.0 instead of a division by zero.
    """
    min_val = series.min()
    max_val = series.max()

    if max_val == min_val:
        # Build the zeros on the caller's index (and keep its name): a Series
        # with a fresh RangeIndex would be label-aligned on assignment back
        # into a frame and turn into NaN for any non-default index.
        return pd.Series(0.0, index=series.index, name=series.name)

    return (series - min_val) / (max_val - min_val)

# Add a min-max scaled copy of each raw behaviour column (source -> target).
for _raw_col, _norm_col in (
    ("txn_frequency", "norm_txn_cnt"),
    ("avg_txn_value", "norm_txn_amt"),
):
    df[_norm_col] = min_max_normalize(df[_raw_col])


In [7]:
def calculate_risk(df, w_txn, w_amt, w_country):
    """Return the weighted sum of the three risk components, row by row.

    Reads the ``norm_txn_cnt``, ``norm_txn_amt`` and ``high_risk_country``
    columns of ``df`` and multiplies them by ``w_txn``, ``w_amt`` and
    ``w_country`` respectively before adding them together. ``df`` itself is
    not modified.
    """
    count_part = w_txn * df["norm_txn_cnt"]
    amount_part = w_amt * df["norm_txn_amt"]
    country_part = w_country * df["high_risk_country"]
    return count_part + amount_part + country_part


In [8]:
# Primary score: the weighted components combined with the configured weights.
df["fraud_risk_score"] = calculate_risk(
    df=df, w_txn=w_txn, w_amt=w_amt, w_country=w_country
)


In [9]:
# Alternative weightings, each given as (txn weight, amount weight, country weight).
# NOTE(review): the "baseline" tuple repeats the w_txn / w_amt / w_country
# values as literals, so it must be edited by hand if those weights change.
what_if_scenarios = {
    "baseline": (0.4, 0.4, 0.2),
    "high_txn_focus": (0.6, 0.3, 0.1),
    "high_amt_focus": (0.3, 0.6, 0.1),
    "high_geo_focus": (0.3, 0.3, 0.4),
}

# One extra column per scenario, rounded to three decimals.
for scenario_name, scenario_weights in what_if_scenarios.items():
    scenario_score = calculate_risk(df, *scenario_weights)
    df[f"risk_score_{scenario_name}"] = scenario_score.round(3)


In [10]:
def get_action(score):
    """Translate a single risk score into an action label.

    Uses the module-level ``threshold_review`` and ``threshold_medium``:
    scores at or above the review threshold give "Review", scores at or above
    the medium threshold give "Monitor", and everything else gives
    "No Action".
    """
    if score >= threshold_review:
        return "Review"
    if score >= threshold_medium:
        return "Monitor"
    return "No Action"

# Action label per customer, decided by get_action against the thresholds.
df["recommended_action"] = df["fraud_risk_score"].apply(get_action)

# Band edges are left-closed (right=False) so a score sitting exactly on a
# threshold gets the band matching the ">=" tests in get_action and
# predicted_flag. The outer edges are open-ended so a score of exactly 0.0
# (or 1.0) is still banded instead of becoming NaN.
df["risk_band"] = pd.cut(
    df["fraud_risk_score"],
    bins=[-np.inf, threshold_medium, threshold_review, np.inf],
    right=False,
    labels=["Low", "Medium", "High"],
)

# 1 when the score reaches the review threshold, otherwise 0.
df["predicted_flag"] = (df["fraud_risk_score"] >= threshold_review).astype(int)


In [11]:
# Sanity check: every score must lie in the closed interval [0, 1].
assert (
    df["fraud_risk_score"].ge(0) & df["fraud_risk_score"].le(1)
).all(), "Risk score out of bounds"

In [12]:
# Rank customers from highest to lowest score. A stable sort is requested so
# customers with equal scores come out in a reproducible order on every run.
df = df.sort_values("fraud_risk_score", ascending=False, kind="mergesort")

# Headline numbers: how many customers were routed to "Review".
review_count = (df["recommended_action"] == "Review").sum()
total = len(df)
# Report 0.0% rather than dividing by zero when the frame is empty.
review_share = review_count / total if total else 0.0

print(f"Recommended for review: {review_count}/{total} ({review_share:.1%})")
print("\nTop 10 highest risk customers:")
print(
    df[["customer_id", "fraud_risk_score", "risk_band", "recommended_action"]]
    .head(10)
    .to_string(index=False)
)


Recommended for review: 5/20 (25.0%)

Top 10 highest risk customers:
customer_id  fraud_risk_score risk_band recommended_action
     CX-014          0.558890      High             Review
     CX-001          0.533731      High             Review
     CX-004          0.531429      High             Review
     CX-020          0.478355      High             Review
     CX-008          0.464049      High             Review
     CX-006          0.418014    Medium            Monitor
     CX-009          0.401961    Medium            Monitor
     CX-013          0.285434    Medium            Monitor
     CX-010          0.228571    Medium            Monitor
     CX-015          0.207843    Medium            Monitor


In [13]:
# Round the normalised inputs and the primary score to three decimals.
# This runs after the action, band and flag columns were derived, so those
# columns reflect the unrounded score.
_cols_to_round = ["norm_txn_cnt", "norm_txn_amt", "fraud_risk_score"]
df[_cols_to_round] = df[_cols_to_round].round(3)


In [14]:
# Write the scored frame to disk without the index column, then confirm.
df.to_csv(path_or_buf="customer_risk_output.csv", index=False)
print("\nSaved: customer_risk_output.csv")


Saved: customer_risk_output.csv
