In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# CSV location on the mounted Drive; the literal is split only for readability
# and concatenates to exactly the same path string.
path = (
    "/content/drive/MyDrive/Dataset DSCI 510/"
    "combined_nhis_dataset_with_fraud_types (1).csv"
)

# Read the whole file into memory and preview the first rows.
df = pd.read_csv(path)
df.head()

Unnamed: 0,Patient ID,AGE,GENDER,DATE OF ENCOUNTER,DATE OF DISCHARGE,Amount Billed,DIAGNOSIS,FRAUD_TYPE
0,1,25.0,F,2025-02-11,2025-02-11,16800.0,CYESIS LMP,Phantom Billing
1,2,30.0,M,2025-02-13,2025-02-13,6300.0,WAX IMPACTION,Wrong Diagnosis
2,3,35.0,M,2025-02-13,2025-02-13,6160.0,CYESIS LMP,Wrong Diagnosis
3,4,48.0,M,2025-02-18,2025-02-18,0.0,TONSILITIS OBSTRUCTIVE SLEEP APEANA,Ghost Enrollee
4,5,58.0,F,2025-02-18,2025-02-18,8400.0,REFRACTIVE ERROR,No Fraud


In [None]:
# Share of rows per raw fraud label; missing labels are counted as their own bucket.
df["FRAUD_TYPE"].value_counts(normalize=True, dropna=False)

Unnamed: 0_level_0,proportion
FRAUD_TYPE,Unnamed: 1_level_1
No Fraud,0.574063
Phantom Billing,0.207622
Ghost Enrollee,0.20105
Wrong Diagnosis,0.017265


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(20388, 8)

In [None]:
# Preview the first five rows of the loaded frame.
df.head()

Unnamed: 0,Patient ID,AGE,GENDER,DATE OF ENCOUNTER,DATE OF DISCHARGE,Amount Billed,DIAGNOSIS,FRAUD_TYPE
0,1,25.0,F,2025-02-11,2025-02-11,16800.0,CYESIS LMP,Phantom Billing
1,2,30.0,M,2025-02-13,2025-02-13,6300.0,WAX IMPACTION,Wrong Diagnosis
2,3,35.0,M,2025-02-13,2025-02-13,6160.0,CYESIS LMP,Wrong Diagnosis
3,4,48.0,M,2025-02-18,2025-02-18,0.0,TONSILITIS OBSTRUCTIVE SLEEP APEANA,Ghost Enrollee
4,5,58.0,F,2025-02-18,2025-02-18,8400.0,REFRACTIVE ERROR,No Fraud


In [None]:
# Normalise header names: cast to str, trim surrounding whitespace, lower-case.
# NOTE(review): internal spaces are kept, so a header such as "Patient ID" becomes
# "patient id" (not "patient_id") — confirm that later lookups use the same spelling.
df.columns = df.columns.map(lambda name: str(name).strip().lower())

In [None]:
import numpy as np

# Name of the label column after header normalisation; reused by later cells.
fraud_type_col = "fraud_type"

# Label text, trimmed and lower-cased so spelling/case variants compare equal.
s = df[fraud_type_col].astype(str).str.strip().str.lower()

# Binary target: 1 for phantom billing / ghost enrollee (either spelling),
# 0 for every other label value.
df["fraud_binary"] = s.isin(["phantom billing", "ghost enrolee", "ghost enrollee"]).astype(int)

In [None]:
# Class balance of the derived binary target (proportions, NaN counted if present).
df["fraud_binary"].value_counts(normalize=True, dropna=False)

Unnamed: 0_level_0,proportion
fraud_binary,Unnamed: 1_level_1
0,0.591328
1,0.408672


In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score

try:
    from imblearn.over_sampling import SMOTE
    from imblearn.pipeline import Pipeline as ImbPipeline
except Exception:
    import sys
    !{sys.executable} -m pip -q install imbalanced-learn
    from imblearn.over_sampling import SMOTE
    from imblearn.pipeline import Pipeline as ImbPipeline

try:
    from IPython.display import display
except Exception:
    display = print

In [None]:
def make_lift_table(y_true: pd.Series, y_prob: np.ndarray, n_bins: int = 10) -> pd.DataFrame:
    """Build a lift table by ranking rows on predicted score and binning the ranks.

    Rows are ranked by ``y_prob`` in descending order (ties broken by position),
    the ranks are cut into ``n_bins`` quantile bins labelled 1..n_bins (bin 1 holds
    the highest scores), and per-bin counts, responders, response rate, lift and
    mean score are reported.

    Parameters
    ----------
    y_true : array-like of 0/1 labels (cast to int).
    y_prob : array-like of scores (cast to float), same length as ``y_true``.
    n_bins : number of quantile bins; must be at least 1.

    Returns
    -------
    pd.DataFrame with columns ``decile, count, responders, response_rate,
    overall_rate, lift, avg_score``. ``lift`` is NaN when there are no positives.
    An empty input yields an empty frame with these columns.

    Raises
    ------
    ValueError
        If ``n_bins < 1`` or the two inputs differ in length.
    """
    columns = ["decile", "count", "responders", "response_rate", "overall_rate", "lift", "avg_score"]

    if n_bins < 1:
        raise ValueError(f"n_bins must be >= 1, got {n_bins}")

    labels = pd.Series(y_true).astype(int).reset_index(drop=True)
    scores = pd.Series(np.asarray(y_prob).astype(float)).reset_index(drop=True)

    # Fail early with a clear message instead of letting index alignment pad the
    # shorter input with NaN and break later inside qcut/astype.
    if len(labels) != len(scores):
        raise ValueError(
            f"y_true and y_prob must have the same length, got {len(labels)} and {len(scores)}"
        )

    # Nothing to bin: return the table schema with zero rows.
    if labels.empty:
        return pd.DataFrame(columns=columns)

    scored = pd.DataFrame({"y_true": labels, "y_prob": scores})
    # method="first" makes every rank unique, so quantile bin edges never collide on ties.
    scored["rank"] = scored["y_prob"].rank(method="first", ascending=False)
    scored["decile"] = pd.qcut(scored["rank"], q=n_bins, labels=range(1, n_bins + 1)).astype(int)

    overall_rate = scored["y_true"].mean()
    grouped = (
        scored.groupby("decile", as_index=False)
        .agg(count=("y_true", "size"), responders=("y_true", "sum"), avg_score=("y_prob", "mean"))
        .sort_values("decile")
        .reset_index(drop=True)
    )
    grouped["response_rate"] = grouped["responders"] / grouped["count"]
    grouped["overall_rate"] = overall_rate

    # Lift is undefined when the base rate is zero.
    if overall_rate > 0:
        grouped["lift"] = grouped["response_rate"] / overall_rate
    else:
        grouped["lift"] = np.nan

    return grouped[columns]

In [None]:
def top_decile_lift(lift_table: pd.DataFrame) -> float:
    """Return the ``lift`` value of the row whose ``decile`` equals 1.

    If no such row exists, NaN is returned. When several rows match, the first
    one in table order is used.
    """
    in_first_bin = lift_table["decile"] == 1
    matches = lift_table.loc[in_first_bin, "lift"]
    if len(matches) == 0:
        return float("nan")
    return float(matches.iloc[0])

In [None]:
# Work on a copy of df so the feature engineering below never touches the original frame.
X2 = df.copy()

# Target vector, taken before the label columns are removed from the features.
y = X2["fraud_binary"].astype(int)

# Remove the target and the raw label text (names that are absent are ignored).
X2 = X2.drop(columns=["fraud_binary", "fraud_type"], errors="ignore")

# 1) Drop truly non-informative unique IDs (a listed name that is absent is skipped)
for col in ["claim_id"]:
    X2 = X2.drop(columns=[col], errors="ignore")

# 2) Reduce high-cardinality categorical/text columns by keeping top-K
def cap_top_k(df_in, col, k=30):
    """Collapse all but the ``k`` most frequent values of ``col`` into ``"other"``.

    Values are cast to pandas ``string`` dtype, stripped and lower-cased before
    counting; missing values become the literal ``"missing"`` and compete for a
    top-K slot like any other value.

    Parameters
    ----------
    df_in : DataFrame to process. It is not modified.
    col : name of the column to cap.
    k : number of most frequent normalised values to keep as-is.

    Returns
    -------
    ``df_in`` itself when ``col`` is not one of its columns; otherwise a new
    DataFrame in which ``col`` is replaced by the capped, normalised values.
    """
    if col not in df_in.columns:
        return df_in

    normalized = df_in[col].astype("string").str.strip().str.lower().fillna("missing")
    top_values = normalized.value_counts().nlargest(k).index
    capped = normalized.where(normalized.isin(top_values), "other")

    # Write into a copy: the caller's frame may still be in use elsewhere (or be a
    # slice of another frame), so mutating it in place would be a hidden side effect.
    df_out = df_in.copy()
    df_out[col] = capped
    return df_out

# 3) Split (important: do split BEFORE any encoding that learns frequencies).
# The top-K vocabulary below is itself learned from value frequencies, so the split
# has to come first; otherwise test rows influence which categories are kept.
X_train, X_test, y_train, y_test = train_test_split(
    X2, y, test_size=0.2, random_state=42, stratify=y
)

for col in ["diagnosis", "treatment"]:
    if col not in X_train.columns:
        continue
    # Learn the kept categories on the training rows only.
    X_train = cap_top_k(X_train, col, k=30)
    kept_values = set(X_train[col].unique())
    # Apply the same normalisation to the test rows; anything outside the training
    # vocabulary is mapped to "other".
    test_values = X_test[col].astype("string").str.strip().str.lower().fillna("missing")
    X_test = X_test.assign(**{col: test_values.where(test_values.isin(kept_values), "other")})

# 4) Frequency encode provider_id/patient_id (training-only mapping, no leakage)
def add_freq_encoding(train_df, test_df, col):
    """Replace ``col`` with its training-set frequency in both frames.

    Values are cast to pandas ``string`` dtype with missing entries treated as the
    literal ``"missing"``. Counts are computed on ``train_df`` only; a test value
    that never occurs in training gets 0.

    Parameters
    ----------
    train_df, test_df : DataFrames holding ``col``. Neither is modified.
    col : name of the column to encode.

    Returns
    -------
    ``(train_df, test_df)`` unchanged when ``col`` is not in ``train_df``;
    otherwise two new frames where ``col`` is dropped and a float column
    ``col + "_freq"`` is added.
    """
    if col not in train_df.columns:
        return train_df, test_df

    train_keys = train_df[col].astype("string").fillna("missing")
    test_keys = test_df[col].astype("string").fillna("missing")
    freq = train_keys.value_counts()
    freq_col = col + "_freq"

    # drop()/assign() return new frames, so the inputs (typically slices produced
    # by train_test_split) are never written to.
    train_out = train_df.drop(columns=[col]).assign(
        **{freq_col: train_keys.map(freq).astype(float)}
    )
    test_out = test_df.drop(columns=[col]).assign(
        **{freq_col: test_keys.map(freq).fillna(0).astype(float)}
    )
    return train_out, test_out

# Frequency-encode the ID-like columns; a name missing from X_train is a no-op.
# NOTE(review): headers in this notebook are only stripped and lower-cased, so a
# source column "Patient ID" is spelled "patient id" and would not match
# "patient_id" here — confirm the intended column names.
for col in ("provider_id", "patient_id"):
    X_train, X_test = add_freq_encoding(X_train, X_test, col)

# 5) Now do get_dummies (should be much smaller)
# The test matrix is aligned to the training columns: dummies seen only in test
# are dropped and dummies missing from test are filled with 0.
X_train_d = pd.get_dummies(X_train, dummy_na=True)
X_test_d = pd.get_dummies(X_test, dummy_na=True).reindex(
    columns=X_train_d.columns, fill_value=0
)

print("X_train_d shape:", X_train_d.shape, "X_test_d shape:", X_test_d.shape)


X_train_d shape: (16310, 5267) X_test_d shape: (4078, 5267)


In [None]:
# RandomOverSampler is not part of the notebook's import cell; import it here so
# this cell runs on a fresh kernel.
from imblearn.over_sampling import RandomOverSampler

# Oversample the minority class, then fit a logistic regression on the result.
pipe4 = ImbPipeline(steps=[
    # A float sampling_strategy is the target minority/majority ratio AFTER
    # resampling. The data is already at roughly 0.41/0.59 (~0.69), so the former
    # value 0.3 asked the over-sampler to remove minority rows and raised
    # ValueError. "auto" resamples the minority class up to the majority size.
    ("over", RandomOverSampler(random_state=42, sampling_strategy="auto")),
    ("model", LogisticRegression(max_iter=1000, solver="saga", n_jobs=-1, tol=1e-3)),
])

In [None]:
# Fit the oversampling + logistic-regression pipeline on the dummy-encoded training data.
pipe4.fit(X_train_d, y_train)

ValueError: The specified ratio required to remove samples from the minority class while trying to generate new samples. Please increase the ratio.

In [None]:
# Robust end-to-end fraud modeling pipeline for df (already loaded in Colab)




def _make_ohe() -> "OneHotEncoder":
    """Create a dense OneHotEncoder that ignores unknown categories.

    The encoder is used by the categorical branch of the preprocessing pipeline
    in this cell, so the definition must be live (it was previously commented
    out, which made the call fail with NameError on a fresh kernel).

    The dense-output flag is tried as ``sparse_output`` first; if the installed
    scikit-learn rejects that keyword with TypeError, the ``sparse`` spelling
    is used instead.
    """
    try:
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        return OneHotEncoder(handle_unknown="ignore", sparse=False)


# def create_fraud_binary(df_in: pd.DataFrame, fraud_type_col: str = "fraud_type") -> pd.Series:
#     if fraud_type_col not in df_in.columns:
#         raise KeyError(f"Column '{fraud_type_col}' not found in df. Available columns: {list(df_in.columns)[:30]} ...")
#     s = df_in[fraud_type_col].astype(str).str.strip().str.lower()
#     mapping = {
#         "no fraud": 0,
#         "no fraud.": 0,
#         "no_fraud": 0,
#         "wrong diagnosis": 0,
#         "wrong_diagnosis": 0,
#         "phantom billing": 1,
#         "phantom_billing": 1,
#         "ghost enrolee": 1,
#         "ghost enrollee": 1,
#         "ghost_enrolee": 1,
#         "ghost_enrollee": 1,
#     }
#     y = s.map(mapping)
#     return y



# ---------- Main ----------

# Rebind df to a copy of itself before building the modelling inputs.
df = df.copy()

# Label columns to exclude from the features: always the binary target, plus the
# raw label text when that column is present.
drop_cols = ["fraud_binary"] + ([fraud_type_col] if fraud_type_col in df.columns else [])

# Features X and integer target y
X = df.drop(columns=drop_cols, errors="ignore")
y = df["fraud_binary"].astype(int)

# Stratified 80/20 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Column roles: numeric/bool dtypes are scaled, every remaining column is one-hot encoded.
numeric_features = list(X_train.select_dtypes(include=["number", "bool"]).columns)
categorical_features = [name for name in X_train.columns if name not in numeric_features]

# Numeric branch: median imputation followed by standardisation.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by one-hot encoding.
# NOTE(review): `_make_ohe` must be defined in the kernel before this line runs.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", _make_ohe()),
])

# Route each column group to its branch; columns in neither list are dropped.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

# Classifier without class weighting; class balance is handled by the SMOTE step.
model = LogisticRegression(class_weight=None, max_iter=2000)

# imblearn pipeline: preprocess -> SMOTE -> model. The sampler runs only during
# fit, so rows passed to predict/predict_proba are never resampled.
pipe = ImbPipeline([
    ("preprocess", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("model", model),
])

# Train on the training split only.
pipe.fit(X_train, y_train)

# Positive-class probabilities for both splits.
train_prob, test_prob = (
    pipe.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

# Hard predictions at a 0.5 threshold.
train_pred, test_pred = (
    (prob >= 0.5).astype(int) for prob in (train_prob, test_prob)
)

# Recall per split (reported as 0 when a split has no positive rows).
train_recall = recall_score(y_train, train_pred, zero_division=0)
test_recall = recall_score(y_test, test_pred, zero_division=0)

print(f"\nTrain recall @0.5: {train_recall:.4f}")
print(f"Test  recall @0.5: {test_recall:.4f}")

# Ten-bin lift tables for both splits.
train_lift = make_lift_table(y_train, train_prob, n_bins=10)
test_lift = make_lift_table(y_test, test_prob, n_bins=10)

print(f"\nTop decile lift (train): {top_decile_lift(train_lift):.4f}")
print(f"Top decile lift (test):  {top_decile_lift(test_lift):.4f}")

print("\nLift Table (Train):")
display(train_lift)

print("\nLift Table (Test):")
display(test_lift)


KeyError: "Column 'fraud_type' not found in df. Available columns: ['Patient ID', 'AGE', 'GENDER', 'DATE OF ENCOUNTER', 'DATE OF DISCHARGE', 'Amount Billed', 'DIAGNOSIS', 'FRAUD_TYPE'] ..."