In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import OneClassSVM
from sklearn.mixture import GaussianMixture
from sklearn.metrics import roc_auc_score, average_precision_score

## 1- Load Part D Dataset

In [None]:
# 1. Load the pre-merged CSV into a single DataFrame.
#    NOTE(review): the file name suggests a down-sampled Medicare Part D + LEIE
#    extract for 2017-2019 — confirm against the step that produced it.
combined_df = pd.read_csv(
    filepath_or_buffer="outputs/Combined_LEIE_Medicare_2017_2019_DOWNSIZED_1mil.csv"
)

In [None]:
# Preview the first rows of the loaded frame as a quick visual check.
combined_df.head()

In [None]:
# 2. List every column name up front so later cells reference names that
#    actually exist (guards against KeyError from typos).
print("Combined columns:", list(combined_df.columns))

In [None]:
# 3. Sanity check: report the frame's dimensions and the class balance of the
#    label column. The label lives in "TARGET"; the printed heading still
#    calls it "FraudFlag".
dataset_shape = combined_df.shape
print("\n=== Medicare Merged Dataset ===")
print("Merged Dataset shape:", dataset_shape)
target_counts = combined_df["TARGET"].value_counts()
print("FraudFlag distribution:\n", target_counts)

## 2- Normalization and Mapping of Features

In [None]:
# 1. Collect the names of all numeric columns by dtype rather than by a
#    hand-written list, so a misspelled column name cannot cause a KeyError.
#    Identifier and target columns are still included at this point.
numeric_cols = list(combined_df.select_dtypes(include=[np.number]).columns)
print("\nAll numeric columns (including IDs & target):\n", numeric_cols)

### Normalization

In [None]:
# 2. Keep every numeric column except the identifier ("npi") and the label
#    ("TARGET"). The printed message below refers to them as 'NPI' and
#    'FraudFlag'.
features_list = [
    name for name in numeric_cols
    if name != "npi" and name != "TARGET"
]
print("\nFeatures selected for modeling (numeric_cols without 'NPI' and 'FraudFlag'):\n", features_list)

In [None]:
# 1. Learn per-feature standardization statistics on the selected feature
#    columns of combined_df.
# NOTE(review): the scaler is fitted on the whole frame, before any
# train/test split is made further down — confirm this is acceptable for the
# cross-validated metrics.
scaler_med = StandardScaler().fit(combined_df[features_list])

# 2. Overwrite those columns in combined_df with their standardized values.
combined_df[features_list] = scaler_med.transform(combined_df[features_list])

In [None]:
# 3- Define mappings for common boolean-like values
bool_mappings = {
    'True': 1, 'False': 0,
    'true': 1, 'false': 0,
    True: 1, False: 0,
    'Positive': 1, 'Negative': 0,
    'Yes': 1, 'No': 0
}

# 4- Identify boolean or boolean-like columns.
#    A column qualifies when it is of bool dtype, or when all of its
#    non-missing values are mapping keys (object columns) or 0/1 (numeric
#    columns). Columns with no non-missing values at all are skipped: an empty
#    value set is a subset of anything and would otherwise be misclassified.
bool_cols = []
bool_tokens = set(bool_mappings.keys())

for col in combined_df.columns:
    col_data = combined_df[col]
    if col_data.dtype == 'bool':
        bool_cols.append(col)
        continue

    if col_data.dtype == 'object':
        allowed_values = bool_tokens
    elif pd.api.types.is_numeric_dtype(col_data):
        allowed_values = {0, 1}
    else:
        continue

    observed_values = set(col_data.dropna().unique())
    if observed_values and observed_values.issubset(allowed_values):
        bool_cols.append(col)

# 5- Convert values in those columns using the mapping.
#    Detection above ignores missing values, so a selected column may still
#    contain NaN. A plain NumPy integer column cannot hold NaN (astype('int')
#    raises), so such columns are kept as float 0.0/1.0/NaN instead; columns
#    without missing values become int exactly as before.
for col in bool_cols:
    mapped = combined_df[col].map(bool_mappings)
    if mapped.isna().any():
        combined_df[col] = mapped.astype('float')
    else:
        combined_df[col] = mapped.astype('int')

## 3- Data Splitting

In [None]:
# 1. Separate into X_med (feature matrix) and y_med (target vector).
#    Besides the target, the "npi" identifier is removed as well (it is
#    excluded from features_list for the same reason), and only numeric
#    columns are kept: the training loop converts X_med.values to a float
#    tensor, which fails on object/string columns.
y_med = combined_df["TARGET"]
X_med = (
    combined_df
    .drop(columns=["TARGET"])
    .drop(columns=["npi"], errors="ignore")   # tolerate frames without an id column
    .select_dtypes(include=[np.number])
)

In [None]:
# Display the target vector (the "TARGET" column of combined_df).
y_med

## 4- Cross Validation

In [None]:
# 1. Stratified 5-fold splitter: each fold keeps roughly the same class ratio
#    as the full label vector, which matters for a heavily imbalanced target.
# NOTE(review): with a fixed integer random_state, repeated calls to
# skf.split() should produce the same folds — confirm that is what is wanted
# when the splitter is reused across `repeats`.
skf = StratifiedKFold(shuffle=True, random_state=42, n_splits=5)

# Number of times the whole cross-validation is repeated.
repeats = 10

In [None]:
# Per-fold metric tuples (auc, aupr), keyed by training group and then by
# method. These must be lists: the cross-validation loop appends to them and
# the evaluation cell passes each list to compute_mean_scores.
results_med = {
    'majority': {
        'OCAN': [],
    },
    'minority': {
        'OCAN': [],
    }
}

# Pooled test labels and calibrated probabilities across all folds/repeats,
# keyed the same way. The cross-validation loop extends the 'y_true' and
# 'y_pred' lists.
predictions_storage = {
    'majority': {
        'OCAN': {'y_true': [], 'y_pred': []},
    },
    'minority': {
        'OCAN': {'y_true': [], 'y_pred': []},
    }
}

## 5- Model Training

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import roc_auc_score, average_precision_score

class Generator(nn.Module):
    """Fully connected autoencoder that returns its reconstruction and latent code.

    The encoder maps ``input_dim -> hidden_dim -> hidden_dim // 2`` with a ReLU
    after each linear layer; the decoder maps
    ``hidden_dim // 2 -> hidden_dim -> input_dim`` with a ReLU in between and
    no activation on the output.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        bottleneck_dim = hidden_dim // 2

        # Layers are created encoder-first so parameter initialization order
        # (and therefore seeded weights) is stable.
        encoder_layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, bottleneck_dim),
            nn.ReLU(),
        ]
        decoder_layers = [
            nn.Linear(bottleneck_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return ``(reconstruction, latent)`` for a batch ``x``."""
        latent = self.encoder(x)
        return self.decoder(latent), latent

class Discriminator(nn.Module):
    """Two-layer MLP that maps a latent vector to a single value in (0, 1).

    Architecture: ``Linear(latent_dim, latent_dim) -> ReLU ->
    Linear(latent_dim, 1) -> Sigmoid``.
    """

    def __init__(self, latent_dim):
        super().__init__()
        layers = (
            nn.Linear(latent_dim, latent_dim),
            nn.ReLU(),
            nn.Linear(latent_dim, 1),
            nn.Sigmoid(),
        )
        self.net = nn.Sequential(*layers)

    def forward(self, z):
        """Return the sigmoid output, shape ``(batch, 1)``, for latent batch ``z``."""
        prob = self.net(z)
        return prob

class OCAN:
    """Autoencoder + latent discriminator trained on one class, used as a scorer.

    A ``Generator`` (autoencoder) is trained to reconstruct its input while a
    ``Discriminator`` is trained to separate the generator's latent codes
    (target 1) from standard-normal noise of the same shape (target 0).
    ``score_samples`` returns, per row, the mean squared reconstruction error
    plus the discriminator output for that row's latent code.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    hidden_dim : int, default 64
        Width of the first encoder layer; the latent code has
        ``hidden_dim // 2`` units.
    lr_g, lr_d : float, default 1e-3
        Adam learning rates for the generator and the discriminator.
    n_epochs : int, default 50
        Number of passes over the training data in ``fit``.
    batch_size : int, default 128
        Mini-batch size for both training and scoring.
    device : str, default 'cpu'
        Torch device the two networks and the batches are moved to.
    adv_weight : float, default 1e-2
        Weight of the adversarial term in the generator loss
        (``loss_g = reconstruction + adv_weight * adversarial``).
    verbose : bool, default True
        When True, print one progress line per epoch.
    """

    def __init__(self,
                 input_dim,
                 hidden_dim=64,
                 lr_g=1e-3,
                 lr_d=1e-3,
                 n_epochs=50,
                 batch_size=128,
                 device='cpu',
                 adv_weight=1e-2,
                 verbose=True):
        self.device = device
        self.gen = Generator(input_dim, hidden_dim).to(device)
        self.dis = Discriminator(hidden_dim // 2).to(device)
        self.opt_g = optim.Adam(self.gen.parameters(), lr=lr_g)
        self.opt_d = optim.Adam(self.dis.parameters(), lr=lr_d)
        self.criterion_recon = nn.MSELoss()
        self.criterion_adv   = nn.BCELoss()
        self.n_epochs        = n_epochs
        self.batch_size      = batch_size
        self.adv_weight      = adv_weight
        self.verbose         = verbose

        # history dict for losses and metrics (one entry appended per epoch)
        self.history = {
            'loss_d': [],   # discriminator loss
            'loss_g': [],   # generator loss (recon + adv)
            'auc':   [],    # validation AUC
            'aupr':  []     # validation AUPR
        }

    @staticmethod
    def _to_tensor(X):
        """Coerce an array-like (ndarray, DataFrame, list, tensor) to a float32 tensor.

        Going through NumPy first lets object-dtype arrays that hold plain
        numbers be converted, which ``torch.tensor`` alone rejects.
        """
        if torch.is_tensor(X):
            return X.detach().clone().to(torch.float32)
        return torch.tensor(np.asarray(X, dtype=np.float32), dtype=torch.float32)

    def fit(self, X, X_val=None, y_val=None):
        """Train on X; if (X_val, y_val) provided, compute AUC/AUPR each epoch.

        Parameters
        ----------
        X : array-like, shape (n_samples, input_dim)
            Training rows (all treated as belonging to the single class).
        X_val, y_val : array-like, optional
            When both are given, ``X_val`` is scored after every epoch, the
            scores are passed through the module-level ``calibrate_scores``
            (which must be defined by the caller's environment), and the
            resulting AUC / AUPR are appended to ``self.history``.

        Raises
        ------
        ValueError
            If ``X`` contains no rows.
        """
        data = self._to_tensor(X)
        if data.shape[0] == 0:
            # Without this guard the per-epoch average divides by zero batches.
            raise ValueError("OCAN.fit() requires at least one training sample.")

        loader  = DataLoader(TensorDataset(data), batch_size=self.batch_size, shuffle=True)
        has_val = (X_val is not None and y_val is not None)
        for epoch in range(self.n_epochs):
            epoch_loss_d = 0.0
            epoch_loss_g = 0.0
            for (x_batch,) in loader:
                x_batch = x_batch.to(self.device)

                # ——— Train Discriminator ———
                # Latent codes are computed without gradients so this step
                # updates the discriminator only.
                with torch.no_grad():
                    _, z_real = self.gen(x_batch)
                z_fake = torch.randn_like(z_real)
                d_real = self.dis(z_real)
                d_fake = self.dis(z_fake)
                loss_d = (self.criterion_adv(d_real, torch.ones_like(d_real))
                          + self.criterion_adv(d_fake, torch.zeros_like(d_fake)))
                self.opt_d.zero_grad()
                loss_d.backward()
                self.opt_d.step()

                # ——— Train Generator (autoencoder + adversarial) ———
                recon, z   = self.gen(x_batch)
                d_out      = self.dis(z)
                loss_recon = self.criterion_recon(recon, x_batch)
                # The generator is pushed towards codes the discriminator
                # labels 0, i.e. the label it assigns to the noise samples.
                loss_g_adv = self.criterion_adv(d_out, torch.zeros_like(d_out))
                loss_g     = loss_recon + self.adv_weight * loss_g_adv
                self.opt_g.zero_grad()
                loss_g.backward()
                self.opt_g.step()

                epoch_loss_d += loss_d.item()
                epoch_loss_g += loss_g.item()

            # average over batches
            avg_d = epoch_loss_d / len(loader)
            avg_g = epoch_loss_g / len(loader)
            self.history['loss_d'].append(avg_d)
            self.history['loss_g'].append(avg_g)

            # if validation set provided, compute AUC/AUPR
            if has_val:
                raw_scores = self.score_samples(X_val)
                # calibrate with the externally defined calibrate_scores helper
                calib = calibrate_scores(raw_scores, y_val, method="sigmoid")
                if hasattr(calib, "predict_proba"):
                    probs = calib.predict_proba(raw_scores.reshape(-1, 1))[:, 1]
                else:
                    probs = calib.predict(raw_scores.reshape(-1, 1))
                auc  = roc_auc_score(y_val, probs)
                aupr = average_precision_score(y_val, probs)
                self.history['auc'].append(auc)
                self.history['aupr'].append(aupr)
                if self.verbose:
                    print(f"Epoch {epoch+1}/{self.n_epochs}  "
                          f"loss_d={avg_d:.4f}  loss_g={avg_g:.4f}  "
                          f"AUC={auc:.4f}  AUPR={aupr:.4f}")
            elif self.verbose:
                print(f"Epoch {epoch+1}/{self.n_epochs}  "
                      f"loss_d={avg_d:.4f}  loss_g={avg_g:.4f}")

    def score_samples(self, X):
        """Return one float32 score per row of X.

        The score is the row's mean squared reconstruction error plus the
        discriminator output for its latent code. An input with no rows yields
        an empty array instead of failing when concatenating zero batches.
        """
        data = self._to_tensor(X)
        if data.shape[0] == 0:
            return np.empty(0, dtype=np.float32)

        loader = DataLoader(TensorDataset(data), batch_size=self.batch_size, shuffle=False)
        scores = []
        with torch.no_grad():
            for (x_batch,) in loader:
                x_batch  = x_batch.to(self.device)
                recon, z = self.gen(x_batch)
                rec_err  = torch.mean((recon - x_batch) ** 2, dim=1)
                # Flatten (batch, 1) -> (batch,) explicitly so a final batch
                # of size one keeps its batch dimension.
                d_out    = self.dis(z).reshape(-1)
                scores.append((rec_err + d_out).cpu())
        return torch.cat(scores).numpy()


# -----------------------------------------
# 2) Integrate OCAN into your cross-val loop
# -----------------------------------------
def _run_ocan_fold(X_train_cls, X_test, y_test, group, label):
    """Train one OCAN on a single class and record its metrics on the test fold.

    Parameters
    ----------
    X_train_cls : ndarray
        Training rows of the class the model is fitted on.
    X_test, y_test : ndarray
        Held-out fold features and labels.
    group : str
        Key into ``results_med`` / ``predictions_storage``
        ('majority' or 'minority').
    label : str
        Human-readable prefix for the printed summary line.

    Returns
    -------
    The fitted OCAN, or None when ``X_train_cls`` has no rows (nothing is
    recorded in that case).
    """
    if len(X_train_cls) == 0:
        return None

    model = OCAN(input_dim=X_train_cls.shape[1],
                 hidden_dim=64,
                 n_epochs=30,
                 batch_size=256,
                 device='cuda' if torch.cuda.is_available() else 'cpu')
    model.fit(X_train_cls)

    # Raw scores are used as-is (not negated); the original author's note
    # treats a higher raw score as more anomalous.
    scores = model.score_samples(X_test)

    # NOTE(review): the calibrator is fitted on the same test-fold labels that
    # the metrics below are computed against — confirm this is intended, or
    # calibrate on held-out training data instead.
    calib = calibrate_scores(scores, y_test, method="sigmoid")
    if hasattr(calib, "predict_proba"):
        probs = calib.predict_proba(scores.reshape(-1, 1))[:, 1]
    else:
        probs = calib.predict(scores.reshape(-1, 1))

    # evaluate and record
    auc = roc_auc_score(y_test, probs)
    aupr = average_precision_score(y_test, probs)
    results_med[group]['OCAN'].append((auc, aupr))
    predictions_storage[group]['OCAN']['y_true'].extend(y_test.tolist())
    predictions_storage[group]['OCAN']['y_pred'].extend(probs.tolist())
    print(f"{label} OCAN + Sigmoid: AUC={auc:.4f}, AUPR={aupr:.4f}")
    return model


# Materialize the arrays once instead of on every fold.
X_all = X_med.values
y_all = y_med.values

# Reusing `skf` (fixed random_state) for every repeat would evaluate the very
# same folds each time. Derive one splitter per repeat from a base seed so
# repeats differ while the whole run stays reproducible; repeat 0 keeps the
# original seed.
base_seed = skf.random_state if isinstance(skf.random_state, (int, np.integer)) else 42

for r in range(repeats):
    rep_seed = int(base_seed) + r
    skf_rep = StratifiedKFold(n_splits=skf.n_splits, shuffle=True, random_state=rep_seed)
    # Seed torch as well so weight init and batch shuffling are reproducible.
    torch.manual_seed(rep_seed)

    for fold, (train_idx, test_idx) in enumerate(skf_rep.split(X_all, y_all), start=1):
        X_train_m, X_test_m = X_all[train_idx], X_all[test_idx]
        y_train_m, y_test_m = y_all[train_idx], y_all[test_idx]

        # ---- Majority OCAN: trained on label-0 rows only ----
        ocan_major = _run_ocan_fold(X_train_m[y_train_m == 0], X_test_m, y_test_m,
                                    group='majority', label='Majority')

        # ---- Minority OCAN: trained on label-1 rows only (skipped if none) ----
        ocan_min = _run_ocan_fold(X_train_m[y_train_m == 1], X_test_m, y_test_m,
                                  group='minority', label='Minority')

    print(f"--- Finished Medicare Repeat {r+1} with OCAN ---")

## 6- Evaluation

In [None]:
# Print the mean AUC / AUPRC per training group and method, using the
# externally defined compute_mean_scores helper on each collected score list.
for group, methods in results_med.items():
    print(f'\n>>> {group.upper()}')
    for method, scores in methods.items():
        if not scores:
            print(f'{method:20s} | No scores available.')
            continue
        mean_auc, mean_aupr = compute_mean_scores(scores)
        print(f'{method:20s} | AUC: {mean_auc:.4f} | AUPRC: {mean_aupr:.4f}')