# OOF Parity Audit

This notebook verifies that the predictions saved in `oof_predictions.csv` can be reproduced exactly by loading the saved models and running inference on the corresponding validation data. This is a critical step to ensure there are no bugs in the training/validation pipeline (e.g., incorrect data augmentations being applied during validation, wrong model being saved/loaded, etc.).

In [1]:
# 1. Imports & Configuration

import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedGroupKFold
import timm
import albumentations as A
from albumentations.pytorch import ToTensorV2
from tqdm.auto import tqdm

from utils_preproc import load_and_preprocess

# Every value below is expected to mirror the training notebook `01_seti_baseline.ipynb`;
# the parity audit is only meaningful when the two configurations agree.
class CFG:
    """Static settings shared by every cell of this audit notebook."""

    # --- Input locations ---
    data_dir = '.'
    train_path = os.path.join(data_dir, 'train')
    train_labels_path = os.path.join(data_dir, 'train_labels.csv')
    oof_preds_path = 'oof_predictions.csv'

    # --- Network definition ---
    model_name = 'tf_efficientnet_b0_ns'
    img_size = 256
    in_channels = 3
    num_classes = 1

    # --- Inference / cross-validation ---
    batch_size = 128  # inference only, so a larger batch than training is acceptable
    n_folds = 5
    seed = 42

    # Use the GPU when torch can see one, otherwise fall back to the CPU
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


print("Using device: {}".format(CFG.device))

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


In [2]:
# 2. Load Data & Recreate Folds

# Read the training labels, then derive a grouping key from the two leading characters of each id
df = pd.read_csv(CFG.train_labels_path)
df['group'] = df['id'].map(lambda image_id: image_id[:2])
def get_train_file_path(image_id):
    """Return the `.npy` path for `image_id`.

    Files are sharded under `CFG.train_path` by the first character of the id,
    i.e. `<train_path>/<image_id[0]>/<image_id>.npy`.
    """
    shard = image_id[0]
    return "{}/{}/{}.npy".format(CFG.train_path, shard, image_id)
df['file_path'] = df['id'].map(get_train_file_path)

# Rebuild the split with the same splitter class, fold count, shuffle flag and seed
# that this notebook expects the training run to have used.
skf = StratifiedGroupKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
for fold, (train_idx, val_idx) in enumerate(skf.split(X=df, y=df['target'], groups=df['group'])):
    # Tag every held-out row with the index of the fold that validates on it
    df.loc[val_idx, 'fold'] = int(fold)
# The column is created by partial assignment, so cast it to integers once it is fully populated
df['fold'] = df['fold'].astype(int)

print("Folds recreated successfully.", df.head(), sep="\n")

# Read back the out-of-fold predictions that the training run wrote to disk
oof_df = pd.read_csv(CFG.oof_preds_path)
print("\nOOF predictions file loaded:", oof_df.head(), sep="\n")

Folds recreated successfully.
                id  target group                      file_path  fold
0  d5d85dafc41d5b3       0    d5  ./train/d/d5d85dafc41d5b3.npy     2
1  6170c3d29bd5874       0    61  ./train/6/6170c3d29bd5874.npy     0
2  87989f418ca1301       0    87  ./train/8/87989f418ca1301.npy     1
3  3087c24fbcb2c3b       0    30  ./train/3/3087c24fbcb2c3b.npy     3
4  8b04fea0d8d49c8       0    8b  ./train/8/8b04fea0d8d49c8.npy     0

OOF predictions file loaded:
                id  target   preds
0  6170c3d29bd5874       0  0.1703
1  8b04fea0d8d49c8       0  0.3818
2  3ee4f147a176231       0  0.3848
3  3883652d935831a       0  0.1525
4  3ec3a45d56e31a4       1  0.6200


In [3]:
# 3. Define Model, Dataset, and Transforms for Re-prediction

# Validation-time pipeline only; it has to mirror the validation transforms of the training script
def get_transforms():
    """Build the albumentations pipeline: resize to a `CFG.img_size` square, then convert to a tensor."""
    side = CFG.img_size
    steps = [
        A.Resize(side, side),
        ToTensorV2(),
    ]
    return A.Compose(steps)

# Dataset definition; keep it in lockstep with the one used for training
class SETIDataset(Dataset):
    """Dataset yielding `(image, label)` pairs for the rows of a dataframe.

    The `file_path` and `target` columns of `df` are read once at construction.
    `transform`, when truthy, is called as `transform(image=...)` and the
    `'image'` entry of its result replaces the loaded image.
    """

    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform
        self.file_paths = df['file_path'].values
        self.labels = df['target'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Loading and preprocessing are delegated to the shared helper (asinh option enabled)
        image = load_and_preprocess(self.file_paths[idx], do_asinh=True)
        if self.transform:
            image = self.transform(image=image)['image']
        label = torch.tensor(self.labels[idx]).to(torch.float32)
        return image, label

# Network wrapper; keep it in lockstep with the one used for training
class SETIModel(nn.Module):
    """Thin `nn.Module` around a timm backbone configured from `CFG`.

    The backbone is stored as `self.model`; the attribute name is part of the
    state-dict key prefix, so it must not change if saved checkpoints are to load.
    """

    def __init__(self, model_name=CFG.model_name, pretrained=False):
        super().__init__()
        backbone_kwargs = dict(
            pretrained=pretrained,
            in_chans=CFG.in_channels,
            num_classes=CFG.num_classes,
        )
        self.model = timm.create_model(model_name, **backbone_kwargs)

    def forward(self, x):
        # Pure pass-through: the wrapper returns whatever the backbone produces
        return self.model(x)

In [6]:
# 4. Perform Parity Check for a Single Fold

def re_predict_fold(fold_num, rtol=1e-4, atol=1e-6):
    """Re-run inference for one fold and compare it with the saved OOF predictions.

    Uses the notebook-level `df` (labels with a `fold` column), `oof_df` (saved
    OOF predictions) and `CFG`, and loads the checkpoint
    `{CFG.model_name}_fold{fold_num}_best.pth` from the working directory.

    Args:
        fold_num: Index of the fold whose validation rows are re-predicted.
        rtol: Relative tolerance passed to `np.allclose`.
        atol: Absolute tolerance passed to `np.allclose`.

    Returns:
        Tuple `(is_close, aligned_re_preds)`: whether the saved and recomputed
        predictions agree within tolerance, and the recomputed predictions
        sorted by `id`.

    Raises:
        ValueError: If no rows of `df` belong to `fold_num`.

    NOTE(review): under CUDA autocast the recomputed predictions are produced in
    reduced precision, and the saved CSV stores rounded values; the default
    tolerances may be tighter than that round trip allows -- confirm and pass
    looser `rtol`/`atol` if appropriate.
    """
    print(f"--- Verifying Fold {fold_num} ---")

    # 1. Get validation data for this fold
    valid_df = df[df['fold'] == fold_num].reset_index(drop=True)
    if valid_df.empty:
        # Fail early with a clear message instead of an obscure np.concatenate error later
        raise ValueError(f"No validation rows found for fold {fold_num}")

    on_cuda = CFG.device.type == 'cuda'

    # 2. Create dataset and dataloader (no augmentations, as in validation)
    valid_dataset = SETIDataset(valid_df, transform=get_transforms())
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=CFG.batch_size,
        shuffle=False,  # order must match valid_df so predictions can be paired with ids
        num_workers=4,
        pin_memory=on_cuda,  # pinned memory only helps host-to-GPU copies
    )

    # 3. Load the trained model for this fold
    model = SETIModel().to(CFG.device)
    model_path = f'{CFG.model_name}_fold{fold_num}_best.pth'
    # map_location lets a checkpoint saved on a GPU load on whatever device is in use now
    model.load_state_dict(torch.load(model_path, map_location=CFG.device))
    model.eval()

    # 4. Run inference to get new predictions
    re_preds = []
    with torch.no_grad():
        for images, _ in tqdm(valid_loader, desc=f"Re-predicting Fold {fold_num}"):
            images = images.to(CFG.device)
            # torch.autocast replaces the deprecated torch.cuda.amp.autocast();
            # mixed precision is only enabled on CUDA, as before
            with torch.autocast(device_type=CFG.device.type, enabled=on_cuda):
                y_preds = model(images)
            re_preds.append(y_preds.sigmoid().to('cpu').numpy())

    re_preds = np.concatenate(re_preds).flatten()

    # 5. Align both prediction sets by sorting on id
    re_pred_df = pd.DataFrame({'id': valid_df['id'], 're_pred': re_preds}).sort_values('id')
    aligned_re_preds = re_pred_df['re_pred'].values
    oof_fold_df = oof_df[oof_df['id'].isin(valid_df['id'])].sort_values('id')
    original_oof_preds = oof_fold_df['preds'].values

    # 6. Compare the predictions
    print(f"Number of original preds: {len(original_oof_preds)}")
    print(f"Number of re-calculated preds: {len(aligned_re_preds)}")

    # Sorting alone only aligns the arrays when both hold exactly the same ids;
    # verify that instead of comparing misaligned rows or crashing on a shape mismatch.
    if not np.array_equal(oof_fold_df['id'].values, re_pred_df['id'].values):
        n_missing = int((~valid_df['id'].isin(oof_df['id'])).sum())
        print(f"\nParity Check Passed for Fold {fold_num}: False")
        print(f"ID mismatch: {n_missing} validation ids are missing from the OOF file "
              f"(duplicate ids in either source would also cause this)")
        return False, aligned_re_preds

    # Display first 5 predictions for a manual check
    print("\nFirst 5 Original OOF Preds:", original_oof_preds[:5])
    print("First 5 Re-calculated Preds:", aligned_re_preds[:5])

    # Compare in float64 so the tolerance itself is not computed in reduced precision
    re_preds_f64 = aligned_re_preds.astype(np.float64)
    is_close = np.allclose(original_oof_preds, re_preds_f64, rtol=rtol, atol=atol)
    print(f"\nParity Check Passed for Fold {fold_num}: {is_close}")
    if not is_close:
        diff = np.abs(original_oof_preds - re_preds_f64)
        print(f"Max absolute difference: {np.max(diff)}")
        print(f"Mean absolute difference: {np.mean(diff)}")

    return is_close, aligned_re_preds

# Audit fold 0 only, keeping its id-sorted re-predictions for the sanity checks that follow
parity_ok, aligned_re_preds_fold0 = re_predict_fold(0)

--- Verifying Fold 0 ---


Re-predicting Fold 0:   0%|          | 0/84 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast():
Re-predicting Fold 0:   1%|          | 1/84 [00:01<01:58,  1.43s/it]

Re-predicting Fold 0:   5%|▍         | 4/84 [00:01<00:24,  3.22it/s]

Re-predicting Fold 0:   7%|▋         | 6/84 [00:02<00:29,  2.67it/s]

Re-predicting Fold 0:  11%|█         | 9/84 [00:03<00:24,  3.12it/s]

Re-predicting Fold 0:  14%|█▍        | 12/84 [00:03<00:15,  4.76it/s]

Re-predicting Fold 0:  17%|█▋        | 14/84 [00:04<00:17,  3.93it/s]

Re-predicting Fold 0:  20%|██        | 17/84 [00:04<00:17,  3.83it/s]

Re-predicting Fold 0:  26%|██▌       | 22/84 [00:05<00:14,  4.39it/s]

Re-predicting Fold 0:  30%|██▉       | 25/84 [00:06<00:14,  4.15it/s]

Re-predicting Fold 0:  33%|███▎      | 28/84 [00:06<00:09,  5.64it/s]

Re-predicting Fold 0:  36%|███▌      | 30/84 [00:07<00:12,  4.42it/s]

Re-predicting Fold 0:  39%|███▉      | 33/84 [00:08<00:12,  4.22it/s]

Re-predicting Fold 0:  43%|████▎     | 36/84 [00:08<00:08,  5.69it/s]

Re-predicting Fold 0:  45%|████▌     | 38/84 [00:09<00:10,  4.51it/s]

Re-predicting Fold 0:  49%|████▉     | 41/84 [00:09<00:10,  4.23it/s]

Re-predicting Fold 0:  52%|█████▏    | 44/84 [00:10<00:07,  5.69it/s]

Re-predicting Fold 0:  55%|█████▍    | 46/84 [00:10<00:08,  4.53it/s]

Re-predicting Fold 0:  58%|█████▊    | 49/84 [00:11<00:08,  4.29it/s]

Re-predicting Fold 0:  62%|██████▏   | 52/84 [00:11<00:05,  5.77it/s]

Re-predicting Fold 0:  64%|██████▍   | 54/84 [00:12<00:06,  4.60it/s]

Re-predicting Fold 0:  68%|██████▊   | 57/84 [00:13<00:06,  4.28it/s]

Re-predicting Fold 0:  71%|███████▏  | 60/84 [00:13<00:04,  5.75it/s]

Re-predicting Fold 0:  74%|███████▍  | 62/84 [00:14<00:04,  4.66it/s]

Re-predicting Fold 0:  77%|███████▋  | 65/84 [00:14<00:04,  4.38it/s]

Re-predicting Fold 0:  81%|████████  | 68/84 [00:15<00:02,  5.87it/s]

Re-predicting Fold 0:  83%|████████▎ | 70/84 [00:15<00:03,  4.62it/s]

Re-predicting Fold 0:  87%|████████▋ | 73/84 [00:16<00:02,  4.34it/s]

Re-predicting Fold 0:  90%|█████████ | 76/84 [00:16<00:01,  5.83it/s]

Re-predicting Fold 0:  93%|█████████▎| 78/84 [00:17<00:01,  4.70it/s]

Re-predicting Fold 0:  96%|█████████▋| 81/84 [00:18<00:00,  4.43it/s]

Re-predicting Fold 0: 100%|██████████| 84/84 [00:18<00:00,  6.03it/s]

Re-predicting Fold 0: 100%|██████████| 84/84 [00:18<00:00,  4.58it/s]




Number of original preds: 10684
Number of re-calculated preds: 10684

First 5 Original OOF Preds: [0.4     0.4546  0.3237  0.02014 0.3823 ]
First 5 Re-calculated Preds: [0.4     0.4546  0.3237  0.02014 0.3823 ]

Parity Check Passed for Fold 0: False
Max absolute difference: 0.00024218750000004619
Mean absolute difference: 3.2158787889098195e-05


In [8]:
# 5. Sanity Checks (as per Expert Advice)

from sklearn.metrics import roc_auc_score

# Sanity check 1: AUC of the freshly re-predicted fold 0
print("--- Sanity Check 1: Re-calculated Fold 0 AUC ---")
# `aligned_re_preds_fold0` comes from the previous cell and is ordered by id,
# so the fold-0 targets are put into the same order before scoring.
valid_sorted = df.loc[df['fold'] == 0].sort_values(by='id').reset_index(drop=True)
recalculated_auc = roc_auc_score(
    y_true=valid_sorted['target'].to_numpy(),
    y_score=aligned_re_preds_fold0,
)
print('Re-calculated AUC for Fold 0: {:.4f}'.format(recalculated_auc))
print("Original AUC reported for Fold 0 in training log: 0.5535")

# Check 2: Global OOF AUC from the saved file
print("\n--- Sanity Check 2: Global OOF AUC from file ---")
# Take the path from CFG so this check reads the same file as the parity check
oof_df_loaded = pd.read_csv(CFG.oof_preds_path)
# Inner merge on id pairs each saved prediction with its label from the labels file.
# When both frames carry a `target` column pandas suffixes them:
# `target_x` (labels file, left) and `target_y` (OOF file, right).
merged_oof = df[['id','target']].merge(oof_df_loaded, on='id', how='inner')
# An inner merge silently drops unmatched ids and multiplies duplicated ones; surface that
if len(merged_oof) != len(df):
    print(f"WARNING: merge produced {len(merged_oof)} rows but the labels file has {len(df)} rows")
# Fall back to the unsuffixed column if the OOF file has no `target` column of its own
target_col = 'target_x' if 'target_x' in merged_oof.columns else 'target'
# If the OOF file stores targets too, they should agree with the labels file
if 'target_y' in merged_oof.columns and not (merged_oof[target_col] == merged_oof['target_y']).all():
    print("WARNING: targets stored in the OOF file disagree with the training labels")
global_oof_auc = roc_auc_score(merged_oof[target_col], merged_oof.preds)
print(f'Global OOF AUC calculated from file: {global_oof_auc:.4f}')
print("Global OOF AUC reported in training log: 0.5561")
print(f'OOF preds min/max/mean: {merged_oof.preds.min():.4f} / {merged_oof.preds.max():.4f} / {merged_oof.preds.mean():.4f}')

# Check 3: Quick model weight sanity check
# Prints mean/std of the first parameter tensor of every fold's checkpoint.
print("\n--- Sanity Check 3: Model Weight Sanity ---")
for f in range(CFG.n_folds):
    try:
        m = SETIModel()
        # The model is never moved off the CPU here, so map the checkpoint to the CPU as well;
        # without map_location a GPU-saved checkpoint cannot load on a CPU-only machine.
        m.load_state_dict(torch.load(f'{CFG.model_name}_fold{f}_best.pth', map_location='cpu'))
        # detach() avoids the "requires_grad tensor converted to a scalar" warning
        w = next(m.parameters()).detach()
        print(f'Fold {f} weight mean/std: {w.mean().item():.6f} / {w.std().item():.6f}')
    except Exception as e:
        # Deliberately best-effort: report the failure and continue with the next fold
        print(f"Could not load model for fold {f}: {e}")

--- Sanity Check 1: Re-calculated Fold 0 AUC ---
Re-calculated AUC for Fold 0: 0.5535
Original AUC reported for Fold 0 in training log: 0.5535

--- Sanity Check 2: Global OOF AUC from file ---
Global OOF AUC calculated from file: 0.5561
Global OOF AUC reported in training log: 0.5561
OOF preds min/max/mean: 0.0000 / 1.0000 / 0.3005

--- Sanity Check 3: Model Weight Sanity ---
Fold 0 weight mean/std: -0.035266 / 0.868574


  model = create_fn(
Consider using tensor.detach() first. (Triggered internally at /pytorch/torch/csrc/autograd/generated/python_variable_methods.cpp:835.)
  print(f'Fold {f} weight mean/std: {float(w.mean()):.6f} / {float(w.std()):.6f}')


Fold 1 weight mean/std: -0.034986 / 0.868564
Fold 2 weight mean/std: -0.035462 / 0.868543
Fold 3 weight mean/std: -0.035098 / 0.868546


Fold 4 weight mean/std: -0.034933 / 0.868580
