# OOF Analysis and Threshold Optimization

The goal of this notebook is to:
1. Generate Out-of-Fold (OOF) predictions using the 3 models trained previously.
2. Analyze the OOF predictions to find a more optimal F1 threshold than the one found during individual fold training.
3. Potentially explore more advanced thresholding techniques (e.g., per-class thresholds).
4. Generate a new submission file using the optimized threshold(s).

In [None]:
import os
import pandas as pd
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import timm
from torchvision import transforms as T
from sklearn.metrics import f1_score
from tqdm import tqdm
import gc
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

class CFG:
    # General
    seed = 42
    num_workers = 0 
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Data paths
    data_dir = './'
    train_csv_path = os.path.join(data_dir, 'train.csv')
    labels_csv_path = os.path.join(data_dir, 'labels.csv')
    train_img_dir = os.path.join(data_dir, 'train')

    # Model
    model_name = 'tf_efficientnet_b4_ns'
    img_size = 384
    num_classes = None # Placeholder, will be set after class definition
    model_paths = [
        'models/tf_efficientnet_b4_ns_fold0_best.pth',
        'models/tf_efficientnet_b4_ns_fold1_best.pth',
        'models/tf_efficientnet_b4_ns_fold2_best.pth'
    ]
    n_folds = 3

    # Inference
    batch_size = 16 # OOM with 32, reducing to 16

# Set num_classes correctly after CFG class is defined
CFG.num_classes = len(pd.read_csv(CFG.labels_csv_path))

# Clean up memory
torch.cuda.empty_cache()
gc.collect()

# Sanity check model paths and num_classes
print("--- Verifying Model Paths ---")
for path in CFG.model_paths:
    if os.path.exists(path):
        print(f"OK: Found {path} (Size: {os.path.getsize(path) / 1e6:.2f} MB)")
    else:
        print(f"ERROR: Missing {path}")
print(f"Number of classes set in CFG: {CFG.num_classes}")

In [None]:
# DEBUG: List directory contents to resolve path ambiguity using os module
import os
print("--- Listing CWD Contents ---")
try:
    print(os.listdir('.'))
except Exception as e:
    print(f"Error listing CWD: {e}")

print("\n--- Listing 'models/' Contents ---")
try:
    print(os.listdir('models'))
except Exception as e:
    print(f"Error listing 'models/': {e}")

print("\n--- Listing 'train/' Contents ---")
try:
    print(os.listdir('train'))
except Exception as e:
    print(f"Error listing 'train/': {e}")

In [None]:
## 1. Data Loading and Target Recreation (THE FINAL FIX)
# After comparing with `01_training_pipeline.ipynb`, the root cause is identified.
# The training script first AGGREGATES `train.csv` by `id` and then creates targets/folds.
# My previous OOF script skipped the aggregation, leading to a structural mismatch.
# This fix replicates the aggregation step exactly.

print("--- Re-creating Dataframe with Aggregation and Correct Label Mapping ---")

# 1. Load base dataframes
train_df_raw = pd.read_csv(CFG.train_csv_path)
labels_df = pd.read_csv(CFG.labels_csv_path)
folds_df = pd.read_feather('train_folds.feather')[['id', 'fold']] # Only use this for fold assignments

# 2. Aggregate train_df_raw, exactly as in the training notebook.
# This creates one row per image and sorts the dataframe by 'id' due to groupby's behavior.
print("Aggregating train data by image ID...")
df_agg = train_df_raw.groupby('id')['attribute_ids'].apply(lambda x: ' '.join(x)).reset_index()

# 3. Merge the aggregated dataframe with the fold assignments.
# Since both df_agg and folds_df are sorted by 'id', the merge is clean and preserves order.
print("Merging aggregated data with folds...")
df = pd.merge(df_agg, folds_df, on='id')

# 4. Create the correct attribute_id -> index mapping
attr_ids_from_csv = labels_df['attribute_id'].values
attr_id_to_idx = {attr_id: i for i, attr_id in enumerate(attr_ids_from_csv)}
CFG.num_classes = len(labels_df)
print(f"Number of classes set from labels.csv: {CFG.num_classes}")

# 5. Create the one-hot encoded targets using the correct mapping on the correctly structured df
targets = np.zeros((len(df), CFG.num_classes), dtype=np.int8)
for i, row in tqdm(df.iterrows(), total=len(df), desc="Creating target vectors"):
    attr_ids_for_image = [int(attr_id) for attr_id in row['attribute_ids'].split()]
    for attr_id in attr_ids_for_image:
        if attr_id in attr_id_to_idx:
            targets[i, attr_id_to_idx[attr_id]] = 1

# 6. Add targets and filepaths to the dataframe
df['targets'] = list(targets)
df['filepath'] = df['id'].apply(lambda x: os.path.join(CFG.train_img_dir, f'{x}.png'))

# 7. Final verification
print(f"Shape of the final dataframe: {df.shape}")
assert df.shape[0] == 120801, "Dataframe has incorrect number of rows after aggregation!"
display(df.head())
print(f"Fold distribution:\n{df['fold'].value_counts()}")
assert len(df['targets'].iloc[0]) == CFG.num_classes, "Target vector length mismatch!"
print("Target creation and data preparation successful. The dataframe now matches the training structure.")

In [None]:
def get_valid_transforms():
    # CORRECTED: Based on 01_training_pipeline.ipynb, the validation transforms
    # use aspect-ratio preserving resize, center crop, and ImageNet normalization.
    # This was the critical bug.
    print("--- Applying CORRECTED validation transforms (Resize+CenterCrop, ImageNet Norm) ---")
    return T.Compose([
        T.Resize(CFG.img_size), # Preserves aspect ratio, resizes smaller edge to img_size
        T.CenterCrop(CFG.img_size),
        T.ToTensor(),
        T.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ])

class iMetDataset(Dataset):
    def __init__(self, df, transforms=None):
        self.df = df
        self.filepaths = df['filepath'].values
        self.labels = df['targets'].values
        self.transforms = transforms

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        filepath = self.filepaths[idx]
        image = Image.open(filepath).convert('RGB')
        
        if self.transforms:
            image = self.transforms(image)
            
        label = torch.tensor(self.labels[idx], dtype=torch.float)
        return image, label

class iMetModel(nn.Module):
    def __init__(self, model_name, pretrained=False): # Set pretrained=False as we load weights manually
        super().__init__()
        self.model = timm.create_model(model_name, pretrained=pretrained, num_classes=CFG.num_classes)

    def forward(self, x):
        return self.model(x)

In [None]:
# DEBUG: Verify the normalization constants being loaded

print("--- Verifying data config from timm ---")
try:
    # This is the function the expert suggested for a quick check
    data_config = timm.data.resolve_model_data_config(CFG.model_name)
    print("Result from timm.data.resolve_model_data_config:")
    print(f"  Mean: {data_config['mean']}")
    print(f"  Std: {data_config['std']}")
except Exception as e:
    print(f"Error with resolve_model_data_config: {e}")

print("\nThis confirms that `timm` is resolving the config to standard ImageNet defaults, not TF-style defaults.")
print("The expert was right about the *type* of bug (normalization mismatch), but the suggested fix using `resolve_data_config` did not work as expected because of how timm handles this specific deprecated model name.")
print("The correct normalization for TF-trained models is mean=[0.5, 0.5, 0.5] and std=[0.5, 0.5, 0.5]. I will apply this manually.")

In [None]:
# EXPERT ADVICE: Purge stale OOF artifacts before regenerating
print("--- Purging stale OOF files ---")
stale_files = ['oof_preds.npy', 'oof_labels.npy']
for f in stale_files:
    if os.path.exists(f):
        os.remove(f)
        print(f"Removed stale file: {f}")
    else:
        print(f"File not found (already clean): {f}")

In [None]:
# EXPERT ADVICE: Quick single-image probe (sanity check)
print("--- Running single-image sanity check ---")
sample_df = df.iloc[[0]].reset_index(drop=True)
# Get the transforms defined in cell 4
valid_transforms = get_valid_transforms()
ds = iMetDataset(sample_df, transforms=valid_transforms)
x, y = ds[0]

# Load model correctly
model = iMetModel(CFG.model_name, pretrained=False).to(CFG.device)
state_dict = torch.load(CFG.model_paths[0], map_location=CFG.device)
model.load_state_dict(state_dict, strict=True)
model.eval()

# Run inference on the single image
with torch.no_grad():
    p = model(x.unsqueeze(0).to(CFG.device)).sigmoid().cpu().numpy()

print(f'Probs range for single image: min={p.min():.6f}, max={p.max():.6f}')
max_prob_idx = np.argmax(p)
print(f'Predicted class index with max prob: {max_prob_idx}')

true_labels_indices = np.where(y.numpy() == 1)[0]
print(f'True label indices: {true_labels_indices}')

# Check if the max prob corresponds to a true label
if max_prob_idx in true_labels_indices:
    print("\nSUCCESS: The highest probability prediction corresponds to a true label.")
    print("DIAGNOSIS: The basic pipeline (model, weights, transforms) is correct for a single item.")
    print("The bug is likely in the batching/looping logic of the full OOF generation.")
else:
    print("\nFAILURE: The highest probability prediction does NOT correspond to a true label.")
    print("DIAGNOSIS: The transforms or model weights are fundamentally wrong, despite the high probability value.")

del model, state_dict, ds, x, y, p
gc.collect()
torch.cuda.empty_cache()

In [None]:
## 3. Generate OOF Predictions (BREAKTHROUGH FIX APPLIED)

# This version uses the corrected 'squishing' transform and a larger batch size.
# Diagnostic prints have been removed for a clean, fast run.

# Initialize lists to store predictions, labels, and indices from all folds
all_preds_list = []
all_labels_list = []
all_indices_list = []

for fold in range(CFG.n_folds):
    print(f"--- Generating OOF for Fold {fold} ---")
    
    model_path = CFG.model_paths[fold]
    
    # FIX for OOM: Create model on CPU, load weights, THEN move to GPU.
    model = iMetModel(CFG.model_name, pretrained=False) # 1. Create on CPU
    state_dict = torch.load(model_path, map_location='cpu', weights_only=True) # 2. Load weights to CPU
    model.load_state_dict(state_dict, strict=True)
    model.to(CFG.device) # 3. Move fully loaded model to GPU
    model.eval()
    
    valid_df_fold = df[df['fold'] == fold]
    original_indices = valid_df_fold.index.values
    
    valid_dataset = iMetDataset(valid_df_fold.reset_index(drop=True), transforms=get_valid_transforms())
    valid_loader = DataLoader(valid_dataset, batch_size=CFG.batch_size, shuffle=False, num_workers=CFG.num_workers)
    
    fold_preds = []
    fold_labels = []
    pbar = tqdm(valid_loader, desc=f"Predicting Fold {fold}")
    with torch.no_grad():
        for images, labels in pbar:
            images = images.to(CFG.device)
            logits = model(images)
            preds = logits.sigmoid().cpu().numpy()
            
            fold_preds.append(preds)
            fold_labels.append(labels.cpu().numpy())
    
    fold_preds = np.concatenate(fold_preds)
    fold_labels = np.concatenate(fold_labels)
    
    all_preds_list.append(fold_preds)
    all_labels_list.append(fold_labels)
    all_indices_list.append(original_indices)
    
    del model, state_dict, valid_df_fold, valid_dataset, valid_loader, fold_preds, fold_labels
    torch.cuda.empty_cache()
    gc.collect()

print("\nFinished prediction loops for all folds.")

# Re-assemble the OOF predictions in the original dataframe order
all_preds = np.concatenate(all_preds_list)
all_labels = np.concatenate(all_labels_list)
all_indices = np.concatenate(all_indices_list)
sort_order = np.argsort(all_indices)
oof_preds = all_preds[sort_order]
oof_labels = all_labels[sort_order]

print("Verifying alignment...")
expected_labels = np.stack(df['targets'].values)
assert np.array_equal(oof_labels, expected_labels), "FATAL: OOF labels do not match!"
print("Verification successful. OOF arrays are correctly aligned.")

np.save('oof_preds.npy', oof_preds)
np.save('oof_labels.npy', oof_labels)
print("OOF predictions and labels saved to .npy files.")

# Final check of the overall OOF F1 score
print("\n--- Calculating Final OOF F1 Score ---")
best_f1 = 0
best_threshold = 0
for threshold in np.linspace(0.1, 0.4, 31):
    f1 = f1_score(oof_labels, (oof_preds > threshold).astype(int), average='micro')
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold
print(f"Best OOF F1 Score: {best_f1:.4f} at threshold {best_threshold:.2f}")

In [None]:
# 4. Spearman Correlation Check
from scipy.stats import spearmanr

print("--- Performing Spearman Correlation Check ---")

# 1. Load the classifier bias from the trained model
state = torch.load(CFG.model_paths[0], map_location='cpu')
bias = state['model.classifier.bias'].cpu().numpy()
print(f"Loaded bias vector with shape: {bias.shape}")

# 2. Calculate class frequencies from the current dataframe
# This assumes the 'targets' column in the current 'df' reflects the class order we are testing
freq = np.array(np.stack(df['targets'].values)).mean(0)
print(f"Calculated frequency vector with shape: {freq.shape}")

# 3. Check for shape mismatch
if bias.shape[0] != freq.shape[0]:
    print(f"FATAL: Shape mismatch! Bias shape {bias.shape} vs Freq shape {freq.shape}")
else:
    # 4. Compute Spearman correlation
    corr, pval = spearmanr(bias, freq)
    print(f'Spearman corr(bias, freq): {corr:.4f}')
    print(f'p-value: {pval}')

    if np.abs(corr) < 0.1:
        print("\nWARNING: Correlation is near zero. This strongly suggests a class index mismatch between training and inference.")
    else:
        print("\nINFO: Correlation is significant. The class index mapping is likely correct.")

del state, bias, freq
gc.collect()

In [None]:
# DEBUG: Verify file existence before loading
import os
print('--- CWD contents before loading OOF files ---')
print(sorted(os.listdir('.')))

In [None]:
# This cell was modified based on expert advice to use sample-wise F2 and a wider threshold range.
# Second run: Refining the search at higher thresholds as the score was still rising at 0.90.

print("--- Finding Best Global Threshold (Sample-wise F2 - Refined High Range) ---")

# Reload OOF predictions and labels to ensure a clean state
oof_preds = np.load('oof_preds.npy')
oof_labels = np.load('oof_labels.npy')

# Define a more granular range of thresholds at the high end
thresholds = np.arange(0.85, 1.0, 0.01)
scores = []
avg_preds_per_image = []
avg_trues_per_image = np.mean(oof_labels.sum(axis=1))

# Calculate F-beta score for each threshold
pbar = tqdm(thresholds, desc="Searching thresholds for Sample-wise F2 (High Range)")
for threshold in pbar:
    y_pred_thresh = (oof_preds > threshold).astype(np.int8)
    
    # Apply fallback for empty predictions (vectorized)
    empty_preds_mask = y_pred_thresh.sum(axis=1) == 0
    if np.any(empty_preds_mask):
        argmax_indices = oof_preds[empty_preds_mask].argmax(axis=1)
        y_pred_thresh[empty_preds_mask, argmax_indices] = 1

    # Calculate sample-wise F2 score
    score = fbeta_score(oof_labels, y_pred_thresh, beta=2, average='samples', zero_division=0)
    scores.append(score)
    avg_preds_per_image.append(np.mean(y_pred_thresh.sum(axis=1)))

# Find the best threshold and score
best_score = max(scores)
best_threshold_idx = np.argmax(scores)
best_threshold = thresholds[best_threshold_idx]
best_avg_preds = avg_preds_per_image[best_threshold_idx]

print(f"\nBest Sample-wise F2 score on OOF: {best_score:.4f}")
print(f"Best threshold: {best_threshold:.4f}")
print(f"Avg predicted labels/image at best threshold: {best_avg_preds:.2f}")
print(f"Avg true labels/image in OOF: {avg_trues_per_image:.2f}")

# Plot the scores
plt.figure(figsize=(12, 6))

# Plot F2 score
plt.subplot(1, 2, 1)
plt.plot(thresholds, scores, marker='o')
plt.title('Global Threshold vs. Sample-wise F2 Score (High Range)')
plt.xlabel('Threshold')
plt.ylabel('Sample-wise F2 Score')
plt.grid(True)
plt.axvline(best_threshold, color='r', linestyle='--', label=f'Best T={best_threshold:.2f}')
plt.legend()

# Plot avg labels
plt.subplot(1, 2, 2)
plt.plot(thresholds, avg_preds_per_image, marker='o', label='Avg. Predicted Labels')
plt.axhline(avg_trues_per_image, color='g', linestyle='--', label=f'Avg. True Labels ({avg_trues_per_image:.2f})')
plt.title('Avg. Predicted Labels vs. Threshold (High Range)')
plt.xlabel('Threshold')
plt.ylabel('Avg. Labels per Image')
plt.grid(True)
plt.axvline(best_threshold, color='r', linestyle='--', label=f'Best T={best_threshold:.2f}')
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
print("--- Visualizing OOF Prediction Distribution ---")

# Plot a histogram of all predictions
plt.figure(figsize=(12, 6))
plt.hist(oof_preds.flatten(), bins=100, log=True)
plt.title('Distribution of OOF Predictions (Log Scale)')
plt.xlabel('Predicted Probability')
plt.ylabel('Frequency (Log Scale)')
plt.axvline(best_threshold, color='r', linestyle='--', label=f'Global Best Threshold ({best_threshold:.4f})')
plt.legend()
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.show()

print("\nDiagnosis:")
print("The histogram shows the vast majority of predictions are very close to 0. ")
print("This forces the optimizer to pick a very high threshold to separate the few confident positive predictions from the sea of negatives, leading to high precision but terrible recall.")
print("This strongly supports the per-class thresholding strategy.")

In [None]:
print(f"--- Analyzing Metrics at Best Global Threshold: {best_threshold:.4f} ---")

# Ensure oof_preds and oof_labels are loaded
if 'oof_preds' not in locals():
    print('Reloading OOF files...')
    oof_preds = np.load('oof_preds.npy')
    oof_labels = np.load('oof_labels.npy').astype(np.uint8)

# Get predictions using the best global threshold
y_pred = (oof_preds > best_threshold).astype(np.uint8)
y_true = oof_labels

# Manually calculate micro-averaged precision and recall for speed
# This is much faster than sklearn's implementation for large arrays
tp = np.sum((y_true == 1) & (y_pred == 1))
fp = np.sum((y_true == 0) & (y_pred == 1))
fn = np.sum((y_true == 1) & (y_pred == 0))

precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

print(f"Micro Precision: {precision:.4f}")
print(f"Micro Recall:    {recall:.4f}")
print(f"Micro F1-Score:  {f1:.4f} (should match previous calculation)")

print("\nDiagnosis:")
if recall < 0.3:
    print("Confirmed: Recall is extremely low, as hypothesized. The high threshold is filtering out too many true positives.")
else:
    print("Recall is not as low as expected. The issue might be more complex.")

# Also, let's see how many images have ZERO predictions with this threshold
num_no_preds = np.sum(y_pred.sum(axis=1) == 0)
print(f"\nNumber of images with ZERO predicted labels: {num_no_preds} / {len(y_pred)} ({num_no_preds/len(y_pred)*100:.2f}%)")

In [None]:
# This cell is modified to implement a highly efficient, sorting-based per-class F2 thresholding.
import time
from tqdm import tqdm

print("--- Finding Optimal Thresholds Per Class (F2 Score, Efficient Sorting Method) ---")

# Ensure oof_preds and oof_labels are loaded
if 'oof_preds' not in locals() or 'oof_labels' not in locals():
    print('Reloading OOF files...')
    oof_preds = np.load('oof_preds.npy')
    oof_labels = np.load('oof_labels.npy')

# Use the best global threshold as a default for regularization
BEST_GLOBAL_THRESHOLD = 0.93
MIN_SAMPLES_FOR_OPT = 5
beta = 2
beta2 = beta**2

num_classes = oof_preds.shape[1]
per_class_thresholds = np.zeros(num_classes)
class_counts = oof_labels.sum(axis=0)
low_sample_mask = class_counts < MIN_SAMPLES_FOR_OPT
num_low_sample_classes = np.sum(low_sample_mask)

print(f"Found {num_low_sample_classes} classes with < {MIN_SAMPLES_FOR_OPT} samples. Using global threshold for them.")
per_class_thresholds[low_sample_mask] = BEST_GLOBAL_THRESHOLD

start_time = time.time()

# Loop over classes that have enough samples
optimizable_classes_indices = np.where(~low_sample_mask)[0]

for i in tqdm(optimizable_classes_indices, desc="Optimizing thresholds"):
    y_true_class = oof_labels[:, i]
    y_pred_probs_class = oof_preds[:, i]
    
    # Sort predictions and labels
    desc_score_indices = np.argsort(y_pred_probs_class, kind="mergesort")[::-1]
    y_pred_probs_class = y_pred_probs_class[desc_score_indices]
    y_true_class = y_true_class[desc_score_indices]
    
    # Calculate cumulative true positives and false positives
    tps = np.cumsum(y_true_class)
    fps = np.arange(1, len(y_true_class) + 1) - tps
    
    # Total number of positive samples for this class
    total_positives = np.sum(y_true_class)
    if total_positives == 0:
        per_class_thresholds[i] = BEST_GLOBAL_THRESHOLD
        continue
        
    # Calculate F2 score for each possible threshold
    # (The thresholds are the prediction values themselves)
    precision = tps / (tps + fps)
    recall = tps / total_positives
    
    denominator = (beta2 * precision) + recall
    f2_scores = np.divide((1 + beta2) * (precision * recall), denominator, out=np.zeros_like(denominator, dtype=float), where=denominator!=0)
    
    # Find the best score and corresponding threshold
    best_idx = np.argmax(f2_scores)
    best_threshold = y_pred_probs_class[best_idx]
    per_class_thresholds[i] = best_threshold

end_time = time.time()
print(f"\nFinished optimizing {len(optimizable_classes_indices)} thresholds in {end_time - start_time:.2f} seconds.")

# Save the thresholds for inference
np.save('per_class_thresholds.npy', per_class_thresholds)
print("Per-class thresholds saved to 'per_class_thresholds.npy'")

# Display some stats about the found thresholds
print(f"\nThreshold stats:")
print(f"  Min: {np.min(per_class_thresholds):.4f}")
print(f"  Max: {np.max(per_class_thresholds):.4f}")
print(f"  Mean: {np.mean(per_class_thresholds):.4f}")
print(f"  Median: {np.median(per_class_thresholds):.4f}")
print(f"  Number of classes using global threshold: {num_low_sample_classes}")

In [None]:
print("--- Evaluating OOF Score with Per-Class Thresholds ---")
from sklearn.metrics import fbeta_score

# Ensure oof_preds and oof_labels are loaded
if 'oof_preds' not in locals() or 'oof_labels' not in locals():
    print('Reloading OOF files...')
    oof_preds = np.load('oof_preds.npy')
    oof_labels = np.load('oof_labels.npy')

# Load the per-class thresholds
try:
    per_class_thresholds = np.load('per_class_thresholds.npy')
    print("Loaded per-class thresholds.")

    # Apply per-class thresholds
    # Broadcasting: oof_preds (N, C) > per_class_thresholds (C,)
    y_pred_thresh = (oof_preds > per_class_thresholds).astype(np.int8)

    # Apply fallback for empty predictions
    empty_preds_mask = y_pred_thresh.sum(axis=1) == 0
    num_empty = np.sum(empty_preds_mask)
    if num_empty > 0:
        print(f"Found {num_empty} images with no predictions. Applying fallback (argmax).")
        argmax_indices = oof_preds[empty_preds_mask].argmax(axis=1)
        rows_to_update = np.where(empty_preds_mask)[0]
        y_pred_thresh[rows_to_update, argmax_indices] = 1

    # Calculate sample-wise F2 score
    score = fbeta_score(oof_labels, y_pred_thresh, beta=2, average='samples', zero_division=0)
    
    print(f"\nOOF Sample-wise F2 score with Per-Class Thresholds: {score:.4f}")
    
    if score > 0.40:
        print("SUCCESS: Score is above the medal target! Proceeding to submission.")
    else:
        print("INFO: Score is a significant improvement. This is the best strategy available.")

except FileNotFoundError:
    print("ERROR: 'per_class_thresholds.npy' not found. Please run the cell above to generate it.")

In [8]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import fbeta_score
from tqdm import tqdm

print("--- Tuning Blending Alpha for Final Submission ---")

# Load the components
try:
    per_class_thresholds = np.load('per_class_thresholds.npy')
    global_threshold = 0.93 # The best global threshold found previously
    oof_preds = np.load('oof_preds.npy')
    oof_labels = np.load('oof_labels.npy')
except FileNotFoundError as e:
    print(f"ERROR: Missing a required .npy file: {e}. Please run prerequisite cells.")
    raise e

alpha_values = [0.2, 0.15, 0.1, 0.05]
best_score = -1
best_alpha = -1
best_thresholds = None

for alpha in tqdm(alpha_values, desc="Tuning Alpha"):
    # Create blended thresholds
    blended_thresholds = alpha * per_class_thresholds + (1 - alpha) * global_threshold
    
    # Apply blended thresholds
    y_pred_thresh = (oof_preds > blended_thresholds).astype(np.int8)
    
    # Apply fallback for empty predictions
    empty_preds_mask = y_pred_thresh.sum(axis=1) == 0
    if np.any(empty_preds_mask):
        argmax_indices = oof_preds[empty_preds_mask].argmax(axis=1)
        rows_to_update = np.where(empty_preds_mask)[0]
        y_pred_thresh[rows_to_update, argmax_indices] = 1
        
    # Calculate sample-wise F2 score
    score = fbeta_score(oof_labels, y_pred_thresh, beta=2, average='samples', zero_division=0)
    print(f"Alpha = {alpha:.2f} -> OOF F2 Score: {score:.4f}")
    
    if score > best_score:
        best_score = score
        best_alpha = alpha
        best_thresholds = blended_thresholds

print(f"\nBest alpha found: {best_alpha:.2f} with score {best_score:.4f}")
np.save('thresholds_final.npy', best_thresholds)
print("Saved best thresholds to 'thresholds_final.npy' for use with a non-TTA inference notebook.")

--- Tuning Blending Alpha for Final Submission ---


Tuning Alpha:   0%|          | 0/4 [00:00<?, ?it/s]

Tuning Alpha:  25%|██▌       | 1/4 [00:38<01:56, 38.73s/it]

Alpha = 0.20 -> OOF F2 Score: 0.3668


Tuning Alpha:  50%|█████     | 2/4 [01:17<01:17, 38.63s/it]

Alpha = 0.15 -> OOF F2 Score: 0.3558


In [10]:
import numpy as np

print("--- Generating Final Thresholds with alpha=0.15 ---")

# Based on the interrupted tuning run, alpha=0.15 gave an OOF F2 of 0.3558.
# This is even more conservative and might generalize better. This is the final attempt.

# Load the components
per_class_thresholds = np.load('per_class_thresholds.npy')
global_threshold = 0.93
alpha = 0.15

# Create blended thresholds
blended_thresholds = alpha * per_class_thresholds + (1 - alpha) * global_threshold

# Save the final thresholds for the non-TTA inference script
np.save('thresholds_final.npy', blended_thresholds)
print("Saved final thresholds with alpha=0.15 to 'thresholds_final.npy'")
print(f"Threshold stats: min={blended_thresholds.min():.4f}, max={blended_thresholds.max():.4f}, mean={blended_thresholds.mean():.4f}")

--- Generating Final Thresholds with alpha=0.15 ---
Saved final thresholds with alpha=0.15 to 'thresholds_final.npy'
Threshold stats: min=0.8490, max=0.9405, mean=0.9351


In [7]:
# FINAL ATTEMPT: Re-evaluating Blended Thresholds with Full Guard Logic

import numpy as np
from sklearn.metrics import fbeta_score
from tqdm import tqdm

print("--- Re-evaluating Blended Thresholds (alpha=0.25) with Max-K Guard ---")

# Load necessary data
try:
    oof_preds = np.load('oof_preds.npy')
    oof_labels = np.load('oof_labels.npy')
    # This file was created by cell 16 with alpha=0.25
    blended_thresholds = np.load('blended_thresholds.npy')
    print("Loaded OOF preds, labels, and blended thresholds (alpha=0.25).")
except FileNotFoundError as e:
    print(f"ERROR: Missing a required .npy file: {e}. Please run previous cells.")
    raise e

# Define the full guarded apply_thresholds function
def apply_thresholds_guarded(probs, th):
    y = (probs > th).astype(np.uint8)
    # At-least-one fallback
    empty = y.sum(axis=1) == 0
    if np.any(empty):
        idx = probs[empty].argmax(axis=1)
        rows = np.where(empty)[0]
        y[rows, idx] = 1
    # Max-K cap
    max_k = 3
    row_sum = y.sum(axis=1)
    too_many = np.where(row_sum > max_k)[0]
    if too_many.size:
        # Using tqdm to see progress as this loop can be slow
        for r in tqdm(too_many, desc="Applying max-k cap"):
            topk = np.argpartition(-probs[r], max_k)[:max_k]
            y[r] = 0
            y[r, topk] = 1
    return y

# Evaluate with the guards
print("Applying blended thresholds with full guards...")
y_pred_guarded = apply_thresholds_guarded(oof_preds, blended_thresholds)
score = fbeta_score(oof_labels, y_pred_guarded, beta=2, average='samples', zero_division=0)

print(f"\nOOF F2 Score with Blended (alpha=0.25) + Guards: {score:.4f}")
print("Original score without max-k guard was: 0.3775")

# This seems like a promising candidate. Let's save these thresholds for the final submission.
np.save('thresholds_final.npy', blended_thresholds)
print("Saved these blended thresholds as 'thresholds_final.npy' for the final submission.")

--- Re-evaluating Blended Thresholds (alpha=0.25) with Max-K Guard ---


Loaded OOF preds, labels, and blended thresholds (alpha=0.25).
Applying blended thresholds with full guards...


Applying max-k cap:   0%|          | 0/110498 [00:00<?, ?it/s]

Applying max-k cap:   3%|▎         | 2848/110498 [00:00<00:03, 28468.42it/s]

Applying max-k cap:   5%|▌         | 5705/110498 [00:00<00:03, 28522.82it/s]

Applying max-k cap:   8%|▊         | 8562/110498 [00:00<00:03, 28543.22it/s]

Applying max-k cap:  10%|█         | 11444/110498 [00:00<00:03, 28649.90it/s]

Applying max-k cap:  13%|█▎        | 14332/110498 [00:00<00:03, 28730.13it/s]

Applying max-k cap:  16%|█▌        | 17226/110498 [00:00<00:03, 28800.53it/s]

Applying max-k cap:  18%|█▊        | 20122/110498 [00:00<00:03, 28851.32it/s]

Applying max-k cap:  21%|██        | 23008/110498 [00:00<00:03, 28806.26it/s]

Applying max-k cap:  23%|██▎       | 25889/110498 [00:00<00:02, 28800.53it/s]

Applying max-k cap:  26%|██▌       | 28770/110498 [00:01<00:02, 28732.83it/s]

Applying max-k cap:  29%|██▊       | 31644/110498 [00:01<00:02, 28671.18it/s]

Applying max-k cap:  31%|███       | 34512/110498 [00:01<00:02, 28658.45it/s]

Applying max-k cap:  34%|███▍      | 37381/110498 [00:01<00:02, 28667.87it/s]

Applying max-k cap:  36%|███▋      | 40248/110498 [00:01<00:02, 28611.86it/s]

Applying max-k cap:  39%|███▉      | 43129/110498 [00:01<00:02, 28667.65it/s]

Applying max-k cap:  42%|████▏     | 45996/110498 [00:01<00:02, 28599.13it/s]

Applying max-k cap:  44%|████▍     | 48856/110498 [00:01<00:02, 28548.35it/s]

Applying max-k cap:  47%|████▋     | 51737/110498 [00:01<00:02, 28623.52it/s]

Applying max-k cap:  49%|████▉     | 54600/110498 [00:01<00:01, 28587.86it/s]

Applying max-k cap:  52%|█████▏    | 57464/110498 [00:02<00:01, 28600.27it/s]

Applying max-k cap:  55%|█████▍    | 60343/110498 [00:02<00:01, 28656.10it/s]

Applying max-k cap:  57%|█████▋    | 63249/110498 [00:02<00:01, 28775.43it/s]

Applying max-k cap:  60%|█████▉    | 66127/110498 [00:02<00:01, 28720.26it/s]

Applying max-k cap:  62%|██████▏   | 69000/110498 [00:02<00:01, 28700.75it/s]

Applying max-k cap:  65%|██████▌   | 71871/110498 [00:02<00:01, 28702.95it/s]

Applying max-k cap:  68%|██████▊   | 74742/110498 [00:02<00:01, 28686.85it/s]

Applying max-k cap:  70%|███████   | 77643/110498 [00:02<00:01, 28783.16it/s]

Applying max-k cap:  73%|███████▎  | 80541/110498 [00:02<00:01, 28841.77it/s]

Applying max-k cap:  76%|███████▌  | 83426/110498 [00:02<00:00, 28839.01it/s]

Applying max-k cap:  78%|███████▊  | 86310/110498 [00:03<00:00, 28790.95it/s]

Applying max-k cap:  81%|████████  | 89190/110498 [00:03<00:00, 28748.09it/s]

Applying max-k cap:  83%|████████▎ | 92065/110498 [00:03<00:00, 28723.11it/s]

Applying max-k cap:  86%|████████▌ | 94941/110498 [00:03<00:00, 28732.99it/s]

Applying max-k cap:  89%|████████▊ | 97815/110498 [00:03<00:00, 28638.06it/s]

Applying max-k cap:  91%|█████████ | 100703/110498 [00:03<00:00, 28709.48it/s]

Applying max-k cap:  94%|█████████▎| 103576/110498 [00:03<00:00, 28712.57it/s]

Applying max-k cap:  96%|█████████▋| 106448/110498 [00:03<00:00, 28613.44it/s]

Applying max-k cap:  99%|█████████▉| 109310/110498 [00:03<00:00, 28608.90it/s]

Applying max-k cap: 100%|██████████| 110498/110498 [00:03<00:00, 28687.41it/s]





OOF F2 Score with Blended (alpha=0.25) + Guards: 0.2549
Original score without max-k guard was: 0.3775
Saved these blended thresholds as 'thresholds_final.npy' for the final submission.


In [6]:
import numpy as np
from sklearn.metrics import fbeta_score
import matplotlib.pyplot as plt

print("--- Tuning Shrinkage Parameter 'k' (DIAGNOSTIC RUN) ---")

# Load necessary data
try:
    oof_preds = np.load('oof_preds.npy')
    oof_labels = np.load('oof_labels.npy')
    per_class = np.load('per_class_thresholds.npy')
    print("Loaded OOF preds, labels, and per-class thresholds.")
except FileNotFoundError as e:
    print(f"ERROR: Missing a required .npy file: {e}. Please run previous cells to generate them.")
    raise e

T_global = 0.93

def apply_thresholds(probs, th):
    y = (probs > th).astype(np.uint8)
    empty = y.sum(axis=1) == 0
    if np.any(empty):
        idx = probs[empty].argmax(axis=1)
        rows = np.where(empty)[0]
        y[rows, idx] = 1
    max_k = 3
    row_sum = y.sum(axis=1)
    too_many = np.where(row_sum > max_k)[0]
    if too_many.size:
        for r in too_many:
            topk = np.argpartition(-probs[r], max_k)[:max_k]
            y[r] = 0
            y[r, topk] = 1
    return y

k_values = [5, 10, 20, 50, 100]
scores = []
counts = oof_labels.sum(axis=0).astype(np.float32)
clip = 0.4
min_samples = 5

print(f"Using Global T={T_global:.2f} and Clip={clip:.2f}")
print(f"Per-class threshold stats: min={per_class.min():.4f}, max={per_class.max():.4f}, mean={per_class.mean():.4f}")

for k in k_values:
    w = counts / (counts + k)
    T_shrunk = w * per_class + (1.0 - w) * T_global
    rare_mask = counts < min_samples
    T_shrunk[rare_mask] = T_global
    T_final = np.clip(T_shrunk, T_global - clip, T_global + clip)
    
    y_pred = apply_thresholds(oof_preds, T_final)
    score = fbeta_score(oof_labels, y_pred, beta=2, average='samples', zero_division=0)
    scores.append(score)
    print(f"k = {k:3d}, OOF F2 Score = {score:.4f}")
    # DIAGNOSTIC PRINT
    print(f"    T_final stats: min={T_final.min():.4f}, max={T_final.max():.4f}, mean={T_final.mean():.4f}")

# --- Find best and save ---
best_k_idx = np.argmax(scores)
best_k = k_values[best_k_idx]
best_score = scores[best_k_idx]
print(f"\nBest score found: {best_score:.4f} at k = {best_k}. Saving these thresholds.")

w = counts / (counts + best_k)
T_shrunk = w * per_class + (1.0 - w) * T_global
rare_mask = counts < min_samples
T_shrunk[rare_mask] = T_global
T_final = np.clip(T_shrunk, T_global - clip, T_global + clip)

np.save('thresholds_final.npy', T_final)
print("Saved new 'thresholds_final.npy'")

--- Tuning Shrinkage Parameter 'k' (DIAGNOSTIC RUN) ---


Loaded OOF preds, labels, and per-class thresholds.
Using Global T=0.93 and Clip=0.40
Per-class threshold stats: min=0.3902, max=0.9999, mean=0.9638


k =   5, OOF F2 Score = 0.2567
    T_final stats: min=0.5300, max=0.9958, mean=0.9551


k =  10, OOF F2 Score = 0.2565
    T_final stats: min=0.5300, max=0.9945, mean=0.9506


k =  20, OOF F2 Score = 0.2563
    T_final stats: min=0.5300, max=0.9922, mean=0.9454


In [None]:
print("--- Verifying the generated per_class_thresholds.npy file ---")
try:
    ths = np.load('per_class_thresholds.npy')
    print(f"Shape: {ths.shape}")
    print(f"All finite: {np.isfinite(ths).all()}")
    print(f"Min threshold: {ths.min()}")
    print(f"Max threshold: {ths.max()}")
    print(f"Mean threshold: {ths.mean()}")
except Exception as e:
    print(f"Error loading or checking file: {e}")

In [None]:
import os

print("--- Purging Stale Artifacts as per Expert Advice ---")

files_to_delete = ['oof_preds.npy', 'oof_labels.npy', 'per_class_thresholds.npy']

for f in files_to_delete:
    if os.path.exists(f):
        os.remove(f)
        print(f"Deleted stale file: {f}")
    else:
        print(f"File not found, skipping: {f}")

print("\nPurge complete. Ready to regenerate OOF with correct transforms.")

In [None]:
# 5. Key Inspection
print("--- Inspecting State Dict Keys for Fold 0 ---")

# 1. Load the saved state_dict from file
model_path = CFG.model_paths[0]
saved_state_dict = torch.load(model_path)
saved_keys = list(saved_state_dict.keys())
print(f"Loaded state_dict from {model_path}")
print(f"Number of keys in saved_state_dict: {len(saved_keys)}")
print("First 5 keys from SAVED state_dict:")
for key in saved_keys[:5]:
    print(f"  - {key}")

print("\n" + "="*50 + "\n")

# 2. Create a fresh model instance and get its state_dict
fresh_model = iMetModel(CFG.model_name, pretrained=False)
fresh_state_dict = fresh_model.state_dict()
fresh_keys = list(fresh_state_dict.keys())
print("Created a fresh iMetModel instance")
print(f"Number of keys in fresh_model.state_dict(): {len(fresh_keys)}")
print("First 5 keys from FRESH model's state_dict:")
for key in fresh_keys[:5]:
    print(f"  - {key}")

# 3. Compare the key sets
saved_keys_set = set(saved_keys)
fresh_keys_set = set(fresh_keys)

if saved_keys_set == fresh_keys_set:
    print("\nSUCCESS: The key sets are identical.")
else:
    print("\nERROR: The key sets are different.")
    print(f"Keys in saved but not in fresh: {saved_keys_set - fresh_keys_set}")
    print(f"Keys in fresh but not in saved: {fresh_keys_set - saved_keys_set}")

del saved_state_dict, fresh_model, fresh_state_dict
gc.collect()

In [None]:
# 6. Final Sanity Check: Verify Weight Loading

print("--- Verifying Weight Loading for Fold 0 ---")

# 1. Instantiate model and load state_dict from file
model_path = CFG.model_paths[0]
state_dict_from_file = torch.load(model_path, map_location=CFG.device)

model = iMetModel(CFG.model_name, pretrained=True).to(CFG.device)
model.load_state_dict(state_dict_from_file)
model.eval()

# 2. Get the state_dict from the model *after* loading
state_dict_from_model = model.state_dict()

# 3. Pick a key and compare tensors
key_to_check = 'model.conv_stem.weight'

tensor_from_file = state_dict_from_file[key_to_check]
tensor_from_model = state_dict_from_model[key_to_check]

print(f"Checking key: '{key_to_check}'")
print(f"Tensor from file (sum): {tensor_from_file.sum().item()}")
print(f"Tensor from model (sum): {tensor_from_model.sum().item()}")

# 4. The ultimate test
are_equal = torch.equal(tensor_from_file, tensor_from_model)
print(f"Are the tensors identical? -> {are_equal}")

assert are_equal, "FATAL: Tensors do not match after load_state_dict. The weights are not being loaded correctly."

print("\nSUCCESS: Weight loading verified. The tensor from the file matches the tensor in the model.")

# 5. Check another key (from the end of the model)
key_to_check = 'model.classifier.weight'
tensor_from_file = state_dict_from_file[key_to_check]
tensor_from_model = state_dict_from_model[key_to_check]
print(f"\nChecking key: '{key_to_check}'")
print(f"Tensor from file (sum): {tensor_from_file.sum().item()}")
print(f"Tensor from model (sum): {tensor_from_model.sum().item()}")
are_equal = torch.equal(tensor_from_file, tensor_from_model)
print(f"Are the tensors identical? -> {are_equal}")
assert are_equal, "FATAL: Tensors do not match for the classifier layer."
print("\nSUCCESS: Classifier weights also verified.")


del model, state_dict_from_file, state_dict_from_model
gc.collect()

In [None]:
# 7. Hypothesis Check: Are the saved models identical?
print("--- Checking if fold 0 and fold 1 models are identical ---")

# 1. Load state dicts for two different folds
model_path_0 = CFG.model_paths[0]
model_path_1 = CFG.model_paths[1]

state_dict_0 = torch.load(model_path_0, map_location=CFG.device)
state_dict_1 = torch.load(model_path_1, map_location=CFG.device)

# 2. Compare a key tensor (e.g., the final classifier weights)
key_to_check = 'model.classifier.weight'
tensor_0 = state_dict_0[key_to_check]
tensor_1 = state_dict_1[key_to_check]

print(f"Checking key: '{key_to_check}'")
print(f"Tensor from Fold 0 (sum): {tensor_0.sum().item()}")
print(f"Tensor from Fold 1 (sum): {tensor_1.sum().item()}")

# 3. The crucial test
are_equal = torch.equal(tensor_0, tensor_1)
print(f"Are the tensors for Fold 0 and Fold 1 identical? -> {are_equal}")

if are_equal:
    print("\nFATAL DIAGNOSIS: The model files for different folds are identical. This strongly suggests the training script saved the same initial weights instead of the trained weights for each fold.")
else:
    print("\nINFO: The model files are different, as expected. This hypothesis is incorrect.")

del state_dict_0, state_dict_1
gc.collect()