In [None]:
import os
# Must run before PyTorch is imported: opt the CUDA allocator into expandable
# segments to reduce memory fragmentation (as suggested by the OOM error message).
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
# Echo the effective value so the cell output confirms the setting took hold.
print("PYTORCH_CUDA_ALLOC_CONF set to:", os.environ.get('PYTORCH_CUDA_ALLOC_CONF'))

# SIIM-COVID19: Classifier Training

This notebook focuses on the second part of the two-model pipeline: training an image classifier for the study-level task.

## Plan
1.  **Setup:** Load data and define configurations.
2.  **Dataset & DataLoaders:** Create a PyTorch `Dataset` to load the pre-processed PNG images and their corresponding study-level labels.
3.  **Model Definition:** Use `timm` to create a pre-trained EfficientNet model.
4.  **Training Loop:** Implement a standard training and validation loop.
5.  **Train Model:** Train the classifier on a single fold to establish a baseline.

In [None]:
import os
import random

import numpy as np
import pandas as pd
import timm
import torch
from sklearn.model_selection import StratifiedGroupKFold
from tqdm import tqdm

# --- Configuration ---
DATA_DIR = './'          # directory holding the competition CSV files
PNG_DIR = 'train_png/'   # directory holding the pre-processed PNG images
N_SPLITS = 5             # number of cross-validation folds
RANDOM_STATE = 42        # single seed shared by the fold split and all RNGs below

# Seed every random number generator the training run can draw from, so that
# shuffling, augmentation and weight initialisation are repeatable after
# "Restart & Run All". Some CUDA kernels are still non-deterministic, so
# GPU runs may differ in the last digits.
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

print(f"PyTorch version: {torch.__version__}")
print(f"timm version: {timm.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

In [None]:
# --- Load Data and Create Folds ---
print("Loading and preparing metadata...")

# Load original data
df_study = pd.read_csv(os.path.join(DATA_DIR, 'train_study_level.csv'))
df_image = pd.read_csv(os.path.join(DATA_DIR, 'train_image_level.csv'))

# Strip the '_study' / '_image' suffixes with a vectorised literal replace so
# the ids can serve as the join key and as PNG file stems.
df_study['StudyInstanceUID'] = df_study['id'].str.replace('_study', '', regex=False)
df_image['image_id'] = df_image['id'].str.replace('_image', '', regex=False)

# Attach the study-level labels to every image. `validate` makes pandas raise
# if a study id appears more than once in df_study, which would otherwise
# silently duplicate image rows.
df_merged = df_image.merge(df_study, on='StudyInstanceUID', how='left', validate='many_to_one')

# Add image path
df_merged['image_path'] = df_merged['image_id'].map(
    lambda image_id: os.path.join(PNG_DIR, f"{image_id}.png")
)

# Create a single target column for stratification and labels
label_cols = ['Negative for Pneumonia', 'Typical Appearance', 'Indeterminate Appearance', 'Atypical Appearance']

# A left merge leaves NaN labels for images whose study is absent from
# df_study; fail loudly here instead of producing a bogus label further down.
rows_without_labels = df_merged[label_cols].isna().any(axis=1)
assert not rows_without_labels.any(), (
    f"{int(rows_without_labels.sum())} image rows have no study-level labels after the merge"
)

df_merged['label_name'] = df_merged[label_cols].idxmax(axis=1)
label_map = {name: i for i, name in enumerate(label_cols)}
df_merged['label_id'] = df_merged['label_name'].map(label_map)

# Create Folds (reproducing the exact same folds as in the detector notebook)
# Folds are assigned once per study, so all images of a study share a fold.
df_folds = df_merged.drop_duplicates('StudyInstanceUID').reset_index(drop=True)
sgkf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
groups = df_folds['StudyInstanceUID']
y_stratify = df_folds['label_name']
df_folds['fold'] = -1
for fold, (_, val_idx) in enumerate(sgkf.split(df_folds, y_stratify, groups)):
    # val_idx holds positions, which equal index labels after reset_index(drop=True).
    df_folds.loc[val_idx, 'fold'] = fold
assert (df_folds['fold'] >= 0).all(), "some studies were not assigned to a validation fold"

# Merge fold info back into the main dataframe
df_merged = df_merged.merge(df_folds[['StudyInstanceUID', 'fold']], on='StudyInstanceUID', how='left')
assert df_merged['fold'].notna().all(), "some images did not receive a fold"

print("Data loaded and folds created.")
print(f"Total images: {len(df_merged)}")
print("Fold distribution:")
print(df_merged['fold'].value_counts())
df_merged.head()

In [None]:
import cv2
import albumentations as A
from albumentations.pytorch import ToTensorV2
from torch.utils.data import Dataset, DataLoader

# --- Dataset Class ---
class SIIMClassifierDataset(Dataset):
    """Map-style dataset that reads an image from disk and pairs it with a class id.

    Args:
        df: DataFrame with an ``image_path`` column (paths passed to
            ``cv2.imread``) and a ``label_id`` column (integer class ids).
        transform: Optional callable invoked as ``transform(image=<RGB array>)``
            that returns a mapping with an ``'image'`` entry.
    """

    def __init__(self, df, transform=None):
        self.df = df
        # Extract the columns once so __getitem__ does not index the DataFrame per sample.
        self.image_paths = df['image_path'].values
        self.labels = df['label_id'].values
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            ``(image, label)`` where ``image`` is the RGB image (or whatever
            ``transform`` produced from it) and ``label`` is a 0-d
            ``torch.long`` tensor.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        image_path = self.image_paths[idx]
        image = cv2.imread(image_path)
        # cv2.imread signals failure by returning None rather than raising;
        # without this check the error surfaces later as an opaque cvtColor
        # assertion that does not name the offending file.
        if image is None:
            raise FileNotFoundError(
                f"Could not read image at '{image_path}' "
                "(file is missing, unreadable, or not a valid image)"
            )
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        if self.transform:
            image = self.transform(image=image)['image']

        return image, label

# --- Augmentations ---
def get_transforms(img_size, is_train=True):
    """Build the Albumentations pipeline for training or evaluation.

    Both variants resize to ``img_size`` x ``img_size``, normalise with the
    fixed per-channel mean/std below, and convert to a tensor. The training
    variant additionally inserts random flip, shift/scale/rotate and
    brightness/contrast steps between the resize and the normalisation.

    Args:
        img_size: Target height and width passed to ``A.Resize``.
        is_train: When truthy, include the random augmentation steps.

    Returns:
        An ``A.Compose`` object wrapping the ordered list of transforms.
    """
    steps = [A.Resize(img_size, img_size)]

    if is_train:
        # Random augmentations, each applied with probability 0.5.
        steps += [
            A.HorizontalFlip(p=0.5),
            A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.1, rotate_limit=15, p=0.5),
            A.RandomBrightnessContrast(p=0.5),
        ]

    # Shared tail: normalisation followed by conversion to a tensor.
    steps += [
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2(),
    ]
    return A.Compose(steps)

print("Dataset class and augmentation functions defined.")

In [None]:
# --- Create DataLoaders ---
IMG_SIZE = 512
BATCH_SIZE = 16
NUM_WORKERS = 4
FOLD_TO_TRAIN = 0

print(f"Preparing DataLoaders for Fold {FOLD_TO_TRAIN}...")

# Rows whose fold equals FOLD_TO_TRAIN are held out for validation;
# every other row is used for training.
df_train = df_merged[df_merged['fold'].ne(FOLD_TO_TRAIN)].reset_index(drop=True)
df_val = df_merged[df_merged['fold'].eq(FOLD_TO_TRAIN)].reset_index(drop=True)

# Training data gets the random augmentations, validation data does not.
train_dataset = SIIMClassifierDataset(df_train, transform=get_transforms(IMG_SIZE, is_train=True))
val_dataset = SIIMClassifierDataset(df_val, transform=get_transforms(IMG_SIZE, is_train=False))

# Settings common to both loaders; only the shuffle flag differs.
loader_options = dict(batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=True)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Train batches: {len(train_loader)}, Val batches: {len(val_loader)}")

In [None]:
# --- Model, Loss, Optimizer ---
MODEL_NAME = 'tf_efficientnet_b4_ns'
NUM_CLASSES = 4
LEARNING_RATE = 1e-4

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

print(f"Creating model: {MODEL_NAME}")
# Pre-trained backbone with a NUM_CLASSES-way classification head, moved to `device`.
model = timm.create_model(MODEL_NAME, pretrained=True, num_classes=NUM_CLASSES).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.CrossEntropyLoss()

print(f"Model loaded on {device}")
print("Loss function: CrossEntropyLoss")
print(f"Optimizer: AdamW with LR={LEARNING_RATE}")

In [None]:
from tqdm import tqdm
import torch
import numpy as np
from sklearn.metrics import roc_auc_score
from torch.cuda.amp import autocast

def train_one_epoch(model, train_loader, optimizer, criterion, device, scaler):
    """Run one mixed-precision optimisation pass over ``train_loader``.

    Args:
        model: Module to train; switched to training mode.
        train_loader: Iterable of ``(images, labels)`` batches that supports ``len()``.
        optimizer: Optimizer stepped once per batch through ``scaler``.
        criterion: Callable ``criterion(outputs, labels)`` returning the batch loss.
        device: Device the batches are moved to.
        scaler: Gradient scaler providing ``scale``, ``step`` and ``update``.

    Returns:
        The sum of the per-batch loss values divided by the number of batches.
    """
    model.train()
    running_loss = 0

    progress_bar = tqdm(train_loader, desc="Training", leave=False)
    for images, labels in progress_bar:
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)

        # Forward pass and loss are evaluated under autocast.
        with autocast():
            loss = criterion(model(images), labels)

        # Backward on the scaled loss, then step the optimizer and refresh the scaler.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Read the scalar once and reuse it for the total and the progress bar.
        batch_loss = loss.item()
        running_loss += batch_loss
        progress_bar.set_postfix(loss=batch_loss)

    num_batches = len(train_loader)
    return running_loss / num_batches

def validate_one_epoch(model, val_loader, criterion, device):
    """Evaluate ``model`` on ``val_loader`` with gradient tracking disabled.

    Args:
        model: Module to evaluate; switched to eval mode.
        val_loader: Iterable of ``(images, labels)`` batches that supports ``len()``.
        criterion: Callable ``criterion(outputs, labels)`` returning the batch loss.
        device: Device the batches are moved to.

    Returns:
        ``(avg_loss, auc_score)``: the mean of the per-batch losses, and the
        one-vs-one multi-class ROC AUC over all collected predictions.
        ``auc_score`` is ``-1`` when ``roc_auc_score`` raises ``ValueError``;
        the reason is printed in that case.
    """
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        progress_bar = tqdm(val_loader, desc="Validating", leave=False)
        for images, labels in progress_bar:
            images, labels = images.to(device, non_blocking=True), labels.to(device, non_blocking=True)

            with autocast():
                outputs = model(images)
                loss = criterion(outputs, labels)

            total_loss += loss.item()

            # Under autocast the logits can be float16. A half-precision
            # softmax yields rows that do not sum to 1 within the tolerance
            # roc_auc_score enforces for multi-class scores, so upcast first.
            probs = torch.softmax(outputs.float(), dim=1)
            all_preds.append(probs.cpu().numpy())
            all_labels.append(labels.cpu().numpy())

    avg_loss = total_loss / len(val_loader)

    all_preds = np.concatenate(all_preds)
    all_labels = np.concatenate(all_labels)

    try:
        auc_score = roc_auc_score(all_labels, all_preds, multi_class='ovo', labels=np.unique(all_labels))
    except ValueError as err:
        # Keep the -1 sentinel callers already expect, but say why the metric
        # is unavailable instead of hiding the cause.
        print(f"  [warning] Validation AUC could not be computed: {err}")
        auc_score = -1

    return avg_loss, auc_score

print("Training and validation functions defined (with AMP).")

In [None]:
import time
from torch.cuda.amp import GradScaler

NUM_EPOCHS = 10 # Start with a reasonable number of epochs for the baseline
BEST_MODEL_PATH = f'classifier_fold{FOLD_TO_TRAIN}_best.pth'

scaler = GradScaler()
# Lowest validation loss seen so far; any finite loss beats the initial value.
best_val_loss = float('inf')

print("--- Starting Classifier Training with AMP ---")

for epoch in range(NUM_EPOCHS):
    start_time = time.time()

    # One training pass followed by one validation pass.
    train_loss = train_one_epoch(model, train_loader, optimizer, criterion, device, scaler)
    val_loss, val_auc = validate_one_epoch(model, val_loader, criterion, device)

    elapsed_time = time.time() - start_time

    # One summary line per epoch, fields separated by " | ".
    summary_fields = [
        f"Epoch {epoch+1}/{NUM_EPOCHS}",
        f"Time: {elapsed_time:.0f}s",
        f"Train Loss: {train_loss:.4f}",
        f"Val Loss: {val_loss:.4f}",
        f"Val AUC: {val_auc:.4f}",
    ]
    print(" | ".join(summary_fields))

    # Checkpoint only on a strict improvement of the validation loss.
    improved = val_loss < best_val_loss
    if improved:
        best_val_loss = val_loss
        torch.save(model.state_dict(), BEST_MODEL_PATH)
        print(f"  -> New best model saved to {BEST_MODEL_PATH} (Val Loss: {best_val_loss:.4f})")

print("\n--- Classifier Training Finished ---")