# Plan: TGS Salt Identification Challenge

Objectives:
- Establish GPU-ready environment and fast baseline segmentation pipeline.
- Robust CV with folds saved; produce OOF + test predictions.
- Target model: U-Net with an ImageNet-pretrained encoder, trained with a BCE + Dice loss; add TTA and postprocessing.

Data & Metric:
- Images: 101x101 grayscale; Masks same size. depths.csv provides scalar feature.
- Metric: mean precision over IoU thresholds 0.50 to 0.95 (step 0.05), averaged over images.

Validation:
- 5-fold StratifiedKFold by mask coverage buckets (e.g., 0, (0,0.1], ..., (0.9,1]).
- Deterministic seeds; save folds indices to disk.

Baseline Model:
- PyTorch + timm: U-Net (or FPN/DeepLabV3) with ResNet34/50 encoder.
- Input: pad to 128x128; 2 channels [image, depth_norm] or concat depth via broadcast.
- Loss: BCEWithLogits + Dice (e.g., 0.5/0.5).
- Optimizer: AdamW; Scheduler: Cosine or OneCycle. Mixed precision.
- Augment: flips, shifts, slight rotate, brightness/contrast, elastic (light).

Inference:
- TTA: average logits over the 4 hflip/vflip combinations.
- Postprocess: sigmoid -> threshold tuning on OOF; remove small objects, fill holes.

Milestones:
1) Env check (GPU), install torch/cu121 + libs.
2) EDA: verify files, shapes, coverage distribution; leak check.
3) Data pipeline + folds saving.
4) Baseline train 5-10 epochs to verify; get OOF score proxy (Dice).
5) Full training 30-50 epochs with early stopping; log per-fold times.
6) TTA + postproc threshold search; generate submission.csv.
7) If time: larger encoder, blend seeds/models.

We will request expert review at: plan, after EDA, after baseline, after tuning, and before final submit.

In [None]:
import os, sys, subprocess, time, json, shutil
from pathlib import Path
import pandas as pd

# --- Environment check: GPU visibility, torch build, directory listing, data layout ---
print('=== GPU CHECK: nvidia-smi ===', flush=True)
try:
    # `|| true` keeps the shell exit status at zero even if nvidia-smi fails.
    smi = subprocess.run(
        ['bash', '-lc', 'nvidia-smi || true'],
        capture_output=True,
        text=True,
        check=False,
    )
    print(smi.stdout)
except Exception as err:
    print('nvidia-smi error:', err)

print('=== Torch CUDA Sanity ===', flush=True)
try:
    import torch
    cuda_build = getattr(torch.version, 'cuda', None)
    print('torch:', torch.__version__, 'CUDA build:', cuda_build)
    has_cuda = torch.cuda.is_available()
    print('CUDA available:', has_cuda)
    if has_cuda:
        print('GPU:', torch.cuda.get_device_name(0))
except Exception as err:
    # Diagnostic cell only: report the problem and keep going.
    print('Torch not available or error:', err)

print('=== List files ===', flush=True)
for entry in sorted(Path('.').iterdir()):
    try:
        size_or_kind = 'dir' if entry.is_dir() else entry.stat().st_size
        unit = 'bytes' if entry.is_file() else ''
        print(entry, '->', size_or_kind, unit)
    except Exception:
        # Fall back to printing just the path if the entry cannot be inspected.
        print(entry)

print('=== Peek CSVs ===', flush=True)
for csv_name in ['train.csv', 'depths.csv', 'sample_submission.csv']:
    if not Path(csv_name).exists():
        continue
    frame = pd.read_csv(csv_name)
    print(csv_name, frame.shape)
    print(frame.head(3))

print('=== Train/Test folders ===', flush=True)
for split in ['train', 'test']:
    root = Path(split)
    if not root.exists():
        continue
    img_sample = list(root.glob('images/*.png'))[:3]
    if (root/'masks').exists():
        mask_sample = list(root.glob('masks/*.png'))[:3]
    else:
        mask_sample = []
    print(split, 'images sample:', [str(x.name) for x in img_sample])
    print(split, 'masks sample:', [str(x.name) for x in mask_sample])

print('ENV CHECK COMPLETE', flush=True)

In [None]:
import os, sys, subprocess, shutil, time
from pathlib import Path

def pip(*args):
    """Echo and run ``<python> -m pip <args...>`` with the current interpreter.

    A non-zero pip exit status raises ``subprocess.CalledProcessError``
    because the command is run with ``check=True``.
    """
    print('> pip', *args, flush=True)
    command = [sys.executable, '-m', 'pip']
    command.extend(args)
    subprocess.run(command, check=True)

# --- Reinstall the torch stack from the cu121 wheel index, then the segmentation deps ---
print('=== Install CUDA 12.1 Torch stack ===', flush=True)
# Best-effort uninstall: a failure (e.g. package not installed) is ignored.
for pkg in ('torch','torchvision','torchaudio'):
    uninstall_cmd = [sys.executable, '-m', 'pip', 'uninstall', '-y', pkg]
    subprocess.run(uninstall_cmd, check=False)

# Leftover install directories that are deleted when present.
stale_paths = (
    '/app/.pip-target/torch',
    '/app/.pip-target/torchvision',
    '/app/.pip-target/torchaudio',
    '/app/.pip-target/torch-2.4.1.dist-info',
    '/app/.pip-target/torchvision-0.19.1.dist-info',
    '/app/.pip-target/torchaudio-2.4.1.dist-info',
    '/app/.pip-target/torchgen',
    '/app/.pip-target/functorch',
)
for d in stale_paths:
    if not os.path.exists(d):
        continue
    print('Removing', d, flush=True)
    shutil.rmtree(d, ignore_errors=True)

pip(
    'install',
    '--index-url', 'https://download.pytorch.org/whl/cu121',
    '--extra-index-url', 'https://pypi.org/simple',
    'torch==2.4.1', 'torchvision==0.19.1', 'torchaudio==2.4.1',
)

# Constraints file so the next install cannot move the torch pins.
Path('constraints.txt').write_text('torch==2.4.1\ntorchvision==0.19.1\ntorchaudio==2.4.1\n')

print('=== Install segmentation deps (honor constraints) ===', flush=True)
seg_packages = [
    'segmentation-models-pytorch==0.3.3',
    'timm',
    'albumentations',
    'opencv-python-headless',
    'scikit-image',
    'scikit-learn',
    'scipy',
    'pandas',
    'numpy',
]
pip('install', '-c', 'constraints.txt', *seg_packages, '--upgrade-strategy', 'only-if-needed')

print('=== Sanity check torch/CUDA ===', flush=True)
import torch
built_cuda = getattr(torch.version, 'cuda', None)
print('torch:', torch.__version__, 'built CUDA:', built_cuda, flush=True)
print('CUDA available:', torch.cuda.is_available(), flush=True)
# Fail the cell loudly if the wrong wheel was installed or no GPU is usable.
assert str(getattr(torch.version,'cuda','')).startswith('12.1'), f'Wrong CUDA build: {torch.version.cuda}'
assert torch.cuda.is_available(), 'CUDA not available after install'
print('GPU:', torch.cuda.get_device_name(0), flush=True)
print('INSTALL COMPLETE', flush=True)

In [None]:
# Folds + utilities: coverage bins, depth norm stats, RLE, pad/crop
import os, gc, math, time, json
from pathlib import Path
import numpy as np
import pandas as pd
import cv2
from sklearn.model_selection import StratifiedKFold

# Dataset locations, all relative to the working directory.
DATA_DIR = Path('.')
TRAIN_IMG_DIR = DATA_DIR.joinpath('train/images')
TRAIN_MASK_DIR = DATA_DIR.joinpath('train/masks')
TEST_IMG_DIR = DATA_DIR.joinpath('test/images')
# Output directory for folds / stats; created on first run.
OUT_DIR = DATA_DIR.joinpath('out')
OUT_DIR.mkdir(exist_ok=True)

def read_gray(path):
    """Read the image at `path` with OpenCV in grayscale mode.

    Raises:
        FileNotFoundError: when ``cv2.imread`` returns ``None``.
    """
    gray = cv2.imread(str(path), cv2.IMREAD_GRAYSCALE)
    if gray is not None:
        return gray
    raise FileNotFoundError(path)

def reflect_pad_to_128(img):
    """Reflect-pad a 101x101 image to 128x128 (BORDER_REFLECT_101).

    The 27 extra rows/columns are split 13 before and 14 after, so the
    original content starts at offset 13 on both axes.
    """
    h, w = img.shape[:2]
    assert h == 101 and w == 101, f'Unexpected shape: {img.shape}'
    # divmod gives the leading pad and the odd remainder that goes to the trailing side.
    top, extra_rows = divmod(128 - h, 2)
    left, extra_cols = divmod(128 - w, 2)
    bottom = top + extra_rows
    right = left + extra_cols
    return cv2.copyMakeBorder(img, top, bottom, left, right, borderType=cv2.BORDER_REFLECT_101)

def crop_center_101(img):
    """Return the central 101x101 window of a 128x128 array (offset 13 on both axes)."""
    h, w = img.shape[:2]
    assert h == 128 and w == 128, f'Unexpected shape: {img.shape}'
    side = 101
    row_start, col_start = (h - side) // 2, (w - side) // 2
    window = (
        slice(row_start, row_start + side),
        slice(col_start, col_start + side),
    )
    return img[window]

def rle_encode(mask):
    """Run-length encode a 2D binary mask in column-major order.

    Args:
        mask: 2D array (H, W); any non-zero value counts as foreground.

    Returns:
        A string of space-separated ``start length`` pairs with 1-indexed
        starts, or ``''`` when the mask has no foreground pixels.

    The previous loop missed runs starting at the first pixel, compared each
    pixel with itself when looking for run ends, and therefore emitted wrong
    (or no) lengths.
    """
    # Transpose before flattening so pixels are numbered top-to-bottom, then left-to-right.
    pixels = (np.asarray(mask).T.flatten() != 0).astype(np.uint8)
    # Zero sentinels on both sides make runs touching either end produce a transition.
    padded = np.concatenate(([0], pixels, [0]))
    # 1-indexed positions where the value changes: run start, run end+1, run start, ...
    boundaries = np.flatnonzero(padded[1:] != padded[:-1]) + 1
    # Turn each (start, end+1) pair into (start, length).
    boundaries[1::2] -= boundaries[::2]
    return ' '.join(map(str, boundaries))

def coverage_of_mask(mask):
    """Return the sum of `mask` divided by its element count, as a Python float."""
    covered = float(mask.sum())
    total = float(mask.size)
    return covered / total

# --- Scan train ids, store depth normalisation stats, build coverage-stratified folds ---
print('Scanning train ids...', flush=True)
train_ids = sorted(p.stem for p in TRAIN_IMG_DIR.glob('*.png'))
print('Train count:', len(train_ids), flush=True)

print('Load depths...', flush=True)
depths = pd.read_csv(DATA_DIR/'depths.csv')
# Align depths to the train ids; ids missing from depths.csv become NaN rows.
depths = depths.set_index('id').reindex(train_ids)
z_vals = depths['z'].values.astype(np.float32)
# nan-aware min/max so NaN rows from the reindex do not poison the stats.
z_min, z_max = float(np.nanmin(z_vals)), float(np.nanmax(z_vals))
print('Depth z_min/z_max:', z_min, z_max, flush=True)
# Context manager guarantees the JSON is flushed and the handle closed.
with open(OUT_DIR/'depth_norm.json', 'w') as f:
    json.dump({'z_min': z_min, 'z_max': z_max}, f)

print('Compute coverage for stratification...', flush=True)
coverages = []
t0 = time.time()
for i, tid in enumerate(train_ids):
    m = read_gray(TRAIN_MASK_DIR/f'{tid}.png')
    m = (m>127).astype(np.uint8)  # binarise the mask
    coverages.append(coverage_of_mask(m))
    if (i+1)%500==0:
        print(f'  processed {i+1}/{len(train_ids)} in {time.time()-t0:.1f}s', flush=True)
coverages = np.array(coverages, dtype=np.float32)

print('Build stratification bins...', flush=True)
# right=True: coverage 0 -> bin 0, (0, 0.1] -> bin 1, ..., (0.9, 1] -> bin 10.
bins = np.digitize(coverages, bins=np.linspace(0.0, 1.0, 11), right=True)
empty_bin = (coverages == 0.0).astype(int)  # ensure empties separation
y_strat = bins + 100*empty_bin  # combine

folds_csv = OUT_DIR/'folds.csv'
if folds_csv.exists():
    print('folds.csv exists; will overwrite to ensure determinism', flush=True)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rows = []
for fold, (trn_idx, val_idx) in enumerate(skf.split(train_ids, y_strat)):
    # Each id appears in exactly one validation split; that split is its fold.
    for idx in val_idx:
        rows.append({'id': train_ids[idx], 'fold': fold})
# Restore the sorted train_ids order before saving.
folds_df = pd.DataFrame(rows).set_index('id').loc[train_ids].reset_index()
folds_df.to_csv(folds_csv, index=False)
print('Saved folds to', folds_csv, flush=True)
print(folds_df['fold'].value_counts().sort_index())

print('Done folds/utilities setup.', flush=True)

In [None]:
# Training: SMP UNet-ResNet34, 2-channel (image + depth), 5-fold CV, AMP, OneCycle, HFlip TTA
import os, time, math, json, gc
from pathlib import Path
import numpy as np
import pandas as pd
import cv2
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
import albumentations as A
import segmentation_models_pytorch as smp
from skimage.morphology import remove_small_objects
from scipy.ndimage import binary_fill_holes

# Dataset and output locations, relative to the working directory.
DATA_DIR = Path('.')
TRAIN_IMG_DIR, TRAIN_MASK_DIR = DATA_DIR/'train/images', DATA_DIR/'train/masks'
TEST_IMG_DIR = DATA_DIR/'test/images'
OUT_DIR = DATA_DIR/'out'
# Create the output directory if it does not exist yet.
OUT_DIR.mkdir(exist_ok=True)

# Helpers from the folds cell, redefined here so this cell can run on its own
def read_gray(path):
    """Load `path` as a grayscale image via OpenCV; raise FileNotFoundError if it cannot be read."""
    loaded = cv2.imread(str(path), cv2.IMREAD_GRAYSCALE)
    if loaded is not None:
        return loaded
    raise FileNotFoundError(path)
def reflect_pad_to_128(img):
    """Reflect-pad a 101x101 image to 128x128 (13 px before, 14 px after on each axis)."""
    rows, cols = img.shape[:2]
    assert (rows, cols) == (101, 101)
    top = (128 - rows) // 2
    bottom = 128 - rows - top
    left = (128 - cols) // 2
    right = 128 - cols - left
    return cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_REFLECT_101)
def crop_center_101(img):
    """Return the central 101x101 window of a 128x128 array."""
    rows, cols = img.shape[:2]
    assert (rows, cols) == (128, 128)
    top = (rows - 101) // 2
    left = (cols - 101) // 2
    return img[top:top + 101, left:left + 101]
def rle_encode(mask):
    """Column-major run-length encoding: space-separated ``start length`` pairs, 1-indexed.

    Any non-zero pixel is foreground; a mask with no foreground yields ``''``.
    """
    flat = mask.T.flatten()
    foreground = (flat != 0).astype(np.int8)
    # Zero sentinels so runs touching the first/last pixel still show a +1/-1 step.
    steps = np.diff(np.concatenate(([0], foreground, [0])))
    starts = np.flatnonzero(steps == 1) + 1   # 1-indexed first pixel of each run
    stops = np.flatnonzero(steps == -1) + 1   # 1-indexed position just past each run
    pairs = np.column_stack((starts, stops - starts)).ravel()
    return ' '.join(map(str, pairs.tolist()))

def tgs_metric(y_true, y_pred):
    """Mean over IoU thresholds 0.50..0.95 (step 0.05) of the fraction of images whose IoU exceeds the threshold.

    Args:
        y_true: (N, H, W) ground-truth masks; cast to bool.
        y_pred: (N, H, W) predicted masks; cast to bool.

    Returns:
        Scalar score in [0, 1]. An image where both masks are empty counts
        as IoU 1; an empty ground truth with a non-empty prediction is IoU 0.
    """
    y_true = np.asarray(y_true, dtype=bool)
    y_pred = np.asarray(y_pred, dtype=bool)
    # IoU does not depend on the threshold, so compute it once.
    # float64 keeps borderline comparisons independent of NumPy's scalar promotion rules.
    inter = (y_true & y_pred).sum(axis=(1, 2)).astype(np.float64)
    union = (y_true | y_pred).sum(axis=(1, 2)).astype(np.float64)
    # Default of 1 covers union == 0 (both masks empty); dividing only where
    # union > 0 avoids the 0/0 RuntimeWarning.
    iou = np.ones_like(union)
    np.divide(inter, union, out=iou, where=union > 0)
    thresholds = np.arange(0.5, 1.0, 0.05)
    scores = [(iou > t).mean() for t in thresholds]
    return np.mean(scores)

class SaltDataset(Dataset):
    """Dataset yielding ``(image, mask, id)`` with a 2-channel image: grayscale + constant depth.

    Args:
        ids: sequence of image ids (file stems).
        depths_df: DataFrame indexed by id with a ``'z'`` column.
        z_min, z_max: range used to min-max normalise the depth value.
        aug: optional callable invoked as ``aug(image=HxWx2, mask=HxW)`` that
            returns a dict with ``'image'`` and ``'mask'``.
        is_train: when True, read image and mask from the train folders;
            otherwise read the image from the test folder and use a zero mask.
    """

    def __init__(self, ids, depths_df, z_min, z_max, aug=None, is_train=True):
        self.ids = ids
        self.depths = depths_df
        self.z_min = z_min
        self.z_max = z_max
        self.aug = aug
        self.is_train = is_train

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        """Return ``(float image tensor (2,128,128), float mask tensor (1,128,128), id)``."""
        _id = self.ids[idx]
        img_dir = TRAIN_IMG_DIR if self.is_train else TEST_IMG_DIR
        img = read_gray(img_dir/f'{_id}.png')
        img = reflect_pad_to_128(img).astype(np.float32)/255.0
        if self.is_train:
            mask = read_gray(TRAIN_MASK_DIR/f'{_id}.png')
            mask = reflect_pad_to_128(mask)
            mask = (mask>127).astype(np.float32)
        else:
            # No ground truth for test images; placeholder keeps the return shape uniform.
            mask = np.zeros_like(img, dtype=np.float32)
        # Depth channel: a missing id raises KeyError here. The former fallback
        # branch could only raise the same KeyError, so it was dropped.
        z = float(self.depths.loc[_id, 'z'])
        z_norm = (z - self.z_min) / max(1e-6, (self.z_max - self.z_min))
        depth_ch = np.full_like(img, z_norm, dtype=np.float32)
        img2 = np.stack([img, depth_ch], axis=0)  # (2,128,128)
        if self.aug is not None:
            # The augmentation pipeline is fed channels-last and converted back afterwards.
            # NOTE(review): intensity augmentations also see the depth channel - confirm intended.
            data = {'image': img2.transpose(1,2,0), 'mask': mask}
            data = self.aug(**data)
            im = data['image'].transpose(2,0,1)
            mk = data['mask']
        else:
            im = img2
            mk = mask
        return torch.from_numpy(im).float(), torch.from_numpy(mk[None]).float(), _id

def get_augs():
    """Return ``(train_tfms, val_tfms)`` albumentations pipelines.

    Training uses horizontal flip, shift/scale/rotate, grid distortion and
    brightness/contrast jitter; validation is an empty (identity) pipeline.
    """
    reflect = cv2.BORDER_REFLECT_101
    train_steps = [
        A.HorizontalFlip(p=0.5),
        A.ShiftScaleRotate(
            shift_limit=0.0625,
            scale_limit=0.1,
            rotate_limit=10,
            border_mode=reflect,
            p=0.7,
        ),
        A.GridDistortion(num_steps=3, distort_limit=0.05, border_mode=reflect, p=0.2),
        A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.3),
    ]
    return A.Compose(train_steps), A.Compose([])

def bce_dice_loss():
    """Build a loss callable: ``0.5 * BCEWithLogits + 0.5 * soft Dice``.

    The returned ``loss_fn(logits, targets)`` takes 4D tensors; the Dice term
    is summed over dims (2, 3) and averaged over the remaining dims.
    """
    bce_term = nn.BCEWithLogitsLoss()

    def _soft_dice(logits, targets, eps=1e-6):
        # Squared-denominator Dice; eps guards against an all-zero denominator.
        p = torch.sigmoid(logits)
        overlap = 2 * (p * targets).sum(dim=(2, 3))
        denom = (p * p + targets * targets).sum(dim=(2, 3)) + eps
        per_item = 1 - (overlap / denom)
        return per_item.mean()

    def loss_fn(logits, targets):
        bce_part = bce_term(logits, targets)
        dice_part = _soft_dice(logits, targets)
        return 0.5 * bce_part + 0.5 * dice_part

    return loss_fn

def train_fold(fold, train_ids, val_ids, depths_df, z_min, z_max, epochs=45, batch_size=64, max_lr=1e-3):
    """Train one CV fold and checkpoint the weights with the best validation Dice.

    Args:
        fold: fold index, used in log lines and the checkpoint file name.
        train_ids, val_ids: image ids for the training / validation split.
        depths_df, z_min, z_max: forwarded to ``SaltDataset`` for the depth channel.
        epochs: number of epochs; also sizes the OneCycle schedule.
        batch_size: batch size for both loaders.
        max_lr: peak learning rate for AdamW / OneCycleLR.

    Returns:
        ``(checkpoint_path_str, best_val_dice)``.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    train_tfms, val_tfms = get_augs()
    ds_tr = SaltDataset(train_ids, depths_df, z_min, z_max, aug=train_tfms, is_train=True)
    ds_va = SaltDataset(val_ids, depths_df, z_min, z_max, aug=val_tfms, is_train=True)
    dl_tr = DataLoader(ds_tr, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
    dl_va = DataLoader(ds_va, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

    model = smp.Unet(encoder_name='resnet34', encoder_weights='imagenet', in_channels=2, classes=1, activation=None)
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=max_lr, weight_decay=1e-4)
    # The scheduler is stepped once per batch, so it needs the batches-per-epoch count.
    steps_per_epoch = max(1, math.ceil(len(ds_tr)/batch_size))
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=max_lr, epochs=epochs, steps_per_epoch=steps_per_epoch, pct_start=0.3, div_factor=10.0, final_div_factor=10.0)
    scaler = GradScaler(enabled=(device=='cuda'))
    loss_fn = bce_dice_loss()

    best_dice = -1.0
    best_path = OUT_DIR/f'ckpt_fold{fold}.pth'
    t_start = time.time()
    for epoch in range(1, epochs+1):
        model.train()
        tr_loss = 0.0
        t0 = time.time()
        for it, (x, y, _) in enumerate(dl_tr):
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)
            optimizer.zero_grad(set_to_none=True)
            with autocast(enabled=(device=='cuda')):
                logits = model(x)
                loss = loss_fn(logits, y)
            # AMP step: backprop the scaled loss, then let the scaler drive the optimizer.
            # (The previous `scaler.scale(loss).step(optimizer)` called .step on a
            # Tensor and never ran backward.)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            tr_loss += loss.item()*x.size(0)
            if (it+1)%50==0:
                print(f'[fold {fold}] epoch {epoch} iter {it+1}/{len(dl_tr)} elapsed {time.time()-t0:.1f}s', flush=True)
        tr_loss /= len(ds_tr)

        # Validation: per-image hard Dice at threshold 0.5.
        model.eval()
        dices = []
        with torch.no_grad():
            for x, y, _ in dl_va:
                x = x.to(device)
                y = y.to(device)
                logits = model(x)
                probs = torch.sigmoid(logits)
                pred = (probs>0.5).float()
                num = 2*(pred*y).sum(dim=(2,3))
                den = pred.sum(dim=(2,3)) + y.sum(dim=(2,3))
                # Smoothing both terms makes an empty prediction on an empty
                # target score 1 rather than 0.
                dices.append(((num + 1e-6)/(den + 1e-6)).detach().cpu().numpy())
        val_dice = float(np.concatenate(dices).mean())
        print(f'[fold {fold}] epoch {epoch}/{epochs} tr_loss {tr_loss:.4f} val_dice {val_dice:.4f} epoch_time {time.time()-t0:.1f}s total {time.time()-t_start:.1f}s', flush=True)
        if val_dice>best_dice:
            best_dice = val_dice
            torch.save({'model': model.state_dict(), 'dice': best_dice}, best_path)
    print(f'[fold {fold}] best_dice {best_dice:.4f} saved {best_path}', flush=True)
    return str(best_path), best_dice

def infer_fold(fold, ckpt_path, val_ids, test_ids, depths_df, z_min, z_max, batch_size=64, tta_hflip=True):
    """Predict logits for a fold's validation ids and for the test ids, and save them.

    Loads the checkpoint at `ckpt_path` into a ResNet34 U-Net, optionally
    averages with a horizontally-flipped pass, and writes
    ``val/test_logits_fold{fold}.npy`` plus matching id CSVs to ``OUT_DIR``.

    Returns:
        ``(val_order, val_logits, test_order, test_logits)``; each logits
        array has the channel axis removed.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    def _loader(ids, is_train):
        # Identity augmentation, fixed order so saved logits align with the saved ids.
        ds = SaltDataset(ids, depths_df, z_min, z_max, aug=A.Compose([]), is_train=is_train)
        return DataLoader(ds, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

    val_dl = _loader(val_ids, True)
    test_dl = _loader(test_ids, False)

    model = smp.Unet(encoder_name='resnet34', encoder_weights=None, in_channels=2, classes=1, activation=None)
    state = torch.load(ckpt_path, map_location='cpu')['model']
    model.load_state_dict(state, strict=True)
    model.to(device)
    model.eval()

    def _predict(loader):
        # Returns (logits with channel axis dropped, ids in loader order).
        chunks = []
        order = []
        with torch.no_grad():
            for x, _, ids in loader:
                x = x.to(device)
                logits = model(x)
                if tta_hflip:
                    # Flip input, predict, flip the prediction back, then average.
                    flipped_in = torch.flip(x, dims=[-1])
                    flipped_out = model(flipped_in)
                    restored = torch.flip(flipped_out, dims=[-1])
                    logits = 0.5*(logits + restored)
                chunks.append(logits.float().cpu().numpy())
                order.extend(ids)
        return np.concatenate(chunks, axis=0)[:,0], order

    val_logits, val_order = _predict(val_dl)    # (N,128,128)
    test_logits, test_order = _predict(test_dl)

    np.save(OUT_DIR/f'val_logits_fold{fold}.npy', val_logits)
    np.save(OUT_DIR/f'test_logits_fold{fold}.npy', test_logits)
    pd.Series(val_order).to_csv(OUT_DIR/f'val_ids_fold{fold}.csv', index=False, header=False)
    pd.Series(test_order).to_csv(OUT_DIR/f'test_ids_fold{fold}.csv', index=False, header=False)
    print(f'[fold {fold}] saved val/test logits', flush=True)
    return val_order, val_logits, test_order, test_logits

# Orchestrate 5-fold train + infer
folds_df = pd.read_csv(OUT_DIR/'folds.csv')
train_ids_all = folds_df['id'].tolist()
fold_by_id = dict(zip(folds_df['id'], folds_df['fold']))
depths = pd.read_csv(DATA_DIR/'depths.csv').set_index('id')

# Prefer the depth stats saved by the folds cell; otherwise recompute from train depths.
depth_norm_path = OUT_DIR/'depth_norm.json'
if depth_norm_path.exists():
    # Context manager so the file handle is closed after reading.
    with open(depth_norm_path) as f:
        z_stats = json.load(f)
else:
    z_vals = depths.loc[train_ids_all, 'z'].values.astype(np.float32)
    z_stats = {'z_min': float(np.min(z_vals)), 'z_max': float(np.max(z_vals))}
z_min, z_max = z_stats['z_min'], z_stats['z_max']
test_ids_all = sorted([p.stem for p in TEST_IMG_DIR.glob('*.png')])

all_val_ids=[]; all_val_logits=[]; test_logits_folds=[]
for fold in range(5):
    tr_ids = [i for i in train_ids_all if fold_by_id[i]!=fold]
    va_ids = [i for i in train_ids_all if fold_by_id[i]==fold]
    print(f'=== Fold {fold}: train {len(tr_ids)} val {len(va_ids)} ===', flush=True)
    ckpt_path, best = train_fold(fold, tr_ids, va_ids, depths, z_min, z_max, epochs=45, batch_size=64, max_lr=1e-3)
    va_order, va_logits, te_order, te_logits = infer_fold(fold, ckpt_path, va_ids, test_ids_all, depths, z_min, z_max, batch_size=64, tta_hflip=True)
    all_val_ids.extend(va_order); all_val_logits.append(va_logits); test_logits_folds.append(te_logits)
    # free gpu cache
    torch.cuda.empty_cache(); gc.collect()

# Concatenate per-fold validation logits; oof_order.csv maps each id to its row.
all_val_logits = np.concatenate(all_val_logits, axis=0)
oof_df = pd.DataFrame({'id': all_val_ids})
oof_df['logit_idx'] = np.arange(len(oof_df))
np.save(OUT_DIR/'oof_logits.npy', all_val_logits)
oof_df.to_csv(OUT_DIR/'oof_order.csv', index=False)
print('Saved OOF logits and order', flush=True)

# Build OOF probs aligned to ground truth and compute tuning
# Both are kept at the native 101x101 size: the submission is thresholded and
# cleaned on the 101x101 crop, so tuning on the reflect-padded 128x128 frame
# would score mirrored border pixels and inflate object sizes.
gt_masks=[]; oof_probs=[]
for _id, logit_idx in zip(oof_df['id'].tolist(), oof_df['logit_idx'].tolist()):
    mask = read_gray(TRAIN_MASK_DIR/f'{_id}.png') > 127  # native 101x101 ground truth
    gt_masks.append(mask)
    logits_101 = crop_center_101(all_val_logits[logit_idx])
    oof_probs.append(1/(1+np.exp(-logits_101)))  # sigmoid
gt_masks = np.stack(gt_masks, axis=0)
oof_probs = np.stack(oof_probs, axis=0)

def postprocess(prob, thr, min_size):
    """Binarise `prob` at `thr`, drop components smaller than `min_size`, then fill holes."""
    binary = prob > thr
    if binary.any():
        # Small-object removal is skipped when nothing is above threshold.
        binary = remove_small_objects(binary, min_size=min_size)
    return binary_fill_holes(binary)

# Grid search over probability threshold and minimum object size on the OOF predictions.
ths = np.linspace(0.3, 0.7, 9)
mins = [5, 10, 20, 30, 40, 50, 75, 100]
best_score, best_thr, best_min = -1, 0.5, 0
for thr in ths:
    for ms in mins:
        cleaned = [postprocess(p, thr, ms) for p in oof_probs]
        preds = np.stack(cleaned, axis=0)
        score = tgs_metric(gt_masks, preds)
        # Strict '>' keeps the first best combination on ties.
        if score > best_score:
            best_score, best_thr, best_min = score, thr, ms
print(f'Tuned on OOF: best mp-IoU {best_score:.5f} @ thr {best_thr:.3f}, min_size {best_min}', flush=True)

# Average test logits across folds and generate submission
test_logits_folds = np.stack(test_logits_folds, axis=0)  # (5,N,128,128)
test_logits_mean = test_logits_folds.mean(axis=0)
test_probs = 1/(1+np.exp(-test_logits_mean))  # sigmoid

sub_ids = test_ids_all
rles=[]
for prob, _id in zip(test_probs, sub_ids):
    # Crop to 101x101 before RLE. The float probabilities are cropped directly:
    # the former uint8 round-trip truncated them to 1/255 steps, so the tuned
    # threshold was applied to different values than those it was tuned on.
    prob_101 = crop_center_101(prob)
    mask = postprocess(prob_101, best_thr, best_min)
    rles.append(rle_encode(mask.astype(np.uint8)))
sub = pd.DataFrame({'id': sub_ids, 'rle_mask': rles})
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv', flush=True)

In [None]:
import os, sys, traceback
import cv2; cv2.setNumThreads(0)
# --- Import each heavy dependency on its own so a failure points at one package ---
print('Start incremental import debug', flush=True)
try:
    import torch
    print('OK: torch', torch.__version__, 'cuda avail', torch.cuda.is_available(), flush=True)
except Exception:
    print('FAIL: torch')
    traceback.print_exc()

try:
    import torchvision
    print('OK: torchvision', torchvision.__version__, flush=True)
except Exception:
    print('FAIL: torchvision')
    traceback.print_exc()

try:
    import timm
    timm_version = getattr(timm, '__version__', 'unknown')
    print('OK: timm', timm_version, flush=True)
except Exception:
    print('FAIL: timm')
    traceback.print_exc()

try:
    import segmentation_models_pytorch as smp
    smp_version = getattr(smp, '__version__', 'unknown')
    print('OK: smp', smp_version, flush=True)
    # Build the same architecture the training cell uses, without downloading weights.
    m = smp.Unet(encoder_name='resnet34', encoder_weights=None, in_channels=2, classes=1, activation=None)
    param_count_k = sum(p.numel() for p in m.parameters())//1000
    print('OK: created UNet model', param_count_k, 'K params', flush=True)
except Exception:
    print('FAIL: smp')
    traceback.print_exc()
print('Import debug done', flush=True)

In [None]:
import sys, subprocess, os
from pathlib import Path

def pip(*args):
    """Print the pip command line, then run it with the current interpreter.

    Raises ``subprocess.CalledProcessError`` when pip exits non-zero.
    """
    print('> pip', *args, flush=True)
    argv = [sys.executable, '-m', 'pip'] + list(args)
    subprocess.run(argv, check=True)

# --- Pin supporting libraries while holding the torch stack fixed via constraints.txt ---
print('=== Pinning library versions to stabilize imports (per expert advice) ===', flush=True)
Path('constraints.txt').write_text('torch==2.4.1\ntorchvision==0.19.1\ntorchaudio==2.4.1\n')
# NOTE(review): segmentation-models-pytorch may declare its own timm requirement;
# confirm the timm pin below does not conflict with the installed smp version.
pinned_packages = [
    'timm==0.9.12',
    'albumentations==1.4.3',
    'opencv-python-headless==4.10.0.84',
    'scikit-image==0.22.0',
    'scikit-learn==1.3.2',
    'scipy==1.11.4',
    'numpy==1.26.4',
]
pip('install', '-c', 'constraints.txt', *pinned_packages,
    '--upgrade', '--upgrade-strategy', 'only-if-needed')
print('Re-run incremental import after pinning in next cell.', flush=True)