---

# Task (4+5+6) - Simsiam 2nd SSL Part
### Shourav Deb [2021-3-60-274]

---

## (Task 4.1) CELL 1 - Header / Config

In [1]:
# =========================
# SimSiam - Config
# =========================

import os, random, json, time
import numpy as np
import torch

DATA_DIR = "/kaggle/input/betel-leaf/Betel Leaf Dataset A Primary Dataset From Field And Controlled Environment/Betel Leaf Dataset"

RESOLUTION = 224

PRETRAIN_EPOCHS = 100

LINEAR_EPOCHS = 50
FINETUNE_EPOCHS = 50

BATCH_SIZE = 64
BACKBONE = "resnet18"
SEED = 42
NUM_WORKERS = 2
OUT_DIR = "/kaggle/working/simsiam_task4"


assert os.path.exists(DATA_DIR), f"DATA_DIR not found: {DATA_DIR}"
assert PRETRAIN_EPOCHS >= 100, "PRETRAIN_EPOCHS must be >= 100"
assert isinstance(RESOLUTION, int) and RESOLUTION >= 64, "RESOLUTION must be integer >=64"

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED)

os.makedirs(OUT_DIR, exist_ok=True)

print("CONFIG")
print("DATA_DIR:", DATA_DIR)
print("RESOLUTION:", RESOLUTION)
print("PRETRAIN_EPOCHS:", PRETRAIN_EPOCHS)
print("BATCH_SIZE:", BATCH_SIZE)
print("BACKBONE:", BACKBONE)
print("OUT_DIR:", OUT_DIR)


CONFIG
DATA_DIR: /kaggle/input/betel-leaf/Betel Leaf Dataset A Primary Dataset From Field And Controlled Environment/Betel Leaf Dataset
RESOLUTION: 224
PRETRAIN_EPOCHS: 100
BATCH_SIZE: 64
BACKBONE: resnet18
OUT_DIR: /kaggle/working/simsiam_task4


## CELL 2 - Imports & Basic Utilities

In [None]:
# =========================
# Imports & Utilities
# =========================

import os
os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "3")
os.environ.setdefault("XLA_FLAGS", "--xla_gpu_cuda_data_dir=/usr/local/cuda  --xla_force_host_platform_device_count=1")


import sys
import math
import shutil
from pathlib import Path
from glob import glob
from typing import Optional


from tqdm import tqdm
from PIL import Image

import matplotlib.pyplot as plt
try:
    import seaborn as sns
except Exception as e:
    sns = None
    print("Warning: seaborn import failed — continuing without it:", e)

try:
    import torch
    from torch import nn, optim
    from torch.utils.data import DataLoader, Dataset
    import torchvision
    from torchvision import transforms, models
except Exception as e:
    torch = None
    nn = None
    optim = None
    DataLoader = None
    Dataset = None
    torchvision = None
    transforms = None
    models = None
    print("Warning: PyTorch imports failed or CUDA unavailable:", e)

try:
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
    from sklearn.model_selection import train_test_split
except Exception as e:
    print("Warning: scikit-learn import failed:", e)

try:
    import umap
except Exception as e:
    umap = None
    print("Info: umap not available:", e)

try:
    from sklearn.manifold import TSNE
    from sklearn.metrics import silhouette_score
except Exception as e:
    print("Warning importing TSNE / silhouette_score:", e)

try:
    import joblib
    import pickle
except Exception as e:
    print("Warning: joblib/pickle import issue:", e)

def env_diagnostics(show_packages: Optional[list] = None):
    """Print device + common package versions to help debug environment mismatches."""
    print("Python:", sys.version.splitlines()[0])
    # PyTorch & CUDA
    if torch is not None:
        try:
            print("PyTorch:", torch.__version__)
            print("CUDA available:", torch.cuda.is_available())
            if torch.cuda.is_available():
                print("CUDA device count:", torch.cuda.device_count())
                print("CUDA current device:", torch.cuda.current_device())
                print("CUDA device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
        except Exception as e:
            print("PyTorch diagnostic error:", e)
    else:
        print("PyTorch: not available")

    # seaborn / matplotlib
    try:
        import matplotlib
        print("matplotlib:", matplotlib.__version__)
    except Exception:
        print("matplotlib: not available")

    if sns is not None:
        try:
            print("seaborn:", sns.__version__)
        except Exception:
            pass

    # scikit-learn
    try:
        import sklearn
        print("scikit-learn:", sklearn.__version__)
    except Exception:
        print("scikit-learn: not available")

    # umap
    if umap is not None:
        try:
            print("umap-learn:", umap.__version__)
        except Exception:
            pass

    try:
        import importlib
        if importlib.util.find_spec("tensorflow") is not None:
            import tensorflow as tf
            print("TensorFlow:", tf.__version__)
           
        else:
            print("TensorFlow: not installed (or not found in this env)")
    except Exception as e:
        
        print("TensorFlow import safe-check raised an exception (not fatal):", e)


if torch is not None:
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
else:
    DEVICE = "cpu"
print("Device:", DEVICE)


env_diagnostics()


## CELL 3 - Build file manifest (reads dataset structure & prints counts)

In [5]:
from pathlib import Path
import json, os
root = Path("/kaggle/input/betel-leaf/Betel Leaf Dataset A Primary Dataset From Field And Controlled Environment/Betel Leaf Dataset")
assert root.exists(), f"Dataset root missing: {root}"

expected_classes = ["Diseased", "Dried", "Healthy"]
sources = [p.name for p in root.iterdir() if p.is_dir()]
print("Detected top-level source folders:", sources)

filepaths = []
labels = []
found_map = {}

def normalize(name: str):
    """Utility to normalize folder names for matching."""
    return name.lower().replace(" ", "").replace("_", "").replace("-", "")

for src in sources:
    src_dir = root / src
    subdirs = [d.name for d in src_dir.iterdir() if d.is_dir()]
    print(f"\nSource '{src}' subfolders:", subdirs)

    for cls in expected_classes:
        cls_norm = normalize(cls)
        matched = None

       
         for s in subdirs:
            if cls_norm in normalize(s):
                matched = s
                break

        if matched is None:
            print(f"WARNING: class '{cls}' not found under '{src}'")
            continue

        found_map.setdefault(src, {})[cls] = matched
        cls_dir = src_dir / matched

        for p in cls_dir.glob("*"):
            if p.suffix.lower() in [".jpg", ".jpeg", ".png"]:
                filepaths.append(str(p))
                labels.append(expected_classes.index(cls))

print("\nTotal images found:", len(filepaths))
from collections import Counter
ctr = Counter([Path(p).parent.name for p in filepaths])
print("Per-subfolder counts:")
for k, v in ctr.items():
    print(f"  {k}: {v}")

manifest = {
    "classes": expected_classes,
    "sources_detected": sources,
    "found_map": found_map,
    "files": filepaths,
    "labels": labels
}

os.makedirs("/kaggle/working/simsiam_task4", exist_ok=True)
with open("/kaggle/working/simsiam_task4/manifest.json", "w") as f:
    json.dump(manifest, f)
print("\nManifest saved to /kaggle/working/simsiam_task4/manifest.json")

Detected top-level source folders: ['On Field', 'Controlled Environment']

Source 'On Field' subfolders: ['Diseased Leaf', 'Healthy Leaf', 'Dried Leaf']

Source 'Controlled Environment' subfolders: ['Diseased', 'Dried', 'Healthy']

Total images found: 1800
Per-subfolder counts:
  Diseased Leaf: 289
  Dried Leaf: 282
  Healthy Leaf: 336
  Diseased: 220
  Dried: 340
  Healthy: 333

Manifest saved to /kaggle/working/simsiam_task4/manifest.json


## CELL 4 - Transforms (SimSiam two-view + eval transforms)

In [6]:
simsiam_transform = transforms.Compose([
    transforms.RandomResizedCrop(RESOLUTION, scale=(0.2, 1.0), ratio=(0.75, 1.33)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.2),
    transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.2, hue=0.02),
    transforms.RandomGrayscale(p=0.2),
    transforms.GaussianBlur(kernel_size=(3,3), sigma=(0.1, 2.0)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
])


eval_transform = transforms.Compose([
    transforms.Resize(int(RESOLUTION * 1.1)),
    transforms.CenterCrop(RESOLUTION),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
])


def aug_probe_image(path, n=6):
    img = Image.open(path).convert("RGB")
    outs = []
    for _ in range(n):
        timg = simsiam_transform(img)
        # de-normalize for visualization
        t = timg.numpy().transpose(1,2,0)
        t = t * np.array([0.229,0.224,0.225]) + np.array([0.485,0.456,0.406])
        t = np.clip(t, 0, 1)
        outs.append((t*255).astype(np.uint8))
    return outs


## CELL 5 - Dataset wrappers: TwoViewDataset + ManifestDataset

In [7]:
class TwoViewDataset(Dataset):
    """Returns two different augmented views of the same image (for SimSiam)."""
    def __init__(self, paths, labels, transform):
        self.paths = paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        p = self.paths[idx]
        lbl = self.labels[idx]
        img = Image.open(p).convert("RGB")
        x1 = self.transform(img)
        x2 = self.transform(img)
        return x1, x2, lbl, p

class ManifestDataset(Dataset):
    """Deterministic dataset for feature extraction and downstream training."""
    def __init__(self, paths, labels, transform):
        self.paths = paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        p = self.paths[idx]
        lbl = self.labels[idx]
        img = Image.open(p).convert("RGB")
        if self.transform:
            img = self.transform(img)
        return img, lbl, p


## CELL 6 - Create train/val/test splits and DataLoaders (deterministic split saved)

In [8]:
paths = manifest["files"]
labels = manifest["labels"]
classes = manifest["classes"]

# First split: fixed test set 20%
train_paths, test_paths, train_labels, test_labels = train_test_split(
    paths, labels, test_size=0.20, stratify=labels, random_state=SEED)

# From train, carve out validation 10% of train
train_paths, val_paths, train_labels, val_labels = train_test_split(
    train_paths, train_labels, test_size=0.10, stratify=train_labels, random_state=SEED)

print("Train:", len(train_paths), "Val:", len(val_paths), "Test:", len(test_paths))

split_manifest = {
    "classes": classes,
    "train": train_paths, "train_labels": train_labels,
    "val": val_paths, "val_labels": val_labels,
    "test": test_paths, "test_labels": test_labels
}
with open(os.path.join(OUT_DIR, "split_manifest.json"), "w") as f:
    json.dump(split_manifest, f)
print("Split manifest saved to", os.path.join(OUT_DIR, "split_manifest.json"))

# DataLoaders for pretraining (two-view)
train_dataset = TwoViewDataset(train_paths, train_labels, simsiam_transform)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                          num_workers=NUM_WORKERS, drop_last=True)

# DataLoaders for evaluation (use ManifestDataset)
val_dataset = ManifestDataset(val_paths, val_labels, eval_transform)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)
test_dataset = ManifestDataset(test_paths, test_labels, eval_transform)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)


Train: 1296 Val: 144 Test: 360
Split manifest saved to /kaggle/working/simsiam_task4/split_manifest.json


## CELL 7 - SimSiam model definition (encoder, projector, predictor)

In [9]:

class SimSiam(nn.Module):
    def __init__(self, backbone="resnet18", pretrained=False, proj_hidden=2048, pred_hidden=512, out_dim=512):
        super().__init__()
        # backbone
        if backbone == "resnet18":
            base = models.resnet18(pretrained=pretrained)
            feat_dim = 512
        elif backbone == "resnet50":
            base = models.resnet50(pretrained=pretrained)
            feat_dim = 2048
        else:
            raise ValueError("backbone must be resnet18 or resnet50")
        
        modules = list(base.children())[:-1]
        self.encoder = nn.Sequential(*modules)
        self.feat_dim = feat_dim

        
        self.projector = nn.Sequential(
            nn.Linear(feat_dim, proj_hidden),
            nn.BatchNorm1d(proj_hidden),
            nn.ReLU(inplace=True),
            nn.Linear(proj_hidden, proj_hidden),
            nn.BatchNorm1d(proj_hidden),
            nn.ReLU(inplace=True),
            nn.Linear(proj_hidden, out_dim)
        )

        
        self.predictor = nn.Sequential(
            nn.Linear(out_dim, pred_hidden),
            nn.BatchNorm1d(pred_hidden),
            nn.ReLU(inplace=True),
            nn.Linear(pred_hidden, out_dim)
        )

    def forward_backbone(self, x):
        
        h = self.encoder(x)
        h = h.view(h.size(0), -1)
        return h

    def forward(self, x1, x2):
        
        h1 = self.forward_backbone(x1)
        h2 = self.forward_backbone(x2)
        z1 = self.projector(h1)
        z2 = self.projector(h2)
        p1 = self.predictor(z1)
        p2 = self.predictor(z2)
        
        return p1, p2, z1.detach(), z2.detach()


## CELL 8 - Loss function (negative cosine similarity) & utilities

In [10]:

def negative_cosine_similarity(p, z):

    p = nn.functional.normalize(p, dim=1)
    z = nn.functional.normalize(z, dim=1)
    return - (p * z).sum(dim=1).mean()


def save_checkpoint(state, filename):
    torch.save(state, filename)
    print("Saved checkpoint:", filename)


## CELL 9 - Pretraining loop

In [10]:

learning_rate = 0.03 * (BATCH_SIZE / 256)
momentum = 0.9
weight_decay = 1e-4


model = SimSiam(backbone=BACKBONE).to(DEVICE)
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum, weight_decay=weight_decay)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=PRETRAIN_EPOCHS)


latest_ckpt = os.path.join(OUT_DIR, "simsiam_latest.pth")
best_ckpt = os.path.join(OUT_DIR, "simsiam_best_linearprobe.pth")
encoder_outpath = os.path.join(OUT_DIR, "simsiam_encoder.pth")


def extract_features_from_encoder(encoder, paths_list, transform, batch_size=64):
    encoder.eval()
    ds = ManifestDataset(paths_list, [0]*len(paths_list), transform=transform)
    loader = DataLoader(ds, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS)
    feats = []
    with torch.no_grad():
        for imgs, _, _ in loader:
            imgs = imgs.to(DEVICE)
            h = encoder(imgs).view(imgs.size(0), -1).cpu().numpy()
            feats.append(h)
    feats = np.vstack(feats)
    return feats


def quick_linear_probe(encoder, train_paths, train_labels, val_paths, val_labels, transform, max_samples=500):

    tpaths = train_paths[:max_samples]; tlabels = train_labels[:max_samples]
    train_feats = extract_features_from_encoder(encoder, tpaths, transform)
    val_feats = extract_features_from_encoder(encoder, val_paths, transform)
    clf = LogisticRegression(max_iter=1000)
    clf.fit(train_feats, tlabels)
    preds = clf.predict(val_feats)
    acc = accuracy_score(val_labels, preds)
    return acc

# Main training loop
print("Starting pretraining for", PRETRAIN_EPOCHS, "epochs (from epoch", start_epoch, ")")
for epoch in range(start_epoch, PRETRAIN_EPOCHS):
    model.train()
    epoch_losses = []
    loop = tqdm(train_loader, desc=f"Pretrain Epoch {epoch+1}/{PRETRAIN_EPOCHS}")
    for x1, x2, lbl, _ in loop:
        x1 = x1.to(DEVICE); x2 = x2.to(DEVICE)
        p1, p2, z1, z2 = model(x1, x2)
        loss = 0.5 * negative_cosine_similarity(p1, z2) + 0.5 * negative_cosine_similarity(p2, z1)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
        loop.set_postfix(loss=f"{np.mean(epoch_losses):.4f}")

    scheduler.step()
    avg_loss = float(np.mean(epoch_losses))
    print(f"Epoch {epoch+1} finished. Avg loss: {avg_loss:.4f}")


    ck = {
        "epoch": epoch,
        "model_state": model.state_dict(),
        "optimizer_state": optimizer.state_dict(),
        "scheduler_state": scheduler.state_dict(),
        "avg_loss": avg_loss,
        "manifest": split_manifest
    }
    save_checkpoint(ck, latest_ckpt)


torch.save({"encoder_state_dict": model.encoder.state_dict(), "feat_dim": model.feat_dim},
           encoder_outpath)
print("Pretraining complete. Encoder saved to", encoder_outpath)




Starting pretraining for 100 epochs (from epoch 0 )


Pretrain Epoch 1/100: 100%|██████████| 20/20 [06:57<00:00, 20.89s/it, loss=-0.1287]


Epoch 1 finished. Avg loss: -0.1287
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 2/100: 100%|██████████| 20/20 [06:21<00:00, 19.07s/it, loss=-0.4303]


Epoch 2 finished. Avg loss: -0.4303
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 3/100: 100%|██████████| 20/20 [06:14<00:00, 18.73s/it, loss=-0.5787]


Epoch 3 finished. Avg loss: -0.5787
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 4/100: 100%|██████████| 20/20 [06:10<00:00, 18.51s/it, loss=-0.6860]


Epoch 4 finished. Avg loss: -0.6860
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 5/100: 100%|██████████| 20/20 [06:19<00:00, 18.96s/it, loss=-0.7681]


Epoch 5 finished. Avg loss: -0.7681
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 6/100: 100%|██████████| 20/20 [06:10<00:00, 18.54s/it, loss=-0.8111]


Epoch 6 finished. Avg loss: -0.8111
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 7/100: 100%|██████████| 20/20 [06:13<00:00, 18.67s/it, loss=-0.8327]


Epoch 7 finished. Avg loss: -0.8327
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 8/100: 100%|██████████| 20/20 [06:14<00:00, 18.72s/it, loss=-0.8515]


Epoch 8 finished. Avg loss: -0.8515
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 9/100: 100%|██████████| 20/20 [06:17<00:00, 18.88s/it, loss=-0.8444]


Epoch 9 finished. Avg loss: -0.8444
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 10/100: 100%|██████████| 20/20 [06:17<00:00, 18.89s/it, loss=-0.8696]


Epoch 10 finished. Avg loss: -0.8696
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 11/100: 100%|██████████| 20/20 [06:19<00:00, 18.98s/it, loss=-0.8581]


Epoch 11 finished. Avg loss: -0.8581
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 12/100: 100%|██████████| 20/20 [06:14<00:00, 18.72s/it, loss=-0.8753]


Epoch 12 finished. Avg loss: -0.8753
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 13/100: 100%|██████████| 20/20 [06:06<00:00, 18.35s/it, loss=-0.8759]


Epoch 13 finished. Avg loss: -0.8759
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 14/100: 100%|██████████| 20/20 [06:13<00:00, 18.68s/it, loss=-0.8736]


Epoch 14 finished. Avg loss: -0.8736
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 15/100: 100%|██████████| 20/20 [06:13<00:00, 18.65s/it, loss=-0.8942]


Epoch 15 finished. Avg loss: -0.8942
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 16/100: 100%|██████████| 20/20 [06:19<00:00, 18.98s/it, loss=-0.8858]


Epoch 16 finished. Avg loss: -0.8858
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 17/100: 100%|██████████| 20/20 [06:13<00:00, 18.67s/it, loss=-0.8934]


Epoch 17 finished. Avg loss: -0.8934
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 18/100: 100%|██████████| 20/20 [05:59<00:00, 18.00s/it, loss=-0.8937]


Epoch 18 finished. Avg loss: -0.8937
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 19/100: 100%|██████████| 20/20 [06:17<00:00, 18.90s/it, loss=-0.8942]


Epoch 19 finished. Avg loss: -0.8942
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 20/100: 100%|██████████| 20/20 [06:19<00:00, 18.97s/it, loss=-0.9003]


Epoch 20 finished. Avg loss: -0.9003
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 21/100: 100%|██████████| 20/20 [06:11<00:00, 18.59s/it, loss=-0.8986]


Epoch 21 finished. Avg loss: -0.8986
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 22/100: 100%|██████████| 20/20 [06:11<00:00, 18.58s/it, loss=-0.8984]


Epoch 22 finished. Avg loss: -0.8984
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 23/100: 100%|██████████| 20/20 [06:20<00:00, 19.04s/it, loss=-0.8972]


Epoch 23 finished. Avg loss: -0.8972
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 24/100: 100%|██████████| 20/20 [06:18<00:00, 18.91s/it, loss=-0.8948]


Epoch 24 finished. Avg loss: -0.8948
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 25/100: 100%|██████████| 20/20 [06:03<00:00, 18.16s/it, loss=-0.8922]


Epoch 25 finished. Avg loss: -0.8922
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 26/100: 100%|██████████| 20/20 [06:19<00:00, 18.97s/it, loss=-0.8974]


Epoch 26 finished. Avg loss: -0.8974
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 27/100: 100%|██████████| 20/20 [06:16<00:00, 18.80s/it, loss=-0.8999]


Epoch 27 finished. Avg loss: -0.8999
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 28/100: 100%|██████████| 20/20 [06:10<00:00, 18.54s/it, loss=-0.9013]


Epoch 28 finished. Avg loss: -0.9013
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 29/100: 100%|██████████| 20/20 [06:20<00:00, 19.02s/it, loss=-0.9023]


Epoch 29 finished. Avg loss: -0.9023
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 30/100: 100%|██████████| 20/20 [06:21<00:00, 19.08s/it, loss=-0.9070]


Epoch 30 finished. Avg loss: -0.9070
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 31/100: 100%|██████████| 20/20 [06:21<00:00, 19.06s/it, loss=-0.9009]


Epoch 31 finished. Avg loss: -0.9009
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 32/100: 100%|██████████| 20/20 [06:28<00:00, 19.40s/it, loss=-0.9014]


Epoch 32 finished. Avg loss: -0.9014
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 33/100: 100%|██████████| 20/20 [06:12<00:00, 18.64s/it, loss=-0.8989]


Epoch 33 finished. Avg loss: -0.8989
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 34/100: 100%|██████████| 20/20 [06:18<00:00, 18.91s/it, loss=-0.9083]


Epoch 34 finished. Avg loss: -0.9083
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 35/100: 100%|██████████| 20/20 [06:20<00:00, 19.03s/it, loss=-0.9056]


Epoch 35 finished. Avg loss: -0.9056
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 36/100: 100%|██████████| 20/20 [06:13<00:00, 18.67s/it, loss=-0.9021]


Epoch 36 finished. Avg loss: -0.9021
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 37/100: 100%|██████████| 20/20 [06:19<00:00, 18.96s/it, loss=-0.9027]


Epoch 37 finished. Avg loss: -0.9027
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 38/100: 100%|██████████| 20/20 [06:21<00:00, 19.06s/it, loss=-0.9024]


Epoch 38 finished. Avg loss: -0.9024
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 39/100: 100%|██████████| 20/20 [06:17<00:00, 18.87s/it, loss=-0.9041]


Epoch 39 finished. Avg loss: -0.9041
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 40/100: 100%|██████████| 20/20 [06:15<00:00, 18.77s/it, loss=-0.9015]


Epoch 40 finished. Avg loss: -0.9015
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 41/100: 100%|██████████| 20/20 [06:12<00:00, 18.61s/it, loss=-0.8954]


Epoch 41 finished. Avg loss: -0.8954
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 42/100: 100%|██████████| 20/20 [06:16<00:00, 18.84s/it, loss=-0.9000]


Epoch 42 finished. Avg loss: -0.9000
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 43/100: 100%|██████████| 20/20 [06:20<00:00, 19.05s/it, loss=-0.9058]


Epoch 43 finished. Avg loss: -0.9058
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 44/100: 100%|██████████| 20/20 [06:22<00:00, 19.13s/it, loss=-0.9043]


Epoch 44 finished. Avg loss: -0.9043
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 45/100: 100%|██████████| 20/20 [06:25<00:00, 19.29s/it, loss=-0.9010]


Epoch 45 finished. Avg loss: -0.9010
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 46/100: 100%|██████████| 20/20 [06:22<00:00, 19.13s/it, loss=-0.9076]


Epoch 46 finished. Avg loss: -0.9076
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 47/100: 100%|██████████| 20/20 [06:16<00:00, 18.82s/it, loss=-0.9098]


Epoch 47 finished. Avg loss: -0.9098
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 48/100: 100%|██████████| 20/20 [06:14<00:00, 18.73s/it, loss=-0.9045]


Epoch 48 finished. Avg loss: -0.9045
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 49/100: 100%|██████████| 20/20 [06:23<00:00, 19.17s/it, loss=-0.9105]


Epoch 49 finished. Avg loss: -0.9105
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 50/100: 100%|██████████| 20/20 [06:11<00:00, 18.60s/it, loss=-0.9072]


Epoch 50 finished. Avg loss: -0.9072
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 51/100: 100%|██████████| 20/20 [06:22<00:00, 19.13s/it, loss=-0.9112]


Epoch 51 finished. Avg loss: -0.9112
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 52/100: 100%|██████████| 20/20 [06:18<00:00, 18.93s/it, loss=-0.9114]


Epoch 52 finished. Avg loss: -0.9114
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 53/100: 100%|██████████| 20/20 [06:14<00:00, 18.74s/it, loss=-0.9047]


Epoch 53 finished. Avg loss: -0.9047
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 54/100: 100%|██████████| 20/20 [06:29<00:00, 19.47s/it, loss=-0.8987]


Epoch 54 finished. Avg loss: -0.8987
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 55/100: 100%|██████████| 20/20 [06:21<00:00, 19.05s/it, loss=-0.9183]


Epoch 55 finished. Avg loss: -0.9183
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 56/100: 100%|██████████| 20/20 [06:15<00:00, 18.76s/it, loss=-0.9190]


Epoch 56 finished. Avg loss: -0.9190
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 57/100: 100%|██████████| 20/20 [06:17<00:00, 18.88s/it, loss=-0.9139]


Epoch 57 finished. Avg loss: -0.9139
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 58/100: 100%|██████████| 20/20 [06:24<00:00, 19.23s/it, loss=-0.9146]


Epoch 58 finished. Avg loss: -0.9146
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 59/100: 100%|██████████| 20/20 [06:18<00:00, 18.92s/it, loss=-0.9131]


Epoch 59 finished. Avg loss: -0.9131
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 60/100: 100%|██████████| 20/20 [06:21<00:00, 19.06s/it, loss=-0.9122]


Epoch 60 finished. Avg loss: -0.9122
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 61/100: 100%|██████████| 20/20 [06:19<00:00, 18.98s/it, loss=-0.9176]


Epoch 61 finished. Avg loss: -0.9176
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 62/100: 100%|██████████| 20/20 [06:19<00:00, 18.97s/it, loss=-0.9130]


Epoch 62 finished. Avg loss: -0.9130
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 63/100: 100%|██████████| 20/20 [06:18<00:00, 18.93s/it, loss=-0.9147]


Epoch 63 finished. Avg loss: -0.9147
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 64/100: 100%|██████████| 20/20 [06:21<00:00, 19.07s/it, loss=-0.9130]


Epoch 64 finished. Avg loss: -0.9130
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 65/100: 100%|██████████| 20/20 [06:19<00:00, 18.99s/it, loss=-0.9130]


Epoch 65 finished. Avg loss: -0.9130
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 66/100: 100%|██████████| 20/20 [06:20<00:00, 19.02s/it, loss=-0.9187]


Epoch 66 finished. Avg loss: -0.9187
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 67/100: 100%|██████████| 20/20 [06:19<00:00, 18.97s/it, loss=-0.9150]


Epoch 67 finished. Avg loss: -0.9150
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 68/100: 100%|██████████| 20/20 [06:05<00:00, 18.28s/it, loss=-0.9201]


Epoch 68 finished. Avg loss: -0.9201
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 69/100: 100%|██████████| 20/20 [06:22<00:00, 19.14s/it, loss=-0.9135]


Epoch 69 finished. Avg loss: -0.9135
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 70/100: 100%|██████████| 20/20 [06:23<00:00, 19.18s/it, loss=-0.9123]


Epoch 70 finished. Avg loss: -0.9123
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 71/100: 100%|██████████| 20/20 [06:22<00:00, 19.10s/it, loss=-0.9177]


Epoch 71 finished. Avg loss: -0.9177
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 72/100: 100%|██████████| 20/20 [06:20<00:00, 19.03s/it, loss=-0.9154]


Epoch 72 finished. Avg loss: -0.9154
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 73/100: 100%|██████████| 20/20 [06:24<00:00, 19.22s/it, loss=-0.9192]


Epoch 73 finished. Avg loss: -0.9192
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 74/100: 100%|██████████| 20/20 [06:22<00:00, 19.13s/it, loss=-0.9154]


Epoch 74 finished. Avg loss: -0.9154
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 75/100: 100%|██████████| 20/20 [06:21<00:00, 19.06s/it, loss=-0.9158]


Epoch 75 finished. Avg loss: -0.9158
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 76/100: 100%|██████████| 20/20 [06:22<00:00, 19.13s/it, loss=-0.9128]


Epoch 76 finished. Avg loss: -0.9128
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 77/100: 100%|██████████| 20/20 [06:13<00:00, 18.69s/it, loss=-0.9191]


Epoch 77 finished. Avg loss: -0.9191
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 78/100: 100%|██████████| 20/20 [06:13<00:00, 18.65s/it, loss=-0.9230]


Epoch 78 finished. Avg loss: -0.9230
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 79/100: 100%|██████████| 20/20 [06:17<00:00, 18.88s/it, loss=-0.9172]


Epoch 79 finished. Avg loss: -0.9172
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 80/100: 100%|██████████| 20/20 [06:15<00:00, 18.77s/it, loss=-0.9178]


Epoch 80 finished. Avg loss: -0.9178
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 81/100: 100%|██████████| 20/20 [06:12<00:00, 18.60s/it, loss=-0.9160]


Epoch 81 finished. Avg loss: -0.9160
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 82/100: 100%|██████████| 20/20 [06:21<00:00, 19.06s/it, loss=-0.9195]


Epoch 82 finished. Avg loss: -0.9195
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 83/100: 100%|██████████| 20/20 [06:15<00:00, 18.76s/it, loss=-0.9179]


Epoch 83 finished. Avg loss: -0.9179
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 84/100: 100%|██████████| 20/20 [06:18<00:00, 18.92s/it, loss=-0.9208]


Epoch 84 finished. Avg loss: -0.9208
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 85/100: 100%|██████████| 20/20 [05:50<00:00, 17.52s/it, loss=-0.9239]


Epoch 85 finished. Avg loss: -0.9239
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 86/100: 100%|██████████| 20/20 [06:19<00:00, 18.96s/it, loss=-0.9157]


Epoch 86 finished. Avg loss: -0.9157
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 87/100: 100%|██████████| 20/20 [06:20<00:00, 19.01s/it, loss=-0.9111]


Epoch 87 finished. Avg loss: -0.9111
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 88/100: 100%|██████████| 20/20 [06:11<00:00, 18.60s/it, loss=-0.9194]


Epoch 88 finished. Avg loss: -0.9194
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 89/100: 100%|██████████| 20/20 [06:14<00:00, 18.72s/it, loss=-0.9169]


Epoch 89 finished. Avg loss: -0.9169
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 90/100: 100%|██████████| 20/20 [06:14<00:00, 18.71s/it, loss=-0.9176]


Epoch 90 finished. Avg loss: -0.9176
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 91/100: 100%|██████████| 20/20 [06:17<00:00, 18.87s/it, loss=-0.9189]


Epoch 91 finished. Avg loss: -0.9189
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 92/100: 100%|██████████| 20/20 [05:32<00:00, 16.64s/it, loss=-0.9194]


Epoch 92 finished. Avg loss: -0.9194
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 93/100: 100%|██████████| 20/20 [06:16<00:00, 18.81s/it, loss=-0.9203]


Epoch 93 finished. Avg loss: -0.9203
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 94/100: 100%|██████████| 20/20 [06:14<00:00, 18.73s/it, loss=-0.9212]


Epoch 94 finished. Avg loss: -0.9212
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 95/100: 100%|██████████| 20/20 [06:16<00:00, 18.82s/it, loss=-0.9223]


Epoch 95 finished. Avg loss: -0.9223
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 96/100: 100%|██████████| 20/20 [06:11<00:00, 18.57s/it, loss=-0.9198]


Epoch 96 finished. Avg loss: -0.9198
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 97/100: 100%|██████████| 20/20 [06:14<00:00, 18.74s/it, loss=-0.9226]


Epoch 97 finished. Avg loss: -0.9226
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 98/100: 100%|██████████| 20/20 [06:11<00:00, 18.56s/it, loss=-0.9148]


Epoch 98 finished. Avg loss: -0.9148
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 99/100: 100%|██████████| 20/20 [06:18<00:00, 18.95s/it, loss=-0.9190]


Epoch 99 finished. Avg loss: -0.9190
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth


Pretrain Epoch 100/100: 100%|██████████| 20/20 [06:15<00:00, 18.76s/it, loss=-0.9217]


Epoch 100 finished. Avg loss: -0.9217
Saved checkpoint: /kaggle/working/simsiam_task4/simsiam_latest.pth
Pretraining complete. Encoder saved to /kaggle/working/simsiam_task4/simsiam_encoder.pth


## (Task 4.2) CELL 10 - Feature extraction (frozen encoder)

In [15]:
enc_ckpt = os.path.join(OUT_DIR, "simsiam_encoder.pth")
if not os.path.exists(enc_ckpt):
    
    if 'model' in globals() and hasattr(model, "encoder"):
        encoder = model.encoder
    else:
        raise FileNotFoundError("Encoder checkpoint not found and model not in memory.")
else:
    d = torch.load(enc_ckpt, map_location=DEVICE)
    encoder = SimSiam(backbone=BACKBONE).encoder
    encoder.load_state_dict(d["encoder_state_dict"])
encoder = encoder.to(DEVICE)
encoder.eval()


def extract_and_save(paths_list, labels_list, split_name):
    ds = ManifestDataset(paths_list, labels_list, eval_transform)
    loader = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)
    feats = []
    files = []
    with torch.no_grad():
        for imgs, lbls, ps in loader:
            imgs = imgs.to(DEVICE)
            h = encoder(imgs).view(imgs.size(0), -1).cpu().numpy()
            feats.append(h)
            files.extend(ps)
    feats = np.vstack(feats)
    np.save(os.path.join(OUT_DIR, f"{split_name}_feats.npy"), feats)
    np.save(os.path.join(OUT_DIR, f"{split_name}_labels.npy"), np.array(labels_list))
    print(f"Saved {split_name} features: {feats.shape} to {OUT_DIR}/{split_name}_feats.npy")
    return feats

train_feats = extract_and_save(train_paths, train_labels, "train")
val_feats = extract_and_save(val_paths, val_labels, "val")
test_feats = extract_and_save(test_paths, test_labels, "test")


Saved train features: (1296, 512) to /kaggle/working/simsiam_task4/train_feats.npy
Saved val features: (144, 512) to /kaggle/working/simsiam_task4/val_feats.npy
Saved test features: (360, 512) to /kaggle/working/simsiam_task4/test_feats.npy


## CELL 11 - Linear probe + shallow heads evaluations (train classifiers on frozen features)

In [16]:
train_feats = np.load(os.path.join(OUT_DIR, "train_feats.npy"))
train_lbls = np.load(os.path.join(OUT_DIR, "train_labels.npy"))
val_feats = np.load(os.path.join(OUT_DIR, "val_feats.npy"))
val_lbls = np.load(os.path.join(OUT_DIR, "val_labels.npy"))
test_feats = np.load(os.path.join(OUT_DIR, "test_feats.npy"))
test_lbls = np.load(os.path.join(OUT_DIR, "test_labels.npy"))


classifiers = {
    "LogisticRegression": LogisticRegression(max_iter=2000),
    "SVM_RBF": SVC(kernel="rbf", probability=True),
    "RandomForest": RandomForestClassifier(n_estimators=100),
    "DecisionTree": DecisionTreeClassifier(),
    "MLP": MLPClassifier(hidden_layer_sizes=(512,), max_iter=500)
}

results = {}
for name, clf in classifiers.items():
    print("Training:", name)
    clf.fit(train_feats, train_lbls)
    val_pred = clf.predict(val_feats)
    val_acc = accuracy_score(val_lbls, val_pred)
    test_pred = clf.predict(test_feats)
    test_acc = accuracy_score(test_lbls, test_pred)
    print(f" {name} val_acc: {val_acc:.4f} test_acc: {test_acc:.4f}")
    results[name] = {"val_acc": float(val_acc), "test_acc": float(test_acc)}
    joblib.dump(clf, os.path.join(OUT_DIR, f"{name}.joblib"))


with open(os.path.join(OUT_DIR, "probe_results.json"), "w") as f:
    json.dump(results, f, indent=2)
print("Probe results saved to", os.path.join(OUT_DIR, "probe_results.json"))


Training: LogisticRegression
 LogisticRegression val_acc: 0.8333 test_acc: 0.8167
Training: SVM_RBF
 SVM_RBF val_acc: 0.7917 test_acc: 0.7806
Training: RandomForest
 RandomForest val_acc: 0.8403 test_acc: 0.7972
Training: DecisionTree
 DecisionTree val_acc: 0.7431 test_acc: 0.6778
Training: MLP
 MLP val_acc: 0.8750 test_acc: 0.8528
Probe results saved to /kaggle/working/simsiam_task4/probe_results.json


## CELL 12 - Full fine-tune: attach classification head and fine-tune entire encoder

In [17]:
import os
import torch
import numpy as np
from torch import nn, optim
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score
from tqdm import tqdm


ft_num_workers = 0
pin_memory = True if torch.cuda.is_available() else False
resume_ckpt_path = os.path.join(OUT_DIR, "finetune_resume.pth")
best_ckpt_path = os.path.join(OUT_DIR, "finetune_best.pth")
save_every_epoch = True


if 'encoder' not in globals() or encoder is None:
    enc_candidates = [
        os.path.join(OUT_DIR, "simsiam_encoder_memory.pth"),
        os.path.join(OUT_DIR, "simsiam_encoder.pth")
    ]
    found = None
    for p in enc_candidates:
        if os.path.exists(p):
            found = p
            break
    if found is None:
        raise FileNotFoundError("Encoder checkpoint not found in OUT_DIR. Run pretraining or restore archive.")
    enc_ck = torch.load(found, map_location="cpu", weights_only=False)
    BACKBONE = globals().get("BACKBONE", "resnet18")
    tmp_model = SimSiam(backbone=BACKBONE)
    tmp_model.encoder.load_state_dict(enc_ck["encoder_state_dict"])
    encoder = tmp_model.encoder
    del tmp_model


encoder = encoder.to(DEVICE)


class FineTuneClassifier(nn.Module):
    def __init__(self, encoder, feat_dim, num_classes):
        super().__init__()
        self.encoder = encoder
        self.head = nn.Linear(feat_dim, num_classes)

    def forward(self, x):
        h = self.encoder(x).view(x.size(0), -1)
        return self.head(h)


encoder.eval()
with torch.no_grad():
    dummy = torch.zeros(1, 3, RESOLUTION, RESOLUTION).to(DEVICE)
    try:
        out = encoder(dummy).view(1, -1)
        feat_dim = out.shape[1]
    except Exception:
        # fallback to known dims
        feat_dim = 512 if BACKBONE == "resnet18" else 2048

num_classes = len(classes)
ft_model = FineTuneClassifier(encoder, feat_dim, num_classes).to(DEVICE)


ft_train_transform = transforms.Compose([
    transforms.Resize(int(RESOLUTION*1.1)),
    transforms.CenterCrop(RESOLUTION),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
])
ft_train_ds = ManifestDataset(train_paths, train_labels, ft_train_transform)
ft_val_ds = ManifestDataset(val_paths, val_labels, eval_transform)

ft_train_loader = DataLoader(ft_train_ds, batch_size=BATCH_SIZE, shuffle=True,
                             num_workers=ft_num_workers, pin_memory=pin_memory)
ft_val_loader   = DataLoader(ft_val_ds,  batch_size=BATCH_SIZE, shuffle=False,
                             num_workers=ft_num_workers, pin_memory=pin_memory)


ft_optimizer = optim.SGD(ft_model.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)
ft_scheduler = optim.lr_scheduler.StepLR(ft_optimizer, step_size=15, gamma=0.1)
criterion = nn.CrossEntropyLoss()


start_epoch = 0
best_val_acc = 0.0
if os.path.exists(resume_ckpt_path):
    try:
        ck = torch.load(resume_ckpt_path, map_location=DEVICE, weights_only=False)
        ft_model.load_state_dict(ck["model_state"])
        ft_optimizer.load_state_dict(ck["optimizer_state"])
        if "scheduler_state" in ck:
            try:
                ft_scheduler.load_state_dict(ck["scheduler_state"])
            except Exception:
                pass
        start_epoch = ck.get("epoch", 0) + 1
        best_val_acc = ck.get("val_acc", 0.0)
        print(f"Resumed fine-tune from resume checkpoint at epoch {start_epoch} (best val {best_val_acc:.4f})")
    except Exception as e:
        print("Could not resume from resume checkpoint:", e)


try:
    for epoch in range(start_epoch, FINETUNE_EPOCHS):
        ft_model.train()
        losses = []
        loop = tqdm(ft_train_loader, desc=f"Fine-tune Epoch {epoch+1}/{FINETUNE_EPOCHS}")
        for imgs, labels_batch, _ in loop:
            imgs = imgs.to(DEVICE, non_blocking=pin_memory)
            labels_batch = labels_batch.to(DEVICE, non_blocking=pin_memory)
            logits = ft_model(imgs)
            loss = criterion(logits, labels_batch)
            ft_optimizer.zero_grad()
            loss.backward()
            ft_optimizer.step()
            losses.append(loss.item())
            loop.set_postfix(train_loss=f"{np.mean(losses):.4f}")

        ft_scheduler.step()


        ft_model.eval()
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for imgs, labels_batch, _ in ft_val_loader:
                imgs = imgs.to(DEVICE, non_blocking=pin_memory)
                logits = ft_model(imgs)
                preds = logits.argmax(dim=1).cpu().numpy()
                all_preds.extend(preds)
                all_labels.extend(labels_batch.numpy())
        val_acc = accuracy_score(all_labels, all_preds)
        print(f"Fine-tune Epoch {epoch+1}/{FINETUNE_EPOCHS} - train_loss: {np.mean(losses):.4f} val_acc: {val_acc:.4f}")

        # Save best
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save({
                "epoch": epoch,
                "model_state": ft_model.state_dict(),
                "optimizer_state": ft_optimizer.state_dict(),
                "val_acc": val_acc
            }, best_ckpt_path)
            print("Saved best fine-tune checkpoint:", best_ckpt_path)

        # Periodic resume checkpoint
        if save_every_epoch:
            torch.save({
                "epoch": epoch,
                "model_state": ft_model.state_dict(),
                "optimizer_state": ft_optimizer.state_dict(),
                "scheduler_state": ft_scheduler.state_dict(),
                "val_acc": val_acc
            }, resume_ckpt_path)

except KeyboardInterrupt:
    print("KeyboardInterrupt caught — saving resume checkpoint...")
    torch.save({
        "epoch": epoch,
        "model_state": ft_model.state_dict(),
        "optimizer_state": ft_optimizer.state_dict(),
        "scheduler_state": ft_scheduler.state_dict(),
        "val_acc": best_val_acc
    }, resume_ckpt_path)
    print("Saved resume checkpoint to", resume_ckpt_path)
    raise

print("Fine-tune complete. Best val acc:", best_val_acc)



Fine-tune Epoch 1/50: 100%|██████████| 21/21 [10:33<00:00, 30.17s/it, train_loss=0.8706]


Fine-tune Epoch 1/50 - train_loss: 0.8706 val_acc: 0.3889
Saved best fine-tune checkpoint: /kaggle/working/simsiam_task4/finetune_best.pth


Fine-tune Epoch 2/50: 100%|██████████| 21/21 [10:34<00:00, 30.23s/it, train_loss=0.6193]


Fine-tune Epoch 2/50 - train_loss: 0.6193 val_acc: 0.5694
Saved best fine-tune checkpoint: /kaggle/working/simsiam_task4/finetune_best.pth


Fine-tune Epoch 3/50: 100%|██████████| 21/21 [10:30<00:00, 30.02s/it, train_loss=0.7477]


Fine-tune Epoch 3/50 - train_loss: 0.7477 val_acc: 0.7431
Saved best fine-tune checkpoint: /kaggle/working/simsiam_task4/finetune_best.pth


Fine-tune Epoch 4/50: 100%|██████████| 21/21 [10:27<00:00, 29.87s/it, train_loss=0.4576]


Fine-tune Epoch 4/50 - train_loss: 0.4576 val_acc: 0.7778
Saved best fine-tune checkpoint: /kaggle/working/simsiam_task4/finetune_best.pth


Fine-tune Epoch 5/50: 100%|██████████| 21/21 [10:29<00:00, 29.98s/it, train_loss=0.4598]


Fine-tune Epoch 5/50 - train_loss: 0.4598 val_acc: 0.8681
Saved best fine-tune checkpoint: /kaggle/working/simsiam_task4/finetune_best.pth


Fine-tune Epoch 6/50: 100%|██████████| 21/21 [10:29<00:00, 29.97s/it, train_loss=0.3948]


Fine-tune Epoch 6/50 - train_loss: 0.3948 val_acc: 0.8611


Fine-tune Epoch 7/50: 100%|██████████| 21/21 [10:37<00:00, 30.33s/it, train_loss=0.3207]


Fine-tune Epoch 7/50 - train_loss: 0.3207 val_acc: 0.6250


Fine-tune Epoch 8/50: 100%|██████████| 21/21 [10:31<00:00, 30.07s/it, train_loss=0.3516]


Fine-tune Epoch 8/50 - train_loss: 0.3516 val_acc: 0.8958
Saved best fine-tune checkpoint: /kaggle/working/simsiam_task4/finetune_best.pth


Fine-tune Epoch 9/50: 100%|██████████| 21/21 [10:33<00:00, 30.19s/it, train_loss=0.3168]


Fine-tune Epoch 9/50 - train_loss: 0.3168 val_acc: 0.8681


Fine-tune Epoch 10/50: 100%|██████████| 21/21 [10:34<00:00, 30.19s/it, train_loss=0.2560]


Fine-tune Epoch 10/50 - train_loss: 0.2560 val_acc: 0.8611


Fine-tune Epoch 11/50: 100%|██████████| 21/21 [10:39<00:00, 30.45s/it, train_loss=0.2551]


Fine-tune Epoch 11/50 - train_loss: 0.2551 val_acc: 0.8889


Fine-tune Epoch 12/50: 100%|██████████| 21/21 [10:44<00:00, 30.70s/it, train_loss=0.2649]


Fine-tune Epoch 12/50 - train_loss: 0.2649 val_acc: 0.8611


Fine-tune Epoch 13/50: 100%|██████████| 21/21 [10:29<00:00, 29.99s/it, train_loss=0.2212]


Fine-tune Epoch 13/50 - train_loss: 0.2212 val_acc: 0.8819


Fine-tune Epoch 14/50: 100%|██████████| 21/21 [10:25<00:00, 29.77s/it, train_loss=0.2540]


Fine-tune Epoch 14/50 - train_loss: 0.2540 val_acc: 0.8194


Fine-tune Epoch 15/50: 100%|██████████| 21/21 [10:26<00:00, 29.85s/it, train_loss=0.2133]


Fine-tune Epoch 15/50 - train_loss: 0.2133 val_acc: 0.8889


Fine-tune Epoch 16/50: 100%|██████████| 21/21 [10:37<00:00, 30.38s/it, train_loss=0.1158]


Fine-tune Epoch 16/50 - train_loss: 0.1158 val_acc: 0.9236
Saved best fine-tune checkpoint: /kaggle/working/simsiam_task4/finetune_best.pth


Fine-tune Epoch 17/50: 100%|██████████| 21/21 [10:22<00:00, 29.66s/it, train_loss=0.0835]


Fine-tune Epoch 17/50 - train_loss: 0.0835 val_acc: 0.9306
Saved best fine-tune checkpoint: /kaggle/working/simsiam_task4/finetune_best.pth


Fine-tune Epoch 18/50: 100%|██████████| 21/21 [10:35<00:00, 30.26s/it, train_loss=0.0748]


Fine-tune Epoch 18/50 - train_loss: 0.0748 val_acc: 0.9375
Saved best fine-tune checkpoint: /kaggle/working/simsiam_task4/finetune_best.pth


Fine-tune Epoch 19/50: 100%|██████████| 21/21 [10:25<00:00, 29.78s/it, train_loss=0.0656]


Fine-tune Epoch 19/50 - train_loss: 0.0656 val_acc: 0.9306


Fine-tune Epoch 20/50: 100%|██████████| 21/21 [10:23<00:00, 29.68s/it, train_loss=0.0606]


Fine-tune Epoch 20/50 - train_loss: 0.0606 val_acc: 0.9306


Fine-tune Epoch 21/50: 100%|██████████| 21/21 [10:29<00:00, 29.99s/it, train_loss=0.0553]


Fine-tune Epoch 21/50 - train_loss: 0.0553 val_acc: 0.9306


Fine-tune Epoch 22/50: 100%|██████████| 21/21 [10:24<00:00, 29.73s/it, train_loss=0.0635]


Fine-tune Epoch 22/50 - train_loss: 0.0635 val_acc: 0.9306


Fine-tune Epoch 23/50: 100%|██████████| 21/21 [10:28<00:00, 29.93s/it, train_loss=0.0465]


Fine-tune Epoch 23/50 - train_loss: 0.0465 val_acc: 0.9375


Fine-tune Epoch 24/50: 100%|██████████| 21/21 [10:34<00:00, 30.22s/it, train_loss=0.0815]


Fine-tune Epoch 24/50 - train_loss: 0.0815 val_acc: 0.9236


Fine-tune Epoch 25/50: 100%|██████████| 21/21 [10:33<00:00, 30.15s/it, train_loss=0.0767]


Fine-tune Epoch 25/50 - train_loss: 0.0767 val_acc: 0.9375


Fine-tune Epoch 26/50: 100%|██████████| 21/21 [10:25<00:00, 29.81s/it, train_loss=0.0469]


Fine-tune Epoch 26/50 - train_loss: 0.0469 val_acc: 0.9444
Saved best fine-tune checkpoint: /kaggle/working/simsiam_task4/finetune_best.pth


Fine-tune Epoch 27/50: 100%|██████████| 21/21 [10:33<00:00, 30.17s/it, train_loss=0.0623]


Fine-tune Epoch 27/50 - train_loss: 0.0623 val_acc: 0.9306


Fine-tune Epoch 28/50: 100%|██████████| 21/21 [10:34<00:00, 30.24s/it, train_loss=0.0466]


Fine-tune Epoch 28/50 - train_loss: 0.0466 val_acc: 0.9306


Fine-tune Epoch 29/50: 100%|██████████| 21/21 [10:31<00:00, 30.08s/it, train_loss=0.0395]


Fine-tune Epoch 29/50 - train_loss: 0.0395 val_acc: 0.9306


Fine-tune Epoch 30/50: 100%|██████████| 21/21 [10:35<00:00, 30.24s/it, train_loss=0.0448]


Fine-tune Epoch 30/50 - train_loss: 0.0448 val_acc: 0.9444


Fine-tune Epoch 31/50: 100%|██████████| 21/21 [10:38<00:00, 30.40s/it, train_loss=0.0296]


Fine-tune Epoch 31/50 - train_loss: 0.0296 val_acc: 0.9444


Fine-tune Epoch 32/50: 100%|██████████| 21/21 [10:43<00:00, 30.64s/it, train_loss=0.0322]


Fine-tune Epoch 32/50 - train_loss: 0.0322 val_acc: 0.9375


Fine-tune Epoch 33/50: 100%|██████████| 21/21 [10:29<00:00, 29.96s/it, train_loss=0.0306]


Fine-tune Epoch 33/50 - train_loss: 0.0306 val_acc: 0.9375


Fine-tune Epoch 34/50: 100%|██████████| 21/21 [10:34<00:00, 30.20s/it, train_loss=0.0329]


Fine-tune Epoch 34/50 - train_loss: 0.0329 val_acc: 0.9375


Fine-tune Epoch 35/50: 100%|██████████| 21/21 [10:33<00:00, 30.17s/it, train_loss=0.0472]


Fine-tune Epoch 35/50 - train_loss: 0.0472 val_acc: 0.9444


Fine-tune Epoch 36/50: 100%|██████████| 21/21 [10:37<00:00, 30.38s/it, train_loss=0.0243]


Fine-tune Epoch 36/50 - train_loss: 0.0243 val_acc: 0.9306


Fine-tune Epoch 37/50: 100%|██████████| 21/21 [10:33<00:00, 30.15s/it, train_loss=0.0286]


Fine-tune Epoch 37/50 - train_loss: 0.0286 val_acc: 0.9375


Fine-tune Epoch 38/50: 100%|██████████| 21/21 [10:38<00:00, 30.40s/it, train_loss=0.0308]


Fine-tune Epoch 38/50 - train_loss: 0.0308 val_acc: 0.9306


Fine-tune Epoch 39/50: 100%|██████████| 21/21 [10:33<00:00, 30.15s/it, train_loss=0.0283]


Fine-tune Epoch 39/50 - train_loss: 0.0283 val_acc: 0.9444


Fine-tune Epoch 40/50: 100%|██████████| 21/21 [10:43<00:00, 30.66s/it, train_loss=0.0349]


Fine-tune Epoch 40/50 - train_loss: 0.0349 val_acc: 0.9375


Fine-tune Epoch 41/50: 100%|██████████| 21/21 [10:31<00:00, 30.06s/it, train_loss=0.0262]


Fine-tune Epoch 41/50 - train_loss: 0.0262 val_acc: 0.9444


Fine-tune Epoch 42/50: 100%|██████████| 21/21 [10:40<00:00, 30.52s/it, train_loss=0.0232]


Fine-tune Epoch 42/50 - train_loss: 0.0232 val_acc: 0.9444


Fine-tune Epoch 43/50: 100%|██████████| 21/21 [10:38<00:00, 30.38s/it, train_loss=0.0282]


Fine-tune Epoch 43/50 - train_loss: 0.0282 val_acc: 0.9375


Fine-tune Epoch 44/50: 100%|██████████| 21/21 [10:30<00:00, 30.01s/it, train_loss=0.0302]


Fine-tune Epoch 44/50 - train_loss: 0.0302 val_acc: 0.9444


Fine-tune Epoch 45/50: 100%|██████████| 21/21 [10:23<00:00, 29.69s/it, train_loss=0.0270]


Fine-tune Epoch 45/50 - train_loss: 0.0270 val_acc: 0.9375


Fine-tune Epoch 46/50: 100%|██████████| 21/21 [10:35<00:00, 30.27s/it, train_loss=0.0399]


Fine-tune Epoch 46/50 - train_loss: 0.0399 val_acc: 0.9306


Fine-tune Epoch 47/50: 100%|██████████| 21/21 [10:31<00:00, 30.05s/it, train_loss=0.0271]


Fine-tune Epoch 47/50 - train_loss: 0.0271 val_acc: 0.9444


Fine-tune Epoch 48/50: 100%|██████████| 21/21 [10:33<00:00, 30.19s/it, train_loss=0.0240]


Fine-tune Epoch 48/50 - train_loss: 0.0240 val_acc: 0.9375


Fine-tune Epoch 49/50: 100%|██████████| 21/21 [10:40<00:00, 30.49s/it, train_loss=0.0271]


Fine-tune Epoch 49/50 - train_loss: 0.0271 val_acc: 0.9306


Fine-tune Epoch 50/50: 100%|██████████| 21/21 [10:31<00:00, 30.09s/it, train_loss=0.0240]


Fine-tune Epoch 50/50 - train_loss: 0.0240 val_acc: 0.9375
Fine-tune complete. Best val acc: 0.9444444444444444


## (Task 4.3) CELL 13 - Embedding visualization: UMAP, t-SNE, PCA & silhouette

In [17]:
import os
import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
try:
    from sklearn.manifold import TSNE
except Exception:
    TSNE = None


try:
    import umap
except Exception:
    umap = None


OUT_DIR = globals().get("OUT_DIR", "/kaggle/working/simsiam_task4")
if not os.path.exists(OUT_DIR):
    os.makedirs(OUT_DIR, exist_ok=True)

def safe_load(npname):
    p = os.path.join(OUT_DIR, npname)
    if not os.path.exists(p):
        raise FileNotFoundError(f"Missing required file: {p}")
    return np.load(p, allow_pickle=False)

train_feats = safe_load("train_feats.npy")
val_feats   = safe_load("val_feats.npy")
test_feats  = safe_load("test_feats.npy")
train_lbls  = safe_load("train_labels.npy")
val_lbls    = safe_load("val_labels.npy")
test_lbls   = safe_load("test_labels.npy")

feats_all = np.vstack([train_feats, val_feats, test_feats]).astype(np.float32)
lbls_all  = np.concatenate([train_lbls, val_lbls, test_lbls]).astype(int)

classes = globals().get("classes", ["Diseased", "Dried", "Healthy"])
SEED = globals().get("SEED", 42)


def scatter_and_save(X2, labels, title, fname):
    plt.figure(figsize=(8,6))
    for c_idx, c_name in enumerate(classes):
        idx = labels == c_idx
        if np.any(idx):
            plt.scatter(X2[idx,0], X2[idx,1], label=c_name, s=10, alpha=0.6)
    plt.legend()
    plt.title(title)
    outpath = os.path.join(OUT_DIR, fname)
    plt.savefig(outpath, dpi=150, bbox_inches="tight")
    plt.close()
    print("Saved:", outpath)


pca = PCA(n_components=2, random_state=SEED)
proj_pca = pca.fit_transform(feats_all)
scatter_and_save(proj_pca, lbls_all, "PCA projection of features", "pca_proj.png")


if umap is None:
    print("UMAP not installed or import failed; skipping UMAP projection.")
else:
    try:
        
        reducer = umap.UMAP(n_components=2, random_state=SEED)
        proj_umap = reducer.fit_transform(feats_all)
        scatter_and_save(proj_umap, lbls_all, "UMAP projection of features", "umap_proj.png")
    except Exception as e:
        
        print("UMAP failed:", repr(e))
        
        scatter_and_save(proj_pca, lbls_all, "UMAP fallback (PCA proxy)", "umap_proj_fallback_pca.png")
        print("Saved PCA proxy as UMAP fallback.")


if TSNE is None:
    print("TSNE not available; skipping t-SNE projection.")
else:
    try:
        n_samples = feats_all.shape[0]
        tsne_max = 2000
        if n_samples > tsne_max:
            idx_keep = []
            rng = np.random.RandomState(SEED)
            per_class = max(50, int(tsne_max / max(1, len(classes))))
            lbls_arr = lbls_all
            for c in range(len(classes)):
                positions = np.where(lbls_arr == c)[0]
                if len(positions) == 0:
                    continue
                k = min(len(positions), per_class)
                sel = rng.choice(positions, size=k, replace=False)
                idx_keep.extend(sel.tolist())
            idx_keep = np.array(sorted(idx_keep))
            feats_tsne = feats_all[idx_keep]
            labels_tsne = lbls_all[idx_keep]
            print(f"t-SNE: dataset too large ({n_samples}), subsampling to {len(idx_keep)} samples for speed.")
        else:
            feats_tsne = feats_all
            labels_tsne = lbls_all

        tsne = TSNE(n_components=2, perplexity=30, random_state=SEED, init='pca')
        proj_tsne = tsne.fit_transform(feats_tsne)
        
        fname = "tsne_proj.png" if feats_tsne.shape[0] == n_samples else "tsne_proj_subsample.png"
        scatter_and_save(proj_tsne, labels_tsne, "t-SNE projection of features", fname)
    except Exception as e:
        print("t-SNE failed or was interrupted:", repr(e))
        print("Skipping t-SNE.")


try:

        unique, counts = np.unique(lbls_all, return_counts=True)
    if len(unique) < 2 or np.min(counts) < 2:
        print("Silhouette not computed: need >=2 classes and >=2 samples per class.")
        sil = None
    else:
        sil = silhouette_score(feats_all, lbls_all)
        print("Silhouette score (features):", sil)
        with open(os.path.join(OUT_DIR, "embedding_stats.txt"), "w") as f:
            f.write(f"Silhouette: {sil}\n")
except Exception as e:
    print("Silhouette computation failed:", repr(e))
    sil = None

print("Embedding viz cell finished. Check files in:", OUT_DIR)


Saved: /kaggle/working/simsiam_task4/pca_proj.png
UMAP failed: TypeError("check_array() got an unexpected keyword argument 'ensure_all_finite'")
Saved: /kaggle/working/simsiam_task4/umap_proj_fallback_pca.png
Saved PCA proxy as UMAP fallback.
Saved: /kaggle/working/simsiam_task4/tsne_proj.png
Silhouette score (features): -0.019335013
Embedding viz cell finished. Check files in: /kaggle/working/simsiam_task4


## (Task 4.4) CELL 14 - Label-efficiency experiments (1%,5%,10%,25%,50% labeled) using linear probe

In [18]:
fractions = [0.01, 0.05, 0.10, 0.25, 0.50, 1.0]
results = {}
total_train = len(train_feats)
for frac in fractions:
    n = max(1, int(total_train * frac))

    subs_idx = []
    train_lbls_arr = np.array(train_lbls)
    for c in range(len(classes)):
        idxs = np.where(train_lbls_arr == c)[0]
        k = max(1, int(len(idxs) * frac))
        rng = np.random.RandomState(SEED)
        sel = rng.choice(idxs, size=k, replace=False)
        subs_idx.extend(sel.tolist())
    subs_idx = sorted(subs_idx)
    X_sub = train_feats[subs_idx]
    y_sub = train_lbls_arr[subs_idx]

    clf = LogisticRegression(max_iter=2000)
    clf.fit(X_sub, y_sub)
    test_pred = clf.predict(test_feats)
    acc = accuracy_score(test_lbls, test_pred)
    results[f"{int(frac*100)}%"] = float(acc)
    print(f"Fraction {int(frac*100)}% -> Test Acc: {acc:.4f}")

with open(os.path.join(OUT_DIR, "label_efficiency.json"), "w") as f:
    json.dump(results, f, indent=2)
print("Saved label-efficiency results to", os.path.join(OUT_DIR, "label_efficiency.json"))


Fraction 1% -> Test Acc: 0.6250
Fraction 5% -> Test Acc: 0.7472
Fraction 10% -> Test Acc: 0.7778
Fraction 25% -> Test Acc: 0.7889
Fraction 50% -> Test Acc: 0.8167
Fraction 100% -> Test Acc: 0.8167
Saved label-efficiency results to /kaggle/working/simsiam_task4/label_efficiency.json
