# Plan: Zero-Shot CLIP Baseline for Cdiscount

Objectives:
- Ship a working baseline ASAP that generates submission.csv.
- Use GPU-accelerated zero-shot image classification (CLIP) with category_names.csv since train data is not present.

Rationale:
- Repository includes test.bson, sample_submission.csv, and category_names.csv, but no train images/labels.
- Zero-shot CLIP can map image embeddings to text label embeddings of categories.
- This avoids training time and leverages GPU for fast throughput.

Milestones:
1) Environment + GPU check
2) Data inspection: read test.bson, confirm schema (product_id, images)
3) Build category text prompts from category_names.csv
4) Load CLIP (open_clip or OpenAI clip) on GPU
5) Encode category prompts (cache text embeddings)
6) Iterate over test.bson in streaming batches:
   - Decode images (PIL), preprocess
   - Encode images with CLIP
   - Compute similarities to category text embeddings
   - Argmax to get predicted category_id
   - Use majority vote across images per product_id
7) Write submission.csv and validate format vs sample_submission.csv
8) Time/perf logging per N batches; subsample smoke test first

Risk/Alternatives:
- Zero-shot accuracy may be limited with >5k classes; may still place well in this benchmark.
- If slow, reduce model size (e.g., ViT-B/16) and use fp16.

Next:
- Ask experts to confirm zero-shot CLIP viability and any prompt engineering tips.
- Then implement GPU check + BSON reader + CLIP pipeline with careful logging.

In [8]:
# Setup: inventory files, search for train data, and GPU check/install deps
import os, sys, subprocess, time, json, shutil, gc
from pathlib import Path

# Set CPU threading env BEFORE importing torch
os.environ.setdefault('OMP_NUM_THREADS', '24')
os.environ.setdefault('MKL_NUM_THREADS', '24')
os.environ.setdefault('OPENBLAS_NUM_THREADS', '24')

print('CWD:', os.getcwd())
print('Files:', os.listdir())

# Search for train data quickly (sanity check)
import subprocess
try:
    out = subprocess.check_output(['bash','-lc','set -o pipefail; find / -name "*train.bson*" 2>/dev/null | head -n 20'], text=True)
    print('Potential train files found:\n', out)
except Exception as e:
    print('find train.bson error:', e)

# Ensure key packages; install torch if missing
def ensure(pkg, pip_name=None, extra_args=None):
    pip_name = pip_name or pkg
    try:
        __import__(pkg)
        print(f'OK: {pkg} already installed')
    except Exception:
        cmd = [sys.executable, '-m', 'pip', 'install', pip_name]
        if extra_args:
            cmd += extra_args
        print('Installing', pip_name)
        subprocess.check_call(cmd)

ensure('torch', 'torch')
ensure('pandas', 'pandas')
ensure('numpy', 'numpy')
ensure('PIL', 'Pillow')
ensure('bson', 'pymongo')  # bson is provided by pymongo
ensure('open_clip', 'open_clip_torch')

import torch
print(f'GPU Available: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'GPU Name: {torch.cuda.get_device_name(0)}')
    props = torch.cuda.get_device_properties(0)
    print(f'GPU Memory: {props.total_memory / 1024**3:.1f} GB')

import pandas as pd
cat_df = pd.read_csv('category_names.csv')
ss = pd.read_csv('sample_submission.csv')
print('category_names.csv shape:', cat_df.shape)
print('sample_submission.csv shape:', ss.shape)
print(cat_df.head(3))
print(ss.head(3))
print('Unique _id in sample_submission:', ss['_id'].nunique())

print('Setup complete.')

CWD: /var/lib/simon/agent_run_states/cdiscount-image-classification-challenge-20250920-182308
Files: ['cache', 'test.bson', 'sample_submission.csv', 'category_names.csv', 'docker_run.log', 'requirements.txt', 'submission.csv', 'agent_metadata', 'task.txt', '00_eda_and_planning.ipynb', 'description.md']


find train.bson error: Command '['bash', '-lc', 'set -o pipefail; find / -name "*train.bson*" 2>/dev/null | head -n 20']' returned non-zero exit status 1.
OK: torch already installed
OK: pandas already installed
OK: numpy already installed
OK: PIL already installed
OK: bson already installed
OK: open_clip already installed
GPU Available: False
category_names.csv shape: (5270, 4)
sample_submission.csv shape: (706990, 2)
   category_id              category_level1     category_level2  \
0   1000021794        ABONNEMENT / SERVICES      CARTE PREPAYEE   
1   1000012764  AMENAGEMENT URBAIN - VOIRIE  AMENAGEMENT URBAIN   
2   1000012776  AMENAGEMENT URBAIN - VOIRIE  AMENAGEMENT URBAIN   

             category_level3  
0  CARTE PREPAYEE MULTIMEDIA  
1                ABRI FUMEUR  
2      ABRI VELO - ABRI MOTO  
   _id  category_id
0    6   1000010653
1    7   1000010653
2   12   1000010653
Unique _id in sample_submission: 706990
Setup complete.


In [7]:
# Build class list, prompts, load CLIP, and encode text embeddings (cacheable) + hierarchy maps
import pandas as pd, numpy as np, os, re, torch, open_clip, time, json
from pathlib import Path

# CPU-optimized settings (GPU unavailable): RN50 + more prompts
SUBSAMPLE_CLASSES = None  # encode all classes

cat_df = pd.read_csv('category_names.csv')
assert 'category_id' in cat_df.columns, 'category_id column missing in category_names.csv'

# Use provided hierarchy columns if present
lvl1_col = 'category_level1' if 'category_level1' in cat_df.columns else None
lvl3_col = None
for c in ['category_level3', 'category_name', 'name', 'title']:
    if c in cat_df.columns:
        lvl3_col = c
        break
assert lvl3_col is not None, 'No usable category name (level3) column found'

def clean_label(s):
    s = str(s)
    s = re.split(r'[>/,|]', s)[-1]
    s = s.strip()
    return s if s else 'produit'

# Level-3 (final classes)
class_ids_all = cat_df['category_id'].astype(int).tolist()
class_names_raw_all = cat_df[lvl3_col].astype(str).tolist()
class_names_all = [clean_label(x) for x in class_names_raw_all]
if SUBSAMPLE_CLASSES is not None:
    class_ids = class_ids_all[:SUBSAMPLE_CLASSES]
    class_names = class_names_all[:SUBSAMPLE_CLASSES]
else:
    class_ids = class_ids_all
    class_names = class_names_all
num_classes = len(class_ids)
print(f'Classes used (L3): {num_classes} / {len(class_ids_all)}; sample: {class_ids[:3]} -> {class_names[:3]}')

# Level-1 (superclasses)
if lvl1_col is not None:
    lvl1_all = cat_df[lvl1_col].astype(str).tolist()
    lvl1_names_unique = sorted(set(lvl1_all))
else:
    lvl1_names_unique = []
print('Level1 classes:', len(lvl1_names_unique))

# Prompt templates (FR + EN) - 9 templates
fr_tmps = [
    'une photo de {}',
    'une photo produit de {}',
    'une image de {}',
    'photo d\u2019un {}',
    '{} (produit)'
]
en_tmps = [
    'a product photo of a {}',
    'a catalog photo of a {}',
    'an image of a {}',
    'a {}'
]
templates = fr_tmps + en_tmps
print(f'Using {len(templates)} templates')

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Device:', device)
# Faster CPU model
model_name = 'RN50'
pretrained_tag = 'openai'  # available tags for RN50: ['openai','yfcc15m','cc12m']
print('Loading open_clip model:', model_name, pretrained_tag)
model, _, preprocess = open_clip.create_model_and_transforms(model_name, pretrained=pretrained_tag, device=device)
tokenizer = open_clip.get_tokenizer(model_name)
model.eval()
try:
    torch.set_num_threads(12)
    torch.set_num_interop_threads(1)
    torch.backends.mkldnn.enabled = True
    print('torch threads:', torch.get_num_threads(), 'interop:', torch.get_num_interop_threads(), 'mkldnn:', torch.backends.mkldnn.enabled)
except Exception as e:
    print('Could not set torch threading:', e)

# Encode and cache text embeddings helpers
cache_dir = Path('cache')
cache_dir.mkdir(exist_ok=True)

def encode_labels_to_embeds(labels, cache_key):
    text_cache_path = cache_dir / f'text_embeds_{model_name}_{pretrained_tag}_{len(templates)}tmps_{cache_key}.npy'
    order_cache_path = cache_dir / f'labels_order_{model_name}_{cache_key}.json'
    if text_cache_path.exists() and order_cache_path.exists():
        try:
            embeds = np.load(text_cache_path)
            cached_labels = json.loads(order_cache_path.read_text())
            if cached_labels == labels:
                print(f'Loaded cached text embeddings for {cache_key}:', embeds.shape)
                t = torch.from_numpy(embeds).to(device)
                return t / t.norm(dim=-1, keepdim=True)
            else:
                print('Label order changed; recomputing for', cache_key)
        except Exception as e:
            print('Cache load failed for', cache_key, e)
    with torch.no_grad():
        all_embeds = []
        start = time.time()
        for idx, name in enumerate(labels):
            texts = [t.format(name) for t in templates]
            text_tokens = tokenizer(texts).to(device)
            tf = model.encode_text(text_tokens)
            tf = tf / tf.norm(dim=-1, keepdim=True)
            cls_embed = tf.mean(dim=0, keepdim=True)
            cls_embed = cls_embed / cls_embed.norm(dim=-1, keepdim=True)
            all_embeds.append(cls_embed)
            if (idx+1) % 500 == 0 or (idx+1) == len(labels):
                print(f'Encoded {idx+1}/{len(labels)} labels for {cache_key} in {time.time()-start:.1f}s')
        embeds = torch.cat(all_embeds, dim=0).float().cpu().numpy()
    np.save(text_cache_path, embeds)
    order_cache_path.write_text(json.dumps(labels))
    t = torch.from_numpy(embeds).to(device)
    return t / t.norm(dim=-1, keepdim=True)

# Build Level1 -> Level3 index map
lvl1_to_children_idx = {}  # level1 index -> list of L3 indices
lvl1_names = lvl1_names_unique
lvl1_name_to_idx = {n:i for i,n in enumerate(lvl1_names)}
if lvl1_names:
    # Map based on cat_df rows to indices in our class_ids list
    l3_id_to_pos = {cid:i for i,cid in enumerate(class_ids)}
    for _, row in cat_df.iterrows():
        cid = int(row['category_id'])
        if cid not in l3_id_to_pos:
            continue
        l3_pos = l3_id_to_pos[cid]
        l1_name = str(row[lvl1_col]) if lvl1_col else None
        if l1_name is None:
            continue
        l1_idx = lvl1_name_to_idx[l1_name]
        lvl1_to_children_idx.setdefault(l1_idx, []).append(l3_pos)
    print('Built Level1->Level3 child map for', len(lvl1_to_children_idx), 'Level1 nodes')

# Encode text embeddings for Level3 (final) and Level1 (coarse) labels
text_embeds_l3_t = encode_labels_to_embeds(class_names, cache_key=f'L3_{num_classes}cls')
if lvl1_names:
    text_embeds_l1_t = encode_labels_to_embeds(lvl1_names, cache_key=f'L1_{len(lvl1_names)}cls')
else:
    text_embeds_l1_t = None

# Make variables available for inference cells
text_embeds_t = text_embeds_l3_t  # backwards compat
print('Final: L3 embeds:', tuple(text_embeds_l3_t.shape), 'L1 embeds:', None if text_embeds_l1_t is None else tuple(text_embeds_l1_t.shape))
print('Done text encoding.')

Classes used (L3): 5270 / 5270; sample: [1000021794, 1000012764, 1000012776] -> ['CARTE PREPAYEE MULTIMEDIA', 'ABRI FUMEUR', 'ABRI VELO - ABRI MOTO']
Level1 classes: 49
Using 9 templates
Device: cpu
Loading open_clip model: RN50 openai




torch threads: 12 interop: 1 mkldnn: True
Built Level1->Level3 child map for 49 Level1 nodes


Encoded 500/5270 labels for L3_5270cls in 36.0s


Encoded 1000/5270 labels for L3_5270cls in 72.1s


Encoded 1500/5270 labels for L3_5270cls in 108.0s


Encoded 2000/5270 labels for L3_5270cls in 143.9s


Encoded 2500/5270 labels for L3_5270cls in 179.9s


Encoded 3000/5270 labels for L3_5270cls in 215.9s


Encoded 3500/5270 labels for L3_5270cls in 251.9s


Encoded 4000/5270 labels for L3_5270cls in 288.1s


Encoded 4500/5270 labels for L3_5270cls in 323.9s


Encoded 5000/5270 labels for L3_5270cls in 359.5s


Encoded 5270/5270 labels for L3_5270cls in 378.6s


Encoded 49/49 labels for L1_49cls in 3.5s
Final: L3 embeds: (5270, 1024) L1 embeds: (49, 1024)
Done text encoding.


In [None]:
# Inference: stream test.bson, encode images, score vs text (hierarchical with L1 top-k), and build submission
import bson
from io import BytesIO
from PIL import Image, UnidentifiedImageError
import pandas as pd, numpy as np, torch, time, sys, collections

# CPU-friendly knobs
SUBSAMPLE_PRODUCTS = None   # full run
USE_TTA_HFLIP = False       # keep False for CPU
BATCH_IMAGES = 256          # tune {192,256,320,384}
L1_TOPK = 2                 # beam size at Level-1

ss = pd.read_csv('sample_submission.csv')
ss_ids = ss['_id'].tolist()
print('Sample submission size:', len(ss_ids))

def pil_from_bytes(b):
    try:
        return Image.open(BytesIO(b)).convert('RGB')
    except UnidentifiedImageError:
        return None

# Batch encode helper
def flush_batch(img_tensors, owners):
    if not img_tensors:
        return {}
    batch = torch.stack(img_tensors).to(device)
    with torch.no_grad():
        feats = model.encode_image(batch)
        feats = feats / feats.norm(dim=-1, keepdim=True)
    prod_feats = {}
    for feat, pid in zip(feats, owners):  # one image per product in our pipeline
        prod_feats[pid] = feat / feat.norm()
    return prod_feats

def hierarchical_score_topk(prod_feats_dict):
    # Per-product scoring with L1 top-k beam, then score union of children
    out = {}
    for pid, feat in prod_feats_dict.items():
        if text_embeds_l1_t is not None and len(lvl1_to_children_idx) > 0:
            with torch.no_grad():
                sims_l1 = (feat @ text_embeds_l1_t.T)  # [L1]
                topk_vals, topk_idx = torch.topk(sims_l1, k=min(L1_TOPK, sims_l1.shape[0]))
            child_set = []
            for l1i in topk_idx.tolist():
                child_set.extend(lvl1_to_children_idx.get(int(l1i), []))
            if not child_set:
                # flat fallback
                with torch.no_grad():
                    sims = feat @ text_embeds_t.T
                    ti = int(torch.argmax(sims).item())
                out[pid] = int(class_ids[ti])
            else:
                # dedup children
                child_set = list(dict.fromkeys(child_set))
                child_mat = text_embeds_t[child_set]  # [K, D]
                with torch.no_grad():
                    sims_c = feat @ child_mat.T  # [K]
                    cp = int(torch.argmax(sims_c).item())
                cls_pos = child_set[cp]
                out[pid] = int(class_ids[cls_pos])
        else:
            with torch.no_grad():
                sims = feat @ text_embeds_t.T
                ti = int(torch.argmax(sims).item())
            out[pid] = int(class_ids[ti])
    return out

# Threading (ensure high CPU throughput)
try:
    torch.set_num_threads(24)
    torch.set_num_interop_threads(1)
    torch.backends.mkldnn.enabled = True
    print('torch threads:', torch.get_num_threads(), 'interop:', torch.get_num_interop_threads())
except Exception as e:
    print('Thread config failed:', e)

pred_map = {}  # product_id -> category_id
fallback_cat = int(class_ids[0])
start = time.time()
img_tensors, owners = [], []
img_count = 0
predicted_so_far = 0
log_every_preds = 2000

with open('test.bson', 'rb') as f:
    for doc in bson.decode_file_iter(f):
        pid = int(doc['_id'])
        pics = doc.get('imgs') or doc.get('pictures') or doc.get('images') or []
        # Use only the largest image by byte size per product for speed
        best_b = None
        best_sz = -1
        for p in pics:
            b = p.get('picture') or p.get('img') or p.get('data')
            if b is None:
                continue
            sz = len(b) if hasattr(b, '__len__') else 0
            if sz > best_sz:
                best_sz = sz
                best_b = b
        if best_b is not None:
            im = pil_from_bytes(best_b)
            if im is not None:
                img_tensors.append(preprocess(im))
                owners.append(pid)
                img_count += 1

        if len(img_tensors) >= BATCH_IMAGES:
            prod_feats = flush_batch(img_tensors, owners)
            img_tensors.clear(); owners.clear()
            if prod_feats:
                preds_chunk = hierarchical_score_topk(prod_feats)
                pred_map.update(preds_chunk)
                predicted_so_far = len(pred_map)
                if predicted_so_far % log_every_preds == 0:
                    dt = time.time() - start
                    ips = img_count / max(dt, 1e-6)
                    print(f'Pred products: {predicted_so_far}, imgs processed: {img_count}, elapsed: {dt:.1f}s, imgs/s: {ips:.1f}', flush=True)
                if SUBSAMPLE_PRODUCTS is not None and predicted_so_far >= SUBSAMPLE_PRODUCTS:
                    print(f'Stopping early after {predicted_so_far} products (profile)')
                    break
    else:
        pass

if img_tensors:
    prod_feats = flush_batch(img_tensors, owners)
    if prod_feats:
        preds_chunk = hierarchical_score_topk(prod_feats)
        pred_map.update(preds_chunk)

dt = time.time() - start
ips = img_count / max(dt, 1e-6)
print(f'Total predicted products: {len(pred_map)}; images processed: {img_count}; elapsed: {dt:.1f}s; imgs/s: {ips:.1f}')

# Build submission in sample order; fill missing with fallback (should be minimal)
preds = [pred_map.get(pid, fallback_cat) for pid in ss_ids]
sub = pd.DataFrame({'_id': ss_ids, 'category_id': preds})
sub.to_csv('submission.csv', index=False)
print('Wrote submission.csv with shape:', sub.shape)
print(sub.head())

Sample submission size: 706990
Thread config failed: Error: cannot set number of interop threads after parallel work has started or set_num_interop_threads called


In [4]:
# Quick scan: count products and images in test.bson, estimate runtime
import bson, time, os
from io import BytesIO
from PIL import Image, UnidentifiedImageError

path = 'test.bson'
size_gb = os.path.getsize(path) / (1024**3)
print(f'test.bson size: {size_gb:.3f} GB')

start = time.time()
n_prod = 0
n_imgs = 0
bad_imgs = 0
sample_limit = 2000  # stop early to estimate throughput
t0 = time.time()
with open(path, 'rb') as f:
    for doc in bson.decode_file_iter(f):
        n_prod += 1
        pics = doc.get('imgs') or []
        n_imgs += len(pics)
        if n_prod % 10000 == 0:
            dt = time.time() - start
            print(f'Counted {n_prod} products, {n_imgs} images in {dt:.1f}s')
        if sample_limit and n_prod >= sample_limit:
            break
dt = time.time() - start
pps = n_prod / max(dt, 1e-6)
ips = n_imgs / max(dt, 1e-6)
print(f'Sampled {n_prod} products, {n_imgs} images in {dt:.2f}s  (products/s ~ {pps:.1f}, images/s ~ {ips:.1f})')
print('Note: This is scan speed only (no model). Use to estimate total product count feasibility.')

test.bson size: 5.814 GB
Sampled 2000 products, 3242 images in 0.01s  (products/s ~ 201950.2, images/s ~ 327361.3)
Note: This is scan speed only (no model). Use to estimate total product count feasibility.
