# Stage-2 Recognizer: GT-crop classifier (smoke test)

Plan:
- Parse train.csv (unicode x y w h) and folds_group.csv; use fold 0 for val.
- Build GT crop dataset with 15% padding, resized to 192x192, square padding.
- Label encode unicode tokens; balanced sampling.
- Model: torchvision resnet50 (ImageNet weights), replace fc to num_classes.
- Train 1 epoch smoke (cap steps) to validate pipeline; save checkpoint and class map.
- Next: extend epochs, add aug, and later run on detector crops for full E2E.

In [None]:
# GT crops dataset and ResNet50 classifier (fold 0; with unicode mapping + full training)
import os, math, time, json, random, re
from pathlib import Path
import numpy as np
import pandas as pd
from PIL import Image, ImageOps
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch.amp import autocast, GradScaler
import torchvision
from torchvision import transforms as T

torch.backends.cudnn.benchmark = True
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device:', device)

CWD = Path('.')
train_csv = CWD / 'train.csv'
folds_csv = CWD / 'folds_group.csv'
trans_csv = CWD / 'unicode_translation.csv'
train_dir = CWD / 'train_images'
assert train_csv.exists() and folds_csv.exists() and train_dir.exists()
df_train = pd.read_csv(train_csv)
df_folds = pd.read_csv(folds_csv)
df_trans = pd.read_csv(trans_csv) if trans_csv.exists() else pd.DataFrame()

def parse_labels_full(labels: str):
    if not isinstance(labels, str) or labels.strip() == '':
        return []
    toks = labels.strip().split()
    out = []
    if len(toks) % 5 != 0:
        return out
    for i in range(0, len(toks), 5):
        u, x, y, w, h = toks[i:i+5]
        try:
            out.append((u, int(x), int(y), int(w), int(h)))
        except:
            pass
    return out

# Build per-image annotations
rows = []
for r in df_train.itertuples(index=False):
    image_id = getattr(r, 'image_id') if hasattr(r, 'image_id') else r[0]
    labels = getattr(r, 'labels') if hasattr(r, 'labels') else r[1]
    for (u,x,y,w,h) in parse_labels_full(labels):
        rows.append((image_id,u,x,y,w,h))
df_anns = pd.DataFrame(rows, columns=['image_id','unicode','x','y','w','h'])
print('Annotations (raw):', df_anns.shape)

# Build unicode mapping to canonical tokens if available
def build_unicode_map(df_trans: pd.DataFrame):
    if df_trans is None or df_trans.empty:
        return {}
    # Identify columns with many U+ tokens
    cand_cols = []
    for c in df_trans.columns:
        try:
            s = df_trans[c].astype(str)
        except Exception:
            continue
        m = s.str.match(r'^U\+[0-9A-Fa-f]+$').fillna(False).mean()
        if m > 0.2:
            cand_cols.append(c)
    if not cand_cols:
        return {}
    def canon_score(c):
        name = c.lower()
        score = 0
        if any(k in name for k in ['canon', 'target', 'to', 'new']):
            score += 2
        return score
    cand_cols_sorted = sorted(cand_cols, key=lambda c: (-canon_score(c), df_trans[c].nunique()))
    canon_col = cand_cols_sorted[0]
    mapping = {}
    for c in cand_cols:
        if c == canon_col:
            continue
        for a, b in zip(df_trans[c].astype(str), df_trans[canon_col].astype(str)):
            if re.match(r'^U\+[0-9A-Fa-f]+$', str(a)) and re.match(r'^U\+[0-9A-Fa-f]+$', str(b)) and a != b:
                mapping[a] = b
    print('Unicode mapping built:', len(mapping), 'mappings; canonical col =', canon_col)
    return mapping

u_map = build_unicode_map(df_trans)
if u_map:
    df_anns['unicode'] = df_anns['unicode'].map(lambda x: u_map.get(x, x))

print('Annotations (mapped) head:', df_anns.head(2).to_dict('records'))

# Label encode unicode tokens
unicodes = sorted(df_anns['unicode'].unique().tolist())
class_to_idx = {u:i for i,u in enumerate(unicodes)}
num_classes = len(class_to_idx)
print('Num classes (mapped):', num_classes)
Path('recognizer_classes.json').write_text(json.dumps(class_to_idx))

def clamp(val, lo, hi):
    return max(lo, min(hi, val))

IMG_SIZE = 192
PAD_RATIO = 0.25  # increased for robustness to detector noise
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
train_tf = T.Compose([
    T.RandomApply([T.ColorJitter(0.1,0.1,0.1,0.0)], p=0.5),
    T.RandomAffine(degrees=5, translate=(0.05,0.05), scale=(0.9,1.1), shear=5),
    T.RandomApply([T.GaussianBlur(kernel_size=3, sigma=(0.1,0.5))], p=0.3),
    T.ToTensor(),
    T.Normalize(mean, std),
])
val_tf = T.Compose([
    T.ToTensor(),
    T.Normalize(mean, std),
])

def load_and_crop(image_id, x, y, w, h):
    p = train_dir / f'{image_id}.jpg'
    if not p.exists():
        alt = train_dir / f'{image_id}.png'
        if alt.exists():
            p = alt
    img = Image.open(p).convert('RGB')
    W, H = img.size
    dx = int(round(w * PAD_RATIO)); dy = int(round(h * PAD_RATIO))
    x1 = clamp(x - dx, 0, W-1); y1 = clamp(y - dy, 0, H-1)
    x2 = clamp(x + w + dx, 1, W); y2 = clamp(y + h + dy, 1, H)
    crop = img.crop((x1, y1, x2, y2))
    cw, ch = crop.size
    if cw != ch:
        m = max(cw, ch)
        pad_w = m - cw; pad_h = m - ch
        crop = ImageOps.expand(crop, border=(0,0,pad_w,pad_h), fill=0) if cw < m or ch < m else crop
    crop = crop.resize((IMG_SIZE, IMG_SIZE), Image.BILINEAR)
    return crop

class GTCropDataset(Dataset):
    def __init__(self, df_anns: pd.DataFrame, df_folds: pd.DataFrame, fold: int, split: str):
        fold_map = dict(df_folds.values)
        keep = df_anns['image_id'].map(fold_map.get)
        if split == 'train':
            mask = keep != fold
        else:
            mask = keep == fold
        self.df = df_anns.loc[mask].reset_index(drop=True)
        self.split = split
        print(split, 'samples:', len(self.df))
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        r = self.df.iloc[idx]
        img = load_and_crop(r.image_id, r.x, r.y, r.w, r.h)
        x = train_tf(img) if self.split == 'train' else val_tf(img)
        y = class_to_idx[r.unicode]
        return x, y

fold = 0
train_ds = GTCropDataset(df_anns, df_folds, fold, 'train')
val_ds = GTCropDataset(df_anns, df_folds, fold, 'val')

# Balanced sampler by inverse frequency
cls_counts = train_ds.df['unicode'].map(train_ds.df['unicode'].value_counts())
weights = 1.0 / cls_counts.values.astype(np.float64)
sampler = WeightedRandomSampler(weights=torch.as_tensor(weights, dtype=torch.double), num_samples=min(len(train_ds), 100000), replacement=True)

batch_size = 128
num_workers = min(8, os.cpu_count() or 2)
train_loader = DataLoader(train_ds, batch_size=batch_size, sampler=sampler, num_workers=num_workers, pin_memory=True)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)

# Model
from torchvision.models import resnet50, ResNet50_Weights
weights = ResNet50_Weights.IMAGENET1K_V2
model = resnet50(weights=weights)
in_features = model.fc.in_features
model.fc = nn.Linear(in_features, num_classes)
model.to(device)

criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
# Param groups: lower LR for backbone, higher for head
backbone_params = [p for n,p in model.named_parameters() if not n.startswith('fc.')]
head_params = list(model.fc.parameters())
optimizer = optim.AdamW([
    {'params': backbone_params, 'lr': 1e-4},
    {'params': head_params, 'lr': 1e-3},
], weight_decay=1e-4)
scaler = GradScaler('cuda') if device.type=='cuda' else None

# Cosine schedule with warmup (step-wise)
epochs = 12
steps_per_epoch = math.ceil((sampler.num_samples if hasattr(sampler, 'num_samples') else len(train_ds)) / batch_size)
total_steps = epochs * steps_per_epoch
warmup_steps = 500
base_lrs = [g['lr'] for g in optimizer.param_groups]

def set_lrs(scale):
    for g, base_lr in zip(optimizer.param_groups, base_lrs):
        g['lr'] = base_lr * scale

# EMA
use_ema = True
ema_decay = 0.999
ema_state = None
def init_ema():
    # clone current model state
    return {k: v.detach().clone() for k, v in model.state_dict().items()}
def update_ema():
    if not use_ema:
        return
    with torch.no_grad():
        msd = model.state_dict()
        for k, v in msd.items():
            if k not in ema_state:
                ema_state[k] = v.detach().clone()
                continue
            if torch.is_floating_point(v):
                # ema = decay*ema + (1-decay)*v
                ema_state[k].copy_(ema_state[k] * ema_decay + v.detach() * (1.0 - ema_decay))
            else:
                # For non-float buffers (e.g., num_batches_tracked), copy directly
                ema_state[k] = v.detach().clone()

def eval_with_state(state_dict):
    backup = {k: v.detach().clone() for k, v in model.state_dict().items()}
    model.load_state_dict(state_dict, strict=False)
    acc = evaluate(model, val_loader)
    model.load_state_dict(backup, strict=False)
    return acc

def train_one_epoch(model, loader, epoch, global_step0):
    model.train()
    t0 = time.time(); last = t0; n=0; loss_sum=0.0; gstep = global_step0
    for i,(x,y) in enumerate(loader):
        # LR schedule
        if gstep < warmup_steps:
            set_lrs((gstep+1)/max(1,warmup_steps))
        else:
            # cosine from 1.0 -> 0.0 over total_steps-warmup
            t = (gstep - warmup_steps) / max(1, total_steps - warmup_steps)
            cos_scale = 0.5 * (1 + math.cos(math.pi * t))
            set_lrs(cos_scale)

        x = x.to(device, non_blocking=True); y = y.to(device, non_blocking=True)
        optimizer.zero_grad(set_to_none=True)
        if scaler is not None:
            with autocast('cuda'):
                logits = model(x)
                loss = criterion(logits, y)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            logits = model(x)
            loss = criterion(logits, y)
            loss.backward(); optimizer.step()
        # EMA update
        if use_ema:
            update_ema()

        n += 1; loss_sum += loss.item(); gstep += 1
        if time.time()-last > 5:
            mem = torch.cuda.memory_allocated()/1024**3 if device.type=='cuda' else 0.0
            print(f'Epoch {epoch} iter {i} avg_loss {loss_sum/n:.4f} step {gstep}/{total_steps} mem {mem:.2f}GB elapsed {time.time()-t0:.1f}s', flush=True); last=time.time()
    print(f'Epoch {epoch} done: steps {n}, avg {loss_sum/max(1,n):.4f}, time {time.time()-t0:.1f}s')
    return gstep

def evaluate(model, loader):
    model.eval()
    correct=0; total=0
    with torch.no_grad():
        for x,y in loader:
            x = x.to(device); y = y.to(device)
            logits = model(x)
            pred = logits.argmax(1)
            correct += (pred==y).sum().item()
            total += y.numel()
    acc = correct/max(1,total)
    print('Val acc:', f'{acc:.4f}', 'total:', total)
    return acc

best_acc = 0.0
best_ckpt = 'recognizer_resnet50_fold0_best.pth'
ema_state = init_ema() if use_ema else None
global_step = 0
for ep in range(1, epochs+1):
    global_step = train_one_epoch(model, train_loader, ep, global_step)
    # Evaluate with EMA weights if enabled
    if use_ema:
        acc = eval_with_state(ema_state)
    else:
        acc = evaluate(model, val_loader)
    if acc > best_acc:
        best_acc = acc
        to_save = ema_state if use_ema else model.state_dict()
        torch.save({'model': to_save, 'num_classes': num_classes, 'classes': class_to_idx, 'ema': use_ema}, best_ckpt)
        print('Saved best recognizer checkpoint (acc=', f'{acc:.4f}', ')')

# Always save last checkpoint and mapping (EMA if used)
last_ckpt = 'recognizer_resnet50_fold0_last.pth'
to_save_last = ema_state if use_ema else model.state_dict()
torch.save({'model': to_save_last, 'num_classes': num_classes, 'classes': class_to_idx, 'ema': use_ema}, last_ckpt)
Path('recognizer_classes.json').write_text(json.dumps(class_to_idx))
print('Saved last recognizer checkpoint and classes mapping')
print('Best acc:', f'{best_acc:.4f}')

Device: cuda


Annotations (raw): (613505, 6)
Unicode mapping built: 0 mappings; canonical col = Unicode
Annotations (mapped) head: [{'image_id': '200004148_00015_1', 'unicode': 'U+306F', 'x': 1187, 'y': 361, 'w': 47, 'h': 27}, {'image_id': '200004148_00015_1', 'unicode': 'U+306F', 'x': 1487, 'y': 2581, 'w': 48, 'h': 28}]
Num classes (mapped): 4113
train samples: 497857


val samples: 115648


In [3]:
# Save recognizer checkpoint from in-memory model after interrupted training
import torch, json
from pathlib import Path

assert 'model' in globals(), 'Recognizer model not found in memory. Re-run training cell 1.'
assert 'class_to_idx' in globals() and 'num_classes' in globals(), 'Class mapping not found. Re-run training cell 1.'

ckpt_path = Path('recognizer_resnet50_fold0_ep1.pth')
torch.save({'model': model.state_dict(), 'num_classes': num_classes, 'classes': class_to_idx}, ckpt_path)
Path('recognizer_classes.json').write_text(json.dumps(class_to_idx))
print('Saved recognizer checkpoint to', ckpt_path.resolve())
print('Saved classes to recognizer_classes.json with', len(class_to_idx), 'classes')

Saved recognizer checkpoint to /var/lib/simon/agent_run_states/kuzushiji-recognition-20250929-180012/recognizer_resnet50_fold0_ep1.pth
Saved classes to recognizer_classes.json with 4113 classes


In [4]:
# Inference: classify detector crops on test and assemble submission.csv
import json
import pandas as pd
from collections import defaultdict

CWD = Path('.')
test_dir = CWD / 'test_images'
det_test_path = CWD / 'det_test_preds.parquet'
ckpt_path = CWD / 'recognizer_resnet50_fold0_ep1.pth'
classes_path = CWD / 'recognizer_classes.json'
assert det_test_path.exists(), 'det_test_preds.parquet missing'
assert ckpt_path.exists() and classes_path.exists(), 'recognizer checkpoint or classes mapping missing'

# Load classes
class_to_idx = json.loads(classes_path.read_text())
idx_to_class = {int(v): k for k, v in class_to_idx.items()}
num_classes = len(idx_to_class)

# Build model
from torchvision.models import resnet50, ResNet50_Weights
weights = None  # will load our trained head
rec_model = resnet50(weights=weights)
in_features = rec_model.fc.in_features
rec_model.fc = nn.Linear(in_features, num_classes)
state = torch.load(ckpt_path, map_location='cpu')
rec_model.load_state_dict(state['model'], strict=False)
rec_model.to(device)
rec_model.eval()

val_tf = T.Compose([T.ToTensor(), T.Normalize(mean, std)])

def crop_from_box(image_id: str, x: float, y: float, w: float, h: float):
    p = test_dir / f'{image_id}.jpg'
    if not p.exists():
        alt = test_dir / f'{image_id}.png'
        if alt.exists():
            p = alt
    img = Image.open(p).convert('RGB')
    W, H = img.size
    dx = int(round(w * PAD_RATIO)); dy = int(round(h * PAD_RATIO))
    x1 = clamp(int(x) - dx, 0, W-1); y1 = clamp(int(y) - dy, 0, H-1)
    x2 = clamp(int(x + w) + dx, 1, W); y2 = clamp(int(y + h) + dy, 1, H)
    crop = img.crop((x1, y1, x2, y2))
    cw, ch = crop.size
    if cw != ch:
        m = max(cw, ch)
        pad_w = m - cw; pad_h = m - ch
        crop = ImageOps.expand(crop, border=(0,0,pad_w,pad_h), fill=0) if cw < m or ch < m else crop
    crop = crop.resize((IMG_SIZE, IMG_SIZE), Image.BILINEAR)
    return crop

# Load detector predictions
det_df = pd.read_parquet(det_test_path)
print('Detector test preds:', det_df.shape)

# Group by image and run batched classification per image for speed
rows_out = []
df_sample = pd.read_csv('sample_submission.csv')
grp = det_df.groupby('image_id')
t0 = time.time()
processed = 0
for image_id in df_sample['image_id']:
    if image_id in grp.groups:
        g = grp.get_group(image_id)
        # build batch crops
        crops = [crop_from_box(image_id, x, y, w, h) for x, y, w, h in zip(g['x'].values, g['y'].values, g['w'].values, g['h'].values)]
        if len(crops) == 0:
            rows_out.append('')
            continue
        xs = torch.stack([val_tf(c) for c in crops], dim=0).to(device)
        with torch.no_grad():
            logits = rec_model(xs)
            pred_idx = logits.argmax(1).detach().cpu().numpy().tolist()
        pred_unicodes = [idx_to_class.get(int(i), 'U+003F') for i in pred_idx]
        # centers for submission
        cx = (g['x'].values + g['w'].values/2.0).round().astype(int).tolist()
        cy = (g['y'].values + g['h'].values/2.0).round().astype(int).tolist()
        toks = []
        for u, x_, y_ in zip(pred_unicodes, cx, cy):
            toks.extend([u, str(int(x_)), str(int(y_))])
        rows_out.append(' '.join(toks))
    else:
        rows_out.append('')
    processed += 1
    if processed % 50 == 0:
        print(f'Processed {processed}/{len(df_sample)} images in {time.time()-t0:.1f}s', flush=True)

sub_df = pd.DataFrame({'image_id': df_sample['image_id'], 'labels': rows_out})
sub_df.to_csv('submission.csv', index=False)
print('Wrote submission.csv with shape', sub_df.shape)
print(sub_df.head(2))

  state = torch.load(ckpt_path, map_location='cpu')


Detector test preds: (34458, 6)


Processed 50/361 images in 101.3s


Processed 100/361 images in 208.0s


Processed 150/361 images in 308.4s


Processed 200/361 images in 417.2s


Processed 250/361 images in 521.4s


Processed 300/361 images in 619.7s


Processed 350/361 images in 723.8s


Wrote submission.csv with shape (361, 2)
      image_id                                             labels
0  umgy007-028  U+904A 1285 2603 U+65BC 341 903 U+3067 1294 14...
1  hnsd004-026  U+6575 493 582 U+3093 1246 2424 U+975E 501 174...


In [None]:
# Assemble submission using best detector threshold/cap and best recognizer (EMA) checkpoint
import json, time
import pandas as pd
import numpy as np
from pathlib import Path
from PIL import Image, ImageOps
import torch
import torch.nn as nn
from torchvision import transforms as T
from torchvision.models import resnet50

CWD = Path('.')
best_cfg_path = CWD / 'det_threshold_best_fold0.json'
det_test_path = CWD / 'det_test_preds.parquet'
classes_path = CWD / 'recognizer_classes.json'
best_ckpt_path = CWD / 'recognizer_resnet50_fold0_best.pth'
test_dir = CWD / 'test_images'
sample_path = CWD / 'sample_submission.csv'
assert best_cfg_path.exists() and det_test_path.exists() and classes_path.exists() and best_ckpt_path.exists() and sample_path.exists(), 'Missing required artifacts for assembly'

best_cfg = json.loads(best_cfg_path.read_text())
th = float(best_cfg['best_threshold']); cap = int(best_cfg['best_cap'])
use_dedup = bool(best_cfg.get('dedup', False))
dedup_radius = float(best_cfg.get('dedup_radius', 6.0))
print('Using detector filter:', best_cfg)

# Load detector predictions and filter
det_df = pd.read_parquet(det_test_path)
print('Raw test preds:', det_df.shape, 'score stats:', det_df['score'].describe().to_dict())
det_df = det_df[det_df['score'] >= th].copy()
det_df.sort_values(['image_id','score'], ascending=[True, False], inplace=True)
det_df['rn'] = det_df.groupby('image_id').cumcount()
det_df = det_df[det_df['rn'] < cap].drop(columns=['rn'])

def dedup_centers_df(df_img: pd.DataFrame, radius: float = 6.0) -> pd.DataFrame:
    if len(df_img) <= 1:
        return df_img
    g = df_img.sort_values('score', ascending=False).reset_index(drop=True)
    keep_idx = []
    kept = []
    for i, (cx, cy) in enumerate((g['x'] + g['w']/2.0, g['y'] + g['h']/2.0)):
        pass  # placeholder (will be replaced below)
    # Implement without vectorized zip to maintain clarity
    keep_idx = []
    kept = []
    for i in range(len(g)):
        cx = float(g.loc[i, 'x'] + g.loc[i, 'w']/2.0)
        cy = float(g.loc[i, 'y'] + g.loc[i, 'h']/2.0)
        ok = True
        for (kx, ky) in kept:
            if (cx - kx)*(cx - kx) + (cy - ky)*(cy - ky) <= radius*radius:
                ok = False; break
        if ok:
            keep_idx.append(i); kept.append((cx, cy))
    return g.iloc[keep_idx].reset_index(drop=True)

if use_dedup:
    parts = []
    for img_id, g in det_df.groupby('image_id'):
        parts.append(dedup_centers_df(g, radius=dedup_radius))
    det_df = pd.concat(parts, axis=0).reset_index(drop=True) if parts else det_df.iloc[0:0]
print('Filtered test preds:', det_df.shape, 'dedup applied:' , use_dedup)

# Load classes and model
class_to_idx = json.loads(classes_path.read_text())
idx_to_class = {int(v): k for k, v in class_to_idx.items()}
num_classes = len(idx_to_class)
rec_model = resnet50(weights=None)
rec_model.fc = nn.Linear(rec_model.fc.in_features, num_classes)
state = torch.load(best_ckpt_path, map_location='cpu')
rec_model.load_state_dict(state['model'], strict=False)
rec_model.to(device)
rec_model.eval()

val_tf = T.Compose([T.ToTensor(), T.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])])

def clamp(v, lo, hi):
    return max(lo, min(hi, v))

def crop_from_box(image_id: str, x: float, y: float, w: float, h: float, pad_ratio: float=0.25, img_size: int=192):
    p = test_dir / f'{image_id}.jpg'
    if not p.exists():
        alt = test_dir / f'{image_id}.png'
        if alt.exists():
            p = alt
    img = Image.open(p).convert('RGB')
    W, H = img.size
    dx = int(round(w * pad_ratio)); dy = int(round(h * pad_ratio))
    x1 = clamp(int(x) - dx, 0, W-1); y1 = clamp(int(y) - dy, 0, H-1)
    x2 = clamp(int(x + w) + dx, 1, W); y2 = clamp(int(y + h) + dy, 1, H)
    crop = img.crop((x1, y1, x2, y2))
    cw, ch = crop.size
    if cw != ch:
        m = max(cw, ch)
        pad_w = m - cw; pad_h = m - ch
        crop = ImageOps.expand(crop, border=(0,0,pad_w,pad_h), fill=0) if cw < m or ch < m else crop
    crop = crop.resize((192, 192), Image.BILINEAR)
    return crop

df_sample = pd.read_csv(sample_path)
grp = det_df.groupby('image_id')
rows_out = []
t0 = time.time()
for i, image_id in enumerate(df_sample['image_id'].tolist(), 1):
    if image_id in grp.groups:
        g = grp.get_group(image_id)
        crops = [crop_from_box(image_id, x, y, w, h) for x,y,w,h in zip(g['x'].values, g['y'].values, g['w'].values, g['h'].values)]
        if len(crops) == 0:
            rows_out.append('')
        else:
            xs = torch.stack([val_tf(c) for c in crops]).to(device)
            with torch.no_grad():
                logits = rec_model(xs)
                pred_idx = logits.argmax(1).detach().cpu().numpy().tolist()
            pred_unicodes = [idx_to_class.get(int(k), 'U+003F') for k in pred_idx]
            cx = (g['x'].values + g['w'].values/2.0).round().astype(int).tolist()
            cy = (g['y'].values + g['h'].values/2.0).round().astype(int).tolist()
            toks = []
            for u, x_, y_ in zip(pred_unicodes, cx, cy):
                toks.extend([u, str(int(x_)), str(int(y_))])
            rows_out.append(' '.join(toks))
    else:
        rows_out.append('')
    if i % 50 == 0:
        print(f'Assembled {i}/{len(df_sample)} in {time.time()-t0:.1f}s', flush=True)

sub_df = pd.DataFrame({'image_id': df_sample['image_id'], 'labels': rows_out})
sub_df.to_csv('submission.csv', index=False)
print('Wrote submission.csv with shape', sub_df.shape)
print(sub_df.head(2))