In [1]:
# Quick environment and data inspection
import os, glob, pandas as pd, numpy as np

print('CWD:', os.getcwd())

# Report how many PNG files each data folder holds (label -> glob pattern).
for label, pattern in (
    ('Train images:', 'train/*.png'),
    ('Train_cleaned images:', 'train_cleaned/*.png'),
    ('Test images:', 'test/*.png'),
):
    print(label, len(glob.glob(pattern)))

# Peek at the sample submission to learn the expected row count and columns.
sub_path = 'sampleSubmission.csv'
print('\nReading sampleSubmission.csv ...')
sub = pd.read_csv(sub_path)
print('submission shape:', sub.shape)
print('columns:', sub.columns.tolist())
print(sub.head(3))

CWD: /app/agent_run_states/denoising-dirty-documents-20250905-222218
Train images: 115
Train_cleaned images: 115
Test images: 29

Reading sampleSubmission.csv ...


submission shape: (5789880, 2)
columns: ['id', 'value']
        id  value
0  110_1_1      1
1  110_1_2      1
2  110_1_3      1


In [5]:
# Build baseline: 5x5 neighborhood features, image-wise KFold, XGBoost GPU if available
import os, glob, math, random, sys, subprocess
from pathlib import Path
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

def ensure_xgb(upgrade=True):
    """Return the imported ``xgboost`` module, optionally upgrading it first.

    Args:
        upgrade: When true, run ``pip install --upgrade --no-cache-dir xgboost -q``
            with the current interpreter before importing.

    Returns:
        The ``xgboost`` module object.

    Raises:
        subprocess.CalledProcessError: If the pip command exits with a
            non-zero status (``check=True``).
    """
    if upgrade:
        pip_cmd = [
            sys.executable, '-m', 'pip', 'install',
            '--upgrade', '--no-cache-dir', 'xgboost', '-q',
        ]
        subprocess.run(pip_cmd, check=True)
    # Imported only after the optional upgrade has run.
    import xgboost
    return xgboost

def load_gray(path):
    """Load an image file as an 8-bit grayscale array.

    Args:
        path: Location of the image file, passed to ``Image.open``.

    Returns:
        ``np.uint8`` array built from the image after conversion to mode
        ``'L'`` (single-channel grayscale).
    """
    # Close the underlying file deterministically instead of leaving it to
    # Pillow's lazy loading / garbage collection. ``convert`` returns a new
    # image, so the pixel data stays usable after the file is closed.
    with Image.open(path) as im:
        gray = im.convert('L')
    return np.asarray(gray, dtype=np.uint8)

def features_from_img(img, ks=5):
    """Build one row of neighbourhood features per pixel of a 2-D image.

    The image is reflect-padded by ``ks // 2`` pixels on every side. Each
    output row holds the values of the square window centred on one pixel,
    scanned top-to-bottom then left-to-right; rows follow the image in
    row-major order.

    Args:
        img: 2-D array of pixel values (the notebook passes uint8 images).
        ks: Nominal window size. The window actually spans
            ``2 * (ks // 2) + 1`` pixels per side, which equals ``ks`` only
            for odd values.

    Returns:
        float32 array of shape ``(H * W, window * window)`` containing the
        pixel values divided by 255.
    """
    radius = ks // 2
    window = 2 * radius + 1
    padded = np.pad(img, ((radius, radius), (radius, radius)), mode='reflect')
    n_rows, n_cols = img.shape

    # Fill the feature matrix column by column; column order is
    # (vertical offset, horizontal offset) with the vertical offset varying slowest.
    X = np.empty((n_rows * n_cols, window * window), dtype=np.float32)
    col = 0
    for top in range(window):
        for left in range(window):
            X[:, col] = padded[top:top + n_rows, left:left + n_cols].reshape(-1)
            col += 1
    X /= 255.0
    return X

def sample_indices(H, W, max_per_image=60000, seed=42):
    """Choose which flattened pixel positions of an ``H`` x ``W`` image to keep.

    Args:
        H: Image height in pixels.
        W: Image width in pixels.
        max_per_image: Upper bound on the number of positions returned.
        seed: Seed for the NumPy generator used when subsampling.

    Returns:
        All positions ``0 .. H*W-1`` in order (int64) when the image has at
        most ``max_per_image`` pixels; otherwise ``max_per_image`` distinct
        positions drawn without replacement.
    """
    total = H * W
    if total > max_per_image:
        picker = np.random.default_rng(seed)
        return picker.choice(total, size=max_per_image, replace=False)
    return np.arange(total, dtype=np.int64)

# File stems are integers, so order the noisy train/test images numerically.
train_paths = glob.glob('train/*.png')
train_paths.sort(key=lambda fp: int(Path(fp).stem))
test_paths = glob.glob('test/*.png')
test_paths.sort(key=lambda fp: int(Path(fp).stem))
# Cleaned targets are looked up by file stem.
clean_paths = dict((Path(fp).stem, fp) for fp in glob.glob('train_cleaned/*.png'))

# Assemble the training matrix from a per-image pixel sample so that memory
# stays bounded: at most `max_per_image` rows are kept from every image.
ks = 5
max_per_image = 60000
rng_base = 42
X_list, y_list, ids_list = [], [], []
for i, tp in enumerate(train_paths):
    img_id = Path(tp).stem
    cp = clean_paths[img_id]
    noisy = load_gray(tp)
    clean = load_gray(cp)
    H, W = noisy.shape
    # A different seed per image (base + position in the sorted list).
    idx = sample_indices(H, W, max_per_image=max_per_image, seed=rng_base + i)
    X_list.append(features_from_img(noisy, ks=ks)[idx])
    # Targets are the cleaned pixel values scaled by 1/255, taken at the same positions.
    y_list.append((clean.reshape(-1).astype(np.float32) / 255.0)[idx])
    ids_list.extend([img_id] * len(idx))
    # Drop the per-image arrays before loading the next image.
    del noisy, clean

X = np.concatenate(X_list, axis=0)
y = np.concatenate(y_list, axis=0)
img_ids = np.array(ids_list)
del X_list, y_list, ids_list
print('X shape:', X.shape, 'y shape:', y.shape, 'unique images:', len(np.unique(img_ids)))

# Image-wise 5-fold CV: the split is made over image ids, so every sampled
# pixel of an image lands on the same side. Only the first fold is used here
# as a quick check.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
unique_imgs = np.array(sorted(set(img_ids), key=int))
train_img_idx, val_img_idx = next(kf.split(unique_imgs))
train_imgs = set(unique_imgs[train_img_idx])
val_imgs = set(unique_imgs[val_img_idx])

# Row masks: a sampled pixel is a training row when its image id is in the training set.
tr_mask = np.array([name in train_imgs for name in img_ids])
va_mask = ~tr_mask
X_tr, X_va = X[tr_mask], X[va_mask]
y_tr, y_va = y[tr_mask], y[va_mask]
print('Train/Val sizes:', X_tr.shape, X_va.shape)

xgb = ensure_xgb(upgrade=True)
# XGBoost 2.0 replaced `tree_method='gpu_hist'` / `predictor='gpu_predictor'`
# with `device='cuda'` plus `tree_method='hist'`. Because the line above
# upgrades to the latest release, choose the spelling by installed version.
xgb_major = int(xgb.__version__.split('.')[0])
if xgb_major >= 2:
    gpu_params = dict(tree_method='hist', device='cuda')
else:
    gpu_params = dict(tree_method='gpu_hist', predictor='gpu_predictor')
params = dict(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.06,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=8.0,
    min_child_weight=1.0,
    random_state=42,
    n_jobs=8,
    verbosity=0,
    **gpu_params,
)
model = xgb.XGBRegressor(**params)
# Train without eval_set to avoid API incompatibilities; evaluate manually on holdout
model.fit(X_tr, y_tr)
pred_va = model.predict(X_va)
# `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
# and removed in 1.6; taking the square root of the MSE works on every version.
rmse = mean_squared_error(y_va, pred_va) ** 0.5
print('Validation RMSE (1 fold):', rmse)

X shape: (6900000, 25) y shape: (6900000,) unique images: 115


Train/Val sizes: (5520000, 25) (1380000, 25)


Validation RMSE (1 fold): 0.020459445




In [6]:
# Train on all sampled data and generate submission.csv
import csv, time

xgb = ensure_xgb(upgrade=False)
# XGBoost 2.0 replaced `tree_method='gpu_hist'` / `predictor='gpu_predictor'`
# with `device='cuda'` plus `tree_method='hist'`; pick the spelling that
# matches the installed release.
xgb_major = int(xgb.__version__.split('.')[0])
if xgb_major >= 2:
    gpu_params = dict(tree_method='hist', device='cuda')
else:
    gpu_params = dict(tree_method='gpu_hist', predictor='gpu_predictor')
# Same settings as the validation fold except for more trees (500 instead of 300).
full_params = dict(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.06,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=8.0,
    min_child_weight=1.0,
    random_state=42,
    n_jobs=8,
    verbosity=0,
    **gpu_params,
)
model_full = xgb.XGBRegressor(**full_params)
t0 = time.time()
# Fit on every sampled row (train and validation images together).
model_full.fit(X, y)
print(f'Trained full model in {time.time()-t0:.1f}s on {X.shape[0]} samples')

out_path = 'submission.csv'
total_rows = 0
with open(out_path, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'value'])
    for tp in test_paths:
        img_id = Path(tp).stem
        img = load_gray(tp)
        H, W = img.shape
        X_test = features_from_img(img, ks=ks)
        # Predict every pixel and keep the values inside [0, 1].
        preds = np.clip(model_full.predict(X_test).astype(np.float32), 0.0, 1.0)
        # Ids follow row-major order with 1-based row and column numbers,
        # matching the order of the flattened predictions.
        pixel_ids = (
            f'{img_id}_{r}_{c}'
            for r in range(1, H + 1)
            for c in range(1, W + 1)
        )
        # tolist() yields plain Python floats, i.e. the same values float(v) would give.
        writer.writerows(zip(pixel_ids, preds.tolist()))
        total_rows += H * W
        print(f'Wrote {img_id} with {H*W} pixels; cumulative {total_rows}')
print('Saved submission to', out_path)

Trained full model in 11.0s on 6900000 samples


Wrote 6 with 139320 pixels; cumulative 139320


Wrote 8 with 139320 pixels; cumulative 278640


Wrote 26 with 139320 pixels; cumulative 417960


Wrote 35 with 139320 pixels; cumulative 557280


Wrote 36 with 139320 pixels; cumulative 696600


Wrote 42 with 139320 pixels; cumulative 835920


Wrote 54 with 139320 pixels; cumulative 975240


Wrote 62 with 139320 pixels; cumulative 1114560


Wrote 68 with 139320 pixels; cumulative 1253880


Wrote 77 with 226800 pixels; cumulative 1480680


Wrote 78 with 226800 pixels; cumulative 1707480


Wrote 80 with 226800 pixels; cumulative 1934280


Wrote 95 with 226800 pixels; cumulative 2161080


Wrote 110 with 226800 pixels; cumulative 2387880


Wrote 111 with 226800 pixels; cumulative 2614680


Wrote 122 with 226800 pixels; cumulative 2841480


Wrote 131 with 226800 pixels; cumulative 3068280


Wrote 134 with 226800 pixels; cumulative 3295080


Wrote 137 with 226800 pixels; cumulative 3521880


Wrote 146 with 226800 pixels; cumulative 3748680


Wrote 150 with 226800 pixels; cumulative 3975480


Wrote 155 with 226800 pixels; cumulative 4202280


Wrote 159 with 226800 pixels; cumulative 4429080


Wrote 162 with 226800 pixels; cumulative 4655880


Wrote 170 with 226800 pixels; cumulative 4882680


Wrote 174 with 226800 pixels; cumulative 5109480


Wrote 180 with 226800 pixels; cumulative 5336280


Wrote 186 with 226800 pixels; cumulative 5563080


Wrote 216 with 226800 pixels; cumulative 5789880
Saved submission to submission.csv
