In [8]:
# Diagnostics for CatBoost training progress
import os, json, time
from datetime import datetime
import pandas as pd

def stat(p):
    """Summarise the file at *p* for the diagnostics printout.

    Returns the string ``'MISSING'`` when the path does not exist, otherwise
    ``'size=<bytes> mtime=<local datetime>'``.
    """
    if os.path.exists(p):
        # One stat call supplies both the byte size and the modification time.
        info = os.stat(p)
        modified = datetime.fromtimestamp(info.st_mtime)
        return f'size={info.st_size} mtime={modified}'
    return 'MISSING'

# Print the size / modification-time summary for each file under catboost_info/.
print('=== catboost_info stats ===')
catboost_logs = (
    'catboost_info/learn_error.tsv',
    'catboost_info/test_error.tsv',
    'catboost_info/time_left.tsv',
    'catboost_info/catboost_training.json',
)
for log_path in catboost_logs:
    print(f'{log_path} -> {stat(log_path)}')

def tail_tsv(path, n=5):
    """Return the last *n* lines of the file at *path* as a single string.

    The file is read backwards in 4096-byte blocks, so only the end of the
    file is loaded rather than the whole thing.

    Args:
        path: File to read.
        n: Number of trailing lines wanted. Zero or a negative number yields
            an empty string.

    Returns:
        The trailing lines joined with ``'\\n'``; ``'<path> MISSING'`` when the
        path does not exist; or ``'Error reading <path>: <error>'`` when
        reading fails. This is a best-effort helper and does not raise.
    """
    if not os.path.exists(path):
        return f'{path} MISSING'
    try:
        if n <= 0:
            # ``lines[-0:]`` is the whole list, so "no lines" must be answered
            # explicitly instead of falling through to the slice below.
            return ''
        with open(path, 'rb') as f:
            f.seek(0, os.SEEK_END)
            end = f.tell()
            block = 4096
            data = b''
            # Keep reading until there are at least n + 2 lines: the first
            # line of the buffer may be cut mid-line, and the spare line
            # guarantees the last n are complete.
            while len(data.splitlines()) <= n + 1 and end > 0:
                read = min(block, end)
                f.seek(end - read)
                data = f.read(read) + data
                end -= read
        # errors='ignore' drops a multi-byte character split by a block edge.
        txt = data.decode('utf-8', errors='ignore').splitlines()
        return '\n'.join(txt[-n:])
    except Exception as e:
        # Deliberately broad: diagnostics should report the problem as text
        # rather than abort the surrounding script.
        return f'Error reading {path}: {e}'

# Print a heading followed by the trailing rows of each TSV under catboost_info/.
tsv_reports = (
    ('\n=== Tail: test_error.tsv (validation metric per iter) ===',
     'catboost_info/test_error.tsv', 10),
    ('\n=== Tail: learn_error.tsv (train metric per iter) ===',
     'catboost_info/learn_error.tsv', 5),
    ('\n=== time_left.tsv (CatBoost ETA) ===',
     'catboost_info/time_left.tsv', 5),
)
for heading, tsv_path, row_count in tsv_reports:
    print(heading)
    print(tail_tsv(tsv_path, n=row_count))

print('\n=== catboost_training.json summary ===')
jpath = 'catboost_info/catboost_training.json'


def parse_best_summary(text):
    """Find the first numeric ``best_iteration`` and ``best_value`` in *text*.

    Each number must follow its key directly, separated only by quotes,
    whitespace, ``:`` or ``=``. Unrelated digits elsewhere on the same line
    are therefore never merged into the result, and an occurrence of the key
    with a non-numeric value is skipped.

    Args:
        text: Raw text to scan (for example the decoded tail of a JSON file).

    Returns:
        A ``(best_iteration, best_value)`` tuple of ``int | None`` and
        ``float | None``; an element is ``None`` when its key was not found
        with a numeric value.
    """
    import re  # local import: ``re`` is not among the cell's top-level imports

    separators = r'''["'\s:=]*'''
    # Accepts plain decimals and exponent notation such as 1.5e-05.
    number = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'
    iter_match = re.search(r'best_iteration' + separators + r'([-+]?\d+)', text)
    value_match = re.search(r'best_value' + separators + '(' + number + ')', text)
    found_iter = int(iter_match.group(1)) if iter_match else None
    found_value = float(value_match.group(1)) if value_match else None
    return found_iter, found_value


try:
    # The JSON can be large, so only the final ~1 MB is scanned for the keys.
    with open(jpath, 'rb') as f:
        f.seek(0, os.SEEK_END)
        size = f.tell()
        f.seek(max(0, size - 1_000_000))
        # The seek may land inside a multi-byte character; errors='ignore'
        # drops the partial bytes instead of failing.
        tail = f.read().decode('utf-8', errors='ignore')
except FileNotFoundError:
    print('catboost_training.json MISSING')
except OSError as e:
    print('Error parsing catboost_training.json tail:', e)
else:
    # NOTE(review): whether this file actually carries numeric best_iteration /
    # best_value entries depends on the CatBoost version - confirm against a
    # real file; both values print as None when the keys are absent.
    best_iter, best_value = parse_best_summary(tail)
    print('best_iteration (approx):', best_iter, ' | best_value (Accuracy):', best_value)

# Report whether submission.csv exists and, if so, its size and mtime.
print('\n=== submission.csv status ===')
submission_path = 'submission.csv'
print(f'{submission_path} ->', stat(submission_path))

print('\n=== Done diagnostics ===')

=== catboost_info stats ===
catboost_info/learn_error.tsv -> size=80425 mtime=2025-09-09 03:49:11.833574
catboost_info/test_error.tsv -> size=79367 mtime=2025-09-09 03:49:11.833574
catboost_info/time_left.tsv -> size=57346 mtime=2025-09-09 03:49:11.833574
catboost_info/catboost_training.json -> size=377793 mtime=2025-09-09 03:49:11.833574

=== Tail: test_error.tsv (validation metric per iter) ===
2575	0.9617055556	0.08158583073
2576	0.9617166667	0.08158406781
2577	0.9617277778	0.08158231273
2578	0.961725	0.08158087719
2579	0.9617277778	0.08158097898
2580	0.9617194444	0.08157987848
2581	0.9617083333	0.08157819243
2582	0.9617166667	0.08157563048
2583	0.9617083333	0.08157488098
2584	0.9617083333	0.08157134622

=== Tail: learn_error.tsv (train metric per iter) ===
2580	0.9649697531	0.07479400246
2581	0.9649688272	0.07478990193
2582	0.9649697531	0.07478542022
2583	0.9649700617	0.07478101552
2584	0.9649768519	0.07477431933

=== time_left.tsv (CatBoost ETA) ===
2580	38119559	6188336
2581	3813