In [1]:
import subprocess, sys, time
print('Checking GPU with nvidia-smi ...', flush=True)
try:
    out = subprocess.run(['bash','-lc','nvidia-smi || true'], capture_output=True, text=True, timeout=30)
    print(out.stdout)
    if 'NVIDIA-SMI' not in out.stdout:
        print('WARNING: GPU not available. Proceeding CPU-only will be slow. Consider exiting.', flush=True)
    else:
        print('GPU detected. Proceed.', flush=True)
except Exception as e:
    print('nvidia-smi check failed:', e, file=sys.stderr)

Checking GPU with nvidia-smi ...


Tue Sep 23 04:57:09 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.144.06             Driver Version: 550.144.06     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A10-24Q                 On  |   00000002:00:00.0 Off |                    0 |
| N/A   N/A    P0             N/A /  N/A  |     182MiB /  24512MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# Plan: Facebook Recruiting III - Keyword Extraction

Objectives:
- Build a strong, fast baseline and iterate to medal.

Validation:
- Use multilabel iterative stratification (preserving tag frequencies) with fixed random_state.
- Cache folds and reuse.
- Optimize global decision threshold and/or top-K per sample to maximize micro-F1 on OOF.

Data Pipeline:
- Load train/test; columns likely: Id, Title, Body, Tags.
- Target: Tags is space-separated labels.
- Text: concat Title + Body (with separator), minimal cleaning (lower, strip HTML, code blocks handling).
- Keep punctuation that helps n-grams; remove very long code blocks if present.

Vectorization:
- TF-IDF word n-grams (1-2) + char n-grams (3-5).
- Limit features via max_features and min_df; or use HashingVectorizer to save memory and fit speed.
- Build combined sparse matrix; cache to disk (.npz).

Modeling:
- OneVsRestClassifier with LogisticRegression(saga, l1_ratio elasticnet or l2) or LinearSVC.
- Start with LogisticRegression(C~4, penalty='l2', class_weight=None).
- Train per fold; predict OOF probabilities; threshold search on OOF.
- Consider calibration if needed; likely unnecessary.

Thresholding / Decoding:
- Global threshold tuned on OOF to maximize micro-F1.
- Additionally enforce per-sample top-K (K ~ average tag count) fallback if none pass threshold.
- Clip to known tags only.

Efficiency:
- Subsample smoke-run first (50k rows, 2 folds) to validate pipeline and timing.
- Log elapsed time per step/fold.
- Cache vectorizers and matrices; slice per fold rather than refitting.

Iterations:
1) EDA: count tags, avg tags/sample, text lengths.
2) Baseline: HashingVectorizer + OneVsRest LogisticRegression; 2-fold; quick OOF.
3) Full 5-fold iterative stratification; tune threshold.
4) Improve features: add char n-grams, tune min_df/max_features, try sublinear_tf.
5) Try LinearSVC and/or ComplementNB blend; simple weighted average of decision scores.
6) Error analysis on OOF: rare tags, long texts; adjust cleaning.

Submission:
- Predict tags for test with tuned threshold and top-K fallback.
- Format: Id, Tags (space-separated). Save to submission.csv.

Checkpoints to request expert review:
- After plan (this), after EDA, after baseline OOF, after improved features, before long runs/ensembles.

In [22]:
import pandas as pd, numpy as np, re, html, os, time
t0 = time.time()
train_path, test_path = 'train.csv', 'test.csv'
print('Loading data...', flush=True)
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
print(f'train shape: {train.shape}; test shape: {test.shape}', flush=True)
print('Train columns:', list(train.columns))
print('Test columns:', list(test.columns))

# Basic sanity preview
print('\nTrain head:')
print(train.head(3).to_dict(orient='records'))

# EDA on Tags
assert 'Tags' in train.columns, 'Expected Tags column in train'
def split_tags(s):
    if pd.isna(s): return []
    return str(s).strip().split()
tag_lists = train['Tags'].apply(split_tags)
tag_counts = tag_lists.apply(len)
print(f'Avg tags per sample: {tag_counts.mean():.3f} | median: {float(tag_counts.median()):.1f} | min/max: {int(tag_counts.min())}/{int(tag_counts.max())}')
from collections import Counter
tag_counter = Counter([t for tags in tag_lists for t in tags])
print(f'Unique tags: {len(tag_counter)}')
top20 = tag_counter.most_common(20)
print('Top 20 tags:', top20)

# Text length stats
title_col = 'Title' if 'Title' in train.columns else None
body_col = 'Body' if 'Body' in train.columns else None
if title_col:
    train['_title_len'] = train[title_col].fillna('').astype(str).str.len()
    print('Title length quantiles:', train['_title_len'].quantile([0.5,0.9,0.99,0.999]).to_dict())
if body_col:
    train['_body_len'] = train[body_col].fillna('').astype(str).str.len()
    print('Body length quantiles:', train['_body_len'].quantile([0.5,0.9,0.99,0.999]).to_dict())

print(f'EDA done in {time.time()-t0:.1f}s', flush=True)

Loading data...


train shape: (5430775, 4); test shape: (603420, 3)


Train columns: ['Id', 'Title', 'Body', 'Tags']
Test columns: ['Id', 'Title', 'Body']

Train head:
[{'Id': 818237, 'Title': 'mysql + issues with JOIN query', 'Body': '<p>Ok folks, im sorry to ask about this since ive seen a couple of mysql JOIN examples but i seem to be unable to get it to work.</p>\n\n<p>"sales"</p>\n\n<pre><code>----------------------\nidcustomer | datecode \n----------------------\n 1         | 20120503 \n 1         | 20120503 \n 1         | 20120503 \n 2         | 20120503 \n 3         | 20120503 \n</code></pre>\n\n<p>I want to know who is the top buyer.... in terms of HOW MANY TIMES a customer buys something from me on a especific day (yes i use some weird format for date i know, please nevermind that)...so i do:</p>\n\n<pre><code>SELECT idcustomer, COUNT(idcustomer) FROM sales WHERE datecode = 20120503 GROUP BY idcustomer ORDER BY COUNT(idcustomer) DESC\n</code></pre>\n\n<p>AND I GET:</p>\n\n<pre><code>-----------------------------\nidcustomer | Count(idcustomer)\

Avg tags per sample: 2.885 | median: 3.0 | min/max: 0/5


Unique tags: 41781
Top 20 tags: [('c#', 417225), ('java', 370832), ('php', 353372), ('javascript', 329042), ('android', 288591), ('jquery', 274992), ('c++', 179723), ('python', 166308), ('iphone', 165010), ('asp.net', 159570), ('mysql', 155061), ('html', 149108), ('.net', 145954), ('ios', 122444), ('objective-c', 120390), ('sql', 119212), ('css', 116249), ('linux', 114992), ('ruby-on-rails', 105117), ('windows', 88296)]


Title length quantiles: {0.5: 48.0, 0.9: 78.0, 0.99: 111.0, 0.999: 140.0}


Body length quantiles: {0.5: 713.0, 0.9: 2188.0, 0.99: 6764.0, 0.999: 18065.678000002168}
EDA done in 75.1s


In [3]:
import re, time, pandas as pd, numpy as np, html
from sklearn.preprocessing import MultiLabelBinarizer
t0 = time.time()

def split_tags(s):
    if pd.isna(s): return []
    return str(s).strip().split()

# Fast, vectorized StackExchange cleaning respecting symbols like + # .
def build_clean_text(df: pd.DataFrame, title_col='Title', body_col='Body', title_repeat=2, body_maxlen=2000):
    title = df[title_col].fillna('').astype(str)
    body = df[body_col].fillna('').astype(str)
    # Remove code blocks
    body = body.str.replace(r'<pre><code>[\s\S]*?</code></pre>', ' CODE ', regex=True)
    # Strip HTML tags
    body = body.str.replace(r'<[^>]+>', ' ', regex=True)
    # URLs and emails
    body = body.str.replace(r'https?://\S+', ' URL ', regex=True)
    body = body.str.replace(r'\b[\w\.-]+@[\w\.-]+\.[A-Za-z]{2,}\b', ' EMAIL ', regex=True)
    # Lowercase
    body = body.str.lower()
    title = title.str.lower()
    # Truncate long bodies
    if body_maxlen is not None:
        body = body.str.slice(0, int(body_maxlen))
    # Upweight title by repetition
    if title_repeat and title_repeat > 1:
        title_rep = (title + ' ') * int(title_repeat)
    else:
        title_rep = title
    text = (title_rep + ' ' + body).str.replace('\s+', ' ', regex=True).str.strip()
    return text

print('Building cleaned text for train/test...', flush=True)
train_text = build_clean_text(train, 'Title', 'Body', title_repeat=2, body_maxlen=2000)
test_text  = build_clean_text(test,  'Title', 'Body', title_repeat=2, body_maxlen=2000)
print(f'train_text samples: {len(train_text)}, test_text samples: {len(test_text)}')
print('Example cleaned text:', train_text.iloc[0][:200], '...')

# Prepare multilabel targets
y_lists = train['Tags'].apply(split_tags)
mlb = MultiLabelBinarizer(sparse_output=True)
Y = mlb.fit_transform(y_lists)
print(f'Labels shape: {Y.shape} (samples, classes); nnz labels: {int(Y.nnz)}; classes: {len(mlb.classes_)}')
print('Top classes:', list(mlb.classes_[:10]))

print(f'Preprocessing done in {time.time()-t0:.1f}s', flush=True)

Building cleaned text for train/test...


train_text samples: 5430775, test_text samples: 603420
Example cleaned text: mysql + issues with join query mysql + issues with join query ok folks, im sorry to ask about this since ive seen a couple of mysql join examples but i seem to be unable to get it to work. "sales" cod ...


Labels shape: (5430775, 41781) (samples, classes); nnz labels: 15667486; classes: 41781
Top classes: ['.a', '.app', '.asp.net-mvc', '.aspxauth', '.bash-profile', '.class-file', '.cs-file', '.doc', '.drv', '.ds-store']
Preprocessing done in 248.5s


In [7]:
import sys, subprocess, pkgutil
print('Checking/installing iterative-stratification...', flush=True)
def pip_install(*pkgs):
    args = [sys.executable, '-m', 'pip', 'install', *pkgs]
    print('>',' '.join(args), flush=True)
    subprocess.run(args, check=True)

if pkgutil.find_loader('iterstrat') is None:
    pip_install('iterative-stratification')
else:
    print('iterstrat already installed')

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
print('OK: MultilabelStratifiedKFold available')

Checking/installing iterative-stratification...


> /usr/bin/python3.11 -m pip install iterative-stratification


Collecting iterative-stratification
  Downloading iterative_stratification-0.1.9-py3-none-any.whl (8.5 kB)


Collecting numpy
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.3/18.3 MB 292.1 MB/s eta 0:00:00


Collecting scipy
  Downloading scipy-1.16.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (35.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 35.9/35.9 MB 433.3 MB/s eta 0:00:00


Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (9.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 9.7/9.7 MB 95.8 MB/s eta 0:00:00
Collecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)


Collecting joblib>=1.2.0
  Downloading joblib-1.5.2-py3-none-any.whl (308 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 308.4/308.4 KB 477.4 MB/s eta 0:00:00


Installing collected packages: threadpoolctl, numpy, joblib, scipy, scikit-learn, iterative-stratification


Successfully installed iterative-stratification-0.1.9 joblib-1.5.2 numpy-1.26.4 scikit-learn-1.7.2 scipy-1.16.2 threadpoolctl-3.6.0


OK: MultilabelStratifiedKFold available


In [6]:
import time, numpy as np, scipy.sparse as sp
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold

t0 = time.time()
print('Smoke-run: Hashing + SGD OneVsRest on subset with KFold 2-folds', flush=True)

# Select top-K frequent classes to keep training fast
cls_freq = np.asarray(Y.sum(axis=0)).ravel()
top_k = 1000
cls_idx = np.argsort(-cls_freq)[:top_k]
Y_top = Y[:, cls_idx]
classes_top = [mlb.classes_[i] for i in cls_idx]
print(f'Using top {top_k} classes; coverage of labels (nnz kept/total): {int(Y_top.nnz)}/{int(Y.nnz)} (~{Y_top.nnz / max(1, Y.nnz):.2%})', flush=True)

# Subsample rows for speed
n_total = train_text.shape[0]
n_sub = min(50_000, n_total)
rng = np.random.RandomState(42)
sub_idx = np.sort(rng.choice(n_total, size=n_sub, replace=False))
train_text_sub = train_text.iloc[sub_idx].values
Y_sub = Y_top[sub_idx]
print(f'Subset rows: {n_sub}', flush=True)

# Define vectorizers
word_vec = HashingVectorizer(n_features=2**20, alternate_sign=False, norm=None,
                             ngram_range=(1,2), dtype=np.float32,
                             token_pattern=r'(?u)\b[\w\+#\.]{2,}\b')
char_vec = HashingVectorizer(n_features=2**18, alternate_sign=False, norm=None,
                             analyzer='char', ngram_range=(3,5), dtype=np.float32)

def vectorize(texts):
    Xw = word_vec.transform(texts)
    Xc = char_vec.transform(texts)
    return sp.hstack([Xw, Xc], format='csr', dtype=np.float32)

# CV setup
kf = KFold(n_splits=2, shuffle=True, random_state=42)
oof_scores = None
oof_true_rows = []

fold = 0
for tr_idx, va_idx in kf.split(np.arange(n_sub)):
    fold += 1
    t_fold = time.time()
    print(f'Fold {fold}: tr {len(tr_idx)} / va {len(va_idx)}', flush=True)
    X_tr = vectorize(train_text_sub[tr_idx])
    X_va = vectorize(train_text_sub[va_idx])
    # TF-IDF (fit on train fold only to avoid leakage)
    tfidf = TfidfTransformer(sublinear_tf=True, use_idf=True, norm='l2')
    X_tr = tfidf.fit_transform(X_tr)
    X_va = tfidf.transform(X_va)
    print(f'Fold {fold}: vectorized shapes tr {X_tr.shape} va {X_va.shape}', flush=True)

    clf = OneVsRestClassifier(SGDClassifier(loss='log_loss', penalty='l2', alpha=2e-6,
                                            max_iter=5, tol=1e-3, n_jobs=-1, random_state=42),
                              n_jobs=-1)
    clf.fit(X_tr, Y_sub[tr_idx])
    P = clf.predict_proba(X_va)  # ndarray (n_va, top_k)
    if oof_scores is None:
        oof_scores = np.zeros((n_sub, P.shape[1]), dtype=np.float32)
    oof_scores[va_idx] = P.astype(np.float32)
    oof_true_rows.append(va_idx)
    print(f'Fold {fold} done in {time.time()-t_fold:.1f}s', flush=True)

oof_mask = np.zeros(n_sub, dtype=bool)
for idx in oof_true_rows: oof_mask[idx] = True
assert oof_mask.all(), 'OOF coverage incomplete'

# Tune global threshold on OOF
thr_grid = np.linspace(0.2, 0.6, 9)
best_thr, best_f1 = None, -1.0
Y_true = Y_sub
for thr in thr_grid:
    preds = (oof_scores >= thr).astype(np.uint8)
    f1 = f1_score(Y_true, preds, average='micro', zero_division=0)
    print(f'Threshold {thr:.3f} -> micro-F1 {f1:.5f}')
    if f1 > best_f1:
        best_f1, best_thr = f1, thr
print(f'Best OOF micro-F1: {best_f1:.5f} at thr={best_thr:.3f}', flush=True)
print(f'Smoke-run total time: {time.time()-t0:.1f}s', flush=True)

Smoke-run: Hashing + SGD OneVsRest on subset with KFold 2-folds


Using top 1000 classes; coverage of labels (nnz kept/total): 11208354/15667486 (~71.54%)


Subset rows: 50000


Fold 1: tr 25000 / va 25000


Fold 1: vectorized shapes tr (25000, 1310720) va (25000, 1310720)










































































































































































































































































































Fold 1 done in 113.1s


Fold 2: tr 25000 / va 25000


Fold 2: vectorized shapes tr (25000, 1310720) va (25000, 1310720)










































































































































































































































































































Fold 2 done in 106.2s


Threshold 0.200 -> micro-F1 0.45250


Threshold 0.250 -> micro-F1 0.43273


Threshold 0.300 -> micro-F1 0.41420


Threshold 0.350 -> micro-F1 0.39646


Threshold 0.400 -> micro-F1 0.37906


Threshold 0.450 -> micro-F1 0.36283


Threshold 0.500 -> micro-F1 0.34566


Threshold 0.550 -> micro-F1 0.32943


Threshold 0.600 -> micro-F1 0.31279
Best OOF micro-F1: 0.45250 at thr=0.200


Smoke-run total time: 235.7s


In [8]:
import time, numpy as np, scipy.sparse as sp, os
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import save_npz

t0 = time.time()
print('Building TF-IDF (word+char) on train+test and caching to .npz ...', flush=True)

# Rebuild cleaned text with stronger settings per expert advice
print('Cleaning text with title_repeat=3, body_maxlen=1800 ...', flush=True)
train_text_v2 = build_clean_text(train, 'Title', 'Body', title_repeat=3, body_maxlen=1800)
test_text_v2  = build_clean_text(test,  'Title', 'Body', title_repeat=3, body_maxlen=1800)
print(f'Cleaned texts ready: train {len(train_text_v2)}, test {len(test_text_v2)}', flush=True)

# Fit TF-IDF on concatenated corpus (standard practice here) 
corpus = np.concatenate([train_text_v2.values, test_text_v2.values])

word_vec = TfidfVectorizer(ngram_range=(1,2), min_df=3, max_df=0.95, max_features=400_000,
                           sublinear_tf=True, token_pattern=r'(?u)\b[\w\+#\.]{2,}\b',
                           dtype=np.float32, lowercase=False, norm='l2')
char_vec = TfidfVectorizer(analyzer='char', ngram_range=(3,5), min_df=3, max_features=300_000,
                           sublinear_tf=True, dtype=np.float32, lowercase=False, norm='l2')

t1 = time.time()
print('Fitting word TF-IDF ...', flush=True)
word_vec.fit(corpus)
print(f'Word vocab size: {len(word_vec.vocabulary_)} in {time.time()-t1:.1f}s', flush=True)
t2 = time.time()
print('Fitting char TF-IDF ...', flush=True)
char_vec.fit(corpus)
print(f'Char vocab size: {len(char_vec.vocabulary_)} in {time.time()-t2:.1f}s', flush=True)

# Transform train/test separately, cast to float32, hstack, and cache
def to_float32_csr(X):
    if X.dtype != np.float32:
        X = X.astype(np.float32)
    return X.tocsr()

print('Transforming train ...', flush=True)
Xw_tr = to_float32_csr(word_vec.transform(train_text_v2.values))
Xc_tr = to_float32_csr(char_vec.transform(train_text_v2.values))
X_tr = sp.hstack([Xw_tr, Xc_tr], format='csr', dtype=np.float32)
print(f'Train shapes: word {Xw_tr.shape}, char {Xc_tr.shape}, combined {X_tr.shape}; nnz={X_tr.nnz:,}', flush=True)
del Xw_tr, Xc_tr

print('Transforming test ...', flush=True)
Xw_te = to_float32_csr(word_vec.transform(test_text_v2.values))
Xc_te = to_float32_csr(char_vec.transform(test_text_v2.values))
X_te = sp.hstack([Xw_te, Xc_te], format='csr', dtype=np.float32)
print(f'Test shapes: word {Xw_te.shape}, char {Xc_te.shape}, combined {X_te.shape}; nnz={X_te.nnz:,}', flush=True)
del Xw_te, Xc_te

os.makedirs('cache', exist_ok=True)
save_npz('cache/X_train_tfidf.npz', X_tr)
save_npz('cache/X_test_tfidf.npz', X_te)
np.save('cache/mlb_classes.npy', mlb.classes_)
word_vocab = np.array(sorted(word_vec.vocabulary_.items(), key=lambda x: x[1]), dtype=object)
char_vocab = np.array(sorted(char_vec.vocabulary_.items(), key=lambda x: x[1]), dtype=object)
np.save('cache/word_vocab.npy', word_vocab, allow_pickle=True)
np.save('cache/char_vocab.npy', char_vocab, allow_pickle=True)
print(f'Cached TF-IDF matrices and vocabs. Total time: {time.time()-t0:.1f}s', flush=True)

Building TF-IDF (word+char) on train+test and caching to .npz ...


Cleaning text with title_repeat=3, body_maxlen=1800 ...


Cleaned texts ready: train 5430775, test 603420


Fitting word TF-IDF ...


Word vocab size: 400000 in 726.8s


Fitting char TF-IDF ...


Char vocab size: 300000 in 4703.5s


Transforming train ...


Train shapes: word (5430775, 400000), char (5430775, 300000), combined (5430775, 700000); nnz=6,948,479,006


Transforming test ...


Test shapes: word (603420, 400000), char (603420, 300000), combined (603420, 700000); nnz=771,818,970


Cached TF-IDF matrices and vocabs. Total time: 14170.4s


In [20]:
import os, time, numpy as np, scipy.sparse as sp
from scipy.sparse import load_npz, csr_matrix
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from joblib import parallel_backend

# Training config (expert-approved fast baseline, adjusted for 2-fold finish)
n_splits = 2
C_svc = 0.8
max_iter_svc = 400  # tighter for speed; raise only if many classes hit cap
random_state = 42
ovr_jobs = 12  # safer headroom; avoid RAM spikes
test_batch_classes = 256  # class batch size for memory-safe prediction
val_batch_classes = 256   # class batch size for OOF prediction
top_classes = 20000       # keep 20k classes per expert advice
split_labels_cap = 100    # reduced dense label cap for split generation speed

# Thread caps to avoid oversubscription
os.environ['OMP_NUM_THREADS'] = '1'
os.environ['MKL_NUM_THREADS'] = '1'
os.environ['OPENBLAS_NUM_THREADS'] = '1'

# Ensure joblib uses a repo-local temp folder (writable) instead of read-only /mnt
os.makedirs('cache/joblib', exist_ok=True)
os.environ['JOBLIB_TEMP_FOLDER'] = os.path.abspath('cache/joblib')

def ensure_int32_indices_csr(X: csr_matrix) -> csr_matrix:
    X = X.tocsr(copy=True)
    # Ensure sorted indices for safe in-place changes
    X.sort_indices()
    if X.indices.dtype != np.int32:
        X.indices = X.indices.astype(np.int32, copy=False)
    if X.indptr.dtype != np.int32:
        X.indptr = X.indptr.astype(np.int32, copy=False)
    return X

print('Loading cached TF-IDF matrices ...', flush=True)
X_train_path = 'cache/X_train_tfidf.npz'
X_test_path  = 'cache/X_test_tfidf.npz'
assert os.path.exists(X_train_path) and os.path.exists(X_test_path), 'Cached TF-IDF not found yet.'
X_full = load_npz(X_train_path).tocsr()
Xt = load_npz(X_test_path).tocsr()
print(f'Loaded X_full: {X_full.shape}, nnz={X_full.nnz:,} | Xt: {Xt.shape}, nnz={Xt.nnz:,}', flush=True)
# Coerce CSR index dtypes to int32 for scikit-learn compatibility
X_full = ensure_int32_indices_csr(X_full)
Xt = ensure_int32_indices_csr(Xt)
print(f'Index dtypes -> X_full.indices:{X_full.indices.dtype}, indptr:{X_full.indptr.dtype}; Xt.indices:{Xt.indices.dtype}, indptr:{Xt.indptr.dtype}', flush=True)

# Use labels from earlier (sparse CSR)
Y_all = Y
n_samples_full, n_classes_full = Y_all.shape
print(f'Full labels: {n_classes_full} classes over {n_samples_full} samples; nnz={Y_all.nnz:,}', flush=True)

# Class cap (by frequency) for feasibility
cls_freq = np.asarray(Y_all.sum(axis=0)).ravel()
cls_order = np.argsort(-cls_freq)
keep_cls = cls_order[:min(top_classes, n_classes_full)]
Y_use = Y_all[:, keep_cls]
n_classes = Y_use.shape[1]
X = X_full  # use all rows per expert advice
n_samples = X.shape[0]
n_test = Xt.shape[0]
print(f'Working set: X {X.shape}, Xt {Xt.shape}, classes {n_classes}', flush=True)

# Prepare CV with iterative strat on a dense subset of top labels for split generation
mskf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
split_cap = min(split_labels_cap, n_classes)
freq_use = np.asarray(Y_use.sum(axis=0)).ravel()
split_cols = np.argsort(-freq_use)[:split_cap]
print(f'Building dense label matrix for splits with top {split_cap} labels ...', flush=True)
Y_split = (Y_use[:, split_cols] != 0).astype(np.int8).toarray()  # dense int8 for splitter

# Prepare memmaps to avoid huge RAM usage
os.makedirs('cache', exist_ok=True)
oof_path = 'cache/oof_svc.dat'
if os.path.exists(oof_path):
    os.remove(oof_path)
oof_mem = np.memmap(oof_path, dtype='float32', mode='w+', shape=(n_samples, n_classes))
oof_mem[:] = 0.0

test_avg_path = 'cache/test_svc_avg.dat'
if os.path.exists(test_avg_path):
    os.remove(test_avg_path)
test_avg = np.memmap(test_avg_path, dtype='float32', mode='w+', shape=(n_test, n_classes))
test_avg[:] = 0.0

t0 = time.time()
fold = 0
for tr_idx, va_idx in mskf.split(np.zeros(n_samples), Y_split):
    fold += 1
    t_fold = time.time()
    print(f'Fold {fold}/{n_splits}: tr {len(tr_idx)} va {len(va_idx)}', flush=True)
    X_tr, X_va = X[tr_idx], X[va_idx]
    Y_tr = Y_use[tr_idx]

    clf_svc = OneVsRestClassifier(
        LinearSVC(C=C_svc, loss='squared_hinge', dual=False, tol=3e-3, max_iter=max_iter_svc, random_state=random_state, verbose=1),
        n_jobs=ovr_jobs
    )
    print(f'Training LinearSVC OvR with C={C_svc}, max_iter={max_iter_svc}, tol=3e-3, dual=False, n_jobs={ovr_jobs} ...', flush=True)
    fit_t0 = time.time()
    # Use threading backend to avoid pickling/memmap to /tmp when dispatching jobs
    with parallel_backend('threading', n_jobs=ovr_jobs):
        clf_svc.fit(X_tr, Y_tr)
    print(f'Fold {fold}: fit done in {time.time()-fit_t0:.1f}s', flush=True)

    # OOF decision_function in class batches to avoid large allocations
    print('Scoring validation (decision_function) in class batches ...', flush=True)
    n_val = X_va.shape[0]
    n_batches_val = (n_classes + val_batch_classes - 1) // val_batch_classes
    cls_start = 0
    for b in range(n_batches_val):
        cls_end = min(cls_start + val_batch_classes, n_classes)
        width = cls_end - cls_start
        buf = np.empty((n_val, width), dtype=np.float32)
        for j, c in enumerate(range(cls_start, cls_end)):
            dec = clf_svc.estimators_[c].decision_function(X_va).astype(np.float32)
            if dec.ndim == 1:
                buf[:, j] = dec
            else:
                buf[:, j] = dec.ravel()
        oof_mem[va_idx, cls_start:cls_end] = buf
        cls_start = cls_end
        if (b + 1) % 5 == 0 or cls_end == n_classes:
            print(f'  Fold {fold}: OOF batch {b+1}/{n_batches_val} filled', flush=True)
            oof_mem.flush()

    # Test scoring in class batches (do not divide by n_splits so a single-fold run yields usable scores)
    print('Scoring test (decision_function) in class batches ...', flush=True)
    n_batches_test = (n_classes + test_batch_classes - 1) // test_batch_classes
    cls_start = 0
    for b in range(n_batches_test):
        cls_end = min(cls_start + test_batch_classes, n_classes)
        width = cls_end - cls_start
        buf = np.empty((n_test, width), dtype=np.float32)
        for j, c in enumerate(range(cls_start, cls_end)):
            dec = clf_svc.estimators_[c].decision_function(Xt).astype(np.float32)
            if dec.ndim == 1:
                buf[:, j] = dec
            else:
                buf[:, j] = dec.ravel()
        test_avg[:, cls_start:cls_end] += buf  # no averaging here; single-fold usable
        cls_start = cls_end
        if (b + 1) % 5 == 0 or cls_end == n_classes:
            print(f'  Fold {fold}: test batch {b+1}/{n_batches_test} accumulated', flush=True)
            test_avg.flush()

    print(f'Fold {fold} done in {time.time()-t_fold:.1f}s', flush=True)
    # With time constraints, stop after first fold to ensure completion
    print('Stopping after first fold to meet time constraints.', flush=True)
    break

print(f'Run done in {time.time()-t0:.1f}s', flush=True)
print('Saved OOF memmap at cache/oof_svc.dat and test memmap at cache/test_svc_avg.dat')

# Save kept class indices to map back to labels for submission
np.save('cache/kept_class_indices.npy', keep_cls)
print('Saved kept_class_indices.npy')

print('Next: per-class threshold tuning on OOF memmap and submission decoding.', flush=True)

Loading cached TF-IDF matrices ...


Loaded X_full: (5430775, 700000), nnz=6,948,479,006 | Xt: (603420, 700000), nnz=771,818,970


Index dtypes -> X_full.indices:int32, indptr:int32; Xt.indices:int32, indptr:int32


NameError: name 'Y' is not defined

In [27]:
# Per-class thresholds (percentile on positive OOF margins using only validation rows) and streaming submission writer
import os, csv, time, numpy as np
from pathlib import Path
import pandas as pd

t0 = time.time()
cache_dir = Path('cache')
oof_path = cache_dir / 'oof_svc.dat'
test_path = cache_dir / 'test_svc_avg.dat'
kept_idx_path = cache_dir / 'kept_class_indices.npy'
mlb_classes_path = cache_dir / 'mlb_classes.npy'

assert oof_path.exists() and test_path.exists() and mlb_classes_path.exists(), 'Required cache artifacts missing. Run training first.'

# Infer shapes from file sizes to avoid hard-coding
itemsize = np.dtype('float32').itemsize
n_train = train.shape[0]
n_test = test.shape[0]
oof_bytes = os.path.getsize(oof_path)
test_bytes = os.path.getsize(test_path)
assert oof_bytes % (n_train * itemsize) == 0, 'OOF file size mismatch with n_train'
assert test_bytes % (n_test * itemsize) == 0, 'Test file size mismatch with n_test'
n_classes_from_oof = oof_bytes // (n_train * itemsize)
n_classes_from_test = test_bytes // (n_test * itemsize)
assert n_classes_from_oof == n_classes_from_test, 'Class dimension mismatch between OOF and Test memmaps'
n_classes = int(n_classes_from_oof)
print(f'Inferred n_classes from memmaps: {n_classes}', flush=True)

# Load class mapping
all_classes = np.load(mlb_classes_path, allow_pickle=True)
if kept_idx_path.exists():
    keep_cls = np.load(kept_idx_path)
    if keep_cls.shape[0] != n_classes:
        # Fall back to recomputing by frequency only if mismatch
        print(f'kept_class_indices size {keep_cls.shape[0]} != {n_classes}; recompute by frequency ...', flush=True)
        cls_to_idx = {c:i for i,c in enumerate(all_classes)}
        freq = np.zeros(len(all_classes), dtype=np.int64)
        batch = 200000
        for start in range(0, n_train, batch):
            end = min(start + batch, n_train)
            tags_batch = train['Tags'].iloc[start:end].astype(str).tolist()
            for tstr in tags_batch:
                if not tstr: continue
                for t in tstr.split():
                    idx = cls_to_idx.get(t)
                    if idx is not None: freq[idx] += 1
        keep_cls = np.argsort(-freq)[:n_classes]
        np.save(kept_idx_path, keep_cls)
else:
    # Should exist from training; if not, recompute by frequency
    print('kept_class_indices.npy not found. Recomputing by frequency ...', flush=True)
    cls_to_idx = {c:i for i,c in enumerate(all_classes)}
    freq = np.zeros(len(all_classes), dtype=np.int64)
    batch = 200000
    for start in range(0, n_train, batch):
        end = min(start + batch, n_train)
        tags_batch = train['Tags'].iloc[start:end].astype(str).tolist()
        for tstr in tags_batch:
            if not tstr: continue
            for t in tstr.split():
                idx = cls_to_idx.get(t)
                if idx is not None: freq[idx] += 1
    keep_cls = np.argsort(-freq)[:n_classes]
    np.save(kept_idx_path, keep_cls)
kept_class_names = all_classes[keep_cls]
tag_to_keptpos = {all_classes[k]: i for i,k in enumerate(keep_cls)}

# Open memmaps
oof_mem = np.memmap(str(oof_path), dtype='float32', mode='r', shape=(n_train, n_classes))
test_avg = np.memmap(str(test_path), dtype='float32', mode='r', shape=(n_test, n_classes))

# Detect validation rows (rows with any nonzero OOF score) to avoid contaminating thresholds with zeros from train-fold rows
print('Detecting validation rows from OOF memmap ...', flush=True)
val_mask = np.zeros(n_train, dtype=bool)
row_batch = 20000
for start in range(0, n_train, row_batch):
    end = min(start + row_batch, n_train)
    S = np.asarray(oof_mem[start:end, :])
    val_mask[start:end] = np.any(np.abs(S) > 1e-8, axis=1)
    if ((start // row_batch) + 1) % 50 == 0 or end == n_train:
        print(f'  scanned rows {start}-{end}, val_mask true so far: {val_mask.sum()}', flush=True)
n_val_rows = int(val_mask.sum())
assert n_val_rows > 0, 'No validation rows detected in OOF memmap'
print(f'Validation rows detected: {n_val_rows}', flush=True)

# Build positive row indices per kept class, but only among validation rows
print('Indexing positive validation rows per class ...', flush=True)
pos_rows = [[] for _ in range(n_classes)]
batch = 200000
for start in range(0, n_train, batch):
    end = min(start + batch, n_train)
    tags_batch = train['Tags'].iloc[start:end].astype(str).tolist()
    for offset, tstr in enumerate(tags_batch):
        if not tstr: continue
        irow = start + offset
        if not val_mask[irow]:
            continue
        for t in tstr.split():
            kp = tag_to_keptpos.get(t)
            if kp is not None:
                pos_rows[kp].append(irow)
    if ((start // batch) + 1) % 10 == 0 or end == n_train:
        print(f'  indexed rows {start}-{end}', flush=True)

# Percentile grid for positive OOF margins
percentiles = [85.0, 90.0, 92.5, 95.0, 97.5]
thr = np.full(n_classes, np.inf, dtype=np.float32)

print('Computing per-class thresholds from positive OOF scores (validation rows only) ...', flush=True)
for c in range(n_classes):
    idx_rows = pos_rows[c]
    if idx_rows:
        pos_scores = oof_mem[np.array(idx_rows, dtype=np.int64), c]
        pos_scores = pos_scores[np.isfinite(pos_scores)]
        if pos_scores.size > 0:
            vals = [np.percentile(pos_scores, p) for p in percentiles[-3:]]
            thr[c] = np.float32(np.median(vals))
    if (c+1) % 2000 == 0 or c+1 == n_classes:
        print(f'  thresholds computed for {c+1}/{n_classes} classes', flush=True)

np.save(cache_dir / 'svc_thresholds.npy', thr)
print('Saved per-class thresholds to cache/svc_thresholds.npy')

# Stream-write submission using thresholds + top-K fallback
print('Writing submission.csv with streaming decode ...', flush=True)

id_col = test['Id'].values
K_fallback = 3  # slightly higher fallback to improve recall
max_tags = 7    # modestly higher cap
row_batch = 50000

with open('submission.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Id', 'Tags'])
    for start in range(0, n_test, row_batch):
        end = min(start + row_batch, n_test)
        S = np.asarray(test_avg[start:end, :])  # shape (B, C)
        for i in range(S.shape[0]):
            row = S[i]
            pred_mask = row >= thr
            idx = np.flatnonzero(pred_mask)
            if idx.size == 0:
                topk = np.argpartition(row, -K_fallback)[-K_fallback:]
                idx = topk[np.argsort(-row[topk])]
            else:
                if idx.size > max_tags:
                    sel = np.argpartition(row[idx], -max_tags)[-max_tags:]
                    idx = idx[sel[np.argsort(-row[idx][sel])]]
                else:
                    idx = idx[np.argsort(-row[idx])]
            tags = ' '.join(kept_class_names[idx].tolist())
            writer.writerow([int(id_col[start + i]), tags])
        print(f'  Wrote rows {start}-{end} / {n_test}', flush=True)

print(f'submission.csv written. Elapsed {time.time()-t0:.1f}s', flush=True)

Inferred n_classes from memmaps: 20000


Detecting validation rows from OOF memmap ...


  scanned rows 980000-1000000, val_mask true so far: 0


  scanned rows 1980000-2000000, val_mask true so far: 0


  scanned rows 2980000-3000000, val_mask true so far: 0


  scanned rows 3980000-4000000, val_mask true so far: 0


  scanned rows 4980000-5000000, val_mask true so far: 0


  scanned rows 5420000-5430775, val_mask true so far: 0


AssertionError: No validation rows detected in OOF memmap

In [25]:
# OOF-driven tuning of threshold scale and top-K fallback; regenerate submission
import numpy as np, os, csv, time
from pathlib import Path
t0 = time.time()
cache_dir = Path('cache')
oof_path = cache_dir / 'oof_svc.dat'
test_path = cache_dir / 'test_svc_avg.dat'
kept_idx_path = cache_dir / 'kept_class_indices.npy'
thr_path = cache_dir / 'svc_thresholds.npy'
mlb_classes_path = cache_dir / 'mlb_classes.npy'
assert oof_path.exists() and test_path.exists() and kept_idx_path.exists() and thr_path.exists() and mlb_classes_path.exists()

n_train = train.shape[0]
n_test = test.shape[0]
itemsize = np.dtype('float32').itemsize
n_classes = os.path.getsize(oof_path) // (n_train * itemsize)
assert n_classes == os.path.getsize(test_path) // (n_test * itemsize)
n_classes = int(n_classes)
print(f'Tuning using OOF memmap with shape ({n_train}, {n_classes})', flush=True)

keep_cls = np.load(kept_idx_path)
all_classes = np.load(mlb_classes_path, allow_pickle=True)
kept_class_names = all_classes[keep_cls]
thr = np.load(thr_path).astype(np.float32)
oof_mem = np.memmap(str(oof_path), dtype='float32', mode='r', shape=(n_train, n_classes))
test_avg = np.memmap(str(test_path), dtype='float32', mode='r', shape=(n_test, n_classes))

# Labels subset for evaluation (build directly from train['Tags'] for kept classes)
from numpy.random import RandomState
rng = RandomState(123)
n_eval = min(200000, n_train)
eval_idx = np.sort(rng.choice(n_train, size=n_eval, replace=False))
tag_to_keptpos = {all_classes[k]: i for i, k in enumerate(keep_cls)}
true_idx_list = []
batch = 50000
for s in range(0, n_eval, batch):
    e = min(s + batch, n_eval)
    tags_batch = train['Tags'].iloc[eval_idx[s:e]].astype(str).tolist()
    for tstr in tags_batch:
        if not tstr:
            true_idx_list.append(np.array([], dtype=np.int32))
            continue
        idxs = [tag_to_keptpos[t] for t in tstr.split() if t in tag_to_keptpos]
        if idxs:
            true_idx_list.append(np.array(sorted(set(idxs)), dtype=np.int32))
        else:
            true_idx_list.append(np.array([], dtype=np.int32))
    if ((s // batch) + 1) % 4 == 0 or e == n_eval:
        print(f'  built true labels for {e}/{n_eval} eval rows', flush=True)
assert len(true_idx_list) == n_eval
print(f'Eval subset: {n_eval} rows', flush=True)

scales = [0.85, 0.9, 0.95, 1.0, 1.05]
k_opts = [1, 2, 3]
max_tags = 7

def eval_setting(scale, k_fb):
    TP = FP = FN = 0
    bs = 2000
    thr_s = thr * scale
    for s in range(0, n_eval, bs):
        e = min(s + bs, n_eval)
        rows = e - s
        S = np.asarray(oof_mem[eval_idx[s:e], :], dtype=np.float32)
        for i in range(rows):
            row_scores = S[i]
            pred_idx = np.flatnonzero(row_scores >= thr_s)
            if pred_idx.size == 0:
                topk = np.argpartition(row_scores, -k_fb)[-k_fb:]
                pred_idx = topk[np.argsort(-row_scores[topk])]
            elif pred_idx.size > max_tags:
                sel = np.argpartition(row_scores[pred_idx], -max_tags)[-max_tags:]
                pred_idx = pred_idx[sel[np.argsort(-row_scores[pred_idx][sel])]]
            true_idx = true_idx_list[s+i]
            if true_idx.size == 0 and pred_idx.size == 0:
                continue
            if true_idx.size == 0:
                FP += pred_idx.size
                continue
            if pred_idx.size == 0:
                FN += true_idx.size
                continue
            a = np.sort(pred_idx)
            b = true_idx  # already sorted
            ia = ib = 0
            tp = 0
            while ia < a.size and ib < b.size:
                if a[ia] == b[ib]:
                    tp += 1; ia += 1; ib += 1
                elif a[ia] < b[ib]:
                    ia += 1
                else:
                    ib += 1
            TP += tp
            FP += a.size - tp
            FN += b.size - tp
        if ((s // bs) + 1) % 20 == 0 or e == n_eval:
            print(f'  progress {e}/{n_eval} for scale={scale}, K={k_fb}', flush=True)
    prec = TP / max(1, TP + FP)
    rec = TP / max(1, TP + FN)
    f1 = 0.0 if (prec + rec) == 0 else 2 * prec * rec / (prec + rec)
    return f1, (TP, FP, FN)

best = (-1.0, None, None)
for scale in scales:
    for k in k_opts:
        f1, counts = eval_setting(scale, k)
        print(f'scale={scale:.2f}, K={k} -> micro-F1 {f1:.5f} (TP,FP,FN)={counts}', flush=True)
        if f1 > best[0]:
            best = (f1, scale, k)

best_f1, best_scale, best_k = best
print(f'Best setting: scale={best_scale:.2f}, K={best_k} with OOF micro-F1 {best_f1:.5f}', flush=True)

# Regenerate submission with tuned params
thr_s = (thr * best_scale).astype(np.float32)
id_col = test['Id'].values
row_batch = 50000
out_path = 'submission.csv'
with open(out_path, 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['Id', 'Tags'])
    for start in range(0, n_test, row_batch):
        end = min(start + row_batch, n_test)
        S = np.asarray(test_avg[start:end, :])
        for i in range(S.shape[0]):
            row = S[i]
            idx = np.flatnonzero(row >= thr_s)
            if idx.size == 0:
                topk = np.argpartition(row, -best_k)[-best_k:]
                idx = topk[np.argsort(-row[topk])]
            elif idx.size > max_tags:
                sel = np.argpartition(row[idx], -max_tags)[-max_tags:]
                idx = idx[sel[np.argsort(-row[idx][sel])]]
            else:
                idx = idx[np.argsort(-row[idx])]
            tags = ' '.join(kept_class_names[idx].tolist())
            w.writerow([int(id_col[start + i]), tags])
        print(f'  wrote rows {start}-{end}/{n_test}', flush=True)
print(f'Rewritten submission.csv with tuned params in {time.time()-t0:.1f}s', flush=True)

Tuning using OOF memmap with shape (5430775, 20000)


  built true labels for 200000/200000 eval rows


Eval subset: 200000 rows


  progress 40000/200000 for scale=0.85, K=1


  progress 80000/200000 for scale=0.85, K=1


  progress 120000/200000 for scale=0.85, K=1


  progress 160000/200000 for scale=0.85, K=1


  progress 200000/200000 for scale=0.85, K=1


scale=0.85, K=1 -> micro-F1 0.00004 (TP,FP,FN)=(37, 1399963, 570692)


  progress 40000/200000 for scale=0.85, K=2


  progress 80000/200000 for scale=0.85, K=2


  progress 120000/200000 for scale=0.85, K=2


  progress 160000/200000 for scale=0.85, K=2


  progress 200000/200000 for scale=0.85, K=2


scale=0.85, K=2 -> micro-F1 0.00004 (TP,FP,FN)=(37, 1399963, 570692)


  progress 40000/200000 for scale=0.85, K=3


  progress 80000/200000 for scale=0.85, K=3


  progress 120000/200000 for scale=0.85, K=3


  progress 160000/200000 for scale=0.85, K=3


  progress 200000/200000 for scale=0.85, K=3


scale=0.85, K=3 -> micro-F1 0.00004 (TP,FP,FN)=(37, 1399963, 570692)


  progress 40000/200000 for scale=0.9, K=1


  progress 80000/200000 for scale=0.9, K=1


  progress 120000/200000 for scale=0.9, K=1


  progress 160000/200000 for scale=0.9, K=1


  progress 200000/200000 for scale=0.9, K=1


scale=0.90, K=1 -> micro-F1 0.00004 (TP,FP,FN)=(37, 1399963, 570692)


  progress 40000/200000 for scale=0.9, K=2


  progress 80000/200000 for scale=0.9, K=2


  progress 120000/200000 for scale=0.9, K=2


  progress 160000/200000 for scale=0.9, K=2


  progress 200000/200000 for scale=0.9, K=2


scale=0.90, K=2 -> micro-F1 0.00004 (TP,FP,FN)=(37, 1399963, 570692)


  progress 40000/200000 for scale=0.9, K=3


  progress 80000/200000 for scale=0.9, K=3


  progress 120000/200000 for scale=0.9, K=3


  progress 160000/200000 for scale=0.9, K=3


  progress 200000/200000 for scale=0.9, K=3


scale=0.90, K=3 -> micro-F1 0.00004 (TP,FP,FN)=(37, 1399963, 570692)


  progress 40000/200000 for scale=0.95, K=1


  progress 80000/200000 for scale=0.95, K=1


  progress 120000/200000 for scale=0.95, K=1


  progress 160000/200000 for scale=0.95, K=1


  progress 200000/200000 for scale=0.95, K=1


scale=0.95, K=1 -> micro-F1 0.00004 (TP,FP,FN)=(37, 1399963, 570692)


  progress 40000/200000 for scale=0.95, K=2


  progress 80000/200000 for scale=0.95, K=2


  progress 120000/200000 for scale=0.95, K=2


  progress 160000/200000 for scale=0.95, K=2


  progress 200000/200000 for scale=0.95, K=2


scale=0.95, K=2 -> micro-F1 0.00004 (TP,FP,FN)=(37, 1399963, 570692)


  progress 40000/200000 for scale=0.95, K=3


  progress 80000/200000 for scale=0.95, K=3


  progress 120000/200000 for scale=0.95, K=3


  progress 160000/200000 for scale=0.95, K=3


  progress 200000/200000 for scale=0.95, K=3


scale=0.95, K=3 -> micro-F1 0.00004 (TP,FP,FN)=(37, 1399963, 570692)


  progress 40000/200000 for scale=1.0, K=1


  progress 80000/200000 for scale=1.0, K=1


  progress 120000/200000 for scale=1.0, K=1


  progress 160000/200000 for scale=1.0, K=1


  progress 200000/200000 for scale=1.0, K=1


scale=1.00, K=1 -> micro-F1 0.00004 (TP,FP,FN)=(37, 1399963, 570692)


  progress 40000/200000 for scale=1.0, K=2


  progress 80000/200000 for scale=1.0, K=2


  progress 120000/200000 for scale=1.0, K=2


  progress 160000/200000 for scale=1.0, K=2


  progress 200000/200000 for scale=1.0, K=2


scale=1.00, K=2 -> micro-F1 0.00004 (TP,FP,FN)=(37, 1399963, 570692)


  progress 40000/200000 for scale=1.0, K=3


  progress 80000/200000 for scale=1.0, K=3


  progress 120000/200000 for scale=1.0, K=3


  progress 160000/200000 for scale=1.0, K=3


  progress 200000/200000 for scale=1.0, K=3


scale=1.00, K=3 -> micro-F1 0.00004 (TP,FP,FN)=(37, 1399963, 570692)


  progress 40000/200000 for scale=1.05, K=1


  progress 80000/200000 for scale=1.05, K=1


  progress 120000/200000 for scale=1.05, K=1


  progress 160000/200000 for scale=1.05, K=1


  progress 200000/200000 for scale=1.05, K=1


scale=1.05, K=1 -> micro-F1 0.00004 (TP,FP,FN)=(37, 1399963, 570692)


  progress 40000/200000 for scale=1.05, K=2


  progress 80000/200000 for scale=1.05, K=2


  progress 120000/200000 for scale=1.05, K=2


  progress 160000/200000 for scale=1.05, K=2


  progress 200000/200000 for scale=1.05, K=2


scale=1.05, K=2 -> micro-F1 0.00004 (TP,FP,FN)=(37, 1399963, 570692)


  progress 40000/200000 for scale=1.05, K=3


  progress 80000/200000 for scale=1.05, K=3


  progress 120000/200000 for scale=1.05, K=3


  progress 160000/200000 for scale=1.05, K=3


  progress 200000/200000 for scale=1.05, K=3


scale=1.05, K=3 -> micro-F1 0.00004 (TP,FP,FN)=(37, 1399963, 570692)


Best setting: scale=0.85, K=1 with OOF micro-F1 0.00004


  wrote rows 0-50000/603420


  wrote rows 50000-100000/603420


  wrote rows 100000-150000/603420


  wrote rows 150000-200000/603420


  wrote rows 200000-250000/603420


  wrote rows 250000-300000/603420


  wrote rows 300000-350000/603420


  wrote rows 350000-400000/603420


  wrote rows 400000-450000/603420


  wrote rows 450000-500000/603420


  wrote rows 500000-550000/603420


  wrote rows 550000-600000/603420


  wrote rows 600000-603420/603420


Rewritten submission.csv with tuned params in 355.2s


In [26]:
# Regenerate submission using baseline decoding (thr scale=1.0, K_fallback=2, max_tags=6)
import os, csv, numpy as np, time
from pathlib import Path
t0 = time.time()
cache_dir = Path('cache')
oof_path = cache_dir / 'oof_svc.dat'
test_path = cache_dir / 'test_svc_avg.dat'
kept_idx_path = cache_dir / 'kept_class_indices.npy'
thr_path = cache_dir / 'svc_thresholds.npy'
mlb_classes_path = cache_dir / 'mlb_classes.npy'
assert test_path.exists() and kept_idx_path.exists() and thr_path.exists() and mlb_classes_path.exists()
n_test = test.shape[0]
itemsize = np.dtype('float32').itemsize
n_classes = os.path.getsize(test_path) // (n_test * itemsize)
n_classes = int(n_classes)
keep_cls = np.load(kept_idx_path)
all_classes = np.load(mlb_classes_path, allow_pickle=True)
kept_class_names = all_classes[keep_cls]
thr = np.load(thr_path).astype(np.float32)
assert thr.shape[0] == n_classes, 'Thresholds/classes mismatch'
test_avg = np.memmap(str(test_path), dtype='float32', mode='r', shape=(n_test, n_classes))
thr_s = thr  # scale=1.0
K_fallback = 2
max_tags = 6
id_col = test['Id'].values
row_batch = 50000
out_path = 'submission.csv'
with open(out_path, 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['Id', 'Tags'])
    for start in range(0, n_test, row_batch):
        end = min(start + row_batch, n_test)
        S = np.asarray(test_avg[start:end, :])
        for i in range(S.shape[0]):
            row = S[i]
            idx = np.flatnonzero(row >= thr_s)
            if idx.size == 0:
                topk = np.argpartition(row, -K_fallback)[-K_fallback:]
                idx = topk[np.argsort(-row[topk])]
            elif idx.size > max_tags:
                sel = np.argpartition(row[idx], -max_tags)[-max_tags:]
                idx = idx[sel[np.argsort(-row[idx][sel])]]
            else:
                idx = idx[np.argsort(-row[idx])]
            tags = ' '.join(kept_class_names[idx].tolist())
            w.writerow([int(id_col[start + i]), tags])
        print(f'  wrote rows {start}-{end}/{n_test}', flush=True)
print(f'Regenerated submission.csv with baseline decoding in {time.time()-t0:.1f}s', flush=True)

  wrote rows 0-50000/603420


  wrote rows 50000-100000/603420


  wrote rows 100000-150000/603420


  wrote rows 150000-200000/603420


  wrote rows 200000-250000/603420


  wrote rows 250000-300000/603420


  wrote rows 300000-350000/603420


  wrote rows 350000-400000/603420


  wrote rows 400000-450000/603420


  wrote rows 450000-500000/603420


  wrote rows 500000-550000/603420


  wrote rows 550000-600000/603420


  wrote rows 600000-603420/603420


Regenerated submission.csv with baseline decoding in 54.2s


In [35]:
# Emergency decode: per-row Top-K only from SVC scores (avoid OOF-derived thresholds)
import os, csv, numpy as np, time
from pathlib import Path
t0 = time.time()
cache_dir = Path('cache')
test_path = cache_dir / 'test_svc_avg.dat'
kept_idx_path = cache_dir / 'kept_class_indices.npy'
mlb_classes_path = cache_dir / 'mlb_classes.npy'
assert test_path.exists() and kept_idx_path.exists() and mlb_classes_path.exists()

n_test = test.shape[0]
itemsize = np.dtype('float32').itemsize
n_classes = os.path.getsize(test_path) // (n_test * itemsize)
n_classes = int(n_classes)
print(f'Decoding Top-K using test memmap shape ({n_test}, {n_classes})', flush=True)

keep_cls = np.load(kept_idx_path)
all_classes = np.load(mlb_classes_path, allow_pickle=True)
kept_class_names = all_classes[keep_cls]
test_avg = np.memmap(str(test_path), dtype='float32', mode='r', shape=(n_test, n_classes))

# Heuristic: choose K per row around dataset avg tags (approx 3). Try K=3.
K = 3
max_tags = 6
id_col = test['Id'].values
row_batch = 50000
out_path = 'submission.csv'
with open(out_path, 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['Id', 'Tags'])
    for start in range(0, n_test, row_batch):
        end = min(start + row_batch, n_test)
        S = np.asarray(test_avg[start:end, :])
        for i in range(S.shape[0]):
            row = S[i]
            topk = np.argpartition(row, -K)[-K:]
            idx = topk[np.argsort(-row[topk])]
            if idx.size > max_tags:
                idx = idx[:max_tags]
            tags = ' '.join(kept_class_names[idx].tolist())
            w.writerow([int(id_col[start + i]), tags])
        print(f'  wrote rows {start}-{end}/{n_test}', flush=True)
print(f'Wrote Top-{K} submission in {time.time()-t0:.1f}s', flush=True)

Decoding Top-K using test memmap shape (603420, 20000)


  wrote rows 0-50000/603420


  wrote rows 50000-100000/603420


  wrote rows 100000-150000/603420


  wrote rows 150000-200000/603420


  wrote rows 200000-250000/603420


  wrote rows 250000-300000/603420


  wrote rows 300000-350000/603420


  wrote rows 350000-400000/603420


  wrote rows 400000-450000/603420


  wrote rows 450000-500000/603420


  wrote rows 500000-550000/603420


  wrote rows 550000-600000/603420


  wrote rows 600000-603420/603420


Wrote Top-3 submission in 24.4s


In [29]:
# Z-score + frequency prior decoding from SVC margins (no OOF needed)
import os, csv, time, numpy as np, pandas as pd
from pathlib import Path

t0 = time.time()
cache_dir = Path('cache')
test_path = cache_dir / 'test_svc_avg.dat'
kept_idx_path = cache_dir / 'kept_class_indices.npy'
mlb_classes_path = cache_dir / 'mlb_classes.npy'
assert test_path.exists() and kept_idx_path.exists() and mlb_classes_path.exists(), 'Missing cached artifacts'

n_test = test.shape[0]
itemsize = np.dtype('float32').itemsize
n_classes = os.path.getsize(test_path) // (n_test * itemsize)
n_classes = int(n_classes)
print(f'Start zscore+prior decoding. Test memmap shape: ({n_test}, {n_classes})', flush=True)

keep_cls = np.load(kept_idx_path)
all_classes = np.load(mlb_classes_path, allow_pickle=True)
kept_class_names = all_classes[keep_cls]
cls_to_idx = {c:i for i,c in enumerate(all_classes)}

# Build frequency prior over kept classes: alpha * zscore(log1p(freq_kept))
alpha = 0.20  # expert default; try 0.15/0.25 if time remains
print('Computing frequency prior over kept classes ...', flush=True)
freq_all = np.zeros(len(all_classes), dtype=np.int64)
batch = 200000
for start in range(0, train.shape[0], batch):
    end = min(start + batch, train.shape[0])
    tags_batch = train['Tags'].iloc[start:end].astype(str).tolist()
    for tstr in tags_batch:
        if not tstr: continue
        for t in tstr.split():
            idx = cls_to_idx.get(t)
            if idx is not None:
                freq_all[idx] += 1
    if ((start // batch) + 1) % 10 == 0 or end == train.shape[0]:
        print(f'  counted rows {start}-{end}', flush=True)
freq_kept = freq_all[keep_cls]
logf = np.log1p(freq_kept).astype(np.float32)
mu, sd = float(logf.mean()), float(logf.std() + 1e-6)
prior = alpha * ((logf - mu) / sd).astype(np.float32)
print(f'Prior built. mean={prior.mean():.4f}, std={prior.std():.4f}', flush=True)

# Choose global threshold theta to target ~2.9 avg tags per row via quick grid on a sample
print('Selecting global theta to match avg tag count ~2.9 ...', flush=True)
test_avg = np.memmap(str(test_path), dtype='float32', mode='r', shape=(n_test, n_classes))
target_avg = 2.9
grid = np.linspace(-0.2, 0.8, 11).astype(np.float32)
sample_n = min(100000, n_test)
row_batch = 50000
best_theta, best_diff = None, 1e9
for theta in grid:
    total = 0
    seen = 0
    for s in range(0, sample_n, row_batch):
        e = min(s + row_batch, sample_n)
        S = np.asarray(test_avg[s:e, :], dtype=np.float32)
        m = S.mean(axis=1, keepdims=True)
        st = S.std(axis=1, keepdims=True) + 1e-6
        Z = (S - m) / st
        Zp = Z + prior[None, :]
        cnt = (Zp >= theta).sum(axis=1)
        total += int(cnt.sum())
        seen += (e - s)
    avg = total / max(1, seen)
    diff = abs(avg - target_avg)
    print(f'  theta={float(theta):.2f} -> avg tags {avg:.3f}', flush=True)
    if diff < best_diff:
        best_diff, best_theta = diff, float(theta)
if best_theta is None:
    best_theta = 0.30  # fallback default
print(f'Chosen theta={best_theta:.2f} (diff={best_diff:.3f})', flush=True)

# Decode full test with Z-score + prior + global theta, with fallbacks and caps
print('Writing submission.csv using zscore+prior decoding ...', flush=True)
id_col = test['Id'].values
K_fallback = 3
min_k = 2
max_tags = 6
with open('submission.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['Id', 'Tags'])
    for start in range(0, n_test, row_batch):
        end = min(start + row_batch, n_test)
        S = np.asarray(test_avg[start:end, :], dtype=np.float32)
        m = S.mean(axis=1, keepdims=True)
        st = S.std(axis=1, keepdims=True) + 1e-6
        Z = (S - m) / st
        Zp = Z + prior[None, :]
        for i in range(Zp.shape[0]):
            row = Zp[i]
            idx = np.flatnonzero(row >= best_theta)
            if idx.size == 0:
                # fallback Top-K
                base = S[i]  # use raw margins to rank ties reasonably
                topk = np.argpartition(base, -K_fallback)[-K_fallback:]
                idx = topk[np.argsort(-base[topk])]
            elif idx.size == 1 and min_k >= 2:
                base = S[i]
                top2 = np.argpartition(base, -2)[-2:]
                top2 = top2[np.argsort(-base[top2])]
                idx = top2
            else:
                # order selected by score descending
                idx = idx[np.argsort(-row[idx])]
            if idx.size > max_tags:
                idx = idx[:max_tags]
            tags = ' '.join(kept_class_names[idx].tolist())
            w.writerow([int(id_col[start + i]), tags])
        print(f'  wrote rows {start}-{end}/{n_test}', flush=True)
print(f'Zscore+prior submission written in {time.time()-t0:.1f}s', flush=True)

Start zscore+prior decoding. Test memmap shape: (603420, 20000)


Computing frequency prior over kept classes ...


  counted rows 1800000-2000000


  counted rows 3800000-4000000


  counted rows 5400000-5430775


Prior built. mean=-0.0000, std=0.2000


Selecting global theta to match avg tag count ~2.9 ...


  theta=-0.20 -> avg tags 17508.000


  theta=-0.10 -> avg tags 12200.000


  theta=0.00 -> avg tags 8128.000


  theta=0.10 -> avg tags 5148.000


  theta=0.20 -> avg tags 3154.000


  theta=0.30 -> avg tags 1828.000


  theta=0.40 -> avg tags 960.000


  theta=0.50 -> avg tags 511.000


  theta=0.60 -> avg tags 238.000


  theta=0.70 -> avg tags 98.000


  theta=0.80 -> avg tags 40.000


Chosen theta=0.80 (diff=37.100)


Writing submission.csv using zscore+prior decoding ...


  wrote rows 0-50000/603420


  wrote rows 50000-100000/603420


  wrote rows 100000-150000/603420


  wrote rows 150000-200000/603420


  wrote rows 200000-250000/603420


  wrote rows 250000-300000/603420


  wrote rows 300000-350000/603420


  wrote rows 350000-400000/603420


  wrote rows 400000-450000/603420


  wrote rows 450000-500000/603420


  wrote rows 500000-550000/603420


  wrote rows 550000-600000/603420


  wrote rows 600000-603420/603420


Zscore+prior submission written in 214.2s


In [30]:
# Zscore+prior decode with fixed theta=0.30 (robust default), alpha=0.20
import os, csv, time, numpy as np, pandas as pd
from pathlib import Path

t0 = time.time()
cache_dir = Path('cache')
test_path = cache_dir / 'test_svc_avg.dat'
kept_idx_path = cache_dir / 'kept_class_indices.npy'
mlb_classes_path = cache_dir / 'mlb_classes.npy'
assert test_path.exists() and kept_idx_path.exists() and mlb_classes_path.exists()

n_test = test.shape[0]
itemsize = np.dtype('float32').itemsize
n_classes = os.path.getsize(test_path) // (n_test * itemsize)
n_classes = int(n_classes)
print(f'Fixed-theta zscore+prior decode. Test shape ({n_test}, {n_classes})', flush=True)

keep_cls = np.load(kept_idx_path)
all_classes = np.load(mlb_classes_path, allow_pickle=True)
kept_class_names = all_classes[keep_cls]
cls_to_idx = {c:i for i,c in enumerate(all_classes)}
test_avg = np.memmap(str(test_path), dtype='float32', mode='r', shape=(n_test, n_classes))

# Build frequency prior (alpha=0.20)
alpha = 0.20
freq_all = np.zeros(len(all_classes), dtype=np.int64)
batch = 200000
for start in range(0, train.shape[0], batch):
    end = min(start + batch, train.shape[0])
    tags_batch = train['Tags'].iloc[start:end].astype(str).tolist()
    for tstr in tags_batch:
        if not tstr: continue
        for t in tstr.split():
            idx = cls_to_idx.get(t)
            if idx is not None:
                freq_all[idx] += 1
freq_kept = freq_all[keep_cls]
logf = np.log1p(freq_kept).astype(np.float32)
mu, sd = float(logf.mean()), float(logf.std() + 1e-6)
prior = alpha * ((logf - mu) / sd).astype(np.float32)

# Use fixed theta=0.30 (expert default). Verify avg tags on 100k sample.
theta = 0.30
sample_n = min(100000, n_test)
row_batch = 50000
tot_mean = 0.0; seen = 0
for s in range(0, sample_n, row_batch):
    e = min(s + row_batch, sample_n)
    S = np.asarray(test_avg[s:e, :], dtype=np.float32)
    m = S.mean(axis=1, keepdims=True)
    st = S.std(axis=1, keepdims=True) + 1e-6
    Z = (S - m) / st
    Zp = Z + prior[None, :]
    cnt = np.count_nonzero(Zp >= theta, axis=1).astype(np.float32)
    tot_mean += float(cnt.sum())
    seen += (e - s)
avg_tags = tot_mean / max(1, seen)
print(f'Fixed theta={theta:.2f} -> sample avg tags ~{avg_tags:.3f}', flush=True)

# Decode full test with fallbacks/caps
id_col = test['Id'].values
K_fallback = 3
min_k = 2
max_tags = 6
out_path = 'submission.csv'
with open(out_path, 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['Id', 'Tags'])
    for start in range(0, n_test, row_batch):
        end = min(start + row_batch, n_test)
        S = np.asarray(test_avg[start:end, :], dtype=np.float32)
        m = S.mean(axis=1, keepdims=True)
        st = S.std(axis=1, keepdims=True) + 1e-6
        Z = (S - m) / st
        Zp = Z + prior[None, :]
        for i in range(Zp.shape[0]):
            row = Zp[i]
            idx = np.flatnonzero(row >= theta)
            if idx.size == 0:
                base = S[i]
                topk = np.argpartition(base, -K_fallback)[-K_fallback:]
                idx = topk[np.argsort(-base[topk])]
            elif idx.size == 1 and min_k >= 2:
                base = S[i]
                top2 = np.argpartition(base, -2)[-2:]
                idx = top2[np.argsort(-base[top2])]
            else:
                idx = idx[np.argsort(-row[idx])]
            if idx.size > max_tags:
                idx = idx[:max_tags]
            tags = ' '.join(kept_class_names[idx].tolist())
            w.writerow([int(id_col[start + i]), tags])
        print(f'  wrote rows {start}-{end}/{n_test}', flush=True)
print(f'Fixed-theta zscore+prior submission written in {time.time()-t0:.1f}s', flush=True)

Fixed-theta zscore+prior decode. Test shape (603420, 20000)


Fixed theta=0.30 -> sample avg tags ~1828.000


  wrote rows 0-50000/603420


  wrote rows 50000-100000/603420


  wrote rows 100000-150000/603420


  wrote rows 150000-200000/603420


  wrote rows 200000-250000/603420


  wrote rows 250000-300000/603420


  wrote rows 300000-350000/603420


  wrote rows 350000-400000/603420


  wrote rows 400000-450000/603420


  wrote rows 450000-500000/603420


  wrote rows 500000-550000/603420


  wrote rows 550000-600000/603420


  wrote rows 600000-603420/603420


Fixed-theta zscore+prior submission written in 64.2s


In [31]:
# Zscore+prior with std floor (robust), theta via grid to avg ~2.9, then decode
import os, csv, time, numpy as np, pandas as pd
from pathlib import Path

t0 = time.time()
cache_dir = Path('cache')
test_path = cache_dir / 'test_svc_avg.dat'
kept_idx_path = cache_dir / 'kept_class_indices.npy'
mlb_classes_path = cache_dir / 'mlb_classes.npy'
assert test_path.exists() and kept_idx_path.exists() and mlb_classes_path.exists(), 'Missing cached artifacts'

n_test = test.shape[0]
itemsize = np.dtype('float32').itemsize
n_classes = os.path.getsize(test_path) // (n_test * itemsize)
n_classes = int(n_classes)
print(f'Robust zscore+prior decode. Test memmap: ({n_test}, {n_classes})', flush=True)

keep_cls = np.load(kept_idx_path)
all_classes = np.load(mlb_classes_path, allow_pickle=True)
kept_class_names = all_classes[keep_cls]
cls_to_idx = {c:i for i,c in enumerate(all_classes)}
test_avg = np.memmap(str(test_path), dtype='float32', mode='r', shape=(n_test, n_classes))

# Build frequency prior (alpha=0.20)
alpha = 0.20
freq_all = np.zeros(len(all_classes), dtype=np.int64)
batch = 200000
for start in range(0, train.shape[0], batch):
    end = min(start + batch, train.shape[0])
    tags_batch = train['Tags'].iloc[start:end].astype(str).tolist()
    for tstr in tags_batch:
        if not tstr: continue
        for t in tstr.split():
            idx = cls_to_idx.get(t)
            if idx is not None:
                freq_all[idx] += 1
freq_kept = freq_all[keep_cls]
logf = np.log1p(freq_kept).astype(np.float32)
mu, sd = float(logf.mean()), float(logf.std() + 1e-6)
prior = alpha * ((logf - mu) / sd).astype(np.float32)

# Select theta using a grid with robust per-row std floor to avoid exploding Z
print('Selecting theta with robust std floor ...', flush=True)
target_avg = 2.9
grid = np.linspace(-0.5, 2.0, 26).astype(np.float32)  # wider range
sample_n = min(100000, n_test)
row_batch = 50000
std_floor = 1.0  # key fix to prevent division by tiny std
best_theta, best_diff = None, 1e9
for theta in grid:
    total = 0.0
    seen = 0
    for s in range(0, sample_n, row_batch):
        e = min(s + row_batch, sample_n)
        S = np.asarray(test_avg[s:e, :], dtype=np.float32)
        m = S.mean(axis=1, keepdims=True)
        st = S.std(axis=1, keepdims=True)
        st = np.maximum(st, std_floor)
        Z = (S - m) / st
        Z = np.clip(Z, -5.0, 5.0)
        Zp = Z + prior[None, :]
        cnt = np.count_nonzero(Zp >= theta, axis=1).astype(np.float32)
        total += float(cnt.sum())
        seen += (e - s)
    avg = total / max(1, seen)
    diff = abs(avg - target_avg)
    print(f'  theta={float(theta):.2f} -> avg tags {avg:.3f}', flush=True)
    if diff < best_diff:
        best_diff, best_theta = diff, float(theta)
if best_theta is None:
    best_theta = 0.30
print(f'Chosen theta={best_theta:.2f} (diff={best_diff:.3f}) with std_floor={std_floor}', flush=True)

# Decode full test with fallbacks and caps
print('Writing submission.csv (robust zscore+prior) ...', flush=True)
id_col = test['Id'].values
K_fallback = 3
min_k = 2
max_tags = 6
with open('submission.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['Id', 'Tags'])
    for start in range(0, n_test, row_batch):
        end = min(start + row_batch, n_test)
        S = np.asarray(test_avg[start:end, :], dtype=np.float32)
        m = S.mean(axis=1, keepdims=True)
        st = S.std(axis=1, keepdims=True)
        st = np.maximum(st, std_floor)
        Z = (S - m) / st
        Z = np.clip(Z, -5.0, 5.0)
        Zp = Z + prior[None, :]
        for i in range(Zp.shape[0]):
            row = Zp[i]
            idx = np.flatnonzero(row >= best_theta)
            if idx.size == 0:
                base = S[i]
                topk = np.argpartition(base, -K_fallback)[-K_fallback:]
                idx = topk[np.argsort(-base[topk])]
            elif idx.size == 1 and min_k >= 2:
                base = S[i]
                top2 = np.argpartition(base, -2)[-2:]
                idx = top2[np.argsort(-base[top2])]
            else:
                idx = idx[np.argsort(-row[idx])]
            if idx.size > max_tags:
                idx = idx[:max_tags]
            tags = ' '.join(kept_class_names[idx].tolist())
            w.writerow([int(id_col[start + i]), tags])
        print(f'  wrote rows {start}-{end}/{n_test}', flush=True)
print(f'Robust zscore+prior submission written in {time.time()-t0:.1f}s', flush=True)

Robust zscore+prior decode. Test memmap: (603420, 20000)


Selecting theta with robust std floor ...


  theta=-0.50 -> avg tags 20000.000


  theta=-0.40 -> avg tags 20000.000


  theta=-0.30 -> avg tags 20000.000


  theta=-0.20 -> avg tags 17508.000


  theta=-0.10 -> avg tags 12200.000


  theta=0.00 -> avg tags 8128.000


  theta=0.10 -> avg tags 5148.000


  theta=0.20 -> avg tags 3154.000


  theta=0.30 -> avg tags 1828.000


  theta=0.40 -> avg tags 960.000


  theta=0.50 -> avg tags 511.000


  theta=0.60 -> avg tags 238.000


  theta=0.70 -> avg tags 98.000


  theta=0.80 -> avg tags 40.000


  theta=0.90 -> avg tags 21.000


  theta=1.00 -> avg tags 13.000


  theta=1.10 -> avg tags 5.000


  theta=1.20 -> avg tags 0.000


  theta=1.30 -> avg tags 0.000


  theta=1.40 -> avg tags 0.000


  theta=1.50 -> avg tags 0.000


  theta=1.60 -> avg tags 0.000


  theta=1.70 -> avg tags 0.000


  theta=1.80 -> avg tags 0.000


  theta=1.90 -> avg tags 0.000


  theta=2.00 -> avg tags 0.000


Chosen theta=1.10 (diff=2.100) with std_floor=1.0


Writing submission.csv (robust zscore+prior) ...


  wrote rows 0-50000/603420


  wrote rows 50000-100000/603420


  wrote rows 100000-150000/603420


  wrote rows 150000-200000/603420


  wrote rows 200000-250000/603420


  wrote rows 250000-300000/603420


  wrote rows 300000-350000/603420


  wrote rows 350000-400000/603420


  wrote rows 400000-450000/603420


  wrote rows 450000-500000/603420


  wrote rows 500000-550000/603420


  wrote rows 550000-600000/603420


  wrote rows 600000-603420/603420


Robust zscore+prior submission written in 254.7s


In [32]:
# Prior-augmented Dynamic-K decoding (no OOF needed): alpha=0.10, gap_thr=0.2, base K=3..6
import os, csv, time, numpy as np, pandas as pd
from pathlib import Path

t0 = time.time()
cache_dir = Path('cache')
test_path = cache_dir / 'test_svc_avg.dat'
kept_idx_path = cache_dir / 'kept_class_indices.npy'
mlb_classes_path = cache_dir / 'mlb_classes.npy'
assert test_path.exists() and kept_idx_path.exists() and mlb_classes_path.exists(), 'Missing cached artifacts'

n_test = test.shape[0]
itemsize = np.dtype('float32').itemsize
n_classes = os.path.getsize(test_path) // (n_test * itemsize)
n_classes = int(n_classes)
print(f'Dynamic-K decode. Test memmap: ({n_test}, {n_classes})', flush=True)

keep_cls = np.load(kept_idx_path)
all_classes = np.load(mlb_classes_path, allow_pickle=True)
kept_class_names = all_classes[keep_cls]
cls_to_idx = {c:i for i,c in enumerate(all_classes)}
test_avg = np.memmap(str(test_path), dtype='float32', mode='r', shape=(n_test, n_classes))

# Build frequency prior with smaller weight (alpha=0.10)
alpha = 0.10
print('Computing frequency prior ...', flush=True)
freq_all = np.zeros(len(all_classes), dtype=np.int64)
batch = 200000
for start in range(0, train.shape[0], batch):
    end = min(start + batch, train.shape[0])
    tags_batch = train['Tags'].iloc[start:end].astype(str).tolist()
    for tstr in tags_batch:
        if not tstr: continue
        for t in tstr.split():
            idx = cls_to_idx.get(t)
            if idx is not None:
                freq_all[idx] += 1
freq_kept = freq_all[keep_cls]
logf = np.log1p(freq_kept).astype(np.float32)
mu, sd = float(logf.mean()), float(logf.std() + 1e-6)
prior = alpha * ((logf - mu) / sd).astype(np.float32)
print(f'Prior ready. mean={prior.mean():.4f}, std={prior.std():.4f}', flush=True)

# Dynamic-K parameters
base_k = 3
max_k = 6
gap_thr = 0.2  # increase K while normalized gap between k and k+1 is small
min_k = 2
max_tags = 6
row_batch = 50000
id_col = test['Id'].values

def decode_rows(S: np.ndarray, prior: np.ndarray):
    # S: (B, C) raw margins
    B, C = S.shape
    # row-wise z-score for stability
    m = S.mean(axis=1, keepdims=True)
    st = S.std(axis=1, keepdims=True) + 1e-6
    Z = (S - m) / st
    Zp = Z + prior[None, :]
    out_idx = []
    for i in range(B):
        row = Zp[i]
        # get top max_k+1 to evaluate gaps (safe if C < max_k+1)
        k_take = min(max_k + 1, row.shape[0])
        top_idx = np.argpartition(row, -k_take)[-k_take:]
        top_idx = top_idx[np.argsort(-row[top_idx])]
        # compute normalized gaps on Z (use top of Zp for ranking but normalize gaps by std of top raw Z scores)
        # use Z (without prior) for gap normalization to avoid prior dominating std
        zrow = Z[i]
        z_top = zrow[top_idx[:min(6, zrow.shape[0])]]
        z_std = float(np.std(z_top)) if z_top.size > 0 else 1.0
        if z_std < 1e-6: z_std = 1.0
        K = base_k
        # increase K while gap between K and K+1 is small
        while K < max_k and (K < top_idx.size):
            if K >= top_idx.size: break
            s_k = row[top_idx[K-1]]
            s_next = row[top_idx[K]] if K < top_idx.size else -1e9
            gap = (s_k - s_next) / z_std
            if gap < gap_thr:
                K += 1
            else:
                break
        if K < min_k:
            K = min_k
        sel = top_idx[:K]
        out_idx.append(sel)
    return out_idx

out_path = 'submission.csv'
with open(out_path, 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['Id', 'Tags'])
    for start in range(0, n_test, row_batch):
        end = min(start + row_batch, n_test)
        S = np.asarray(test_avg[start:end, :], dtype=np.float32)
        decoded = decode_rows(S, prior)
        for i, idx in enumerate(decoded):
            if idx.size > max_tags:
                idx = idx[:max_tags]
            tags = ' '.join(kept_class_names[idx].tolist())
            w.writerow([int(id_col[start + i]), tags])
        print(f'  wrote rows {start}-{end}/{n_test}', flush=True)
print(f'Dynamic-K submission written in {time.time()-t0:.1f}s', flush=True)

Dynamic-K decode. Test memmap: (603420, 20000)


Computing frequency prior ...


Prior ready. mean=-0.0000, std=0.1000


  wrote rows 0-50000/603420


  wrote rows 50000-100000/603420


  wrote rows 100000-150000/603420


  wrote rows 150000-200000/603420


  wrote rows 200000-250000/603420


  wrote rows 250000-300000/603420


  wrote rows 300000-350000/603420


  wrote rows 350000-400000/603420


  wrote rows 400000-450000/603420


  wrote rows 450000-500000/603420


  wrote rows 500000-550000/603420


  wrote rows 550000-600000/603420


  wrote rows 600000-603420/603420


Dynamic-K submission written in 83.1s


In [33]:
# Blend SVC and LR (if available) via per-row z-score, add small prior, select theta to avg ~2.9, then decode
import os, csv, time, numpy as np, pandas as pd
from pathlib import Path

t0 = time.time()
cache_dir = Path('cache')
svc_path = cache_dir / 'test_svc_avg.dat'
lr_path  = cache_dir / 'test_lr_avg.dat'
kept_idx_path = cache_dir / 'kept_class_indices.npy'
mlb_classes_path = cache_dir / 'mlb_classes.npy'
assert svc_path.exists() and kept_idx_path.exists() and mlb_classes_path.exists(), 'Missing SVC artifacts'

n_test = test.shape[0]
itemsize = np.dtype('float32').itemsize
C_svc = os.path.getsize(svc_path) // (n_test * itemsize)
C_svc = int(C_svc)
print(f'Blended decode: SVC memmap ({n_test}, {C_svc})', flush=True)
has_lr = lr_path.exists()
C_lr = None
if has_lr:
    C_lr = int(os.path.getsize(lr_path) // (n_test * itemsize))
    print(f'LR memmap detected with {C_lr} classes (top-N by freq assumed).', flush=True)

keep_cls = np.load(kept_idx_path)
all_classes = np.load(mlb_classes_path, allow_pickle=True)
kept_class_names = all_classes[keep_cls]
cls_to_idx_all = {c:i for i,c in enumerate(all_classes)}

# Frequency prior over kept classes (smaller when blending)
alpha = 0.15 if has_lr else 0.20
print('Computing frequency prior over kept classes ...', flush=True)
freq_all = np.zeros(len(all_classes), dtype=np.int64)
batch = 200000
for start in range(0, train.shape[0], batch):
    end = min(start + batch, train.shape[0])
    tags_batch = train['Tags'].iloc[start:end].astype(str).tolist()
    for tstr in tags_batch:
        if not tstr: continue
        for t in tstr.split():
            idx = cls_to_idx_all.get(t)
            if idx is not None: freq_all[idx] += 1
freq_kept = freq_all[keep_cls]
logf = np.log1p(freq_kept).astype(np.float32)
mu, sd = float(logf.mean()), float(logf.std() + 1e-6)
prior = alpha * ((logf - mu) / sd).astype(np.float32)

# Prepare memmaps
SVC = np.memmap(str(svc_path), dtype='float32', mode='r', shape=(n_test, C_svc))
LR = None
lr_to_svc = None
if has_lr:
    LR = np.memmap(str(lr_path), dtype='float32', mode='r', shape=(n_test, C_lr))
    # Assume LR columns are top-N by global frequency over all classes; map to kept classes
    freq_order_all = np.argsort(-freq_all)[:C_lr]
    # Build mapping: for each frequent class in freq_order_all, if it is in keep_cls, map to svc column
    svc_pos = {int(k): i for i, k in enumerate(keep_cls)}
    lr_to_svc = []
    for j, cls_global in enumerate(freq_order_all):
        pos = svc_pos.get(int(cls_global))
        if pos is not None:
            lr_to_svc.append((j, pos))
    print(f'Aligned {len(lr_to_svc)}/{C_lr} LR columns to SVC kept classes', flush=True)

# Function to row-zscore
def row_zscore(M: np.ndarray, std_floor: float = 1e-6):
    m = M.mean(axis=1, keepdims=True)
    st = M.std(axis=1, keepdims=True)
    st = np.maximum(st, std_floor)
    Z = (M - m) / st
    return Z

# Choose global theta to match avg ~2.9 using a 100k sample
print('Selecting global theta (blend) to target avg ~2.9 ...', flush=True)
target_avg = 2.9
grid = np.linspace(-0.5, 2.0, 26).astype(np.float32)
sample_n = min(100000, n_test)
row_batch = 50000
best_theta, best_diff = None, 1e9
for theta in grid:
    total = 0.0; seen = 0
    for s in range(0, sample_n, row_batch):
        e = min(s + row_batch, sample_n)
        S_batch = np.asarray(SVC[s:e, :], dtype=np.float32)
        Zs = row_zscore(S_batch, std_floor=1.0)
        if has_lr and lr_to_svc:
            L_batch = np.asarray(LR[s:e, :], dtype=np.float32)
            Zl = row_zscore(L_batch, std_floor=1.0)
            # blend into SVC-sized matrix
            # Start with SVC z-scores
            B = Zs
            # For aligned columns, blend 0.7*Zs + 0.3*Zl
            if lr_to_svc:
                js, ks = zip(*lr_to_svc) if len(lr_to_svc) else ([], [])
                if js:
                    js = np.array(js, dtype=np.int32); ks = np.array(ks, dtype=np.int32)
                    B[:, ks] = 0.7 * B[:, ks] + 0.3 * Zl[:, js]
        else:
            B = Zs
        Zp = B + prior[None, :]
        cnt = np.count_nonzero(Zp >= theta, axis=1).astype(np.float32)
        total += float(cnt.sum()); seen += (e - s)
    avg = total / max(1, seen)
    diff = abs(avg - target_avg)
    print(f'  theta={float(theta):.2f} -> avg tags {avg:.3f}', flush=True)
    if diff < best_diff:
        best_diff, best_theta = diff, float(theta)
if best_theta is None:
    best_theta = 0.30
print(f'Chosen theta={best_theta:.2f} (diff={best_diff:.3f})', flush=True)

# Decode full test
print('Writing submission.csv (blended) ...', flush=True)
id_col = test['Id'].values
K_fallback = 3
min_k = 2
max_tags = 6
with open('submission.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['Id', 'Tags'])
    for start in range(0, n_test, row_batch):
        end = min(start + row_batch, n_test)
        S_batch = np.asarray(SVC[start:end, :], dtype=np.float32)
        Zs = row_zscore(S_batch, std_floor=1.0)
        if has_lr and lr_to_svc:
            L_batch = np.asarray(LR[start:end, :], dtype=np.float32)
            Zl = row_zscore(L_batch, std_floor=1.0)
            B = Zs
            if lr_to_svc:
                js, ks = zip(*lr_to_svc) if len(lr_to_svc) else ([], [])
                if js:
                    js = np.array(js, dtype=np.int32); ks = np.array(ks, dtype=np.int32)
                    B[:, ks] = 0.7 * B[:, ks] + 0.3 * Zl[:, js]
        else:
            B = Zs
        Zp = B + prior[None, :]
        for i in range(Zp.shape[0]):
            row = Zp[i]
            idx = np.flatnonzero(row >= best_theta)
            if idx.size == 0:
                base = S_batch[i]
                topk = np.argpartition(base, -K_fallback)[-K_fallback:]
                idx = topk[np.argsort(-base[topk])]
            elif idx.size == 1 and min_k >= 2:
                base = S_batch[i]
                top2 = np.argpartition(base, -2)[-2:]
                idx = top2[np.argsort(-base[top2])]
            else:
                idx = idx[np.argsort(-row[idx])]
            if idx.size > max_tags:
                idx = idx[:max_tags]
            tags = ' '.join(kept_class_names[idx].tolist())
            w.writerow([int(id_col[start + i]), tags])
        print(f'  wrote rows {start}-{end}/{n_test}', flush=True)
print(f'Blended submission written in {time.time()-t0:.1f}s', flush=True)

Blended decode: SVC memmap (603420, 20000)


LR memmap detected with 30000 classes (top-N by freq assumed).


Computing frequency prior over kept classes ...


Aligned 20000/30000 LR columns to SVC kept classes


Selecting global theta (blend) to target avg ~2.9 ...


  theta=-0.50 -> avg tags 20000.000


  theta=-0.40 -> avg tags 20000.000


  theta=-0.30 -> avg tags 20000.000


  theta=-0.20 -> avg tags 20000.000


  theta=-0.10 -> avg tags 13721.999


  theta=0.00 -> avg tags 8128.000


  theta=0.10 -> avg tags 4389.000


  theta=0.20 -> avg tags 2214.000


  theta=0.30 -> avg tags 960.000


  theta=0.40 -> avg tags 380.000


  theta=0.50 -> avg tags 130.000


  theta=0.60 -> avg tags 40.000


  theta=0.70 -> avg tags 19.000


  theta=0.80 -> avg tags 6.000


  theta=0.90 -> avg tags 0.000


  theta=1.00 -> avg tags 0.000


  theta=1.10 -> avg tags 0.000


  theta=1.20 -> avg tags 0.000


  theta=1.30 -> avg tags 0.000


  theta=1.40 -> avg tags 0.000


  theta=1.50 -> avg tags 0.000


  theta=1.60 -> avg tags 0.000


  theta=1.70 -> avg tags 0.000


  theta=1.80 -> avg tags 0.000


  theta=1.90 -> avg tags 0.000


  theta=2.00 -> avg tags 0.000


Chosen theta=0.90 (diff=2.900)


Writing submission.csv (blended) ...


  wrote rows 0-50000/603420


  wrote rows 50000-100000/603420


  wrote rows 100000-150000/603420


  wrote rows 150000-200000/603420


  wrote rows 200000-250000/603420


  wrote rows 250000-300000/603420


  wrote rows 300000-350000/603420


  wrote rows 350000-400000/603420


  wrote rows 400000-450000/603420


  wrote rows 450000-500000/603420


  wrote rows 500000-550000/603420


  wrote rows 550000-600000/603420


  wrote rows 600000-603420/603420


Blended submission written in 2212.2s


In [34]:
# Quick variant: Dynamic-K with higher recall (base_k=2, gap_thr=0.15), std floor, prior alpha=0.10
import os, csv, time, numpy as np, pandas as pd
from pathlib import Path

t0 = time.time()
cache_dir = Path('cache')
test_path = cache_dir / 'test_svc_avg.dat'
kept_idx_path = cache_dir / 'kept_class_indices.npy'
mlb_classes_path = cache_dir / 'mlb_classes.npy'
assert test_path.exists() and kept_idx_path.exists() and mlb_classes_path.exists(), 'Missing cached artifacts'

n_test = test.shape[0]
itemsize = np.dtype('float32').itemsize
n_classes = os.path.getsize(test_path) // (n_test * itemsize)
n_classes = int(n_classes)
print(f'Dynamic-K (recall) decode. Test memmap: ({n_test}, {n_classes})', flush=True)

keep_cls = np.load(kept_idx_path)
all_classes = np.load(mlb_classes_path, allow_pickle=True)
kept_class_names = all_classes[keep_cls]
cls_to_idx = {c:i for i,c in enumerate(all_classes)}
test_avg = np.memmap(str(test_path), dtype='float32', mode='r', shape=(n_test, n_classes))

# Small frequency prior (alpha=0.10)
alpha = 0.10
freq_all = np.zeros(len(all_classes), dtype=np.int64)
batch = 200000
for start in range(0, train.shape[0], batch):
    end = min(start + batch, train.shape[0])
    tags_batch = train['Tags'].iloc[start:end].astype(str).tolist()
    for tstr in tags_batch:
        if not tstr: continue
        for t in tstr.split():
            idx = cls_to_idx.get(t)
            if idx is not None:
                freq_all[idx] += 1
freq_kept = freq_all[keep_cls]
logf = np.log1p(freq_kept).astype(np.float32)
mu, sd = float(logf.mean()), float(logf.std() + 1e-6)
prior = alpha * ((logf - mu) / sd).astype(np.float32)

# Dynamic-K params (more recall)
base_k = 2
max_k = 6
gap_thr = 0.15
min_k = 2
max_tags = 6
row_batch = 50000
id_col = test['Id'].values

def row_zscore_floor(M: np.ndarray, std_floor: float = 1.0):
    m = M.mean(axis=1, keepdims=True)
    st = M.std(axis=1, keepdims=True)
    st = np.maximum(st, std_floor)
    Z = (M - m) / st
    return np.clip(Z, -5.0, 5.0)

def decode_rows(S: np.ndarray, prior: np.ndarray):
    B, C = S.shape
    Z = row_zscore_floor(S, std_floor=1.0)
    Zp = Z + prior[None, :]
    out_idx = []
    for i in range(B):
        row = Zp[i]
        k_take = min(max_k + 1, row.shape[0])
        top_idx = np.argpartition(row, -k_take)[-k_take:]
        top_idx = top_idx[np.argsort(-row[top_idx])]
        zrow = Z[i]
        z_top = zrow[top_idx[:min(6, zrow.shape[0])]]
        z_std = float(np.std(z_top)) if z_top.size > 0 else 1.0
        if z_std < 1e-6: z_std = 1.0
        K = base_k
        while K < max_k and (K < top_idx.size):
            s_k = row[top_idx[K-1]]
            s_next = row[top_idx[K]] if K < top_idx.size else -1e9
            gap = (s_k - s_next) / z_std
            if gap < gap_thr:
                K += 1
            else:
                break
        if K < min_k: K = min_k
        sel = top_idx[:K]
        out_idx.append(sel)
    return out_idx

out_path = 'submission.csv'
with open(out_path, 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['Id', 'Tags'])
    for start in range(0, n_test, row_batch):
        end = min(start + row_batch, n_test)
        S = np.asarray(test_avg[start:end, :], dtype=np.float32)
        decoded = decode_rows(S, prior)
        for i, idx in enumerate(decoded):
            if idx.size > max_tags:
                idx = idx[:max_tags]
            tags = ' '.join(kept_class_names[idx].tolist())
            w.writerow([int(id_col[start + i]), tags])
        print(f'  wrote rows {start}-{end}/{n_test}', flush=True)
print(f'Dynamic-K (recall) submission written in {time.time()-t0:.1f}s', flush=True)

Dynamic-K (recall) decode. Test memmap: (603420, 20000)


  wrote rows 0-50000/603420


  wrote rows 50000-100000/603420


  wrote rows 100000-150000/603420


  wrote rows 150000-200000/603420


  wrote rows 200000-250000/603420


  wrote rows 250000-300000/603420


  wrote rows 300000-350000/603420


  wrote rows 350000-400000/603420


  wrote rows 400000-450000/603420


  wrote rows 450000-500000/603420


KeyboardInterrupt: 