# Plan: LMSYS Chatbot Arena Preference Prediction

Goals:
- Establish environment (GPU check) and robust baseline quickly
- Build deterministic CV mirroring test
- Fast baseline: text-only TF-IDF + linear/logistic models; add model/meta features
- Iterate with calibrated models and blends (e.g., LR, Linear SVM, NB-SVM, XGBoost on sparse)
- Cache features and OOF/test logits; error analysis loop

Initial Milestones:
1) Env check + data loading sanity
2) EDA: target distribution, columns, text lengths, missingness
3) CV protocol: StratifiedKFold on target with fixed seed; save folds
4) Baseline v1: TF-IDF on prompts + responses; simple linear model, class_weight balanced
5) Baseline v2: add engineered features (lengths, punctuation, toxicity/sentiment proxies if quick), per-position features to counter position bias
6) Calibrate (Platt/isotonic) and blend diverse seeds/models
7) Generate submission; iterate via OOF diagnostics

Discipline:
- Log timings per fold; cache sparse matrices
- Fit transforms inside folds only; avoid leakage
- Request expert review after baseline and major changes

Next action: run environment check and peek at data heads.

In [1]:
import os, sys, subprocess, time
import pandas as pd
import numpy as np
from datetime import datetime

def log(msg):
    print(f"[{datetime.utcnow().isoformat(timespec='seconds')}Z] {msg}", flush=True)

# 1) GPU environment check
log("Running nvidia-smi (GPU check)...")
try:
    res = subprocess.run(['bash','-lc','nvidia-smi || true'], capture_output=True, text=True, timeout=30)
    print(res.stdout)
except Exception as e:
    log(f"nvidia-smi failed: {e}")

# 2) Load data heads and shapes
log("Loading train.csv and test.csv heads...")
train_path = 'train.csv'
test_path = 'test.csv'\

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
log(f"train.shape={train.shape} test.shape={test.shape}")
log("train columns:")
print(train.columns.tolist())
log("test columns:")
print(test.columns.tolist())

# 3) Inspect target distribution
target_col = 'winner_model_b'
if target_col in train.columns:
    vc = train[target_col].value_counts(dropna=False)
    vcn = train[target_col].value_counts(normalize=True, dropna=False)
    log("Target counts:")
    print(vc)
    log("Target fractions:")
    print(vcn)
else:
    log(f"Target column {target_col} not found in train.csv")

# 4) Quick peek at text fields and lengths if present
text_cols = [c for c in train.columns if train[c].dtype == 'object']
log(f"Detected object (likely text) columns: {text_cols[:10]}{'...' if len(text_cols)>10 else ''}")
for c in text_cols[:5]:
    lens = train[c].fillna('').str.len()
    log(f"len({c}): mean={lens.mean():.1f} std={lens.std():.1f} min={lens.min()} p50={lens.median():.1f} p95={lens.quantile(0.95):.1f} max={lens.max()}")

log("Head(train):")
print(train.head(3))
log("Head(test):")
print(test.head(3))

[2025-09-24T20:34:51Z] Running nvidia-smi (GPU check)...


Wed Sep 24 20:34:51 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.144.06             Driver Version: 550.144.06     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A10-24Q                 On  |   00000002:00:00.0 Off |                    0 |
| N/A   N/A    P0             N/A /  N/A  |     128MiB /  24512MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

[2025-09-24T20:34:52Z] train.shape=(51729, 9) test.shape=(5748, 4)


[2025-09-24T20:34:52Z] train columns:


['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b', 'winner_model_a', 'winner_model_b', 'winner_tie']
[2025-09-24T20:34:52Z] test columns:


['id', 'prompt', 'response_a', 'response_b']
[2025-09-24T20:34:52Z] Target counts:


winner_model_b
0    34094
1    17635
Name: count, dtype: int64
[2025-09-24T20:34:52Z] Target fractions:


winner_model_b
0    0.659089
1    0.340911
Name: proportion, dtype: float64
[2025-09-24T20:34:52Z] Detected object (likely text) columns: ['model_a', 'model_b', 'prompt', 'response_a', 'response_b']


[2025-09-24T20:34:52Z] len(model_a): mean=14.4 std=4.7 min=6 p50=14.0 p95=24.0 max=30


[2025-09-24T20:34:52Z] len(model_b): mean=14.4 std=4.7 min=6 p50=14.0 p95=25.0 max=30


[2025-09-24T20:34:52Z] len(prompt): mean=367.0 std=1056.1 min=7 p50=96.0 p95=1472.0 max=33056


[2025-09-24T20:34:52Z] len(response_a): mean=1377.2 std=1518.6 min=4 p50=1076.0 p95=3708.0 max=54058


[2025-09-24T20:34:52Z] len(response_b): mean=1385.9 std=1546.9 min=4 p50=1083.0 p95=3696.0 max=53830


[2025-09-24T20:34:52Z] Head(train):


           id             model_a             model_b  \
0  2444074745      zephyr-7b-beta     llama-2-7b-chat   
1  1805535695  gpt-3.5-turbo-0613    llama-2-13b-chat   
2  2454781969    claude-instant-1  gpt-4-0125-preview   

                                              prompt  \
0  ["Can the Orca Cloud Security Platform detect ...   
1  ["Write 3 sensational twists for a thriller ",...   
2  ["Create some creatively mocking sentences abo...   

                                          response_a  \
0  ["Yes, the Orca Cloud Security Platform can de...   
1  ["1. The Protagonist's Best Friend is the Mast...   
2  ["I apologize, upon further reflection I don't...   

                                          response_b  winner_model_a  \
0  ["Yes, the Orca Cloud Security Platform can de...               0   
1  ["Sure, here are three sensational twists for ...               1   
2  ["Sure! Just remember, this is all in good fun...               0   

   winner_model_b  winner_tie  


           id                                             prompt  \
0  3297560222  ["What can you tell me about Maarten van Vulpe...   
1  2556155375  ["is cebu island a good place to travel to in ...   
2  1793939629  ["Hi, we've been trying to reach you about you...   

                                          response_a  \
0  ["Maarten van Vulpen (also spelled Marten or M...   
1  ["Yes, Cebu Island is a great place to visit i...   
2  ["I'm glad you reached out, but I must let you...   

                                          response_b  
0  ["Maarten van Vulpen is not a widely known pub...  
1  ["Cebu Island can be a good place to travel to...  
2  ["\"I understand that you're calling about my ...  


In [2]:
# Confirm submission format and prepare 3-class target + groups
import pandas as pd
import numpy as np
from hashlib import blake2b

def norm_prompt(s: str) -> str:
    if not isinstance(s, str):
        return ''
    # lightweight normalization: strip brackets/quotes often present in dataset, lowercase, collapse spaces
    t = s.strip()
    if t.startswith('["') and t.endswith('"]'):
        t = t[2:-2]
    t = t.replace('\n', ' ').replace('\r', ' ')
    t = ' '.join(t.split())
    return t.lower()

def hhash(*parts: str, nbytes: int = 8) -> int:
    h = blake2b(digest_size=nbytes)
    for p in parts:
        if p is None:
            p = ''
        if not isinstance(p, str):
            p = str(p)
        h.update(p.encode('utf-8', errors='ignore'))
        h.update(b'|')
    return int.from_bytes(h.digest(), 'little', signed=False)

sample_sub = pd.read_csv('sample_submission.csv')
print('sample_submission.columns:', sample_sub.columns.tolist())
assert ['id','winner_model_a','winner_model_b','winner_tie'] == sample_sub.columns.tolist(), 'Unexpected submission columns order'

# Build 3-class labels: 0=A wins, 1=B wins, 2=Tie
train = pd.read_csv('train.csv')
y_cols = ['winner_model_a','winner_model_b','winner_tie']
y_mat = train[y_cols].values.astype(int)
y = y_mat.argmax(axis=1)
cls_counts = pd.Series(y).value_counts().sort_index()
print('3-class counts (A,B,Tie):', cls_counts.to_dict())
print('3-class fractions:', (cls_counts/len(y)).round(4).to_dict())

# Grouping by prompt (normalized). For swap-aug later, we'll group by unordered pair; for now, prompt groups:
prompt_norm = train['prompt'].map(norm_prompt)
groups_prompt = prompt_norm.map(lambda s: hhash(s))
print('Unique prompt groups:', groups_prompt.nunique(), 'rows:', len(groups_prompt))

# Basic sanity: no model name features in test; confirm absence
test = pd.read_csv('test.csv')
print('Test has model_a/model_b?', {'model_a' in test.columns, 'model_b' in test.columns})

# Save quick artifacts for next steps (in-memory here, will rebuild in train pipeline)
del sample_sub

sample_submission.columns: ['id', 'winner_model_a', 'winner_model_b', 'winner_tie']


3-class counts (A,B,Tie): {0: 18074, 1: 17635, 2: 16020}
3-class fractions: {0: 0.3494, 1: 0.3409, 2: 0.3097}


Unique prompt groups: 46580 rows: 51729
Test has model_a/model_b? {False}


In [4]:
# Baseline v2: faster TF-IDF diffs + extra scalars + multinomial LR with StratifiedGroupKFold
import re, time
from time import perf_counter
import numpy as np
import pandas as pd
from hashlib import blake2b
from scipy import sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedGroupKFold

def log(msg):
    from datetime import datetime
    print(f"[{datetime.utcnow().isoformat(timespec='seconds')}Z] {msg}", flush=True)

def norm_prompt(s: str) -> str:
    if not isinstance(s, str):
        return ''
    t = s.strip()
    if t.startswith('["') and t.endswith('"]'):
        t = t[2:-2]
    t = t.replace('\n', ' ').replace('\r', ' ')
    t = ' '.join(t.split())
    return t.lower()

def hhash(*parts: str, nbytes: int = 8) -> int:
    h = blake2b(digest_size=nbytes)
    for p in parts:
        if p is None:
            p = ''
        if not isinstance(p, str):
            p = str(p)
        h.update(p.encode('utf-8', errors='ignore'))
        h.update(b'|')
    return int.from_bytes(h.digest(), 'little', signed=False)

def truncate_head_tail(s: str, head: int = 4000, tail: int = 1000) -> str:
    if not isinstance(s, str):
        return ''
    if len(s) <= head + tail:
        return s
    return s[:head] + s[-tail:]

# Regexes for numeric/stylometry counts
re_url = re.compile(r'https?://|www\.')
re_listline = re.compile(r'(?m)^(?:\s*[-*\u2022])')
re_digit = re.compile(r'\d')
re_codefence = re.compile(r'```')
re_quote = re.compile(r'"|\u201c|\u201d|\'')
re_refusal = re.compile(r"\b(i\s+cannot|i\s+can\'t|i\s+cant|sorry|apologize|unable|policy|safety|as an ai)\b", re.I)

FEATS = ['loglen_char','loglen_word','url','newline','qmark','exclam','listmark','digit','code','quote','refusal']

def basic_counts(s: str):
    if not isinstance(s, str):
        s = ''
    return {
        'loglen_char': np.log1p(len(s)),
        'loglen_word': np.log1p(len(s.split())),
        'url': len(re_url.findall(s)),
        'newline': s.count('\n'),
        'qmark': s.count('?'),
        'exclam': s.count('!'),
        'listmark': len(re_listline.findall(s)),
        'digit': len(re_digit.findall(s)),
        'code': len(re_codefence.findall(s)),
        'quote': len(re_quote.findall(s)),
        'refusal': len(re_refusal.findall(s)),
    }

def counts_array(texts):
    n = len(texts)
    M = np.zeros((n, len(FEATS)), dtype=np.float32)
    for i, s in enumerate(texts):
        c = basic_counts(s)
        for j, f in enumerate(FEATS):
            M[i, j] = c[f]
    return M

def cosine_rows(X, Y):
    # Inputs are L2-normalized TF-IDF; cosine = dot product
    return np.asarray(X.multiply(Y).sum(axis=1)).ravel()

# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# 3-class labels
y_cols = ['winner_model_a','winner_model_b','winner_tie']
y = train[y_cols].values.argmax(axis=1)

# Preprocess texts (truncate head+tail)
prompt_tr = train['prompt'].astype(str).map(truncate_head_tail)
pa_tr = train['response_a'].astype(str).map(truncate_head_tail)
pb_tr = train['response_b'].astype(str).map(truncate_head_tail)
prompt_tr_te = test['prompt'].astype(str).map(truncate_head_tail)
ra_tr_te = test['response_a'].astype(str).map(truncate_head_tail)
rb_tr_te = test['response_b'].astype(str).map(truncate_head_tail)

# Precompute numeric/stylometry counts once (A and B) for train and test
t_counts0 = perf_counter()
A_counts = counts_array(pa_tr.tolist())  # (n_train, k)
B_counts = counts_array(pb_tr.tolist())
A_counts_te = counts_array(ra_tr_te.tolist())
B_counts_te = counts_array(rb_tr_te.tolist())
log(f"Precomputed counts in {perf_counter()-t_counts0:.1f}s")

# Groups by normalized prompt
groups = train['prompt'].map(norm_prompt).map(lambda s: hhash(s))

cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros((len(train), 3), dtype=np.float32)
test_pred = np.zeros((len(test), 3), dtype=np.float32)

start_all = perf_counter()
for fold, (tr_idx, va_idx) in enumerate(cv.split(train, y, groups=groups)):
    t0 = perf_counter()
    log(f"Fold {fold} start: tr={len(tr_idx)} va={len(va_idx)}")
    resp_tr_corpus = pd.concat([pa_tr.iloc[tr_idx], pb_tr.iloc[tr_idx]], axis=0).tolist()

    # TF-IDF vectorizers (reduced caps for speed) fit on train-fold responses
    t_vec = perf_counter()
    tfidf_char = TfidfVectorizer(analyzer='char', ngram_range=(3,6), min_df=5, max_features=200000,
                                 sublinear_tf=True, dtype=np.float32, norm='l2')
    _ = tfidf_char.fit_transform(resp_tr_corpus)
    tfidf_word = TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=5, max_features=120000,
                                 sublinear_tf=True, dtype=np.float32, lowercase=True, token_pattern=r"(?u)\b\w+\b", norm='l2')
    _ = tfidf_word.fit_transform(resp_tr_corpus)
    log(f"Fold {fold} vectorizers fit in {perf_counter()-t_vec:.1f}s")

    # Transform A/B train and valid
    Xa_c_tr = tfidf_char.transform(pa_tr.iloc[tr_idx])
    Xb_c_tr = tfidf_char.transform(pb_tr.iloc[tr_idx])
    Xa_w_tr = tfidf_word.transform(pa_tr.iloc[tr_idx])
    Xb_w_tr = tfidf_word.transform(pb_tr.iloc[tr_idx])
    Xa_c_va = tfidf_char.transform(pa_tr.iloc[va_idx])
    Xb_c_va = tfidf_char.transform(pb_tr.iloc[va_idx])
    Xa_w_va = tfidf_word.transform(pa_tr.iloc[va_idx])
    Xb_w_va = tfidf_word.transform(pb_tr.iloc[va_idx])

    # Prompt sims using word TF-IDF
    Xp_w_tr = tfidf_word.transform(prompt_tr.iloc[tr_idx])
    Xp_w_va = tfidf_word.transform(prompt_tr.iloc[va_idx])
    sim_b_tr = cosine_rows(Xp_w_tr, Xb_w_tr)
    sim_a_tr = cosine_rows(Xp_w_tr, Xa_w_tr)
    sim_b_va = cosine_rows(Xp_w_va, Xb_w_va)
    sim_a_va = cosine_rows(Xp_w_va, Xa_w_va)
    sim_diff_tr = sp.csr_matrix((sim_b_tr - sim_a_tr).reshape(-1,1))
    sim_diff_va = sp.csr_matrix((sim_b_va - sim_a_va).reshape(-1,1))

    # Response-to-response similarity (symmetric, tie-friendly) using word TF-IDF
    cos_ab_tr = sp.csr_matrix(cosine_rows(Xa_w_tr, Xb_w_tr).reshape(-1,1))
    cos_ab_va = sp.csr_matrix(cosine_rows(Xa_w_va, Xb_w_va).reshape(-1,1))

    # Numeric diffs: diff, abs diff, and sum (all cheap scalars)
    A_tr = A_counts[tr_idx]; B_tr = B_counts[tr_idx]
    A_va = A_counts[va_idx]; B_va = B_counts[va_idx]
    diff_tr = (B_tr - A_tr).astype(np.float32)
    diff_va = (B_va - A_va).astype(np.float32)
    adiff_tr = np.abs(diff_tr).astype(np.float32)
    adiff_va = np.abs(diff_va).astype(np.float32)
    sum_tr = (A_tr + B_tr).astype(np.float32)
    sum_va = (A_va + B_va).astype(np.float32)
    num_tr = sp.csr_matrix(np.hstack([diff_tr, adiff_tr, sum_tr]))
    num_va = sp.csr_matrix(np.hstack([diff_va, adiff_va, sum_va]))

    # Final sparse stacks: anti-symmetric TF-IDF diffs + sims + numeric blocks
    X_tr = sp.hstack([Xb_c_tr - Xa_c_tr, Xb_w_tr - Xa_w_tr, sim_diff_tr, cos_ab_tr, num_tr], format='csr')
    X_va = sp.hstack([Xb_c_va - Xa_c_va, Xb_w_va - Xa_w_va, sim_diff_va, cos_ab_va, num_va], format='csr')

    # Model (faster tol)
    t_fit = perf_counter()
    clf = LogisticRegression(multi_class='multinomial', solver='saga', C=2.0, max_iter=1000, tol=1e-3, n_jobs=-1, verbose=0)
    clf.fit(X_tr, y[tr_idx])
    log(f"Fold {fold} model fit in {perf_counter()-t_fit:.1f}s")
    oof_fold = clf.predict_proba(X_va).astype(np.float32)
    oof[va_idx] = oof_fold
    ll = log_loss(y[va_idx], oof_fold, labels=[0,1,2])
    log(f"Fold {fold} logloss={ll:.5f} elapsed={perf_counter()-t0:.1f}s")

    # Test transform and predict for this fold
    Xa_c_te = tfidf_char.transform(ra_tr_te)
    Xb_c_te = tfidf_char.transform(rb_tr_te)
    Xa_w_te = tfidf_word.transform(ra_tr_te)
    Xb_w_te = tfidf_word.transform(rb_tr_te)
    Xp_w_te = tfidf_word.transform(prompt_tr_te)
    sim_b_te = cosine_rows(Xp_w_te, Xb_w_te)
    sim_a_te = cosine_rows(Xp_w_te, Xa_w_te)
    sim_diff_te = sp.csr_matrix((sim_b_te - sim_a_te).reshape(-1,1))
    cos_ab_te = sp.csr_matrix(cosine_rows(Xa_w_te, Xb_w_te).reshape(-1,1))
    diff_te = (B_counts_te - A_counts_te).astype(np.float32)
    adiff_te = np.abs(diff_te).astype(np.float32)
    sum_te = (A_counts_te + B_counts_te).astype(np.float32)
    num_te = sp.csr_matrix(np.hstack([diff_te, adiff_te, sum_te]))
    X_te = sp.hstack([Xb_c_te - Xa_c_te, Xb_w_te - Xa_w_te, sim_diff_te, cos_ab_te, num_te], format='csr')
    test_pred += clf.predict_proba(X_te).astype(np.float32) / cv.n_splits

# OOF logloss
oof_ll = log_loss(y, oof, labels=[0,1,2])
log(f"OOF logloss={oof_ll:.5f}; total elapsed={perf_counter()-start_all:.1f}s")

# Build submission
sub = pd.DataFrame({
    'id': test['id'].values,
    'winner_model_a': test_pred[:,0],
    'winner_model_b': test_pred[:,1],
    'winner_tie': test_pred[:,2],
})
# Probability hygiene: clip and renormalize
eps = 1e-15
probs = sub[['winner_model_a','winner_model_b','winner_tie']].values
probs = np.clip(probs, eps, 1 - eps)
probs /= probs.sum(axis=1, keepdims=True)
sub[['winner_model_a','winner_model_b','winner_tie']] = probs
sub.to_csv('submission.csv', index=False)
log('Wrote submission.csv')

[2025-09-24T21:17:35Z] Precomputed counts in 9.9s


[2025-09-24T21:17:44Z] Fold 0 start: tr=41174 va=10555


[2025-09-24T21:20:37Z] Fold 0 vectorizers fit in 172.6s




[2025-09-24T21:48:29Z] Fold 0 model fit in 1470.8s


[2025-09-24T21:48:29Z] Fold 0 logloss=1.07755 elapsed=1845.0s


[2025-09-24T21:48:52Z] Fold 1 start: tr=41276 va=10453


[2025-09-24T21:51:47Z] Fold 1 vectorizers fit in 174.8s




[2025-09-24T22:19:37Z] Fold 1 model fit in 1469.8s


[2025-09-24T22:19:37Z] Fold 1 logloss=1.07522 elapsed=1845.4s


[2025-09-24T22:20:00Z] Fold 2 start: tr=41451 va=10278


[2025-09-24T22:22:53Z] Fold 2 vectorizers fit in 173.2s




[2025-09-24T22:50:24Z] Fold 2 model fit in 1451.0s


[2025-09-24T22:50:24Z] Fold 2 logloss=1.07808 elapsed=1824.4s


[2025-09-24T22:50:47Z] Fold 3 start: tr=41598 va=10131


[2025-09-24T22:53:44Z] Fold 3 vectorizers fit in 176.3s




[2025-09-24T23:18:56Z] Fold 3 model fit in 1311.3s


[2025-09-24T23:18:56Z] Fold 3 logloss=1.07603 elapsed=1688.5s


[2025-09-24T23:19:18Z] Fold 4 start: tr=41417 va=10312


[2025-09-24T23:22:14Z] Fold 4 vectorizers fit in 175.2s




[2025-09-24T23:47:33Z] Fold 4 model fit in 1318.9s


[2025-09-24T23:47:33Z] Fold 4 logloss=1.07832 elapsed=1694.4s


[2025-09-24T23:47:55Z] OOF logloss=1.07704; total elapsed=9020.5s


[2025-09-24T23:47:55Z] Wrote submission.csv


In [5]:
# Post-hoc multiclass temperature scaling calibration on OOF; apply to test_pred and rewrite submission
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss

def apply_temp_scaling(probs: np.ndarray, T: float) -> np.ndarray:
    # Prob-power temperature scaling: p_i^(1/T) and renormalize per row
    eps = 1e-15
    P = np.clip(probs, eps, 1 - eps).astype(np.float64)
    P_pow = np.power(P, 1.0 / max(T, 1e-6))
    P_pow /= P_pow.sum(axis=1, keepdims=True)
    return P_pow.astype(np.float32)

def find_best_T(oof_probs: np.ndarray, y_true: np.ndarray) -> float:
    # Optimize T > 0 by 1D search on log T
    def nll_from_logT(logT: float) -> float:
        T = float(np.exp(logT))
        P = apply_temp_scaling(oof_probs, T)
        return log_loss(y_true, P, labels=[0,1,2])
    # Coarse grid over logT in [-2.0, 2.0]
    grid = np.linspace(-2.0, 2.0, 41)
    vals = [nll_from_logT(g) for g in grid]
    best_idx = int(np.argmin(vals))
    best_logT = grid[best_idx]
    best_val = vals[best_idx]
    # Local refine around best
    for _ in range(3):
        lo = max(-5.0, best_logT - 0.5)
        hi = min(5.0, best_logT + 0.5)
        grid = np.linspace(lo, hi, 21)
        vals = [nll_from_logT(g) for g in grid]
        best_idx = int(np.argmin(vals))
        best_logT = grid[best_idx]
        best_val = vals[best_idx]
    return float(np.exp(best_logT))

assert 'oof' in globals() and 'y' in globals() and 'test_pred' in globals(), 'Run training cell first to define oof, y, test_pred'
base_oof_ll = log_loss(y, oof, labels=[0,1,2])
print(f'Base OOF logloss (uncalibrated): {base_oof_ll:.6f}')
T_opt = find_best_T(oof, y)
print(f'Optimal temperature T: {T_opt:.4f}')
oof_cal = apply_temp_scaling(oof, T_opt)
cal_oof_ll = log_loss(y, oof_cal, labels=[0,1,2])
print(f'Calibrated OOF logloss: {cal_oof_ll:.6f}')

# Apply to test_pred and rewrite submission.csv
test_cal = apply_temp_scaling(test_pred, T_opt)
sub = pd.DataFrame({
    'id': pd.read_csv('test.csv')['id'].values,
    'winner_model_a': test_cal[:,0],
    'winner_model_b': test_cal[:,1],
    'winner_tie': test_cal[:,2],
})
eps = 1e-15
probs = sub[['winner_model_a','winner_model_b','winner_tie']].values
probs = np.clip(probs, eps, 1 - eps)
probs /= probs.sum(axis=1, keepdims=True)
sub[['winner_model_a','winner_model_b','winner_tie']] = probs
sub.to_csv('submission.csv', index=False)
print('Rewrote submission.csv with temperature-scaled probabilities')

Base OOF logloss (uncalibrated): 1.077041


Optimal temperature T: 0.7047
Calibrated OOF logloss: 1.075130
Rewrote submission.csv with temperature-scaled probabilities


In [7]:
# Baseline v3: Faster SGDClassifier, richer features, per-fold isotonic calibration
import json, re, time
from time import perf_counter
import numpy as np
import pandas as pd
from hashlib import blake2b
from scipy import sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.calibration import IsotonicRegression

def log(msg):
    from datetime import datetime
    print(f"[{datetime.utcnow().isoformat(timespec='seconds')}Z] {msg}", flush=True)

def norm_prompt_for_group(s: str) -> str:
    if not isinstance(s, str):
        return ''
    t = s.strip()
    # Try JSON list parsing (multi-turn); fallback to simple cleanup
    try:
        obj = json.loads(t)
        if isinstance(obj, list):
            t = ' [TURN] '.join(map(str, obj))
        elif isinstance(obj, str):
            t = obj
    except Exception:
        pass
    t = t.replace('\r', ' ').replace('\n', ' ')
    t = ' '.join(t.split())
    return t.lower()

def hhash(*parts: str, nbytes: int = 8) -> int:
    h = blake2b(digest_size=nbytes)
    for p in parts:
        if p is None:
            p = ''
        if not isinstance(p, str):
            p = str(p)
        h.update(p.encode('utf-8', errors='ignore'))
        h.update(b'|')
    return int.from_bytes(h.digest(), 'little', signed=False)

def truncate_head_tail(s: str, head: int = 4000, tail: int = 1000) -> str:
    if not isinstance(s, str):
        return ''
    if len(s) <= head + tail:
        return s
    return s[:head] + s[-tail:]

# Regexes and counters
re_url = re.compile(r'https?://|www\.')
re_listline = re.compile(r'(?m)^(?:\s*[-*\u2022])')
re_digit = re.compile(r'\d')
re_codefence = re.compile(r'```')
re_quote = re.compile(r'"|\u201c|\u201d|\'')
re_refusal = re.compile(r"\b(i\s+cannot|i\s+can\'t|i\s+cant|sorry|apologize|unable|policy|safety|as an ai)\b", re.I)
re_letter = re.compile(r'[A-Za-z]')
re_upper = re.compile(r'[A-Z]')
re_punct = re.compile(r'[\!\?\.,;:\-\(\)\[\]\{\}\"\'\`\~\/\\]')

FEATS = ['loglen_char','loglen_word','url','newline','qmark','exclam','listmark','digit','code','quote','refusal',
         'letters','uppers','punct']

def basic_counts(s: str):
    if not isinstance(s, str):
        s = ''
    letters = len(re_letter.findall(s))
    uppers = len(re_upper.findall(s))
    punct = len(re_punct.findall(s))
    return {
        'loglen_char': np.log1p(len(s)),
        'loglen_word': np.log1p(len(s.split())),
        'url': len(re_url.findall(s)),
        'newline': s.count('\n'),
        'qmark': s.count('?'),
        'exclam': s.count('!'),
        'listmark': len(re_listline.findall(s)),
        'digit': len(re_digit.findall(s)),
        'code': len(re_codefence.findall(s)),
        'quote': len(re_quote.findall(s)),
        'refusal': len(re_refusal.findall(s)),
        'letters': letters,
        'uppers': uppers,
        'punct': punct,
    }

def counts_array(texts):
    n = len(texts)
    M = np.zeros((n, len(FEATS)), dtype=np.float32)
    for i, s in enumerate(texts):
        c = basic_counts(s)
        for j, f in enumerate(FEATS):
            M[i, j] = c[f]
    return M

def cosine_rows(X, Y):
    return np.asarray(X.multiply(Y).sum(axis=1)).ravel()

def clean_proba(P):
    P = np.asarray(P, dtype=np.float64)
    P = np.nan_to_num(P, nan=1.0/3.0, posinf=1.0/3.0, neginf=1e-15)
    P = np.clip(P, 0.0, 1.0)
    rs = P.sum(axis=1, keepdims=True)
    rs[rs == 0] = 1.0
    P = P / rs
    return P

# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
y_cols = ['winner_model_a','winner_model_b','winner_tie']
y = train[y_cols].values.argmax(axis=1)

# Text preprocessing
prompt_tr = train['prompt'].astype(str).map(truncate_head_tail)
pa_tr = train['response_a'].astype(str).map(truncate_head_tail)
pb_tr = train['response_b'].astype(str).map(truncate_head_tail)
prompt_tr_te = test['prompt'].astype(str).map(truncate_head_tail)
ra_tr_te = test['response_a'].astype(str).map(truncate_head_tail)
rb_tr_te = test['response_b'].astype(str).map(truncate_head_tail)

# Precompute counts and simple ratios
t0 = perf_counter()
A_counts = counts_array(pa_tr.tolist())
B_counts = counts_array(pb_tr.tolist())
A_counts_te = counts_array(ra_tr_te.tolist())
B_counts_te = counts_array(rb_tr_te.tolist())
log(f"Counts computed in {perf_counter()-t0:.1f}s")

def build_scalar_blocks(Ac, Bc):
    # diff, abs diff, sum as before
    diff = (Bc - Ac).astype(np.float32)
    adiff = np.abs(diff).astype(np.float32)
    summ = (Ac + Bc).astype(np.float32)
    # ratios (avoid div-by-zero)
    eps = 1e-6
    # len ratios using first two features: loglen_char/loglen_word are logs; better use raw letters/word counts
    letters_A = Ac[:, FEATS.index('letters')]; letters_B = Bc[:, FEATS.index('letters')]
    words_A = np.expm1(Ac[:, FEATS.index('loglen_word')]); words_B = np.expm1(Bc[:, FEATS.index('loglen_word')])
    len_ratio = ((letters_B + eps) / (letters_A + eps)).reshape(-1,1).astype(np.float32)
    word_ratio = ((words_B + eps) / (words_A + eps)).reshape(-1,1).astype(np.float32)
    # caps ratio and punct density
    upp_A = Ac[:, FEATS.index('uppers')]; upp_B = Bc[:, FEATS.index('uppers')]
    let_A = letters_A; let_B = letters_B
    caps_ratio_A = ((upp_A + eps)/(let_A + eps)).reshape(-1,1)
    caps_ratio_B = ((upp_B + eps)/(let_B + eps)).reshape(-1,1)
    caps_diff = (caps_ratio_B - caps_ratio_A).astype(np.float32)
    caps_adiff = np.abs(caps_diff).astype(np.float32)
    punct_A = Ac[:, FEATS.index('punct')]; punct_B = Bc[:, FEATS.index('punct')]
    # use char length approx from letters + punct + digits (+ others ignored)
    approx_len_A = letters_A + punct_A + Ac[:, FEATS.index('digit')] + 1.0
    approx_len_B = letters_B + punct_B + Bc[:, FEATS.index('digit')] + 1.0
    pden_A = (punct_A / approx_len_A).reshape(-1,1)
    pden_B = (punct_B / approx_len_B).reshape(-1,1)
    pden_diff = (pden_B - pden_A).astype(np.float32)
    pden_adiff = np.abs(pden_diff).astype(np.float32)
    ratios = np.hstack([len_ratio, word_ratio, caps_diff, caps_adiff, pden_diff, pden_adiff]).astype(np.float32)
    return sp.csr_matrix(np.hstack([diff, adiff, summ, ratios]))

# Grouping by robust prompt parse
groups = train['prompt'].map(norm_prompt_for_group).map(lambda s: hhash(s))

cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros((len(train), 3), dtype=np.float32)
test_pred = np.zeros((len(test), 3), dtype=np.float32)

start_all = perf_counter()
for fold, (tr_idx, va_idx) in enumerate(cv.split(train, y, groups=groups)):
    t_fold = perf_counter()
    log(f"Fold {fold} start tr={len(tr_idx)} va={len(va_idx)}")
    resp_tr_corpus = pd.concat([pa_tr.iloc[tr_idx], pb_tr.iloc[tr_idx]], axis=0).tolist()
    # TF-IDF with min_df=3 and tighter caps; char 3-5, word 1-2
    t_vec = perf_counter()
    tfidf_char = TfidfVectorizer(analyzer='char', ngram_range=(3,5), min_df=3, max_features=150000,
                                 sublinear_tf=True, dtype=np.float32, norm='l2')
    _ = tfidf_char.fit_transform(resp_tr_corpus)
    tfidf_word = TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=3, max_features=150000,
                                 sublinear_tf=True, dtype=np.float32, lowercase=True, token_pattern=r"(?u)\b\w+\b", norm='l2')
    _ = tfidf_word.fit_transform(resp_tr_corpus)
    log(f"Fold {fold} vec fit {perf_counter()-t_vec:.1f}s")

    # Transform A/B
    Xa_c_tr = tfidf_char.transform(pa_tr.iloc[tr_idx]); Xb_c_tr = tfidf_char.transform(pb_tr.iloc[tr_idx])
    Xa_w_tr = tfidf_word.transform(pa_tr.iloc[tr_idx]); Xb_w_tr = tfidf_word.transform(pb_tr.iloc[tr_idx])
    Xa_c_va = tfidf_char.transform(pa_tr.iloc[va_idx]); Xb_c_va = tfidf_char.transform(pb_tr.iloc[va_idx])
    Xa_w_va = tfidf_word.transform(pa_tr.iloc[va_idx]); Xb_w_va = tfidf_word.transform(pb_tr.iloc[va_idx])

    # Prompt sims (word TF-IDF)
    Xp_w_tr = tfidf_word.transform(prompt_tr.iloc[tr_idx])
    Xp_w_va = tfidf_word.transform(prompt_tr.iloc[va_idx])
    sim_pa_tr = cosine_rows(Xp_w_tr, Xa_w_tr)
    sim_pb_tr = cosine_rows(Xp_w_tr, Xb_w_tr)
    sim_pa_va = cosine_rows(Xp_w_va, Xa_w_va)
    sim_pb_va = cosine_rows(Xp_w_va, Xb_w_va)
    sim_diff_tr = sp.csr_matrix((sim_pb_tr - sim_pa_tr).reshape(-1,1))
    sim_diff_va = sp.csr_matrix((sim_pb_va - sim_pa_va).reshape(-1,1))
    cos_pa_tr = sp.csr_matrix(sim_pa_tr.reshape(-1,1))
    cos_pb_tr = sp.csr_matrix(sim_pb_tr.reshape(-1,1))
    cos_pa_va = sp.csr_matrix(sim_pa_va.reshape(-1,1))
    cos_pb_va = sp.csr_matrix(sim_pb_va.reshape(-1,1))
    # Response-to-response sim
    cos_ab_tr = sp.csr_matrix(cosine_rows(Xa_w_tr, Xb_w_tr).reshape(-1,1))
    cos_ab_va = sp.csr_matrix(cosine_rows(Xa_w_va, Xb_w_va).reshape(-1,1))

    # Scalars
    num_tr = build_scalar_blocks(A_counts[tr_idx], B_counts[tr_idx])
    num_va = build_scalar_blocks(A_counts[va_idx], B_counts[va_idx])

    # Sparse stacks: anti-sym char+word diffs; add |B-A| for word only; add sims and scalars
    w_diff_tr = (Xb_w_tr - Xa_w_tr)
    w_adiff_tr = abs(w_diff_tr)
    w_diff_va = (Xb_w_va - Xa_w_va)
    w_adiff_va = abs(w_diff_va)
    X_tr = sp.hstack([Xb_c_tr - Xa_c_tr, w_diff_tr, w_adiff_tr, sim_diff_tr, cos_ab_tr, cos_pa_tr, cos_pb_tr, num_tr], format='csr')
    X_va = sp.hstack([Xb_c_va - Xa_c_va, w_diff_va, w_adiff_va, sim_diff_va, cos_ab_va, cos_pa_va, cos_pb_va, num_va], format='csr')

    # Model: fast SGDClassifier with early stopping
    t_fit = perf_counter()
    clf = SGDClassifier(loss='log_loss', alpha=1e-4, penalty='l2',
                        early_stopping=True, n_iter_no_change=5, validation_fraction=0.1,
                        max_iter=1000, tol=1e-3, random_state=42, n_jobs=-1)
    clf.fit(X_tr, y[tr_idx])
    log(f"Fold {fold} SGD fit {perf_counter()-t_fit:.1f}s")

    # Predict val and calibrate per class using isotonic (OVR style)
    p_va = clf.predict_proba(X_va)
    p_va = clean_proba(p_va)
    y_va = y[va_idx]
    p_va_cal = np.zeros_like(p_va, dtype=np.float64)
    iso_models = []
    for c in range(3):
        # Guard against degenerate folds or NaNs
        x_c = p_va[:, c].astype(np.float64)
        if np.any(np.isnan(x_c)) or np.any(np.isinf(x_c)) or (y_va == c).sum() == 0 or (y_va == c).sum() == len(y_va):
            iso_models.append(None)
            p_va_cal[:, c] = x_c
            continue
        ir = IsotonicRegression(out_of_bounds='clip', y_min=0.0, y_max=1.0)
        ir.fit(x_c, (y_va == c).astype(np.float64))
        iso_models.append(ir)
        p_va_cal[:, c] = ir.predict(x_c)
    # clip and renorm
    eps = 1e-15
    p_va_cal = np.clip(p_va_cal, eps, 1 - eps)
    p_va_cal /= p_va_cal.sum(axis=1, keepdims=True)
    oof[va_idx] = p_va_cal.astype(np.float32)
    ll = log_loss(y_va, p_va_cal, labels=[0,1,2])
    log(f"Fold {fold} OOF-cal logloss={ll:.5f} total fold {perf_counter()-t_fold:.1f}s")

    # Test transform and predict for this fold
    Xa_c_te = tfidf_char.transform(ra_tr_te); Xb_c_te = tfidf_char.transform(rb_tr_te)
    Xa_w_te = tfidf_word.transform(ra_tr_te); Xb_w_te = tfidf_word.transform(rb_tr_te)
    Xp_w_te = tfidf_word.transform(prompt_tr_te)
    sim_pa_te = cosine_rows(Xp_w_te, Xa_w_te)
    sim_pb_te = cosine_rows(Xp_w_te, Xb_w_te)
    sim_diff_te = sp.csr_matrix((sim_pb_te - sim_pa_te).reshape(-1,1))
    cos_ab_te = sp.csr_matrix(cosine_rows(Xa_w_te, Xb_w_te).reshape(-1,1))
    cos_pa_te = sp.csr_matrix(sim_pa_te.reshape(-1,1))
    cos_pb_te = sp.csr_matrix(sim_pb_te.reshape(-1,1))
    num_te = build_scalar_blocks(A_counts_te, B_counts_te)
    w_diff_te = (Xb_w_te - Xa_w_te); w_adiff_te = abs(w_diff_te)
    X_te = sp.hstack([Xb_c_te - Xa_c_te, w_diff_te, w_adiff_te, sim_diff_te, cos_ab_te, cos_pa_te, cos_pb_te, num_te], format='csr')
    p_te = clf.predict_proba(X_te)
    p_te = clean_proba(p_te)
    # apply isotonic per class
    p_te_cal = np.zeros_like(p_te, dtype=np.float64)
    for c in range(3):
        if iso_models[c] is None:
            p_te_cal[:, c] = p_te[:, c]
        else:
            p_te_cal[:, c] = iso_models[c].predict(p_te[:, c])
    p_te_cal = np.clip(p_te_cal, eps, 1 - eps)
    p_te_cal /= p_te_cal.sum(axis=1, keepdims=True)
    test_pred += (p_te_cal.astype(np.float32) / cv.n_splits)

# Final OOF
oof_ll = log_loss(y, oof, labels=[0,1,2])
log(f"v3 OOF logloss={oof_ll:.5f}; total elapsed={perf_counter()-start_all:.1f}s")

# Write submission
sub = pd.DataFrame({
    'id': test['id'].values,
    'winner_model_a': test_pred[:,0],
    'winner_model_b': test_pred[:,1],
    'winner_tie': test_pred[:,2],
})
eps = 1e-15
probs = sub[['winner_model_a','winner_model_b','winner_tie']].values
probs = np.clip(probs, eps, 1 - eps)
probs /= probs.sum(axis=1, keepdims=True)
sub[['winner_model_a','winner_model_b','winner_tie']] = probs
sub.to_csv('submission.csv', index=False)
log('Wrote submission.csv (v3 calibrated)')

[2025-09-25T00:05:51Z] Counts computed in 19.4s


[2025-09-25T00:06:01Z] Fold 0 start tr=41206 va=10523


[2025-09-25T00:07:57Z] Fold 0 vec fit 116.0s


[2025-09-25T00:10:23Z] Fold 0 SGD fit 3.2s


[2025-09-25T00:10:23Z] Fold 0 OOF-cal logloss=1.08409 total fold 262.5s


  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))


[2025-09-25T00:10:40Z] Fold 1 start tr=41466 va=10263


  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))


[2025-09-25T00:12:34Z] Fold 1 vec fit 114.4s


[2025-09-25T00:14:58Z] Fold 1 SGD fit 2.4s


[2025-09-25T00:14:58Z] Fold 1 OOF-cal logloss=1.08711 total fold 258.7s


  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))


[2025-09-25T00:15:14Z] Fold 2 start tr=41493 va=10236


  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))


[2025-09-25T00:17:10Z] Fold 2 vec fit 116.1s


[2025-09-25T00:19:38Z] Fold 2 SGD fit 2.8s


[2025-09-25T00:19:38Z] Fold 2 OOF-cal logloss=1.09287 total fold 264.2s


  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))


[2025-09-25T00:19:55Z] Fold 3 start tr=41248 va=10481


  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))


[2025-09-25T00:21:52Z] Fold 3 vec fit 117.3s


[2025-09-25T00:24:17Z] Fold 3 SGD fit 2.5s


[2025-09-25T00:24:17Z] Fold 3 OOF-cal logloss=1.08772 total fold 262.1s


  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))


[2025-09-25T00:24:33Z] Fold 4 start tr=41503 va=10226


  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))


[2025-09-25T00:26:29Z] Fold 4 vec fit 116.0s


[2025-09-25T00:28:56Z] Fold 4 SGD fit 3.5s


[2025-09-25T00:28:56Z] Fold 4 OOF-cal logloss=1.08548 total fold 262.5s


  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))


[2025-09-25T00:29:12Z] v3 OOF logloss=1.08744; total elapsed=1400.4s


[2025-09-25T00:29:12Z] Wrote submission.csv (v3 calibrated)


  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))


In [8]:
# GPU setup: install cu121 torch stack and transformers; sanity check GPU
import os, sys, subprocess, shutil, time
import numpy as np
from pathlib import Path

def pip(*args):
    print('>', *args, flush=True)
    subprocess.run([sys.executable, '-m', 'pip', *args], check=True)

# Uninstall any stray torch stacks
for pkg in ('torch','torchvision','torchaudio'):
    subprocess.run([sys.executable, '-m', 'pip', 'uninstall', '-y', pkg], check=False)

# Clean common stray site dirs
for d in (
    '/app/.pip-target/torch',
    '/app/.pip-target/torchvision',
    '/app/.pip-target/torchaudio',
    '/app/.pip-target/torch-2.8.0.dist-info',
    '/app/.pip-target/torch-2.4.1.dist-info',
    '/app/.pip-target/torchvision-0.23.0.dist-info',
    '/app/.pip-target/torchaudio-2.4.1.dist-info',
):
    if os.path.exists(d):
        print('Removing', d, flush=True)
        shutil.rmtree(d, ignore_errors=True)

# Install exact cu121 torch stack
pip('install',
    '--index-url', 'https://download.pytorch.org/whl/cu121',
    '--extra-index-url', 'https://pypi.org/simple',
    'torch==2.4.1', 'torchvision==0.19.1', 'torchaudio==2.4.1')

# Constraints file to freeze torch versions
Path('constraints.txt').write_text('torch==2.4.1\ntorchvision==0.19.1\ntorchaudio==2.4.1\n')

# Install HF stack
pip('install', '-c', 'constraints.txt',
    'transformers==4.44.2', 'accelerate==0.34.2', 'datasets==2.21.0', 'evaluate==0.4.2',
    'sentencepiece', '--upgrade-strategy', 'only-if-needed')

import torch
print('torch:', torch.__version__, 'CUDA build:', getattr(torch.version, 'cuda', None))
print('CUDA available:', torch.cuda.is_available())
if torch.cuda.is_available():
    print('GPU:', torch.cuda.get_device_name(0))
else:
    raise SystemExit('CUDA not available; cannot train transformer')





> install --index-url https://download.pytorch.org/whl/cu121 --extra-index-url https://pypi.org/simple torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1




Looking in indexes: https://download.pytorch.org/whl/cu121, https://pypi.org/simple


Collecting torch==2.4.1


  Downloading https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp311-cp311-linux_x86_64.whl (799.0 MB)


Collecting torchvision==0.19.1
  Downloading https://download.pytorch.org/whl/cu121/torchvision-0.19.1%2Bcu121-cp311-cp311-linux_x86_64.whl (7.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.1/7.1 MB 446.7 MB/s eta 0:00:00


Collecting torchaudio==2.4.1
  Downloading https://download.pytorch.org/whl/cu121/torchaudio-2.4.1%2Bcu121-cp311-cp311-linux_x86_64.whl (3.4 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.4/3.4 MB 25.9 MB/s eta 0:00:00


Collecting nvidia-nccl-cu12==2.20.5
  Downloading nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl (176.2 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 176.2/176.2 MB 229.1 MB/s eta 0:00:00


Collecting nvidia-cusolver-cu12==11.4.5.107
  Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 124.2/124.2 MB 241.3 MB/s eta 0:00:00


Collecting triton==3.0.0
  Downloading triton-3.0.0-1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (209.4 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 209.4/209.4 MB 81.2 MB/s eta 0:00:00


Collecting sympy
  Downloading sympy-1.14.0-py3-none-any.whl (6.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.3/6.3 MB 539.0 MB/s eta 0:00:00


Collecting filelock
  Downloading filelock-3.19.1-py3-none-any.whl (15 kB)


Collecting jinja2
  Downloading jinja2-3.1.6-py3-none-any.whl (134 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.9/134.9 KB 508.4 MB/s eta 0:00:00


Collecting nvidia-cuda-cupti-cu12==12.1.105
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.1/14.1 MB 110.9 MB/s eta 0:00:00


Collecting nvidia-nvtx-cu12==12.1.105
  Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.1/99.1 KB 408.9 MB/s eta 0:00:00


Collecting nvidia-cublas-cu12==12.1.3.1
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 410.6/410.6 MB 187.9 MB/s eta 0:00:00


Collecting nvidia-cuda-nvrtc-cu12==12.1.105
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 23.7/23.7 MB 516.8 MB/s eta 0:00:00


Collecting networkx
  Downloading networkx-3.5-py3-none-any.whl (2.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 513.0 MB/s eta 0:00:00
Collecting nvidia-cufft-cu12==11.0.2.54
  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.6/121.6 MB 175.5 MB/s eta 0:00:00


Collecting nvidia-cuda-runtime-cu12==12.1.105
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 823.6/823.6 KB 293.4 MB/s eta 0:00:00
Collecting nvidia-curand-cu12==10.3.2.106
  Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.5/56.5 MB 294.6 MB/s eta 0:00:00


Collecting nvidia-cudnn-cu12==9.1.0.70
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 664.8/664.8 MB 231.1 MB/s eta 0:00:00


Collecting typing-extensions>=4.8.0
  Downloading typing_extensions-4.15.0-py3-none-any.whl (44 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 44.6/44.6 KB 377.1 MB/s eta 0:00:00


Collecting fsspec
  Downloading fsspec-2025.9.0-py3-none-any.whl (199 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 199.3/199.3 KB 520.5 MB/s eta 0:00:00
Collecting nvidia-cusparse-cu12==12.1.0.106
  Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 196.0/196.0 MB 225.1 MB/s eta 0:00:00


Collecting numpy
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.3/18.3 MB 531.9 MB/s eta 0:00:00


Collecting pillow!=8.3.*,>=5.3.0
  Downloading pillow-11.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (6.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.6/6.6 MB 124.4 MB/s eta 0:00:00


Collecting nvidia-nvjitlink-cu12
  Downloading nvidia_nvjitlink_cu12-12.9.86-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl (39.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 39.7/39.7 MB 208.7 MB/s eta 0:00:00


Collecting MarkupSafe>=2.0
  Downloading MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23 kB)


Collecting mpmath<1.4,>=1.1.0
  Downloading mpmath-1.3.0-py3-none-any.whl (536 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 536.2/536.2 KB 515.7 MB/s eta 0:00:00


Installing collected packages: mpmath, typing-extensions, sympy, pillow, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, numpy, networkx, MarkupSafe, fsspec, filelock, triton, nvidia-cusparse-cu12, nvidia-cudnn-cu12, jinja2, nvidia-cusolver-cu12, torch, torchvision, torchaudio


Successfully installed MarkupSafe-3.0.2 filelock-3.19.1 fsspec-2025.9.0 jinja2-3.1.6 mpmath-1.3.0 networkx-3.5 numpy-1.26.4 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-9.1.0.70 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.20.5 nvidia-nvjitlink-cu12-12.9.86 nvidia-nvtx-cu12-12.1.105 pillow-11.3.0 sympy-1.14.0 torch-2.4.1+cu121 torchaudio-2.4.1+cu121 torchvision-0.19.1+cu121 triton-3.0.0 typing-extensions-4.15.0


> install -c constraints.txt transformers==4.44.2 accelerate==0.34.2 datasets==2.21.0 evaluate==0.4.2 sentencepiece --upgrade-strategy only-if-needed


Collecting transformers==4.44.2
  Downloading transformers-4.44.2-py3-none-any.whl (9.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 9.5/9.5 MB 86.6 MB/s eta 0:00:00


Collecting accelerate==0.34.2
  Downloading accelerate-0.34.2-py3-none-any.whl (324 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 324.4/324.4 KB 470.9 MB/s eta 0:00:00
Collecting datasets==2.21.0
  Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 527.3/527.3 KB 513.5 MB/s eta 0:00:00
Collecting evaluate==0.4.2
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 84.1/84.1 KB 445.6 MB/s eta 0:00:00
Collecting sentencepiece
  Downloading sentencepiece-0.2.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (1.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.4/1.4 MB 197.4 MB/s eta 0:00:00


Collecting tokenizers<0.20,>=0.19
  Downloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.6/3.6 MB 284.9 MB/s eta 0:00:00
Collecting packaging>=20.0
  Downloading packaging-25.0-py3-none-any.whl (66 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 66.5/66.5 KB 380.8 MB/s eta 0:00:00


Collecting regex!=2019.12.17
  Downloading regex-2025.9.18-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (798 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 799.0/799.0 KB 344.6 MB/s eta 0:00:00


Collecting numpy>=1.17
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.3/18.3 MB 243.4 MB/s eta 0:00:00
Collecting filelock
  Downloading filelock-3.19.1-py3-none-any.whl (15 kB)
Collecting requests
  Downloading requests-2.32.5-py3-none-any.whl (64 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 64.7/64.7 KB 424.0 MB/s eta 0:00:00


Collecting safetensors>=0.4.1
  Downloading safetensors-0.6.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (485 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 485.8/485.8 KB 467.6 MB/s eta 0:00:00
Collecting tqdm>=4.27
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.5/78.5 KB 449.7 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.23.2
  Downloading huggingface_hub-0.35.1-py3-none-any.whl (563 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 563.3/563.3 KB 307.9 MB/s eta 0:00:00
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (762 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 763.0/763.0 KB 565.2 MB/s eta 0:00:00


Collecting psutil
  Downloading psutil-7.1.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (291 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 291.2/291.2 KB 550.2 MB/s eta 0:00:00
Collecting torch>=1.10.0
  Downloading torch-2.4.1-cp311-cp311-manylinux1_x86_64.whl (797.1 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 797.1/797.1 MB 26.0 MB/s eta 0:00:00


Collecting pyarrow>=15.0.0
  Downloading pyarrow-21.0.0-cp311-cp311-manylinux_2_28_x86_64.whl (42.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 42.8/42.8 MB 500.1 MB/s eta 0:00:00


Collecting aiohttp
  Downloading aiohttp-3.12.15-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.7/1.7 MB 470.9 MB/s eta 0:00:00


Collecting pandas
  Downloading pandas-2.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.4/12.4 MB 478.6 MB/s eta 0:00:00
Collecting xxhash
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 194.8/194.8 KB 489.7 MB/s eta 0:00:00
Collecting multiprocess


  Downloading multiprocess-0.70.18-py311-none-any.whl (144 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 144.5/144.5 KB 371.4 MB/s eta 0:00:00
Collecting dill<0.3.9,>=0.3.0
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 KB 436.1 MB/s eta 0:00:00
Collecting fsspec[http]<=2024.6.1,>=2023.1.0
  Downloading fsspec-2024.6.1-py3-none-any.whl (177 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 177.6/177.6 KB 529.1 MB/s eta 0:00:00


Collecting propcache>=0.2.0
  Downloading propcache-0.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 213.5/213.5 KB 455.8 MB/s eta 0:00:00


Collecting multidict<7.0,>=4.5
  Downloading multidict-6.6.4-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (246 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 246.7/246.7 KB 489.8 MB/s eta 0:00:00
Collecting aiohappyeyeballs>=2.5.0
  Downloading aiohappyeyeballs-2.6.1-py3-none-any.whl (15 kB)


Collecting yarl<2.0,>=1.17.0
  Downloading yarl-1.20.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (348 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 349.0/349.0 KB 550.9 MB/s eta 0:00:00
Collecting attrs>=17.3.0
  Downloading attrs-25.3.0-py3-none-any.whl (63 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 63.8/63.8 KB 456.2 MB/s eta 0:00:00
Collecting aiosignal>=1.4.0
  Downloading aiosignal-1.4.0-py3-none-any.whl (7.5 kB)
Collecting frozenlist>=1.1.1
  Downloading frozenlist-1.7.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (235 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 235.3/235.3 KB 529.7 MB/s eta 0:00:00


Collecting typing-extensions>=3.7.4.3
  Downloading typing_extensions-4.15.0-py3-none-any.whl (44 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 44.6/44.6 KB 337.7 MB/s eta 0:00:00
Collecting hf-xet<2.0.0,>=1.1.3
  Downloading hf_xet-1.1.10-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.2/3.2 MB 412.1 MB/s eta 0:00:00
Collecting certifi>=2017.4.17
  Downloading certifi-2025.8.3-py3-none-any.whl (161 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 161.2/161.2 KB 474.0 MB/s eta 0:00:00


Collecting charset_normalizer<4,>=2
  Downloading charset_normalizer-3.4.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (150 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 150.3/150.3 KB 521.6 MB/s eta 0:00:00
Collecting idna<4,>=2.5
  Downloading idna-3.10-py3-none-any.whl (70 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 70.4/70.4 KB 411.0 MB/s eta 0:00:00
Collecting urllib3<3,>=1.21.1
  Downloading urllib3-2.5.0-py3-none-any.whl (129 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.8/129.8 KB 511.2 MB/s eta 0:00:00
Collecting nvidia-cuda-cupti-cu12==12.1.105
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.1/14.1 MB 220.8 MB/s eta 0:00:00
Collecting nvidia-cuda-nvrtc-cu12==12.1.105
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 23.7/23.7 MB 214.2 MB/s eta 0:00:00
Collecting nvidia-cublas-cu12==12.1.3.1
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 410.6/410.6 MB 77.2 MB/s eta 0:00:00


Collecting nvidia-cusolver-cu12==11.4.5.107
  Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 124.2/124.2 MB 160.9 MB/s eta 0:00:00
Collecting nvidia-curand-cu12==10.3.2.106
  Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.5/56.5 MB 202.1 MB/s eta 0:00:00
Collecting nvidia-cufft-cu12==11.0.2.54
  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.6/121.6 MB 220.6 MB/s eta 0:00:00
Collecting nvidia-cudnn-cu12==9.1.0.70
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 664.8/664.8 MB 111.4 MB/s eta 0:00:00


Collecting nvidia-nccl-cu12==2.20.5
  Downloading nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl (176.2 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 176.2/176.2 MB 151.4 MB/s eta 0:00:00
Collecting triton==3.0.0
  Downloading triton-3.0.0-1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (209.4 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 209.4/209.4 MB 290.8 MB/s eta 0:00:00
Collecting jinja2
  Downloading jinja2-3.1.6-py3-none-any.whl (134 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.9/134.9 KB 108.1 MB/s eta 0:00:00


Collecting networkx
  Downloading networkx-3.5-py3-none-any.whl (2.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 572.9 MB/s eta 0:00:00
Collecting nvidia-cuda-runtime-cu12==12.1.105
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 823.6/823.6 KB 444.8 MB/s eta 0:00:00
Collecting sympy
  Downloading sympy-1.14.0-py3-none-any.whl (6.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.3/6.3 MB 562.0 MB/s eta 0:00:00
Collecting nvidia-cusparse-cu12==12.1.0.106
  Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 196.0/196.0 MB 207.5 MB/s eta 0:00:00
Collecting nvidia-nvtx-cu12==12.1.105
  Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.1/99.1 KB 458.4 MB/s eta 0:00:00
Collecting nvidia-nvjitlink-cu12


  Downloading nvidia_nvjitlink_cu12-12.9.86-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl (39.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 39.7/39.7 MB 240.7 MB/s eta 0:00:00


Collecting multiprocess
  Downloading multiprocess-0.70.17-py311-none-any.whl (144 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 144.3/144.3 KB 304.8 MB/s eta 0:00:00
  Downloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 143.5/143.5 KB 504.6 MB/s eta 0:00:00
Collecting tzdata>=2022.7
  Downloading tzdata-2025.2-py2.py3-none-any.whl (347 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 347.8/347.8 KB 526.4 MB/s eta 0:00:00
Collecting python-dateutil>=2.8.2
  Downloading python_dateutil-2.9.0.post0-py2.py3-none-any.whl (229 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 229.9/229.9 KB 529.6 MB/s eta 0:00:00
Collecting pytz>=2020.1
  Downloading pytz-2025.2-py2.py3-none-any.whl (509 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 509.2/509.2 KB 533.3 MB/s eta 0:00:00


Collecting six>=1.5
  Downloading six-1.17.0-py2.py3-none-any.whl (11 kB)
Collecting MarkupSafe>=2.0
  Downloading MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23 kB)


Collecting mpmath<1.4,>=1.1.0
  Downloading mpmath-1.3.0-py3-none-any.whl (536 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 536.2/536.2 KB 565.8 MB/s eta 0:00:00


Installing collected packages: pytz, mpmath, xxhash, urllib3, tzdata, typing-extensions, tqdm, sympy, six, sentencepiece, safetensors, regex, pyyaml, pyarrow, psutil, propcache, packaging, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, numpy, networkx, multidict, MarkupSafe, idna, hf-xet, fsspec, frozenlist, filelock, dill, charset_normalizer, certifi, attrs, aiohappyeyeballs, yarl, triton, requests, python-dateutil, nvidia-cusparse-cu12, nvidia-cudnn-cu12, multiprocess, jinja2, aiosignal, pandas, nvidia-cusolver-cu12, huggingface-hub, aiohttp, torch, tokenizers, transformers, datasets, accelerate, evaluate


Successfully installed MarkupSafe-3.0.2 accelerate-0.34.2 aiohappyeyeballs-2.6.1 aiohttp-3.12.15 aiosignal-1.4.0 attrs-25.3.0 certifi-2025.8.3 charset_normalizer-3.4.3 datasets-2.21.0 dill-0.3.8 evaluate-0.4.2 filelock-3.19.1 frozenlist-1.7.0 fsspec-2024.6.1 hf-xet-1.1.10 huggingface-hub-0.35.1 idna-3.10 jinja2-3.1.6 mpmath-1.3.0 multidict-6.6.4 multiprocess-0.70.16 networkx-3.5 numpy-1.26.4 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-9.1.0.70 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.20.5 nvidia-nvjitlink-cu12-12.9.86 nvidia-nvtx-cu12-12.1.105 packaging-25.0 pandas-2.3.2 propcache-0.3.2 psutil-7.1.0 pyarrow-21.0.0 python-dateutil-2.9.0.post0 pytz-2025.2 pyyaml-6.0.2 regex-2025.9.18 requests-2.32.5 safetensors-0.6.2 sentencepiece-0.2.1 six-1.17.0 sympy-1.14.0 tokenizers-0.19.1 torch-2.4.1 t







torch: 2.4.1+cu121 CUDA build: 12.1
CUDA available: True
GPU: NVIDIA A10-24Q


In [None]:
# Cross-encoder v2: DeBERTa-v3-large, 3-fold x 2 seeds, swap aug + symmetric inference, bf16 + grad checkpointing
import os, json, math, random, time
from time import perf_counter
import numpy as np
import pandas as pd
from hashlib import blake2b
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import log_loss
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding
)

def log(msg):
    from datetime import datetime
    print(f"[{datetime.utcnow().isoformat(timespec='seconds')}Z] {msg}", flush=True)

# Performance/precision knobs
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = os.environ.get('PYTORCH_CUDA_ALLOC_CONF','') or 'expandable_segments:True'
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.benchmark = True

def norm_prompt_for_group(s: str) -> str:
    if not isinstance(s, str):
        return ''
    t = s.strip()
    try:
        obj = json.loads(t)
        if isinstance(obj, list):
            t = ' [TURN] '.join(map(str, obj))
        elif isinstance(obj, str):
            t = obj
    except Exception:
        pass
    t = t.replace('\r', ' ').replace('\n', ' ')
    t = ' '.join(t.split())
    return t.lower()

def hhash(*parts: str, nbytes: int = 8) -> int:
    h = blake2b(digest_size=nbytes)
    for p in parts:
        if p is None:
            p = ''
        if not isinstance(p, str):
            p = str(p)
        h.update(p.encode('utf-8', errors='ignore')); h.update(b'|')
    return int.from_bytes(h.digest(), 'little', signed=False)

def truncate_head_tail_text(s: str, head: int, tail: int) -> str:
    if not isinstance(s, str):
        return ''
    if len(s) <= head + tail:
        return s
    return s[:head] + s[-tail:]

def build_df(train_csv='train.csv', test_csv='test.csv', head_tail=(3000, 1000), prompt_ht=(1200, 400)):
    train = pd.read_csv(train_csv)
    test = pd.read_csv(test_csv)
    y_cols = ['winner_model_a','winner_model_b','winner_tie']
    y = train[y_cols].values.argmax(axis=1)
    ph, pt = prompt_ht
    h, t = head_tail
    tr_prompt = train['prompt'].astype(str).map(lambda s: truncate_head_tail_text(s, ph, pt))
    tr_a = train['response_a'].astype(str).map(lambda s: truncate_head_tail_text(s, h, t))
    tr_b = train['response_b'].astype(str).map(lambda s: truncate_head_tail_text(s, h, t))
    te_prompt = test['prompt'].astype(str).map(lambda s: truncate_head_tail_text(s, ph, pt))
    te_a = test['response_a'].astype(str).map(lambda s: truncate_head_tail_text(s, h, t))
    te_b = test['response_b'].astype(str).map(lambda s: truncate_head_tail_text(s, h, t))
    df_train = pd.DataFrame({
        'id': train['id'].values,
        'prompt': tr_prompt,
        'a': tr_a,
        'b': tr_b,
        'label': y,
        'group': train['prompt'].map(norm_prompt_for_group).map(lambda s: hhash(s))
    })
    df_test = pd.DataFrame({
        'id': test['id'].values,
        'prompt': te_prompt,
        'a': te_a,
        'b': te_b
    })
    return df_train, df_test

# Input template with explicit instruction and separators
def make_input(prompt, a, b, sep_token='[SEP]'):
    return (f"{sep_token} Instruction: decide which response is better overall (A, B, or Tie).\n"
            f"{sep_token} Prompt:\n{prompt}\n"
            f"{sep_token} Response A:\n{a}\n"
            f"{sep_token} Response B:\n{b}")

# Config
model_name = 'microsoft/deberta-v3-large'  # fallback to 'microsoft/deberta-v3-base' if OOM
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
sep_tok = tokenizer.sep_token or '[SEP]'

# Tokenization: dynamic padding, truncate to model limit (512 for DeBERTa v3); string template + budgets above
def tokenize_function(examples, max_length=512):
    return tokenizer(examples['text'], truncation=True, max_length=max_length, padding=False)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = torch.softmax(torch.tensor(logits), dim=-1).numpy()
    return {'logloss': float(log_loss(labels, probs, labels=[0,1,2]))}

# Build dataframes
df_train, df_test = build_df()
labels = df_train['label'].values
groups = df_train['group'].values

# Pre-build test datasets (orig and swapped) once; reuse across folds/seeds
df_te_orig = df_test.copy()
df_te_orig['text'] = [make_input(p, a, b, sep_tok) for p,a,b in zip(df_te_orig['prompt'], df_te_orig['a'], df_te_orig['b'])]
ds_te_orig = Dataset.from_pandas(df_te_orig[['text']])
ds_te_orig = ds_te_orig.map(tokenize_function, batched=True, remove_columns=['text'])

df_te_swap = df_test.copy()
df_te_swap['text'] = [make_input(p, b, a, sep_tok) for p,a,b in zip(df_te_swap['prompt'], df_te_swap['a'], df_te_swap['b'])]
ds_te_swap = Dataset.from_pandas(df_te_swap[['text']])
ds_te_swap = ds_te_swap.map(tokenize_function, batched=True, remove_columns=['text'])

# Swap augmentation
def augment_swap(df):
    df_sw = df.copy()
    df_sw['a'], df_sw['b'] = df['b'], df['a']
    lab = df['label'].values.copy()
    lab_sw = lab.copy()
    lab_sw[lab == 0] = 1
    lab_sw[lab == 1] = 0
    df_sw['label'] = lab_sw
    return pd.concat([df, df_sw], axis=0, ignore_index=True)

seeds = [42, 2025]
n_folds = 3
skf = StratifiedGroupKFold(n_splits=n_folds, shuffle=True)

data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

all_oof_logits = np.zeros((len(df_train), 3), dtype=np.float64)
test_pred_logits = np.zeros((len(df_test), 3), dtype=np.float64)

start_all = perf_counter()
for seed in seeds:
    log(f"Seed {seed} run start")
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
    oof_logits_seed = np.zeros((len(df_train), 3), dtype=np.float64)
    for fold, (tr_idx, va_idx) in enumerate(skf.split(df_train, labels, groups=groups)):
        log(f"XEnc Fold {fold} (seed {seed}) start tr={len(tr_idx)} va={len(va_idx)}")
        df_tr = df_train.iloc[tr_idx].reset_index(drop=True)
        df_va = df_train.iloc[va_idx].reset_index(drop=True)
        df_tr_aug = augment_swap(df_tr)
        # Build texts
        df_tr_aug['text'] = [make_input(p, a, b, sep_tok) for p,a,b in zip(df_tr_aug['prompt'], df_tr_aug['a'], df_tr_aug['b'])]
        df_va['text'] = [make_input(p, a, b, sep_tok) for p,a,b in zip(df_va['prompt'], df_va['a'], df_va['b'])]
        ds_tr = Dataset.from_pandas(df_tr_aug[['text','label']])
        ds_va = Dataset.from_pandas(df_va[['text','label']])
        ds_tr = ds_tr.map(tokenize_function, batched=True, remove_columns=['text'])
        ds_va = ds_va.map(tokenize_function, batched=True, remove_columns=['text'])

        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
        # Regularization and memory
        try:
            model.gradient_checkpointing_enable()
        except Exception:
            pass
        if hasattr(model, 'config'):
            try:
                model.config.use_cache = False
                if hasattr(model.config, 'hidden_dropout_prob'):
                    model.config.hidden_dropout_prob = 0.1
                if hasattr(model.config, 'attention_probs_dropout_prob'):
                    model.config.attention_probs_dropout_prob = 0.1
            except Exception:
                pass

        args = TrainingArguments(
            output_dir=f'ce_fold{fold}_seed{seed}',
            learning_rate=1.5e-5,
            weight_decay=0.01,
            lr_scheduler_type='cosine',
            warmup_ratio=0.06,
            num_train_epochs=2,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=16,
            gradient_accumulation_steps=8,  # eff batch ~32
            bf16=True, fp16=False,
            label_smoothing_factor=0.05,
            logging_steps=50,
            evaluation_strategy='epoch',
            save_strategy='no',
            report_to=[],
            dataloader_num_workers=4,
            dataloader_pin_memory=True,
            optim='adamw_torch',
            seed=seed,
        )

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=ds_tr,
            eval_dataset=ds_va,
            data_collator=data_collator,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics
        )
        t0 = perf_counter()
        trainer.train()
        log(f"Fold {fold} (seed {seed}) train done in {perf_counter()-t0:.1f}s")

        # Predict val
        preds = trainer.predict(ds_va).predictions
        oof_probs = torch.softmax(torch.tensor(preds), dim=-1).numpy().astype(np.float64)
        oof_logits_seed[va_idx] = preds.astype(np.float64)
        ll = log_loss(df_va['label'].values, oof_probs, labels=[0,1,2])
        log(f"Fold {fold} (seed {seed}) val logloss={ll:.5f}")

        # Test-time symmetric inference (reuse pre-tokenized ds_te_orig/ds_te_swap)
        logits_orig = trainer.predict(ds_te_orig).predictions.astype(np.float64)
        logits_sw = trainer.predict(ds_te_swap).predictions.astype(np.float64)
        logits_back = logits_sw.copy()
        logits_back[:,0], logits_back[:,1] = logits_sw[:,1], logits_sw[:,0]
        logits_avg = (logits_orig + logits_back) / 2.0
        test_pred_logits += logits_avg / (n_folds * len(seeds))

    # accumulate OOF logits (average over seeds later)
    all_oof_logits += oof_logits_seed / len(seeds)

oof_probs_final = torch.softmax(torch.tensor(all_oof_logits), dim=-1).numpy()
oof_ll = log_loss(df_train['label'].values, oof_probs_final, labels=[0,1,2])
log(f"XEnc OOF logloss={oof_ll:.5f}; total elapsed={perf_counter()-start_all:.1f}s")

# Build submission from softmax of averaged logits
test_probs = torch.softmax(torch.tensor(test_pred_logits), dim=-1).numpy()
sub = pd.DataFrame({
    'id': df_test['id'].values,
    'winner_model_a': test_probs[:,0],
    'winner_model_b': test_probs[:,1],
    'winner_tie': test_probs[:,2],
})
eps = 1e-15
P = sub[['winner_model_a','winner_model_b','winner_tie']].values
P = np.clip(P, eps, 1-eps); P /= P.sum(axis=1, keepdims=True)
sub[['winner_model_a','winner_model_b','winner_tie']] = P
sub.to_csv('submission.csv', index=False)
log('Wrote submission.csv (cross-encoder v2, no temp scaling)')

# Save OOF logits for later temperature scaling
np.save('oof_logits.npy', all_oof_logits.astype(np.float32))
np.save('test_logits.npy', test_pred_logits.astype(np.float32))
log('Saved oof_logits.npy and test_logits.npy')

  from .autonotebook import tqdm as notebook_tqdm




Map:   0%|          | 0/5748 [00:00<?, ? examples/s]

Map:  17%|█▋        | 1000/5748 [00:00<00:01, 3169.28 examples/s]

Map:  35%|███▍      | 2000/5748 [00:00<00:01, 3393.22 examples/s]

In [None]:
# Post-training: temperature scaling on saved logits and write calibrated submission
import numpy as np, pandas as pd
from sklearn.metrics import log_loss
from time import perf_counter

def log(msg):
    from datetime import datetime
    print(f"[{datetime.utcnow().isoformat(timespec='seconds')}Z] {msg}", flush=True)

def softmax(logits):
    x = logits - logits.max(axis=1, keepdims=True)
    ex = np.exp(x)
    return ex / ex.sum(axis=1, keepdims=True)

def find_best_temperature(oof_logits: np.ndarray, y_true: np.ndarray) -> float:
    # Optimize a single scalar T on logits: probs = softmax(logits / T)
    def nll_from_logT(logT: float) -> float:
        T = float(np.exp(logT))
        P = softmax(oof_logits / max(T, 1e-6))
        return log_loss(y_true, P, labels=[0,1,2])
    grid = np.linspace(-2.0, 2.0, 41)
    vals = [nll_from_logT(g) for g in grid]
    best_logT = float(grid[int(np.argmin(vals))])
    for _ in range(3):
        lo = max(-5.0, best_logT - 0.5); hi = min(5.0, best_logT + 0.5)
        grid = np.linspace(lo, hi, 31)
        vals = [nll_from_logT(g) for g in grid]
        best_logT = float(grid[int(np.argmin(vals))])
    return float(np.exp(best_logT))

t0 = perf_counter()
oof_logits = np.load('oof_logits.npy')
test_logits = np.load('test_logits.npy')
y = pd.read_csv('train.csv')[['winner_model_a','winner_model_b','winner_tie']].values.argmax(axis=1)
oof_probs = softmax(oof_logits)
base_ll = log_loss(y, oof_probs, labels=[0,1,2])
log(f"Uncalibrated OOF logloss from saved logits: {base_ll:.6f}")
T = find_best_temperature(oof_logits, y)
log(f"Optimal temperature T (logit scaling): {T:.4f}")
oof_probs_cal = softmax(oof_logits / T)
cal_ll = log_loss(y, oof_probs_cal, labels=[0,1,2])
log(f"Calibrated OOF logloss: {cal_ll:.6f}")

# Apply to test logits and write calibrated submission
test_probs_cal = softmax(test_logits / T).astype(np.float64)
eps = 1e-15
test_probs_cal = np.clip(test_probs_cal, eps, 1 - eps)
test_probs_cal /= test_probs_cal.sum(axis=1, keepdims=True)
test_ids = pd.read_csv('test.csv')['id'].values
sub = pd.DataFrame({
    'id': test_ids,
    'winner_model_a': test_probs_cal[:,0],
    'winner_model_b': test_probs_cal[:,1],
    'winner_tie': test_probs_cal[:,2],
})
sub.to_csv('submission.csv', index=False)
sub.to_csv('submission_calibrated.csv', index=False)
log(f"Wrote submission.csv and submission_calibrated.csv with temperature scaling; elapsed {perf_counter()-t0:.1f}s")
with open('calibration_T.txt','w') as f: f.write(str(T))