# US Patent Phrase to Phrase Matching – Plan

Goal: Achieve medal-level Pearson on public LB by building a strong, validated text similarity model with robust CV and iterative improvements.

Workflow:
- Environment check (GPU sanity) and data load
- EDA: target distribution, text lengths, CPC distributions
- Validation: GroupKFold by anchor (or stratified bins on score); choose one protocol up front and keep it fixed for every model so OOF scores stay comparable
- Baseline: TF-IDF + Ridge/SGDRegressor; evaluate Pearson OOF
- Strong models:
  - Sentence-transformers (e.g., msmarco-distilbert-base-v4 / bge-small-en) finetune with cosine-sim regression
  - Cross-encoder (e.g., MiniLM-L12-v2) regression head
- Features:
  - Text normalization, lowercasing, dedup punctuation
  - Use CPC code as domain context (prefix to text)
  - Pairwise features: BM25/TFIDF cosine; embedding cosine; length ratios
- Ensembling:
  - Blend TF-IDF ridge, bi-encoder cosine, cross-encoder regression
- Error analysis: mine worst OOF buckets by CPC and length; iterate
- Submission: Generate test predictions; save submission.csv

Major checkpoints for expert review:
1) After this plan and environment check
2) After CV protocol + baseline OOF
3) After first transformer model OOF
4) Before long training or ensembling

Logging: Print elapsed time and fold indices; cache OOF/test predictions to reuse.

In [None]:
# Environment check and quick data scan
import os, sys, time, shutil, subprocess, json, textwrap
import pandas as pd, numpy as np
from pathlib import Path

# Start a wall-clock timer for this cell and print basic runtime context.
t0 = time.time()
print('CWD:', os.getcwd())
print('Python:', sys.version)
print('Start time:', time.strftime('%Y-%m-%d %H:%M:%S'))

# GPU sanity check. `|| true` together with check=False keeps the cell
# running even when nvidia-smi is missing or exits non-zero.
print('\n== nvidia-smi ==', flush=True)
subprocess.run(['bash','-lc','nvidia-smi || true'], check=False)

# The CSVs are looked up relative to the current working directory.
data_dir = Path('.')
train_path = data_dir/'train.csv'
test_path = data_dir/'test.csv'
assert train_path.exists() and test_path.exists(), 'Missing train/test CSVs'

# usecols=None makes read_csv load every column of train.csv.
usecols_train = None
train = pd.read_csv(train_path, usecols=usecols_train)
test = pd.read_csv(test_path)
print(f'Loaded train: {train.shape}, test: {test.shape}')
print('Train columns:', train.columns.tolist())
print('Test columns:', test.columns.tolist())

# Target summary: describe() plus a 10-bin histogram over the range [0, 1].
if 'score' in train.columns:
    s = train['score'].astype(float)
    print('Score describe:\n', s.describe())
    hist_counts, hist_bins = np.histogram(s, bins=10, range=(0,1))
    print('Score hist counts:', hist_counts.tolist())
    print('Score hist bins:', np.round(hist_bins,3).tolist())

def text_len_stats(df, cols):
    """Summarise character lengths of the string form of selected columns.

    Columns named in ``cols`` that are absent from ``df`` are skipped. Every
    value is cast with ``astype(str)`` before measuring, so non-string values
    (including missing ones) are measured by their string representation.

    Returns a dict mapping each present column name to a dict with the keys
    ``count``, ``mean``, ``p50``, ``p95`` and ``max``.
    """
    stats = {}
    for name in (c for c in cols if c in df.columns):
        lengths = df[name].astype(str).str.len()
        stats[name] = {
            'count': int(lengths.count()),
            'mean': float(lengths.mean()),
            'p50': float(lengths.median()),
            'p95': float(lengths.quantile(0.95)),
            'max': int(lengths.max()),
        }
    return stats

# Character-length summaries for the three text fields, train first, then test.
txt_cols = ['anchor', 'target', 'context']
train_len_json = json.dumps(text_len_stats(train, txt_cols), indent=2)
print('Train text len stats:', train_len_json)
test_len_json = json.dumps(text_len_stats(test, txt_cols), indent=2)
print('Test  text len stats:', test_len_json)

# Ten most frequent context codes in train, when the column is present.
if 'context' in train.columns:
    print('Top contexts (train):')
    top_contexts = train['context'].value_counts().head(10)
    print(top_contexts)

# Number of distinct anchors, when the column is present.
if 'anchor' in train.columns:
    n_unique_anchors = train['anchor'].nunique()
    print('Unique anchors:', n_unique_anchors)

elapsed = time.time() - t0
print('Elapsed: %.2fs' % elapsed)

In [None]:
# Build deduped data and 5-fold GroupKFold by anchor; cache folds
import unicodedata
from sklearn.model_selection import GroupKFold

# Restart the wall-clock timer; the "Elapsed" print at the end of this cell reads it.
t0 = time.time()

def normalize_text(s: str) -> str:
    """Return a canonical lowercase form of ``s`` for text matching.

    Missing values (as judged by ``pd.isna``) become ``''``. Anything else is
    converted with ``str``, NFKC-normalised and lowercased; whitespace runs are
    collapsed to single spaces (leading/trailing whitespace is dropped); and
    consecutive repeats of the same punctuation mark are squeezed to one.
    Hyphens and slashes are not in the squeezed set and are left untouched.
    """
    if pd.isna(s):
        return ''
    text = unicodedata.normalize('NFKC', str(s)).lower()
    text = ' '.join(text.split())

    # Punctuation marks whose immediate repeats are reduced to one occurrence.
    squeezable = frozenset(',.;:!?()[]{}\'"')
    kept = []
    for ch in text:
        # Skip a mark that merely repeats the character just emitted.
        if kept and ch == kept[-1] and ch in squeezable:
            continue
        kept.append(ch)
    return ''.join(kept)

# Prepare normalized columns (do not overwrite originals; folds use original anchor as group)
train['_anchor_n'] = train['anchor'].map(normalize_text)
train['_target_n'] = train['target'].map(normalize_text)
train['_context_n'] = train['context'].map(normalize_text)

# Exact-duplicate handling on original triplets (anchor, target, context).
# The three raw fields are joined with tabs into one string key per row.
# NOTE(review): key_cols is not referenced again in this cell.
key_cols = ['anchor','target','context']
train['_key'] = (train['anchor'].astype(str) + '\t' + train['target'].astype(str) + '\t' + train['context'].astype(str))

# Map from original row to dedup index: `inv` holds, for every original row,
# the position of its key among the sorted unique keys.
keys, inv = np.unique(train['_key'].values, return_inverse=True)
train['_dedup_idx'] = inv

# Aggregate per dedup_idx: keep the first id/text values and average the score.
agg = train.groupby('_dedup_idx', as_index=False).agg({
    'id':'first',
    'anchor':'first',
    'target':'first',
    'context':'first',
    '_anchor_n':'first',
    '_target_n':'first',
    '_context_n':'first',
    'score':'mean',
})
agg = agg.rename(columns={'score':'score_mean'})
# Sort and reset the index so that row position i corresponds to _dedup_idx i;
# later cells index OOF arrays by dedup_idx.
agg = agg.sort_values('_dedup_idx').reset_index(drop=True)

# Create 5-fold GroupKFold by anchor (original anchor string), so all rows
# sharing an anchor land in the same validation fold.
n_splits = 5
gkf = GroupKFold(n_splits=n_splits)
groups = agg['anchor'].astype(str).values
folds = np.full(len(agg), -1, dtype=int)  # -1 marks "not yet assigned"
for f, (tr_idx, va_idx) in enumerate(gkf.split(np.arange(len(agg)), groups=groups)):
    folds[va_idx] = f
agg['fold'] = folds
assert (agg['fold']>=0).all(), 'Fold assignment failed'

# Persist artifacts: the de-duplicated table with folds, and the mapping from
# each original train row to its de-duplicated row.
Path('artifacts').mkdir(exist_ok=True)
agg[['anchor','target','context','_anchor_n','_target_n','_context_n','score_mean','fold','_dedup_idx']].to_csv('artifacts/train_dedup_folds.csv', index=False)
pd.DataFrame({'orig_index': np.arange(len(train)), 'dedup_idx': train['_dedup_idx'].values}).to_csv('artifacts/orig_to_dedup_map.csv', index=False)

# Log fold sizes and the row counts before/after de-duplication
sizes = agg['fold'].value_counts().sort_index().to_dict()
print('Fold sizes (dedup space):', sizes)
print('Unique dedup rows:', len(agg), 'from original rows:', len(train))
print('Elapsed: %.2fs' % (time.time()-t0))

In [None]:
# TF-IDF + Ridge baseline with cosine + length features; 5-fold GroupKFold; cache OOF/test and submission
import time
import numpy as np
import pandas as pd
from pathlib import Path
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.preprocessing import normalize as l2_normalize
from sklearn.metrics import r2_score
from scipy.stats import pearsonr

t0 = time.time()
print('Loading fold artifacts...', flush=True)
# keep_default_na=False: the normalized text columns were written with '' for
# missing values, and phrases such as "null" or "na" are legitimate text.
# pandas' default NA parsing would turn all of these into NaN, which the
# astype(str) calls below render as the literal string 'nan', while test rows
# (normalized in memory) keep ''. Reading the strings verbatim keeps train and
# test features consistent. Numeric columns are still inferred as numeric.
dedup = pd.read_csv('artifacts/train_dedup_folds.csv', keep_default_na=False)
orig_map = pd.read_csv('artifacts/orig_to_dedup_map.csv')

def pair_text(df):
    """Build one '[CPC] <context> [A] <anchor> [B] <target>' string per row.

    Reads the ``_context_n``, ``_anchor_n`` and ``_target_n`` columns (each
    cast with ``astype(str)``) and returns the ``.values`` array of the
    concatenated Series.
    """
    context = df['_context_n'].astype(str)
    anchor = df['_anchor_n'].astype(str)
    target = df['_target_n'].astype(str)
    combined = '[CPC] ' + context + ' [A] ' + anchor + ' [B] ' + target
    return combined.values

def rowwise_cosine(Xa, Xt):
    """Cosine similarity between row i of ``Xa`` and row i of ``Xt``.

    Both inputs are L2-normalised per row and then multiplied element-wise
    via ``.multiply``; the row sums are returned as a flat array.
    ``copy=False`` asks the normaliser to avoid copying, so the inputs may be
    modified in place.
    NOTE(review): the ``.multiply`` call presumes scipy sparse matrices with
    matching shapes; confirm against callers.
    """
    unit_a = l2_normalize(Xa, axis=1, copy=False)
    unit_t = l2_normalize(Xt, axis=1, copy=False)
    row_dots = unit_a.multiply(unit_t).sum(axis=1)
    return np.asarray(row_dots).ravel()

def build_dense_feats(df):
    """Return an (n_rows, 4) float32 matrix of character-length features.

    Columns, in order: anchor length, target length, anchor/target length
    ratio (the denominator is floored at 1 so an empty target cannot divide
    by zero), and absolute length difference. Lengths are taken from the
    ``_anchor_n`` and ``_target_n`` columns after ``astype(str)``.
    """
    def _char_len(column):
        # Character count of each value's string form, as float32.
        return df[column].astype(str).str.len().values.astype(np.float32)

    len_anchor = _char_len('_anchor_n')
    len_target = _char_len('_target_n')
    len_ratio = (len_anchor / np.maximum(1.0, len_target)).astype(np.float32)
    len_gap = np.abs(len_anchor - len_target).astype(np.float32)
    return np.column_stack([len_anchor, len_target, len_ratio, len_gap])

# Prepare test normalized columns (same normalize_text as used for train)
test['_anchor_n'] = test['anchor'].map(normalize_text)
test['_target_n'] = test['target'].map(normalize_text)
test['_context_n'] = test['context'].map(normalize_text)

# Targets and fold ids come from the de-duplicated table loaded above.
y = dedup['score_mean'].values.astype(np.float32)
folds = dedup['fold'].values.astype(int)
n_folds = int(dedup['fold'].nunique())

# oof[i] receives the prediction for de-duplicated row i from the fold in
# which that row is held out; test_preds_folds collects one test prediction
# vector per fold.
oof = np.zeros(len(dedup), dtype=np.float32)
test_preds_folds = []

# One pass per fold: fit vectorizers and Ridge on the training folds, score
# the held-out fold into `oof`, and predict the test set with the same fitted
# objects.
for f in range(n_folds):
    t_fold = time.time()
    tr_idx = np.where(folds != f)[0]
    va_idx = np.where(folds == f)[0]
    print(f'Fold {f} | train {len(tr_idx)} | valid {len(va_idx)}', flush=True)

    df_tr = dedup.iloc[tr_idx].reset_index(drop=True)
    df_va = dedup.iloc[va_idx].reset_index(drop=True)

    # Vectorizers for anchor/target (word + char) to compute cosine features.
    # They are fitted on training-fold anchors and targets only.
    vocab_corpus = pd.concat([df_tr['_anchor_n'], df_tr['_target_n']], axis=0).astype(str).values
    wvec = TfidfVectorizer(ngram_range=(1,2), min_df=2, max_features=200_000, analyzer='word')
    cvec = TfidfVectorizer(ngram_range=(3,5), min_df=2, max_features=300_000, analyzer='char_wb')
    wvec.fit(vocab_corpus)
    cvec.fit(vocab_corpus)

    # Transform anchor/target
    Xa_tr_w = wvec.transform(df_tr['_anchor_n'].astype(str).values)
    Xt_tr_w = wvec.transform(df_tr['_target_n'].astype(str).values)
    Xa_va_w = wvec.transform(df_va['_anchor_n'].astype(str).values)
    Xt_va_w = wvec.transform(df_va['_target_n'].astype(str).values)

    Xa_tr_c = cvec.transform(df_tr['_anchor_n'].astype(str).values)
    Xt_tr_c = cvec.transform(df_tr['_target_n'].astype(str).values)
    Xa_va_c = cvec.transform(df_va['_anchor_n'].astype(str).values)
    Xt_va_c = cvec.transform(df_va['_target_n'].astype(str).values)

    # Row-wise anchor/target cosine, reshaped to a column for stacking.
    cos_tr_w = rowwise_cosine(Xa_tr_w, Xt_tr_w)[:, None]
    cos_va_w = rowwise_cosine(Xa_va_w, Xt_va_w)[:, None]
    cos_tr_c = rowwise_cosine(Xa_tr_c, Xt_tr_c)[:, None]
    cos_va_c = rowwise_cosine(Xa_va_c, Xt_va_c)[:, None]

    dense_tr = build_dense_feats(df_tr)
    dense_va = build_dense_feats(df_va)

    # Dense block = length features followed by word and char cosine.
    dense_tr_all = np.hstack([dense_tr, cos_tr_w, cos_tr_c]).astype(np.float32)
    dense_va_all = np.hstack([dense_va, cos_va_w, cos_va_c]).astype(np.float32)

    # Wrap as CSR so the dense block can be hstacked with the sparse TF-IDF blocks.
    Xdense_tr = sparse.csr_matrix(dense_tr_all)
    Xdense_va = sparse.csr_matrix(dense_va_all)

    # Pair TF-IDF (word + char) on formatted string
    pair_tr = pair_text(df_tr)
    pair_va = pair_text(df_va)
    pair_te = pair_text(test)

    p_wvec = TfidfVectorizer(ngram_range=(1,2), min_df=2, max_features=300_000, analyzer='word')
    p_cvec = TfidfVectorizer(ngram_range=(3,5), min_df=2, max_features=400_000, analyzer='char_wb')
    Xpw_tr = p_wvec.fit_transform(pair_tr)
    Xpw_va = p_wvec.transform(pair_va)
    Xpc_tr = p_cvec.fit_transform(pair_tr)
    Xpc_va = p_cvec.transform(pair_va)

    # Final train/valid matrices: [pair word TF-IDF | pair char TF-IDF | dense]
    X_tr = sparse.hstack([Xpw_tr, Xpc_tr, Xdense_tr], format='csr')
    X_va = sparse.hstack([Xpw_va, Xpc_va, Xdense_va], format='csr')

    y_tr = y[tr_idx]
    y_va = y[va_idx]

    model = Ridge(alpha=1.5, random_state=42)
    t_fit = time.time()
    model.fit(X_tr, y_tr)
    print(f'  F{f} fit done in {time.time()-t_fit:.2f}s, nfeat={X_tr.shape[1]:,}', flush=True)

    # Store held-out predictions (unclipped) and report the fold Pearson.
    pred_va = model.predict(X_va)
    oof[va_idx] = pred_va
    pr = pearsonr(y_va, pred_va)[0]
    print(f'  F{f} Pearson: {pr:.6f}', flush=True)

    # Test features via current fold vectorizers (same column order as X_tr)
    Xpw_te = p_wvec.transform(pair_te)
    Xpc_te = p_cvec.transform(pair_te)
    dense_te = build_dense_feats(test)
    # For test cosine features reuse wvec/cvec on test normalized anchor/target
    Xa_te_w = wvec.transform(test['_anchor_n'].astype(str).values)
    Xt_te_w = wvec.transform(test['_target_n'].astype(str).values)
    Xa_te_c = cvec.transform(test['_anchor_n'].astype(str).values)
    Xt_te_c = cvec.transform(test['_target_n'].astype(str).values)
    cos_te_w = rowwise_cosine(Xa_te_w, Xt_te_w)[:, None]
    cos_te_c = rowwise_cosine(Xa_te_c, Xt_te_c)[:, None]
    dense_te_all = np.hstack([dense_te, cos_te_w, cos_te_c]).astype(np.float32)
    Xdense_te = sparse.csr_matrix(dense_te_all)
    X_te = sparse.hstack([Xpw_te, Xpc_te, Xdense_te], format='csr')
    pred_te = model.predict(X_te)
    test_preds_folds.append(pred_te.astype(np.float32))

    print(f'Fold {f} done in {time.time()-t_fold:.2f}s', flush=True)

# Clip to [0,1] before computing the overall OOF Pearson
oof_clip = np.clip(oof, 0.0, 1.0)
oof_pearson = float(pearsonr(y, oof_clip)[0])
print(f'OOF Pearson (clipped): {oof_pearson:.6f}')

# Test prediction = mean over the per-fold models, then clipped to [0,1].
test_pred = np.mean(np.vstack(test_preds_folds), axis=0)
test_pred = np.clip(test_pred, 0.0, 1.0).astype(np.float32)

# Cache OOF/test
Path('artifacts').mkdir(exist_ok=True)
np.save('artifacts/oof_tfidf_ridge.npy', oof_clip)
np.save('artifacts/test_tfidf_ridge.npy', test_pred)
pd.DataFrame({'dedup_idx': np.arange(len(dedup)), 'oof': oof_clip}).to_csv('artifacts/oof_tfidf_ridge.csv', index=False)

# Expand OOF to original rows for diagnostics: every original train row gets
# the OOF prediction of the de-duplicated row it maps to.
oof_full = oof_clip[orig_map['dedup_idx'].values]
pd.DataFrame({'id': train['id'], 'oof': oof_full, 'score': train['score']}).to_csv('artifacts/oof_full_rows.csv', index=False)

# Build submission
sub = pd.DataFrame({'id': test['id'], 'score': test_pred})
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv with shape', sub.shape)
print('Total elapsed: %.2fs' % (time.time()-t0))

In [None]:
# Dense relational baseline: TF-IDF cosines + length features -> StandardScaler -> Ridge
import time
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize as l2_normalize, StandardScaler
from sklearn.linear_model import Ridge
from scipy.stats import pearsonr

t0 = time.time()
print('Loading fold artifacts...', flush=True)
# keep_default_na=False: the normalized text columns were written with '' for
# missing values, and phrases such as "null" or "na" are legitimate text.
# pandas' default NA parsing would turn all of these into NaN, which the
# astype(str) calls below render as the literal string 'nan', while test rows
# (normalized in memory) keep ''. Reading the strings verbatim keeps train and
# test features consistent. Numeric columns are still inferred as numeric.
dedup = pd.read_csv('artifacts/train_dedup_folds.csv', keep_default_na=False)
orig_map = pd.read_csv('artifacts/orig_to_dedup_map.csv')

# Ensure test normalized columns exist (they may already have been added by
# an earlier cell; '_anchor_n' is used as the marker for all three).
if '_anchor_n' not in test.columns:
    test['_anchor_n'] = test['anchor'].map(normalize_text)
    test['_target_n'] = test['target'].map(normalize_text)
    test['_context_n'] = test['context'].map(normalize_text)

def side_text_a(df):
    """Return '<context> <anchor>' strings, one per row, as a values array."""
    context = df['_context_n'].astype(str)
    anchor = df['_anchor_n'].astype(str)
    return (context + ' ' + anchor).values
def side_text_t(df):
    """Return '<context> <target>' strings, one per row, as a values array."""
    context = df['_context_n'].astype(str)
    target = df['_target_n'].astype(str)
    return (context + ' ' + target).values

def rowwise_cosine(Xa, Xt):
    """Cosine similarity between row i of ``Xa`` and row i of ``Xt``.

    Each input is L2-normalised per row, the two are multiplied element-wise
    via ``.multiply``, and the row sums are returned as a flat array.
    ``copy=False`` asks the normaliser to avoid copying, so the inputs may be
    modified in place.
    NOTE(review): the ``.multiply`` call presumes scipy sparse matrices with
    matching shapes; confirm against callers.
    """
    normed_a = l2_normalize(Xa, axis=1, copy=False)
    normed_t = l2_normalize(Xt, axis=1, copy=False)
    per_row = normed_a.multiply(normed_t).sum(axis=1)
    return np.asarray(per_row).ravel()

def build_len_feats(df):
    """Return four float32 arrays of character-length features.

    In order: anchor length, target length, absolute length difference, and a
    symmetric ratio shorter/longer (the denominator is floored at 1, so two
    empty strings give 0). Lengths are measured on the ``_anchor_n`` and
    ``_target_n`` columns after ``astype(str)``.
    """
    la = df['_anchor_n'].astype(str).str.len().values.astype(np.float32)
    lt = df['_target_n'].astype(str).str.len().values.astype(np.float32)

    shorter = np.minimum(la, lt)
    longer = np.maximum(la, lt)
    ratio_sym = (shorter / np.maximum(1.0, longer)).astype(np.float32)
    absdiff = np.abs(la - lt).astype(np.float32)
    return la, lt, absdiff, ratio_sym

# Targets and fold ids come from the de-duplicated table loaded above.
y = dedup['score_mean'].values.astype(np.float32)
folds = dedup['fold'].values.astype(int)
n_folds = int(dedup['fold'].nunique())

# oof[i] receives the held-out prediction for de-duplicated row i;
# test_preds_folds collects one test prediction vector per fold.
oof = np.zeros(len(dedup), dtype=np.float32)
test_preds_folds = []

# One pass per fold: build 8 dense features (two cosines, four length
# features, two interactions), standardize them with training-fold
# statistics, fit Ridge, then score the held-out fold and the test set.
for f in range(n_folds):
    t_fold = time.time()
    tr_idx = np.where(folds != f)[0]
    va_idx = np.where(folds == f)[0]
    df_tr = dedup.iloc[tr_idx].reset_index(drop=True)
    df_va = dedup.iloc[va_idx].reset_index(drop=True)
    print(f'Fold {f} | train {len(tr_idx)} | valid {len(va_idx)}', flush=True)

    # "Side" texts: the context string prepended to the anchor (A) or target (T).
    sA_tr = side_text_a(df_tr)
    sT_tr = side_text_t(df_tr)
    sA_va = side_text_a(df_va)
    sT_va = side_text_t(df_va)
    sA_te = side_text_a(test)
    sT_te = side_text_t(test)

    # Fit vectorizers on train-fold corpus (anchors+targets with CPC prepended)
    corpus = np.concatenate([sA_tr, sT_tr], axis=0)
    wvec = TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=2, max_features=100_000)
    cvec = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,5), min_df=2, max_features=200_000)
    wvec.fit(corpus)
    cvec.fit(corpus)

    # Transform and compute cosines (word-level)
    Xa_tr_w = wvec.transform(sA_tr); Xt_tr_w = wvec.transform(sT_tr)
    Xa_va_w = wvec.transform(sA_va); Xt_va_w = wvec.transform(sT_va)
    Xa_te_w = wvec.transform(sA_te); Xt_te_w = wvec.transform(sT_te)
    cos_tr_w = rowwise_cosine(Xa_tr_w, Xt_tr_w)
    cos_va_w = rowwise_cosine(Xa_va_w, Xt_va_w)
    cos_te_w = rowwise_cosine(Xa_te_w, Xt_te_w)

    # Same again with the character n-gram vectorizer
    Xa_tr_c = cvec.transform(sA_tr); Xt_tr_c = cvec.transform(sT_tr)
    Xa_va_c = cvec.transform(sA_va); Xt_va_c = cvec.transform(sT_va)
    Xa_te_c = cvec.transform(sA_te); Xt_te_c = cvec.transform(sT_te)
    cos_tr_c = rowwise_cosine(Xa_tr_c, Xt_tr_c)
    cos_va_c = rowwise_cosine(Xa_va_c, Xt_va_c)
    cos_te_c = rowwise_cosine(Xa_te_c, Xt_te_c)

    # Length features + simple interactions
    la_tr, lt_tr, ad_tr, rs_tr = build_len_feats(df_tr)
    la_va, lt_va, ad_va, rs_va = build_len_feats(df_va)
    la_te, lt_te, ad_te, rs_te = build_len_feats(test)

    # Feature order must match across train/valid/test:
    # [cos_w, cos_c, len_a, len_t, absdiff, ratio_sym, len_a*cos_w, len_t*cos_c]
    X_tr = np.vstack([
        cos_tr_w, cos_tr_c, la_tr, lt_tr, ad_tr, rs_tr,
        (la_tr * cos_tr_w), (lt_tr * cos_tr_c)
    ]).T.astype(np.float32)
    X_va = np.vstack([
        cos_va_w, cos_va_c, la_va, lt_va, ad_va, rs_va,
        (la_va * cos_va_w), (lt_va * cos_va_c)
    ]).T.astype(np.float32)
    X_te = np.vstack([
        cos_te_w, cos_te_c, la_te, lt_te, ad_te, rs_te,
        (la_te * cos_te_w), (lt_te * cos_te_c)
    ]).T.astype(np.float32)

    # Scaler statistics come from the training fold only and are then applied
    # unchanged to validation and test.
    scaler = StandardScaler(with_mean=True, with_std=True)
    X_tr_s = scaler.fit_transform(X_tr)
    X_va_s = scaler.transform(X_va)
    X_te_s = scaler.transform(X_te)

    y_tr = y[tr_idx]
    y_va = y[va_idx]

    model = Ridge(alpha=1.5, random_state=42)
    t_fit = time.time()
    model.fit(X_tr_s, y_tr)
    pred_va = model.predict(X_va_s)
    pr = pearsonr(y_va, pred_va)[0]
    # The reported "fit" time spans fit, validation predict and pearsonr.
    print(f'  F{f} Pearson: {pr:.6f} (fit {time.time()-t_fit:.2f}s)', flush=True)
    oof[va_idx] = pred_va

    pred_te = model.predict(X_te_s).astype(np.float32)
    test_preds_folds.append(pred_te)
    print(f'Fold {f} done in {time.time()-t_fold:.2f}s', flush=True)

# Clip OOF predictions to [0,1] before computing the overall Pearson.
oof_clip = np.clip(oof, 0.0, 1.0)
oof_pr = float(pearsonr(y, oof_clip)[0])
print(f'OOF Pearson (clipped): {oof_pr:.6f}', flush=True)

# Test prediction = mean over the per-fold models, then clipped to [0,1].
test_pred = np.mean(np.vstack(test_preds_folds), axis=0)
test_pred = np.clip(test_pred, 0.0, 1.0).astype(np.float32)

# Cache OOF/test predictions for later reuse.
Path('artifacts').mkdir(exist_ok=True)
np.save('artifacts/oof_dense_ridge.npy', oof_clip)
np.save('artifacts/test_dense_ridge.npy', test_pred)
pd.DataFrame({'dedup_idx': np.arange(len(dedup)), 'oof': oof_clip}).to_csv('artifacts/oof_dense_ridge.csv', index=False)

# Expand OOF for diagnostics: every original train row gets the OOF
# prediction of the de-duplicated row it maps to.
oof_full = oof_clip[orig_map['dedup_idx'].values]
pd.DataFrame({'id': train['id'], 'oof': oof_full, 'score': train['score']}).to_csv('artifacts/oof_dense_full_rows.csv', index=False)

# Build submission.
# NOTE(review): this writes to the same 'submission.csv' path as the TF-IDF +
# Ridge cell, replacing that file — confirm this is intended.
pd.DataFrame({'id': test['id'], 'score': test_pred}).to_csv('submission.csv', index=False)
print('Saved submission.csv; Total elapsed: %.2fs' % (time.time()-t0))

In [None]:
# Diagnostic: check raw cosine correlations (no model) to locate issue
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize as l2_normalize
from scipy.stats import pearsonr

def side_text_a(df):
    """Return '<context> <anchor>' strings, one per row, as a values array."""
    ctx_col = df['_context_n'].astype(str)
    anchor_col = df['_anchor_n'].astype(str)
    return (ctx_col + ' ' + anchor_col).values
def side_text_t(df):
    """Return '<context> <target>' strings, one per row, as a values array."""
    ctx_col = df['_context_n'].astype(str)
    target_col = df['_target_n'].astype(str)
    return (ctx_col + ' ' + target_col).values
def rowwise_cosine(Xa, Xt):
    """Cosine similarity between row i of ``Xa`` and row i of ``Xt``.

    Each input is L2-normalised per row, the two are multiplied element-wise
    via ``.multiply``, and the row sums are returned as a flat array.
    ``copy=False`` asks the normaliser to avoid copying, so the inputs may be
    modified in place.
    NOTE(review): the ``.multiply`` call presumes scipy sparse matrices with
    matching shapes; confirm against callers.
    """
    a_unit = l2_normalize(Xa, axis=1, copy=False)
    t_unit = l2_normalize(Xt, axis=1, copy=False)
    dots = a_unit.multiply(t_unit).sum(axis=1)
    return np.asarray(dots).ravel()

# keep_default_na=False: the normalized text columns were written with '' for
# missing values, and phrases such as "null" or "na" are legitimate text.
# pandas' default NA parsing would turn them into NaN, which astype(str) then
# renders as the literal string 'nan' and distorts the cosine features.
# Numeric columns are still inferred as numeric.
dedup = pd.read_csv('artifacts/train_dedup_folds.csv', keep_default_na=False)
y = dedup['score_mean'].values.astype(np.float32)
folds = dedup['fold'].values.astype(int)

# Per-fold Pearson between the target and the raw word / char cosine features.
# No model is fitted; the vectorizers are fitted on the training folds and
# the cosines are evaluated on the held-out fold only.
prs_w = []
prs_c = []
for f in sorted(np.unique(folds)):
    tr_idx = np.where(folds != f)[0]
    va_idx = np.where(folds == f)[0]
    df_tr = dedup.iloc[tr_idx].reset_index(drop=True)
    df_va = dedup.iloc[va_idx].reset_index(drop=True)
    sA_tr = side_text_a(df_tr); sT_tr = side_text_t(df_tr)
    sA_va = side_text_a(df_va); sT_va = side_text_t(df_va)
    corpus = np.concatenate([sA_tr, sT_tr], axis=0)
    wvec = TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=2, max_features=100_000)
    cvec = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,5), min_df=2, max_features=200_000)
    wvec.fit(corpus); cvec.fit(corpus)
    cos_w = rowwise_cosine(wvec.transform(sA_va), wvec.transform(sT_va))
    cos_c = rowwise_cosine(cvec.transform(sA_va), cvec.transform(sT_va))
    pr_w = pearsonr(y[va_idx], cos_w)[0]
    pr_c = pearsonr(y[va_idx], cos_c)[0]
    prs_w.append(pr_w); prs_c.append(pr_c)
    # Also print min/max of each cosine to expose degenerate ranges.
    print(f'Fold {f}: cos_word Pearson={pr_w:.4f}, cos_char Pearson={pr_c:.4f}, ranges: w[{cos_w.min():.3f},{cos_w.max():.3f}] c[{cos_c.min():.3f},{cos_c.max():.3f}]')

# Unweighted mean of the per-fold correlations.
print('Mean cos_word Pearson:', float(np.mean(prs_w)))
print('Mean cos_char Pearson:', float(np.mean(prs_c)))

In [1]:
# Install PyTorch cu121 + transformers stack and sanity check GPU
import os, sys, subprocess, shutil, time
from pathlib import Path

def pip(*args):
    """Echo the arguments, then run ``python -m pip <args>`` with this interpreter.

    ``check=True`` makes a non-zero pip exit raise
    ``subprocess.CalledProcessError``.
    """
    print('>', *args, flush=True)
    command = [sys.executable, '-m', 'pip']
    command.extend(args)
    subprocess.run(command, check=True)

# Uninstall any preinstalled torch stack to avoid conflicts.
# check=False: a package that is not installed must not abort the cell.
for pkg in ('torch','torchvision','torchaudio'):
    subprocess.run([sys.executable, '-m', 'pip', 'uninstall', '-y', pkg], check=False)

# Clean stray site dirs that can shadow correct wheels (idempotent):
# only paths that exist are removed, and rmtree errors are ignored.
for d in (
    '/app/.pip-target/torch',
    '/app/.pip-target/torch-2.8.0.dist-info',
    '/app/.pip-target/torch-2.4.1.dist-info',
    '/app/.pip-target/torchvision',
    '/app/.pip-target/torchvision-0.23.0.dist-info',
    '/app/.pip-target/torchvision-0.19.1.dist-info',
    '/app/.pip-target/torchaudio',
    '/app/.pip-target/torchaudio-2.8.0.dist-info',
    '/app/.pip-target/torchaudio-2.4.1.dist-info',
    '/app/.pip-target/torchgen',
    '/app/.pip-target/functorch',
):
    if os.path.exists(d):
        print('Removing', d); shutil.rmtree(d, ignore_errors=True)

# Install exact cu121 torch stack; the PyTorch index is primary and PyPI is
# the extra index used for the remaining dependencies.
pip('install',
    '--index-url', 'https://download.pytorch.org/whl/cu121',
    '--extra-index-url', 'https://pypi.org/simple',
    'torch==2.4.1', 'torchvision==0.19.1', 'torchaudio==2.4.1')

# Freeze versions for later installs (passed to pip via -c below)
Path('constraints.txt').write_text('torch==2.4.1\ntorchvision==0.19.1\ntorchaudio==2.4.1\n')

# Install transformer ecosystem (avoid upgrading torch)
pip('install', '-c', 'constraints.txt',
    'transformers==4.44.2', 'accelerate==0.34.2',
    'datasets==2.21.0', 'evaluate==0.4.2',
    'sentencepiece', 'scikit-learn', 'sentence-transformers==3.0.1',
    '--upgrade-strategy', 'only-if-needed')

# Sanity check GPU: torch is imported only after the installs above, and the
# cell fails fast if the CUDA build is not 12.1 or no CUDA device is usable.
import torch
print('torch:', torch.__version__, 'CUDA build:', getattr(torch.version, 'cuda', None))
print('CUDA available:', torch.cuda.is_available())
assert str(getattr(torch.version,'cuda','')).startswith('12.1'), f'Wrong CUDA build: {torch.version.cuda}'
assert torch.cuda.is_available(), 'CUDA not available'
print('GPU:', torch.cuda.get_device_name(0))

Found existing installation: torch 2.4.1+cu121


Uninstalling torch-2.4.1+cu121:
  Successfully uninstalled torch-2.4.1+cu121


Found existing installation: torchvision 0.19.1+cu121
Uninstalling torchvision-0.19.1+cu121:
  Successfully uninstalled torchvision-0.19.1+cu121


Found existing installation: torchaudio 2.4.1+cu121
Uninstalling torchaudio-2.4.1+cu121:
  Successfully uninstalled torchaudio-2.4.1+cu121
Removing /app/.pip-target/torch-2.4.1.dist-info
> install --index-url https://download.pytorch.org/whl/cu121 --extra-index-url https://pypi.org/simple torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1


Looking in indexes: https://download.pytorch.org/whl/cu121, https://pypi.org/simple


Collecting torch==2.4.1
  Downloading https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp311-cp311-linux_x86_64.whl (799.0 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 799.0/799.0 MB 551.9 MB/s eta 0:00:00


Collecting torchvision==0.19.1
  Downloading https://download.pytorch.org/whl/cu121/torchvision-0.19.1%2Bcu121-cp311-cp311-linux_x86_64.whl (7.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.1/7.1 MB 275.7 MB/s eta 0:00:00


Collecting torchaudio==2.4.1
  Downloading https://download.pytorch.org/whl/cu121/torchaudio-2.4.1%2Bcu121-cp311-cp311-linux_x86_64.whl (3.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.4/3.4 MB 525.6 MB/s eta 0:00:00
Collecting nvidia-cublas-cu12==12.1.3.1


  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 410.6/410.6 MB 286.5 MB/s eta 0:00:00


Collecting nvidia-cusolver-cu12==11.4.5.107
  Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 124.2/124.2 MB 169.7 MB/s eta 0:00:00


Collecting nvidia-curand-cu12==10.3.2.106
  Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.5/56.5 MB 571.6 MB/s eta 0:00:00


Collecting nvidia-cuda-runtime-cu12==12.1.105
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 823.6/823.6 KB 290.1 MB/s eta 0:00:00
Collecting filelock
  Downloading filelock-3.19.1-py3-none-any.whl (15 kB)


Collecting typing-extensions>=4.8.0
  Downloading typing_extensions-4.15.0-py3-none-any.whl (44 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 44.6/44.6 KB 403.9 MB/s eta 0:00:00
Collecting nvidia-cuda-nvrtc-cu12==12.1.105
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 23.7/23.7 MB 238.2 MB/s eta 0:00:00


Collecting nvidia-cuda-cupti-cu12==12.1.105
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.1/14.1 MB 361.7 MB/s eta 0:00:00


Collecting nvidia-nccl-cu12==2.20.5
  Downloading nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl (176.2 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 176.2/176.2 MB 76.0 MB/s eta 0:00:00


Collecting nvidia-cudnn-cu12==9.1.0.70
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 664.8/664.8 MB 137.9 MB/s eta 0:00:00


Collecting networkx
  Downloading networkx-3.5-py3-none-any.whl (2.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 510.0 MB/s eta 0:00:00


Collecting nvidia-cufft-cu12==11.0.2.54
  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.6/121.6 MB 238.3 MB/s eta 0:00:00


Collecting triton==3.0.0
  Downloading triton-3.0.0-1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (209.4 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 209.4/209.4 MB 79.0 MB/s eta 0:00:00


Collecting fsspec
  Downloading fsspec-2025.9.0-py3-none-any.whl (199 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 199.3/199.3 KB 486.7 MB/s eta 0:00:00


Collecting nvidia-cusparse-cu12==12.1.0.106
  Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 196.0/196.0 MB 508.1 MB/s eta 0:00:00


Collecting nvidia-nvtx-cu12==12.1.105
  Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.1/99.1 KB 465.1 MB/s eta 0:00:00


Collecting sympy
  Downloading sympy-1.14.0-py3-none-any.whl (6.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.3/6.3 MB 559.4 MB/s eta 0:00:00


Collecting jinja2
  Downloading jinja2-3.1.6-py3-none-any.whl (134 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.9/134.9 KB 478.6 MB/s eta 0:00:00


Collecting pillow!=8.3.*,>=5.3.0
  Downloading pillow-11.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (6.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.6/6.6 MB 553.5 MB/s eta 0:00:00


Collecting numpy
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.3/18.3 MB 233.5 MB/s eta 0:00:00


Collecting nvidia-nvjitlink-cu12


  Downloading nvidia_nvjitlink_cu12-12.9.86-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl (39.7 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 39.7/39.7 MB 287.8 MB/s eta 0:00:00


Collecting MarkupSafe>=2.0
  Downloading MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23 kB)


Collecting mpmath<1.4,>=1.1.0
  Downloading mpmath-1.3.0-py3-none-any.whl (536 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 536.2/536.2 KB 518.1 MB/s eta 0:00:00


Installing collected packages: mpmath, typing-extensions, sympy, pillow, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, numpy, networkx, MarkupSafe, fsspec, filelock, triton, nvidia-cusparse-cu12, nvidia-cudnn-cu12, jinja2, nvidia-cusolver-cu12, torch, torchvision, torchaudio


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 2.21.0 requires fsspec[http]<=2024.6.1,>=2023.1.0, but you have fsspec 2025.9.0 which is incompatible.


Successfully installed MarkupSafe-3.0.2 filelock-3.19.1 fsspec-2025.9.0 jinja2-3.1.6 mpmath-1.3.0 networkx-3.5 numpy-1.26.4 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-9.1.0.70 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.20.5 nvidia-nvjitlink-cu12-12.9.86 nvidia-nvtx-cu12-12.1.105 pillow-11.3.0 sympy-1.14.0 torch-2.4.1+cu121 torchaudio-2.4.1+cu121 torchvision-0.19.1+cu121 triton-3.0.0 typing-extensions-4.15.0




> install -c constraints.txt transformers==4.44.2 accelerate==0.34.2 datasets==2.21.0 evaluate==0.4.2 sentencepiece scikit-learn sentence-transformers==3.0.1 --upgrade-strategy only-if-needed


Collecting transformers==4.44.2
  Downloading transformers-4.44.2-py3-none-any.whl (9.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 9.5/9.5 MB 170.8 MB/s eta 0:00:00
Collecting accelerate==0.34.2
  Downloading accelerate-0.34.2-py3-none-any.whl (324 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 324.4/324.4 KB 524.5 MB/s eta 0:00:00
Collecting datasets==2.21.0
  Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 527.3/527.3 KB 530.7 MB/s eta 0:00:00


Collecting evaluate==0.4.2
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 84.1/84.1 KB 401.5 MB/s eta 0:00:00
Collecting sentencepiece
  Downloading sentencepiece-0.2.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (1.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.4/1.4 MB 356.8 MB/s eta 0:00:00


Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (9.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 9.7/9.7 MB 208.9 MB/s eta 0:00:00
Collecting sentence-transformers==3.0.1
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 227.1/227.1 KB 533.9 MB/s eta 0:00:00


Collecting tqdm>=4.27
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.5/78.5 KB 433.6 MB/s eta 0:00:00


Collecting numpy>=1.17
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.3/18.3 MB 531.5 MB/s eta 0:00:00


Collecting tokenizers<0.20,>=0.19
  Downloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.6/3.6 MB 336.9 MB/s eta 0:00:00


Collecting safetensors>=0.4.1
  Downloading safetensors-0.6.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (485 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 485.8/485.8 KB 473.9 MB/s eta 0:00:00
Collecting requests
  Downloading requests-2.32.5-py3-none-any.whl (64 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 64.7/64.7 KB 439.7 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.23.2
  Downloading huggingface_hub-0.35.1-py3-none-any.whl (563 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 563.3/563.3 KB 526.4 MB/s eta 0:00:00
Collecting packaging>=20.0
  Downloading packaging-25.0-py3-none-any.whl (66 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 66.5/66.5 KB 418.6 MB/s eta 0:00:00
Collecting filelock
  Downloading filelock-3.19.1-py3-none-any.whl (15 kB)


Collecting pyyaml>=5.1
  Downloading PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (762 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 763.0/763.0 KB 521.1 MB/s eta 0:00:00


Collecting regex!=2019.12.17
  Downloading regex-2025.9.18-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (798 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 799.0/799.0 KB 539.8 MB/s eta 0:00:00
Collecting psutil
  Downloading psutil-7.1.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (291 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 291.2/291.2 KB 514.0 MB/s eta 0:00:00


Collecting torch>=1.10.0
  Downloading torch-2.4.1-cp311-cp311-manylinux1_x86_64.whl (797.1 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 797.1/797.1 MB 289.2 MB/s eta 0:00:00


Collecting xxhash
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 194.8/194.8 KB 485.0 MB/s eta 0:00:00
Collecting pyarrow>=15.0.0
  Downloading pyarrow-21.0.0-cp311-cp311-manylinux_2_28_x86_64.whl (42.8 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 42.8/42.8 MB 249.6 MB/s eta 0:00:00
Collecting dill<0.3.9,>=0.3.0
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 KB 470.5 MB/s eta 0:00:00


Collecting aiohttp
  Downloading aiohttp-3.12.15-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.7/1.7 MB 217.4 MB/s eta 0:00:00


Collecting pandas
  Downloading pandas-2.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.4/12.4 MB 494.3 MB/s eta 0:00:00
Collecting multiprocess
  Downloading multiprocess-0.70.18-py311-none-any.whl (144 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 144.5/144.5 KB 472.3 MB/s eta 0:00:00
Collecting fsspec[http]<=2024.6.1,>=2023.1.0
  Downloading fsspec-2024.6.1-py3-none-any.whl (177 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 177.6/177.6 KB 483.7 MB/s eta 0:00:00


Collecting Pillow
  Downloading pillow-11.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (6.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.6/6.6 MB 397.5 MB/s eta 0:00:00


Collecting scipy
  Downloading scipy-1.16.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (35.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 35.9/35.9 MB 254.2 MB/s eta 0:00:00


Collecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Collecting joblib>=1.2.0
  Downloading joblib-1.5.2-py3-none-any.whl (308 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 308.4/308.4 KB 449.1 MB/s eta 0:00:00


Collecting yarl<2.0,>=1.17.0
  Downloading yarl-1.20.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (348 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 349.0/349.0 KB 489.1 MB/s eta 0:00:00
Collecting aiosignal>=1.4.0
  Downloading aiosignal-1.4.0-py3-none-any.whl (7.5 kB)
Collecting aiohappyeyeballs>=2.5.0
  Downloading aiohappyeyeballs-2.6.1-py3-none-any.whl (15 kB)
Collecting propcache>=0.2.0
  Downloading propcache-0.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 213.5/213.5 KB 450.6 MB/s eta 0:00:00


Collecting frozenlist>=1.1.1
  Downloading frozenlist-1.7.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (235 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 235.3/235.3 KB 457.6 MB/s eta 0:00:00
Collecting attrs>=17.3.0
  Downloading attrs-25.3.0-py3-none-any.whl (63 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 63.8/63.8 KB 425.6 MB/s eta 0:00:00


Collecting multidict<7.0,>=4.5
  Downloading multidict-6.6.4-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (246 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 246.7/246.7 KB 505.9 MB/s eta 0:00:00
Collecting hf-xet<2.0.0,>=1.1.3
  Downloading hf_xet-1.1.10-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.2/3.2 MB 534.7 MB/s eta 0:00:00
Collecting typing-extensions>=3.7.4.3
  Downloading typing_extensions-4.15.0-py3-none-any.whl (44 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 44.6/44.6 KB 369.1 MB/s eta 0:00:00


Collecting charset_normalizer<4,>=2
  Downloading charset_normalizer-3.4.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (150 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 150.3/150.3 KB 484.8 MB/s eta 0:00:00
Collecting certifi>=2017.4.17
  Downloading certifi-2025.8.3-py3-none-any.whl (161 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 161.2/161.2 KB 501.7 MB/s eta 0:00:00
Collecting urllib3<3,>=1.21.1
  Downloading urllib3-2.5.0-py3-none-any.whl (129 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.8/129.8 KB 439.6 MB/s eta 0:00:00
Collecting idna<4,>=2.5
  Downloading idna-3.10-py3-none-any.whl (70 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 70.4/70.4 KB 412.6 MB/s eta 0:00:00


Collecting sympy
  Downloading sympy-1.14.0-py3-none-any.whl (6.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.3/6.3 MB 507.3 MB/s eta 0:00:00
Collecting nvidia-cusparse-cu12==12.1.0.106
  Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 196.0/196.0 MB 133.1 MB/s eta 0:00:00
Collecting triton==3.0.0


  Downloading triton-3.0.0-1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (209.4 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 209.4/209.4 MB 238.7 MB/s eta 0:00:00
Collecting nvidia-nvtx-cu12==12.1.105
  Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.1/99.1 KB 426.8 MB/s eta 0:00:00
Collecting nvidia-cusolver-cu12==11.4.5.107


  Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 124.2/124.2 MB 317.1 MB/s eta 0:00:00
Collecting jinja2
  Downloading jinja2-3.1.6-py3-none-any.whl (134 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.9/134.9 KB 494.5 MB/s eta 0:00:00
Collecting networkx
  Downloading networkx-3.5-py3-none-any.whl (2.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 405.1 MB/s eta 0:00:00
Collecting nvidia-cudnn-cu12==9.1.0.70
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 664.8/664.8 MB 129.5 MB/s eta 0:00:00


Collecting nvidia-curand-cu12==10.3.2.106
  Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.5/56.5 MB 293.1 MB/s eta 0:00:00
Collecting nvidia-cublas-cu12==12.1.3.1
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 410.6/410.6 MB 233.7 MB/s eta 0:00:00


Collecting nvidia-cufft-cu12==11.0.2.54
  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.6/121.6 MB 302.4 MB/s eta 0:00:00
Collecting nvidia-nccl-cu12==2.20.5
  Downloading nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl (176.2 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 176.2/176.2 MB 270.7 MB/s eta 0:00:00
Collecting nvidia-cuda-nvrtc-cu12==12.1.105
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 23.7/23.7 MB 287.3 MB/s eta 0:00:00
Collecting nvidia-cuda-runtime-cu12==12.1.105
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 823.6/823.6 KB 506.4 MB/s eta 0:00:00
Collecting nvidia-cuda-cupti-cu12==12.1.105
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.1/14.1 MB 183.3 MB/s eta 0:00:00
Collecting nvidia-nvjitlink-cu12
  Downloading nvidia_nvjitlink_cu12-12.9.86-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl (39.7 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 39.7/39.7 MB 170.1 MB/s eta 0:00:00
Collecting multiprocess
  Downloading multiprocess-0.70.17-py311-none-any.whl (144 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 144.3/144.3 KB 475.7 MB/s eta 0:00:00
  Downloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 143.5/143.5 KB 484.7 MB/s eta 0:00:00
Collecting pytz>=2020.1
  Downloading pytz-2025.2-py2.py3-none-any.whl (509 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 509.2/509.2 KB 361.9 MB/s eta 0:00:00


Collecting python-dateutil>=2.8.2
  Downloading python_dateutil-2.9.0.post0-py2.py3-none-any.whl (229 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 229.9/229.9 KB 438.2 MB/s eta 0:00:00
Collecting tzdata>=2022.7
  Downloading tzdata-2025.2-py2.py3-none-any.whl (347 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 347.8/347.8 KB 525.4 MB/s eta 0:00:00


Collecting six>=1.5
  Downloading six-1.17.0-py2.py3-none-any.whl (11 kB)
Collecting MarkupSafe>=2.0
  Downloading MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23 kB)
Collecting mpmath<1.4,>=1.1.0
  Downloading mpmath-1.3.0-py3-none-any.whl (536 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 536.2/536.2 KB 492.9 MB/s eta 0:00:00


Installing collected packages: pytz, mpmath, xxhash, urllib3, tzdata, typing-extensions, tqdm, threadpoolctl, sympy, six, sentencepiece, safetensors, regex, pyyaml, pyarrow, psutil, propcache, Pillow, packaging, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, numpy, networkx, multidict, MarkupSafe, joblib, idna, hf-xet, fsspec, frozenlist, filelock, dill, charset_normalizer, certifi, attrs, aiohappyeyeballs, yarl, triton, scipy, requests, python-dateutil, nvidia-cusparse-cu12, nvidia-cudnn-cu12, multiprocess, jinja2, aiosignal, scikit-learn, pandas, nvidia-cusolver-cu12, huggingface-hub, aiohttp, torch, tokenizers, transformers, datasets, accelerate, sentence-transformers, evaluate


Successfully installed MarkupSafe-3.0.2 Pillow-11.3.0 accelerate-0.34.2 aiohappyeyeballs-2.6.1 aiohttp-3.12.15 aiosignal-1.4.0 attrs-25.3.0 certifi-2025.8.3 charset_normalizer-3.4.3 datasets-2.21.0 dill-0.3.8 evaluate-0.4.2 filelock-3.19.1 frozenlist-1.7.0 fsspec-2024.6.1 hf-xet-1.1.10 huggingface-hub-0.35.1 idna-3.10 jinja2-3.1.6 joblib-1.5.2 mpmath-1.3.0 multidict-6.6.4 multiprocess-0.70.16 networkx-3.5 numpy-1.26.4 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-9.1.0.70 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.20.5 nvidia-nvjitlink-cu12-12.9.86 nvidia-nvtx-cu12-12.1.105 packaging-25.0 pandas-2.3.2 propcache-0.3.2 psutil-7.1.0 pyarrow-21.0.0 python-dateutil-2.9.0.post0 pytz-2025.2 pyyaml-6.0.2 regex-2025.9.18 requests-2.32.5 safetensors-0.6.2 scikit-learn-1.7.2 scipy-1.16.2 sentence-transfo



torch: 2.4.1+cu121 CUDA build: 12.1
CUDA available: False


AssertionError: CUDA not available