## **Этот ноутбук**: берёт запросы пользователей (синтетические + из других датасетов), векторизует их с помощью bge-m3 и сохраняет полученные эмбеддинги.

In [None]:
# -*- coding: utf-8 -*-
import os, math, torch, pandas as pd, numpy as np
from typing import List, Dict, Any
from transformers import AutoTokenizer, AutoModel

# ---- config ----
# Hugging Face model id passed to the tokenizer and model loaders.
MODEL_NAME = "BAAI/bge-m3"
# Column name under which each query text is written to the index CSV.
TEXT_COL = "text"
# Number of texts sent to the encoder per call.
BATCH_SIZE = 2048
# Tokenizer truncation limit (`max_length`) applied to every text.
MAX_LEN = 42
# Indices into the model's `hidden_states` tuple whose pooled embeddings are saved.
CHECKPOINT_LAYERS = [13, 17, 24]

# ---- data ----
# Flatten the whole frame into one list of texts in row-major order
# (row 0: every column, then row 1: every column, ...).
df = pd.read_parquet("/kaggle/input/queries/transef_lerning_merged_10_langueges.parquet")
# One vectorised flatten instead of a Python loop doing a scalar `.iloc`
# lookup per cell; `dtype=object` keeps each cell's value uncoerced and
# `ravel()` (C order) preserves the row-by-row ordering of the original loop.
texts = df.to_numpy(dtype=object).ravel().tolist()

# ---- CUDA tuning ----
# Configure the PyTorch CUDA caching allocator via its environment variable.
# NOTE(review): this is set after `import torch`; confirm the allocator still
# picks it up (it must be in place before the first CUDA allocation).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128,expandable_segments:True,garbage_collection_threshold:0.6"
# Allow TF32 for CUDA matmuls.
torch.backends.cuda.matmul.allow_tf32 = True
# Scaled-dot-product attention backends: enable the flash and memory-efficient
# kernels and disable the plain math implementation.
torch.backends.cuda.enable_flash_sdp(True)
torch.backends.cuda.enable_mem_efficient_sdp(True)
torch.backends.cuda.enable_math_sdp(False)

# ---- pooling ----
def mean_pool(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average token vectors along dim 1, counting only unmasked positions.

    Args:
        last_hidden_state: Per-token vectors; dim 1 is the axis that gets
            reduced (presumably ``(batch, seq, hidden)`` -- confirm with caller).
        attention_mask: Mask with one entry per token (non-zero = keep). It
            is cast to the dtype/device of ``last_hidden_state``.

    Returns:
        The mask-weighted sum over dim 1 divided by the number of kept
        tokens. The divisor is floored at ``1e-6`` so a fully masked row
        yields zeros instead of dividing by zero.
    """
    weights = attention_mask[..., None].type_as(last_hidden_state)
    token_total = torch.sum(last_hidden_state * weights, dim=1)
    token_count = torch.clamp(weights.sum(dim=1), min=1e-6)
    return token_total / token_count

# ---- model ----
# Load the tokenizer and encoder named by MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModel.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,  # load weights in half precision
    device_map="auto",  # let the loader choose device placement
    # NOTE(review): trust_remote_code=True permits running code shipped with
    # the model repo -- confirm it is actually required for this model.
    trust_remote_code=True,
)
# Put the model in evaluation mode; it is only used for inference here.
model.eval()

@torch.inference_mode()
def encode_checkpoints(batch_texts: List[str]) -> Dict[int, np.ndarray]:
    """Embed a batch of texts at every layer listed in CHECKPOINT_LAYERS.

    The texts are tokenized (padded, truncated to MAX_LEN), run through the
    model with all hidden states requested, and for each checkpoint layer the
    hidden state is mean-pooled over tokens and L2-normalized.

    Args:
        batch_texts: Texts to encode in a single forward pass.

    Returns:
        Mapping ``layer index -> float16 NumPy array`` holding one pooled,
        unit-length vector per input text (presumably ``(batch, hidden)``).
    """
    # Inputs go to the device that holds the model's first parameter.
    target_device = next(model.parameters()).device
    encoded = tokenizer(
        batch_texts,
        padding=True,
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt",
    )
    inputs = {name: tensor.to(target_device) for name, tensor in encoded.items()}

    with torch.autocast("cuda", dtype=torch.float16):
        hidden_states = model(**inputs, output_hidden_states=True, return_dict=True).hidden_states

    attention_mask = inputs["attention_mask"]
    return {
        layer: torch.nn.functional.normalize(
            mean_pool(hidden_states[layer], attention_mask), p=2, dim=-1
        ).half().cpu().numpy()
        for layer in CHECKPOINT_LAYERS
    }

# ---- batching and saving ----
# Encode this run's share of `texts` batch by batch and write the embeddings
# to disk in roughly `split_on` parts, each an .npz plus a matching index CSV.

# This run handles slice `global_part_num` (1-based) out of
# `global_part_count` contiguous, near-equal slices of the full text list.
global_part_num = 2
global_part_count = 2
texts = texts[len(texts)*(global_part_num-1)//global_part_count : len(texts)*global_part_num//global_part_count]

# Count batches AFTER slicing. Counting on the full list made `save_every`
# too large (fewer, bigger parts than `split_on` asks for) and made the loop
# iterate over empty trailing batches.
num_batches = math.ceil(len(texts) / BATCH_SIZE)

split_on = 5
save_every = max(1, num_batches // split_on)

file_part = 1
buf_layers = {L: [] for L in CHECKPOINT_LAYERS}
buf_index = []


def _flush_part(part_no, layer_chunks, index_rows):
    """Write one output part: per-layer float16 embeddings and the index CSV.

    NOTE(review): file names do not include `global_part_num`, so outputs of
    different runs overwrite each other if written to the same directory.
    """
    arrs = {f"layer_{L}": np.concatenate(layer_chunks[L], axis=0).astype(np.float16) for L in CHECKPOINT_LAYERS}
    np.savez_compressed(f"embeds_part{part_no:03d}.npz", **arrs)
    pd.DataFrame(index_rows).to_csv(f"index_part{part_no:03d}.csv", index=False)
    print(f"Saved part {part_no} | rows={len(index_rows)}")


for bi in range(num_batches):
    start = bi * BATCH_SIZE
    batch = texts[start:start + BATCH_SIZE]
    if not batch:
        continue

    pooled_dict = encode_checkpoints(batch)

    # Accumulate embeddings and index rows until the next flush.
    for L in CHECKPOINT_LAYERS:
        buf_layers[L].append(pooled_dict[L])
    # `idx` is the position within this run's slice of `texts`, not within
    # the full dataset.
    buf_index.extend({"idx": start + i, TEXT_COL: text} for i, text in enumerate(batch))

    # Flush a part every `save_every` batches.
    if (bi + 1) % save_every == 0 and buf_index:
        _flush_part(file_part, buf_layers, buf_index)
        file_part += 1
        buf_layers = {L: [] for L in CHECKPOINT_LAYERS}
        buf_index = []

# Tail: whatever is left when the batch count is not a multiple of `save_every`.
if buf_index:
    _flush_part(file_part, buf_layers, buf_index)


tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

2025-09-16 03:48:15.697186: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757994495.895525      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757994495.951237      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

Saved part 1 | rows=272384
Saved part 2 | rows=272384
Saved part 3 | rows=139662
