# Дообучение E5 на MIRACL

Fine-tuning multilingual-e5-base с использованием hard negatives.

In [1]:
!pip install pyarrow==18.1.0 -q
!pip install datasets==2.14.0 sentence-transformers sentencepiece faiss-cpu rank-bm25 torch tqdm -q
!pip install huggingface_hub gdown -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 MB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-adk 1.22.1 requires google-cloud-bigquery-storage>=2.0.0, which is not installed.
bigframes 2.26.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
datasets 4.4.2 requires pyarrow>=21.0.0, but you have pyarrow 18.1.0 which is incompatible.
bigframes 2.26.0 requires rich<14,>=12.4.4, but you have rich 14.2.0 which is incompatible.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.2/492.2 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m76.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━

In [2]:
# Скачивание FAISS индекса
import gdown

file_id = "1_z6Kup484-UKe4UxTTKRN2fzPmtYLsKn"
url = f"https://drive.google.com/uc?id={file_id}&export=download"
output = "dense_e5.faiss"

gdown.download(url, output, quiet=False)
print(f"Индекс скачан: {output}")

Downloading...
From (original): https://drive.google.com/uc?id=1_z6Kup484-UKe4UxTTKRN2fzPmtYLsKn&export=download
From (redirected): https://drive.google.com/uc?id=1_z6Kup484-UKe4UxTTKRN2fzPmtYLsKn&export=download&confirm=t&uuid=bbaccbfa-30e6-4b06-9397-e6882e0592f1
To: /kaggle/working/dense_e5.faiss
100%|██████████| 1.66G/1.66G [00:13<00:00, 126MB/s] 

Индекс скачан: dense_e5.faiss





In [3]:
import warnings
warnings.filterwarnings('ignore')

import os
import json
import gzip
import random
import requests
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from collections import defaultdict
from tqdm.notebook import tqdm
from huggingface_hub import hf_hub_download, list_repo_files
import sentencepiece as spm
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from torch.cuda.amp import autocast, GradScaler
import faiss
import gc
import time

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

print(f"CUDA: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

2026-01-22 14:00:12.429477: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1769090412.652774      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1769090412.719484      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1769090413.281241      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769090413.281273      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769090413.281276      55 computation_placer.cc:177] computation placer alr

CUDA: True
GPU: Tesla T4
VRAM: 15.8 GB


## 1. Загрузка данных

In [4]:
BASE_URL = "https://huggingface.co/datasets/miracl/miracl/resolve/main/miracl-v1.0-ru/"

def load_queries(split: str) -> dict:
    url = f"{BASE_URL}topics/topics.miracl-v1.0-ru-{split}.tsv"
    try:
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        queries = {}
        for line in response.text.strip().split('\n')[1:]:
            parts = line.split('\t')
            if len(parts) >= 2:
                queries[parts[0]] = parts[1]
        return queries
    except:
        return {}

def load_qrels(split: str) -> dict:
    url = f"{BASE_URL}qrels/qrels.miracl-v1.0-ru-{split}.tsv"
    try:
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        qrels = defaultdict(lambda: {'positive': [], 'negative': []})
        for line in response.text.strip().split('\n'):
            parts = line.split('\t')
            if len(parts) >= 4:
                qid, doc_id, rel = parts[0], parts[2], int(parts[3])
                key = 'positive' if rel > 0 else 'negative'
                qrels[qid][key].append(doc_id)
        return dict(qrels)
    except:
        return {}

def get_required_doc_ids(qrels_dict: dict) -> set:
    doc_ids = set()
    for q in qrels_dict.values():
        doc_ids.update(q.get('positive', []))
        doc_ids.update(q.get('negative', []))
    return doc_ids

In [5]:
dev_queries = load_queries('dev')
dev_qrels = load_qrels('dev')
train_queries = load_queries('train')
train_qrels = load_qrels('train')

all_qrels = {**dev_qrels, **train_qrels}
required_doc_ids = get_required_doc_ids(all_qrels)

print(f"Dev: {len(dev_queries)} запросов")
print(f"Train: {len(train_queries)} запросов")

Dev: 1251 запросов
Train: 4682 запросов


In [6]:
def load_corpus(target_docs: int = 500_000, seed: int = 42) -> tuple:
    random.seed(seed)
    
    all_files = list_repo_files("miracl/miracl-corpus", repo_type="dataset")
    jsonl_files = sorted([f for f in all_files if 'ru' in f and f.endswith('.jsonl.gz')])
    docs_per_file = target_docs // len(jsonl_files)

    corpus = {}
    doc_ids = []
    passages = []

    for jsonl_file in tqdm(jsonl_files, desc="Загрузка корпуса"):
        try:
            file_path = hf_hub_download(
                repo_id="miracl/miracl-corpus",
                filename=jsonl_file,
                repo_type="dataset"
            )

            file_docs = []
            required_docs = []

            with gzip.open(file_path, 'rt', encoding='utf-8') as f:
                for line in f:
                    if not line.strip():
                        continue
                    item = json.loads(line)
                    doc = {
                        'docid': item['docid'],
                        'title': item['title'],
                        'text': item['text'],
                        'full_text': f"{item['title']}. {item['text']}"
                    }
                    
                    if item['docid'] in required_doc_ids:
                        required_docs.append(doc)
                    else:
                        file_docs.append(doc)

            sample_size = min(docs_per_file, len(file_docs))
            selected = random.sample(file_docs, sample_size) if file_docs else []
            
            for doc in selected + required_docs:
                if doc['docid'] not in corpus:
                    corpus[doc['docid']] = {
                        'title': doc['title'],
                        'text': doc['text'],
                        'full_text': doc['full_text']
                    }
                    doc_ids.append(doc['docid'])
                    passages.append(doc['full_text'])

        except Exception as e:
            print(f"Ошибка: {e}")

    return corpus, doc_ids, passages

corpus, doc_ids, passages = load_corpus(target_docs=500_000)
print(f"Загружено документов: {len(corpus):,}")

Загрузка корпуса:   0%|          | 0/20 [00:00<?, ?it/s]

miracl-corpus-v1.0-ru/docs-0.jsonl.gz:   0%|          | 0.00/100M [00:00<?, ?B/s]

miracl-corpus-v1.0-ru/docs-1.jsonl.gz:   0%|          | 0.00/98.0M [00:00<?, ?B/s]

miracl-corpus-v1.0-ru/docs-10.jsonl.gz:   0%|          | 0.00/75.7M [00:00<?, ?B/s]

miracl-corpus-v1.0-ru/docs-11.jsonl.gz:   0%|          | 0.00/78.3M [00:00<?, ?B/s]

miracl-corpus-v1.0-ru/docs-12.jsonl.gz:   0%|          | 0.00/76.5M [00:00<?, ?B/s]

miracl-corpus-v1.0-ru/docs-13.jsonl.gz:   0%|          | 0.00/78.2M [00:00<?, ?B/s]

miracl-corpus-v1.0-ru/docs-14.jsonl.gz:   0%|          | 0.00/78.3M [00:00<?, ?B/s]

miracl-corpus-v1.0-ru/docs-15.jsonl.gz:   0%|          | 0.00/79.4M [00:00<?, ?B/s]

miracl-corpus-v1.0-ru/docs-16.jsonl.gz:   0%|          | 0.00/81.4M [00:00<?, ?B/s]

miracl-corpus-v1.0-ru/docs-17.jsonl.gz:   0%|          | 0.00/82.0M [00:00<?, ?B/s]

miracl-corpus-v1.0-ru/docs-18.jsonl.gz:   0%|          | 0.00/81.0M [00:00<?, ?B/s]

miracl-corpus-v1.0-ru/docs-19.jsonl.gz:   0%|          | 0.00/6.75M [00:00<?, ?B/s]

miracl-corpus-v1.0-ru/docs-2.jsonl.gz:   0%|          | 0.00/90.1M [00:00<?, ?B/s]

miracl-corpus-v1.0-ru/docs-3.jsonl.gz:   0%|          | 0.00/89.4M [00:00<?, ?B/s]

miracl-corpus-v1.0-ru/docs-4.jsonl.gz:   0%|          | 0.00/82.0M [00:00<?, ?B/s]

miracl-corpus-v1.0-ru/docs-5.jsonl.gz:   0%|          | 0.00/88.5M [00:00<?, ?B/s]

miracl-corpus-v1.0-ru/docs-6.jsonl.gz:   0%|          | 0.00/81.1M [00:00<?, ?B/s]

miracl-corpus-v1.0-ru/docs-7.jsonl.gz:   0%|          | 0.00/80.2M [00:00<?, ?B/s]

miracl-corpus-v1.0-ru/docs-8.jsonl.gz:   0%|          | 0.00/74.5M [00:00<?, ?B/s]

miracl-corpus-v1.0-ru/docs-9.jsonl.gz:   0%|          | 0.00/73.5M [00:00<?, ?B/s]

Загружено документов: 541,007


## 2. Dense Retriever для майнинга негативов

In [7]:
MODEL_NAME = "intfloat/multilingual-e5-base"
DENSE_INDEX_PATH = "dense_e5.faiss"

class DenseRetrieverE5:
    def __init__(self, model_name: str):
        print(f"Загрузка модели: {model_name}")
        self.model = SentenceTransformer(model_name)
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        
        if self.device == 'cuda':
            self.model = self.model.to('cuda')
            self.model.half()
        
        self.index = None
        self.doc_ids = None
        self.passages = None
        self.dimension = None
    
    def _prepare_passage(self, text: str) -> str:
        return f"passage: {text}"
    
    def _prepare_query(self, text: str) -> str:
        return f"query: {text}"
    
    def fit(self, doc_ids: list, passages: list, batch_size: int = 128):
        print("Построение индекса...")
        self.doc_ids = doc_ids
        self.passages = passages
        
        with torch.no_grad():
            sample = self.model.encode(
                [self._prepare_passage(passages[0])],
                convert_to_numpy=True,
                normalize_embeddings=True
            )
        self.dimension = sample.shape[1]
        
        self.index = faiss.IndexFlatIP(self.dimension)
        
        for start_idx in tqdm(range(0, len(passages), batch_size), desc="Индексация"):
            end_idx = min(start_idx + batch_size, len(passages))
            batch = [self._prepare_passage(p) for p in passages[start_idx:end_idx]]
            
            with torch.no_grad():
                embeddings = self.model.encode(
                    batch,
                    batch_size=batch_size,
                    show_progress_bar=False,
                    convert_to_numpy=True,
                    normalize_embeddings=True
                )
            self.index.add(embeddings.astype('float32'))
            
            if (start_idx // batch_size) % 50 == 0:
                gc.collect()
                torch.cuda.empty_cache()
        
        print(f"Индекс построен: {len(doc_ids):,} документов")
    
    def search(self, query: str, top_k: int = 10) -> list:
        prepared_query = self._prepare_query(query)
        
        with torch.no_grad():
            query_emb = self.model.encode(
                [prepared_query],
                convert_to_numpy=True,
                normalize_embeddings=True
            ).astype('float32')
        
        scores, indices = self.index.search(query_emb, top_k)
        
        return [{
            'doc_id': self.doc_ids[idx],
            'score': float(score),
            'passage': self.passages[idx]
        } for idx, score in zip(indices[0], scores[0])]
    
    def save_index(self, path: str):
        faiss.write_index(self.index, path)
        print(f"Индекс сохранён: {path}")

In [8]:
# Загрузка индекса для майнинга hard negatives
dense_retriever = DenseRetrieverE5(MODEL_NAME)
dense_retriever.index = faiss.read_index(DENSE_INDEX_PATH)
dense_retriever.doc_ids = doc_ids
dense_retriever.passages = passages
dense_retriever.dimension = dense_retriever.index.d

print(f"Dense индекс загружен: {dense_retriever.index.ntotal:,} документов")

Загрузка модели: intfloat/multilingual-e5-base


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Dense индекс загружен: 541,007 документов


## 3. Подготовка обучающих данных

In [9]:
def prepare_training_data(queries, qrels, corpus, retriever):
    """Hard negatives из retriever + random negatives"""
    training_data = []
    corpus_keys = list(corpus.keys())
    
    valid_queries = {k: v for k, v in queries.items() if k in qrels}
    
    for qid, query_text in tqdm(valid_queries.items(), desc="Подготовка данных"):
        pos_ids = [pid for pid in qrels[qid]['positive'] if pid in corpus]
        if not pos_ids:
            continue
        
        # Hard negatives
        hard_negs = []
        try:
            hits = retriever.search(query_text, top_k=20)
            hard_negs = [h['doc_id'] for h in hits 
                        if h['doc_id'] not in pos_ids and h['doc_id'] in corpus]
        except:
            pass
        
        # Random negatives
        random_negs = []
        while len(random_negs) < 3:
            rid = random.choice(corpus_keys)
            if rid not in pos_ids and rid not in hard_negs:
                random_negs.append(rid)
        
        negatives = hard_negs[:5] + random_negs
        
        if negatives:
            for pid in pos_ids:
                training_data.append({
                    'query': query_text,
                    'positive': corpus[pid]['full_text'],
                    'negatives': [corpus[n]['full_text'] for n in negatives]
                })
    
    print(f"Примеров: {len(training_data):,}")
    return training_data

training_data = prepare_training_data(train_queries, train_qrels, corpus, dense_retriever)

Подготовка данных:   0%|          | 0/4682 [00:00<?, ?it/s]

Примеров: 9,999


In [10]:
class E5Dataset(Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        
        # Префиксы для E5
        query = f"query: {item['query']}"
        pos = f"passage: {item['positive']}"
        
        # Случайные 3 негатива
        negs = item['negatives'].copy()
        random.shuffle(negs)
        selected_negs = [f"passage: {n}" for n in negs[:3]]
        
        return InputExample(texts=[query, pos] + selected_negs)

train_dataset = E5Dataset(training_data)
print(f"Dataset: {len(train_dataset):,} примеров")

Dataset: 9,999 примеров


## 4. Обучение

In [11]:
OUTPUT_PATH = "finetuned_e5"
BATCH_SIZE = 24
EPOCHS = 1
MAX_SEQ_LENGTH = 160
LEARNING_RATE = 2e-5

gc.collect()
torch.cuda.empty_cache()

print(f"Загрузка модели: {MODEL_NAME}")
model = SentenceTransformer(MODEL_NAME)
model.max_seq_length = MAX_SEQ_LENGTH
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

train_loss = losses.MultipleNegativesRankingLoss(model)

train_dataloader = DataLoader(
    train_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=True, 
    drop_last=True,
    num_workers=0,
    collate_fn=model.smart_batching_collate
)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
total_steps = len(train_dataloader) * EPOCHS
warmup_steps = int(total_steps * 0.1)

scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=warmup_steps, 
    num_training_steps=total_steps
)

scaler = GradScaler()

print(f"\nКонфигурация:")
print(f"  Device: {device}")
print(f"  Batch: {BATCH_SIZE}")
print(f"  Steps: {total_steps}")
print(f"  LR: {LEARNING_RATE}")

Загрузка модели: intfloat/multilingual-e5-base

Конфигурация:
  Device: cuda
  Batch: 24
  Steps: 416
  LR: 2e-05


In [12]:
print(f"\nСтарт обучения: {time.strftime('%H:%M:%S')}")
model.train()
start_time = time.time()

for epoch in range(EPOCHS):
    epoch_loss = 0.0
    pbar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{EPOCHS}")
    
    for step, (batch_features, labels) in enumerate(pbar):
        features = [{k: v.to(device) for k, v in component.items()} 
                   for component in batch_features]
        
        optimizer.zero_grad()
        
        with autocast():
            loss_value = train_loss(features, labels)
        
        scaler.scale(loss_value).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        
        epoch_loss += loss_value.item()
        
        if step % 10 == 0:
            pbar.set_postfix({'loss': f"{loss_value.item():.4f}"})
        
        if step % 50 == 0:
            torch.cuda.empty_cache()

    avg_loss = epoch_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}: avg_loss = {avg_loss:.4f}")

print(f"\nОбучение завершено за {(time.time()-start_time)/60:.1f} мин")

model.save(OUTPUT_PATH)
print(f"Модель сохранена: {OUTPUT_PATH}")


Старт обучения: 14:14:23


Epoch 1/1:   0%|          | 0/416 [00:00<?, ?it/s]

Epoch 1: avg_loss = 0.8331

Обучение завершено за 6.7 мин
Модель сохранена: finetuned_e5


## 5. Создание нового Retriever

In [13]:
# Очистка памяти
del model, train_loss, optimizer, scheduler
gc.collect()
torch.cuda.empty_cache()

# Новый retriever с дообученной моделью
trained_retriever = DenseRetrieverE5(model_name=OUTPUT_PATH)
trained_retriever.fit(doc_ids, passages, batch_size=64)
trained_retriever.save_index("trained_e5.faiss")

Загрузка модели: finetuned_e5
Построение индекса...


Индексация:   0%|          | 0/8454 [00:00<?, ?it/s]

Индекс построен: 541,007 документов
Индекс сохранён: trained_e5.faiss


## 6. Оценка

In [14]:
def evaluate(retriever, queries: dict, qrels: dict, ks: list = [1, 5, 10, 20, 100]) -> dict:
    metrics = defaultdict(list)
    indexed = set(retriever.doc_ids)
    
    for qid, text in tqdm(queries.items(), desc="Оценка"):
        if qid not in qrels:
            continue
        
        relevant = [d for d in qrels[qid]['positive'] if d in indexed]
        if not relevant:
            continue
        
        results = retriever.search(text, top_k=max(ks))
        retrieved = [r['doc_id'] for r in results]
        
        for k in ks:
            hit = len(set(retrieved[:k]) & set(relevant))
            metrics[f'Recall@{k}'].append(hit / len(relevant))
        
        for rank, doc_id in enumerate(retrieved, 1):
            if doc_id in set(relevant):
                metrics['MRR'].append(1.0 / rank)
                break
        else:
            metrics['MRR'].append(0.0)
        
        dcg = sum(1.0/np.log2(i+2) for i, d in enumerate(retrieved[:10]) if d in set(relevant))
        idcg = sum(1.0/np.log2(i+2) for i in range(min(10, len(relevant))))
        metrics['NDCG@10'].append(dcg / idcg if idcg > 0 else 0)
    
    return {k: np.mean(v) for k, v in metrics.items()}

In [15]:
trained_metrics = evaluate(trained_retriever, dev_queries, dev_qrels)

baseline = {'MRR': 0.7935, 'NDCG@10': 0.7479, 'Recall@10': 0.8641}

print("\nСравнение результатов:")
print("-" * 45)
print(f"{'Метрика':<12} {'Baseline':<12} {'Fine-tuned':<12} {'Δ':<10}")
print("-" * 45)
for m in ['MRR', 'NDCG@10', 'Recall@10']:
    b = baseline.get(m, 0)
    t = trained_metrics.get(m, 0)
    d = t - b
    sign = "+" if d > 0 else ""
    status = "✓" if d > 0 else ""
    print(f"{m:<12} {b:<12.4f} {t:<12.4f} {sign}{d:.4f} {status}")

Оценка:   0%|          | 0/1251 [00:00<?, ?it/s]


Сравнение результатов:
---------------------------------------------
Метрика      Baseline     Fine-tuned   Δ         
---------------------------------------------
MRR          0.7935       0.8061       +0.0126 ✓
NDCG@10      0.7479       0.7631       +0.0152 ✓
Recall@10    0.8641       0.8704       +0.0063 ✓


## 7. Тестирование

In [16]:
test_questions = [
    "Кто такой Юрий Гагарин?",
    "Столица России",
    "Кто написал Войну и мир?"
]

for q in test_questions:
    print(f"\n{q}")
    result = trained_retriever.search(q, top_k=1)[0]
    print(f"  [{result['score']:.4f}] {result['passage'][:120]}...")


Кто такой Юрий Гагарин?
  [0.5232] Гагарин, Григорий Григорьевич (1945). Князь Григо́рий Григо́рьевич Гага́рин (род. 2 октября 1945, Вильжюиф, Франция) — п...

Столица России
  [0.6927] Столицы России. Столица России — главный город государства, политический и административный центр страны. На протяжении ...

Кто написал Войну и мир?
  [0.6274] Война и мир. Замысел эпопеи формировался задолго до начала работы над тем текстом, который известен под названием «Война...


## 8. Сохранение результатов

In [17]:
results = {
    'model': OUTPUT_PATH,
    'baseline_metrics': baseline,
    'trained_metrics': trained_metrics,
    'improvement': {
        'MRR': trained_metrics['MRR'] - baseline['MRR'],
        'Recall@10': trained_metrics['Recall@10'] - baseline['Recall@10']
    }
}

with open('finetuning_results.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print("Результаты сохранены: finetuning_results.json")

Результаты сохранены: finetuning_results.json


In [18]:
print(f"""
Итоги дообучения E5:

Конфигурация:
  Базовая модель: {MODEL_NAME}
  Batch size: {BATCH_SIZE}
  Epochs: {EPOCHS}
  Learning rate: {LEARNING_RATE}
  Примеров: {len(training_data):,}

Результаты (dev):
  MRR:       {baseline['MRR']:.4f} → {trained_metrics['MRR']:.4f}
  Recall@10: {baseline['Recall@10']:.4f} → {trained_metrics['Recall@10']:.4f}
  NDCG@10:   {baseline['NDCG@10']:.4f} → {trained_metrics['NDCG@10']:.4f}

Файлы:
  {OUTPUT_PATH}/ - дообученная модель
  trained_e5.faiss - FAISS индекс
""")


Итоги дообучения E5:

Конфигурация:
  Базовая модель: intfloat/multilingual-e5-base
  Batch size: 24
  Epochs: 1
  Learning rate: 2e-05
  Примеров: 9,999

Результаты (dev):
  MRR:       0.7935 → 0.8061
  Recall@10: 0.8641 → 0.8704
  NDCG@10:   0.7479 → 0.7631

Файлы:
  finetuned_e5/ - дообученная модель
  trained_e5.faiss - FAISS индекс

