In [None]:
from typing import List, Dict, Union

import numpy as np
import torch

from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM

In [None]:
class RAG:
    """First draft of the RAG helper: an embedder, an LLM and an in-memory knowledge base.

    NOTE(review): a fuller ``RAG`` class is defined again further down in this
    notebook and shadows this one once its cell is executed.
    """

    def __init__(
            self,
            llm_name: str = "t-tech/T-lite-it-1.0",
            embedder_name: str = "cointegrated/rubert-tiny2",
            device: str = "cuda" if torch.cuda.is_available() else "cpu"):
        """
        Load the embedding model and the LLM and create an empty knowledge base.

        :param llm_name: Hugging Face id of the causal LLM
        :param embedder_name: Hugging Face id of the embedding model
        :param device: device both models are moved to
        """
        self.device = device

        # Embedding model
        self.embedder_tokenizer = AutoTokenizer.from_pretrained(embedder_name)
        self.embedder_model = AutoModel.from_pretrained(embedder_name).to(self.device)

        # LLM
        self.llm_tokenizer = AutoTokenizer.from_pretrained(llm_name)
        self.llm_model = AutoModelForCausalLM.from_pretrained(llm_name).to(self.device)

        # Knowledge base: the stored texts and their embeddings
        self.knowledge_base = {
            "texts": [],
            "embeddings": None  # np.ndarray with one row per text once populated
        }

    def get_embedding(self, text: Union[str, List[str]]) -> np.ndarray:
        """
        Embed one text or a list of texts.

        :param text: input text (or list of texts)
        :return: L2-normalised embeddings as a numpy array with one row per text
        """
        # Pad only to the longest text in the batch; padding every input to
        # 512 tokens just made the forward pass slower.
        inputs = self.embedder_tokenizer(text,
                                         return_tensors="pt",
                                         padding=True,
                                         truncation=True,
                                         max_length=512).to(self.device)

        with torch.no_grad():
            outputs = self.embedder_model(**inputs)
            # Hidden state of the first token of every sequence
            embeddings = outputs.last_hidden_state[:, 0, :]
            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

        return embeddings.cpu().numpy()

    def add_to_knowledge_base(self, text: Union[str, List[str]]) -> None:
        """
        Add a text (or several texts) to the knowledge base.

        :param text: a single text or a list of texts to add
        """
        if not isinstance(text, str) and len(text) == 0:
            return  # nothing to add

        # Embed first so the text list is left untouched if embedding fails
        embedding = self.get_embedding(text)

        if isinstance(text, str):
            self.knowledge_base["texts"].append(text)
        else:
            self.knowledge_base["texts"].extend(text)

        if self.knowledge_base["embeddings"] is None:
            self.knowledge_base["embeddings"] = embedding
        else:
            # np.vstack takes no `dim` argument (that is the torch spelling);
            # passing it raised TypeError on every insert after the first.
            self.knowledge_base["embeddings"] = np.vstack(
                [self.knowledge_base["embeddings"], embedding])

# Task 1-2

In [None]:
from typing import List, Dict, Union

import numpy as np
import torch

from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline, set_seed

from sklearn.metrics.pairwise import cosine_similarity

In [None]:
class RAG:
    """Simple RAG pipeline: dense retrieval over an in-memory knowledge base plus an LLM answer step."""

    def __init__(
            self,
            llm_name: str = "t-tech/T-lite-it-1.0",
            embedder_name: str = "cointegrated/rubert-tiny2",
            device: str = "cuda" if torch.cuda.is_available() else "cpu"):
        """
        Load the embedding model and the LLM and create an empty knowledge base.

        :param llm_name: Hugging Face id of the causal LLM
        :param embedder_name: Hugging Face id of the embedding model
        :param device: device both models are moved to
        """
        self.device = device

        # Embedding model
        self.embedder_tokenizer = AutoTokenizer.from_pretrained(embedder_name)
        self.embedder_model = AutoModel.from_pretrained(embedder_name).to(
            self.device)

        # LLM
        self.llm_tokenizer = AutoTokenizer.from_pretrained(llm_name)
        self.llm_model = AutoModelForCausalLM.from_pretrained(llm_name).to(
            self.device)

        # Knowledge base: the stored texts and their embeddings
        self.knowledge_base = {
            "texts": [],
            "embeddings": None  # np.ndarray with one row per text once populated
        }

    def get_embedding(self, text: Union[str, List[str]]) -> np.ndarray:
        """
        Embed one text or a list of texts.

        :param text: input text (or list of texts)
        :return: L2-normalised embeddings as a numpy array with one row per text
        """
        # Pad only to the longest text in the batch; padding every input to
        # 512 tokens just made the forward pass slower.
        inputs = self.embedder_tokenizer(text,
                                         return_tensors="pt",
                                         padding=True,
                                         truncation=True,
                                         max_length=512).to(self.device)

        with torch.no_grad():
            outputs = self.embedder_model(**inputs)
            # Hidden state of the first token of every sequence
            embeddings = outputs.last_hidden_state[:, 0, :]
            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

        return embeddings.cpu().numpy()

    def add_to_knowledge_base(self, text: Union[str, List[str]]) -> None:
        """
        Add a text (or several texts) to the knowledge base.

        :param text: a single text or a list of texts to add
        """
        if not isinstance(text, str) and len(text) == 0:
            return  # nothing to add

        # Embed first so the text list is left untouched if embedding fails
        embedding = self.get_embedding(text)

        if isinstance(text, str):
            self.knowledge_base["texts"].append(text)
        else:
            self.knowledge_base["texts"].extend(text)

        if self.knowledge_base["embeddings"] is None:
            self.knowledge_base["embeddings"] = embedding
        else:
            # np.vstack takes no `dim` argument (that is the torch spelling);
            # passing it raised TypeError on every insert after the first.
            self.knowledge_base["embeddings"] = np.vstack(
                [self.knowledge_base["embeddings"], embedding])

    def find_closest(self,
                     query_embedding: np.ndarray,
                     top_k: int = 3) -> List[Dict]:
        """
        Find the knowledge-base texts closest to a query embedding.

        :param query_embedding: embedding of the query
        :param top_k: maximum number of results to return
        :return: list of dicts with "text" and "score" (cosine similarity),
                 ordered from most to least similar
        """
        if len(self.knowledge_base["texts"]) == 0:
            return []

        # Cosine similarity between the query and every stored embedding
        similarities = cosine_similarity(
            query_embedding.reshape(1, -1),
            self.knowledge_base["embeddings"])[0]

        # argsort is ascending, so reverse it to put the best match first.
        # The previous double argsort produced ranks instead of indices and
        # therefore selected the wrong texts.
        limit = min(max(top_k, 0), len(similarities))
        top_indices = np.argsort(similarities)[::-1][:limit]

        return [{
            "text": self.knowledge_base["texts"][idx],
            "score": similarities[idx]
        } for idx in top_indices]

    def _create_prompt(self, question: str, context_texts: List[str]) -> str:
        """
        Build the user prompt for the LLM.

        :param question: the user's question
        :param context_texts: relevant texts retrieved from the knowledge base
        :return: the assembled prompt
        """
        context = "\n\n".join([
            f"Контекст {i+1}: {text}" for i, text in enumerate(context_texts)
        ])

        prompt = f"""Используя приведённые ниже контексты, максимально кратко ответь на вопрос. Если в контекстах нет нужной информации, скажи об этом.

        {context}

        Вопрос: {question}"""

        return prompt

    def ask_question(self, question: str, top_k: int = 3) -> str:
        """
        Answer a question using retrieved context.

        :param question: text of the question
        :param top_k: number of knowledge-base contexts to use
        :return: the model's answer
        """
        # Embed the question and retrieve the closest texts
        question_embedding = self.get_embedding(question)
        closest = self.find_closest(question_embedding, top_k=top_k)
        if len(closest) > 0:
            context_texts = [item["text"] for item in closest]
        else:
            context_texts = ["Релевантной информации не найдено"]

        prompt = self._create_prompt(question, context_texts)

        # Chat messages for the LLM
        messages = [{
            "role":
            "system",
            "content":
            "Ты виртуальный ассистент. Твоя задача - быть полезным диалоговым ассистентом."
        }, {
            "role": "user",
            "content": prompt
        }]

        text = self.llm_tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True)

        model_inputs = self.llm_tokenizer([text],
                                          return_tensors="pt").to(self.device)

        generated_ids = self.llm_model.generate(**model_inputs,
                                                max_new_tokens=1024,
                                                do_sample=True,
                                                temperature=0.1)

        # Drop the prompt tokens so that only the newly generated part is decoded
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(
                model_inputs.input_ids, generated_ids)
        ]

        response = self.llm_tokenizer.batch_decode(generated_ids,
                                                   skip_special_tokens=True)[0]

        return response

In [None]:

# Instantiate the pipeline with the default model names
rag = RAG()
# Toy knowledge base: two short facts to retrieve from
db = ["X0Ja_asd - пароль от моего компьютера", "RisingTide - новая группа, состоящая из бывших моряков"]


In [None]:

# Add the facts to the knowledge base
rag.add_to_knowledge_base(db)


In [None]:

set_seed(42) # for reproducibility (generation uses sampling)
# Ask a question whose answer is stored in the knowledge base
answer = rag.ask_question("Я забыл пароль от своего компьютера")
print(answer)

# Lesson 2

In [None]:
pip install faiss-gpu-cu12

In [None]:
import numpy as np
import faiss

In [None]:
d = 256                          # vector dimensionality
nb = 100000                      # number of database vectors (generated randomly)
nq = 10000                       # number of query vectors

np.random.seed(1234)
xb = np.random.random((nb, d)).astype('float32')
xb[:, 0] += np.arange(nb) / 1000.  # encode each vector's position in its first component

# Query vectors: the IVF and HNSW search cells use `xq`, which was never
# created (only its size `nq` was declared), so they failed with NameError.
xq = np.random.random((nq, d)).astype('float32')
xq[:, 0] += np.arange(nq) / 1000.



In [None]:
# Build a flat L2 index over the database vectors
index = faiss.IndexFlatL2(d)   # create the index
print(index.is_trained)
index.add(xb)                  # add the vectors
print(index.ntotal)

In [None]:
# Query the index with the first five database vectors themselves
k = 4                          # 4 neighbours
D, I = index.search(xb[:5], k) # search: D holds distances, I holds indices
print(I)
print(D)

# IndexIVFFlat

In [None]:
# Parameters for the IVF index
nlist = 100  # passed to IndexIVFFlat as the number of lists
k = 4        # neighbours to return per query

In [None]:
quantizer = faiss.IndexFlatL2(d) # d, the vector dimensionality, was set earlier
# IVF index that uses the flat index above as its quantizer
index = faiss.IndexIVFFlat(quantizer, d, nlist)

In [None]:
assert not index.is_trained # the index starts out untrained
index.train(xb)             # train the index
assert index.is_trained

In [None]:
index.add(xb)                 # nprobe = 1
D, I = index.search(xq, k)
print(I[-5:])                 # neighbours of the last five queries

index.nprobe = 10              # change the number of clusters to visit
D, I = index.search(xq, k)
print(I[-5:])

# IndexHNSW

In [None]:
# Parameters for the HNSW index
M = 64                # second argument of IndexHNSWFlat
ef_search = 16        # assigned to index.hnsw.efSearch
ef_construction = 32  # assigned to index.hnsw.efConstruction

In [None]:
# инициализируем индекс
# create the index
index = faiss.IndexHNSWFlat(d, M)
# set the construction parameter (before any vectors are added)
index.hnsw.efConstruction = ef_construction

In [None]:
index.hnsw.efSearch = ef_search
# add the data to the index
index.add(xb)

In [None]:
# Search the HNSW index and show the neighbours of the first five queries
D, I = index.search(xq, k)
print(I[:5])

# Оптимизация хранения индекса

In [None]:
# Colab-only: mount Google Drive so files under /content/drive are reachable
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Base directory on the mounted Drive; other cells append 'data/...' to it
main_path = '/content/drive/MyDrive/PRACTICUM_DLE/sprint_7/'

In [None]:
def get_index_size(index):
    """Return the size in bytes of `index` once serialised with ``faiss.write_index``.

    The index is written into a throw-away temporary directory, so the function
    no longer depends on the global ``main_path`` (or a mounted Drive) and
    never leaves a stray file behind, even when serialisation fails.

    :param index: a FAISS index
    :return: size of the serialised index in bytes
    """
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as tmp_dir:
        index_file = os.path.join(tmp_dir, 'temp.index')
        # write the index to disk and measure the resulting file
        faiss.write_index(index, index_file)
        return os.path.getsize(index_file)

In [None]:
# Build the flat L2 index and measure its serialised size
index_l2 = faiss.IndexFlatL2(d)
index_l2.add(xb)
index_l2_size = get_index_size(index_l2)

# Build the PQ index and measure its serialised size
M = 16
assert d % M == 0 # the vector must split into a whole number of sub-vectors
nbits = 8

index_pq = faiss.IndexPQ(d, M, nbits)
index_pq.train(xb) # train the PQ index
# The vectors must be added as well: a trained but empty index stores only its
# codebooks, so its size is not comparable with the populated L2 index.
index_pq.add(xb)
index_pq_size = get_index_size(index_pq)

print(f"Отношение индексов PQ/L2: {index_pq_size/index_l2_size:.4f}")

# Lesson 3 Методы retrieval

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import torch

In [None]:
# Load the cross-encoder used for reranking and move it to the GPU
model_name = "cross-encoder/stsb-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to("cuda")

In [None]:
# Example query and the candidate passages to rerank against it
query = "How to choose a laptop for work?"
candidates = [
"2023 rating of best laptops for office work",
"Comparison of Intel Core i5 vs i7 processors",
"How to improve performance of an old laptop",
"Optimal laptop specifications for programmers",
"Difference between SSD and HDD drives",
"10 common mistakes when buying a laptop",
"How to connect a laptop to a TV",
"Best budget laptops under 50,000 rubles",
"What graphics card is needed for graphic design work",
"How to extend laptop battery life",
]

In [None]:
def rerank(query, candidates):
    """Score every candidate against `query` with the cross-encoder and sort by score.

    Uses the notebook-level ``tokenizer`` and ``model``.

    :param query: the search query
    :param candidates: list of candidate texts
    :return: list of ``(candidate, score)`` tuples, highest score first
    """
    if not candidates:
        return []

    pairs = [(query, cand) for cand in candidates]
    # Follow the device the model actually lives on instead of a hard-coded
    # "cuda", so the function also works when the model is on the CPU.
    inputs = tokenizer(pairs, return_tensors="pt", padding=True, truncation=True).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    # Squash the logits into (0, 1) and flatten to one score per pair
    scores = torch.sigmoid(outputs.logits).flatten().tolist()
    return sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)

In [None]:
# Rerank the candidates for the query and print the (candidate, score) pairs
reranked = rerank(query, candidates)
print(reranked)

In [None]:
# Show the same result through the notebook's rich display
reranked

В итоге общий код запроса к ранжирующей модели (с примерами на русском) выглядит так:

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
def format_instruction(instruction, query, doc):
    """Render one instruction/query/document triple as the reranker's input text.

    :param instruction: task description; a generic web-search instruction is
        used when this is None
    :param query: the search query
    :param doc: the candidate document
    :return: text with ``<Instruct>``, ``<Query>`` and ``<Document>`` sections
    """
    effective_instruction = (
        'Given a web search query, retrieve relevant passages that answer the query'
        if instruction is None else instruction
    )
    return f"<Instruct>: {effective_instruction}\n<Query>: {query}\n<Document>: {doc}"

In [None]:
def process_inputs(pairs):
    """Tokenise the formatted pairs, wrap them in the chat prefix/suffix and pad them.

    Uses the notebook-level ``tokenizer``, ``model``, ``max_length``,
    ``prefix_tokens`` and ``suffix_tokens``.

    :param pairs: list of texts produced by ``format_instruction``
    :return: the padded batch with every tensor moved to the model's device
    """
    # Leave room for the prefix and suffix that are attached after truncation
    token_budget = max_length - len(prefix_tokens) - len(suffix_tokens)
    batch = tokenizer(
        pairs,
        padding=False,
        truncation='longest_first',
        return_attention_mask=False,
        max_length=token_budget,
    )

    # Wrap every sequence in place with the prefix and suffix tokens
    id_lists = batch['input_ids']
    for position in range(len(id_lists)):
        id_lists[position] = prefix_tokens + id_lists[position] + suffix_tokens

    batch = tokenizer.pad(batch, padding=True, return_tensors="pt", max_length=max_length)

    # Move the tensors to the model's device
    for name in list(batch.keys()):
        batch[name] = batch[name].to(model.device)
    return batch

In [None]:
def compute_logits(inputs):
    """Return, for each sequence, the probability of "yes" versus "no" at the last position.

    Uses the notebook-level ``model``, ``token_true_id`` and ``token_false_id``.

    :param inputs: keyword arguments for the model's forward pass
    :return: list of floats in [0, 1], one per sequence in the batch
    """
    # Inference only: without no_grad the forward pass kept the autograd graph
    # alive for every batch.
    with torch.no_grad():
        batch_scores = model(**inputs).logits[:, -1, :]  # logits at the last position
        true_vector = batch_scores[:, token_true_id]
        false_vector = batch_scores[:, token_false_id]
        # Two-way softmax over the (no, yes) logits
        batch_scores = torch.stack([false_vector, true_vector], dim=1)
        batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
        scores = batch_scores[:, 1].exp().tolist()
    return scores

In [None]:
# Load the Qwen3 reranker; the tokenizer pads on the left and the model is put in eval mode
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Reranker-0.6B", padding_side='left')
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-Reranker-0.6B").eval()


In [None]:
# Vocabulary ids of the "no" / "yes" tokens whose logits compute_logits compares
token_false_id = tokenizer.convert_tokens_to_ids("no")
token_true_id = tokenizer.convert_tokens_to_ids("yes")
# Token budget used by process_inputs for truncation and padding
max_length = 8192

In [None]:
# Chat-template text placed before and after every formatted pair
prefix = "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n"
suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
# Pre-tokenise both once; process_inputs concatenates them with each sequence
prefix_tokens = tokenizer.encode(prefix, add_special_tokens=False)
suffix_tokens = tokenizer.encode(suffix, add_special_tokens=False)

In [None]:
# Instruction inserted into the <Instruct> section of every pair
task = 'Given a web search query, retrieve relevant passages that answer the query'

In [None]:
# Example queries (in Russian)
queries = ['Сколько лететь до марса',
    'растёт ли кукуруза в тени',
]

In [None]:
# Example documents (in Russian) to score against each query
documents = [
   "Время полёта на Марс зависит от множества факторов, включая траекторию, скорость корабля и расположение планет. В среднем полёт может занять от 6 до 9 месяцев. Самый быстрый способ добраться до Марса, по расчётам, займёт около 70-80 суток, но потребует значительного количества топлива",
   "Кукуруза не растёт хорошо в тени. Она нуждается в достаточном количестве солнечного света для нормального развития и плодоношения",
   "Уход за кукурузой включает в себя полив, рыхление, прополку, подкормку и удаление пасынков. Важно обеспечить кукурузе достаточное количество влаги, особенно в период цветения и формирования початков, а также поддерживать почву рыхлой и свободной от сорняков."
]

In [None]:
# Every (query, document) combination, query-major. A comprehension keeps the
# loop variables out of the notebook namespace: the previous `for d in ...`
# loop overwrote the global `d` that holds the vector dimensionality.
pairs = [
    format_instruction(task, query_text, document)
    for query_text in queries
    for document in documents
]

In [None]:
# Tokenise the pairs and compute one relevance score per pair
inputs = process_inputs(pairs)
scores = compute_logits(inputs)

In [None]:
# Scores are in the same order as `pairs`, rounded for readability
print("scores: ", [round(x, 6) for x in scores])

# Lesson 4 Фреймворки для RAG

In [None]:
pip install faiss-gpu-cu12

In [None]:
pip install langchain-text-splitters

In [None]:
pip install langchain-huggingface sentence-transformers

In [None]:
pip install langchain-community faiss-gpu-cu12

# Task 4-1

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
# Sample multi-section text (in Russian) used to demonstrate chunking
text = """Глубокое обучение в медицинской диагностике.
Введение. Современные CNN достигают высокой точности при анализе рентгеновских снимков.
Методы. Мы сравнивали ResNet-50 и Vision Transformer на наборе данных CheXpert.
Результаты. ViT показал преимущество для выявления пневмонии.
Обсуждение. Несмотря на прогресс, сохраняются проблемы:
1) Нехватка размеченных данных.
2) "Чёрный ящик" принятия решений.
Заключение. Перспективным направлением является ..."""

In [None]:
# Splitter configured with 150-character chunks, a 30-character overlap and an
# explicit separator list (paragraph, line, sentence, space)
splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=30,
    separators=["\n\n", "\n", ". ", " "]
)

In [None]:
# Split the sample text and print each chunk with a 1-based number
chunks = splitter.split_text(text)
for i, chunk in enumerate(chunks):
    print(f"chunk {i+1}: {chunk}\n---")

# Task 4-2

In [None]:
import faiss

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_core.documents import Document

In [None]:
# Three toy chunks for the vector-store demo
chunk1 = "RAG позволяет расширить знания LLM"
chunk2 = "кросс-энкодер позволят проводить ранжирование отобранных результатов"
chunk3 = "RAG = robust attack on giants"

In [None]:
# Chunks and their "source" labels, aligned by position
chunks = [chunk1, chunk2, chunk3]
metadata = ["RAG", "ML", "fantasy"]

In [None]:
# Wrap every chunk in a Document that carries its label under metadata["source"]
docs = [
    Document(
        page_content=c,
        metadata={"source": m},
    ) for c, m in zip(chunks, metadata)
]

In [None]:
# Embedding model for the vector store
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# Embed a sample query to find the embedding size and create a flat L2 index of that size
index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

In [None]:
# Assemble the LangChain FAISS store from the embedder, the empty index and an in-memory docstore
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [None]:
# Index the documents, then fetch the single best match whose source is not "fantasy"
vector_store.add_documents(docs)
vector_store.similarity_search("Зачем нужен RAG ?", k=1, filter={"source": {"$neq": "fantasy"}})

# Оценка качества RAG

# Lesson 6 Практика. Эксперименты с RAG

In [None]:
# albucore==0.0.24
# albumentations==2.0.8
# annotated-types==0.7.0
# anyio==4.9.0
# asttokens==3.0.0
# audioread==3.0.1
# backports-tarfile==1.2.0
# blis==1.3.0
# build==1.2.2.post1
# cachecontrol==0.14.3
# catalogue==2.0.10
# certifi==2025.6.15
# cffi==1.17.1
# charset-normalizer==3.4.2
# cleo==2.1.0
# click==8.2.1
# cloudpathlib==0.21.1
# comm==0.2.2
# confection==0.1.5
# crashtest==0.4.1
# cryptography==45.0.4
# cymem==2.0.11
# dawg2-python==0.9.0
# debugpy==1.8.14
# decorator==5.2.1
# distlib==0.3.9
# dulwich==0.22.8
# exceptiongroup==1.3.0
# executing==2.2.0
# fastjsonschema==2.21.1
# filelock==3.18.0
# findpython==0.6.3
# fsspec==2025.5.1
# gensim==4.3.2
# h11==0.16.0
# hf-xet==1.1.5
# httpcore==1.0.9
# httpx==0.28.1
# huggingface-hub==0.33.0
# idna==3.10
# importlib-metadata==8.7.0
# installer==0.7.0
# ipykernel==6.29.5
# ipython==8.37.0
# jaraco-classes==3.4.0
# jaraco-context==6.0.1
# jaraco-functools==4.2.1
# jedi==0.19.2
# jeepney==0.9.0
# jinja2==3.1.6
# joblib==1.5.1
# jupyter-client==8.6.3
# jupyter-core==5.8.1
# keyring==25.6.0
# langcodes==3.5.0
# language-data==1.3.0
# lazy-loader==0.4
# librosa==0.11.0
# llvmlite==0.44.0
# marisa-trie==1.2.1
# markdown-it-py==3.0.0
# markupsafe==3.0.2
# matplotlib==3.10.3
# matplotlib-inline==0.1.7
# mdurl==0.1.2
# more-itertools==10.7.0
# mpmath==1.3.0
# msgpack==1.1.1
# murmurhash==1.0.13
# nest-asyncio==1.6.0
# networkx==3.4.2
# nltk==3.9.1
# numba==0.61.2
# numpy==2.2.6
# nvidia-cublas-cu12==12.6.4.1
# nvidia-cuda-cupti-cu12==12.6.80
# nvidia-cuda-nvrtc-cu12==12.6.77
# nvidia-cuda-runtime-cu12==12.6.77
# nvidia-cudnn-cu12==9.5.1.17
# nvidia-cufft-cu12==11.3.0.4
# nvidia-cufile-cu12==1.11.1.6
# nvidia-curand-cu12==10.3.7.77
# nvidia-cusolver-cu12==11.7.1.2
# nvidia-cusparse-cu12==12.5.4.2
# nvidia-cusparselt-cu12==0.6.3
# nvidia-nccl-cu12==2.26.2
# nvidia-nvjitlink-cu12==12.6.85
# nvidia-nvtx-cu12==12.6.77
# opencv-python==4.11.0.86
# opencv-python-headless==4.11.0.86
# packaging==25.0
# pandas==2.3.0
# parso==0.8.4
# pbs-installer==2025.6.12
# pexpect==4.9.0
# pillow==11.2.1
# pkginfo==1.12.1.2
# platformdirs==4.3.8
# poetry==2.1.3
# poetry-core==2.1.3
# pooch==1.8.2
# preshed==3.0.10
# prompt-toolkit==3.0.51
# psutil==7.0.0
# ptyprocess==0.7.0
# pure-eval==0.2.3
# pycparser==2.22
# pydantic==2.11.7
# pydantic-core==2.33.2
# pygments==2.19.2
# pymorphy3==2.0.4
# pymorphy3-dicts-ru==2.4.417150.4580142
# pyproject-hooks==1.2.0
# python-dateutil==2.9.0.post0
# pytz==2025.2
# pyyaml==6.0.2
# pyzmq==27.0.0
# rapidfuzz==3.13.0
# regex==2024.11.6
# requests==2.32.4
# requests-toolbelt==1.0.0
# rich==14.0.0
# safetensors==0.5.3
# scikit-learn==1.7.0
# scipy==1.15.3
# secretstorage==3.3.3
# setuptools==70.3.0
# shellingham==1.5.4
# simsimd==6.4.9
# six==1.17.0
# smart-open==7.1.0
# sniffio==1.3.1
# soundfile==0.13.1
# soxr==0.5.0.post1
# spacy==3.8.7
# spacy-legacy==3.0.12
# spacy-loggers==1.0.5
# srsly==2.5.1
# stack-data==0.6.3
# stringzilla==3.12.5
# sympy==1.14.0
# thinc==8.3.6
# threadpoolctl==3.6.0
# timm==1.0.15
# tokenizers==0.21.1
# tomli==2.2.1
# tomlkit==0.13.3
# torch==2.7.1
# torchaudio==2.7.1
# torchvision==0.22.1
# torchmetrics==1.7.4
# tornado==6.5.1
# tqdm==4.67.1
# traitlets==5.14.3
# transformers==4.52.4
# triton==3.3.1
# trove-classifiers==2025.5.9.12
# typer==0.16.0
# typing-extensions==4.14.0
# typing-inspection==0.4.1
# tzdata==2025.2
# urllib3==2.5.0
# virtualenv==20.31.2
# wasabi==1.1.3
# wcwidth==0.2.13
# weasel==0.4.1
# wrapt==1.17.2
# zipp==3.23.0
# zstandard==0.23.0

In [None]:
# Colab-only: mount Google Drive so files under /content/drive are reachable
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Base directory on the mounted Drive; data paths are built by appending to it
main_path = '/content/drive/MyDrive/PRACTICUM_DLE/sprint_7/'

# Task 6-1

In [None]:
# pip install pypdf

In [None]:
import os
from typing import List, Optional

import faiss
import numpy as np

from langchain_community.document_loaders import (
    PyPDFLoader,
    TextLoader,
    JSONLoader,
)
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM

import torch
from torch import Tensor
import torch.nn.functional as F

In [None]:
def metadata_func(record: dict, metadata: dict) -> dict:
    """Copy the "categories" and "id" fields of a JSON record into the document metadata.

    :param record: one record of the JSON file; must contain both keys
    :param metadata: metadata dict to extend (modified in place)
    :return: the same ``metadata`` dict
    """
    for field in ("categories", "id"):
        metadata[field] = record[field]
    return metadata

In [None]:
class RAG:
    """Retrieval pipeline: chunk files, embed the chunks into a FAISS index and rerank the hits.

    NOTE(review): the reranking code (yes/no token logits, chat-style prefix and
    suffix, causal-LM head) and ``last_token_pool`` match the Qwen3 model names
    that are commented out in the signature, while the active defaults are an
    E5 encoder and a MiniLM cross-encoder. Confirm which model pair is intended.
    """

    def __init__(
        self,
        # embedder_name: str = "Qwen/Qwen3-Embedding-0.6B",
        # reranker_name: str = "Qwen/Qwen3-Reranker-0.6B",
        embedder_name: str = "intfloat/e5-base-v2",
        reranker_name: str = "cross-encoder/ms-marco-MiniLM-L-12-v2",

        chunk_size: int = 500,
        chunk_overlap: int = 125,
        device: Optional[str] = None,
    ):
        """Load the embedder and the reranker and prepare the text splitter.

        :param embedder_name: Hugging Face id of the embedding model
        :param reranker_name: Hugging Face id of the reranking model
        :param chunk_size: chunk size passed to the text splitter
        :param chunk_overlap: chunk overlap passed to the text splitter
        :param device: target device; "cuda" when available, else "cpu", if None
        """
        self.device = device or ("cuda"
                                 if torch.cuda.is_available() else "cpu")
        self.emb_tokenizer = AutoTokenizer.from_pretrained(embedder_name)
        self.embedder = AutoModel.from_pretrained(embedder_name).to(
            self.device)
        self.embedder.eval()

        self.rr_tokenizer = AutoTokenizer.from_pretrained(
            reranker_name,
            padding_side='left')
        self.reranker = AutoModelForCausalLM.from_pretrained(
            reranker_name).to(self.device)
        self.reranker.eval()

        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        self.index = None     # FAISS index, created by build_index
        self.doc_store = []   # chunks, in the same order as the index rows

        self.max_length = 8192
        # Vocabulary ids of the answer tokens compared by compute_logits
        self.token_false_id = self.rr_tokenizer.convert_tokens_to_ids("no")
        self.token_true_id = self.rr_tokenizer.convert_tokens_to_ids("yes")
        prefix = "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n"
        suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
        self.prefix_tokens = self.rr_tokenizer.encode(prefix,
                                                      add_special_tokens=False)
        self.suffix_tokens = self.rr_tokenizer.encode(suffix,
                                                      add_special_tokens=False)

    def _generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Embed `texts` and return L2-normalised vectors, one row per text."""
        inputs = self.emb_tokenizer(
            texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=self.max_length,
        ).to(self.device)

        with torch.no_grad():
            outputs = self.embedder(**inputs)
            # Pool while the hidden states and the mask are on the same device
            pooled = self.last_token_pool(outputs.last_hidden_state,
                                          inputs["attention_mask"])

        embeddings = pooled.float().cpu()
        return F.normalize(embeddings, p=2, dim=1).numpy()

    @staticmethod
    def last_token_pool(last_hidden_states: Tensor,
                        attention_mask: Tensor) -> Tensor:
        """Return the hidden state of the last non-padding token of every sequence."""
        # If every sequence has a real token in the final position, the batch
        # is left-padded (or unpadded) and the last position can be used as is.
        left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
        if left_padding:
            return last_hidden_states[:, -1]
        else:
            sequence_lengths = attention_mask.sum(dim=1) - 1
            batch_size = last_hidden_states.shape[0]
            return last_hidden_states[
                torch.arange(batch_size, device=last_hidden_states.device),
                sequence_lengths]

    def load_and_process_file(self, file_path: str) -> "List[Document]":
        """Load a .pdf, .txt or .json file and split it into chunks.

        :raises ValueError: if the file extension is not supported
        """
        # Compare the extension case-insensitively so "REPORT.PDF" also loads
        ext = os.path.splitext(file_path)[1].lower()
        if ext == ".pdf":
            loader = PyPDFLoader(file_path)
        elif ext == ".txt":
            loader = TextLoader(file_path, encoding="utf-8")
        elif ext == ".json":
            loader = JSONLoader(
                file_path,
                jq_schema='.[]',
                content_key="abstract",
                # text_content=True
                metadata_func=metadata_func
            )
        else:
            raise ValueError(f"Unsupported file type: {ext}")

        docs = loader.load()

        return self.text_splitter.split_documents(docs)

    def build_index(self, file_paths: List[str], batch_size: int = 32) -> None:
        """Load and chunk the files, embed the chunks in batches and build the FAISS index.

        :raises ValueError: if the files yield no chunks at all
        """
        all_docs = []
        for path in file_paths:
            all_docs.extend(self.load_and_process_file(path))
        if not all_docs:
            # np.concatenate on an empty list fails with an unhelpful message
            raise ValueError("No documents to index")
        self.doc_store = all_docs

        # Compute the numpy embeddings batch by batch
        embeddings = []
        for i in range(0, len(all_docs), batch_size):
            batch = [doc.page_content for doc in all_docs[i:i + batch_size]]
            embeddings.append(self._generate_embeddings(batch))

        embeddings = np.concatenate(embeddings)

        # Create the index and fill it
        self.index = faiss.IndexFlatL2(embeddings.shape[1])
        self.index.add(embeddings)

    @staticmethod
    def get_detailed_instruct(task_description: str, query: str):
        """Prefix a query with its task description."""
        return f'Instruct: {task_description}\nQuery:{query}'

    @staticmethod
    def format_reranker_instruction(query, doc, instruction=None):
        """Render a query/document pair as the reranker's input text."""
        if instruction is None:
            instruction = 'Given a web search query, retrieve relevant passages that answer the query'
        output = "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {doc}".format(
            instruction=instruction, query=query, doc=doc)
        return output

    def process_inputs(self, pairs):
        """Tokenise the pairs, wrap them in the prefix/suffix tokens and pad them."""
        inputs = self.rr_tokenizer(pairs,
                                   padding=False,
                                   truncation='longest_first',
                                   return_attention_mask=False,
                                   max_length=self.max_length -
                                   len(self.prefix_tokens) -
                                   len(self.suffix_tokens))
        for i, ele in enumerate(inputs['input_ids']):
            inputs['input_ids'][
                i] = self.prefix_tokens + ele + self.suffix_tokens
        inputs = self.rr_tokenizer.pad(inputs,
                                       padding=True,
                                       return_tensors="pt",
                                       max_length=self.max_length)

        # Move the tensors to the reranker's device
        for key in inputs:
            inputs[key] = inputs[key].to(self.device)
        return inputs

    def search(self,
               query: str,
               k: int = 5,
               task: Optional[str] = None):
        """Return ``(distances, indices)`` of the chunks nearest to `query`.

        At most ``index.ntotal`` results are requested. Asking FAISS for more
        neighbours than it stores pads the result with ``-1``, which callers
        would silently resolve to ``doc_store[-1]``.

        NOTE(review): `task` is accepted but not applied to the query. If the
        embedder is instruction-tuned, the query presumably should go through
        ``get_detailed_instruct(task, query)``; confirm before changing.

        :raises ValueError: if the index has not been built
        """
        if self.index is None:
            raise ValueError("Index not initialized")

        k = min(k, self.index.ntotal)

        query_embedding = self._generate_embeddings([query])
        distances, indices = self.index.search(query_embedding, k)
        return distances, indices

    @torch.no_grad()
    def compute_logits(self, inputs):
        """Return the probability of "yes" versus "no" at the last position of each sequence."""
        batch_scores = self.reranker(**inputs).logits[:, -1, :]
        true_vector = batch_scores[:, self.token_true_id]
        false_vector = batch_scores[:, self.token_false_id]
        batch_scores = torch.stack([false_vector, true_vector], dim=1)
        batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
        scores = batch_scores[:, 1].exp().tolist()
        return scores

    def rerank(self, query: str, documents: List[str], batch_size=1):
        """Score every document against `query`; scores are returned in input order."""
        pairs = [self.format_reranker_instruction(query, d) for d in documents]

        scores = []
        for i in range(0, len(pairs), batch_size):
            inputs = self.process_inputs(pairs[i:i + batch_size])
            scores.extend(self.compute_logits(inputs))
        return scores

    def rerank_debug(self, query: str, documents: List[str], batch_size=1):
        """Debug helper: return the first batch of formatted pairs and the full list of pairs."""
        pairs = [self.format_reranker_instruction(query, d) for d in documents]

        # Plain slicing also covers an empty `documents` list; the previous
        # loop-and-break left `result` undefined in that case.
        result = pairs[:batch_size]
        return result, pairs

# Task 6-2

In [None]:
# question for the RAG pipeline
q = "what is the attention mechanism used in deepseek-v3"

In [None]:
# number of chunks to retrieve
k = 5

In [None]:
# # освободим место на GPU
# torch.cuda.empty_cache()
# rag.embedder.to("cpu")
# rag.reranker.to("cpu")

In [None]:
# Create the pipeline on the GPU (the index is filled in a later cell)
rag = RAG(device="cuda")

In [None]:
# Override the Drive location: data paths are now relative to the working directory
main_path = ''

In [None]:
# Build the index from the selected files (the commented-out entries are alternative inputs)
rag.build_index([
    # main_path + "data/deepseek_r1.pdf",
    # main_path + "data/deepseek_v3_tech_report.pdf",
    # main_path + "data/gemini_1.5.txt",

    # main_path + "data/testik.txt",

    main_path + "data/sample_data.json",
])

In [None]:
# Retrieve the k nearest chunks and map the returned indices back to their text
D, I = rag.search(q, k=k)
candidates = [rag.doc_store[i].page_content for i in I[0]]

In [None]:
# Print every retrieved chunk followed by a separator line
for c in candidates:
    print(c)
    print("-#" * 20)
    print()

# Task 6-3

In [None]:
# question for the RAG pipeline
q = "what is the attention mechanism used in deepseek-v3"

In [None]:
# Retrieve a wide candidate set (k=100) for the reranker to narrow down
D, I = rag.search(q, k=100)
candidates = [rag.doc_store[i].page_content for i in I[0]]

In [None]:
# number of chunks to keep after reranking
k = 5

In [None]:
# Inspect the reranker inputs without running the model
result, pairs = rag.rerank_debug(q, candidates)

In [None]:
# first formatted pair of the first batch
result[0]

In [None]:
# last formatted pair
pairs[-1]

In [None]:
# how many candidates the search returned
len(candidates)

In [None]:
# One relevance score per candidate, in candidate order
scores = rag.rerank(q, candidates)

In [None]:
# Convert the scores to an array so they can be arg-sorted
array = np.array(scores)

In [None]:
# Indices of the k highest scores, best first (argsort is ascending, hence the reversal)
indices = np.argsort(array)[::-1][:k]

In [None]:
# Print the top reranked chunks, each followed by a separator line
for i in indices:
    print(candidates[i])
    print("-#" * 20)
    print()

# Task 6-4

In [None]:
# question for the RAG pipeline
q = "what is the attention mechanism used in deepseek-v3"

In [None]:
# Retrieve 100 candidates, then score each of them with the reranker
D, I = rag.search(q, k=100)
candidates = [rag.doc_store[i].page_content for i in I[0]]
scores = rag.rerank(q, candidates)

In [None]:
# Keep the indices of the k best-scored candidates, best first
k = 5
array = np.array(scores)
indices = np.argsort(array)[::-1][:k]

In [None]:
# Free GPU memory: the models have to leave the GPU first, otherwise
# empty_cache() has nothing of theirs to release.
rag.embedder.to("cpu")
rag.reranker.to("cpu")
torch.cuda.empty_cache()

In [None]:
# Prepare the prompt: start with the instruction and the question
prompt = f"Given texts info below give me a very short answer to a question: {q}\n\n"


In [None]:
# Append the selected chunks to the prompt, best first.
# This cell extends `prompt` in place, so re-running it alone appends the chunks again.
for i, v in enumerate(indices):
    prompt += f"chunk {i}: {candidates[v]}\n\n"

In [None]:
# Seed torch's random number generator and choose the LLM
torch.manual_seed(42)
llm_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

In [None]:
# Explicit switch in place of an unconditional `assert False`, which aborted
# "Run All" before the generation cells on every run.
RUN_LLM_GENERATION = True  # set to False to stop here and skip the LLM cells below
assert RUN_LLM_GENERATION, "LLM generation is disabled; set RUN_LLM_GENERATION = True to continue"

In [None]:
# Load the tokenizer and the model, and move the model to the GPU
tokenizer = AutoTokenizer.from_pretrained(llm_name)
model = AutoModelForCausalLM.from_pretrained(
    llm_name, torch_dtype="auto").to("cuda")

In [None]:
# Wrap the prompt as a single user message for the chat template
messages = [
    {"role": "user", "content": prompt}
]

In [None]:
# Render the chat template to plain text, ending with the generation prompt
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

In [None]:
# Tokenise the rendered prompt and move the tensors to the model's device
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

In [None]:
# Generate the answer (at most 1000 new tokens)
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=1000
)

In [None]:
# Drop the prompt tokens so only the newly generated ids remain
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

In [None]:
# убираем reasoning часть из ответа и визуализируем
content = tokenizer.decode(output_ids, skip_special_tokens=True)
print("Answer:", content.split('</think>')[1].strip('\n'))