In [None]:
from typing import List, Dict, Union

import numpy as np
import torch

from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM

In [None]:
class RAG:

    def __init__(
            self,
            llm_name: str = "t-tech/T-lite-it-1.0",
            embedder_name: str = "cointegrated/rubert-tiny2",
            device: str = "cuda" if torch.cuda.is_available() else "cpu"):
        self.device = device

        # Инициализация модели для эмбеддингов
        self.embedder_tokenizer = AutoTokenizer.from_pretrained(embedder_name) # Ваш код здесь
        self.embedder_model = AutoModel.from_pretrained(embedder_name).to(self.device) # Ваш код здесь

        # Инициализация LLM
        self.llm_tokenizer = AutoTokenizer.from_pretrained(llm_name) # Ваш код здесь
        self.llm_model = AutoModelForCausalLM.from_pretrained(llm_name).to(self.device) # Ваш код здесь

        # База знаний: словарь с текстами и их эмбеддингами
        self.knowledge_base = {
            "texts": [],
            "embeddings": None  # Тензор с эмбеддингами
        }

    def get_embedding(self, text: str) -> torch.Tensor:
        """
        Получение эмбеддинга текста

        :param text: входной текст
        :return: эмбеддинг текста (тензор)
        """

        # Ваш код здесь
        inputs = self.embedder_tokenizer(text,
                                         return_tensors="pt",
                                         padding="max_length",
                                         truncation=True,
                                         max_length=512).to(self.device)

        with torch.no_grad():
            outputs = self.embedder_model(**inputs)
            embeddings = outputs.last_hidden_state[:, 0, :]
            embeddings = torch.nn.functional.normalize(embeddings)

        return embeddings.cpu().numpy()

    def add_to_knowledge_base(self, text: str|List[str]) -> None:
        """
        Добавление текста в базу знаний

        :param text: текст для добавления
        """
        embedding = self.get_embedding(text)

        # Добавляем текст в список
        if isinstance(text, str):
            self.knowledge_base["texts"].append(text)
        else:
            self.knowledge_base["texts"].extend(text)

        # Обновляем тензор эмбеддингов
        if self.knowledge_base["embeddings"] is None:
            self.knowledge_base["embeddings"] = embedding
        else:
            self.knowledge_base["embeddings"] = np.vstack(
                [self.knowledge_base["embeddings"], embedding], dim=0)

# Task 1-2

In [None]:
from typing import List, Dict, Union

import numpy as np
import torch

from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline, set_seed

from sklearn.metrics.pairwise import cosine_similarity



In [None]:
class RAG:

    def __init__(
            self,
            llm_name: str = "t-tech/T-lite-it-1.0",
            embedder_name: str = "cointegrated/rubert-tiny2",
            device: str = "cuda" if torch.cuda.is_available() else "cpu"):
        self.device = device

        # Инициализация модели для эмбеддингов
        self.embedder_tokenizer = AutoTokenizer.from_pretrained(embedder_name)
        self.embedder_model = AutoModel.from_pretrained(embedder_name).to(
            self.device)

        # Инициализация LLM
        self.llm_tokenizer = AutoTokenizer.from_pretrained(llm_name)
        self.llm_model = AutoModelForCausalLM.from_pretrained(llm_name).to(
            self.device)

        # База знаний: словарь с текстами и их эмбеддингами
        self.knowledge_base = {
            "texts": [],
            "embeddings": None  # Тензор с эмбеддингами
        }

    def get_embedding(self, text: str) -> torch.Tensor:
        """
        Получение эмбеддинга текста

        :param text: входной текст
        :return: эмбеддинг текста (тензор)
        """
        inputs = self.embedder_tokenizer(text,
                                         return_tensors="pt",
                                         padding="max_length",
                                         truncation=True,
                                         max_length=512).to(self.device)

        with torch.no_grad():
            outputs = self.embedder_model(**inputs)
            embeddings = outputs.last_hidden_state[:, 0, :]
            embeddings = torch.nn.functional.normalize(embeddings)

        return embeddings.cpu().numpy()

    def add_to_knowledge_base(self, text: str|List[str]) -> None:
        """
        Добавление текста в базу знаний

        :param text: текст для добавления
        """
        embedding = self.get_embedding(text)

        # Добавляем текст в список
        if isinstance(text, str):
            self.knowledge_base["texts"].append(text)
        else:
            self.knowledge_base["texts"].extend(text)

        # Обновляем тензор эмбеддингов
        if self.knowledge_base["embeddings"] is None:
            self.knowledge_base["embeddings"] = embedding
        else:
            self.knowledge_base["embeddings"] = np.vstack(
                [self.knowledge_base["embeddings"], embedding], dim=0)

    def find_closest(self,
                     query_embedding: np.ndarray,
                     top_k: int = 3) -> List[Dict]:
        """
        Поиск ближайших текстов в базе знаний

        :param query_embedding: эмбеддинг запроса
        :param top_k: количество возвращаемых результатов
        :return: список словарей с текстами и оценкой схожести
        """
        if len(self.knowledge_base["texts"]) == 0:
            return []

        # Вычисляем косинусную схожесть
        similarities = cosine_similarity(
            query_embedding.reshape(1, -1),
            self.knowledge_base["embeddings"])[0]

        # Получаем индексы топ-K результатов
        top_indices = similarities.argsort().argsort()[::-1][:min(top_k, len(similarities))]

        # Формируем результат
        results = []
        for idx in top_indices:
            results.append({
                "text": self.knowledge_base["texts"][idx],
                "score": similarities[idx]
            })

        return results

    def _create_prompt(self, question: str, context_texts: List[str]) -> str:
        """
        Создание промпта для LLM

        :param question: вопрос пользователя
        :param context_texts: список релевантных текстов из базы знаний
        :return: сформированный промпт
        """
        context = "\n\n".join([
            f"Контекст {i+1}: {text}" for i, text in enumerate(context_texts)
        ])

        prompt = f"""Используя приведённые ниже контексты, максимально кратко ответь на вопрос. Если в контекстах нет нужной информации, скажи об этом.

        {context}

        Вопрос: {question}"""

        return prompt

    def ask_question(self, question: str, top_k: int = 3) -> str:
        """
        Задание вопроса к системе RAG

        :param question: текст вопроса
        :param top_k: количество используемых контекстов из базы знаний
        :return: ответ модели
        """
        # Получаем эмбеддинг вопроса
        question_embedding = self.get_embedding(question)

        # Ищем релевантные тексты
        closest = self.find_closest(question_embedding, top_k=top_k)
        if len(closest) > 0:
            context_texts = [item["text"] for item in closest]
        else:
            context_texts = ["Релевантной информации не найдено"]

        # Создаём промпт
        prompt = self._create_prompt(question, context_texts)

        # Формируем сообщения для LLM
        messages = [{
            "role":
            "system",
            "content":
            "Ты виртуальный ассистент. Твоя задача - быть полезным диалоговым ассистентом."
        }, {
            "role": "user",
            "content": prompt
        }]

        # Генерируем ответ
        text = self.llm_tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True)

        model_inputs = self.llm_tokenizer([text],
                                          return_tensors="pt").to(self.device)

        generated_ids = self.llm_model.generate(**model_inputs,
                                                max_new_tokens=1024,
                                                do_sample=True,
                                                temperature=0.1)

        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(
                model_inputs.input_ids, generated_ids)
        ]

        response = self.llm_tokenizer.batch_decode(generated_ids,
                                                   skip_special_tokens=True)[0]

        return response

In [None]:

rag = RAG()
db = ["X0Ja_asd - пароль от моего компьютера", "RisingTide - новая группа, состоящая из бывших моряков"]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/118M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/712 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.33G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.87G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:

# добавляем в БД информацию
rag.add_to_knowledge_base(db)


In [None]:

set_seed(42) # для воспроизводимости
answer = rag.ask_question("Я забыл пароль от своего компьютера")
print(answer)

# LEsson 2

In [None]:
pip install faiss-gpu-cu12

Collecting faiss-gpu-cu12
  Downloading faiss_gpu_cu12-1.13.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting nvidia-cuda-runtime-cu12>=12.1.105 (from faiss-gpu-cu12)
  Downloading nvidia_cuda_runtime_cu12-12.9.79-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cublas-cu12>=12.1.3.1 (from faiss-gpu-cu12)
  Downloading nvidia_cublas_cu12-12.9.1.4-py3-none-manylinux_2_27_x86_64.whl.metadata (1.7 kB)
Downloading faiss_gpu_cu12-1.13.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (48.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.4/48.4 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_cublas_cu12-12.9.1.4-py3-none-manylinux_2_27_x86_64.whl (581.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m581.2/581.2 MB[0m [31m993.3 kB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_cuda_runtime_cu12-12.9.79-py3-none-manylinux2014_x86_64.

In [None]:
import numpy as np
import faiss

In [None]:
d = 256                          # размерность векторов
nb = 100000                      # число векторов (мы их создадим случайно в таком количестве)
nq = 10000                       # число векторов для поиска

np.random.seed(1234)
xb = np.random.random((nb, d)).astype('float32')
xb[:, 0] += np.arange(nb) / 1000. # добавляем информацию об очерёдности векторов



In [None]:
index = faiss.IndexFlatL2(d)   # инициализируем индекс
print(index.is_trained)
index.add(xb)                  # добавляем векторы
print(index.ntotal)

True
100000


In [None]:
k = 4                          # 4 соседа
D, I = index.search(xb[:5], k) # поиск
print(I)
print(D)

[[   0  991  135  447]
 [   1 1001  991  251]
 [   2  384  314  269]
 [   3  103  393  595]
 [   4  402 1277   45]]
[[ 0.       34.318687 34.56546  35.755974]
 [ 0.       33.328285 34.142128 35.18398 ]
 [ 0.       35.259205 35.605064 35.81136 ]
 [ 0.       35.942703 35.963043 36.410927]
 [ 0.       33.867443 34.014343 34.198784]]


# IndexIVFFlat

In [None]:
nlist = 100
k = 4

In [None]:
quantizer = faiss.IndexFlatL2(d) # где d, размерность векторов, была задана ранее
index = faiss.IndexIVFFlat(quantizer, d, nlist)

In [None]:
assert not index.is_trained # в начале индекс "не обучен"
index.train(xb)             # обучаем индекс
assert index.is_trained

In [None]:
index.add(xb)                 # nprobe = 1
D, I = index.search(xq, k)
print(I[-5:])

index.nprobe = 10              # изменим число кластеров для обхода
D, I = index.search(xq, k)
print(I[-5:])

KeyboardInterrupt: 

# IndexHNSW

In [None]:
M = 64
ef_search = 16
ef_construction = 32

In [None]:
# инициализируем индекс
index = faiss.IndexHNSWFlat(d, M)
# задаём параметры
index.hnsw.efConstruction = ef_construction

In [None]:
index.hnsw.efSearch = ef_search
# добавляем данные в индекс
index.add(xb)

In [None]:
D, I = index.search(xq, k)
print(I[:5])

NameError: name 'xq' is not defined

# Оптимизация хранения индекса

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
main_path = '/content/drive/MyDrive/PRACTICUM_DLE/sprint_7/'

In [None]:
def get_index_size(index):
    import os

    # запишем индекс на диск
    faiss.write_index(index, main_path + 'data/temp.index')
    # получаем размер файла
    index_size = os.path.getsize(main_path + 'data/temp.index')
    # удаляем сохранённый индекс
    os.remove(main_path + 'data/temp.index')
    return index_size

In [None]:
# инициализируем и вычисляем размер l2 индекса
index_l2 = faiss.IndexFlatL2(d)
index_l2.add(xb)
index_l2_size = get_index_size(index_l2)

# инициализируем и вычисляем размер PQ индекса
M = 16
assert d % M == 0 # из исходного вектора должно получаться целое число векторов
nbits = 8

index_pq = faiss.IndexPQ(d, M, nbits)
index_pq.train(xb) # обучаем индекс PQ
index_pq_size = get_index_size(index_pq)

print(f"Отношение индексов PQ/L2: {index_pq_size/index_l2_size:.4f}")

RuntimeError: Error in faiss::FileIOWriter::FileIOWriter(const char*) at /project/faiss/faiss/impl/io.cpp:103: Error: 'f' failed: could not open /content/drive/MyDrive/PRACTICUM_DLE/sprint_7/data/temp.index for writing: No such file or directory

# Lesson 3 Методы retrieval

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import torch

In [2]:
model_name = "cross-encoder/stsb-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [3]:
query = "How to choose a laptop for work?"
candidates = [
"2023 rating of best laptops for office work",
"Comparison of Intel Core i5 vs i7 processors",
"How to improve performance of an old laptop",
"Optimal laptop specifications for programmers",
"Difference between SSD and HDD drives",
"10 common mistakes when buying a laptop",
"How to connect a laptop to a TV",
"Best budget laptops under 50,000 rubles",
"What graphics card is needed for graphic design work",
"How to extend laptop battery life",
]

In [4]:
def rerank(query, candidates):
    pairs = [(query, cand) for cand in candidates]
    inputs = tokenizer(pairs, return_tensors="pt", padding=True, truncation=True).to("cuda")
    with torch.no_grad():
        outputs = model(**inputs)
    scores = torch.sigmoid(outputs.logits).flatten().tolist()
    return sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)

In [5]:
reranked = rerank(query, candidates)
print(reranked)

[('How to connect a laptop to a TV', 0.39422059059143066), ('2023 rating of best laptops for office work', 0.3703474998474121), ('Optimal laptop specifications for programmers', 0.3701727092266083), ('How to improve performance of an old laptop', 0.3137408494949341), ('How to extend laptop battery life', 0.2821332812309265), ('10 common mistakes when buying a laptop', 0.246943861246109), ('Best budget laptops under 50,000 rubles', 0.2291668802499771), ('What graphics card is needed for graphic design work', 0.16224971413612366), ('Comparison of Intel Core i5 vs i7 processors', 0.07215014845132828), ('Difference between SSD and HDD drives', 0.029510242864489555)]


In [6]:
reranked

[('How to connect a laptop to a TV', 0.39422059059143066),
 ('2023 rating of best laptops for office work', 0.3703474998474121),
 ('Optimal laptop specifications for programmers', 0.3701727092266083),
 ('How to improve performance of an old laptop', 0.3137408494949341),
 ('How to extend laptop battery life', 0.2821332812309265),
 ('10 common mistakes when buying a laptop', 0.246943861246109),
 ('Best budget laptops under 50,000 rubles', 0.2291668802499771),
 ('What graphics card is needed for graphic design work', 0.16224971413612366),
 ('Comparison of Intel Core i5 vs i7 processors', 0.07215014845132828),
 ('Difference between SSD and HDD drives', 0.029510242864489555)]

В итоге общий код запроса к ранжирующей модели (с примерами на русском) выглядит так:

In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [8]:
def format_instruction(instruction, query, doc):
    if instruction is None:
        instruction = 'Given a web search query, retrieve relevant passages that answer the query'
    output = "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {doc}".format(instruction=instruction,query=query, doc=doc)
    return output

In [9]:
def process_inputs(pairs):
    inputs = tokenizer(
        pairs, padding=False, truncation='longest_first',
        return_attention_mask=False, max_length=max_length - len(prefix_tokens) - len(suffix_tokens)
    )
    for i, ele in enumerate(inputs['input_ids']):
        inputs['input_ids'][i] = prefix_tokens + ele + suffix_tokens
    inputs = tokenizer.pad(inputs, padding=True, return_tensors="pt", max_length=max_length)

    # переносим тензоры на девайс модели
    for key in inputs:
        inputs[key] = inputs[key].to(model.device)
    return inputs

In [10]:
def compute_logits(inputs):
    batch_scores = model(**inputs).logits[:, -1, :]
    true_vector = batch_scores[:, token_true_id]
    false_vector = batch_scores[:, token_false_id]
    batch_scores = torch.stack([false_vector, true_vector], dim=1)
    batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
    scores = batch_scores[:, 1].exp().tolist()
    return scores

In [11]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Reranker-0.6B", padding_side='left')
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-Reranker-0.6B").eval()


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

In [12]:
token_false_id = tokenizer.convert_tokens_to_ids("no")
token_true_id = tokenizer.convert_tokens_to_ids("yes")
max_length = 8192

In [13]:
prefix = "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n"
suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
prefix_tokens = tokenizer.encode(prefix, add_special_tokens=False)
suffix_tokens = tokenizer.encode(suffix, add_special_tokens=False)

In [14]:
task = 'Given a web search query, retrieve relevant passages that answer the query'

In [15]:
queries = ['Сколько лететь до марса',
    'растёт ли кукуруза в тени',
]

In [16]:
documents = [
   "Время полёта на Марс зависит от множества факторов, включая траекторию, скорость корабля и расположение планет. В среднем полёт может занять от 6 до 9 месяцев. Самый быстрый способ добраться до Марса, по расчётам, займёт около 70-80 суток, но потребует значительного количества топлива",
   "Кукуруза не растёт хорошо в тени. Она нуждается в достаточном количестве солнечного света для нормального развития и плодоношения",
   "Уход за кукурузой включает в себя полив, рыхление, прополку, подкормку и удаление пасынков. Важно обеспечить кукурузе достаточное количество влаги, особенно в период цветения и формирования початков, а также поддерживать почву рыхлой и свободной от сорняков."
]

In [17]:
pairs = []
for q in queries:
    for d in documents:
        pairs.append(format_instruction(task, q, d))

In [18]:
inputs = process_inputs(pairs)
scores = compute_logits(inputs)

You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [19]:
print("scores: ", [round(x, 6) for x in scores])

scores:  [0.999456, 8e-06, 3e-06, 4e-06, 0.999958, 0.884114]


# LEsson 4 Фреймворки для RAG

In [34]:
pip install faiss-gpu-cu12

Collecting faiss-gpu-cu12
  Downloading faiss_gpu_cu12-1.13.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading faiss_gpu_cu12-1.13.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (48.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.4/48.4 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu-cu12
Successfully installed faiss-gpu-cu12-1.13.2


In [25]:
pip install langchain-text-splitters



In [None]:
pip install langchain-huggingface sentence-transformers

In [42]:
pip install langchain-community faiss-gpu-cu12

Collecting langchain-community
  Using cached langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain-community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7.0,>=0.6.7->langchain-community)
  Downloading marshmallow-3.26.2-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7.0,>=0.6.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7.0,>=0.6.7->langchain-community)
  Downloading mypy_extensions-1.1.0-py3-none-any.whl.metadata (1.1 kB)
Downloading langchain_community-0.4.1-py3-none-any.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Task 4-1

In [28]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [29]:
text = """Глубокое обучение в медицинской диагностике.
Введение. Современные CNN достигают высокой точности при анализе рентгеновских снимков.
Методы. Мы сравнивали ResNet-50 и Vision Transformer на наборе данных CheXpert.
Результаты. ViT показал преимущество для выявления пневмонии.
Обсуждение. Несмотря на прогресс, сохраняются проблемы:
1) Нехватка размеченных данных.
2) "Чёрный ящик" принятия решений.
Заключение. Перспективным направлением является ..."""

In [30]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=30,
    separators=["\n\n", "\n", ". ", " "]
)

In [31]:
chunks = splitter.split_text(text)
for i, chunk in enumerate(chunks):
    print(f"chunk {i+1}: {chunk}\n---")

chunk 1: Глубокое обучение в медицинской диагностике.
Введение. Современные CNN достигают высокой точности при анализе рентгеновских снимков.
---
chunk 2: Методы. Мы сравнивали ResNet-50 и Vision Transformer на наборе данных CheXpert.
Результаты. ViT показал преимущество для выявления пневмонии.
---
chunk 3: Обсуждение. Несмотря на прогресс, сохраняются проблемы:
1) Нехватка размеченных данных.
2) "Чёрный ящик" принятия решений.
---
chunk 4: Заключение. Перспективным направлением является ...
---


# Task 4-2

In [7]:
import faiss

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_core.documents import Document

In [14]:
chunk1 = "RAG позволяет расширить знания LLM"
chunk2 = "кросс-энкодер позволят проводить ранжирование отобранных результатов"
chunk3 = "RAG = robust attack on giants"

In [15]:
chunks = [chunk1, chunk2, chunk3]
metadata = ["RAG", "ML", "fantasy"]

In [16]:
docs = [
    Document(
        page_content=c,
        metadata={"source": m},
    ) for c, m in zip(chunks, metadata)
]

In [17]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

In [18]:
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [19]:
vector_store.add_documents(docs)
vector_store.similarity_search("Зачем нужен RAG ?", k=1, filter={"source": {"$neq": "fantasy"}})

[Document(id='52f3300c-2c86-4e87-a955-328307791422', metadata={'source': 'RAG'}, page_content='RAG позволяет расширить знания LLM')]

# Оценка качества RAG

# Lesson 6 Практика. Эксперименты с RAG

In [20]:
# albucore==0.0.24
# albumentations==2.0.8
# annotated-types==0.7.0
# anyio==4.9.0
# asttokens==3.0.0
# audioread==3.0.1
# backports-tarfile==1.2.0
# blis==1.3.0
# build==1.2.2.post1
# cachecontrol==0.14.3
# catalogue==2.0.10
# certifi==2025.6.15
# cffi==1.17.1
# charset-normalizer==3.4.2
# cleo==2.1.0
# click==8.2.1
# cloudpathlib==0.21.1
# comm==0.2.2
# confection==0.1.5
# crashtest==0.4.1
# cryptography==45.0.4
# cymem==2.0.11
# dawg2-python==0.9.0
# debugpy==1.8.14
# decorator==5.2.1
# distlib==0.3.9
# dulwich==0.22.8
# exceptiongroup==1.3.0
# executing==2.2.0
# fastjsonschema==2.21.1
# filelock==3.18.0
# findpython==0.6.3
# fsspec==2025.5.1
# gensim==4.3.2
# h11==0.16.0
# hf-xet==1.1.5
# httpcore==1.0.9
# httpx==0.28.1
# huggingface-hub==0.33.0
# idna==3.10
# importlib-metadata==8.7.0
# installer==0.7.0
# ipykernel==6.29.5
# ipython==8.37.0
# jaraco-classes==3.4.0
# jaraco-context==6.0.1
# jaraco-functools==4.2.1
# jedi==0.19.2
# jeepney==0.9.0
# jinja2==3.1.6
# joblib==1.5.1
# jupyter-client==8.6.3
# jupyter-core==5.8.1
# keyring==25.6.0
# langcodes==3.5.0
# language-data==1.3.0
# lazy-loader==0.4
# librosa==0.11.0
# llvmlite==0.44.0
# marisa-trie==1.2.1
# markdown-it-py==3.0.0
# markupsafe==3.0.2
# matplotlib==3.10.3
# matplotlib-inline==0.1.7
# mdurl==0.1.2
# more-itertools==10.7.0
# mpmath==1.3.0
# msgpack==1.1.1
# murmurhash==1.0.13
# nest-asyncio==1.6.0
# networkx==3.4.2
# nltk==3.9.1
# numba==0.61.2
# numpy==2.2.6
# nvidia-cublas-cu12==12.6.4.1
# nvidia-cuda-cupti-cu12==12.6.80
# nvidia-cuda-nvrtc-cu12==12.6.77
# nvidia-cuda-runtime-cu12==12.6.77
# nvidia-cudnn-cu12==9.5.1.17
# nvidia-cufft-cu12==11.3.0.4
# nvidia-cufile-cu12==1.11.1.6
# nvidia-curand-cu12==10.3.7.77
# nvidia-cusolver-cu12==11.7.1.2
# nvidia-cusparse-cu12==12.5.4.2
# nvidia-cusparselt-cu12==0.6.3
# nvidia-nccl-cu12==2.26.2
# nvidia-nvjitlink-cu12==12.6.85
# nvidia-nvtx-cu12==12.6.77
# opencv-python==4.11.0.86
# opencv-python-headless==4.11.0.86
# packaging==25.0
# pandas==2.3.0
# parso==0.8.4
# pbs-installer==2025.6.12
# pexpect==4.9.0
# pillow==11.2.1
# pkginfo==1.12.1.2
# platformdirs==4.3.8
# poetry==2.1.3
# poetry-core==2.1.3
# pooch==1.8.2
# preshed==3.0.10
# prompt-toolkit==3.0.51
# psutil==7.0.0
# ptyprocess==0.7.0
# pure-eval==0.2.3
# pycparser==2.22
# pydantic==2.11.7
# pydantic-core==2.33.2
# pygments==2.19.2
# pymorphy3==2.0.4
# pymorphy3-dicts-ru==2.4.417150.4580142
# pyproject-hooks==1.2.0
# python-dateutil==2.9.0.post0
# pytz==2025.2
# pyyaml==6.0.2
# pyzmq==27.0.0
# rapidfuzz==3.13.0
# regex==2024.11.6
# requests==2.32.4
# requests-toolbelt==1.0.0
# rich==14.0.0
# safetensors==0.5.3
# scikit-learn==1.7.0
# scipy==1.15.3
# secretstorage==3.3.3
# setuptools==70.3.0
# shellingham==1.5.4
# simsimd==6.4.9
# six==1.17.0
# smart-open==7.1.0
# sniffio==1.3.1
# soundfile==0.13.1
# soxr==0.5.0.post1
# spacy==3.8.7
# spacy-legacy==3.0.12
# spacy-loggers==1.0.5
# srsly==2.5.1
# stack-data==0.6.3
# stringzilla==3.12.5
# sympy==1.14.0
# thinc==8.3.6
# threadpoolctl==3.6.0
# timm==1.0.15
# tokenizers==0.21.1
# tomli==2.2.1
# tomlkit==0.13.3
# torch==2.7.1
# torchaudio==2.7.1
# torchvision==0.22.1
# torchmetrics==1.7.4
# tornado==6.5.1
# tqdm==4.67.1
# traitlets==5.14.3
# transformers==4.52.4
# triton==3.3.1
# trove-classifiers==2025.5.9.12
# typer==0.16.0
# typing-extensions==4.14.0
# typing-inspection==0.4.1
# tzdata==2025.2
# urllib3==2.5.0
# virtualenv==20.31.2
# wasabi==1.1.3
# wcwidth==0.2.13
# weasel==0.4.1
# wrapt==1.17.2
# zipp==3.23.0
# zstandard==0.23.0

In [22]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [23]:
main_path = '/content/drive/MyDrive/PRACTICUM_DLE/sprint_7/'

# Task 6-1

In [36]:
pip install pypdf

Collecting pypdf
  Downloading pypdf-6.6.2-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.6.2-py3-none-any.whl (329 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/329.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.1/329.1 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-6.6.2


In [25]:
from typing import List, Optional

import faiss
import numpy as np

from langchain_community.document_loaders import (
    PyPDFLoader,
    TextLoader
)
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM

import torch
from torch import Tensor
import torch.nn.functional as F

In [26]:
class RAG:

    def __init__(
        self,
        embedder_name: str = "Qwen/Qwen3-Embedding-0.6B",
        reranker_name: str = "Qwen/Qwen3-Reranker-0.6B",
        chunk_size: int = 500,
        chunk_overlap: int = 125,
        device: Optional[str] = None,
    ):
        self.device = device or ("cuda"
                                 if torch.cuda.is_available() else "cpu")
        self.emb_tokenizer = AutoTokenizer.from_pretrained(embedder_name)
        self.embedder = AutoModel.from_pretrained(embedder_name).to(
            self.device)
        self.embedder.eval()

        self.rr_tokenizer = AutoTokenizer.from_pretrained(
            reranker_name,
            padding_side='left')
        self.reranker = AutoModelForCausalLM.from_pretrained(
            reranker_name).to(self.device)
        self.reranker.eval()

        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        self.index = None
        self.doc_store = []

        self.max_length = 8192
        self.token_false_id = self.rr_tokenizer.convert_tokens_to_ids("no")
        self.token_true_id = self.rr_tokenizer.convert_tokens_to_ids("yes")
        prefix = "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n"
        suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
        self.prefix_tokens = self.rr_tokenizer.encode(prefix,
                                                      add_special_tokens=False)
        self.suffix_tokens = self.rr_tokenizer.encode(suffix,
                                                      add_special_tokens=False)

    def _generate_embeddings(self, texts: List[str]) -> np.ndarray:
        inputs = self.emb_tokenizer(
            texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=self.max_length,
        ).to(self.device)

        with torch.no_grad():
            outputs = self.embedder(**inputs)

        inputs.to("cpu")
        embeddings = self.last_token_pool(outputs.last_hidden_state,
                                          inputs.attention_mask).cpu()
        return F.normalize(embeddings, p=2, dim=1).numpy()

    @staticmethod
    def last_token_pool(last_hidden_states: Tensor,
                        attention_mask: Tensor) -> Tensor:
        left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
        if left_padding:
            return last_hidden_states[:, -1]
        else:
            sequence_lengths = attention_mask.sum(dim=1) - 1
            batch_size = last_hidden_states.shape[0]
            return last_hidden_states[
                torch.arange(batch_size, device=last_hidden_states.device),
                sequence_lengths]

    def load_and_process_file(self, file_path: str) -> List[Document]:
        """Загрузка и экстракция данных из файлов"""
        ext = os.path.splitext(file_path)[1]
        if ext == ".pdf":
            loader = PyPDFLoader(file_path)
        elif ext == ".txt":
            loader = TextLoader(file_path, encoding="utf-8")
        else:
            raise ValueError(f"Unsupported file type: {ext}")

        docs = loader.load()

        return self.text_splitter.split_documents(docs)

    def build_index(self, file_paths: List[str], batch_size: int = 32) -> None:
        """Строим индекс FAISS"""
        all_docs = []
        for path in file_paths:
            all_docs.extend(self.load_and_process_file(path))
        self.doc_store = all_docs

        # Вычислим numpy-эмбеддинги по батчам
        embeddings = []
        for i in range(0, len(all_docs), batch_size):
            batch = [doc.page_content for doc in all_docs[i:i + batch_size]]
            embeddings.append(self._generate_embeddings(batch))

        embeddings = np.concatenate(embeddings)

        # Инициализируем индекс
        self.index = faiss.IndexFlatL2(embeddings.shape[1])
        self.index.add(embeddings)

    @staticmethod
    def get_detailed_instruct(task_description: str, query: str):
        return f'Instruct: {task_description}\nQuery:{query}'

    @staticmethod
    def format_reranker_instruction(query, doc, instruction=None):
        if instruction is None:
            instruction = 'Given a web search query, retrieve relevant passages that answer the query'
        output = "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {doc}".format(
            instruction=instruction, query=query, doc=doc)
        return output

    def process_inputs(self, pairs):
        """Обработка данных для реранкера"""
        inputs = self.rr_tokenizer(pairs,
                                   padding=False,
                                   truncation='longest_first',
                                   return_attention_mask=False,
                                   max_length=self.max_length -
                                   len(self.prefix_tokens) -
                                   len(self.suffix_tokens))
        for i, ele in enumerate(inputs['input_ids']):
            inputs['input_ids'][
                i] = self.prefix_tokens + ele + self.suffix_tokens
        inputs = self.rr_tokenizer.pad(inputs,
                                       padding=True,
                                       return_tensors="pt",
                                       max_length=self.max_length)

        # переносим тензоры на девайс ранжирующей модели
        for key in inputs:
            inputs[key] = inputs[key].to(self.device)
        return inputs

    def search(self,
               query: str,
               k: int = 5,
               task: str = None):
        if self.index is None:
            raise ValueError("Index not initialized")

        if task is None:
            task = 'Given a web search query, retrieve relevant passages that answer the query'

        query_embedding = self._generate_embeddings([query])
        distances, indices = self.index.search(query_embedding, k)
        return distances, indices

    @torch.no_grad()
    def compute_logits(self, inputs):
        batch_scores = self.reranker(**inputs).logits[:, -1, :]
        true_vector = batch_scores[:, self.token_true_id]
        false_vector = batch_scores[:, self.token_false_id]
        batch_scores = torch.stack([false_vector, true_vector], dim=1)
        batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
        scores = batch_scores[:, 1].exp().tolist()
        return scores

    def rerank(self, query: str, documents: List[str], batch_size=1):
        pairs = []
        for d in documents:
            pairs.append(self.format_reranker_instruction(query, d))

        scores = []
        for i in range(0, len(pairs), batch_size):
            inputs = self.process_inputs(pairs[i:i + batch_size])
            sc = self.compute_logits(inputs)
            scores.extend(sc)
        return scores

# Task 6-2

In [28]:
# вопрос к RAG
q = "what is the attention mechanism used in deepseek-v3"

In [29]:
k = 5

In [30]:
# Проинициализируйте и заполните индекс
rag = RAG(device="cuda")

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

In [32]:
import os

In [37]:
rag.build_index([
    main_path + "data/deepseek_r1.pdf",
    main_path + "data/deepseek_v3_tech_report.pdf",
    main_path + "data/gemini_1.5.txt"
])

In [38]:
D, I = rag.search(q, k=k)
candidates = [rag.doc_store[i].page_content for i in I[0]]

In [39]:
for c in candidates:
    print(c)
    print("-#" * 20)
    print()

Compared with Qwen2.5 72B Base, the state-of-the-art Chinese open-source model, with only
half of the activated parameters, DeepSeek-V3-Base also demonstrates remarkable advantages,
25
-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#

DeepSeek-V3 Technical Report
DeepSeek-AI
research@deepseek.com
Abstract
We present DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total
parameters with 37B activated for each token. To achieve efficient inference and cost-effective
training, DeepSeek-V3 adopts Multi-head Latent Attention (MLA) and DeepSeekMoE architec-
tures, which were thoroughly validated in DeepSeek-V2. Furthermore, DeepSeek-V3 pioneers
-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#

Non-Reasoning data For non-reasoning data, such as writing, factual QA, self-cognition,
and translation, we adopt the DeepSeek-V3 pipeline and reuse portions of the SFT dataset of
DeepSeek-V3. For certain non-reasoning tasks, we call DeepSeek-V3 to generate a potential
chain-of-thought before an

# Task 3

In [40]:
# вопрос к RAG
q = "what is the attention mechanism used in deepseek-v3"

In [41]:
D, I = rag.search(q, k=100)
candidates = [rag.doc_store[i].page_content for i in I[0]]

In [42]:
k = 5

In [43]:
scores = rag.rerank(q, candidates)

You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [44]:
array = np.array(scores)

In [45]:
indices = np.argsort(array)[::-1][:k]

In [46]:
for i in indices:
    print(candidates[i])
    print("-#" * 20)
    print()

by the effort to ensure load balance. Figure 2 illustrates the basic architecture of DeepSeek-V3,
and we will briefly review the details of MLA and DeepSeekMoE in this section.
2.1.1. Multi-Head Latent Attention
For attention, DeepSeek-V3 adopts the MLA architecture. Let 𝑑 denote the embedding dimen-
sion, 𝑛ℎ denote the number of attention heads, 𝑑ℎ denote the dimension per head, and h𝑡 ∈ R𝑑
denote the attention input for the 𝑡-th token at a given attention layer. The core of MLA is the
-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#

model with 671B parameters, of which 37B are activated for each token.
With a forward-looking perspective, we consistently strive for strong model performance
and economical costs. Therefore, in terms of architecture, DeepSeek-V3 still adopts Multi-head
Latent Attention (MLA) (DeepSeek-AI, 2024c) for efficient inference and DeepSeekMoE (Dai
et al., 2024) for cost-effective training. These two architectures have been validated in DeepSeek-
-#-#-#-#-#-#-#-#-#-#-#-

# Task 4

In [47]:
# вопрос к RAG
q = "what is the attention mechanism used in deepseek-v3"

In [48]:
D, I = rag.search(q, k=100)
candidates = [rag.doc_store[i].page_content for i in I[0]]
scores = rag.rerank(q, candidates)

In [49]:
k = 5
array = np.array(scores)
indices = np.argsort(array)[::-1][:k]

In [50]:
# освободим место на GPU
torch.cuda.empty_cache()
rag.embedder.to("cpu")
rag.reranker.to("cpu")

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151669, 1024)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
        (post_attention_layer

In [51]:
# готовим промпт
prompt = f"Given texts info below give me a very short answer to a question: {q}\n\n"


In [52]:
for i, v in enumerate(indices):
    prompt += f"chunk {i}: {candidates[v]}\n\n"

In [53]:
torch.manual_seed(42)
llm_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

In [54]:
# загружаем модель
tokenizer = AutoTokenizer.from_pretrained(llm_name)
model = AutoModelForCausalLM.from_pretrained(
    llm_name, torch_dtype="auto").to("cuda")

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [55]:
# применяем темплейт и токенизируем
messages = [
    {"role": "user", "content": prompt}
]

In [56]:
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

In [57]:
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

In [58]:
# генерируем ответ
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=1000
)

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


In [59]:
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

In [60]:
# убираем reasoning часть из ответа и визуализируем
content = tokenizer.decode(output_ids, skip_special_tokens=True)
print("Answer:", content.split('</think>')[1].strip('\n'))

Answer: MLA
