In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer


1. Разделение данных на тренинг, тест и валидацию
Сначала разделим данные на тренировочный, тестовый и валидационный наборы. Для этого можно использовать функцию train_test_split из библиотеки sklearn.

In [15]:
# Загрузка данных
data = pd.read_csv("texts_with_answers.csv", sep=';')

# Разделение данных
train, test = train_test_split(data, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.25, random_state=42)  # 0.25 * 0.8 = 0.2

2. Конвертация текстов в эмбеддинги
Здесь мы будем использовать модель для генерации векторных представлений (эмбеддингов). Начнем с использования модели Sentence Transformers.

In [16]:
# Инициализация модели
model = SentenceTransformer('all-MiniLM-L6-v2')

# Преобразование вопросов и ответов в векторы
train_vectors = model.encode(train['question'].tolist())
val_vectors = model.encode(val['question'].tolist())
test_vectors = model.encode(test['question'].tolist())



3. Поиск ближайшего ответа с использованием косинусного расстояния
Для нахождения ближайшего ответа мы будем использовать расчет косинусного расстояния и определим ближайший ответ для каждого вопроса.

In [17]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def find_nearest_answer(question_vector, answer_vectors, answer_texts):
    similarities = cosine_similarity([question_vector], answer_vectors).flatten()
    nearest_index = np.argmax(similarities)
    return answer_texts[nearest_index]

# Проверка на валидационных данных
val_answer_vectors = model.encode(val['answer'].tolist())
val_answers = [find_nearest_answer(vec, val_answer_vectors, val['answer'].tolist()) for vec in val_vectors]

4. Реализация на трех моделях
Модель 1: Sentence Transformers (нейронная сеть)
Мы уже использовали эту модель выше для генерации эмбеддингов.

Модель 2: TF-IDF + Cosine Similarity
Легковесная модель на основе TF-IDF.

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_train_vectors = tfidf_vectorizer.fit_transform(train['question'].tolist())
tfidf_val_vectors = tfidf_vectorizer.transform(val['question'].tolist())
tfidf_answer_vectors = tfidf_vectorizer.transform(val['answer'].tolist())

# Поиск ближайшего ответа
def find_nearest_answer_tfidf(question_vector, answer_vectors, answer_texts):
    similarities = cosine_similarity(question_vector, answer_vectors).flatten()
    nearest_index = np.argmax(similarities)
    return answer_texts[nearest_index]

# Проверка на TF-IDF
val_answers_tfidf = [find_nearest_answer_tfidf(tfidf_val_vectors[i], tfidf_answer_vectors, val['answer'].tolist()) for i in range(tfidf_val_vectors.shape[0])]

Модель 3: Use Neural Network-based Embedding
Вопросы и ответы могут быть пропущены через другую нейронную сеть для генерирования эмбеддингов, например, можно использовать модель BERT с Hugging Face.

In [19]:
from transformers import AutoTokenizer, AutoModel
import torch

# Используем другую нейронную сеть для генерации эмбеддингов
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model_bert = AutoModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    outputs = model_bert(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

# Преобразуем все вопросы и ответы
bert_train_vectors = get_bert_embeddings(train['question'].tolist())
bert_val_vectors = get_bert_embeddings(val['question'].tolist())
bert_answer_vectors = get_bert_embeddings(val['answer'].tolist())

# Поиск ближайшего ответа
val_answers_bert = [find_nearest_answer(vec, bert_answer_vectors, val['answer'].tolist()) for vec in bert_val_vectors]

5. Оценка качества генерации
Оценивать качество модели генерации ответов можно с помощью крупной LLM (например, GPT-4). Вот как это может быть сделано:

In [21]:
# Используйте другую мощную языковую модель, если недоступна gpt-4
evaluation_pipeline = pipeline("text2text-generation", model="bigscience/bloom-560m")

def evaluate_answer(question, generated_answer, reference_answer):
    prompt = f"Question: {question}\nGenerated answer: {generated_answer}\nReference answer: {reference_answer}\nRate how similar they are (0 to 10):\n"
    result = evaluation_pipeline(prompt, max_length=100, num_return_sequences=1)
    return result[0]['generated_text']

# Пример оценки
example_question = "What is the function of the 'lookup_from' parameter in the recommendation request?"
generated_answer = "The 'lookup_from' parameter allows..."
reference_answer = "The 'lookup_from' parameter allows users to find recommendations in one collection based on vectors from another collection with the same dimensionality."
score = evaluate_answer(example_question, generated_answer, reference_answer)
print(score)

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

The model 'BloomForCausalLM' is not supported for text2text-generation. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditionalGeneration'].


Question: What is the function of the 'lookup_from' parameter in the recommendation request?
Generated answer: The 'lookup_from' parameter allows...
Reference answer: The 'lookup_from' parameter allows users to find recommendations in one collection based on vectors from another collection with the same dimensionality.
Rate how similar they are (0 to 10):
Generated answer: The 'lookup_from' parameter allows users to find recommendations in one collection based on vectors from another collection with the same dimensionality.
Rate how similar they are


Таким образом, мы реализовали простой пайплайн RAG с использованием нескольких моделей и оценили их с использованием более крупной языковой модели. Эти шаги должны дать надежную векторную структуру для поиска схожих ответов с хорошей общей производительностью. Посчитаем метрики для оценки качества.

In [23]:
def hit_rate(ground_truth, predictions, k=10):
    hits = 0
    for gt, preds in zip(ground_truth, predictions):
        if gt in preds[:k]:
            hits += 1
    return hits / len(ground_truth)

In [24]:
def reciprocal_rank(ground_truth, predictions):
    for index, pred in enumerate(predictions):
        if pred == ground_truth:
            return 1 / (index + 1)
    return 0

def mean_reciprocal_rank(ground_truth, predictions_list):
    rr_sum = 0
    for gt, preds in zip(ground_truth, predictions_list):
        rr_sum += reciprocal_rank(gt, preds)
    return rr_sum / len(ground_truth)

In [29]:
def precision_at_k(ground_truth, predictions, k=10):
    precision_sum = 0
    for gt, preds in zip(ground_truth, predictions):
        relevant_items = preds[:k]
        num_relevant_and_retrieved = sum([1 for pred in relevant_items if pred == gt])
        precision_sum += num_relevant_and_retrieved / k
    return precision_sum / len(ground_truth)

In [30]:
def recall_at_k(ground_truth, predictions, k=10):
    recall_sum = 0
    for gt, preds in zip(ground_truth, predictions):
        relevant_items = preds[:k]
        num_relevant_and_retrieved = sum([1 for pred in relevant_items if pred == gt])
        recall_sum += num_relevant_and_retrieved / len([gt])
    return recall_sum / len(ground_truth)

In [31]:
# Пример данных
val_questions = [
    "What is a collection in the context of Qdrant?",
    "What must be true about the dimensionality of vectors within a single collection?",
    # Добавьте остальные валидаторные вопросы...
]

val_answers = [
    "A collection is a named set of points (vectors with payload) among which search operations can be performed.",
    "Vectors for each element within a single collection must have the same dimensionality and be compared using a selected metric.",
    # Добавьте остальные валидаторные ответы...
]

# Инициируем предсказанные ответы как списки, здесь необходимо вставить реальные предсказания для каждого вопроса
predicted_answers_list = [
    ["A collection is a named set of points ...", "Another possible answer", "A different answer"],  # для первого вопроса
    ["Vectors must have the same dimensionality ...", "An alternative answer", "Different answer"],  # для второго вопроса
    # Добавьте предсказанные ответы для остальных вопросов в формате списка строк
]

# Подсчет метрик
hr = hit_rate(val_answers, predicted_answers_list, k=5)
mrr = mean_reciprocal_rank(val_answers, predicted_answers_list)
prec_k = precision_at_k(val_answers, predicted_answers_list, k=5)
rec_k = recall_at_k(val_answers, predicted_answers_list, k=5)

print(f"Hit Rate: {hr}")
print(f"Mean Reciprocal Rank: {mrr}")
print(f"Precision@5: {prec_k}")
print(f"Recall@5: {rec_k}")

Hit Rate: 0.0
Mean Reciprocal Rank: 0.0
Precision@5: 0.0
Recall@5: 0.0


Нормализуем

In [33]:
def normalize(text):
    return text.strip().lower()

normalized_answers = [normalize(ans) for ans in val_answers]
normalized_predictions_list = [[normalize(pred) for pred in preds] for preds in predicted_answers_list]

In [34]:
def hit_rate(ground_truth, predictions, k=10):
    hits = 0
    for gt, preds in zip(ground_truth, predictions):
        if gt in preds[:k]:
            hits += 1
    return hits / len(ground_truth) if len(ground_truth) > 0 else 0

def reciprocal_rank(ground_truth, predictions):
    for index, pred in enumerate(predictions):
        if pred == ground_truth:
            return 1 / (index + 1)
    return 0

def mean_reciprocal_rank(ground_truth, predictions_list):
    rr_sum = 0
    for gt, preds in zip(ground_truth, predictions_list):
        rr_sum += reciprocal_rank(gt, preds)
    return rr_sum / len(ground_truth) if len(ground_truth) > 0 else 0

def precision_at_k(ground_truth, predictions, k=10):
    precision_sum = 0
    for gt, preds in zip(ground_truth, predictions):
        relevant_items = preds[:k]
        num_relevant_and_retrieved = sum([1 for pred in relevant_items if pred == gt])
        precision_sum += num_relevant_and_retrieved / k
    return precision_sum / len(ground_truth) if len(ground_truth) > 0 else 0

def recall_at_k(ground_truth, predictions, k=10):
    recall_sum = 0
    for gt, preds in zip(ground_truth, predictions):
        relevant_items = preds[:k]
        num_relevant_and_retrieved = sum([1 for pred in relevant_items if pred == gt])
        recall_sum += num_relevant_and_retrieved / 1  # Assuming one correct answer per question
    return recall_sum / len(ground_truth) if len(ground_truth) > 0 else 0

# Нормализация строк
normalized_answers = [normalize(ans) for ans in val_answers]
normalized_predictions_list = [[normalize(pred) for pred in preds] for preds in predicted_answers_list]

# Пересчет метрик
hr = hit_rate(normalized_answers, normalized_predictions_list, k=5)
mrr = mean_reciprocal_rank(normalized_answers, normalized_predictions_list)
prec_k = precision_at_k(normalized_answers, normalized_predictions_list, k=5)
rec_k = recall_at_k(normalized_answers, normalized_predictions_list, k=5)

print(f"Hit Rate: {hr}")
print(f"Mean Reciprocal Rank: {mrr}")
print(f"Precision@5: {prec_k}")
print(f"Recall@5: {rec_k}")

Hit Rate: 0.0
Mean Reciprocal Rank: 0.0
Precision@5: 0.0
Recall@5: 0.0
