In [13]:
pip install datasets faiss-gpu faiss-cpu



In [14]:
import torch
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification

In [15]:

issues_dataset = load_dataset("lewtun/github-issues", split="train")
issues_dataset = issues_dataset.filter(
    lambda x: (x["is_pull_request"] == False and len(x["comments"]) > 0)
)
columns = issues_dataset.column_names
columns_to_keep = ["title", "body", "html_url", "comments"]
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
issues_dataset = issues_dataset.remove_columns(columns_to_remove)
issues_dataset.set_format("pandas")
df = issues_dataset[:]
comments_df = df.explode("comments", ignore_index=True)
comments_dataset = Dataset.from_pandas(comments_df)
comments_dataset = comments_dataset.map(
    lambda x: {"comment_length": len(x["comments"].split())}
)
comments_dataset = comments_dataset.filter(lambda x: x["comment_length"] > 15)
def concatenate_text(examples):
    return {
        "text": examples["title"]
        + " \n "
        + examples["body"]
        + " \n "
        + examples["comments"]
    }


comments_dataset = comments_dataset.map(concatenate_text)

Repo card metadata block was not found. Setting CardData to empty.


Map:   0%|          | 0/2964 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2964 [00:00<?, ? examples/s]

Map:   0%|          | 0/2175 [00:00<?, ? examples/s]

In [16]:
class Retriever:
    def __init__(self, model_ckpt):
        tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
        model = AutoModel.from_pretrained(model_ckpt)

        device = (
            torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        )
        model.to(device)

        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.dataset = None

    def add_dataset_and_index(self, dataset: Dataset, document_column: str):
        embeddings_dataset = dataset.map(
            lambda x: {
                "embeddings": self._encode(x[document_column]).detach().cpu().numpy()[0]
            }
        )
        embeddings_dataset.add_faiss_index(column="embeddings")
        self.dataset = embeddings_dataset

    def find_similar(self, query: str, top_k: int = 10):
        self._check_dataset_exists()

        query_embedding = self._encode(query).detach().cpu().numpy()
        results = self.dataset.get_nearest_examples(
            "embeddings", query_embedding, k=top_k
        )
        return results

    def _encode(self, text_list: list) -> torch.Tensor:
        encoded_input = self.tokenizer(
            text_list, padding=True, truncation=True, return_tensors="pt"
        )
        encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}
        model_output = self.model(**encoded_input)
        return self._cls_pooling(model_output)

    # We get the [CLS] token hidden state from the transformer output and treat it as the text embedding
    def _cls_pooling(self, model_output):
        return model_output.last_hidden_state[:, 0]

    def _check_dataset_exists(self):
        if self.dataset is None:
            raise ValueError("Please add a dataset and create the index first.")

In [17]:
class ReRanker:
    def __init__(self, model_ckpt):
        self.tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_ckpt)
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        self.model.to(self.device)

    def rerank(self, query: str, candidates: list):
        scores = []
        for candidate in candidates:
            input_pair = self.tokenizer(
                query, candidate, truncation=True, padding=True, return_tensors="pt"
            ).to(self.device)

            with torch.no_grad():
                output = self.model(**input_pair)
                score = output.logits.squeeze().item()  # Ostateczny wynik trafności
                scores.append(score)

        # Posortowanie dokumentów według trafności
        sorted_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
        sorted_candidates = [candidates[i] for i in sorted_indices]
        sorted_scores = [scores[i] for i in sorted_indices]

        return sorted_candidates, sorted_scores, sorted_indices

In [19]:
retriever = Retriever("sentence-transformers/multi-qa-mpnet-base-dot-v1")
retriever.add_dataset_and_index(comments_dataset, "text")

scores, samples = retriever.find_similar("How can I load a dataset offline?", top_k=10)
candidates = samples["text"]

Map:   0%|          | 0/2175 [00:00<?, ? examples/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [20]:
reranker = ReRanker("cross-encoder/ms-marco-TinyBERT-L-6")
reranked_candidates, reranked_scores, sorted_indices = reranker.rerank("How can I load a dataset offline?", candidates)
for i, (doc, score, text_ids) in enumerate(zip(reranked_candidates, reranked_scores, sorted_indices)):
        print(f"Rank {i + 1}:")
        print(f"Score: {score}")
        print(f"Text IDs: {text_ids}")
        # print(f"Document: {doc}")
        print("=" * 50)

Rank 1:
Score: 4.005478858947754
Text IDs: 4
Rank 2:
Score: 3.9349308013916016
Text IDs: 2
Rank 3:
Score: 3.7901697158813477
Text IDs: 5
Rank 4:
Score: 3.682295560836792
Text IDs: 0
Rank 5:
Score: 3.6231203079223633
Text IDs: 1
Rank 6:
Score: 3.609133005142212
Text IDs: 3
Rank 7:
Score: 1.4651695489883423
Text IDs: 6
Rank 8:
Score: 1.1875054836273193
Text IDs: 8
Rank 9:
Score: 0.8377321362495422
Text IDs: 9
Rank 10:
Score: -3.504711866378784
Text IDs: 7


In [None]:
from sklearn.metrics import ndcg_score

def compute_recall_at_k(relevant_indices, retrieved_indices, k):
    """
    Oblicz Recall@k: stosunek istotnych dokumentów w top-k do liczby istotnych dokumentów.
    """
    retrieved_top_k = retrieved_indices[:k]
    relevant_retrieved = len(set(relevant_indices).intersection(set(retrieved_top_k)))
    total_relevant = len(relevant_indices)
    return relevant_retrieved / total_relevant if total_relevant > 0 else 0.0

def compute_mrr(relevant_indices, retrieved_indices):
    """
    Oblicz Mean Reciprocal Rank (MRR): odwrotność pozycji pierwszego istotnego dokumentu.
    """
    for rank, doc_id in enumerate(retrieved_indices, start=1):
        if doc_id in relevant_indices:
            return 1 / rank
    return 0.0

def compute_ndcg(relevance_scores, retrieved_scores, k):
    """
    Oblicz NDCG@k: uwzględnia zarówno trafność, jak i pozycję wyników.
    """
    true_relevance = [1 if idx in relevance_scores else 0 for idx in range(len(retrieved_scores[:k]))]
    predicted_relevance = retrieved_scores[:k]
    return ndcg_score([true_relevance], [predicted_relevance])


In [31]:
retrieved_indices = list(range(len(samples["text"])))  # Indeksy wyników przed re-rankingiem
relevant_indices = [0]  # Symulowane istotne wyniki dla zapytania (dla testu)
recall_before = compute_recall_at_k(relevant_indices, retrieved_indices, k=5)
mrr_before = compute_mrr(relevant_indices, retrieved_indices)

In [32]:
reranked_indices = sorted(range(len(reranked_scores)), key=lambda i: reranked_scores[i], reverse=True)
recall_after = compute_recall_at_k(relevant_indices, reranked_indices, k=5)
mrr_after = compute_mrr(relevant_indices, reranked_indices)

In [None]:
reranked_indices = sorted(range(len(reranked_scores)), key=lambda i: reranked_scores[i], reverse=True)
recall_after = compute_recall_at_k(relevant_indices, reranked_indices, k=5)
mrr_after = compute_mrr(relevant_indices, reranked_indices)

# Wyświetlenie wyników
print(f"Recall@5 Before Re-Ranking: {recall_before}")
print(f"Recall@5 After Re-Ranking: {recall_after}")
print(f"MRR Before Re-Ranking: {mrr_before}")
print(f"MRR After Re-Ranking: {mrr_after}")

# Opcjonalnie: NDCG
ndcg_before = compute_ndcg(relevant_indices, scores, k=5)
ndcg_after = compute_ndcg(relevant_indices, reranked_scores, k=5)
print(f"NDCG@5 Before Re-Ranking: {ndcg_before}")
print(f"NDCG@5 After Re-Ranking: {ndcg_after}")

Recall@5 Before Re-Ranking: 1.0
Recall@5 After Re-Ranking: 1.0
MRR Before Re-Ranking: 1.0
MRR After Re-Ranking: 1.0
NDCG@5 Before Re-Ranking: 0.38685280723454163
NDCG@5 After Re-Ranking: 1.0
