In [1]:
# Patch asyncio so that nested event loops are permitted.
# NOTE(review): presumably required because the evaluation cells below drive
# async calls while the notebook kernel's own event loop is already running;
# confirm against the ragas / llama_index versions in use.
import nest_asyncio

nest_asyncio.apply()

In [2]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
import pandas as pd
from llama_hub.semanticscholar.base import SemanticScholarReader



In [3]:
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_relevancy,
    context_recall,
)
from ragas.llama_index import evaluate

from ragas.metrics.critique import harmfulness

# Reader instance shared by the evaluation cells below to load documents.
s2reader = SemanticScholarReader()

# Metrics handed to every evaluate() call in this notebook.
metrics = [faithfulness, answer_relevancy, context_relevancy, context_recall]
metrics += [harmfulness]  # critique metric, imported from ragas.metrics.critique


  from .autonotebook import tqdm as notebook_tqdm


In [4]:

# --- Evaluation inputs -------------------------------------------------------
# Questions and reference answers are paired by position: the i-th reference
# answer belongs to the i-th question.
eval_questions = [
    "What is a paragraph retrieval dataset?",
    "What are some examples of paragraph retrieval datasets?",
    "Why are paragraph retrieval datasets important?",
    "What is the MSMARCO dataset used for?",
    "How can paragraph retrieval datasets help in machine learning?",
]

# evaluate() is given one list of ground truths per question, so every
# reference answer is wrapped in its own single-element list.
eval_answers = [
    [reference_answer]
    for reference_answer in (
        "A paragraph retrieval dataset is a resource used in machine learning and natural language processing that contains various paragraphs, documented to be used in the processes of information retrieval. The dataset can contain paragraphs from a wide variety of topics and could help in tasks like data mining, machine translation, formulation of answers, among others.",
        "Examples of paragraph retrieval datasets include MSMARCO, SQuAD (Stanford Question Answering Dataset), DeepMind Q&A Dataset, NarrativeQA, and NewsQA. Each of these datasets presents different challenges, like answering complex questions or comprehending written narratives in fine detail.",
        "Paragraph retrieval datasets are key to improving the accuracy of information retrieval systems and training machines for complex language-related tasks. They enable systems to understand how to locate the most relevant paragraphs to a given query or subject, which is crucial in fields such as digital assistants, search engines, text summarization, and more.",
        "MSMARCO, also known as Microsoft Machine Reading Comprehension, is a large-scale dataset used in training machine reading comprehension and question answering models. It contains real user queries and manually generated answers, which provide a meaningful challenge to AI models and help improve their real-world performance.",
        "Paragraph retrieval datasets help in machine learning by providing a resource that can be used to train AI models. These models learn to understand the structure, context, and content within paragraphs and improve their information retrieval capabilities, helping in various tasks including text summarization, text generation, machine translation, recommendation systems, among many others.",
    )
]

# --- Build the query engine --------------------------------------------------
# Load documents for this topic through the shared reader (limit=10), index
# them with chunk_size=512, and expose the index as a query engine.
documents = s2reader.load_data(query="datasets for paragraph retrieval", limit=10)
vector_index = VectorStoreIndex.from_documents(
    documents,
    service_context=ServiceContext.from_defaults(chunk_size=512),
)
query_engine = vector_index.as_query_engine()

# --- Run the evaluation ------------------------------------------------------
result = evaluate(query_engine, metrics, eval_questions, eval_answers)

# Last expression of the cell: display the per-question scores as a DataFrame.
result.to_pandas()

evaluating with [faithfulness]


100%|██████████| 1/1 [00:19<00:00, 19.14s/it]


evaluating with [answer_relevancy]


100%|██████████| 1/1 [00:10<00:00, 10.75s/it]


evaluating with [context_relevancy]


100%|██████████| 1/1 [00:09<00:00,  9.24s/it]


evaluating with [context_recall]


100%|██████████| 1/1 [00:11<00:00, 11.47s/it]


evaluating with [harmfulness]


100%|██████████| 1/1 [00:04<00:00,  4.07s/it]


Unnamed: 0,question,contexts,answer,ground_truths,faithfulness,answer_relevancy,context_relevancy,context_recall,harmfulness
0,What is a paragraph retrieval dataset?,[Multi-Hop Paragraph Retrieval for Open-Domain...,A paragraph retrieval dataset is a dataset tha...,[A paragraph retrieval dataset is a resource u...,0.75,0.937075,0.583333,0.0,0
1,What are some examples of paragraph retrieval ...,[Multi-Hop Paragraph Retrieval for Open-Domain...,SQuAD-Open and HotpotQA are examples of paragr...,[Examples of paragraph retrieval datasets incl...,0.0,1.0,0.5625,0.0,0
2,Why are paragraph retrieval datasets important?,[Analysing the Resourcefulness of the Paragrap...,Paragraph retrieval datasets are important bec...,[Paragraph retrieval datasets are key to impro...,1.0,0.991355,0.5,0.0,0
3,What is the MSMARCO dataset used for?,[Teaching Smaller Language Models To Generalis...,The context information does not provide any i...,"[MSMARCO, also known as Microsoft Machine Read...",1.0,0.717319,0.0,0.0,0
4,How can paragraph retrieval datasets help in m...,[Analysing the Resourcefulness of the Paragrap...,Paragraph retrieval datasets can help in machi...,[Paragraph retrieval datasets help in machine ...,0.8,0.978917,0.416667,0.5,0


In [5]:
# --- Evaluation inputs -------------------------------------------------------
# Questions and reference answers are paired by position: the i-th reference
# answer belongs to the i-th question.
eval_questions = [
    "What does it mean when we say a language model is biased?",
    "How do biases enter large language models?",
    "What is the impact of biases in language models?",
    "How can we reduce the biases in large language models?",
]

# evaluate() is given one list of ground truths per question, so every
# reference answer is wrapped in its own single-element list.
eval_answers = [
    [reference_answer]
    for reference_answer in (
        "When we say a language model is biased, we mean that it exhibits or propagates unfair prejudices in favor or against certain groups, subjects, or ideas. It could unfairly favor one group over another or portray specific groups in harmful or discriminatory ways, reflecting societal or systemic biases.",
        "Biases enter large language models during their training process. These models are trained on vast amounts of data from the internet, which often include biased content or narratives. As the models learn from this data to predict or generate text, they absorb and encode these biases, perpetuating them in their outputs.",
        "The impact of biases in language models is significant. Such biases can unintentionally harm individuals or groups that the biases are against by reinforcing stereotypes, spreading misinformation, or promoting discrimination. In addition, they can skew the use of language technologies, affecting decision-making processes in areas like hiring, sentencing, loan approval, and more.",
        "Reducing biases in large language models is a complex task involving multiple strategies. This can include careful curation of the training set to minimize biased content, utilizing de-biasing techniques during model training, auditing and fine-tuning the model's outputs, and incorporating feedback from a diverse set of users. It also involves ongoing research and attention to the social, ethical, and cultural implications of these computational systems.",
    )
]

# --- Build the query engine --------------------------------------------------
# Load documents for this topic through the shared reader (limit=10), index
# them with chunk_size=512, and expose the index as a query engine.
# Note: this rebinds documents / vector_index / query_engine / result from the
# previous evaluation cell.
documents = s2reader.load_data(query="biases in large language models", limit=10)
vector_index = VectorStoreIndex.from_documents(
    documents,
    service_context=ServiceContext.from_defaults(chunk_size=512),
)
query_engine = vector_index.as_query_engine()

# --- Run the evaluation ------------------------------------------------------
result = evaluate(query_engine, metrics, eval_questions, eval_answers)

# Last expression of the cell: display the per-question scores as a DataFrame.
result.to_pandas()

evaluating with [faithfulness]


100%|██████████| 1/1 [00:18<00:00, 18.45s/it]


evaluating with [answer_relevancy]


100%|██████████| 1/1 [00:05<00:00,  5.76s/it]


evaluating with [context_relevancy]


100%|██████████| 1/1 [00:09<00:00,  9.48s/it]


evaluating with [context_recall]


100%|██████████| 1/1 [00:07<00:00,  7.61s/it]


evaluating with [harmfulness]


100%|██████████| 1/1 [00:05<00:00,  5.14s/it]


Unnamed: 0,question,contexts,answer,ground_truths,faithfulness,answer_relevancy,context_relevancy,context_recall,harmfulness
0,What does it mean when we say a language model...,"[Biases in Large Language Models: Origins, Inv...","When we say a language model is biased, it mea...","[When we say a language model is biased, we me...",1.0,1.0,0.888889,0.0,0
1,How do biases enter large language models?,"[Biases in Large Language Models: Origins, Inv...",Biases enter large language models through var...,[Biases enter large language models during the...,0.888889,0.945333,0.5,1.0,0
2,What is the impact of biases in language models?,"[Biases in Large Language Models: Origins, Inv...",The impact of biases in language models can ha...,[The impact of biases in language models is si...,1.0,0.970033,0.666667,0.0,0
3,How can we reduce the biases in large language...,"[Biases in Large Language Models: Origins, Inv...","There are directions focused on measuring, red...",[Reducing biases in large language models is a...,1.0,0.887473,0.333333,1.0,0
