In [1]:
# enable autoreload
%load_ext autoreload
%autoreload 2

In [2]:
from askharrison.arxiv_search import expand_arxiv_query, run_multi_arixv_queries

In [3]:
problem_statement = """RAG(retrieval augmented generation) sometimes the sources retrieved are relevant but not enough to answer the question user ask. How do research approach this case"""

In [4]:
search_queries = expand_arxiv_query(problem_statement)

In [5]:
search_queries

['Relevant source retrieval in RAG',
 'Dealing with inadequate sources in RAG',
 'Improving source retrieval in Retrieval Augmented Generation',
 'Strategies to enhance relevancy of retrieved sources in RAG',
 'Techniques for better source retrieval in RAG',
 'Addressing relevancy in source retrieval in RAG',
 'Approaches to improve source usefulness in RAG',
 'Optimizing source retrieval in Retrieval Augmented Generation',
 'Methods to boost relevant source retrieval in Retrieval Augmented Generation',
 'Resolving relevance issue in source retrieval in RAG']

In [6]:
arxiv_query_results = run_multi_arixv_queries(search_queries)

100%|██████████| 10/10 [00:24<00:00,  2.45s/it]


In [7]:
len(arxiv_query_results.items())

10

In [8]:
# flatten arxiv_query_results
all_results = []
for query in arxiv_query_results:
    for result in arxiv_query_results[query]:
        all_results.append(result)

# make arxiv query results a dataframe and create a new dataframe with only unique entry_id
import pandas as pd

arixv_result_df = pd.DataFrame(all_results)
unique_arixv_result_df = arixv_result_df.drop_duplicates(subset='entry_id')

In [9]:
arixv_result_df.shape, unique_arixv_result_df.shape

((200, 13), (95, 13))

In [10]:
arixv_result_df.groupby('entry_id').count().sort_values('title', ascending=False).head(10)

Unnamed: 0_level_0,authors,categories,comment,doi,journal_ref,links,pdf_url,primary_category,published,summary,title,updated
entry_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
http://arxiv.org/abs/2410.07176v1,7,7,7,0,0,7,7,7,7,7,7,7
http://arxiv.org/abs/2405.19207v1,7,7,7,0,0,7,7,7,7,7,7,7
http://arxiv.org/abs/2402.17497v1,7,7,0,0,0,7,7,7,7,7,7,7
http://arxiv.org/abs/2402.11891v1,6,6,0,0,0,6,6,6,6,6,6,6
http://arxiv.org/abs/2410.03780v1,6,6,0,0,0,6,6,6,6,6,6,6
http://arxiv.org/abs/2404.07221v2,6,6,0,0,0,6,6,6,6,6,6,6
http://arxiv.org/abs/2406.13213v2,6,6,6,0,0,6,6,6,6,6,6,6
http://arxiv.org/abs/2410.01782v1,5,5,5,0,0,5,5,5,5,5,5,5
http://arxiv.org/abs/2210.02627v1,5,5,5,0,0,5,5,5,5,5,5,5
http://arxiv.org/abs/2407.11005v1,5,5,0,0,0,5,5,5,5,5,5,5


In [11]:
from askharrison.prompts.content_curation import create_arxiv_filtering_prompt

In [12]:
help(create_arxiv_filtering_prompt)

Help on function create_arxiv_filtering_prompt in module askharrison.prompts.content_curation:

create_arxiv_filtering_prompt(problem_statement: str, doc_abstract: str)



In [13]:
# create a prompt for each arxiv entry
arxiv_reranking_prompts = [create_arxiv_filtering_prompt(problem_statement, 
                                         record['title']+"\n"+record['summary']) for record in unique_arixv_result_df.to_dict(orient='records')]

In [14]:
from askharrison.llm_models import parallel_llm_processor, process_question, safe_eval, extract_python_code

In [15]:
reranking_llm_response = parallel_llm_processor(arxiv_reranking_prompts, llm_function=process_question, 
                                                max_workers=8)

Processing prompts: 100%|██████████| 95/95 [00:30<00:00,  3.11it/s]


In [23]:
llm_responses_results = [safe_eval(extract_python_code(response)) for response in reranking_llm_response]
# filter out empty responses

unique_arixv_result_df.shape, len(llm_responses_results)

((95, 13), 95)

In [25]:
# extract reasoning, is_direct, 'is_relevant' from llm_responses_results if it is not empty, and add to unique_arixv_result_df
unique_arixv_result_df['reasoning'] = [response['reasoning'] if response else None for response in llm_responses_results]
unique_arixv_result_df['is_direct'] = [response['is_direct'] if response else None for response in llm_responses_results]
unique_arixv_result_df['is_relevant'] = [response['is_relevant'] if response else None for response in llm_responses_results]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_arixv_result_df['reasoning'] = [response['reasoning'] if response else None for response in llm_responses_results]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_arixv_result_df['is_direct'] = [response['is_direct'] if response else None for response in llm_responses_results]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

In [31]:
# increase max column width in pandas
pd.set_option('display.max_colwidth', None)

In [32]:
unique_arixv_result_df.query('is_direct == True')\
    [['title','entry_id', 'summary', 'is_relevant']]

Unnamed: 0,title,entry_id,summary,is_relevant
1,RAGBench: Explainable Benchmark for Retrieval-Augmented Generation Systems,http://arxiv.org/abs/2407.11005v1,"Retrieval-Augmented Generation (RAG) has become a standard architectural\npattern for incorporating domain-specific knowledge into user-facing chat\napplications powered by Large Language Models (LLMs). RAG systems are\ncharacterized by (1) a document retriever that queries a domain-specific corpus\nfor context information relevant to an input query, and (2) an LLM that\ngenerates a response based on the provided query and context. However,\ncomprehensive evaluation of RAG systems remains a challenge due to the lack of\nunified evaluation criteria and annotated datasets. In response, we introduce\nRAGBench: the first comprehensive, large-scale RAG benchmark dataset of 100k\nexamples. It covers five unique industry-specific domains and various RAG task\ntypes. RAGBench examples are sourced from industry corpora such as user\nmanuals, making it particularly relevant for industry applications. Further, we\nformalize the TRACe evaluation framework: a set of explainable and actionable\nRAG evaluation metrics applicable across all RAG domains. We release the\nlabeled dataset at https://huggingface.co/datasets/rungalileo/ragbench.\nRAGBench explainable labels facilitate holistic evaluation of RAG systems,\nenabling actionable feedback for continuous improvement of production\napplications. Thorough extensive benchmarking, we find that LLM-based RAG\nevaluation methods struggle to compete with a finetuned RoBERTa model on the\nRAG evaluation task. We identify areas where existing approaches fall short and\npropose the adoption of RAGBench with TRACe towards advancing the state of RAG\nevaluation systems.",5.0
3,Open-RAG: Enhanced Retrieval-Augmented Reasoning with Open-Source Large Language Models,http://arxiv.org/abs/2410.01782v1,"Retrieval-Augmented Generation (RAG) has been shown to enhance the factual\naccuracy of Large Language Models (LLMs), but existing methods often suffer\nfrom limited reasoning capabilities in effectively using the retrieved\nevidence, particularly when using open-source LLMs. To mitigate this gap, we\nintroduce a novel framework, Open-RAG, designed to enhance reasoning\ncapabilities in RAG with open-source LLMs. Our framework transforms an\narbitrary dense LLM into a parameter-efficient sparse mixture of experts (MoE)\nmodel capable of handling complex reasoning tasks, including both single- and\nmulti-hop queries. Open-RAG uniquely trains the model to navigate challenging\ndistractors that appear relevant but are misleading. As a result, Open-RAG\nleverages latent learning, dynamically selecting relevant experts and\nintegrating external knowledge effectively for more accurate and contextually\nrelevant responses. In addition, we propose a hybrid adaptive retrieval method\nto determine retrieval necessity and balance the trade-off between performance\ngain and inference speed. Experimental results show that the Llama2-7B-based\nOpen-RAG outperforms state-of-the-art LLMs and RAG models such as ChatGPT,\nSelf-RAG, and Command R+ in various knowledge-intensive tasks. We open-source\nour code and models at https://openragmoe.github.io/",5.0
6,A Multi-Source Retrieval Question Answering Framework Based on RAG,http://arxiv.org/abs/2405.19207v1,"With the rapid development of large-scale language models,\nRetrieval-Augmented Generation (RAG) has been widely adopted. However, existing\nRAG paradigms are inevitably influenced by erroneous retrieval information,\nthereby reducing the reliability and correctness of generated results.\nTherefore, to improve the relevance of retrieval information, this study\nproposes a method that replaces traditional retrievers with GPT-3.5, leveraging\nits vast corpus knowledge to generate retrieval information. We also propose a\nweb retrieval based method to implement fine-grained knowledge retrieval,\nUtilizing the powerful reasoning capability of GPT-3.5 to realize semantic\npartitioning of problem.In order to mitigate the illusion of GPT retrieval and\nreduce noise in Web retrieval,we proposes a multi-source retrieval framework,\nnamed MSRAG, which combines GPT retrieval with web retrieval. Experiments on\nmultiple knowledge-intensive QA datasets demonstrate that the proposed\nframework in this study performs better than existing RAG framework in\nenhancing the overall efficiency and accuracy of QA systems.",5.0
8,Improving Retrieval for RAG based Question Answering Models on Financial Documents,http://arxiv.org/abs/2404.07221v2,"The effectiveness of Large Language Models (LLMs) in generating accurate\nresponses relies heavily on the quality of input provided, particularly when\nemploying Retrieval Augmented Generation (RAG) techniques. RAG enhances LLMs by\nsourcing the most relevant text chunk(s) to base queries upon. Despite the\nsignificant advancements in LLMs' response quality in recent years, users may\nstill encounter inaccuracies or irrelevant answers; these issues often stem\nfrom suboptimal text chunk retrieval by RAG rather than the inherent\ncapabilities of LLMs. To augment the efficacy of LLMs, it is crucial to refine\nthe RAG process. This paper explores the existing constraints of RAG pipelines\nand introduces methodologies for enhancing text retrieval. It delves into\nstrategies such as sophisticated chunking techniques, query expansion, the\nincorporation of metadata annotations, the application of re-ranking\nalgorithms, and the fine-tuning of embedding algorithms. Implementing these\napproaches can substantially improve the retrieval quality, thereby elevating\nthe overall performance and reliability of LLMs in processing and responding to\nqueries.",5.0
16,MultiHop-RAG: Benchmarking Retrieval-Augmented Generation for Multi-Hop Queries,http://arxiv.org/abs/2401.15391v1,"Retrieval-augmented generation (RAG) augments large language models (LLM) by\nretrieving relevant knowledge, showing promising potential in mitigating LLM\nhallucinations and enhancing response quality, thereby facilitating the great\nadoption of LLMs in practice. However, we find that existing RAG systems are\ninadequate in answering multi-hop queries, which require retrieving and\nreasoning over multiple pieces of supporting evidence. Furthermore, to our\nknowledge, no existing RAG benchmarking dataset focuses on multi-hop queries.\nIn this paper, we develop a novel dataset, MultiHop-RAG, which consists of a\nknowledge base, a large collection of multi-hop queries, their ground-truth\nanswers, and the associated supporting evidence. We detail the procedure of\nbuilding the dataset, utilizing an English news article dataset as the\nunderlying RAG knowledge base. We demonstrate the benchmarking utility of\nMultiHop-RAG in two experiments. The first experiment compares different\nembedding models for retrieving evidence for multi-hop queries. In the second\nexperiment, we examine the capabilities of various state-of-the-art LLMs,\nincluding GPT-4, PaLM, and Llama2-70B, in reasoning and answering multi-hop\nqueries given the evidence. Both experiments reveal that existing RAG methods\nperform unsatisfactorily in retrieving and answering multi-hop queries. We hope\nMultiHop-RAG will be a valuable resource for the community in developing\neffective RAG systems, thereby facilitating greater adoption of LLMs in\npractice. The MultiHop-RAG and implemented RAG system is publicly available at\nhttps://github.com/yixuantt/MultiHop-RAG/.",5.0
17,Long-Context LLMs Meet RAG: Overcoming Challenges for Long Inputs in RAG,http://arxiv.org/abs/2410.05983v1,"Retrieval-augmented generation (RAG) empowers large language models (LLMs) to\nutilize external knowledge sources. The increasing capacity of LLMs to process\nlonger input sequences opens up avenues for providing more retrieved\ninformation, to potentially enhance the quality of generated outputs. It is\nplausible to assume that a larger retrieval set would contain more relevant\ninformation (higher recall), that might result in improved performance.\nHowever, our empirical findings demonstrate that for many long-context LLMs,\nthe quality of generated output initially improves first, but then subsequently\ndeclines as the number of retrieved passages increases. This paper investigates\nthis phenomenon, identifying the detrimental impact of retrieved ""hard\nnegatives"" as a key contributor. To mitigate this and enhance the robustness of\nlong-context LLM-based RAG, we propose both training-free and training-based\napproaches. We first showcase the effectiveness of retrieval reordering as a\nsimple yet powerful training-free optimization. Furthermore, we explore\ntraining-based methods, specifically RAG-specific implicit LLM fine-tuning and\nRAG-oriented fine-tuning with intermediate reasoning, demonstrating their\ncapacity for substantial performance gains. Finally, we conduct a systematic\nanalysis of design choices for these training-based methods, including data\ndistribution, retriever selection, and training context length.",5.0
38,Does RAG Introduce Unfairness in LLMs? Evaluating Fairness in Retrieval-Augmented Generation Systems,http://arxiv.org/abs/2409.19804v1,"RAG (Retrieval-Augmented Generation) have recently gained significant\nattention for their enhanced ability to integrate external knowledge sources in\nopen-domain question answering (QA) tasks. However, it remains unclear how\nthese models address fairness concerns, particularly with respect to sensitive\nattributes such as gender, geographic location, and other demographic factors.\nFirst, as language models evolve to prioritize utility, like improving exact\nmatch accuracy, fairness may have been largely overlooked. Second, RAG methods\nare complex pipelines, making it hard to identify and address biases, as each\ncomponent is optimized for different goals. In this paper, we aim to\nempirically evaluate fairness in several RAG methods. We propose a fairness\nevaluation framework tailored to RAG methods, using scenario-based questions\nand analyzing disparities across demographic attributes. The experimental\nresults indicate that, despite recent advances in utility-driven optimization,\nfairness issues persist in both the retrieval and generation stages,\nhighlighting the need for more targeted fairness interventions within RAG\npipelines. We will release our dataset and code upon acceptance of the paper.",5.0
41,Augmentation-Adapted Retriever Improves Generalization of Language Models as Generic Plug-In,http://arxiv.org/abs/2305.17331v1,"Retrieval augmentation can aid language models (LMs) in knowledge-intensive\ntasks by supplying them with external information. Prior works on retrieval\naugmentation usually jointly fine-tune the retriever and the LM, making them\nclosely coupled. In this paper, we explore the scheme of generic retrieval\nplug-in: the retriever is to assist target LMs that may not be known beforehand\nor are unable to be fine-tuned together. To retrieve useful documents for\nunseen target LMs, we propose augmentation-adapted retriever (AAR), which\nlearns LM's preferences obtained from a known source LM. Experiments on the\nMMLU and PopQA datasets demonstrate that our AAR trained with a small source LM\nis able to significantly improve the zero-shot generalization of larger target\nLMs ranging from 250M Flan-T5 to 175B InstructGPT. Further analysis indicates\nthat the preferences of different LMs overlap, enabling AAR trained with a\nsingle source LM to serve as a generic plug-in for various target LMs. Our code\nis open-sourced at https://github.com/OpenMatch/Augmentation-Adapted-Retriever.",5.0
43,Improving Retrieval-Augmented Code Comment Generation by Retrieving for Generation,http://arxiv.org/abs/2408.03623v1,"Code comment generation aims to generate high-quality comments from source\ncode automatically and has been studied for years. Recent studies proposed to\nintegrate information retrieval techniques with neural generation models to\ntackle this problem, i.e., Retrieval-Augmented Comment Generation (RACG)\napproaches, and achieved state-of-the-art results. However, the retrievers in\nprevious work are built independently of their generators. This results in that\nthe retrieved exemplars are not necessarily the most useful ones for generating\ncomments, limiting the performance of existing approaches. To address this\nlimitation, we propose a novel training strategy to enable the retriever to\nlearn from the feedback of the generator and retrieve exemplars for generation.\nSpecifically, during training, we use the retriever to retrieve the top-k\nexemplars and calculate their retrieval scores, and use the generator to\ncalculate a generation loss for the sample based on each exemplar. By aligning\nhigh-score exemplars retrieved by the retriever with low-loss exemplars\nobserved by the generator, the retriever can learn to retrieve exemplars that\ncan best improve the quality of the generated comments. Based on this strategy,\nwe propose a novel RACG approach named JOINTCOM and evaluate it on two\nreal-world datasets, JCSD and PCSD. The experimental results demonstrate that\nour approach surpasses the state-of-the-art baselines by 7.3% to 30.0% in terms\nof five metrics on the two datasets. We also conduct a human evaluation to\ncompare JOINTCOM with the best-performing baselines. The results indicate that\nJOINTCOM outperforms the baselines, producing comments that are more natural,\ninformative, and useful.",5.0
48,QPaug: Question and Passage Augmentation for Open-Domain Question Answering of LLMs,http://arxiv.org/abs/2406.14277v2,"Retrieval-augmented generation (RAG) has received much attention for\nOpen-domain question-answering (ODQA) tasks as a means to compensate for the\nparametric knowledge of large language models (LLMs). While previous approaches\nfocused on processing retrieved passages to remove irrelevant context, they\nstill rely heavily on the quality of retrieved passages which can degrade if\nthe question is ambiguous or complex. In this paper, we propose a simple yet\nefficient method called question and passage augmentation (QPaug) via LLMs for\nopen-domain QA. QPaug first decomposes the original questions into\nmultiple-step sub-questions. By augmenting the original question with detailed\nsub-questions and planning, we are able to make the query more specific on what\nneeds to be retrieved, improving the retrieval performance. In addition, to\ncompensate for the case where the retrieved passages contain distracting\ninformation or divided opinions, we augment the retrieved passages with\nself-generated passages by LLMs to guide the answer extraction. Experimental\nresults show that QPaug outperforms the previous state-of-the-art and achieves\nsignificant performance gain over existing RAG methods. The source code is\navailable at \url{https://github.com/kmswin1/QPaug}.",5.0
