In [1]:
# enable autoreload
%load_ext autoreload
%autoreload 2

In [2]:
from askharrison.arxiv_search import expand_arxiv_query, run_multi_arixv_queries

In [3]:
problem_statement = """RAG(retrieval augmented generation) sometimes the sources retrieved are relevant but not enough to answer the question user ask. How do research approach this case"""

In [23]:
problem_statement = """reduce hallucination in RAG(retrieval augmented generation), make queries tailored for arxiv search, and then use the retrieved papers to answer the question"""

In [24]:
search_queries = expand_arxiv_query(problem_statement)

In [25]:
search_queries

['Reducing hallucination in retrieval augmented generation models',
 'Techniques for minimizing hallucination in RAG models',
 'Enhancing the effectiveness of Retrieval Augmented Generation',
 'Studies on improving the accuracy of Retrieval Augmented Generation',
 'Methods to reduce artifact generation in RAG models',
 'Optimization techniques in retrieval augmented generation models',
 'Novel methods for reducing hallucination in RAG',
 'Approaches for improving reliability in retrieval augmented generation models',
 'Research on limiting hallucination in RAG',
 'Advanced strategies for retrieval augmented generation optimization']

In [26]:
arxiv_query_results = run_multi_arixv_queries(search_queries)

100%|██████████| 10/10 [00:25<00:00,  2.53s/it]


In [27]:
len(arxiv_query_results.items())

10

In [28]:
# flatten arxiv_query_results
all_results = []
for query in arxiv_query_results:
    for result in arxiv_query_results[query]:
        all_results.append(result)

# make arxiv query results a dataframe and create a new dataframe with only unique entry_id
import pandas as pd

arixv_result_df = pd.DataFrame(all_results)
unique_arixv_result_df = arixv_result_df.drop_duplicates(subset='entry_id')

In [29]:
arixv_result_df.shape, unique_arixv_result_df.shape

((200, 13), (159, 13))

In [30]:
arixv_result_df.groupby('entry_id').count().sort_values('title', ascending=False).head(10)

Unnamed: 0_level_0,authors,categories,comment,doi,journal_ref,links,pdf_url,primary_category,published,summary,title,updated
entry_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
http://arxiv.org/abs/2410.18251v1,4,4,4,0,0,4,4,4,4,4,4,4
http://arxiv.org/abs/2410.22353v1,3,3,0,0,0,3,3,3,3,3,3,3
http://arxiv.org/abs/2401.15391v1,3,3,3,0,0,3,3,3,3,3,3,3
http://arxiv.org/abs/2410.17783v1,3,3,3,0,0,3,3,3,3,3,3,3
http://arxiv.org/abs/2401.05856v1,3,3,0,0,0,3,3,3,3,3,3,3
http://arxiv.org/abs/2410.12248v1,3,3,0,0,0,3,3,3,3,3,3,3
http://arxiv.org/abs/2406.19150v1,3,3,0,0,0,3,3,3,3,3,3,3
http://arxiv.org/abs/2408.15533v2,3,3,0,0,0,3,3,3,3,3,3,3
http://arxiv.org/abs/2410.11414v2,3,3,3,0,0,3,3,3,3,3,3,3
http://arxiv.org/abs/2411.12759v1,3,3,0,0,0,3,3,3,3,3,3,3


In [31]:
from askharrison.prompts.content_curation import create_arxiv_filtering_prompt

In [32]:
help(create_arxiv_filtering_prompt)

Help on function create_arxiv_filtering_prompt in module askharrison.prompts.content_curation:

create_arxiv_filtering_prompt(problem_statement: str, doc_abstract: str)



In [33]:
# create a prompt for each arxiv entry
arxiv_reranking_prompts = [create_arxiv_filtering_prompt(problem_statement, 
                                         record['title']+"\n"+record['summary']) for record in unique_arixv_result_df.to_dict(orient='records')]

In [34]:
from askharrison.llm_models import parallel_llm_processor, process_question, safe_eval, extract_python_code

In [35]:
reranking_llm_response = parallel_llm_processor(arxiv_reranking_prompts, llm_function=process_question, 
                                                max_workers=8)

Processing prompts: 100%|██████████| 159/159 [01:11<00:00,  2.22it/s]


In [36]:
llm_responses_results = [safe_eval(extract_python_code(response)) for response in reranking_llm_response]
# filter out empty responses

unique_arixv_result_df.shape, len(llm_responses_results)

((159, 13), 159)

In [37]:
# extract reasoning, is_direct, 'is_relevant' from llm_responses_results if it is not empty, and add to unique_arixv_result_df
unique_arixv_result_df['reasoning'] = [response['reasoning'] if response else None for response in llm_responses_results]
unique_arixv_result_df['is_direct'] = [response['is_direct'] if response else None for response in llm_responses_results]
unique_arixv_result_df['is_relevant'] = [response['is_relevant'] if response else None for response in llm_responses_results]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_arixv_result_df['reasoning'] = [response['reasoning'] if response else None for response in llm_responses_results]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_arixv_result_df['is_direct'] = [response['is_direct'] if response else None for response in llm_responses_results]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

In [38]:
# increase max column width in pandas
pd.set_option('display.max_colwidth', None)

In [39]:
unique_arixv_result_df.query('is_direct == True').shape

(15, 16)

In [40]:
unique_arixv_result_df.query('is_direct == True')\
    [['title','entry_id', 'summary', 'is_relevant']]

Unnamed: 0,title,entry_id,summary,is_relevant
6,Context-Augmented Code Generation Using Programming Knowledge Graphs,http://arxiv.org/abs/2410.18251v1,"Large Language Models (LLMs) and Code-LLMs (CLLMs) have significantly\nimproved code generation, but, they frequently face difficulties when dealing\nwith challenging and complex problems. Retrieval-Augmented Generation (RAG)\naddresses this issue by retrieving and integrating external knowledge at the\ninference time. However, retrieval models often fail to find most relevant\ncontext, and generation models, with limited context capacity, can hallucinate\nwhen given irrelevant data. We present a novel framework that leverages a\nProgramming Knowledge Graph (PKG) to semantically represent and retrieve code.\nThis approach enables fine-grained code retrieval by focusing on the most\nrelevant segments while reducing irrelevant context through a tree-pruning\ntechnique. PKG is coupled with a re-ranking mechanism to reduce even more\nhallucinations by selectively integrating non-RAG solutions. We propose two\nretrieval approaches-block-wise and function-wise-based on the PKG, optimizing\ncontext granularity. Evaluations on the HumanEval and MBPP benchmarks show our\nmethod improves pass@1 accuracy by up to 20%, and outperforms state-of-the-art\nmodels by up to 34% on MBPP. Our contributions include PKG-based retrieval,\ntree pruning to enhance retrieval precision, a re-ranking method for robust\nsolution selection and a Fill-in-the-Middle (FIM) enhancer module for automatic\ncode augmentation with relevant comments and docstrings.",4
10,Coarse-to-Fine Highlighting: Reducing Knowledge Hallucination in Large Language Models,http://arxiv.org/abs/2410.15116v1,"Generation of plausible but incorrect factual information, often termed\nhallucination, has attracted significant research interest. Retrieval-augmented\nlanguage model (RALM) -- which enhances models with up-to-date knowledge --\nemerges as a promising method to reduce hallucination. However, existing RALMs\nmay instead exacerbate hallucination when retrieving lengthy contexts. To\naddress this challenge, we propose COFT, a novel\n\textbf{CO}arse-to-\textbf{F}ine highligh\textbf{T}ing method to focus on\ndifferent granularity-level key texts, thereby avoiding getting lost in lengthy\ncontexts. Specifically, COFT consists of three components: \textit{recaller},\n\textit{scorer}, and \textit{selector}. First, \textit{recaller} applies a\nknowledge graph to extract potential key entities in a given context. Second,\n\textit{scorer} measures the importance of each entity by calculating its\ncontextual weight. Finally, \textit{selector} selects high contextual weight\nentities with a dynamic threshold algorithm and highlights the corresponding\nparagraphs, sentences, or words in a coarse-to-fine manner. Extensive\nexperiments on the knowledge hallucination benchmark demonstrate the\neffectiveness of COFT, leading to a superior performance over $30\%$ in the F1\nscore metric. Moreover, COFT also exhibits remarkable versatility across\nvarious long-form tasks, such as reading comprehension and question answering.",4
14,The Effects of Hallucinations in Synthetic Training Data for Relation Extraction,http://arxiv.org/abs/2410.08393v1,"Relation extraction is crucial for constructing knowledge graphs, with large\nhigh-quality datasets serving as the foundation for training, fine-tuning, and\nevaluating models. Generative data augmentation (GDA) is a common approach to\nexpand such datasets. However, this approach often introduces hallucinations,\nsuch as spurious facts, whose impact on relation extraction remains\nunderexplored. In this paper, we examine the effects of hallucinations on the\nperformance of relation extraction on the document and sentence levels. Our\nempirical study reveals that hallucinations considerably compromise the ability\nof models to extract relations from text, with recall reductions between 19.1%\nand 39.2%. We identify that relevant hallucinations impair the model's\nperformance, while irrelevant hallucinations have a minimal impact.\nAdditionally, we develop methods for the detection of hallucinations to improve\ndata quality and model performance. Our approaches successfully classify texts\nas either 'hallucinated' or 'clean,' achieving high F1-scores of 83.8% and\n92.2%. These methods not only assist in removing hallucinations but also help\nin estimating their prevalence within datasets, which is crucial for selecting\nhigh-quality data. Overall, our work confirms the profound impact of relevant\nhallucinations on the effectiveness of relation extraction models.",4
29,The Geometry of Queries: Query-Based Innovations in Retrieval-Augmented Generation,http://arxiv.org/abs/2407.18044v1,"Digital health chatbots powered by Large Language Models (LLMs) have the\npotential to significantly improve personal health management for chronic\nconditions by providing accessible and on-demand health coaching and\nquestion-answering. However, these chatbots risk providing unverified and\ninaccurate information because LLMs generate responses based on patterns\nlearned from diverse internet data. Retrieval Augmented Generation (RAG) can\nhelp mitigate hallucinations and inaccuracies in LLM responses by grounding it\non reliable content. However, efficiently and accurately retrieving most\nrelevant set of content for real-time user questions remains a challenge. In\nthis work, we introduce Query-Based Retrieval Augmented Generation (QB-RAG), a\nnovel approach that pre-computes a database of potential queries from a content\nbase using LLMs. For an incoming patient question, QB-RAG efficiently matches\nit against this pre-generated query database using vector search, improving\nalignment between user questions and the content. We establish a theoretical\nfoundation for QB-RAG and provide a comparative analysis of existing retrieval\nenhancement techniques for RAG systems. Finally, our empirical evaluation\ndemonstrates that QB-RAG significantly improves the accuracy of healthcare\nquestion answering, paving the way for robust and trustworthy LLM applications\nin digital health.",5
37,RAGged Edges: The Double-Edged Sword of Retrieval-Augmented Chatbots,http://arxiv.org/abs/2403.01193v3,"Large language models (LLMs) like ChatGPT demonstrate the remarkable progress\nof artificial intelligence. However, their tendency to hallucinate -- generate\nplausible but false information -- poses a significant challenge. This issue is\ncritical, as seen in recent court cases where ChatGPT's use led to citations of\nnon-existent legal rulings. This paper explores how Retrieval-Augmented\nGeneration (RAG) can counter hallucinations by integrating external knowledge\nwith prompts. We empirically evaluate RAG against standard LLMs using prompts\ndesigned to induce hallucinations. Our results show that RAG increases accuracy\nin some cases, but can still be misled when prompts directly contradict the\nmodel's pre-trained understanding. These findings highlight the complex nature\nof hallucinations and the need for more robust solutions to ensure LLM\nreliability in real-world applications. We offer practical recommendations for\nRAG deployment and discuss implications for the development of more trustworthy\nLLMs.",5
77,Progressive Query Expansion for Retrieval Over Cost-constrained Data Sources,http://arxiv.org/abs/2406.07136v1,"Query expansion has been employed for a long time to improve the accuracy of\nquery retrievers. Earlier works relied on pseudo-relevance feedback (PRF)\ntechniques, which augment a query with terms extracted from documents retrieved\nin a first stage. However, the documents may be noisy hindering the\neffectiveness of the ranking. To avoid this, recent studies have instead used\nLarge Language Models (LLMs) to generate additional content to expand a query.\nThese techniques are prone to hallucination and also focus on the LLM usage\ncost. However, the cost may be dominated by the retrieval in several important\npractical scenarios, where the corpus is only available via APIs which charge a\nfee per retrieved document. We propose combining classic PRF techniques with\nLLMs and create a progressive query expansion algorithm ProQE that iteratively\nexpands the query as it retrieves more documents. ProQE is compatible with both\nsparse and dense retrieval systems. Our experimental results on four retrieval\ndatasets show that ProQE outperforms state-of-the-art baselines by 37% and is\nthe most cost-effective.",5
91,Seven Failure Points When Engineering a Retrieval Augmented Generation System,http://arxiv.org/abs/2401.05856v1,"Software engineers are increasingly adding semantic search capabilities to\napplications using a strategy known as Retrieval Augmented Generation (RAG). A\nRAG system involves finding documents that semantically match a query and then\npassing the documents to a large language model (LLM) such as ChatGPT to\nextract the right answer using an LLM. RAG systems aim to: a) reduce the\nproblem of hallucinated responses from LLMs, b) link sources/references to\ngenerated responses, and c) remove the need for annotating documents with\nmeta-data. However, RAG systems suffer from limitations inherent to information\nretrieval systems and from reliance on LLMs. In this paper, we present an\nexperience report on the failure points of RAG systems from three case studies\nfrom separate domains: research, education, and biomedical. We share the\nlessons learned and present 7 failure points to consider when designing a RAG\nsystem. The two key takeaways arising from our work are: 1) validation of a RAG\nsystem is only feasible during operation, and 2) the robustness of a RAG system\nevolves rather than designed in at the start. We conclude with a list of\npotential research directions on RAG systems for the software engineering\ncommunity.",4
102,Query Optimization for Parametric Knowledge Refinement in Retrieval-Augmented Large Language Models,http://arxiv.org/abs/2411.07820v2,"We introduce the Extract-Refine-Retrieve-Read (ERRR) framework, a novel\napproach designed to bridge the pre-retrieval information gap in\nRetrieval-Augmented Generation (RAG) systems through query optimization\ntailored to meet the specific knowledge requirements of Large Language Models\n(LLMs). Unlike conventional query optimization techniques used in RAG, the ERRR\nframework begins by extracting parametric knowledge from LLMs, followed by\nusing a specialized query optimizer for refining these queries. This process\nensures the retrieval of only the most pertinent information essential for\ngenerating accurate responses. Moreover, to enhance flexibility and reduce\ncomputational costs, we propose a trainable scheme for our pipeline that\nutilizes a smaller, tunable model as the query optimizer, which is refined\nthrough knowledge distillation from a larger teacher model. Our evaluations on\nvarious question-answering (QA) datasets and with different retrieval systems\nshow that ERRR consistently outperforms existing baselines, proving to be a\nversatile and cost-effective module for improving the utility and accuracy of\nRAG systems.",5
108,Retrieve Anything To Augment Large Language Models,http://arxiv.org/abs/2310.07554v2,"Large language models (LLMs) face significant challenges stemming from their\ninherent limitations in knowledge, memory, alignment, and action. These\nchallenges cannot be addressed by LLMs alone, but should rely on assistance\nfrom the external world, such as knowledge base, memory store, demonstration\nexamples, and tools. Retrieval augmentation stands as a vital mechanism for\nbridging the gap between LLMs and the external assistance. However,\nconventional methods encounter two pressing issues. On the one hand, the\ngeneral-purpose retrievers are not properly optimized for the retrieval\naugmentation of LLMs. On the other hand, the task-specific retrievers lack the\nrequired versatility, hindering their performance across the diverse retrieval\naugmentation scenarios.\n In this work, we present a novel approach, the LLM-Embedder, which\ncomprehensively supports the diverse retrieval augmentation needs of LLMs with\none unified embedding model. Training such a unified model is non-trivial, as\nvarious retrieval tasks aim to capture distinct semantic relationships, often\nsubject to mutual interference. To address this challenge, we systematically\noptimize our training methodology. This includes reward formulation based on\nLLMs' feedback, the stabilization of knowledge distillation, multi-task\nfine-tuning with explicit instructions, and homogeneous in-batch negative\nsampling. These optimization strategies contribute to the outstanding empirical\nperformance of the LLM-Embedder. Notably, it yields remarkable enhancements in\nretrieval augmentation for LLMs, surpassing both general-purpose and\ntask-specific retrievers in various evaluation scenarios. Our checkpoint and\nsource code are publicly available at\nhttps://github.com/FlagOpen/FlagEmbedding.",5
117,GABO: Graph Augmentations with Bi-level Optimization,http://arxiv.org/abs/2104.00722v1,"Data augmentation refers to a wide range of techniques for improving model\ngeneralization by augmenting training examples. Oftentimes such methods require\ndomain knowledge about the dataset at hand, spawning a plethora of recent\nliterature surrounding automated techniques for data augmentation. In this work\nwe apply one such method, bilevel optimization, to tackle the problem of graph\nclassification on the ogbg-molhiv dataset. Our best performing augmentation\nachieved a test ROCAUC score of 77.77 % with a GIN+virtual classifier, which\nmakes it the most effective augmenter for this classifier on the leaderboard.\nThis framework combines a GIN layer augmentation generator with a bias\ntransformation and outperforms the same classifier augmented using the\nstate-of-the-art FLAG augmentation.",5
