In [1]:
# enable autoreload
%load_ext autoreload
%autoreload 2

In [2]:
from askharrison.arxiv_search import expand_arxiv_query, run_multi_arixv_queries

In [3]:
problem_statement = """RAG(retrieval augmented generation) sometimes the sources retrieved are relevant but not enough to answer the question user ask. How do research approach this case"""

In [4]:
problem_statement = """reduce hallucination in RAG(retrieval augmented generation)"""

In [5]:
search_queries = expand_arxiv_query(problem_statement)

In [6]:
search_queries

['RAG model hallucination reduction techniques',
 'Reducing hallucination in Retrieval Augmented Generation',
 'Methods to minimize hallucination in RAG models',
 'RAG model accuracy improvement strategies',
 'Techniques for efficient and accurate RAG models',
 'Overcoming hallucination in RAG models',
 'Mitigating hallucination in Retrieval Augmented Generation',
 'Accuracy enhancement in RAG models',
 'Approaches to reduce hallucination in RAG',
 'Solutions for hallucination in Retrieval Augmented Generation']

In [7]:
arxiv_query_results = run_multi_arixv_queries(search_queries)

100%|██████████| 10/10 [00:22<00:00,  2.25s/it]


In [8]:
len(arxiv_query_results.items())

10

In [9]:
# flatten arxiv_query_results
all_results = []
for query in arxiv_query_results:
    for result in arxiv_query_results[query]:
        all_results.append(result)

# make arxiv query results a dataframe and create a new dataframe with only unique entry_id
import pandas as pd

arixv_result_df = pd.DataFrame(all_results)
unique_arixv_result_df = arixv_result_df.drop_duplicates(subset='entry_id')

In [10]:
arixv_result_df.shape, unique_arixv_result_df.shape

((200, 13), (116, 13))

In [11]:
arixv_result_df.groupby('entry_id').count().sort_values('title', ascending=False).head(10)

Unnamed: 0_level_0,authors,categories,comment,doi,journal_ref,links,pdf_url,primary_category,published,summary,title,updated
entry_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
http://arxiv.org/abs/2410.11414v2,7,7,7,0,0,7,7,7,7,7,7,7
http://arxiv.org/abs/2407.12325v1,7,7,0,0,0,7,7,7,7,7,7,7
http://arxiv.org/abs/2411.12759v1,6,6,0,0,0,6,6,6,6,6,6,6
http://arxiv.org/abs/2401.00396v2,5,5,0,0,0,5,5,5,5,5,5,5
http://arxiv.org/abs/2408.15533v2,5,5,0,0,0,5,5,5,5,5,5,5
http://arxiv.org/abs/2412.04235v1,4,4,0,0,0,4,4,4,4,4,4,4
http://arxiv.org/abs/2410.13085v1,4,4,0,0,0,4,4,4,4,4,4,4
http://arxiv.org/abs/2410.18251v1,3,3,3,0,0,3,3,3,3,3,3,3
http://arxiv.org/abs/2408.00555v1,3,3,0,0,0,3,3,3,3,3,3,3
http://arxiv.org/abs/2407.19994v3,3,3,0,3,3,3,3,3,3,3,3,3


In [12]:
from askharrison.prompts.content_curation import create_arxiv_filtering_prompt

In [13]:
help(create_arxiv_filtering_prompt)

Help on function create_arxiv_filtering_prompt in module askharrison.prompts.content_curation:

create_arxiv_filtering_prompt(problem_statement: str, doc_abstract: str)



In [14]:
# create a prompt for each arxiv entry
arxiv_reranking_prompts = [create_arxiv_filtering_prompt(problem_statement, 
                                         record['title']+"\n"+record['summary']) for record in unique_arixv_result_df.to_dict(orient='records')]

In [15]:
from askharrison.llm_models import parallel_llm_processor, process_question, safe_eval, extract_python_code

In [16]:
reranking_llm_response = parallel_llm_processor(arxiv_reranking_prompts, llm_function=process_question, 
                                                max_workers=8)

Processing prompts: 100%|██████████| 116/116 [00:57<00:00,  2.03it/s]


In [17]:
llm_responses_results = [safe_eval(extract_python_code(response)) for response in reranking_llm_response]
# filter out empty responses

unique_arixv_result_df.shape, len(llm_responses_results)

((116, 13), 116)

In [18]:
# extract reasoning, is_direct, 'is_relevant' from llm_responses_results if it is not empty, and add to unique_arixv_result_df
unique_arixv_result_df['reasoning'] = [response['reasoning'] if response else None for response in llm_responses_results]
unique_arixv_result_df['is_direct'] = [response['is_direct'] if response else None for response in llm_responses_results]
unique_arixv_result_df['is_relevant'] = [response['is_relevant'] if response else None for response in llm_responses_results]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_arixv_result_df['reasoning'] = [response['reasoning'] if response else None for response in llm_responses_results]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_arixv_result_df['is_direct'] = [response['is_direct'] if response else None for response in llm_responses_results]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

In [19]:
# increase max column width in pandas
pd.set_option('display.max_colwidth', None)

In [22]:
unique_arixv_result_df.query('is_direct == True').shape

(41, 16)

In [None]:
unique_arixv_result_df.query('is_direct == True')\
    [['title','entry_id', 'summary', 'is_relevant']]

Unnamed: 0,title,entry_id,summary,is_relevant
0,Addressing Hallucinations with RAG and NMISS in Italian Healthcare LLM Chatbots,http://arxiv.org/abs/2412.04235v1,"I combine detection and mitigation techniques to addresses hallucinations in\nLarge Language Models (LLMs). Mitigation is achieved in a question-answering\nRetrieval-Augmented Generation (RAG) framework while detection is obtained by\nintroducing the Negative Missing Information Scoring System (NMISS), which\naccounts for contextual relevance in responses. While RAG mitigates\nhallucinations by grounding answers in external data, NMISS refines the\nevaluation by identifying cases where traditional metrics incorrectly flag\ncontextually accurate responses as hallucinations. I use Italian health news\narticles as context to evaluate LLM performance. Results show that Gemma2 and\nGPT-4 outperform the other models, with GPT-4 producing answers closely aligned\nwith reference responses. Mid-tier models, such as Llama2, Llama3, and Mistral\nbenefit significantly from NMISS, highlighting their ability to provide richer\ncontextual information. This combined approach offers new insights into the\nreduction and more accurate assessment of hallucinations in LLMs, with\napplications in real-world healthcare tasks and other domains.",5
1,Luna: An Evaluation Foundation Model to Catch Language Model Hallucinations with High Accuracy and Low Cost,http://arxiv.org/abs/2406.00975v2,"Retriever Augmented Generation (RAG) systems have become pivotal in enhancing\nthe capabilities of language models by incorporating external knowledge\nretrieval mechanisms. However, a significant challenge in deploying these\nsystems in industry applications is the detection and mitigation of\nhallucinations: instances where the model generates information that is not\ngrounded in the retrieved context. Addressing this issue is crucial for\nensuring the reliability and accuracy of responses generated by large language\nmodels (LLMs) in diverse industry settings. Current hallucination detection\ntechniques fail to deliver accuracy, low latency, and low cost simultaneously.\nWe introduce Luna: a DeBERTA-large (440M) encoder, finetuned for hallucination\ndetection in RAG settings. We demonstrate that Luna outperforms GPT-3.5 and\ncommercial evaluation frameworks on the hallucination detection task, with 97%\nand 91% reduction in cost and latency, respectively. Luna is lightweight and\ngeneralizes across multiple industry verticals and out-of-domain data, making\nit an ideal candidate for industry LLM applications.",5
3,LRP4RAG: Detecting Hallucinations in Retrieval-Augmented Generation via Layer-wise Relevance Propagation,http://arxiv.org/abs/2408.15533v2,"Retrieval-Augmented Generation (RAG) has become a primary technique for\nmitigating hallucinations in large language models (LLMs). However, incomplete\nknowledge extraction and insufficient understanding can still mislead LLMs to\nproduce irrelevant or even contradictory responses, which means hallucinations\npersist in RAG. In this paper, we propose LRP4RAG, a method based on the\nLayer-wise Relevance Propagation (LRP) algorithm for detecting hallucinations\nin RAG. Specifically, we first utilize LRP to compute the relevance between the\ninput and output of the RAG generator. We then apply further extraction and\nresampling to the relevance matrix. The processed relevance data are input into\nmultiple classifiers to determine whether the output contains hallucinations.\nTo the best of our knowledge, this is the first time that LRP has been used for\ndetecting RAG hallucinations, and extensive experiments demonstrate that\nLRP4RAG outperforms existing baselines.",5
5,AlzheimerRAG: Multimodal Retrieval Augmented Generation for PubMed articles,http://arxiv.org/abs/2412.16701v1,"Recent advancements in generative AI have flourished the development of\nhighly adept Large Language Models (LLMs) that integrate diverse data types to\nempower decision-making. Among these, Multimodal Retrieval-Augmented Generation\n(RAG) applications are promising for their capability to combine the strengths\nof information retrieval and generative models, enhancing their utility across\nvarious domains, including biomedical research. This paper introduces\nAlzheimerRAG, a Multimodal RAG pipeline tool for biomedical research use cases,\nprimarily focusing on Alzheimer's disease from PubMed articles. Our pipeline\nincorporates multimodal fusion techniques to integrate textual and visual data\nprocessing by efficiently indexing and accessing vast amounts of biomedical\nliterature. Preliminary experimental results against benchmarks, such as BioASQ\nand PubMedQA, have returned improved results in information retrieval and\nsynthesis of domain-specific information. We also demonstrate a case study with\nour RAG pipeline across different Alzheimer's clinical scenarios. We infer that\nAlzheimerRAG can generate responses with accuracy non-inferior to humans and\nwith low rates of hallucination. Overall, a reduction in cognitive task load is\nobserved, which allows researchers to gain multimodal insights, improving\nunderstanding and treatment of Alzheimer's disease.",5
7,Searching for Best Practices in Retrieval-Augmented Generation,http://arxiv.org/abs/2407.01219v1,"Retrieval-augmented generation (RAG) techniques have proven to be effective\nin integrating up-to-date information, mitigating hallucinations, and enhancing\nresponse quality, particularly in specialized domains. While many RAG\napproaches have been proposed to enhance large language models through\nquery-dependent retrievals, these approaches still suffer from their complex\nimplementation and prolonged response times. Typically, a RAG workflow involves\nmultiple processing steps, each of which can be executed in various ways. Here,\nwe investigate existing RAG approaches and their potential combinations to\nidentify optimal RAG practices. Through extensive experiments, we suggest\nseveral strategies for deploying RAG that balance both performance and\nefficiency. Moreover, we demonstrate that multimodal retrieval techniques can\nsignificantly enhance question-answering capabilities about visual inputs and\naccelerate the generation of multimodal content using a ""retrieval as\ngeneration"" strategy.",5
8,The Geometry of Queries: Query-Based Innovations in Retrieval-Augmented Generation,http://arxiv.org/abs/2407.18044v1,"Digital health chatbots powered by Large Language Models (LLMs) have the\npotential to significantly improve personal health management for chronic\nconditions by providing accessible and on-demand health coaching and\nquestion-answering. However, these chatbots risk providing unverified and\ninaccurate information because LLMs generate responses based on patterns\nlearned from diverse internet data. Retrieval Augmented Generation (RAG) can\nhelp mitigate hallucinations and inaccuracies in LLM responses by grounding it\non reliable content. However, efficiently and accurately retrieving most\nrelevant set of content for real-time user questions remains a challenge. In\nthis work, we introduce Query-Based Retrieval Augmented Generation (QB-RAG), a\nnovel approach that pre-computes a database of potential queries from a content\nbase using LLMs. For an incoming patient question, QB-RAG efficiently matches\nit against this pre-generated query database using vector search, improving\nalignment between user questions and the content. We establish a theoretical\nfoundation for QB-RAG and provide a comparative analysis of existing retrieval\nenhancement techniques for RAG systems. Finally, our empirical evaluation\ndemonstrates that QB-RAG significantly improves the accuracy of healthcare\nquestion answering, paving the way for robust and trustworthy LLM applications\nin digital health.",5
10,Investigating the performance of Retrieval-Augmented Generation and fine-tuning for the development of AI-driven knowledge-based systems,http://arxiv.org/abs/2403.09727v1,"The development of generative large language models (G-LLM) opened up new\nopportunities for the development of new types of knowledge-based systems\nsimilar to ChatGPT, Bing, or Gemini. Fine-tuning (FN) and Retrieval-Augmented\nGeneration (RAG) are the techniques that can be used to implement domain\nadaptation for the development of G-LLM-based knowledge systems. In our study,\nusing ROUGE, BLEU, METEOR scores, and cosine similarity, we compare and examine\nthe performance of RAG and FN for the GPT-J-6B, OPT-6.7B, LlaMA, LlaMA-2\nlanguage models. Based on measurements shown on different datasets, we\ndemonstrate that RAG-based constructions are more efficient than models\nproduced with FN. We point out that connecting RAG and FN is not trivial,\nbecause connecting FN models with RAG can cause a decrease in performance.\nFurthermore, we outline a simple RAG-based architecture which, on average,\noutperforms the FN models by 16% in terms of the ROGUE score, 15% in the case\nof the BLEU score, and 53% based on the cosine similarity. This shows the\nsignificant advantage of RAG over FN in terms of hallucination, which is not\noffset by the fact that the average 8% better METEOR score of FN models\nindicates greater creativity compared to RAG.",5
12,Retrieval-Augmented Generation for Large Language Models: A Survey,http://arxiv.org/abs/2312.10997v5,"Large Language Models (LLMs) showcase impressive capabilities but encounter\nchallenges like hallucination, outdated knowledge, and non-transparent,\nuntraceable reasoning processes. Retrieval-Augmented Generation (RAG) has\nemerged as a promising solution by incorporating knowledge from external\ndatabases. This enhances the accuracy and credibility of the generation,\nparticularly for knowledge-intensive tasks, and allows for continuous knowledge\nupdates and integration of domain-specific information. RAG synergistically\nmerges LLMs' intrinsic knowledge with the vast, dynamic repositories of\nexternal databases. This comprehensive review paper offers a detailed\nexamination of the progression of RAG paradigms, encompassing the Naive RAG,\nthe Advanced RAG, and the Modular RAG. It meticulously scrutinizes the\ntripartite foundation of RAG frameworks, which includes the retrieval, the\ngeneration and the augmentation techniques. The paper highlights the\nstate-of-the-art technologies embedded in each of these critical components,\nproviding a profound understanding of the advancements in RAG systems.\nFurthermore, this paper introduces up-to-date evaluation framework and\nbenchmark. At the end, this article delineates the challenges currently faced\nand points out prospective avenues for research and development.",5
13,RAG-Thief: Scalable Extraction of Private Data from Retrieval-Augmented Generation Applications with Agent-based Attacks,http://arxiv.org/abs/2411.14110v1,"While large language models (LLMs) have achieved notable success in\ngenerative tasks, they still face limitations, such as lacking up-to-date\nknowledge and producing hallucinations. Retrieval-Augmented Generation (RAG)\nenhances LLM performance by integrating external knowledge bases, providing\nadditional context which significantly improves accuracy and knowledge\ncoverage. However, building these external knowledge bases often requires\nsubstantial resources and may involve sensitive information. In this paper, we\npropose an agent-based automated privacy attack called RAG-Thief, which can\nextract a scalable amount of private data from the private database used in RAG\napplications. We conduct a systematic study on the privacy risks associated\nwith RAG applications, revealing that the vulnerability of LLMs makes the\nprivate knowledge bases suffer significant privacy risks. Unlike previous\nmanual attacks which rely on traditional prompt injection techniques, RAG-Thief\nstarts with an initial adversarial query and learns from model responses,\nprogressively generating new queries to extract as many chunks from the\nknowledge base as possible. Experimental results show that our RAG-Thief can\nextract over 70% information from the private knowledge bases within customized\nRAG applications deployed on local machines and real-world platforms, including\nOpenAI's GPTs and ByteDance's Coze. Our findings highlight the privacy\nvulnerabilities in current RAG applications and underscore the pressing need\nfor stronger safeguards.",5
19,THaMES: An End-to-End Tool for Hallucination Mitigation and Evaluation in Large Language Models,http://arxiv.org/abs/2409.11353v3,"Hallucination, the generation of factually incorrect content, is a growing\nchallenge in Large Language Models (LLMs). Existing detection and mitigation\nmethods are often isolated and insufficient for domain-specific needs, lacking\na standardized pipeline. This paper introduces THaMES (Tool for Hallucination\nMitigations and EvaluationS), an integrated framework and library addressing\nthis gap. THaMES offers an end-to-end solution for evaluating and mitigating\nhallucinations in LLMs, featuring automated test set generation, multifaceted\nbenchmarking, and adaptable mitigation strategies. It automates test set\ncreation from any corpus, ensuring high data quality, diversity, and\ncost-efficiency through techniques like batch processing, weighted sampling,\nand counterfactual validation. THaMES assesses a model's ability to detect and\nreduce hallucinations across various tasks, including text generation and\nbinary classification, applying optimal mitigation strategies like In-Context\nLearning (ICL), Retrieval Augmented Generation (RAG), and Parameter-Efficient\nFine-tuning (PEFT). Evaluations of state-of-the-art LLMs using a knowledge base\nof academic papers, political news, and Wikipedia reveal that commercial models\nlike GPT-4o benefit more from RAG than ICL, while open-weight models like\nLlama-3.1-8B-Instruct and Mistral-Nemo gain more from ICL. Additionally, PEFT\nsignificantly enhances the performance of Llama-3.1-8B-Instruct in both\nevaluation tasks.",5
