In [1]:
import pandas as pd
from google.cloud import bigquery
from elasticsearch.helpers import bulk


In [2]:
# Notebook-wide settings.
DOMAIN = 'cs-AI'  # domain whose BigQuery papers table is loaded below
GOOGLE_CLOUD_PROJECT = 'arxiv-trends'  # project handed to the BigQuery client

Load environment variables from a `.env` file with dotenv

In [3]:
from dotenv import load_dotenv
# Load environment variables via python-dotenv before any client is created.
# NOTE(review): presumably this supplies the credentials the OpenAI client
# below relies on — confirm which variables the .env file must define.
load_dotenv()

True

### Data Ingestion

In [4]:
# Create the BigQuery client, bound to the configured Google Cloud project.
# get_bq_data() below reads this module-level client.
bq_client = bigquery.Client(project=GOOGLE_CLOUD_PROJECT)



In [None]:
def get_bq_data(domain='cs-AI'):
    """Load the papers of one domain from BigQuery into a DataFrame.

    The domain is turned into a table-name suffix by replacing ``-`` and ``.``
    with ``_`` (``'cs-AI'`` -> ``'cs_AI'``). Only rows whose ``summary`` is not
    NULL are selected. The query runs on the module-level ``bq_client``.

    Args:
        domain: Domain label such as ``'cs-AI'`` or ``'cs.AI'``.

    Returns:
        DataFrame with the columns ``id``, ``title``, ``summary`` and ``author``.

    Raises:
        ValueError: If the cleaned domain is empty or contains anything other
            than ASCII letters, digits and underscores.
    """
    import re  # local import: this cell runs before the notebook imports ``re``

    domain_cleaned = domain.replace("-", "_").replace(".", "_")

    # A table name cannot be passed as a query parameter, so the suffix has to
    # be interpolated into the SQL text. Restricting it to identifier
    # characters guarantees a malformed value cannot alter the statement.
    if not re.fullmatch(r"[A-Za-z0-9_]+", domain_cleaned):
        raise ValueError(f"Invalid domain for table name: {domain!r}")

    sql_query = f"""
    SELECT id, title, summary, author
    FROM `arxiv-trends.arxiv_papers.arxiv_papers_2000_2025_{domain_cleaned}`
    WHERE summary IS NOT NULL
    """

    query_job = bq_client.query(sql_query)
    return query_job.result().to_dataframe()

# Run the query once for the configured domain; the cells below work from this frame.
raw_arxiv_df = get_bq_data(domain=DOMAIN)

In [6]:
raw_arxiv_df

Unnamed: 0,id,title,summary,author
0,http://arxiv.org/abs/1405.3637v2,Vicious Circle Principle and Logic Programs wi...,The paper presents a knowledge representation ...,"[Michael Gelfond, Yuanlin Zhang]"
1,http://arxiv.org/abs/1608.08262v1,Vicious Circle Principle and Formation of Sets...,The paper continues the investigation of Poinc...,"[Michael Gelfond, Yuanlin Zhang]"
2,http://arxiv.org/abs/1808.07050v1,Vicious Circle Principle and Logic Programs wi...,The paper presents a knowledge representation ...,"[Michael Gelfond, Yuanlin Zhang]"
3,http://arxiv.org/abs/2102.04323v2,Discovering a set of policies for the worst ca...,We study the problem of how to construct a set...,"[Tom Zahavy, Andre Barreto, Daniel J Mankowitz..."
4,http://arxiv.org/abs/2309.13426v2,A Chat About Boring Problems: Studying GPT-bas...,Text normalization - the conversion of text fr...,"[Yang Zhang, Travis M. Bartley, Mariana Grater..."
...,...,...,...,...
109198,http://arxiv.org/abs/2406.11326v1,GitHub Copilot: the perfect Code compLeeter?,This paper aims to evaluate GitHub Copilot's g...,"[Ilja Siroš, Dave Singelée, Bart Preneel]"
109199,http://arxiv.org/abs/physics/0005062v1,Applying MDL to Learning Best Model Granularity,The Minimum Description Length (MDL) principle...,"[Qiong Gao, Ming Li, Paul Vitanyi]"
109200,http://arxiv.org/abs/2202.07290v1,Don't stop the training: continuously-updating...,"Over the last decade, numerous studies have sh...","[Pierre Orhan, Yves Boubenec, Jean-Rémi King]"
109201,http://arxiv.org/abs/1911.00572v1,Probabilistic Formulation of the Take The Best...,The framework of cognitively bounded rationali...,"[Tomi Peltola, Jussi Jokinen, Samuel Kaski]"


In [7]:
raw_arxiv_df[raw_arxiv_df['id'] == 'http://arxiv.org/abs/2412.13337v1']

Unnamed: 0,id,title,summary,author
2967,http://arxiv.org/abs/2412.13337v1,Unveiling the Secret Recipe: A Guide For Super...,The rise of large language models (LLMs) has c...,"[Aldo Pareja, Nikhil Shivakumar Nayak, Hao Wan..."


Remove duplicates (if any)

In [8]:
# Keep a single row per paper id before bulk indexing: generate_docs() uses the
# id as the Elasticsearch document _id, so every id should appear only once.
# drop_duplicates returns a new frame, so raw_arxiv_df itself is left untouched.
arxiv_df = raw_arxiv_df.drop_duplicates(subset=['id'])
print(f"Removed duplicates: {len(raw_arxiv_df)} -> {len(arxiv_df)} rows")


Removed duplicates: 109203 -> 109203 rows


### Elasticsearch: find the most relevant papers for a given query

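To run Elasticsearch locally, start a single-node container with Docker. The `elastic` network used below must already exist; create it once with `docker network create elastic`.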
``` 
docker run --name es01 --net elastic -p 9200:9200 \
  -e "discovery.type=single-node" \
  -e "xpack.security.enabled=false" \
  -e "ES_JAVA_OPTS=-Xms512m -Xmx512m" \
  docker.elastic.co/elasticsearch/elasticsearch:8.13.4 
  ```

In [9]:
from elasticsearch import Elasticsearch

# Connect to your ES instance
# The URL matches the port published by the docker command above; that command
# disables xpack security, which is why no credentials are passed here.
es = Elasticsearch(
    "http://localhost:9200",  # Or your cloud instance
    #basic_auth=("user", "password")  # Only if authentication is enabled
)

In [10]:
print(es.ping())

True


In [11]:
# Name of the index that every indexing and search cell in this notebook uses.
index_name = "arxiv-papers"

# Explicit mapping: the id is an exact-match keyword, the three free-text
# fields are analyzed with the standard analyzer so they can be searched.
index_mapping = {
    "mappings": {
        "properties": {
            "id": {"type": "keyword"},
            "title": {"type": "text", "analyzer": "standard"},
            "summary": {"type": "text", "analyzer": "standard"},
            "author": {"type": "text", "analyzer": "standard"},
            # Not indexed yet (the BigQuery query above does not select them):
            #"published": {"type": "date"},
            #"categories": {"type": "keyword"}
        }
    }
}

# Create the index only when it is missing, so re-running the cell is safe.
# An index that already exists keeps whatever mapping it was created with.
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=index_mapping)

In [None]:
def generate_docs(df):
    """Yield one bulk-index action per row of ``df``.

    Every action targets the module-level ``index_name`` index, uses the row's
    ``id`` as the document ``_id`` and stores ``id``, ``title``, ``summary``
    and ``author`` in ``_source``. ``published`` and ``categories`` are not
    part of the document yet.
    """
    source_fields = ("id", "title", "summary", "author")
    for _, paper in df.iterrows():
        action = {
            "_index": index_name,
            "_id": paper["id"],
            "_source": {field: paper[field] for field in source_fields},
        }
        yield action

In [46]:
def search_papers(query, top_k=10):
    """Run a full-text search over the indexed papers.

    The query text is matched against ``title`` (boost 0.5) and ``summary``
    with a ``best_fields`` multi_match on the module-level ``es`` client and
    ``index_name`` index.

    Args:
        query: Free-text search string.
        top_k: Maximum number of hits requested from Elasticsearch.

    Returns:
        A list of dicts with the keys ``score``, ``id``, ``title``,
        ``summary`` and ``author``, in the order Elasticsearch returned them.
    """
    request_body = {
        "query": {
            "multi_match": {
                "query": query,
                "fields": ["title^0.5", "summary"],
                "type": "best_fields",
            }
        },
        "size": top_k,
    }

    hits = es.search(index=index_name, body=request_body)["hits"]["hits"]

    return [
        {
            "score": hit["_score"],
            "id": hit["_source"]["id"],
            "title": hit["_source"]["title"],
            "summary": hit["_source"]["summary"],
            "author": hit["_source"]["author"],
        }
        for hit in hits
    ]

In [14]:
# Bulk-index every de-duplicated paper. generate_docs() is a generator, so the
# actions are produced lazily while bulk() consumes them; the tuple shown as
# the cell output is what bulk() returned.
bulk(es, generate_docs(arxiv_df))

(109203, [])

### Using an LLM to answer a query based on the most relevant papers

In [15]:
from openai import OpenAI
# Module-level OpenAI client used by llm() and llm_callable() below.
# NOTE(review): no key is passed explicitly; presumably it is read from the
# environment loaded by load_dotenv() — confirm.
llm_client = OpenAI()

In [23]:
def build_prompt(query, relevant_papers):
    """Assemble the LLM prompt for a question and its retrieved papers.

    Args:
        query: The user's question.
        relevant_papers: Iterable of dicts with the keys ``id``, ``title``
            and ``summary``.

    Returns:
        The prompt text: an instruction line containing the question, a
        ``Context:`` section with one ``id/Paper/Summary`` entry per paper
        (entries separated by a blank line) and a trailing ``Answer:`` marker.
    """
    paper_entries = []
    for paper in relevant_papers:
        paper_entries.append(
            f"id: {paper['id']}\nPaper: {paper['title']}\nSummary: {paper['summary']}"
        )
    context = "\n\n".join(paper_entries)

    # The leading newline and the four-space indent on each line are part of
    # the text sent to the model, so the layout is spelled out explicitly.
    return (
        "\n"
        f"    Based on the following research paper summaries, answer the question: {query}\n"
        "    \n"
        "    Context:\n"
        f"    {context}\n"
        "    \n"
        "    Answer:\n"
        "    "
    )

In [24]:
def llm(prompt, relevant_papers, model):
    """Send the prompt to the chat-completions API and bundle the reply with its sources.

    Args:
        prompt: Full prompt text, sent as a single user message.
        relevant_papers: The papers the prompt was built from; passed through
            unchanged.
        model: Model name forwarded to the API.

    Returns:
        Dict with ``llm_answer`` (the response object returned by the client,
        not just its text) and ``sources`` (``relevant_papers``).
    """
    completion = llm_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    answer_bundle = {"llm_answer": completion, "sources": relevant_papers}
    return answer_bundle

In [25]:
def rag(query, top_k, model):
    """Answer a question from the indexed papers: retrieve, build prompt, ask the LLM.

    Args:
        query: The user's question.
        top_k: Number of papers to retrieve with ``search_papers``.
        model: Model name passed on to ``llm``.

    Returns:
        The dict produced by ``llm``: ``llm_answer`` and ``sources``.
    """
    # Retrieval step: the best-matching papers become the LLM's context.
    papers = search_papers(query, top_k=top_k)

    # Generation step: the same papers are returned as the answer's sources.
    return llm(build_prompt(query, papers), relevant_papers=papers, model=model)

In [26]:
# Example question reused by the retrieval and RAG demo cells below.
query = "What are the latest methods for fine-tuning LLMs on small datasets?"

Look for the most relevant papers

In [20]:
search_papers(query,top_k=5)

[{'score': 41.763905,
  'id': 'http://arxiv.org/abs/2412.13337v1',
  'title': 'Unveiling the Secret Recipe: A Guide For Supervised Fine-Tuning Small LLMs',
  'summary': 'The rise of large language models (LLMs) has created a significant disparity:\nindustrial research labs with their computational resources, expert teams, and\nadvanced infrastructures, can effectively fine-tune LLMs, while individual\ndevelopers and small organizations face barriers due to limited resources. In\nthis paper, we aim to bridge this gap by presenting a comprehensive study on\nsupervised fine-tuning of LLMs using instruction-tuning datasets spanning\ndiverse knowledge domains and skills. We focus on small-sized LLMs (3B to 7B\nparameters) for their cost-efficiency and accessibility. We explore various\ntraining configurations and strategies across four open-source pre-trained\nmodels. We provide detailed documentation of these configurations, revealing\nfindings that challenge several common training practi

Use an LLM to give an answer using as a context the most relevant papers

In [None]:
# Full pipeline for the example question: retrieve 5 papers, then let the
# model answer from them. `answer` holds the raw response plus the sources.
answer = rag(query,top_k=5,model="o4-mini")

In [30]:
print(answer['llm_answer'].choices[0].message.content)


The three main strands of very recent work on “small-data” fine-tuning of LLMs can be grouped as follows:

1.  Supervised instruction-tuning of small (3 B–7 B) LLMs with hyper-parameter best-practices  
    •  Large batch sizes + low learning rates often outperform the more common small-batch/high-LR recipes.  
    •  Monitor early-stage training dynamics (gradient norms, loss curves) to kill bad runs and save computation.  
    •  Simple learning-rate schedules and reduced warm-up are sufficient—no need for elaborate phased schedules.  
    •  “Stacked” instruction mixing (train on all tasks at once) is as good as or better than multi-phase curricula, and is easier to implement.

2.  Contrastive fine-tuning of embeddings on tiny labeled sets  
    •  Build anchor/positive/negative pairs and use a contrastive loss to sharpen semantic similarity.  
    •  Augment your small corpus with soft/expert-provided similarity scores so that the model “knows” graded relevance.  
    •  This yield

In [29]:
answer

{'llm_answer': ChatCompletion(id='chatcmpl-BpZ6WAKnXiWbGpbFFMxFkjopAlqQc', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='The three main strands of very recent work on “small-data” fine-tuning of LLMs can be grouped as follows:\n\n1.  Supervised instruction-tuning of small (3\u2009B–7\u2009B) LLMs with hyper-parameter best-practices  \n    •  Large batch sizes + low learning rates often outperform the more common small-batch/high-LR recipes.  \n    •  Monitor early-stage training dynamics (gradient norms, loss curves) to kill bad runs and save computation.  \n    •  Simple learning-rate schedules and reduced warm-up are sufficient—no need for elaborate phased schedules.  \n    •  “Stacked” instruction mixing (train on all tasks at once) is as good as or better than multi-phase curricula, and is easier to implement.\n\n2.  Contrastive fine-tuning of embeddings on tiny labeled sets  \n    •  Build anchor/positive/negative pairs and us

## Retrieval Evaluation

In [36]:
# Load the generated ground-truth questions. The evaluation cells below read
# the 'question' and 'paper_id' columns of this frame.
eval_df = pd.read_csv('../data/arxiv_ground_truth_retrieval.csv')

In [47]:
# Test your RAG system
def evaluate_rag(question, expected_paper_id):
    """Score retrieval for one ground-truth question.

    Runs ``search_papers`` for the question with ``top_k=5`` and checks where
    the expected paper shows up.

    Returns:
        Dict with the question, the expected id, the retrieved ids, and the
        flags ``hit_at_1`` (expected paper ranked first) and ``hit_at_5``
        (expected paper anywhere in the results).
    """
    top_ids = [paper['id'] for paper in search_papers(question, top_k=5)]

    # With no hits there is no first result to compare against.
    if top_ids:
        ranked_first = expected_paper_id == top_ids[0]
    else:
        ranked_first = False

    return {
        'question': question,
        'expected_paper': expected_paper_id,
        'retrieved_papers': top_ids,
        'hit_at_1': ranked_first,
        'hit_at_5': expected_paper_id in top_ids,
    }



In [49]:
sample_size=1000
# Evaluate on sample
# head() takes the first `sample_size` rows of the file, not a random draw.
sample_questions = eval_df.head(sample_size)
eval_results = []

# One Elasticsearch query per ground-truth question.
for _, row in sample_questions.iterrows():
    result = evaluate_rag(row['question'], row['paper_id'])
    eval_results.append(result)

# Calculate metrics
# The hit flags are booleans, so summing them counts the hits; dividing by the
# number of questions gives the hit rate.
# NOTE(review): this divides by zero when eval_df has no rows.
hit_at_1 = sum([r['hit_at_1'] for r in eval_results]) / len(eval_results)
hit_at_5 = sum([r['hit_at_5'] for r in eval_results]) / len(eval_results)

print(f"Hit@1: {hit_at_1:.2f}")
print(f"Hit@5: {hit_at_5:.2f}")

Hit@1: 0.62
Hit@5: 0.75


## Optimize boosting parameters

In [44]:
import itertools
import numpy as np
from typing import List, Dict, Tuple

In [40]:
def search_papers_with_boost(query, title_boost=2.0, summary_boost=1.0, top_k=10):
    """Full-text search with configurable field boosts.

    Same search as ``search_papers``, except that the weights of the ``title``
    and ``summary`` fields are taken from the arguments.

    Args:
        query: Free-text search string.
        title_boost: Boost applied to matches in ``title``.
        summary_boost: Boost applied to matches in ``summary``.
        top_k: Maximum number of hits requested from Elasticsearch.

    Returns:
        A list of dicts with the keys ``score``, ``id``, ``title``,
        ``summary`` and ``author``, in the order Elasticsearch returned them.
    """
    boosted_fields = [f"title^{title_boost}", f"summary^{summary_boost}"]
    request_body = {
        "query": {
            "multi_match": {
                "query": query,
                "fields": boosted_fields,
                "type": "best_fields",
            }
        },
        "size": top_k,
    }

    hits = es.search(index=index_name, body=request_body)["hits"]["hits"]

    return [
        {
            "score": hit["_score"],
            "id": hit["_source"]["id"],
            "title": hit["_source"]["title"],
            "summary": hit["_source"]["summary"],
            "author": hit["_source"]["author"],
        }
        for hit in hits
    ]

In [41]:
def evaluate_rag_with_boost(question, expected_paper_id, title_boost=2.0, summary_boost=1.0):
    """Score retrieval for one question under specific boost parameters.

    Runs ``search_papers_with_boost`` with ``top_k=5`` and reports whether the
    expected paper is ranked first (``hit_at_1``) or appears at all
    (``hit_at_5``). The boosts used are echoed back in the result.
    """
    hits = search_papers_with_boost(question, title_boost, summary_boost, top_k=5)
    top_ids = [paper['id'] for paper in hits]

    # With no hits there is no first result to compare against.
    if top_ids:
        ranked_first = expected_paper_id == top_ids[0]
    else:
        ranked_first = False

    return {
        'question': question,
        'expected_paper': expected_paper_id,
        'retrieved_papers': top_ids,
        'hit_at_1': ranked_first,
        'hit_at_5': expected_paper_id in top_ids,
        'title_boost': title_boost,
        'summary_boost': summary_boost,
    }

In [42]:
def grid_search_boost_params(eval_df, sample_size=50,
                             title_boosts=(0.5, 1.0, 2.0, 3.0),
                             summary_boosts=(0.5, 1.0, 2.0, 3.0)):
    """Grid search over the title/summary boost parameters.

    Every combination of ``title_boosts`` x ``summary_boosts`` is evaluated
    with ``evaluate_rag_with_boost`` on the first ``sample_size`` rows of
    ``eval_df``. Combinations are ranked by
    ``0.7 * hit_at_1 + 0.3 * hit_at_5``; on ties the earliest combination wins.

    Args:
        eval_df: Frame with the columns ``question`` and ``paper_id``.
        sample_size: Number of leading rows of ``eval_df`` to evaluate.
        title_boosts: Candidate boosts for the ``title`` field.
        summary_boosts: Candidate boosts for the ``summary`` field.

    Returns:
        ``(best_params, results_log)`` where ``best_params`` is the winning
        ``(title_boost, summary_boost)`` tuple and ``results_log`` is a list
        with one metrics dict per combination.

    Raises:
        ValueError: If there are no questions to evaluate or one of the boost
            grids is empty.
    """
    title_boosts = list(title_boosts)
    summary_boosts = list(summary_boosts)
    if not title_boosts or not summary_boosts:
        raise ValueError("title_boosts and summary_boosts must each contain at least one value")

    sample_questions = eval_df.head(sample_size)
    # Fail early with a clear message instead of a ZeroDivisionError in the
    # hit-rate computation below.
    if len(sample_questions) == 0:
        raise ValueError("grid_search_boost_params needs at least one evaluation question")

    best_params = None
    # Start below any reachable score so the first combination always becomes
    # the initial best, even when every hit rate is 0.
    best_score = float("-inf")
    results_log = []

    print(f"Testing {len(title_boosts) * len(summary_boosts)} parameter combinations...")

    for title_boost, summary_boost in itertools.product(title_boosts, summary_boosts):
        print(f"Testing title_boost={title_boost}, summary_boost={summary_boost}")

        eval_results = [
            evaluate_rag_with_boost(
                row['question'],
                row['paper_id'],
                title_boost=title_boost,
                summary_boost=summary_boost
            )
            for _, row in sample_questions.iterrows()
        ]

        # Calculate metrics (the hit flags are booleans, so sum() counts hits)
        n_questions = len(eval_results)
        hit_at_1 = sum(r['hit_at_1'] for r in eval_results) / n_questions
        hit_at_5 = sum(r['hit_at_5'] for r in eval_results) / n_questions

        # Use weighted score (adjust weights based on importance)
        combined_score = 0.7 * hit_at_1 + 0.3 * hit_at_5

        results_log.append({
            'title_boost': title_boost,
            'summary_boost': summary_boost,
            'hit_at_1': hit_at_1,
            'hit_at_5': hit_at_5,
            'combined_score': combined_score
        })

        if combined_score > best_score:
            best_score = combined_score
            best_params = (title_boost, summary_boost)
            print(f"New best: Hit@1={hit_at_1:.3f}, Hit@5={hit_at_5:.3f}, Score={combined_score:.3f}")

    return best_params, results_log

In [45]:
print("=== Grid Search Optimization ===")
# The search uses only the first 50 evaluation questions, a much smaller
# sample than the 1000 questions scored in the retrieval evaluation above.
best_params, grid_results = grid_search_boost_params(eval_df, sample_size=50)
print(f"Best parameters: title_boost={best_params[0]}, summary_boost={best_params[1]}")

=== Grid Search Optimization ===
Testing 16 parameter combinations...
Testing title_boost=0.5, summary_boost=0.5
New best: Hit@1=0.620, Hit@5=0.840, Score=0.686
Testing title_boost=0.5, summary_boost=1.0
New best: Hit@1=0.640, Hit@5=0.800, Score=0.688
Testing title_boost=0.5, summary_boost=2.0
Testing title_boost=0.5, summary_boost=3.0
Testing title_boost=1.0, summary_boost=0.5
Testing title_boost=1.0, summary_boost=1.0
Testing title_boost=1.0, summary_boost=2.0
Testing title_boost=1.0, summary_boost=3.0
Testing title_boost=2.0, summary_boost=0.5
Testing title_boost=2.0, summary_boost=1.0
Testing title_boost=2.0, summary_boost=2.0
Testing title_boost=2.0, summary_boost=3.0
Testing title_boost=3.0, summary_boost=0.5
Testing title_boost=3.0, summary_boost=1.0
Testing title_boost=3.0, summary_boost=2.0
Testing title_boost=3.0, summary_boost=3.0
Best parameters: title_boost=0.5, summary_boost=1.0


## RAG Evaluation

In [50]:
import json
import pandas as pd
from tqdm import tqdm
import time
import logging
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
import re

# Setup logging
# basicConfig configures the root logger, so INFO records from other libraries
# (the httpx and elastic_transport lines in the evaluation output below) are
# printed as well.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class EvaluationResult:
    """Structured result of judging one question/answer pair."""
    # The judge's "Relevance" label; "UNKNOWN" when the reply has no such key
    # or the evaluation failed.
    relevance: str
    # The judge's "Explanation" text, or the error message when the evaluation failed.
    explanation: str
    # Despite the name, this carries the judge's "Overall_Score" value (the
    # prompts ask for 1-5); None when the reply has no such key.
    confidence: Optional[float] = None
    # Lower-cased aspect name (accuracy, completeness, clarity, grounding,
    # citation) -> the judge's label; None when the reply contained none of them.
    aspects: Optional[Dict[str, str]] = None

class RAGJudge:
    """LLM-as-Judge evaluator for RAG systems.

    The judge fills one of three prompt templates (``basic``,
    ``comprehensive`` or ``with_context``) with a question and a generated
    answer, sends it to ``llm_client`` and parses the JSON verdict.

    Args:
        llm_client: Callable taking the prompt text and returning the model's
            reply as a string.
        template_type: Key of the template to use.
    """

    # (key in the judge's JSON reply, aspect name stored in the result)
    _ASPECT_FIELDS = (
        ('Accuracy', 'accuracy'),
        ('Completeness', 'completeness'),
        ('Clarity', 'clarity'),
        ('Grounding', 'grounding'),
        ('Citation', 'citation'),
    )

    def __init__(self, llm_client, template_type="comprehensive"):
        self.llm_client = llm_client
        self.template_type = template_type
        self.templates = self._get_templates()

    def _get_templates(self) -> Dict[str, str]:
        """Get evaluation prompt templates, keyed by template type.

        Literal braces of the JSON examples are doubled because the templates
        are filled with ``str.format``.
        """
        templates = {
            "basic": """
You are an expert evaluator for a RAG system.
Evaluate the relevance of the generated answer to the given question.
Classify as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Question: {question}
Generated Answer: {answer_llm}

Provide your evaluation in JSON format:
{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Brief explanation for your evaluation]"
}}
""".strip(),

            "comprehensive": """
You are an expert evaluator for a RAG system that retrieves and answers questions about academic papers.

Your task is to evaluate the quality of the generated answer based on multiple criteria:

1. **Relevance**: Does the answer address the question asked?
2. **Accuracy**: Is the information provided factually correct?
3. **Completeness**: Does the answer provide sufficient detail?
4. **Clarity**: Is the answer well-structured and easy to understand?

Question: {question}
Generated Answer: {answer_llm}

Please provide your evaluation in JSON format:
{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Accuracy": "INACCURATE" | "PARTLY_ACCURATE" | "ACCURATE",
  "Completeness": "INCOMPLETE" | "PARTLY_COMPLETE" | "COMPLETE",
  "Clarity": "UNCLEAR" | "PARTLY_CLEAR" | "CLEAR",
  "Overall_Score": 1-5,
  "Explanation": "[Detailed explanation of your evaluation]"
}}
""".strip(),

            "with_context": """
You are an expert evaluator for a RAG system that retrieves and answers questions about academic papers.

Question: {question}
Retrieved Papers: {retrieved_papers}
Generated Answer: {answer_llm}

Evaluate the answer considering:
1. **Relevance**: Does it answer the question?
2. **Grounding**: Is it based on the retrieved papers?
3. **Accuracy**: Is the information correct?
4. **Citation**: Are sources properly referenced?

Provide evaluation in JSON format:
{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Grounding": "NOT_GROUNDED" | "PARTLY_GROUNDED" | "WELL_GROUNDED",
  "Accuracy": "INACCURATE" | "PARTLY_ACCURATE" | "ACCURATE",
  "Citation": "NO_CITATION" | "POOR_CITATION" | "GOOD_CITATION",
  "Overall_Score": 1-5,
  "Explanation": "[Detailed explanation]"
}}
""".strip()
        }
        return templates

    def _clean_json_response(self, response: str) -> str:
        """Clean and extract JSON from LLM response.

        Markdown code fences are stripped first; then the span from the first
        ``{`` to the last ``}`` is returned. If no such span exists the
        stripped text is returned as is.
        """
        # Remove code blocks
        response = re.sub(r'```json\s*|\s*```', '', response)
        response = re.sub(r'```\s*|\s*```', '', response)

        # Find JSON-like content (greedy, across lines)
        json_match = re.search(r'\{.*\}', response, re.DOTALL)
        if json_match:
            return json_match.group(0)

        return response.strip()

    def _build_prompt(self, question: str, answer: str,
                      retrieved_papers: Optional[List[Dict]] = None) -> str:
        """Fill the configured template for one question/answer pair.

        The ``with_context`` template always receives a ``retrieved_papers``
        value: the top 3 papers (title plus the first 200 characters of the
        summary) when papers are given, a placeholder line otherwise. The
        other templates ignore ``retrieved_papers``.

        Raises:
            KeyError: If ``template_type`` is not a known template.
        """
        template = self.templates[self.template_type]

        if self.template_type != "with_context":
            return template.format(question=question, answer_llm=answer)

        if retrieved_papers:
            paper_lines = []
            for paper in retrieved_papers[:3]:  # Limit to top 3 papers
                summary = paper.get('summary', 'No summary')
                if summary is None:
                    # A missing summary must not break the slice below.
                    summary = 'No summary'
                paper_lines.append(
                    f"- {paper.get('title', 'Unknown Title')}: {summary[:200]}..."
                )
            papers_text = "\n".join(paper_lines)
        else:
            # Without this the template's {retrieved_papers} placeholder would
            # be left unfilled and str.format would raise KeyError.
            papers_text = "(no papers retrieved)"

        return template.format(
            question=question,
            answer_llm=answer,
            retrieved_papers=papers_text
        )

    def evaluate_single(self, question: str, answer: str,
                       retrieved_papers: Optional[List[Dict]] = None) -> "EvaluationResult":
        """Evaluate a single question-answer pair.

        Args:
            question: The question that was asked.
            answer: The generated answer to judge.
            retrieved_papers: Papers shown to the judge when the
                ``with_context`` template is used.

        Returns:
            The parsed verdict. If the LLM call or the parsing fails, a result
            with relevance ``"UNKNOWN"`` and the error text as explanation is
            returned instead of raising.
        """
        prompt = self._build_prompt(question, answer, retrieved_papers)

        # Defined before the try block so the error log below can always
        # reference it, even when the LLM call itself raises.
        raw_response = None
        try:
            # Get LLM evaluation
            raw_response = self.llm_client(prompt)

            # Clean and parse JSON
            clean_response = self._clean_json_response(raw_response)
            evaluation_dict = json.loads(clean_response)

            # Extract basic fields
            relevance = evaluation_dict.get('Relevance', 'UNKNOWN')
            explanation = evaluation_dict.get('Explanation', 'No explanation provided')

            # Extract the aspect labels the chosen template asked for
            aspects = {
                aspect_name: evaluation_dict[reply_key]
                for reply_key, aspect_name in self._ASPECT_FIELDS
                if reply_key in evaluation_dict
            }

            # The overall score is stored in the `confidence` field
            confidence = evaluation_dict.get('Overall_Score', None)

            return EvaluationResult(
                relevance=relevance,
                explanation=explanation,
                confidence=confidence,
                aspects=aspects if aspects else None
            )

        except Exception as e:
            logger.error(f"Error parsing evaluation: {e}")
            logger.error(f"Raw response: {raw_response}")
            return EvaluationResult(
                relevance="UNKNOWN",
                explanation=f"Error parsing evaluation: {str(e)}"
            )

    def evaluate_batch(self, eval_df: pd.DataFrame,
                      rag_function,
                      sample_size: int = 200,
                      save_path: Optional[str] = None,
                      rate_limit_delay: float = 0.1) -> pd.DataFrame:
        """Evaluate a batch of questions.

        Args:
            eval_df: Frame with a ``question`` column and, optionally, a
                ``paper_id`` column.
            rag_function: Callable taking a question and returning either the
                answer or an ``(answer, retrieved_papers)`` tuple.
            sample_size: Number of rows to evaluate. When smaller than the
                frame, rows are drawn at random with a fixed seed.
            save_path: If given, the results are written there as CSV.
            rate_limit_delay: Seconds to sleep after each evaluated question.

        Returns:
            One row per successfully evaluated question. Questions whose RAG
            call or evaluation raised are logged and left out.
        """
        # Sample data
        if sample_size < len(eval_df):
            df_sample = eval_df.sample(n=sample_size, random_state=42)
        else:
            df_sample = eval_df.copy()

        results = []

        for _, row in tqdm(df_sample.iterrows(), total=len(df_sample), desc="Evaluating"):
            question = row['question']
            expected_paper_id = row.get('paper_id', None)

            try:
                # Get RAG answer and retrieved papers
                if callable(rag_function):
                    try:
                        rag_result = rag_function(question)
                        # A tuple means the function returned answer and papers
                        if isinstance(rag_result, tuple):
                            answer, retrieved_papers = rag_result
                        else:
                            answer = rag_result
                            retrieved_papers = None
                    except Exception as e:
                        logger.error(f"Error in RAG function: {e}")
                        continue
                else:
                    answer = "RAG function not callable"
                    retrieved_papers = None

                # Evaluate with LLM judge
                evaluation = self.evaluate_single(question, answer, retrieved_papers)

                # Store results
                result = {
                    'question': question,
                    'expected_paper_id': expected_paper_id,
                    'answer': answer,
                    'relevance': evaluation.relevance,
                    'explanation': evaluation.explanation,
                    'confidence_score': evaluation.confidence,
                }

                # Add aspect scores if available
                if evaluation.aspects:
                    result.update(evaluation.aspects)

                results.append(result)

                # Rate limiting
                if rate_limit_delay > 0:
                    time.sleep(rate_limit_delay)

            except Exception as e:
                logger.error(f"Error evaluating question '{question}': {e}")
                continue

        # Create results DataFrame
        df_results = pd.DataFrame(results)

        # Save results
        if save_path:
            df_results.to_csv(save_path, index=False)
            logger.info(f"Results saved to {save_path}")

        return df_results

def analyze_evaluation_results(df_results: pd.DataFrame) -> Dict:
    """Summarize a frame of judge results.

    Only columns that are present contribute to the summary:

    * ``relevance`` -> ``relevance_distribution`` (share of each label)
    * ``confidence_score`` -> ``confidence_statistics`` (``describe()``) and
      ``average_quality_score`` (mean)
    * each of ``accuracy``, ``completeness``, ``clarity``, ``grounding``,
      ``citation`` -> ``<aspect>_distribution`` (share of each label)

    Returns:
        Dict with the entries listed above; empty when none of the columns exist.
    """
    available = df_results.columns
    has_scores = 'confidence_score' in available

    def label_shares(column):
        # Relative frequency of every distinct label in the column.
        return df_results[column].value_counts(normalize=True).to_dict()

    summary = {}

    if 'relevance' in available:
        summary['relevance_distribution'] = label_shares('relevance')

    if has_scores:
        summary['confidence_statistics'] = df_results['confidence_score'].describe().to_dict()

    # Aspect columns exist only for the templates that ask for them.
    for aspect in ('accuracy', 'completeness', 'clarity', 'grounding', 'citation'):
        if aspect in available:
            summary[f'{aspect}_distribution'] = label_shares(aspect)

    if has_scores:
        summary['average_quality_score'] = df_results['confidence_score'].mean()

    return summary

def enhanced_rag_function(question: str, search_papers_func, llm_client) -> Tuple[str, List[Dict]]:
    """Answer a question from retrieved papers and return the papers as well.

    Args:
        question: The question to answer.
        search_papers_func: Callable invoked as ``f(question, top_k=5)`` that
            returns a list of paper dicts with ``title`` and ``summary``.
        llm_client: Callable taking the prompt text and returning the answer.

    Returns:
        ``(answer, retrieved_papers)``. All retrieved papers are returned,
        although only the first three (with summaries cut to 300 characters)
        are put into the prompt.
    """
    retrieved_papers = search_papers_func(question, top_k=5)

    # One "Paper N / Summary" section for each of the top 3 papers.
    paper_sections = [
        f"Paper {position}: {paper['title']}\nSummary: {paper['summary'][:300]}...\n\n"
        for position, paper in enumerate(retrieved_papers[:3], start=1)
    ]
    context = "".join(paper_sections)

    answer_prompt = (
        "\n"
        "Based on the following academic papers, answer the question:\n"
        "\n"
        f"Question: {question}\n"
        "\n"
        "Relevant Papers:\n"
        f"{context}\n"
        "\n"
        "Please provide a comprehensive answer based on the information from these papers.\n"
    )

    answer = llm_client(answer_prompt)
    return answer, retrieved_papers

In [55]:
# Create a callable wrapper
def llm_callable(prompt: str) -> str:
    """Send one user message to the chat-completions API and return the reply text.

    Wraps the module-level ``llm_client`` so that code expecting a plain
    ``prompt -> text`` callable (the judge and ``enhanced_rag_function``) can
    use it.
    """
    request = dict(
        model="gpt-4o-mini",  # or your preferred model
        messages=[{"role": "user", "content": prompt}],
        temperature=0.1,
        max_tokens=1000,
    )
    completion = llm_client.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content

# Now use it with the judge
# The judge calls llm_client(prompt) and parses the returned text, which is why
# it gets the wrapper function rather than the OpenAI client object.
judge = RAGJudge(llm_client=llm_callable, template_type="comprehensive")

# Create your RAG function
def rag_function(question):
    """Single-argument RAG entry point: default search plus the LLM wrapper.

    Returns the ``(answer, retrieved_papers)`` tuple of ``enhanced_rag_function``.
    """
    return enhanced_rag_function(
        question,
        search_papers_func=search_papers,
        llm_client=llm_callable,
    )

# Run evaluation
# sample_size=10 makes evaluate_batch draw 10 questions at random (with a fixed
# seed). Each question costs two chat-completion calls: one to generate the
# answer and one for the judge.
rag_eval_results = judge.evaluate_batch(
    eval_df=eval_df,
    rag_function=rag_function,
    sample_size=10, 
    save_path="../data/rag_evaluation_results.csv"
)

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]INFO:elastic_transport.transport:POST http://localhost:9200/arxiv-papers/_search [status:200 duration:0.026s]
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Evaluating:  10%|█         | 1/10 [00:07<01:11,  7.93s/it]INFO:elastic_transport.transport:POST http://localhost:9200/arxiv-papers/_search [status:200 duration:0.031s]
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Evaluating:  20%|██        | 2/10 [00:19<01:21, 10.21s/it]INFO:elastic_transport.transport:POST http://localhost:9200/arxiv-papers/_search [status:200 duration:0.032s]
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.

In [61]:
rag_eval_results

Unnamed: 0,question,expected_paper_id,answer,relevance,explanation,confidence_score,accuracy,completeness,clarity
0,What limitations in existing embedding models ...,http://arxiv.org/abs/2007.06267v2,BoxE addresses several limitations in existing...,RELEVANT,The generated answer directly addresses the qu...,5,ACCURATE,COMPLETE,CLEAR
1,What are the primary use cases of conversation...,http://arxiv.org/abs/2407.12004v1,Based on the summaries of the provided academi...,RELEVANT,The generated answer directly addresses the qu...,5,ACCURATE,COMPLETE,CLEAR
2,What future research directions does the paper...,http://arxiv.org/abs/1311.0716v1,Based on the summaries of the three academic p...,RELEVANT,The generated answer directly addresses the qu...,5,ACCURATE,COMPLETE,CLEAR
3,In what ways does the report address the envir...,http://arxiv.org/abs/2310.03715v1,The provided summaries of the academic papers ...,PARTLY_RELEVANT,The generated answer addresses the question ab...,3,PARTLY_ACCURATE,PARTLY_COMPLETE,CLEAR
4,What implications does the proposed method hav...,http://arxiv.org/abs/1212.4799v2,"The proposed method, as discussed in the conte...",RELEVANT,The generated answer directly addresses the im...,5,ACCURATE,COMPLETE,CLEAR
5,What technologies support the modular design o...,http://arxiv.org/abs/2003.00925v1,The modular design of the CAAI (Cognitive Arch...,RELEVANT,The generated answer effectively addresses the...,5,ACCURATE,COMPLETE,CLEAR
6,How do the online procedures compare in terms ...,http://arxiv.org/abs/1007.0614v1,Based on the summaries of the provided academi...,RELEVANT,The generated answer directly addresses the qu...,5,ACCURATE,COMPLETE,CLEAR
7,How do the findings relate to the reliability ...,http://arxiv.org/abs/2403.14859v2,The findings from the three papers provide ins...,RELEVANT,The generated answer directly addresses the qu...,5,ACCURATE,COMPLETE,CLEAR
8,In what ways do the authors suggest policymake...,http://arxiv.org/abs/2211.00065v1,Based on the summaries of the provided academi...,RELEVANT,The generated answer directly addresses the qu...,5,ACCURATE,COMPLETE,CLEAR
9,What implications for future research on XAI i...,http://arxiv.org/abs/2211.06561v1,The implications for future research on Explai...,RELEVANT,The generated answer directly addresses the qu...,5,ACCURATE,COMPLETE,CLEAR


In [62]:
rag_eval_results[rag_eval_results['relevance']=='NON_RELEVANT']

Unnamed: 0,question,expected_paper_id,answer,relevance,explanation,confidence_score,accuracy,completeness,clarity


In [64]:
# Analyze results
analysis = analyze_evaluation_results(rag_eval_results)
print("Evaluation Analysis:")
for key, value in analysis.items():
    print(f"{key}: {value}")



Evaluation Analysis:
relevance_distribution: {'RELEVANT': 0.9, 'PARTLY_RELEVANT': 0.1}
confidence_statistics: {'count': 10.0, 'mean': 4.8, 'std': 0.6324555320336759, 'min': 3.0, '25%': 5.0, '50%': 5.0, '75%': 5.0, 'max': 5.0}
accuracy_distribution: {'ACCURATE': 0.9, 'PARTLY_ACCURATE': 0.1}
completeness_distribution: {'COMPLETE': 0.9, 'PARTLY_COMPLETE': 0.1}
clarity_distribution: {'CLEAR': 1.0}
average_quality_score: 4.8


In [None]:
# Compare different RAG configurations
# NOTE(review): the grid search above reported title_boost=0.5,
# summary_boost=1.0 as best, and 3.5 / 1.5 were not part of that grid —
# confirm that these are the configurations meant to be compared.
configurations = [
    {"name": "baseline", "title_boost": 2.0, "summary_boost": 1.0},
    {"name": "optimized", "title_boost": 3.5, "summary_boost": 1.5},
]

comparison_results = {}
for config in configurations:
    # Bind this configuration's boosts as default arguments, so the search
    # function does not depend on the loop variable at call time.
    def boosted_search(query, top_k,
                       _title_boost=config["title_boost"],
                       _summary_boost=config["summary_boost"]):
        """Search with the boosts of the configuration being evaluated."""
        return search_papers_with_boost(query, _title_boost, _summary_boost, top_k)

    # enhanced_rag_function calls its third argument with the prompt text, so
    # it must receive the llm_callable wrapper. The OpenAI client object is not
    # callable: passing it made every question fail and get skipped.
    def rag_func(q, _search=boosted_search):
        """RAG entry point for the configuration being evaluated."""
        return enhanced_rag_function(q, _search, llm_callable)

    results = judge.evaluate_batch(
        eval_df=eval_df,
        rag_function=rag_func,
        sample_size=10,
        save_path=f"rag_eval_{config['name']}.csv"
    )

    comparison_results[config['name']] = analyze_evaluation_results(results)

# Print comparison
# A dedicated loop variable keeps the `analysis` dict of the earlier cell intact.
print("\nConfiguration Comparison:")
for name, config_analysis in comparison_results.items():
    print(f"\n{name.upper()}:")
    if 'average_quality_score' in config_analysis:
        print(f"  Average Quality Score: {config_analysis['average_quality_score']:.2f}")
    if 'relevance_distribution' in config_analysis:
        relevant_pct = config_analysis['relevance_distribution'].get('RELEVANT', 0) * 100
        print(f"  Relevant Answers: {relevant_pct:.1f}%")