In [1]:
import pandas as pd
from google.cloud import bigquery
from elasticsearch.helpers import bulk


In [2]:
# arXiv category whose papers are loaded from BigQuery (passed to get_bq_data below).
DOMAIN='cs-AI'
# Google Cloud project used when creating the BigQuery client.
GOOGLE_CLOUD_PROJECT='arxiv-trends'

Load environment variables from a local `.env` file with python-dotenv

In [3]:
from dotenv import load_dotenv
load_dotenv()

True

### Data Ingestion

In [None]:
# Initialize the BigQuery client for the configured Google Cloud project.
bq_client = bigquery.Client(project=GOOGLE_CLOUD_PROJECT)



In [None]:
def get_bq_data(domain='cs-AI'):
    """Load id, title, summary and author of every paper in one arXiv domain.

    The domain is converted to a table-name suffix by replacing "-" and "."
    with "_" (e.g. "cs-AI" -> "cs_AI"); the table
    `arxiv-trends.arxiv_papers.arxiv_papers_2000_2025_<suffix>` is then queried
    through the notebook-level `bq_client`. Rows whose summary is NULL are
    excluded.

    Args:
        domain: arXiv category name, e.g. "cs-AI".

    Returns:
        The query result converted to a pandas DataFrame.

    Raises:
        ValueError: if the cleaned domain contains anything other than ASCII
            letters, digits and underscores.
    """
    import re  # local import so the function does not depend on another cell

    domain_cleaned = domain.replace("-", "_").replace(".", "_")

    # The suffix is spliced into the SQL text as part of a table name (an
    # identifier cannot be bound as a query parameter), so reject anything
    # that could break out of the back-quoted identifier.
    if not re.fullmatch(r"[A-Za-z0-9_]+", domain_cleaned):
        raise ValueError(
            f"Invalid domain {domain!r}: only letters, digits, '-', '.' and '_' are allowed"
        )

    sql_query = f"""
    SELECT id, title, summary, author
    FROM `arxiv-trends.arxiv_papers.arxiv_papers_2000_2025_{domain_cleaned}`
    WHERE summary IS NOT NULL
    """

    query_job = bq_client.query(sql_query)
    results = query_job.result().to_dataframe()
    return results

# Pull every paper of the configured domain into a DataFrame (kept as the raw, unmodified frame).
raw_arxiv_df = get_bq_data(domain=DOMAIN)

In [6]:
raw_arxiv_df

Unnamed: 0,id,title,summary,author
0,http://arxiv.org/abs/1405.3637v2,Vicious Circle Principle and Logic Programs wi...,The paper presents a knowledge representation ...,"[Michael Gelfond, Yuanlin Zhang]"
1,http://arxiv.org/abs/1608.08262v1,Vicious Circle Principle and Formation of Sets...,The paper continues the investigation of Poinc...,"[Michael Gelfond, Yuanlin Zhang]"
2,http://arxiv.org/abs/1808.07050v1,Vicious Circle Principle and Logic Programs wi...,The paper presents a knowledge representation ...,"[Michael Gelfond, Yuanlin Zhang]"
3,http://arxiv.org/abs/2102.04323v2,Discovering a set of policies for the worst ca...,We study the problem of how to construct a set...,"[Tom Zahavy, Andre Barreto, Daniel J Mankowitz..."
4,http://arxiv.org/abs/2309.13426v2,A Chat About Boring Problems: Studying GPT-bas...,Text normalization - the conversion of text fr...,"[Yang Zhang, Travis M. Bartley, Mariana Grater..."
...,...,...,...,...
109198,http://arxiv.org/abs/2406.11326v1,GitHub Copilot: the perfect Code compLeeter?,This paper aims to evaluate GitHub Copilot's g...,"[Ilja Siroš, Dave Singelée, Bart Preneel]"
109199,http://arxiv.org/abs/physics/0005062v1,Applying MDL to Learning Best Model Granularity,The Minimum Description Length (MDL) principle...,"[Qiong Gao, Ming Li, Paul Vitanyi]"
109200,http://arxiv.org/abs/2202.07290v1,Don't stop the training: continuously-updating...,"Over the last decade, numerous studies have sh...","[Pierre Orhan, Yves Boubenec, Jean-Rémi King]"
109201,http://arxiv.org/abs/1911.00572v1,Probabilistic Formulation of the Take The Best...,The framework of cognitively bounded rationali...,"[Tomi Peltola, Jussi Jokinen, Samuel Kaski]"


In [7]:
raw_arxiv_df[raw_arxiv_df['id'] == 'http://arxiv.org/abs/2412.13337v1']

Unnamed: 0,id,title,summary,author
2967,http://arxiv.org/abs/2412.13337v1,Unveiling the Secret Recipe: A Guide For Super...,The rise of large language models (LLMs) has c...,"[Aldo Pareja, Nikhil Shivakumar Nayak, Hao Wan..."


Remove duplicates (if any)

In [None]:
# Clean duplicates before bulk indexing: the paper id is used as the
# Elasticsearch document `_id` (see generate_docs), so each id should appear once.
# The result goes into a new frame so raw_arxiv_df stays untouched.
arxiv_df = raw_arxiv_df.drop_duplicates(subset=['id'])
# Report the row count before and after to show how many duplicates were dropped.
print(f"Removed duplicates: {len(raw_arxiv_df)} -> {len(arxiv_df)} rows")


Removed duplicates: 109203 -> 109203 rows


### Elasticsearch: Find the most relevant papers for a given query

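To use Elasticsearch, start a local single-node instance with Docker (the `elastic` network must already exist; create it with `docker network create elastic` if needed):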
``` 
docker run --name es01 --net elastic -p 9200:9200 \
  -e "discovery.type=single-node" \
  -e "xpack.security.enabled=false" \
  -e "ES_JAVA_OPTS=-Xms512m -Xmx512m" \
  docker.elastic.co/elasticsearch/elasticsearch:8.13.4 
  ```

In [9]:
from elasticsearch import Elasticsearch

# Connect to the Elasticsearch instance (by default the local Docker container
# from the command above, which runs with security disabled).
es = Elasticsearch(
    "http://localhost:9200",  # Or the URL of your cloud instance
    #basic_auth=("user", "password")  # Uncomment only if authentication is enabled
)

In [10]:
print(es.ping())

True


In [11]:
index_name = "arxiv-papers"

# `id` is an exact-match keyword; title, summary and author are full-text
# fields that all use the standard analyzer.
# (`published` and `categories` are not part of the mapping.)
index_mapping = {
    "mappings": {
        "properties": {
            "id": {"type": "keyword"},
            **{
                text_field: {"type": "text", "analyzer": "standard"}
                for text_field in ("title", "summary", "author")
            },
        }
    }
}

# Create the index only on the first run so re-executing this cell is harmless.
index_exists = es.indices.exists(index=index_name)
if not index_exists:
    es.indices.create(index=index_name, body=index_mapping)

In [None]:
def generate_docs(df, index=None):
    """Yield one Elasticsearch bulk action per row of `df`.

    Each action indexes a document whose `_id` is the paper id, so indexing
    the same paper again replaces the earlier document instead of adding a
    second one.

    Args:
        df: DataFrame with the columns "id", "title", "summary" and "author".
        index: Name of the target index. When omitted, the notebook-level
            `index_name` is used.

    Yields:
        dict: an action with the keys "_index", "_id" and "_source", in the
        form consumed by `elasticsearch.helpers.bulk`.
    """
    # Fall back to the notebook-wide index unless the caller names another one.
    target_index = index_name if index is None else index

    # Walk the four columns in lockstep; this avoids building a Series per
    # row the way DataFrame.iterrows() does.
    columns = (df["id"], df["title"], df["summary"], df["author"])
    for paper_id, title, summary, author in zip(*columns):
        yield {
            "_index": target_index,
            "_id": paper_id,
            "_source": {
                "id": paper_id,
                "title": title,
                "summary": summary,
                "author": author,
            },
        }

In [None]:
def search_papers(query, top_k=10):
    """Run a full-text search over the indexed papers.

    The query text is matched against the title and summary fields with a
    `multi_match` query of type `best_fields`; the title field carries a
    boost of 2 (`title^2`).

    Args:
        query: Free-text search string.
        top_k: Maximum number of hits to request.

    Returns:
        A list of dicts, one per hit in the order Elasticsearch returned them,
        with the keys "score", "id", "title", "summary" and "author".
    """
    request_body = {
        "query": {
            "multi_match": {
                "query": query,
                "fields": ["title^2", "summary"],
                "type": "best_fields",
            }
        },
        "size": top_k,
    }

    hits = es.search(index=index_name, body=request_body)["hits"]["hits"]

    def _flatten(hit):
        # Merge the relevance score with the stored document fields.
        source = hit["_source"]
        return {
            "score": hit["_score"],
            "id": source["id"],
            "title": source["title"],
            "summary": source["summary"],
            "author": source["author"],
        }

    return [_flatten(hit) for hit in hits]

In [None]:
# Bulk index all de-duplicated documents; the displayed result is a tuple of
# (number of successfully executed actions, list of errors).
bulk(es, generate_docs(arxiv_df))

(109203, [])

### Using an LLM to answer a query based on the most relevant papers

In [15]:
from openai import OpenAI
# No credentials are passed explicitly. NOTE(review): presumably the client picks up
# OPENAI_API_KEY from the environment populated by load_dotenv() above; confirm.
llm_client = OpenAI()

In [23]:
def build_prompt(query, relevant_papers):
    """Assemble the LLM prompt from the question and the retrieved papers.

    Args:
        query: The user's question.
        relevant_papers: Iterable of dicts with the keys "id", "title" and
            "summary".

    Returns:
        A prompt string containing the question, a "Context:" section with
        one id/title/summary entry per paper (entries separated by a blank
        line), and a trailing "Answer:" cue.
    """

    def _describe(paper):
        # One context entry: identifier, title and abstract on separate lines.
        return f"id: {paper['id']}\nPaper: {paper['title']}\nSummary: {paper['summary']}"

    context = "\n\n".join(_describe(paper) for paper in relevant_papers)

    # The prompt is spelled out line by line (including the four-space
    # indentation and whitespace-only lines) so the exact layout is explicit.
    prompt_lines = [
        "",
        f"    Based on the following research paper summaries, answer the question: {query}",
        "    ",
        "    Context:",
        f"    {context}",
        "    ",
        "    Answer:",
        "    ",
    ]
    return "\n".join(prompt_lines)

In [24]:
def llm(prompt, relevant_papers, model):
    """Send the prompt to the chat model and bundle the reply with its sources.

    Args:
        prompt: Full prompt text, sent as a single user message.
        relevant_papers: The papers the prompt was built from; returned
            unchanged under "sources".
        model: Name of the chat model to call.

    Returns:
        dict with "llm_answer" (the raw chat-completion response object from
        the notebook-level `llm_client`) and "sources" (`relevant_papers`).
    """
    user_message = {"role": "user", "content": prompt}
    completion = llm_client.chat.completions.create(
        model=model,
        messages=[user_message],
    )
    return {"llm_answer": completion, "sources": relevant_papers}

In [25]:
def rag(query, top_k, model):
    """Answer a question with retrieval-augmented generation.

    Retrieves the `top_k` best-matching papers with `search_papers`, builds a
    prompt from them with `build_prompt`, and asks the model through `llm`.

    Args:
        query: The user's question.
        top_k: Number of papers to retrieve as context.
        model: Name of the chat model to call.

    Returns:
        The dict produced by `llm`: the model response under "llm_answer"
        and the retrieved papers under "sources".
    """
    papers = search_papers(query, top_k=top_k)
    grounded_prompt = build_prompt(query, papers)
    return llm(grounded_prompt, relevant_papers=papers, model=model)

In [26]:
query = "What are the latest methods for fine-tuning LLMs on small datasets?"

Look for the most relevant papers

In [20]:
search_papers(query,top_k=5)

[{'score': 41.763905,
  'id': 'http://arxiv.org/abs/2412.13337v1',
  'title': 'Unveiling the Secret Recipe: A Guide For Supervised Fine-Tuning Small LLMs',
  'summary': 'The rise of large language models (LLMs) has created a significant disparity:\nindustrial research labs with their computational resources, expert teams, and\nadvanced infrastructures, can effectively fine-tune LLMs, while individual\ndevelopers and small organizations face barriers due to limited resources. In\nthis paper, we aim to bridge this gap by presenting a comprehensive study on\nsupervised fine-tuning of LLMs using instruction-tuning datasets spanning\ndiverse knowledge domains and skills. We focus on small-sized LLMs (3B to 7B\nparameters) for their cost-efficiency and accessibility. We explore various\ntraining configurations and strategies across four open-source pre-trained\nmodels. We provide detailed documentation of these configurations, revealing\nfindings that challenge several common training practi

Use an LLM to answer the query, with the most relevant papers as context

In [None]:
answer = rag(query,top_k=5,model="o4-mini")

In [30]:
print(answer['llm_answer'].choices[0].message.content)


The three main strands of very recent work on “small-data” fine-tuning of LLMs can be grouped as follows:

1.  Supervised instruction-tuning of small (3 B–7 B) LLMs with hyper-parameter best-practices  
    •  Large batch sizes + low learning rates often outperform the more common small-batch/high-LR recipes.  
    •  Monitor early-stage training dynamics (gradient norms, loss curves) to kill bad runs and save computation.  
    •  Simple learning-rate schedules and reduced warm-up are sufficient—no need for elaborate phased schedules.  
    •  “Stacked” instruction mixing (train on all tasks at once) is as good as or better than multi-phase curricula, and is easier to implement.

2.  Contrastive fine-tuning of embeddings on tiny labeled sets  
    •  Build anchor/positive/negative pairs and use a contrastive loss to sharpen semantic similarity.  
    •  Augment your small corpus with soft/expert-provided similarity scores so that the model “knows” graded relevance.  
    •  This yield

In [29]:
answer

{'llm_answer': ChatCompletion(id='chatcmpl-BpZ6WAKnXiWbGpbFFMxFkjopAlqQc', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='The three main strands of very recent work on “small-data” fine-tuning of LLMs can be grouped as follows:\n\n1.  Supervised instruction-tuning of small (3\u2009B–7\u2009B) LLMs with hyper-parameter best-practices  \n    •  Large batch sizes + low learning rates often outperform the more common small-batch/high-LR recipes.  \n    •  Monitor early-stage training dynamics (gradient norms, loss curves) to kill bad runs and save computation.  \n    •  Simple learning-rate schedules and reduced warm-up are sufficient—no need for elaborate phased schedules.  \n    •  “Stacked” instruction mixing (train on all tasks at once) is as good as or better than multi-phase curricula, and is easier to implement.\n\n2.  Contrastive fine-tuning of embeddings on tiny labeled sets  \n    •  Build anchor/positive/negative pairs and us

### RAG Evaluation (retrieval hit rate)

In [None]:
# Load the generated ground-truth questions; the evaluation below reads the
# `question` and `paper_id` columns of this file.
eval_df = pd.read_csv('arxiv_ground_truth_retrieval.csv')

Hit@1: 0.16
Hit@5: 0.40


In [33]:
# Test your RAG system
def evaluate_rag(question, expected_paper_id):
    """Check whether retrieval finds the expected paper for one question.

    Only the retrieval step is exercised: the question is run through
    `search_papers` with `top_k=5`; no LLM call is made.

    Args:
        question: Evaluation question text.
        expected_paper_id: Id of the paper that should be retrieved.

    Returns:
        dict with the question, the expected paper id, the list of retrieved
        ids, and two flags: "hit_at_1" (expected paper is the first result)
        and "hit_at_5" (expected paper is anywhere in the results).
    """
    retrieved_ids = [hit['id'] for hit in search_papers(question, top_k=5)]

    # With no results at all there is no first hit to compare against.
    if retrieved_ids:
        first_is_expected = expected_paper_id == retrieved_ids[0]
    else:
        first_is_expected = False

    return {
        'question': question,
        'expected_paper': expected_paper_id,
        'retrieved_papers': retrieved_ids,
        'hit_at_1': first_is_expected,
        'hit_at_5': expected_paper_id in retrieved_ids,
    }



In [34]:
# Evaluate retrieval on the first `sample_size` ground-truth questions.
sample_size = 50
sample_questions = eval_df.head(sample_size)

eval_results = [
    evaluate_rag(row['question'], row['paper_id'])
    for _, row in sample_questions.iterrows()
]

# Hit rate = share of questions whose expected paper was retrieved
# (as the first result for Hit@1, anywhere in the results for Hit@5).
hit_at_1 = sum(r['hit_at_1'] for r in eval_results) / len(eval_results)
hit_at_5 = sum(r['hit_at_5'] for r in eval_results) / len(eval_results)

print(f"Hit@1: {hit_at_1:.2f}")
print(f"Hit@5: {hit_at_5:.2f}")

Hit@1: 0.16
Hit@5: 0.40
