In [1]:
import pandas as pd
from google.cloud import bigquery
from elasticsearch.helpers import bulk


In [2]:
# arXiv category to load; get_bq_data turns it into a BigQuery table suffix.
DOMAIN = 'cs-AI'
# Google Cloud project the BigQuery client is created for.
GOOGLE_CLOUD_PROJECT = 'arxiv-trends'

Load environment variables from a local `.env` file with dotenv.

In [3]:
from dotenv import load_dotenv
# Read the .env file into the process environment; the cell output shows the
# boolean this call returned.
load_dotenv()

True

In [4]:
# BigQuery client bound to the configured project; get_bq_data runs its
# query through this module-level object.
client = bigquery.Client(project=GOOGLE_CLOUD_PROJECT)



In [5]:
def get_bq_data(domain='cs-AI'):
    """Fetch id, title, summary and author for the papers of one arXiv domain.

    The domain is converted into a table suffix by replacing ``-`` and ``.``
    with ``_`` (``'cs-AI'`` -> ``arxiv_papers_2000_2025_cs_AI``). Rows whose
    ``summary`` is NULL are excluded by the query.

    Args:
        domain: arXiv category name such as ``'cs-AI'`` or ``'cs.AI'``.

    Returns:
        A pandas DataFrame with the columns ``id``, ``title``, ``summary``
        and ``author``.

    Raises:
        ValueError: if the cleaned domain is empty or contains anything other
            than ASCII letters, digits and underscores.
    """
    import re

    domain_cleaned = domain.replace("-", "_").replace(".", "_")

    # A table name cannot be passed as a BigQuery query parameter, so the
    # suffix is validated before it is interpolated into the SQL text.
    if not re.fullmatch(r"[A-Za-z0-9_]+", domain_cleaned):
        raise ValueError(
            f"Invalid domain {domain!r}: expected letters, digits, '-', '.' or '_'"
        )

    sql_query = f"""
    SELECT id, title, summary, author
    FROM `arxiv-trends.arxiv_papers.arxiv_papers_2000_2025_{domain_cleaned}`
    WHERE summary IS NOT NULL
    """

    # Uses the module-level BigQuery `client`.
    query_job = client.query(sql_query)
    return query_job.result().to_dataframe()

# Load the papers for the configured DOMAIN from BigQuery into a DataFrame.
raw_arxiv_df = get_bq_data(domain=DOMAIN)

In [6]:
# Preview the loaded papers (the rendered table below is truncated).
raw_arxiv_df

Unnamed: 0,id,title,summary,author
0,http://arxiv.org/abs/1405.3637v2,Vicious Circle Principle and Logic Programs wi...,The paper presents a knowledge representation ...,"[Michael Gelfond, Yuanlin Zhang]"
1,http://arxiv.org/abs/1608.08262v1,Vicious Circle Principle and Formation of Sets...,The paper continues the investigation of Poinc...,"[Michael Gelfond, Yuanlin Zhang]"
2,http://arxiv.org/abs/1808.07050v1,Vicious Circle Principle and Logic Programs wi...,The paper presents a knowledge representation ...,"[Michael Gelfond, Yuanlin Zhang]"
3,http://arxiv.org/abs/2102.04323v2,Discovering a set of policies for the worst ca...,We study the problem of how to construct a set...,"[Tom Zahavy, Andre Barreto, Daniel J Mankowitz..."
4,http://arxiv.org/abs/2309.13426v2,A Chat About Boring Problems: Studying GPT-bas...,Text normalization - the conversion of text fr...,"[Yang Zhang, Travis M. Bartley, Mariana Grater..."
...,...,...,...,...
109198,http://arxiv.org/abs/2406.11326v1,GitHub Copilot: the perfect Code compLeeter?,This paper aims to evaluate GitHub Copilot's g...,"[Ilja Siroš, Dave Singelée, Bart Preneel]"
109199,http://arxiv.org/abs/physics/0005062v1,Applying MDL to Learning Best Model Granularity,The Minimum Description Length (MDL) principle...,"[Qiong Gao, Ming Li, Paul Vitanyi]"
109200,http://arxiv.org/abs/2202.07290v1,Don't stop the training: continuously-updating...,"Over the last decade, numerous studies have sh...","[Pierre Orhan, Yves Boubenec, Jean-Rémi King]"
109201,http://arxiv.org/abs/1911.00572v1,Probabilistic Formulation of the Take The Best...,The framework of cognitively bounded rationali...,"[Tomi Peltola, Jussi Jokinen, Samuel Kaski]"


In [7]:
# Spot-check: look up one specific paper by its arXiv URL id.
raw_arxiv_df[raw_arxiv_df['id'] == 'http://arxiv.org/abs/2412.13337v1']

Unnamed: 0,id,title,summary,author
2967,http://arxiv.org/abs/2412.13337v1,Unveiling the Secret Recipe: A Guide For Super...,The rise of large language models (LLMs) has c...,"[Aldo Pareja, Nikhil Shivakumar Nayak, Hao Wan..."


In [8]:
# Keep one row per paper id: generate_docs uses `id` as the Elasticsearch
# `_id`, so each id should appear only once before bulk indexing.
arxiv_df = raw_arxiv_df.drop_duplicates(subset=['id'])
print(f"Removed duplicates: {len(raw_arxiv_df)} -> {len(arxiv_df)} rows")


Removed duplicates: 109203 -> 109203 rows


### Elasticsearch

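To run Elasticsearch locally, start a single-node container with Docker (create the network first with `docker network create elastic` if it does not exist yet):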
``` 
docker run --name es01 --net elastic -p 9200:9200 \
  -e "discovery.type=single-node" \
  -e "xpack.security.enabled=false" \
  -e "ES_JAVA_OPTS=-Xms512m -Xmx512m" \
  docker.elastic.co/elasticsearch/elasticsearch:8.13.4 
  ```

In [9]:
from elasticsearch import Elasticsearch

# Connect to the Elasticsearch instance; the URL matches the port published
# by the Docker command in the markdown cell above.
es = Elasticsearch(
    "http://localhost:9200",  # replace with your cloud endpoint if not running locally
    #basic_auth=("user", "password")  # uncomment only if authentication is enabled
)

In [10]:
# Connectivity check before creating the index; the output below shows the result.
print(es.ping())

True


In [11]:
index_name = "arxiv-papers"

# `id` is mapped as a keyword; title, summary and author are mapped as text
# fields with the standard analyzer.
index_mapping = {
    "mappings": {
        "properties": {
            "id": {"type": "keyword"},
            **{
                text_field: {"type": "text", "analyzer": "standard"}
                for text_field in ("title", "summary", "author")
            },
        }
    }
}

# Create the index only when it does not exist yet; an existing index is
# left untouched.
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=index_mapping)

In [12]:
def generate_docs(df, index=None):
    """Yield one Elasticsearch bulk action per row of ``df``.

    Each action uses the row's ``id`` as the document ``_id`` and copies
    ``id``, ``title``, ``summary`` and ``author`` into ``_source``.

    The columns are iterated directly instead of with ``DataFrame.iterrows``,
    which builds a Series for every row and does not preserve dtypes.

    Args:
        df: DataFrame with the columns ``id``, ``title``, ``summary`` and
            ``author``.
        index: Name of the target index. When omitted, the module-level
            ``index_name`` is used.

    Yields:
        dict: an action with the keys ``_index``, ``_id`` and ``_source``.
    """
    # Fall back to the notebook-wide index only when no override is given.
    target_index = index_name if index is None else index

    columns = (df["id"], df["title"], df["summary"], df["author"])
    for paper_id, title, summary, author in zip(*columns):
        yield {
            "_index": target_index,
            "_id": paper_id,
            "_source": {
                "id": paper_id,
                "title": title,
                "summary": summary,
                "author": author,
            },
        }


In [18]:
def search_papers(query, top_k=10):
    """Run a full-text search over the indexed papers.

    Issues a ``multi_match`` query of type ``best_fields`` against ``title``
    (boosted with ``^2``), ``summary`` and ``author`` on the module-level
    index, limited to ``top_k`` hits.

    Args:
        query: Free-text search string.
        top_k: Value sent as the request ``size``.

    Returns:
        A list of dicts, one per hit, with the keys ``score``, ``id``,
        ``title``, ``summary`` and ``author``.
    """
    request_body = {
        "query": {
            "multi_match": {
                "query": query,
                "fields": ["title^2", "summary", "author"],
                "type": "best_fields"
            }
        },
        "size": top_k
    }

    hits = es.search(index=index_name, body=request_body)["hits"]["hits"]

    # Flatten each hit into its score plus the stored source fields.
    source_fields = ("id", "title", "summary", "author")
    return [
        {
            "score": hit["_score"],
            **{field: hit["_source"][field] for field in source_fields},
        }
        for hit in hits
    ]

In [14]:
from openai import OpenAI
# Chat client used by answer_question.
# NOTE(review): no credentials are passed here, so the client presumably
# reads them from the environment populated by load_dotenv() - confirm.
llm_client = OpenAI()

In [15]:
def answer_question(question, relevant_papers, model):
    """Ask the chat model to answer ``question`` from the given paper summaries.

    Args:
        question: The question inserted into the prompt.
        relevant_papers: Iterable of dicts that each provide ``id``,
            ``title`` and ``summary``.
        model: Model name passed to the chat-completions call.

    Returns:
        A dict with ``llm_answer`` (the response object returned by the
        chat-completions call, not just its text) and ``sources`` (the
        ``relevant_papers`` argument, unchanged).
    """
    # One "id / Paper / Summary" paragraph per paper, separated by blank lines.
    context = "\n\n".join(
        f"id: {paper['id']}\nPaper: {paper['title']}\nSummary: {paper['summary']}"
        for paper in relevant_papers
    )

    # The prompt is assembled piece by piece so its exact whitespace
    # (four-space indent, including on the otherwise blank lines) is explicit.
    prompt = (
        "\n"
        f"    Based on the following research paper summaries, answer the question: {question}\n"
        "    \n"
        "    Context:\n"
        f"    {context}\n"
        "    \n"
        "    Answer:\n"
        "    "
    )

    completion = llm_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return {
        "llm_answer": completion,
        "sources": relevant_papers
    }

In [None]:
def rag(query, top_k, model, reindex=None):
    """Answer ``query`` with retrieval-augmented generation over the arXiv index.

    Steps: (1) make sure the papers in ``arxiv_df`` are indexed, (2) retrieve
    the ``top_k`` best-matching papers with ``search_papers``, (3) let the
    model answer from those papers with ``answer_question``.

    Args:
        query: The user's question; also used as the search string.
        top_k: Number of papers to retrieve.
        model: Model name forwarded to ``answer_question``.
        reindex: ``True`` forces a bulk (re)index of ``arxiv_df``, ``False``
            skips indexing. ``None`` (default) indexes only when the index is
            missing or holds fewer documents than ``arxiv_df`` has rows, so
            the corpus is not re-sent on every query. Pass ``True`` after
            changing the contents of ``arxiv_df`` without changing its length.

    Returns:
        The dict produced by ``answer_question`` (``llm_answer`` and
        ``sources``).
    """
    if reindex is None:
        reindex = (
            not es.indices.exists(index=index_name)
            or es.count(index=index_name)["count"] < len(arxiv_df)
        )

    if reindex:
        # bulk index all documents
        bulk(es, generate_docs(arxiv_df))
        # Make the new documents visible to the search that follows.
        es.indices.refresh(index=index_name)

    # search top N papers using elastic search
    relevant_papers = search_papers(query, top_k=top_k)

    # generate llm answer based on the relevant papers
    return answer_question(query, relevant_papers=relevant_papers, model=model)

In [21]:
# End-to-end run: retrieve the 5 best-matching papers and let the model answer.
query = "What are the latest methods for fine-tuning LLMs on small datasets?"
answer = rag(query,top_k=5,model="o4-mini")
# `llm_answer` is the whole response object; print only the first choice's text.
print(answer['llm_answer'].choices[0].message.content)

Here is a summary of the most recent approaches to getting strong fine‐tuning results when you only have a few thousand (or fewer) examples:

1. Supervised “secret-recipe” tuning for 3–7B‐parameter models (arXiv:2412.13337v1)  
   • Instruction-tune small open‐source LLMs with standard multi‐task/instruction datasets (e.g. stacked rather than phased training)  
   • Use unusually large batch sizes coupled with very low peak learning rates  
   • Monitor early training signals (gradient norms, loss curves) to early-stop poor runs and save compute  
   • Simplify warm-up schedules and learning-rate decay without hurting final accuracy  
   • Stacked (all tasks at once) training works just as well as multi-phase curricula and is more sample-efficient  

2. Contrastive fine-tuning of embeddings with expert-augmented soft labels (arXiv:2408.11868v1)  
   • When you care about retrieval/semantic-similarity rather than full autoregressive outputs, use contrastive objectives on your small data