In [2]:
import pandas as pd
from google.cloud import bigquery


In [3]:
# Notebook-wide settings. GOOGLE_CLOUD_PROJECT is the project handed to the
# BigQuery client; DOMAIN is defined here but not referenced by later cells.
DOMAIN = "cs-AI"
GOOGLE_CLOUD_PROJECT = "arxiv-trends"

In [4]:
# Create the BigQuery client, bound to the project named in GOOGLE_CLOUD_PROJECT.
client = bigquery.Client(project=GOOGLE_CLOUD_PROJECT)



In [5]:
# SQL for the paper export. It lives under its own name so that re-running this
# cell can never overwrite the natural-language `query` string that the search
# and question-answering cells use further down.
papers_sql = """
SELECT id, title, summary, author
FROM `arxiv-trends.arxiv_papers.arxiv_papers_2000_2025_cs_AI`
WHERE summary IS NOT NULL
"""

# Run the query and load every returned row into a DataFrame.
query_job = client.query(papers_sql)
results = query_job.result().to_dataframe()

In [30]:
# Show the fetched papers; the recorded output elides the middle rows of the frame.
results

Unnamed: 0,id,title,summary,author
0,http://arxiv.org/abs/1405.3637v2,Vicious Circle Principle and Logic Programs wi...,The paper presents a knowledge representation ...,"[Michael Gelfond, Yuanlin Zhang]"
1,http://arxiv.org/abs/1608.08262v1,Vicious Circle Principle and Formation of Sets...,The paper continues the investigation of Poinc...,"[Michael Gelfond, Yuanlin Zhang]"
2,http://arxiv.org/abs/1808.07050v1,Vicious Circle Principle and Logic Programs wi...,The paper presents a knowledge representation ...,"[Michael Gelfond, Yuanlin Zhang]"
3,http://arxiv.org/abs/2102.04323v2,Discovering a set of policies for the worst ca...,We study the problem of how to construct a set...,"[Tom Zahavy, Andre Barreto, Daniel J Mankowitz..."
4,http://arxiv.org/abs/2309.13426v2,A Chat About Boring Problems: Studying GPT-bas...,Text normalization - the conversion of text fr...,"[Yang Zhang, Travis M. Bartley, Mariana Grater..."
...,...,...,...,...
109198,http://arxiv.org/abs/2406.11326v1,GitHub Copilot: the perfect Code compLeeter?,This paper aims to evaluate GitHub Copilot's g...,"[Ilja Siroš, Dave Singelée, Bart Preneel]"
109199,http://arxiv.org/abs/physics/0005062v1,Applying MDL to Learning Best Model Granularity,The Minimum Description Length (MDL) principle...,"[Qiong Gao, Ming Li, Paul Vitanyi]"
109200,http://arxiv.org/abs/2202.07290v1,Don't stop the training: continuously-updating...,"Over the last decade, numerous studies have sh...","[Pierre Orhan, Yves Boubenec, Jean-Rémi King]"
109201,http://arxiv.org/abs/1911.00572v1,Probabilistic Formulation of the Take The Best...,The framework of cognitively bounded rationali...,"[Tomi Peltola, Jussi Jokinen, Samuel Kaski]"


In [32]:
# Spot-check: show the row(s) whose id equals this specific arXiv URL.
results.loc[results['id'].eq('http://arxiv.org/abs/2412.13337v1')]

Unnamed: 0,id,title,summary,author
2967,http://arxiv.org/abs/2412.13337v1,Unveiling the Secret Recipe: A Guide For Super...,The rise of large language models (LLMs) has c...,"[Aldo Pareja, Nikhil Shivakumar Nayak, Hao Wan..."


In [43]:
# Keep only the first row for each paper id, then report the row counts
# before and after so any removed duplicates are visible.
results_clean = results.drop_duplicates(subset="id")
rows_before, rows_after = len(results), len(results_clean)
print(f"Removed duplicates: {rows_before} -> {rows_after} rows")


Removed duplicates: 109203 -> 109203 rows


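### Elasticsearch

To run a local single-node Elasticsearch instance for this notebook (security is disabled, so use it for local development only), create the Docker network once and then start the container:

```
docker network create elastic
docker run --name es01 --net elastic -p 9200:9200 \
  -e "discovery.type=single-node" \
  -e "xpack.security.enabled=false" \
  -e "ES_JAVA_OPTS=-Xms512m -Xmx512m" \
  docker.elastic.co/elasticsearch/elasticsearch:8.13.4
```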

In [7]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch instance at localhost:9200. Swap in a cloud
# endpoint URL if needed, and pass basic_auth=("user", "password") only if
# authentication is enabled on the server.
es = Elasticsearch("http://localhost:9200")

In [8]:
# Connectivity check: print the result of es.ping() (the recorded run printed True).
print(es.ping())

True


In [9]:
index_name = "arxiv-papers"

# Field definitions for the paper index. "published" and "categories" are
# declared here even though the indexing cell currently leaves them commented out.
index_mapping = {
    "mappings": {
        "properties": dict(
            id={"type": "keyword"},
            title={"type": "text", "analyzer": "standard"},
            summary={"type": "text", "analyzer": "standard"},
            author={"type": "text", "analyzer": "standard"},
            published={"type": "date"},
            categories={"type": "keyword"},
        )
    }
}

# Create the index only when it is not already present.
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=index_mapping)

In [35]:
from tqdm import tqdm


from elasticsearch.helpers import bulk

def generate_docs(df, index=None):
    """Yield one Elasticsearch bulk action per row of ``df``.

    The paper id is emitted under the ``_id`` metadata key so that it becomes
    the document id. Indexing the same paper again then overwrites the existing
    document instead of adding a second copy under an auto-generated id.

    Args:
        df: DataFrame with ``id``, ``title``, ``summary`` and ``author`` columns.
        index: Name of the target index. When omitted, the notebook-level
            ``index_name`` is used.

    Yields:
        dict: A bulk action with ``_index``, ``_id`` and ``_source`` keys.
    """
    target_index = index_name if index is None else index
    columns = (df["id"], df["title"], df["summary"], df["author"])
    for paper_id, title, summary, author in zip(*columns):
        yield {
            "_index": target_index,
            "_id": paper_id,
            "_source": {
                "id": paper_id,
                "title": title,
                "summary": summary,
                "author": author,
                # Not indexed yet; the mapping already declares both fields.
                # "published": row["published"].isoformat() if row["published"] else None,
                # "categories": row["categories"]
            }
        }

# Bulk index the de-duplicated frame (results_clean) rather than the raw query
# result, so each paper id is sent to Elasticsearch exactly once.
bulk(es, generate_docs(results_clean))

(109203, [])

In [36]:
# Sample natural-language question passed to search_papers / answer_question below.
query = "What are the latest methods for fine-tuning LLMs on small datasets?"


In [37]:
def search_papers(query, top_k=5):
    """Run a full-text search over the paper index and return the best hits.

    The query text is matched against title, summary and author with a
    ``best_fields`` multi-match; the title field carries a ``^2`` boost.

    Args:
        query: Free-text search string.
        top_k: Maximum number of hits to request.

    Returns:
        list[dict]: One dict per hit with ``score``, ``id``, ``title``,
        ``summary`` and ``author`` keys, in the order Elasticsearch returned them.
    """
    request_body = {
        "query": {
            "multi_match": {
                "query": query,
                "fields": ["title^2", "summary", "author"],
                "type": "best_fields"
            }
        },
        "size": top_k
    }
    response = es.search(index=index_name, body=request_body)

    # Flatten each hit: relevance score plus the stored source fields.
    source_fields = ("id", "title", "summary", "author")
    return [
        {"score": hit["_score"], **{name: hit["_source"][name] for name in source_fields}}
        for hit in response["hits"]["hits"]
    ]

# Example usage: search with the sample question, using the default top_k.
papers = search_papers(query)

In [None]:
from openai import OpenAI
# OpenAI client created with no explicit arguments.
# NOTE(review): presumably credentials come from the environment — confirm before running.
llm_client = OpenAI()

In [None]:
def answer_question(question, top_k=10, model="gpt-4"):
    """Answer ``question`` using retrieved paper summaries as LLM context.

    Searches the paper index for the question text, joins the id, title and
    summary of every hit into one context block, and asks the chat model to
    answer from that context.

    Args:
        question: Natural-language question; also used as the search query.
        top_k: Number of papers to retrieve as context.
        model: Chat model name sent to the completions endpoint.

    Returns:
        dict: ``answer`` holds the generated text of the first completion
        choice; ``sources`` holds the retrieved papers as returned by
        ``search_papers``.
    """
    # Search for relevant papers
    relevant_papers = search_papers(question, top_k)

    # Build context from summaries, one blank-line-separated entry per paper
    context = "\n\n".join(
        f"id: {paper['id']}\nPaper: {paper['title']}\nSummary: {paper['summary']}"
        for paper in relevant_papers
    )

    # Create prompt for LLM
    prompt = f"""
    Based on the following research paper summaries, answer the question: {question}
    
    Context:
    {context}
    
    Answer:
    """

    llm_response = llm_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return {
        # Return the generated text, not the whole completion object.
        "answer": llm_response.choices[0].message.content,
        "sources": relevant_papers
    }

In [42]:
# Run the retrieve-then-answer pipeline on the sample question and display the result.
answer_question(query)

{'answer': 'LLM response here',
 'sources': [{'score': 41.224213,
   'id': 'http://arxiv.org/abs/2412.13337v1',
   'title': 'Unveiling the Secret Recipe: A Guide For Supervised Fine-Tuning Small LLMs',
   'summary': 'The rise of large language models (LLMs) has created a significant disparity:\nindustrial research labs with their computational resources, expert teams, and\nadvanced infrastructures, can effectively fine-tune LLMs, while individual\ndevelopers and small organizations face barriers due to limited resources. In\nthis paper, we aim to bridge this gap by presenting a comprehensive study on\nsupervised fine-tuning of LLMs using instruction-tuning datasets spanning\ndiverse knowledge domains and skills. We focus on small-sized LLMs (3B to 7B\nparameters) for their cost-efficiency and accessibility. We explore various\ntraining configurations and strategies across four open-source pre-trained\nmodels. We provide detailed documentation of these configurations, revealing\nfinding

In [15]:
# NOTE(review): disabled draft of an embedding helper. It calls the module-level
# `openai.Embedding.create`, whereas this notebook builds its client with
# `from openai import OpenAI`; the default model "o4-mini" also does not look
# like an embeddings model — TODO confirm both before re-enabling.
# def get_embedding(text, model="o4-mini"):
#     response = openai.Embedding.create(
#         input=text,
#         model=model
#     )
#     return response['data'][0]['embedding']