In [1]:
import pandas as pd
import json
from tqdm.auto import tqdm
from google.cloud import bigquery
from openai import OpenAI


In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [5]:
DOMAIN='cs-AI'
GOOGLE_CLOUD_PROJECT='arxiv-trends'

In [6]:
# Initialize BQ client
bq_client = bigquery.Client(project=GOOGLE_CLOUD_PROJECT)
# Initialize OpenAI client
llm_client = OpenAI()



In [8]:
def get_bq_data(domain='cs-AI'):
    domain_cleaned = domain.replace("-", "_")
    domain_cleaned = domain_cleaned.replace(".", "_")
    sql_query = f"""
    SELECT id, title, summary, author
    FROM `arxiv-trends.arxiv_papers.arxiv_papers_2000_2025_{domain_cleaned}`
    WHERE summary IS NOT NULL
    """

    query_job = bq_client.query(sql_query)
    results = query_job.result().to_dataframe()
    return results

raw_arxiv_df = get_bq_data(domain=DOMAIN)

In [9]:
# Clean duplicates before bulk indexing
arxiv_df = raw_arxiv_df.drop_duplicates(subset=['id'])
print(f"Removed duplicates: {len(raw_arxiv_df)} -> {len(arxiv_df)} rows")

Removed duplicates: 109203 -> 109203 rows


In [10]:
# Convert your arxiv_df to documents
documents = arxiv_df.to_dict(orient='records')

# Prompt template for generating questions about research papers
prompt_template = """
You emulate a researcher or student using our ArXiv research assistant.
Formulate 5 questions this user might ask based on the provided research paper.
Make the questions specific to this paper's content, methods, findings, or applications.
The paper summary should contain the answer to the questions, and the questions should
be complete and research-focused. Use as few words as possible from the paper details.

The paper:

id: {id}
title: {title}
summary: {summary}
author: {author}

Create questions that would be naturally asked by someone researching this topic.
Examples of good question types:
- What methods does this paper propose for [specific problem]?
- How does this approach compare to [related work]?
- What are the main findings regarding [specific aspect]?
- What datasets or experiments were used to validate [method]?
- What are the limitations or future work suggested in [domain]?

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [15]:
def generate_questions(doc):
    """Generate questions for a single paper"""
    prompt = prompt_template.format(**doc)
    
    try:
        response = llm_client.chat.completions.create(
            model='gpt-4o-mini',
            messages=[{"role": "user", "content": prompt}]
        )
        
        json_response = response.choices[0].message.content
        return json_response
    except Exception as e:
        print(f"Error generating questions for paper {doc['id']}: {e}")
        return None

In [None]:
# Test with first document
print("Testing with first paper...")
test_prompt = prompt_template.format(**documents[0])
test_response = llm_client.chat.completions.create(
    model='gpt-4o-mini',
    messages=[{"role": "user", "content": test_prompt}]
)

test_questions = json.loads(test_response.choices[0].message.content)
print("Sample questions generated:")
for i, q in enumerate(test_questions['questions'], 1):
    print(f"{i}. {q}")

Testing with first paper...
Sample questions generated:
1. What is the main goal of the language Alog proposed in this paper?
2. What algorithm does the paper introduce for computing answer sets in Alog?
3. How does Alog's handling of aggregates compare to traditional ASP approaches?
4. What properties of Alog are discussed in the paper?
5. What future research directions are suggested based on the findings of this paper?

Generating questions for 109203 papers...


In [None]:
sample_size = 10
# Generate questions for all papers
print(f"\nGenerating questions for {len(documents)} papers...")
results = {}

for doc in tqdm(documents[:sample_size]):  # Start with first 5 papers for testing
    doc_id = doc['id']
    if doc_id in results:
        continue
    
    questions_raw = generate_questions(doc)
    if questions_raw is None:
        continue
        
    try:
        questions = json.loads(questions_raw)
        results[doc_id] = questions['questions']
    except json.JSONDecodeError as e:
        print(f"Failed to parse JSON for paper {doc_id}: {e}")
        continue



Generating questions for 109203 papers...


  0%|          | 0/10 [00:00<?, ?it/s]

Failed to parse JSON for paper http://arxiv.org/abs/1405.3637v2: Invalid \escape: line 1 column 85 (char 84)


In [None]:
# Convert results to final format
final_results = []

for doc_id, questions in results.items():
    for q in questions:
        final_results.append((doc_id, q))

print(f"\nGenerated {len(final_results)} question-paper pairs")
print("Sample results:")
for i in range(min(5, len(final_results))):
    print(f"{i+1}. Paper: {final_results[i][0]}")
    print(f"   Question: {final_results[i][1]}")
    print()

# Create DataFrame and save
df_results = pd.DataFrame(final_results, columns=['paper_id', 'question'])
df_results.to_csv('../data/arxiv_ground_truth_retrieval.csv', index=False)

print(f"Saved {len(df_results)} questions to '../data/arxiv_ground_truth_retrieval.csv'")
print("\nDataFrame preview:")
print(df_results.head())



Generated 45 question-paper pairs
Sample results:
1. Paper: http://arxiv.org/abs/1608.08262v1
   Question: What alternative formalization of the Vicious Circle Principle is proposed in this paper?

2. Paper: http://arxiv.org/abs/1608.08262v1
   Question: How does Slog+ differ from the previously introduced language Alog in terms of set constructs?

3. Paper: http://arxiv.org/abs/1608.08262v1
   Question: In what specific scenarios does the formal semantics of Slog+ coincide with other known languages?

4. Paper: http://arxiv.org/abs/1608.08262v1
   Question: What implications does the incorporation of infinite sets have for knowledge representation in logic programming?

5. Paper: http://arxiv.org/abs/1608.08262v1
   Question: What are the key differences in the intuitive and formal semantics of Slog+ compared to its predecessors?

Saved 45 questions to 'arxiv_ground_truth_retrieval.csv'

DataFrame preview:
                            paper_id  \
0  http://arxiv.org/abs/1608.08262v1  

In [None]:
# # Additional evaluation: Generate questions for different domains/topics
# def generate_domain_questions(n_papers=10):
#     """Generate questions focused on specific research domains"""
    
#     domain_prompt = """
#     You are a researcher in machine learning/AI. Based on the following research papers,
#     generate 3 comparative or domain-specific questions that could be answered by 
#     analyzing multiple papers in this field.
    
#     Papers:
#     {papers_context}
    
#     Generate questions like:
#     - What are the different approaches to [specific problem] across these papers?
#     - How do the methodologies compare between [paper A] and [paper B]?
#     - What trends can be observed in [specific domain] research?
    
#     Provide output in JSON format:
#     {{"questions": ["question1", "question2", "question3"]}}
#     """
    
#     # Take first n papers for domain questions
#     sample_papers = documents[:n_papers]
#     papers_context = "\n\n".join([
#         f"Paper {i+1}: {paper['title']}\nSummary: {paper['summary'][:200]}..."
#         for i, paper in enumerate(sample_papers)
#     ])
    
#     prompt = domain_prompt.format(papers_context=papers_context)
    
#     response = llm_client.chat.completions.create(
#         model='gpt-4o-mini',
#         messages=[{"role": "user", "content": prompt}]
#     )
    
#     return json.loads(response.choices[0].message.content)

# # Generate domain-specific questions
# domain_questions = generate_domain_questions()
# print("\nDomain-specific questions:")
# for i, q in enumerate(domain_questions['questions'], 1):
#     print(f"{i}. {q}")

# # Save domain questions separately
# domain_df = pd.DataFrame(domain_questions['questions'], columns=['question'])
# domain_df['question_type'] = 'domain_comparative'
# domain_df.to_csv('arxiv_domain_questions.csv', index=False)

# print(f"\nSaved {len(domain_df)} domain questions to 'arxiv_domain_questions.csv'")