In [21]:
import os
import shutil
import nest_asyncio
nest_asyncio.apply()

import os
# Load the OpenAI API key from the environment and fail fast if it is missing.
# The key itself is never printed: cell outputs are saved with the notebook,
# so echoing it would leak the credential to anyone who can read the file.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# An empty string is as unusable as an unset variable, so reject both.
assert OPENAI_API_KEY, "Please set the OPENAI_API_KEY environment variable"
print("OPENAI_API_KEY is set.")

sk-proj-****[REDACTED: API key removed from saved output; revoke and rotate this key]


# Create Database of Research Papers from arXiv

In [22]:
import arxiv

# Recreate the raw-documents directory from scratch so PDFs left over from a
# previous run are not mixed in with this run's downloads.
if os.path.exists('./raw_documents'):
    shutil.rmtree('./raw_documents')
os.makedirs('./raw_documents')

# TODO: Add the arXiv IDs of the papers you want to download
paper_ids = ["1312.5602", "2310.18144"]
client = arxiv.Client()

for paper_id in paper_ids:
    # `next` is given a default so that an ID with no search results produces
    # an error naming the offending ID instead of a bare StopIteration.
    paper = next(client.results(arxiv.Search(id_list=[paper_id])), None)
    if paper is None:
        raise ValueError(f"No arXiv paper found for id {paper_id!r}")
    paper.download_pdf(dirpath="./raw_documents")

# Parse PDF Documents

In [23]:
from autorag.parser import Parser

# Start from an empty project directory for the parser output.
parse_project_dir = './parse_project_dir'
if os.path.exists(parse_project_dir):
    shutil.rmtree(parse_project_dir)
os.makedirs(parse_project_dir)

# TODO: Complete the parse.yaml
# Parse every PDF under ./raw_documents using the modules configured in parse.yaml.
parser = Parser(
    data_path_glob="./raw_documents/*.pdf",
    project_dir=parse_project_dir,
)
parser.start_parsing("./parse.yaml")

After the parser run is finished, you can see the result in the `parse_project_dir` folder.

In [24]:
import pandas as pd
# Load the parser output and print a plain-text preview of its first rows.
parsed_result_path = "./parse_project_dir/parsed_result.parquet"
pdfminer_raw_df = pd.read_parquet(parsed_result_path)
parsed_preview = pdfminer_raw_df.head()
print(parsed_preview)

                                               texts  \
0  4\n2\n0\n2\n\nr\np\nA\n3\n2\n\n]\n\nG\nL\n.\ns...   
1  3\n1\n0\n2\n\nc\ne\nD\n9\n1\n\n]\n\nG\nL\n.\ns...   

                                                path  page  \
0  ./raw_documents/2310.18144v4.Improving_Intrins...    -1   
1  ./raw_documents/1312.5602v1.Playing_Atari_with...    -1   

  last_modified_datetime  
0             2025-03-08  
1             2025-03-08  


The raw dataframe has four columns.

- texts : The parsed result, i.e. all text extracted from the original document.
- path : The path of the original file.
- page : The page of the document. If -1, the row covers the whole document.
- last_modified_datetime : When the document was last modified.

# Chunking

Chunking is the stage that splits whole documents into small pieces. This is important because embedding models and other retrieval methods are not optimized for very long documents. Splitting documents into small passages generally improves retrieval performance.

You can also use multiple Chunk modules at once. In this case, you need to use one corpus to create QA, and then map the rest of the corpus to QA Data. If the chunk method is different, the retrieval_gt will be different, so we need to remap it to the QA dataset.

In [25]:
from autorag.chunker import Chunker

# Start from an empty project directory for the chunker output.
chunk_project_dir = "./chunk_project_dir"
if os.path.exists(chunk_project_dir):
    shutil.rmtree(chunk_project_dir)
os.makedirs(chunk_project_dir)

# TODO: Complete the chunk.yaml
# Chunk the parsed documents using the modules configured in chunk.yaml.
chunker = Chunker.from_parquet(
    parsed_data_path="./parse_project_dir/parsed_result.parquet",
    project_dir=chunk_project_dir,
)
chunker.start_chunking("./chunk.yaml")

After the chunker run is finished, you can see the result in the `chunk_project_dir` folder.

In [26]:
# Load the chunker's `0.parquet` output and show its first rows as the cell result.
chunked_corpus_path = "./chunk_project_dir/0.parquet"
corpus_df = pd.read_parquet(chunked_corpus_path)
corpus_df.head()

Unnamed: 0,doc_id,contents,path,start_end_idx,metadata
0,24a592c3-c66c-4024-b27e-527f8799504a,file_name: 2310.18144v4.Improving_Intrinsic_Ex...,./raw_documents/2310.18144v4.Improving_Intrins...,"[0, 127]","{'last_modified_datetime': '2025-03-08', 'next..."
1,a9174b12-0386-42f6-8824-e4528d3f2747,file_name: 2310.18144v4.Improving_Intrinsic_Ex...,./raw_documents/2310.18144v4.Improving_Intrins...,"[29, 196]","{'last_modified_datetime': '2025-03-08', 'next..."
2,2d4c66d8-3867-4e93-be63-960063d128f9,file_name: 2310.18144v4.Improving_Intrinsic_Ex...,./raw_documents/2310.18144v4.Improving_Intrins...,"[52, 247]","{'last_modified_datetime': '2025-03-08', 'next..."
3,baaa8e24-a125-478c-aa2a-82e71496c301,file_name: 2310.18144v4.Improving_Intrinsic_Ex...,./raw_documents/2310.18144v4.Improving_Intrins...,"[83, 311]","{'last_modified_datetime': '2025-03-08', 'next..."
4,2f64eb10-9b1e-48ff-b7a4-ee986fd0894b,file_name: 2310.18144v4.Improving_Intrinsic_Ex...,./raw_documents/2310.18144v4.Improving_Intrins...,"[139, 388]","{'last_modified_datetime': '2025-03-08', 'next..."


In the corpus dataframe, you can find the five columns

- doc_id : The unique id of each passage.
- contents : The passage contents.
- path : The original document path. Use it to locate the raw document a passage came from.
- start_end_idx : Where the passage starts and ends in the raw document.
- metadata : Metadata such as last_modified_datetime and the next or previous passage id.


# QA generation

In [27]:
from autorag.data.qa.schema import Raw, Corpus

# The Raw instance wraps the *parsed* documents (texts/path/page/
# last_modified_datetime), not the chunked corpus. Previously both frames were
# read from the chunk output, so the Corpus was linked to chunk rows instead of
# the source documents it was derived from.
raw_df = pd.read_parquet("./parse_project_dir/parsed_result.parquet")
raw_instance = Raw(raw_df)

# The Corpus instance wraps the chunked passages and keeps a link to the raw
# documents they came from.
corpus_df = pd.read_parquet("./chunk_project_dir/0.parquet")
corpus_instance = Corpus(corpus_df, raw_instance)

Now, let's use LLM to generate questions. These will be used to select the best RAG pipeline for our documents. That's what AutoRAG does in the end :)

In [28]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from llama_index.llms.openai import OpenAI
from autorag.data.qa.filter.dontknow import dontknow_filter_rule_based
from autorag.data.qa.generation_gt.llama_index_gen_gt import make_basic_gen_gt, make_concise_gen_gt
from autorag.data.qa.query.llama_gen_query import factoid_query_gen
from autorag.data.qa.sample import random_single_hop

# --- Configuration ---------------------------------------------------------
# TODO: Pick an OpenAI model
OPENAI_MODEL = "gpt-3.5-turbo-0125"
# TODO: Set the number of QA pairs to generate (more will use more credits but can yield more precise evaluations of the RAG pipeline)
NUM_QA = 100
SAVE_PATH = "./papers_data"

# Recreate the output directory so results from earlier runs are discarded.
if os.path.exists(SAVE_PATH):
    shutil.rmtree(SAVE_PATH)
os.makedirs(SAVE_PATH)

# --- LLM used for both question and answer generation ------------------------
llm = OpenAI(model=OPENAI_MODEL)

# --- QA generation, one named stage per step ---------------------------------
# 1. Sample NUM_QA passages and reset the index of the sampled frame.
sampled_qa = corpus_instance.sample(random_single_hop, n=NUM_QA)
sampled_qa = sampled_qa.map(lambda frame: frame.reset_index(drop=True))
# 2. Attach the contents of the ground-truth passages.
qa_with_contents = sampled_qa.make_retrieval_gt_contents()
# 3. Generate a factoid question for each sampled passage.
qa_with_queries = qa_with_contents.batch_apply(factoid_query_gen, llm=llm)
# 4. Generate two ground-truth answers per question: basic, then concise.
qa_with_answers = (
    qa_with_queries
    .batch_apply(make_basic_gen_gt, llm=llm)
    .batch_apply(make_concise_gen_gt, llm=llm)
)
# 5. Apply the rule-based "don't know" filter for English.
initial_qa = qa_with_answers.filter(dontknow_filter_rule_based, lang="en")

# Persist the QA set together with its corpus.
qa_output_path = f"{SAVE_PATH}/papers_qa.parquet"
corpus_output_path = f"{SAVE_PATH}/papers_corpus.parquet"
initial_qa.to_parquet(qa_output_path, corpus_output_path)

In [29]:
# Split the generated QA pairs 80/20 into train and test sets (QA data only;
# the corpus is not split) and write each split to its own parquet file.
qa_parquet = pd.read_parquet(f"{SAVE_PATH}/papers_qa.parquet")
qa_splits = train_test_split(qa_parquet, test_size=0.2, random_state=42)
train_qa, test_qa = qa_splits

train_qa_path = f"{SAVE_PATH}/train_qa.parquet"
test_qa_path = f"{SAVE_PATH}/test_qa.parquet"
train_qa.to_parquet(train_qa_path)
test_qa.to_parquet(test_qa_path)

# Report what was written so the split sizes are visible in the output.
print("Train and test QA parquet files saved successfully")
print(f"Train QA shape: {train_qa.shape}")
print(f"Test QA shape: {test_qa.shape}")

Train and test QA parquet files saved successfully
Train QA shape: (80, 4)
Test QA shape: (20, 4)


In [30]:
# Reload the training split and print up to 50 question/answer pairs for a
# manual quality check.
qa_train = pd.read_parquet(f"{SAVE_PATH}/train_qa.parquet")
NUM_EXAMPLES_TO_PRINT = min(50, len(qa_train))

examples = qa_train.iloc[:NUM_EXAMPLES_TO_PRINT]
qa_pairs = zip(examples["query"], examples["generation_gt"])
for number, (question, answers) in enumerate(qa_pairs, start=1):
    print(f"Query {number}:")
    print("Q:", question)
    # Only the first ground-truth answer is shown for each query.
    print("A:", answers[0])
    print("#"*50)

Query 1:
Q: What is the median reward achieved by the human game player after around two hours of playing each game?
A: The median reward achieved by the human game player after around two hours of playing each game is reported to be higher than the scores in Bellemare et al. [3].
##################################################
Query 2:
Q: What is the name of the environment used to evaluate the performance of intrinsic exploration objectives in the text?
A: The name of the environment used to evaluate the performance of intrinsic exploration objectives in the text is "DeepSea."
##################################################
Query 3:
Q: When was SOFE introduced to address non-stationarity in intrinsic objectives?
A: SOFE was introduced to address non-stationarity in intrinsic objectives in the year 2021, as mentioned in the text (Zhang et al. 2021b).
##################################################
Query 4:
Q: What method is represented by the first row in the document, which 