In [None]:
import os
import shutil
import nest_asyncio
nest_asyncio.apply()
!pip uninstall AutoRAG
!AutoRAG[parse]>=0.3.0

# Create a Database of Research Papers from arXiv

In [189]:
import arxiv

# Start from an empty download folder so PDFs from earlier runs are removed.
raw_documents_dir = "./raw_documents"
if os.path.exists(raw_documents_dir):
    shutil.rmtree(raw_documents_dir)
os.makedirs(raw_documents_dir)

# TODO: Add the arXiv IDs of the papers you want to download
paper_ids = ["1312.5602"]
client = arxiv.Client()

# Query each ID on its own and save the first result of that query as a PDF.
for paper_id in paper_ids:
    search = arxiv.Search(id_list=[paper_id])
    search_results = client.results(search)
    paper = next(search_results)
    paper.download_pdf(dirpath=raw_documents_dir)

# Parse PDF Documents

In [190]:
from autorag.parser import Parser

# Recreate the parse output folder so results from earlier runs do not linger.
parse_project_dir = "./parse_project_dir"
if os.path.exists(parse_project_dir):
    shutil.rmtree(parse_project_dir)
os.makedirs(parse_project_dir)

# TODO: Complete the parse.yaml
# Feed every downloaded PDF to the parser; parse.yaml is passed as its configuration.
parser = Parser(
    data_path_glob="./raw_documents/*.pdf",
    project_dir=parse_project_dir,
)
parser.start_parsing("./parse.yaml")

After the parser run has finished, you can find the result in the `parse_project_dir` folder.

In [191]:
import pandas as pd
# Load the parser output and preview the first rows.
# Ending the cell with the expression (rather than print) lets Jupyter render
# the frame as a table, matching how the chunked corpus is previewed below.
pdfminer_raw_df = pd.read_parquet("./parse_project_dir/parsed_result.parquet")
pdfminer_raw_df.head()

                                               texts  \
0  3\n1\n0\n2\n\nc\ne\nD\n9\n1\n\n]\n\nG\nL\n.\ns...   

                                                path  page  \
0  ./raw_documents\1312.5602v1.Playing_Atari_with...    -1   

  last_modified_datetime  
0             2025-03-08  


The raw dataframe contains four columns:

- texts : The parsed result, i.e. all of the text extracted from the original document.
- path : The path of the original file.
- page : The page of the document. A value of -1 means the whole document.
- last_modified_datetime : When the document was last modified.

# Chunking

Chunking is the stage that splits whole documents into small passages. This is important because embedding models and other retrieval methods are not optimized for very long documents. Splitting documents into small passages improves retrieval performance.

You can also use multiple Chunk modules at once. In this case, you need to use one corpus to create QA, and then map the rest of the corpus to QA Data. If the chunk method is different, the retrieval_gt will be different, so we need to remap it to the QA dataset.

In [192]:
from autorag.chunker import Chunker

# Recreate the chunk output folder so results from earlier runs do not linger.
chunk_project_dir = "./chunk_project_dir"
if os.path.exists(chunk_project_dir):
    shutil.rmtree(chunk_project_dir)
os.makedirs(chunk_project_dir)

# TODO: Complete the chunk.yaml
# Build the chunker from the parser's output file; chunk.yaml is passed as its configuration.
chunker = Chunker.from_parquet(
    parsed_data_path="./parse_project_dir/parsed_result.parquet",
    project_dir=chunk_project_dir,
)
chunker.start_chunking("./chunk.yaml")

After the chunker run has finished, you can find the result in the `chunk_project_dir` folder.

In [193]:
# Read the chunking result file `0.parquet` and preview its first rows.
chunked_corpus_path = "./chunk_project_dir/0.parquet"
corpus_df = pd.read_parquet(chunked_corpus_path)
corpus_df.head()

Unnamed: 0,doc_id,contents,path,start_end_idx,metadata
0,d08be319-dc24-4ec5-8638-b60df5619e05,file_name: 1312.5602v1.Playing_Atari_with_Deep...,./raw_documents\1312.5602v1.Playing_Atari_with...,"[0, 48]","{'last_modified_datetime': '2025-03-08', 'next..."
1,e0a7b842-74e8-43d9-b728-c21c82cede25,file_name: 1312.5602v1.Playing_Atari_with_Deep...,./raw_documents\1312.5602v1.Playing_Atari_with...,"[0, 259]","{'last_modified_datetime': '2025-03-08', 'next..."
2,7b3a6669-0778-4024-9750-e52d8e0465d6,file_name: 1312.5602v1.Playing_Atari_with_Deep...,./raw_documents\1312.5602v1.Playing_Atari_with...,"[260, 496]","{'last_modified_datetime': '2025-03-08', 'next..."
3,a247b57d-7f0c-448f-abd5-52aed48072ea,file_name: 1312.5602v1.Playing_Atari_with_Deep...,./raw_documents\1312.5602v1.Playing_Atari_with...,"[260, 669]","{'last_modified_datetime': '2025-03-08', 'next..."
4,06451c50-695d-4443-9bc2-74a9ff6d6eb4,file_name: 1312.5602v1.Playing_Atari_with_Deep...,./raw_documents\1312.5602v1.Playing_Atari_with...,"[498, 817]","{'last_modified_datetime': '2025-03-08', 'next..."


The corpus dataframe contains five columns:

- doc_id : The unique id of each passage.
- contents : The passage contents.
- path : The original document path. Use it to find which raw document a passage came from.
- start_end_idx : Where the passage starts and ends in the raw document.
- metadata : Metadata such as last_modified_datetime and the next or previous passage id.


# QA generation

In [194]:
from autorag.data.qa.schema import Raw, Corpus

# Raw must wrap the *parsed* documents (the parser output), not the chunked
# passages: the chunked file belongs to Corpus only.
raw_df = pd.read_parquet("./parse_project_dir/parsed_result.parquet")
raw_instance = Raw(raw_df)

# Corpus wraps the chunked passages and is constructed together with the Raw
# instance those passages were produced from.
corpus_df = pd.read_parquet("./chunk_project_dir/0.parquet")
corpus_instance = Corpus(corpus_df, raw_instance)

Now, let's use an LLM to generate questions. These will be used to select the best RAG pipeline for our documents. That's what AutoRAG does in the end :)

In [195]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from llama_index.llms.openai import OpenAI
from autorag.data.qa.filter.dontknow import dontknow_filter_rule_based
from autorag.data.qa.generation_gt.llama_index_gen_gt import make_basic_gen_gt, make_concise_gen_gt
from autorag.data.qa.query.llama_gen_query import factoid_query_gen
from autorag.data.qa.sample import random_single_hop

# Configurations
# TODO: Pick an OpenAI model
OPENAI_MODEL = "gpt-3.5-turbo-0125"
# TODO: Set the number of QA pairs to generate (more will use more credits but can yield more precise evaluations of the RAG pipeline)
NUM_QA = 25
SAVE_PATH = "./papers_data"

# Recreate the output folder so files from earlier runs are removed.
if os.path.exists(SAVE_PATH):
    shutil.rmtree(SAVE_PATH)
os.makedirs(SAVE_PATH)

# Initialize LLM
llm = OpenAI(model=OPENAI_MODEL)

# Generate initial QA dataset, one named step at a time:
# 1. sample NUM_QA passages and reset the dataframe index
sampled_passages = corpus_instance.sample(random_single_hop, n=NUM_QA)
reindexed_passages = sampled_passages.map(lambda df: df.reset_index(drop=True))
passages_with_gt = reindexed_passages.make_retrieval_gt_contents()
# 2. generate a factoid query per passage, then a basic and a concise answer
with_queries = passages_with_gt.batch_apply(factoid_query_gen, llm=llm)
with_basic_answers = with_queries.batch_apply(make_basic_gen_gt, llm=llm)
with_concise_answers = with_basic_answers.batch_apply(make_concise_gen_gt, llm=llm)
# 3. apply the rule-based "don't know" filter for English
initial_qa = with_concise_answers.filter(dontknow_filter_rule_based, lang="en")

# Save the initial QA dataset
qa_output_path = f"{SAVE_PATH}/papers_qa.parquet"
corpus_output_path = f"{SAVE_PATH}/papers_corpus.parquet"
initial_qa.to_parquet(qa_output_path, corpus_output_path)

In [196]:
# Split the QA data 80/20 into train and test sets (the corpus is not split)
# and write each split to its own parquet file.
qa_parquet = pd.read_parquet(f"{SAVE_PATH}/papers_qa.parquet")
train_qa, test_qa = train_test_split(qa_parquet, test_size=0.2, random_state=42)

for split_name, split_df in (("train", train_qa), ("test", test_qa)):
    split_df.to_parquet(f"{SAVE_PATH}/{split_name}_qa.parquet")

print("Train and test QA parquet files saved successfully")
print(f"Train QA shape: {train_qa.shape}")
print(f"Test QA shape: {test_qa.shape}")

Train and test QA parquet files saved successfully
Train QA shape: (20, 4)
Test QA shape: (5, 4)


In [197]:
# Reload the training split and print up to 50 question/answer pairs for inspection.
qa_train = pd.read_parquet(f"{SAVE_PATH}/train_qa.parquet")
NUM_EXAMPLES_TO_PRINT = min(50, len(qa_train))

separator = "#" * 50
for example_number in range(1, NUM_EXAMPLES_TO_PRINT + 1):
    qa_row = qa_train.iloc[example_number - 1]
    print(f"Query {example_number}:")
    print("Q:", qa_row["query"])
    # generation_gt is indexed with [0]; only its first entry is shown.
    print("A:", qa_row["generation_gt"][0])
    print(separator)

Query 1:
Q: What is the process for selecting an action in the given algorithm?
A: The process for selecting an action in the given algorithm involves two steps:
1. With a certain probability, a random action is selected.
2. Otherwise, the action is selected based on the maximum Q-value calculated using the function Q*(φ(st), a).
##################################################
Query 2:
Q: When was the article "Speech recognition with deep recurrent neural networks" presented at Proc. ICASSP?
A: The article "Speech recognition with deep recurrent neural networks" was presented at Proc. ICASSP in 2013.
##################################################
Query 3:
Q: What are the dimensions of the Atari frames after the preprocessing step described in the document?
A: After the preprocessing step described in the document, the dimensions of the Atari frames are reduced to 110 x 84 pixels.
##################################################
Query 4:
Q: What algorithm is used to train the c