In [None]:
import os
import shutil
import nest_asyncio
nest_asyncio.apply()

import os
# Read the OpenAI credential from the environment; it is never hard-coded in the notebook.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# An unset or empty variable is treated as missing.
assert OPENAI_API_KEY, "Please set the OPENAI_API_KEY environment variable"
# Confirm the key was found without echoing the secret into the saved cell output.
print("OPENAI_API_KEY is set")

# Create a Database of Research Papers from arXiv

In [None]:
import arxiv

# TODO: Add the arXiv IDs of the papers you want to download e.g. "2005.14419"
paper_ids = []
# Validate before touching the disk: with no IDs the directory below would be
# wiped and left empty, and the parsing step would have nothing to read.
assert paper_ids, "Please add at least one arXiv ID to paper_ids"

# Recreate the download directory from scratch so PDFs from earlier runs never mix in.
if os.path.exists('./raw_documents'):
    shutil.rmtree('./raw_documents')
os.makedirs('./raw_documents')

client = arxiv.Client()

for paper_id in paper_ids:
    # A search for an unknown ID yields no result; a bare next() would then
    # surface as a message-less StopIteration, so name the offending ID instead.
    paper = next(client.results(arxiv.Search(id_list=[paper_id])), None)
    if paper is None:
        raise ValueError(f"No arXiv paper found for ID {paper_id!r}")
    paper.download_pdf(dirpath="./raw_documents")

# Parse PDF Documents

In [None]:
from autorag.parser import Parser

# Start from an empty project directory so results of earlier parser runs are discarded.
PARSE_PROJECT_DIR = "./parse_project_dir"
if os.path.exists(PARSE_PROJECT_DIR):
    shutil.rmtree(PARSE_PROJECT_DIR)
os.makedirs(PARSE_PROJECT_DIR)

# TODO: Complete the parse.yaml file
# Parse every downloaded PDF using the modules configured in parse.yaml.
parser = Parser(
    data_path_glob="./raw_documents/*.pdf",
    project_dir=PARSE_PROJECT_DIR,
)
parser.start_parsing("./parse.yaml")

After the parser run has finished, you can find the result in the `parse_project_dir` folder.

In [None]:
import pandas as pd
# Load the parser output and preview its first rows.
parsed_result_path = "./parse_project_dir/parsed_result.parquet"
pdfminer_raw_df = pd.read_parquet(parsed_result_path)
pdfminer_raw_df.head()

The raw dataframe contains four columns.

- texts : The parsed result, i.e. all text extracted from the original document.
- path : The path of the original file.
- page : The page of the document. A value of -1 means the whole document.
- last_modified_datetime : When the document was last modified.

# Chunking

Chunking is the stage that splits whole documents into small passages. This is important because embedding models and other retrieval methods are not optimized for very long documents. Working with small passages generally improves retrieval performance.

You can also use multiple chunk modules at once. In that case, you use one corpus to create the QA dataset and then map the remaining corpora to that QA data. Because a different chunking method produces different passages, the retrieval_gt will differ as well, so it has to be remapped to the QA dataset.

In [None]:
from autorag.chunker import Chunker

# Start from an empty project directory so results of earlier chunker runs are discarded.
CHUNK_PROJECT_DIR = "./chunk_project_dir"
if os.path.exists(CHUNK_PROJECT_DIR):
    shutil.rmtree(CHUNK_PROJECT_DIR)
os.makedirs(CHUNK_PROJECT_DIR)

# TODO: Complete the chunk.yaml
# Chunk the parsed documents using the modules configured in chunk.yaml.
chunker = Chunker.from_parquet(
    parsed_data_path="./parse_project_dir/parsed_result.parquet",
    project_dir=CHUNK_PROJECT_DIR,
)
chunker.start_chunking("./chunk.yaml")

After the chunker run has finished, you can find the result in the `chunk_project_dir` folder.

In [None]:
# Load the output of the first chunk module configured in chunk.yaml.
corpus_df = pd.read_parquet("./chunk_project_dir/0.parquet")
print(f"Split the papers into {len(corpus_df)} chunks")

# Preview only the first few chunks; dumping every chunk of every paper floods
# the cell output. Raise the limit to inspect more.
NUM_CHUNKS_TO_PRINT = min(10, len(corpus_df))
for idx, row in corpus_df.head(NUM_CHUNKS_TO_PRINT).iterrows():
    print(f"Chunk {idx}: {row['contents']}")
    print("#"*50)

# QA generation

In [None]:
from autorag.data.qa.schema import Raw, Corpus

# Raw wraps the parser output (the un-chunked documents), so it must be loaded
# from the parse result rather than from the chunk result.
raw_df = pd.read_parquet("./parse_project_dir/parsed_result.parquet")
raw_instance = Raw(raw_df)

# Corpus wraps the chunker output and keeps a link to the Raw data it came from.
corpus_df = pd.read_parquet("./chunk_project_dir/0.parquet")
corpus_instance = Corpus(corpus_df, raw_instance)

Now, let's use an LLM to generate questions. These will be used to select the best RAG pipeline for our documents. That's what AutoRAG does in the end :)

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from llama_index.llms.openai import OpenAI
from autorag.data.qa.filter.dontknow import dontknow_filter_rule_based
from autorag.data.qa.generation_gt.llama_index_gen_gt import make_basic_gen_gt, make_concise_gen_gt
from autorag.data.qa.query.llama_gen_query import factoid_query_gen
from autorag.data.qa.sample import random_single_hop

# Configurations
# TODO: Pick an OpenAI model
OPENAI_MODEL = ""
# TODO: Set the number of QA pairs to generate (more will use more credits but can yield more precise evaluations of the RAG pipeline)
NUM_QA = -1
SAVE_PATH = "./papers_data"

# Fail fast while the TODO placeholders are still in place, before the previous
# dataset is deleted below and before any API credits are spent.
assert OPENAI_MODEL, "Please set OPENAI_MODEL to the name of an OpenAI model"
assert NUM_QA > 0, "Please set NUM_QA to a positive number of QA pairs"

# Recreate the output directory so files from an earlier run never mix in.
if os.path.exists(SAVE_PATH):
    shutil.rmtree(SAVE_PATH)
os.makedirs(SAVE_PATH)

# Initialize LLM
llm = OpenAI(model=OPENAI_MODEL)

# Generate initial QA dataset
initial_qa = (
    # Pick NUM_QA passages from the corpus, one passage per question.
    corpus_instance.sample(random_single_hop, n=NUM_QA)
    .map(lambda df: df.reset_index(drop=True))
    .make_retrieval_gt_contents()
    # Generate one factoid question per sampled passage.
    .batch_apply(factoid_query_gen, llm=llm)
    # Generate two ground-truth answers per question: a basic and a concise one.
    .batch_apply(make_basic_gen_gt, llm=llm)
    .batch_apply(make_concise_gen_gt, llm=llm)
    # Rule-based filter for "don't know"-style (English) answers.
    .filter(dontknow_filter_rule_based, lang="en")
)

# Save the initial QA dataset
initial_qa.to_parquet(f"{SAVE_PATH}/papers_qa.parquet", f"{SAVE_PATH}/papers_corpus.parquet")

In [None]:
# Locations of the full QA dataset and of the two splits derived from it.
qa_path = f"{SAVE_PATH}/papers_qa.parquet"
train_path = f"{SAVE_PATH}/train_qa.parquet"
test_path = f"{SAVE_PATH}/test_qa.parquet"

# Hold out 20% of the QA pairs as a test set; the fixed seed keeps the split reproducible.
qa_parquet = pd.read_parquet(qa_path)
train_qa, test_qa = train_test_split(
    qa_parquet,
    test_size=0.2,
    random_state=42,
)

# Persist both splits next to the full dataset and report their sizes.
train_qa.to_parquet(train_path)
test_qa.to_parquet(test_path)
print("Train and test QA parquet files saved successfully")
print(f"Train QA shape: {train_qa.shape}")
print(f"Test QA shape: {test_qa.shape}")

In [None]:
# Reload the training split from disk and show at most 50 generated QA pairs.
qa_train = pd.read_parquet(f"{SAVE_PATH}/train_qa.parquet")
NUM_EXAMPLES_TO_PRINT = min(50, len(qa_train))

qa_preview = qa_train.head(NUM_EXAMPLES_TO_PRINT)
for i, (question, answers) in enumerate(zip(qa_preview["query"], qa_preview["generation_gt"])):
    print(f"Query {i+1}:")
    print("Q:", question)
    # generation_gt holds several answers per query; show only the first one.
    print("A:", answers[0])
    print("#"*50)