In [1]:
from datasets import load_dataset
import html
import re

In [2]:
# Load the "train" split of the nick007x/arxiv-papers dataset.
ds = load_dataset(
    path="nick007x/arxiv-papers",
    split="train",
)

In [3]:
# List the dataset's columns so later cells can refer to them by name.
column_names = ds.column_names
print(f"column names: {column_names}")

column names: ['arxiv_id', 'title', 'authors', 'submission_date', 'comments', 'primary_subject', 'subjects', 'doi', 'abstract', 'file_path']


In [4]:
def is_cs(ds_row):
    """Return True when a row's primary subject is a computer-science category.

    Args:
        ds_row: Mapping with a "primary_subject" entry, e.g.
            "Robotics (cs.RO)".

    Returns:
        bool: True if the primary subject contains the "(cs." prefix;
        False otherwise, including when the field is None.

    Raises:
        KeyError: If a dict row has no "primary_subject" key.
    """
    primary_subject = ds_row["primary_subject"]
    # A null subject cannot be a CS paper; `"..." in None` would raise TypeError.
    if primary_subject is None:
        return False
    return "(cs." in primary_subject

In [5]:
def is_cs_primary(ds_row, specifics=("",)):
    """Return True when a row's primary subject is one of the given cs.* codes.

    Args:
        ds_row: Mapping with a "primary_subject" entry, e.g.
            "Robotics (cs.RO)".
        specifics: Iterable of cs sub-category codes without the "cs." prefix
            (e.g. ["RO", "AI"]). A single string such as "RO" is treated as
            one code. The default ("",) only matches the literal text "(cs.)",
            so callers are expected to pass the codes they want.

    Returns:
        bool: True if "(cs.<code>)" occurs in the primary subject for at least
        one code; False otherwise, including when the field is None.

    Raises:
        KeyError: If a dict row has no "primary_subject" key.
    """
    primary_subject = ds_row["primary_subject"]
    # A null subject cannot match any code; `"..." in None` would raise TypeError.
    if primary_subject is None:
        return False
    # A bare string would otherwise be iterated character by character,
    # turning "RO" into the two bogus codes "R" and "O".
    if isinstance(specifics, str):
        specifics = (specifics,)
    return any(f"(cs.{sp})" in primary_subject for sp in specifics)


In [6]:
# Keep only papers whose primary subject is one of the listed cs.* codes
# (here "RO", which appears in the data as "Robotics (cs.RO)").
cs_filters = ["RO"]
# Alternative kept for reference: keep every cs.* primary subject instead.
# cs_ds = ds.filter(is_cs)
cs_ds = ds.filter(lambda row: is_cs_primary(row, specifics=cs_filters))

In [7]:
# Display the first row of the filtered dataset.
cs_ds[0]

{'arxiv_id': '0708.3936',
 'title': 'Working and Assembly Modes of the Agile Eye',
 'authors': ['Ilian Bonev', 'Damien Chablat', 'Philippe Wenger'],
 'submission_date': '29 Aug 2007',
 'comments': '',
 'primary_subject': 'Robotics (cs.RO)',
 'subjects': 'Robotics (cs.RO)',
 'doi': 'https://doi.org/10.48550/arXiv.0708.3936',
 'abstract': 'This paper deals with the in-depth kinematic analysis of a special spherical parallel wrist, called the Agile Eye. The Agile Eye is a three-legged spherical parallel robot with revolute joints in which all pairs of adjacent joint axes are orthogonal. Its most peculiar feature, demonstrated in this paper for the first time, is that its (orientation) workspace is unlimited and flawed only by six singularity curves (rather than surfaces). Furthermore, these curves correspond to self-motions of the mobile platform. This paper also demonstrates that, unlike for any other such complex spatial robots, the four solutions to the direct kinematics of the Agile E

In [8]:
# Distinct primary-subject labels that remain after filtering. Sorted so the
# list comes out in the same order on every run; iterating a set of strings
# has no stable order across interpreter sessions.
cs_primary_subjects = sorted(set(cs_ds["primary_subject"]))

In [9]:
# Display the distinct primary-subject labels found in the filtered dataset.
cs_primary_subjects

['Robotics (cs.RO)']

In [10]:
def clean_text(raw_text):
    """Normalise a raw title/abstract string into single-line plain text.

    Steps, in order:
      1. Decode HTML entities (e.g. "&amp;" becomes "&").
      2. Remove HTML-like tags. Only spans that start like a tag ("<" followed
         by a letter, "/letter", "!" or "?") are removed, so inequalities such
         as "p < 0.05 and q > 1" are left intact.
      3. Convert LaTeX-style quotes (two backticks ... two apostrophes) into
         straight double quotes, including quotes that span a line break.
      4. Collapse every whitespace run to one space and trim both ends.

    Args:
        raw_text: The text to clean, or None.

    Returns:
        str: The cleaned text; the empty string when raw_text is None.
    """
    # html.unescape raises TypeError on None; treat a null field as empty text.
    if raw_text is None:
        return ""
    text = html.unescape(raw_text)
    # Require a tag-like opener so "<" / ">" used as math operators survive.
    text = re.sub(r"<(?:/?[A-Za-z]|[!?])[^>]*>", "", text)
    # DOTALL lets a quoted span cross a newline; whitespace is collapsed after.
    text = re.sub(r"``(.*?)''", r'"\1"', text, flags=re.DOTALL)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [11]:
# Try the cleaner on the first filtered paper's title.
clean_text(cs_ds[0]["title"])

'Working and Assembly Modes of the Agile Eye'

In [12]:
# Serialise every filtered paper into one text string.
# Each record is two lines, "Title: ..." and "Abstract: ...", opened by BOS and
# closed by EOS (the ASCII control characters STX \x02 and ETX \x03).
BOS, EOS = "\x02", "\x03"
N = len(cs_ds)
text_lines = []

for i, row in enumerate(cs_ds):
    title = clean_text(row["title"])
    abstract = clean_text(row["abstract"])
    text_lines.extend((f"{BOS}Title: {title}", f"Abstract: {abstract}{EOS}"))
    # Blank line between records, but none after the final one.
    if i + 1 < N:
        text_lines.append("")

final_text = "\n".join(text_lines)

In [13]:
# Persist the assembled corpus to disk as UTF-8 text.
with open("arxiv.txt", mode="w", encoding="utf-8") as out_file:
    out_file.write(final_text)