# Scientific Literature Filtering

In [8]:
# Install the packages listed in ../requirements.txt.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -r ../requirements.txt

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import xml.etree.ElementTree
import requests

In [5]:
# arXiv export API query: the first 1000 entries in the cs.LG category.
api = "http://export.arxiv.org/api/query?search_query=cat:cs.LG&start=0&max_results=1000"

# A timeout keeps a stalled connection from hanging the kernel indefinitely.
response = requests.get(api, timeout=60)
# Stop here on 4xx/5xx instead of feeding an error page to the XML parser later on.
response.raise_for_status()
print(f"Status code: {response.status_code}")
# Preview only: the full feed holds up to 1000 entries and would flood the cell output.
print(response.text[:1000])

Status code: 200
<?xml version='1.0' encoding='UTF-8'?>
<feed xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/" xmlns:arxiv="http://arxiv.org/schemas/atom" xmlns="http://www.w3.org/2005/Atom">
  <id>https://arxiv.org/api/CnvybtQa9uYuIE4pdtRKOb0rmDI</id>
  <title>arXiv Query: search_query=cat:cs.LG&amp;id_list=&amp;start=0&amp;max_results=1000</title>
  <updated>2026-02-23T15:51:44Z</updated>
  <link href="https://arxiv.org/api/query?search_query=cat:cs.LG&amp;start=0&amp;max_results=1000&amp;id_list=" type="application/atom+xml"/>
  <opensearch:itemsPerPage>1000</opensearch:itemsPerPage>
  <opensearch:totalResults>254214</opensearch:totalResults>
  <opensearch:startIndex>0</opensearch:startIndex>
  <entry>
    <id>http://arxiv.org/abs/2012.11510v1</id>
    <title>Design Rule Checking with a CNN Based Feature Extractor</title>
    <updated>2020-12-21T17:26:31Z</updated>
    <link href="https://arxiv.org/abs/2012.11510v1" rel="alternate" type="text/html"/>
    <link href="https://a

In [9]:
# Where the CSV snapshots live: the untouched API dump and its cleaned counterpart.
data_path_raw, data_path_processed = (
    '../data/raw/arxiv_papers.csv',
    '../data/processed/arxiv_papers.csv',
)

In [10]:
# Extracting features
features = ['id', 'title', 'summary', 'category', 'published']
atom_ns = '{http://www.w3.org/2005/Atom}' # Atom XML namespace prefix carried by every tag looked up below


def _entry_to_row(entry):
    """Flatten one Atom <entry> element into a list of strings ordered like `features`.

    All <category> children are reduced to their `term` attribute and joined
    with '|'. For every other feature the element text is used, with runs of
    whitespace (including embedded newlines) collapsed to single spaces.
    A missing or empty element yields an empty string.
    """
    row = []
    for feature_name in features:
        if feature_name == 'category':
            terms = [
                cat_elem.attrib.get('term', '')
                for cat_elem in entry.findall(atom_ns + 'category')
            ]
            row.append('|'.join(terms))
        else:
            element = entry.find(atom_ns + feature_name)
            text = element.text if element is not None and element.text else ''
            # split()/join collapses newlines and repeated spaces in one step
            row.append(' '.join(text.split()))
    return row


root = xml.etree.ElementTree.fromstring(response.text) # parses the raw XML text into an XML element tree
entries = root.findall(atom_ns + 'entry') # all <entry> elements, one per paper

rows = [_entry_to_row(entry) for entry in entries]

# Let pandas write the CSV: it quotes fields containing commas or quotes, so the
# text is stored unmodified (no comma stripping), and it writes UTF-8 by default.
pd.DataFrame(rows, columns=features).to_csv(data_path_raw, index=False)

## EDA and Pre-processing

In [13]:
def _split_categories(joined):
    """Turn a '|'-delimited category string back into a list of category codes."""
    return joined.split('|')


# Read the raw CSV snapshot and restore the category column to a list per row.
data = pd.read_csv(data_path_raw)
data['category'] = data['category'].map(_split_categories)
data.head()

Unnamed: 0,id,title,summary,category,published
0,http://arxiv.org/abs/2012.11510v1,Design Rule Checking with a CNN Based Feature ...,Design rule checking (DRC) is getting increasi...,[cs.LG],2020-12-21T17:26:31Z
1,http://arxiv.org/abs/2012.11638v1,Unsupervised in-distribution anomaly detection...,Anomaly detection is a key application of mach...,"[cs.LG, hep-ex, physics.data-an]",2020-12-21T19:05:22Z
2,http://arxiv.org/abs/2012.11325v1,Detecting Botnet Attacks in IoT Environments: ...,The increased reliance on the Internet and the...,"[cs.CR, cs.LG, cs.NI]",2020-12-16T16:39:55Z
3,http://arxiv.org/abs/2012.11327v1,Collaborative residual learners for automatic ...,Clinical coding is an administrative process t...,"[cs.IR, cs.LG]",2020-12-16T07:07:27Z
4,http://arxiv.org/abs/2012.11333v1,Ensemble model for pre-discharge icd10 coding ...,The translation of medical diagnosis to clinic...,"[cs.IR, cs.LG]",2020-12-16T07:02:56Z


In [22]:
# Unique categories, in order of first appearance.
# dict.fromkeys de-duplicates with constant-time lookups while keeping insertion
# order, replacing a linear `in` scan of a growing list for every category.
unique_categories = list(dict.fromkeys(
    category
    for categories in data['category']
    for category in categories
))

print(f"Unique categories: {unique_categories}")
print(f"Number of unique categories: {len(unique_categories)}")

Unique categories: ['cs.LG', 'hep-ex', 'physics.data-an', 'cs.CR', 'cs.NI', 'cs.IR', 'cs.SI', 'stat.ML', 'eess.IV', 'cs.CV', 'math.OC', 'math.PR', 'cs.RO', 'eess.AS', 'cs.SD', 'cs.CL', 'cs.AI', 'cs.CY', 'cs.IT', 'eess.SP', 'cs.DB', 'stat.AP', 'cs.CG', 'cs.NE', 'q-bio.NC', 'cs.GT', 'hep-lat', 'cond-mat.dis-nn', 'cond-mat.stat-mech', 'cs.SE', 'cs.LO', 'cs.DC', 'cond-mat.soft', 'math.ST', 'q-bio.MN', 'math.AT', 'q-bio.GN', 'cs.DS', 'stat.CO', 'cs.MA', 'cs.DM', 'cs.GR', 'econ.EM', 'q-bio.OT', 'stat.ME', 'quant-ph', 'cs.PL', 'physics.med-ph', 'q-bio.QM', 'cs.ET', 'eess.SY', 'physics.comp-ph', 'cs.HC', 'cs.DL', 'q-fin.ST', 'math.NA', 'q-bio.BM', 'q-bio.TO', 'cs.MS', 'physics.geo-ph', 'q-fin.CP', 'cs.MM', 'q-bio.PE', 'econ.GN', 'cond-mat.mtrl-sci', 'cs.AR', 'math.OA', 'cs.CE', 'astro-ph.IM', 'math.AP', 'math.MG', 'math.DS', 'math.CO', 'q-fin.GN', 'physics.soc-ph', 'cs.PF', 'cs.CC', 'cs.SC', 'q-fin.TR', 'astro-ph.EP']
Number of unique categories: 80


## Summary cleaning

In [23]:
def _normalise_summary(text):
    """Trim surrounding whitespace from a summary and lowercase it."""
    return text.strip().lower()


# One elementwise pass that trims and then lowercases each summary.
data['summary'] = data['summary'].map(_normalise_summary)
data['summary']

0      design rule checking (drc) is getting increasi...
1      anomaly detection is a key application of mach...
2      the increased reliance on the internet and the...
3      clinical coding is an administrative process t...
4      the translation of medical diagnosis to clinic...
                             ...                        
995    the communication between data-generating devi...
996    recently we have witnessed great progress in t...
997    we propose a method to predict the sim-to-real...
998    since medical image data sets contain few samp...
999    a fundamental question in neuroscience is how ...
Name: summary, Length: 1000, dtype: str

In [24]:
# Persist the cleaned frame to the processed-data location, without the row index column.
data.to_csv(path_or_buf=data_path_processed, index=False)