# Scientific Literature Filtering

In [None]:
%pip install -r ../requirements.txt

In [2]:
import pandas as pd
import xml.etree.ElementTree
import requests

In [3]:
api = "http://export.arxiv.org/api/query?search_query=cat:cs.LG&start=0&max_results=1000"

response = requests.get(api)
print(f"Status code: {response.status_code}")

Status code: 200


In [4]:
data_path_raw = '../data/raw/arxiv_papers.csv'
data_path_processed = '../data/processed/arxiv_papers.csv'

In [5]:
# Extracting features
features = ['id', 'title', 'summary', 'category', 'published']
root = xml.etree.ElementTree.fromstring(response.text) # parses the raw XML text into a XML element tree.
entries = root.findall('{http://www.w3.org/2005/Atom}entry') # finds all XML elements tagged as <entry>

atom_ns = '{http://www.w3.org/2005/Atom}' # Define the Atom XML namespace

with open(data_path_raw, 'w') as f:
    f.write(','.join(features) + '\n') # header row (features)
    for entry in entries:
        row = []
        for feature_name in features:
            if feature_name == 'category':
                categories = []
                # Find all category elements within the current entry
                for cat_elem in entry.findall(atom_ns + 'category'):
                    categories.append(cat_elem.attrib.get('term', ''))
                row.append('|'.join(categories)) # Join the list of categories into a string
            else:
                element = entry.find(atom_ns + feature_name)
                if element is not None:
                    # remove newlines, extra spaces, and escape commas for CSV
                    text_content = element.text.replace('\n', ' ').strip() if element.text else ''
                    row.append(text_content.replace(',', ''))
                else:
                    row.append('') # Appending empty string for missing data
        f.write(','.join(row) + '\n')

## EDA and Pre-processing

In [12]:
# Load the raw data
data = pd.read_csv(data_path_raw)
data['category'] = data['category'].apply(lambda x: x.split('|')) # converts string -> list for category

print(f"Shape of data: {data.shape}")
data.head()

Shape of data: (1000, 5)


Unnamed: 0,id,title,summary,category,published
0,http://arxiv.org/abs/2012.11510v1,Design Rule Checking with a CNN Based Feature ...,Design rule checking (DRC) is getting increasi...,[cs.LG],2020-12-21T17:26:31Z
1,http://arxiv.org/abs/2012.11638v1,Unsupervised in-distribution anomaly detection...,Anomaly detection is a key application of mach...,"[cs.LG, hep-ex, physics.data-an]",2020-12-21T19:05:22Z
2,http://arxiv.org/abs/2012.11325v1,Detecting Botnet Attacks in IoT Environments: ...,The increased reliance on the Internet and the...,"[cs.CR, cs.LG, cs.NI]",2020-12-16T16:39:55Z
3,http://arxiv.org/abs/2012.11327v1,Collaborative residual learners for automatic ...,Clinical coding is an administrative process t...,"[cs.IR, cs.LG]",2020-12-16T07:07:27Z
4,http://arxiv.org/abs/2012.11333v1,Ensemble model for pre-discharge icd10 coding ...,The translation of medical diagnosis to clinic...,"[cs.IR, cs.LG]",2020-12-16T07:02:56Z


In [14]:
# Unique categories
unique_categories = []
for categories in data['category']:
    for category in categories:
        if category not in unique_categories:
            unique_categories.append(category)

print(f"Number of unique categories: {len(unique_categories)}")

Number of unique categories: 80


## Summary cleaning

In [17]:
data['summary'] = data['summary'].apply(lambda x: x.strip()) # extra space  
data['summary'] = data['summary'].apply(lambda x: x.lower()) # lowercase

In [18]:
data.to_csv(data_path_processed, index=False)

In [19]:
data = pd.read_csv(data_path_processed)
data.head()

Unnamed: 0,id,title,summary,category,published
0,http://arxiv.org/abs/2012.11510v1,Design Rule Checking with a CNN Based Feature ...,design rule checking (drc) is getting increasi...,['cs.LG'],2020-12-21T17:26:31Z
1,http://arxiv.org/abs/2012.11638v1,Unsupervised in-distribution anomaly detection...,anomaly detection is a key application of mach...,"['cs.LG', 'hep-ex', 'physics.data-an']",2020-12-21T19:05:22Z
2,http://arxiv.org/abs/2012.11325v1,Detecting Botnet Attacks in IoT Environments: ...,the increased reliance on the internet and the...,"['cs.CR', 'cs.LG', 'cs.NI']",2020-12-16T16:39:55Z
3,http://arxiv.org/abs/2012.11327v1,Collaborative residual learners for automatic ...,clinical coding is an administrative process t...,"['cs.IR', 'cs.LG']",2020-12-16T07:07:27Z
4,http://arxiv.org/abs/2012.11333v1,Ensemble model for pre-discharge icd10 coding ...,the translation of medical diagnosis to clinic...,"['cs.IR', 'cs.LG']",2020-12-16T07:02:56Z
