# Data Collection and Exploratory Data Analysis

In [1]:
import csv
import xml.etree.ElementTree

import pandas as pd
import requests

## Data collection

In [2]:
# Official arXiv API endpoint; this query asks for cs.LG entries, start=0, max_results=1000
api = "http://export.arxiv.org/api/query?search_query=cat:cs.LG&start=0&max_results=1000"

# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(api, timeout=60)
print(f"Status code: {response.status_code}")
# Fail here on an HTTP error so later cells never try to parse an error page as feed XML.
response.raise_for_status()

Status code: 200


In [3]:
# Output locations, relative to the notebook's working directory
data_path_raw = "../data/raw/arxiv_papers.csv"              # raw dump written by the collection step
data_path_processed = "../data/processed/arxiv_papers.csv"  # cleaned dataset written after pre-processing

In [4]:
# Extracting features
features = ['id', 'title', 'summary', 'category', 'published']
atom_ns = '{http://www.w3.org/2005/Atom}' # Atom XML namespace prefix, prepended to every tag looked up below

root = xml.etree.ElementTree.fromstring(response.text) # parse the raw XML text into an element tree
entries = root.findall(atom_ns + 'entry') # all <entry> children of the feed root


def _entry_to_row(entry, field_names, ns):
    """Return one CSV row (a list of strings) for a single <entry> element.

    'category' collects the `term` attribute of every <category> child and joins
    them with '|'. Every other field is the text of the first matching child with
    runs of whitespace (newlines included) collapsed to single spaces, or '' when
    the child or its text is missing.
    """
    row = []
    for field in field_names:
        if field == 'category':
            terms = [cat.attrib.get('term', '') for cat in entry.findall(ns + 'category')]
            row.append('|'.join(terms))
        else:
            element = entry.find(ns + field)
            text = element.text if element is not None and element.text else ''
            row.append(' '.join(text.split()))
    return row


# csv.writer quotes fields that contain commas or quote characters, so titles and
# abstracts are stored unmodified instead of having their commas deleted.
# newline='' hands line-ending control to the writer; the explicit encoding keeps
# non-ASCII text independent of the platform default.
with open(data_path_raw, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(features) # header row (features)
    writer.writerows(_entry_to_row(entry, features, atom_ns) for entry in entries)

## EDA and Pre-processing

In [5]:
# Load the raw data
# Read the raw CSV back from data_path_raw into a DataFrame
data = pd.read_csv(filepath_or_buffer=data_path_raw)

shape = data.shape  # (rows, columns)
print(f"Shape of data: {shape}")
data.head()

Shape of data: (1000, 5)


Unnamed: 0,id,title,summary,category,published
0,http://arxiv.org/abs/2012.11510v1,Design Rule Checking with a CNN Based Feature ...,Design rule checking (DRC) is getting increasi...,cs.LG,2020-12-21T17:26:31Z
1,http://arxiv.org/abs/2012.11638v1,Unsupervised in-distribution anomaly detection...,Anomaly detection is a key application of mach...,cs.LG|hep-ex|physics.data-an,2020-12-21T19:05:22Z
2,http://arxiv.org/abs/2012.11325v1,Detecting Botnet Attacks in IoT Environments: ...,The increased reliance on the Internet and the...,cs.CR|cs.LG|cs.NI,2020-12-16T16:39:55Z
3,http://arxiv.org/abs/2012.11327v1,Collaborative residual learners for automatic ...,Clinical coding is an administrative process t...,cs.IR|cs.LG,2020-12-16T07:07:27Z
4,http://arxiv.org/abs/2012.11333v1,Ensemble model for pre-discharge icd10 coding ...,The translation of medical diagnosis to clinic...,cs.IR|cs.LG,2020-12-16T07:02:56Z


In [6]:
# Basic Integrity Checks
# Missing ids, distinct ids, and repeated abstracts, computed first and reported below
id_column = data['id']
missing_ids = id_column.isnull().sum()
distinct_ids = id_column.nunique()
repeated_summaries = data.duplicated(subset='summary').sum()

print(f"Number of missing values in 'id': {missing_ids}")
print(f"Number of unique IDs: {distinct_ids}")
print(f"Number of duplicate summaries: {repeated_summaries}")

Number of missing values in 'id': 0
Number of unique IDs: 1000
Number of duplicate summaries: 0


In [7]:
# Collect every distinct category label, in first-seen order.
# dropna() skips rows with a missing category (an empty CSV field is read back as
# NaN, and NaN has no .split()). dict.fromkeys de-duplicates with hash lookups
# while preserving insertion order, replacing the quadratic "not in list" scan.
unique_categories = list(dict.fromkeys(
    cat
    for cats in data['category'].dropna()
    for cat in cats.split('|')
))

print(f"Number of unique categories: {len(unique_categories)}")

Number of unique categories: 80


In [8]:
# Removing URL prefix from 'id'
# Only the leading 'http(s)://arxiv.org/abs/' is removed, so an identifier that itself
# contains a slash (old-style arXiv ids such as 'cs/0501001v1') is kept whole.
# Taking just the last path segment would cut it to '0501001v1' and could make ids
# from different archives collide. Re-running the cell is a no-op.
data['id'] = data['id'].str.replace(r'^https?://arxiv\.org/abs/', '', regex=True)

In [9]:
# Length of abstracts, in whitespace-separated words
# A missing summary is treated as empty text and counts as 0 words (converting NaN
# with str() would count the literal 'nan' as one word).
abstract_word = data['summary'].fillna('').astype(str).str.split().str.len()
abstract_word.describe()

count    1000.000000
mean      165.877000
std        48.202607
min        44.000000
25%       131.000000
50%       163.500000
75%       198.000000
max       286.000000
Name: summary, dtype: float64

In [10]:
# Published Year count
# Convert the timestamp strings to a year only while the column is still non-numeric.
# Re-running an unguarded conversion would pass the integer years to to_datetime,
# which interprets plain integers as epoch offsets and would yield 1970 for every row.
if not pd.api.types.is_numeric_dtype(data['published']):
    data['published'] = pd.to_datetime(data['published']).dt.year
data['published'].value_counts().sort_index()

published
2019     99
2020    901
Name: count, dtype: int64

In [11]:
# Build the combined text "<title>. <summary>", trimmed and lower-cased.
# fillna('') keeps a missing title or summary from turning the whole value into NaN,
# which previously made the per-row .strip() fail on a float.
data['combined'] = (
    data['title'].fillna('') + ". " + data['summary'].fillna('')
).str.strip().str.lower()  # surrounding-whitespace removal, then lowercasing

## Saving the processed dataset in CSV format

In [12]:
# Write the processed frame to data_path_processed; index=False omits the row index column
data.to_csv(path_or_buf=data_path_processed, index=False)

In [13]:
# Read the saved file back and preview the first rows
data = pd.read_csv(filepath_or_buffer=data_path_processed)
data.head(5)

Unnamed: 0,id,title,summary,category,published,combined
0,2012.11510v1,Design Rule Checking with a CNN Based Feature ...,Design rule checking (DRC) is getting increasi...,cs.LG,2020,design rule checking with a cnn based feature ...
1,2012.11638v1,Unsupervised in-distribution anomaly detection...,Anomaly detection is a key application of mach...,cs.LG|hep-ex|physics.data-an,2020,unsupervised in-distribution anomaly detection...
2,2012.11325v1,Detecting Botnet Attacks in IoT Environments: ...,The increased reliance on the Internet and the...,cs.CR|cs.LG|cs.NI,2020,detecting botnet attacks in iot environments: ...
3,2012.11327v1,Collaborative residual learners for automatic ...,Clinical coding is an administrative process t...,cs.IR|cs.LG,2020,collaborative residual learners for automatic ...
4,2012.11333v1,Ensemble model for pre-discharge icd10 coding ...,The translation of medical diagnosis to clinic...,cs.IR|cs.LG,2020,ensemble model for pre-discharge icd10 coding ...
