In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import glob
import datetime
import lxml.etree as ET
from xml.dom import minidom
import datetime
import pathlib

%matplotlib inline

In [2]:
# Snapshot label (YYYYMMDD); used below to name the output folder under processed_data/.
date = "20191219"

In [3]:
# Directory holding the raw XML records. Keep the trailing slash: later cells
# build paths by plain string concatenation (e.g. datadir + "*").
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
datadir = "/home/staeiou/data/arxiv/oai-af/"

In [4]:
# Reference timestamp; time_elapsed(start) reports the runtime measured from this cell.
start = datetime.datetime.now()


In [5]:
def time_elapsed(start):
    """
    Inputs a datetime.datetime captured earlier with datetime.datetime.now(),
    returns a string of the form "Total runtime: M minutes, S seconds".

    M is the total number of whole minutes since start (it is not capped at 59,
    and includes any full days), S is the remaining whole seconds.

    """
    time_to_run = datetime.datetime.now() - start

    # timedelta.seconds only holds the sub-day remainder, so it would wrap
    # around for runs longer than 24 hours; total_seconds() includes the days.
    minutes, seconds = divmod(int(time_to_run.total_seconds()), 60)
    return "Total runtime: " + str(minutes) + " minutes, " + str(seconds) + " seconds"

## Parse all raw XML files

In [6]:
# Paths of every non-hidden entry directly inside datadir. The parsing loop
# below walks this list in the order glob returns it.
fn = glob.glob(datadir + "*")

In [7]:
# Number of paths matched by the glob above.
len(fn)

1484151

In [8]:
# Parse one known record and pretty-print it to inspect the XML structure
# that the parsing functions below rely on.
doc = minidom.parse(datadir + "oai:arXiv.org:1503.04358.arXiv.xml")
print(doc.toprettyxml())

<?xml version="1.0" ?>
<arXiv xmlns="http://arxiv.org/OAI/arXiv/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://arxiv.org/OAI/arXiv/ http://arxiv.org/OAI/arXiv.xsd">
	
 
	<id>1503.04358</id>
	<created>2015-03-14</created>
	<authors>
		<author>
			<keyname>Koopman</keyname>
			<forenames>Rob</forenames>
		</author>
		<author>
			<keyname>Wang</keyname>
			<forenames>Shenghui</forenames>
		</author>
		<author>
			<keyname>Scharnhorst</keyname>
			<forenames>Andrea</forenames>
		</author>
		<author>
			<keyname>Englebienne</keyname>
			<forenames>Gwenn</forenames>
		</author>
	</authors>
	<title>Ariadne's Thread - Interactive Navigation in a World of Networked
  Information</title>
	<categories>cs.DL</categories>
	<comments>CHI'15 Extended Abstracts, April 18-23, 2015, Seoul, Republic of
  Korea. ACM 978-1-4503-3146-3/15/04</comments>
	<acm-class>H.5.2; H.3.3</acm-class>
	<doi>10.1145/2702613.2732781</doi>
	<license>http://arxiv.org/licenses/nonexclusive

In [9]:
def get_author_text(doc):
    """
    Inputs a minidom doc object, outputs a tuple of (string formatted list of authors, python list of authors).

    Each author is rendered as "<forenames> <keyname>", or as just "<keyname>"
    when the author has no forenames (e.g. a collaboration name). The string
    form joins the names with ", ".

    Raises IndexError if the document has no <authors> element or an author
    has no <keyname> element.

    Helper function for doc_to_dict()

    """
    authorlist = []
    for author in doc.getElementsByTagName("authors")[0].childNodes:
        # Whitespace between tags is parsed into text nodes, which have no
        # getElementsByTagName(); only element children are author records.
        if author.nodeType != author.ELEMENT_NODE:
            continue
        keyname = author.getElementsByTagName("keyname")[0].firstChild.data
        try:
            forenames = author.getElementsByTagName("forenames")[0].firstChild.data
        except (IndexError, AttributeError):
            # IndexError: no <forenames> element.
            # AttributeError: the element is empty, so firstChild is None.
            forenames = ""
        # Join only the parts that are present, so a missing forename does not
        # leave a stray leading space in the name.
        authorlist.append(" ".join(part for part in (forenames, keyname) if part))
    # Doubled spaces are collapsed in the string form only, as before.
    return ", ".join(authorlist).replace("  ", " "), authorlist

In [10]:
# Sanity check on the sample record parsed above.
get_author_text(doc)

('Rob Koopman, Shenghui Wang, Andrea Scharnhorst, Gwenn Englebienne',
 ['Rob Koopman', 'Shenghui Wang', 'Andrea Scharnhorst', 'Gwenn Englebienne'])

In [11]:
def _optional_text(doc, tag):
    """
    Returns the text of the first <tag> element in a minidom object, or np.nan
    when the element is missing or has no content.

    Helper function for doc_to_dict()

    """
    try:
        return doc.getElementsByTagName(tag)[0].firstChild.data
    except (IndexError, AttributeError):
        # IndexError: no such element in the record.
        # AttributeError: the element is empty, so firstChild is None.
        return np.nan


def doc_to_dict(doc):
    """
    Inputs a minidom object, returns a dictionary of all metadata

    Required elements (an exception propagates if one is missing): title, id,
    created, categories, abstract, and whatever get_author_text() requires.
    Optional elements (np.nan when missing or empty): comments, acm-class,
    updated, doi.

    Keys of the returned dictionary: title, updated, arxiv_id, created,
    categories (comma-separated), num_categories, primary_cat (first listed
    category), abstract, doi, acm_class, comments, author_text, num_authors.

    """
    title = doc.getElementsByTagName("title")[0].firstChild.data
    arxivid = doc.getElementsByTagName("id")[0].firstChild.data
    created = doc.getElementsByTagName("created")[0].firstChild.data
    categories = doc.getElementsByTagName("categories")[0].firstChild.data
    abstract = doc.getElementsByTagName("abstract")[0].firstChild.data

    # Categories arrive as one space-separated string; the first is the primary.
    category_list = categories.split(" ")

    author_text, author_list = get_author_text(doc)

    row = {
           'title':title,
           'updated':_optional_text(doc, "updated"),
           'arxiv_id':arxivid,
           'created':created,
           'categories':categories.replace(" " , ","),
           'num_categories':len(category_list),
           'primary_cat':category_list[0],
           'abstract':abstract,
           'doi':_optional_text(doc, "doi"),
           'acm_class':_optional_text(doc, "acm-class"),
           'comments':_optional_text(doc, "comments"),
           'author_text':author_text,
           'num_authors':len(author_list)
    }

    return row

### Iterate through all XML files and parse into list of dictionaries

In [12]:
# Parse every path found by the glob into one metadata dict per record.
df_list = []
dates = []  # not used by the cells shown here; kept so any code relying on it still finds it
for file in fn:

    try:
        row = doc_to_dict(minidom.parse(file))
    except IsADirectoryError:
        # The glob also matches sub-directories; skip them.
        continue

    df_list.append(row)

    # Progress report every 100,000 parsed records. Compare by value (==):
    # `is` tests object identity and emits a SyntaxWarning with a literal.
    if len(df_list) % 100000 == 0:
        print(len(df_list), time_elapsed(start))
    

100000 Total runtime: 1 minutes, 38 seconds
200000 Total runtime: 3 minutes, 1 seconds
300000 Total runtime: 4 minutes, 24 seconds
400000 Total runtime: 5 minutes, 48 seconds
500000 Total runtime: 7 minutes, 11 seconds
600000 Total runtime: 8 minutes, 35 seconds
700000 Total runtime: 9 minutes, 58 seconds
800000 Total runtime: 11 minutes, 23 seconds
900000 Total runtime: 12 minutes, 47 seconds
1000000 Total runtime: 14 minutes, 11 seconds
1100000 Total runtime: 15 minutes, 36 seconds
1200000 Total runtime: 17 minutes, 1 seconds
1300000 Total runtime: 18 minutes, 25 seconds
1400000 Total runtime: 19 minutes, 50 seconds


### Convert list of dicts to dataframe

In [13]:
# One row per parsed record; columns are the keys of the dicts built by doc_to_dict().
df = pd.DataFrame(df_list)

### Remove all rows created on or after 2019-12-01

In [14]:
# Keep only records created before 2019-12-01. `created` is stored as text
# (YYYY-MM-DD in the sample record above), so this is a string comparison,
# which orders such dates chronologically.
df = df.query("created < '2019-12-01'")

In [15]:
# Row count after the date filter.
len(df)

1483806

## Spot checking

In [16]:
# Row count before de-duplication, for comparison with the next cell.
len(df)

1483806

In [17]:
# Drop rows that are identical across all columns; an unchanged count means
# there were no exact duplicates.
df = df.drop_duplicates()
len(df)

1483806

In [18]:
# Spot check: author string for one specific record (judging by the output
# below, its authors are collaboration names without forenames).
print(df.query("arxiv_id == '1207.2757'").author_text)

1448312     CDF Collaboration, D0 Collaboration, TEVNPHWG
Name: author_text, dtype: object


In [19]:
# Spot check: records whose full author string is exactly this one name
# (i.e. single-author records).
df.query("author_text == 'R. Stuart Geiger'")

Unnamed: 0,abstract,acm_class,arxiv_id,author_text,categories,comments,created,doi,num_authors,num_categories,primary_cat,title,updated
153362,Automated software agents --- or bots --- ha...,,1810.0959,R. Stuart Geiger,"cs.CY,cs.AI,cs.HC,cs.SI",Originally published in 2011,2018-10-22,,1,4,cs.CY,The Lives of Bots,
1194979,Scholars and practitioners across domains ar...,,1709.09093,R. Stuart Geiger,"cs.CY,cs.AI,cs.HC","14 pages, typo fixed in v2",2017-09-26,10.1177/2053951717730735,1,3,cs.CY,Beyond opening up the black box: Investigating...,2017-10-01
1221231,This report is a high-level summary analysis...,,1706.02777,R. Stuart Geiger,"cs.CY,cs.SE,cs.SI",58 pages,2017-06-08,10.17605/OSF.IO/ENRQ5,1,3,cs.CY,Summary Analysis of the 2017 GitHub Open Sourc...,


In [20]:
# Spot check: look up the sample record inspected earlier by its DOI.
df.query("doi == '10.1145/2702613.2732781'")

Unnamed: 0,abstract,acm_class,arxiv_id,author_text,categories,comments,created,doi,num_authors,num_categories,primary_cat,title,updated
306862,This work-in-progress paper introduces an in...,H.5.2; H.3.3,1503.04358,"Rob Koopman, Shenghui Wang, Andrea Scharnhorst...",cs.DL,"CHI'15 Extended Abstracts, April 18-23, 2015, ...",2015-03-14,10.1145/2702613.2732781,4,1,cs.DL,Ariadne's Thread - Interactive Navigation in a...,


## Final export

In [21]:
# Create the dated output folder (and any missing parents); no error if it already exists.
pathlib.Path("processed_data/" + date).mkdir(parents=True, exist_ok=True) 

In [22]:
# Write the table as tab-separated values, without the index column.
df.to_csv("processed_data/" + date + "/arxiv-oai-af.tsv", sep="\t", index=False)

In [24]:
# df.to_pickle("processed_data/" + date + "/arxiv-oai-af.pkl")

In [25]:
# Total wall-clock time since `start` was set near the top of the notebook.
time_elapsed(start)

'Total runtime: 21 minutes, 57 seconds'