In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import glob
import datetime
import lxml.etree as ET
from xml.dom import minidom
%matplotlib inline

In [2]:
# Collect the path of every entry in the arXiv OAI dump directory.
# NOTE(review): glob.glob returns paths in arbitrary (filesystem) order, so the
# positional uses of `fn` further down (fn[2], fn[1:]) are not guaranteed to pick
# the same files on every machine/run -- confirm whether a sorted list is wanted.
fn = glob.glob("/data/arxiv/oai-af/*")

In [3]:
# Number of paths matched by the glob.
len(fn)

1437618

In [4]:
# Parse a single record (the path at index 2 of `fn`) and pretty-print it to
# inspect the XML layout. `doc` is kept as a global: it is reused below as the
# sample input for get_author_text.
doc = minidom.parse(fn[2])
print(doc.toprettyxml())

<?xml version="1.0" ?>
<arXiv xmlns="http://arxiv.org/OAI/arXiv/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://arxiv.org/OAI/arXiv/ http://arxiv.org/OAI/arXiv.xsd">
	
 
	<id>cond-mat/0302169</id>
	<created>2003-02-10</created>
	<authors>
		<author>
			<keyname>van der Marel</keyname>
			<forenames>D.</forenames>
		</author>
		<author>
			<keyname>Molegraaf</keyname>
			<forenames>H. J. A.</forenames>
		</author>
		<author>
			<keyname>Presura</keyname>
			<forenames>C.</forenames>
		</author>
		<author>
			<keyname>Santoso</keyname>
			<forenames>I.</forenames>
		</author>
	</authors>
	<title>Superconductivity by Kinetic Energy Saving?</title>
	<categories>cond-mat.supr-con cond-mat.str-el</categories>
	<comments>11 pages, ReVTeX, 7 figures in encapsulated postscript</comments>
	<journal-ref>in &quot;Concepts in electron correlation&quot;, Edited by A. Hewson and V.
  Zlatic, Kluwer (2003), p 7-16</journal-ref>
	<abstract>  A brief introduction is gi

In [5]:
def get_author_text(doc):
    """Extract the author names from a parsed arXiv OAI record.

    Parameters
    ----------
    doc : xml.dom.minidom.Document
        Parsed record containing an ``<authors>`` element whose ``<author>``
        children each hold a ``<keyname>`` and, optionally, ``<forenames>``.

    Returns
    -------
    tuple of (str, list of str)
        ``(author_text, author_list)``. ``author_list`` has one
        ``"forenames keyname"`` string per author (just ``"keyname"`` when the
        author has no forenames). ``author_text`` is the list joined with
        ``", "`` with double spaces collapsed to single spaces.

    Raises
    ------
    IndexError
        If the record has no ``<authors>`` element or an author has no
        ``<keyname>`` element.
    AttributeError
        If an author's ``<keyname>`` element is empty.
    """
    authorlist = []
    authors_node = doc.getElementsByTagName("authors")[0]
    # Select the <author> elements explicitly. Iterating over ``childNodes``
    # also yields the whitespace text nodes of indented XML, and those have no
    # ``getElementsByTagName`` method.
    for author in authors_node.getElementsByTagName("author"):
        keyname = author.getElementsByTagName("keyname")[0].firstChild.data
        try:
            forenames = author.getElementsByTagName("forenames")[0].firstChild.data
        except (IndexError, AttributeError):
            # IndexError: no <forenames> tag at all (e.g. collaborations).
            # AttributeError: the tag is present but empty (firstChild is None).
            forenames = ""
        # Join only the non-empty parts so an author without forenames does
        # not get a leading space.
        authorlist.append(" ".join(part for part in (forenames, keyname) if part))
    return ", ".join(authorlist).replace("  ", " "), authorlist

In [6]:
# Try the helper on the sample record held in `doc`.
get_author_text(doc)

('D. van der Marel, H. J. A. Molegraaf, C. Presura, I. Santoso',
 ['D. van der Marel', 'H. J. A. Molegraaf', 'C. Presura', 'I. Santoso'])

In [7]:
# arXiv categories that get their own 0/1 indicator column; a record carrying
# any of them is flagged in 'anymlcat'.
ML_CATEGORIES = ('cs.LG', 'cs.AI', 'cs.CL', 'cs.SI', 'stat.ML')

# Print the running row count every this many parsed records.
PROGRESS_EVERY = 25000


def _required_text(doc, tag):
    """Return the text of the first ``<tag>`` element in ``doc``.

    Raises IndexError when the element is absent, so a malformed record is
    reported rather than silently skipped.
    """
    return doc.getElementsByTagName(tag)[0].firstChild.data


def _optional_text(doc, tag):
    """Return the text of the first ``<tag>`` element, or NaN when it is absent."""
    try:
        return doc.getElementsByTagName(tag)[0].firstChild.data
    except IndexError:
        return np.nan


def parse_record(doc):
    """Turn one parsed arXiv OAI record into a flat dict (one DataFrame row).

    Parameters
    ----------
    doc : xml.dom.minidom.Document
        Parsed record with ``<title>``, ``<id>``, ``<created>``,
        ``<categories>``, ``<abstract>`` and ``<authors>`` elements;
        ``<updated>`` and ``<doi>`` are optional and become NaN when missing.

    Returns
    -------
    dict
        Keys: title, updated, arxiv_id, created, categories (list of str),
        num_categories, abstract, doi, one 0/1 flag per entry of
        ``ML_CATEGORIES``, anymlcat, author_text, num_authors.

    Raises
    ------
    IndexError
        If one of the required elements is missing.
    """
    # Split on any whitespace so stray newlines or repeated spaces cannot
    # produce empty or merged tokens.
    category_list = _required_text(doc, "categories").split()
    category_set = set(category_list)

    author_text, author_list = get_author_text(doc)

    # Key order is kept as before so the resulting column order is unchanged.
    row = {
        'title': _required_text(doc, "title"),
        'updated': _optional_text(doc, "updated"),
        'arxiv_id': _required_text(doc, "id"),
        'created': _required_text(doc, "created"),
        'categories': category_list,
        'num_categories': len(category_list),
        'abstract': _required_text(doc, "abstract"),
        'doi': _optional_text(doc, "doi"),
    }

    # Exact token membership (not a substring test on the raw string), so a
    # category whose name merely contains e.g. "cs.AI" can never match.
    ml_flags = {cat: int(cat in category_set) for cat in ML_CATEGORIES}
    row.update(ml_flags)
    row['anymlcat'] = int(any(ml_flags.values()))

    row['author_text'] = author_text
    row['num_authors'] = len(author_list)
    return row


df_list = []
dates = []  # NOTE(review): never filled in this cell -- confirm whether it is still needed.

# NOTE(review): the first path in `fn` is skipped; the reason is not visible
# here, and glob order is arbitrary -- confirm which file is meant to be excluded.
for file in fn[1:]:
    df_list.append(parse_record(minidom.parse(file)))

    # Use '==' for the comparison: 'is' tests object identity and only worked
    # for small ints by accident of the interpreter.
    if len(df_list) % PROGRESS_EVERY == 0:
        print(len(df_list))

25000
50000
75000
100000
125000
150000
175000
200000
225000
250000
275000
300000
325000
350000
375000
400000
425000
450000
475000
500000
525000
550000
575000
600000
625000
650000
675000
700000
725000
750000
775000
800000
825000
850000
875000
900000
925000
950000
975000
1000000
1025000
1050000
1075000
1100000
1125000
1150000
1175000
1200000
1225000
1250000
1275000
1300000
1325000
1350000
1375000
1400000
1425000


In [8]:
# Build the table in one go from the list of per-record dicts: one row per
# record, one column per dict key.
df = pd.DataFrame(df_list)

In [9]:
# Row count of the full table.
len(df)

1437617

In [10]:
# Spot-check the author string of a record whose authors are collaborations.
print(df.query("arxiv_id == '1207.2757'")["author_text"])

2494     CDF Collaboration, D0 Collaboration, TEVNPHWG
Name: author_text, dtype: object


In [11]:
# Keep only the records flagged with at least one of the ML-related categories.
df_anyml = df.loc[df["anymlcat"] == 1]

In [12]:
# Row count of the ML-category subset.
len(df_anyml)

56051

In [13]:
# Persist the full table twice: tab-separated text and HDF5 (key 'df').
# NOTE(review): 'categories' holds Python lists, so the TSV presumably stores
# their string repr and needs re-parsing on load -- verify against the readers.
# The recorded output of this cell shows a PyTables warning that the
# object-typed columns are pickled in the HDF5 file.
df.to_csv("processed_data/arxiv-oai-af.tsv", sep="\t")
df.to_hdf("processed_data/arxiv-oai-af.h5", key='df')


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['abstract', 'arxiv_id', 'author_text', 'categories', 'created', 'doi', 'title', 'updated']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [14]:
# Persist the ML-category subset in the same two formats (TSV and HDF5, key 'df').
# The recorded output of this cell shows a PyTables warning that the
# object-typed columns are pickled in the HDF5 file.
df_anyml.to_csv("processed_data/arxiv-oai-af-anyml.tsv", sep="\t")
df_anyml.to_hdf("processed_data/arxiv-oai-af-anyml.h5", key='df')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['abstract', 'arxiv_id', 'author_text', 'categories', 'created', 'doi', 'title', 'updated']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)
