In [1]:
import pandas as pd
import numpy as np
import glob
import os
import datetime
import pathlib
%matplotlib inline

In [2]:
dumpdate = "20191219"

In [3]:
def time_elapsed(start):
    end = datetime.datetime.now()

    time_to_run = end - start
    minutes = int(time_to_run.seconds/60)
    seconds = time_to_run.seconds % 60
    return "Total runtime: " + str(minutes) + " minutes, " + str(seconds) + " seconds"

In [4]:
start = datetime.datetime.now()


## Import processed data 

In [5]:
df_all = pd.read_csv("processed_data/" + dumpdate + "/arxiv-oai-af.tsv", sep="\t")
len(df_all)

1483806

In [6]:
df_all[51:54].transpose()

Unnamed: 0,51,52,53
abstract,We examine the possibility of the inclusion ...,We present an investigation of sample select...,"In this paper, we consider the phase recover..."
acm_class,,,
arxiv_id,astro-ph/9705080,1109.2787,1410.1368
author_text,"Helio J. Rocha-Pinto, Walter J. Maciel","Andreas Schulze, Lutz Wisotzki","Angélique Drémeau, Florent Krzakala"
categories,astro-ph,astro-ph.CO,"cs.IT,math.IT,math.ST,stat.AP,stat.TH"
comments,"tex file plus 5 postscript figures. Plain Tex,...","20 pages, 20 figures, accepted for publication...",To appear in the proceedings of IEEE Int'l Con...
created,1997-05-12,2011-09-13,2014-10-06
doi,,10.1051/0004-6361/201117564,10.1109/ICASSP.2015.7178654
num_authors,2,2,2
num_categories,1,1,5


## Processing


In [7]:
def list_to_string(l):
    return ', '.join(l)

In [8]:
def remove_initial_space(s):
    if s[0] == ' ':
        return s[1:]
    else:
        return s

In [9]:
df_all.abstract = df_all.abstract.apply(str.replace,args=("\n"," "))
df_all.abstract = df_all.abstract.apply(str.replace,args=("  "," "))
df_all.abstract = df_all.abstract.apply(str.replace,args=("  "," "))
df_all.abstract = df_all.abstract.apply(remove_initial_space)
df_all.title = df_all.title.apply(str.replace,args=("\n"," "))
df_all.title = df_all.title.apply(str.replace,args=("  "," "))
df_all.title = df_all.title.apply(str.replace,args=("  "," "))

In [10]:
df_all.iloc[5].abstract

'In this article we prove the following version of the Weak-BAB conjecture for $3$-folds in char $p>5$: Fix a DCC set $I\\subset [0, 1)$ and an algebraically closed field $k$ of characteristic $p>5$. Let $\\mathfrak{D}$ be a collection of klt pairs $(X, \\Delta)$ satisfying the following properties: (1) $X$ is a projective $3$-fold, (2) $\\Delta$ is an $\\mathbb{R}$-divisor with coefficients in $I$, (3) $K_X+\\Delta\\equiv 0$, and (4) $-K_X$ is ample. Then the set $\\{\\mbox{vol}_X(-K_X) \\ | \\ (X, \\Delta)\\in\\mathfrak{D}\\mbox{ for some }\\Delta\\}$ is bounded from above. '

### Get to one dataframe for each category

In [11]:
df_all['categories_list'] = df_all.categories.apply(str.split,args=",")

In [12]:
df_all.iloc[3].categories_list

['stat.ML', 'cs.AI', 'cs.LG', 'cs.SI']

In [13]:
categories = []

for row in df_all.iterrows():
    for cat in row[1].categories_list:
        #print(cat)
        if cat not in categories:
            categories.append(cat)
      

In [14]:
categories.sort()
len(categories)

175

In [15]:
df_dict = {}

for cat in categories:
   # print(cat)
    df_dict[cat] = df_all[df_all['categories'].str.contains(cat)]
    
    df_dict[cat] = df_dict[cat].sort_values(by='created')
   # print(len(df_dict[cat]))

In [16]:
df_all.query("doi=='10.1103/PhysRevD.86.101501'")

Unnamed: 0,abstract,acm_class,arxiv_id,author_text,categories,comments,created,doi,num_authors,num_categories,primary_cat,title,updated,categories_list
308834,We describe a simple way of obtaining horizon ...,,1204.1422,"Bibhas Ranjan Majhi, T. Padmanabhan","gr-qc,hep-th",v2: a comment added; accepted in PRD (Rapid Co...,2012-04-06,10.1103/PhysRevD.86.101501,2,2,gr-qc,Noether current from the surface term of gravi...,2012-10-17,"[gr-qc, hep-th]"


# File output

In [17]:
os.system("rm processed_data/" + dumpdate + "/per_month/*")
pathlib.Path("processed_data/" + dumpdate + "/per_month/").mkdir(parents=True, exist_ok=True)

In [18]:
def get_month(s):
    return s[0:7]

In [19]:
df_all = df_all.drop("categories_list", axis=1)
df_all['created_ym'] = df_all.created.apply(get_month)

In [20]:
def get_year(s):
    return s[0:4]

In [21]:
os.system("rm processed_data/" + dumpdate + "/per_year/*")
pathlib.Path("processed_data/" + dumpdate + "/per_year/").mkdir(parents=True, exist_ok=True) 

In [22]:
df_all['created_year'] = df_all.created.apply(get_year)

for year in range(1993,2019):


    query = "created_year == '" + str(year) + "'"

    yearly_df = df_all.query(query)
    
    yearly_df = yearly_df.sort_values(by='created')
    
    filename = "processed_data/" + dumpdate + "/per_year/" + str(year) + ".tsv"

    yearly_df.drop("created_year", axis=1).to_csv(filename, sep="\t")

    zip_str = "zip -9 " + filename + ".zip " + filename
    os.system(zip_str)

    zip_size = round(os.path.getsize(filename + ".zip")/1024/1024,2)

    print(str(year), len(yearly_df), zip_size)


1993 6728 2.06
1994 10085 3.24
1995 12994 4.31
1996 15875 5.3
1997 19621 6.65
1998 24174 8.31
1999 27694 9.6
2000 30672 10.75
2001 33128 11.77
2002 36103 12.8
2003 39389 14.17
2004 43721 15.88
2005 46867 17.2
2006 50304 18.71
2007 55768 20.7
2008 58796 22.23
2009 64077 24.68
2010 70286 27.93
2011 76605 30.93
2012 84389 34.81
2013 92866 38.72
2014 97598 41.66
2015 105128 45.39
2016 113436 49.89
2017 123781 55.1
2018 140135 62.66


In [23]:
os.system("rm processed_data/" + dumpdate + "/arxiv_oaiaf_catkey.h5")

256

In [24]:
os.system("rm processed_data/" + dumpdate + "/per_category/*")

0

In [25]:
pathlib.Path("processed_data/" + dumpdate + "/per_category/").mkdir(parents=True, exist_ok=True) 

In [26]:
for cat in categories:
    fn = "processed_data/" + dumpdate + "/per_category/" + cat
    df_dict[cat].to_csv(fn+".tsv", sep="\t", mode="w")
    #df_dict_generic[cat].to_hdf(fn+".h5", key="df", mode="w")
    
    zip_str = "zip -9 " + fn+".tsv.zip " + fn + ".tsv"
    bzip2_str = "bzip2 -kz9 " + fn + ".tsv"
    xz_str = "xz -kz9 " + fn + ".tsv"

        
    #os.system(zip_str)
    #os.system(bzip2_str)
    os.system(xz_str)
    
    xz_size = round(os.path.getsize(fn+".tsv.xz")/1024/1024,2)
    
    print(fn, xz_size, "M")
    
    # for h5 export
    
    # cat_key = cat.replace(".","_")
    # cat_key = cat_key.replace("-","_")
    # df_dict[cat].to_hdf("processed_data/" + dumpdate + "/arxiv_oaiaf_catkey.h5",
    #                            key=cat_key, mode="a")
    

processed_data/20190101/per_category/acc-phys 0.01 M
processed_data/20190101/per_category/adap-org 0.15 M
processed_data/20190101/per_category/alg-geom 0.27 M
processed_data/20190101/per_category/ao-sci 0.01 M
processed_data/20190101/per_category/astro-ph 74.36 M
processed_data/20190101/per_category/astro-ph.CO 14.49 M
processed_data/20190101/per_category/astro-ph.EP 5.47 M
processed_data/20190101/per_category/astro-ph.GA 12.25 M
processed_data/20190101/per_category/astro-ph.HE 10.06 M
processed_data/20190101/per_category/astro-ph.IM 5.12 M
processed_data/20190101/per_category/astro-ph.SR 12.45 M
processed_data/20190101/per_category/atom-ph 3.38 M
processed_data/20190101/per_category/bayes-an 0.01 M
processed_data/20190101/per_category/chao-dyn 0.55 M
processed_data/20190101/per_category/chem-ph 3.33 M
processed_data/20190101/per_category/cmp-lg 0.21 M
processed_data/20190101/per_category/comp-gas 0.06 M
processed_data/20190101/per_category/cond-mat 59.01 M
processed_data/20190101/per_

In [27]:
print(time_elapsed(start))

'Total runtime: 62 minutes, 40 seconds'