In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import glob
import os
import datetime
import lxml.etree as ET
from xml.dom import minidom
%matplotlib inline

## Import processed data 

In [2]:
df_all = pd.read_hdf("processed_data/arxiv-oai-af.h5", key='df')
df_ml = pd.read_hdf("processed_data/arxiv-oai-af-anyml.h5", key='df')

In [3]:
df_all[0:2].transpose()

Unnamed: 0,0,1
abstract,We consider the two-nucleon weak interaction...,A brief introduction is given in the generic...
anymlcat,0,0
arxiv_id,0907.3995,cond-mat/0302169
author_text,"J. W. Shin, S. Ando, C. H. Hyun","D. van der Marel, H. J. A. Molegraaf, C. Presu..."
categories,"[nucl-th, hep-ph, nucl-ex]","[cond-mat.supr-con, cond-mat.str-el]"
created,2009-07-23,2003-02-10
cs.AI,0,0
cs.CL,0,0
cs.LG,0,0
cs.SI,0,0


## Processing


In [4]:
def list_to_string(l):
    return ', '.join(l)

In [5]:
def remove_initial_space(s):
    if s[0] == ' ':
        return s[1:]
    else:
        return s

In [6]:
df_all.abstract = df_all.abstract.apply(remove_initial_space)
df_all.abstract = df_all.abstract.apply(str.replace,args=("\n"," "))
df_all.abstract = df_all.abstract.apply(str.replace,args=("  "," "))
df_all.abstract = df_all.abstract.apply(str.replace,args=("  "," "))
df_all.title = df_all.title.apply(str.replace,args=("\n"," "))
df_all.title = df_all.title.apply(str.replace,args=("  "," "))
df_all.title = df_all.title.apply(str.replace,args=("  "," "))

### Get to one dataframe for each category

In [7]:
df_all['categories_str'] = df_all.categories.apply(list_to_string)

In [8]:
categories = []

for row in df_all.iterrows():
    for cat in row[1].categories:
        #print(cat)
        if cat not in categories:
            categories.append(cat)
      

In [9]:
categories.sort()
len(categories)

175

In [10]:
df_dict = {}

for cat in categories:
    print(cat)
    df_dict[cat] = df_all[df_all['categories_str'].str.contains(cat)]
    print(len(df_dict[cat]))

acc-phys
49
adap-org
584
alg-geom
1423
ao-sci
17
astro-ph
244091
astro-ph.CO
45076
astro-ph.EP
14423
astro-ph.GA
32627
astro-ph.HE
30821
astro-ph.IM
14138
astro-ph.SR
35957
atom-ph
13857
bayes-an
16
chao-dyn
2398
chem-ph
11380
cmp-lg
894
comp-gas
221
cond-mat
256186
cond-mat.dis-nn
16710
cond-mat.mes-hall
58630
cond-mat.mtrl-sci
54620
cond-mat.other
12519
cond-mat.quant-gas
12862
cond-mat.soft
26221
cond-mat.stat-mech
52945
cond-mat.str-el
50009
cond-mat.supr-con
32427
cs.AI
15873
cs.AR
1160
cs.CC
6218
cs.CE
2813
cs.CG
3530
cs.CL
10610
cs.CR
9142
cs.CV
21472
cs.CY
5234
cs.DB
3868
cs.DC
8177
cs.DL
2498
cs.DM
7408
cs.DS
11975
cs.ET
1292
cs.FL
2444
cs.GL
147
cs.GR
1252
cs.GT
4988
cs.HC
3352
cs.IR
4814
cs.IT
26451
cs.LG
23912
cs.LO
8598
cs.MA
2061
cs.MM
1822
cs.MS
1175
cs.NA
2414
cs.NE
5416
cs.NI
10745
cs.OH
1734
cs.OS
403
cs.PF
1602
cs.PL
3631
cs.RO
4845
cs.SC
1370
cs.SD
1668
cs.SE
5216
cs.SI
8920
cs.SY
7178
dg-ga
732
econ.EM
473
econ.GN
68
econ.TH
58
eess.AS
723
eess.IV
478
eess.SP
1972


In [11]:
df_dict_generic = df_dict.copy()

In [12]:
for cat in categories:
    df_dict_generic[cat].drop(labels=['anymlcat', 'cs.AI', 'cs.CL', 'cs.LG', 'cs.SI', 'stat.ML', 'categories'],
                              axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


# File output

In [13]:
os.system("rm processed_data/arxiv_oaiaf_catkey.h5")

256

In [14]:
os.system("rm processed_data/per_category/*")

0

In [15]:
for cat in categories:
    fn = "processed_data/per_category/" + cat
    df_dict_generic[cat].to_csv(fn+".tsv", sep="\t", mode="w")
    df_dict_generic[cat].to_hdf(fn+".h5", key="df", mode="w")
    
    zip_str = "zip -9 " + fn+".tsv.zip " + fn + ".tsv"
    bzip2_str = "bzip2 -kz9 " + fn + ".tsv"
    xz_str = "xz -kz9 " + fn + ".tsv"

        
    #os.system(zip_str)
    #os.system(bzip2_str)
    os.system(xz_str)
    
    xz_size = round(os.path.getsize(fn+".tsv.xz")/1024/1024,2)
    
    print(fn, xz_size, "M")
    
    cat_key = cat.replace(".","_")
    cat_key = cat_key.replace("-","_")
    df_dict_generic[cat].to_hdf("processed_data/arxiv_oaiaf_catkey.h5",
                                key=cat_key, mode="a")
    

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->['abstract', 'arxiv_id', 'author_text', 'created', 'doi', 'title', 'updated', 'categories_str']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


processed_data/per_category/acc-phys 0.01 M
processed_data/per_category/adap-org 0.14 M
processed_data/per_category/alg-geom 0.25 M
processed_data/per_category/ao-sci 0.01 M
processed_data/per_category/astro-ph 70.97 M
processed_data/per_category/astro-ph.CO 13.57 M
processed_data/per_category/astro-ph.EP 5.07 M
processed_data/per_category/astro-ph.GA 11.32 M
processed_data/per_category/astro-ph.HE 9.32 M
processed_data/per_category/astro-ph.IM 4.71 M
processed_data/per_category/astro-ph.SR 11.65 M
processed_data/per_category/atom-ph 3.16 M
processed_data/per_category/bayes-an 0.01 M
processed_data/per_category/chao-dyn 0.5 M
processed_data/per_category/chem-ph 3.09 M
processed_data/per_category/cmp-lg 0.2 M
processed_data/per_category/comp-gas 0.06 M
processed_data/per_category/cond-mat 56.28 M
processed_data/per_category/cond-mat.dis-nn 3.74 M
processed_data/per_category/cond-mat.mes-hall 12.76 M
processed_data/per_category/cond-mat.mtrl-sci 13.28 M
processed_data/per_category/cond-m

In [16]:
!ls -lah "processed_data/per_category/*.xz"

ls: cannot access 'processed_data/per_category/*.xz': No such file or directory
