# Export .txt files for titles and abstracts

## Library imports

In [1]:
import pandas as pd
import glob
import datetime
import matplotlib.pyplot as plt
import matplotlib
import os
import zipfile
%matplotlib inline

In [2]:
def time_elapsed(start):
    end = datetime.datetime.now()

    time_to_run = end - start
    minutes = int(time_to_run.seconds/60)
    seconds = time_to_run.seconds % 60
    return "Total runtime: " + str(minutes) + " minutes, " + str(seconds) + " seconds"

In [3]:
start = datetime.datetime.now()


## Import data

There are two different folders for different slices of ArXiV: `per_category` and `per_year`. The easiest for getting the full dataset is to combine `per_year`.

Note: it is very important to specify the data types, particularly `arxiv_id`, as Pandas may assume they are floats based on some initial rows, when they are actually strings. 

In [4]:
dumpdate = "20191219"

In [5]:
datadir = "processed_data/" + dumpdate + "/per_year/"

In [6]:
files = glob.glob(datadir + "*.tsv.zip")
len(files)
files.sort()

In [7]:
dtypes = {
    "abstract": object,
    "acm_class": object,
    "arxiv_id": object,
    "author_text": object,
    "categories": object,
    "comments": object,
    "created": object,
    "doi": object,
    "num_authors": int,
    "num_categories": int,
    "primary_cat": object,
    "title": object,
    "updated": object,
    "created_ym": object
    }

In [8]:
df_all = pd.DataFrame()

for file in files:
    print(file)
    
    yearly_df = pd.read_csv(file,
                            sep="\t",
                            index_col=0,
                            compression='zip',
                            dtype=dtypes,
                            parse_dates=["created","updated"])
            
    df_all = df_all.append(yearly_df)
    
    print("Records this year: ", len(yearly_df), "Cumulative total: ", len(df_all), "\n")

processed_data/20190101/per_year/1993.tsv.zip
Records this year:  6728 Cumulative total:  6728 

processed_data/20190101/per_year/1994.tsv.zip
Records this year:  10085 Cumulative total:  16813 

processed_data/20190101/per_year/1995.tsv.zip
Records this year:  12994 Cumulative total:  29807 

processed_data/20190101/per_year/1996.tsv.zip
Records this year:  15875 Cumulative total:  45682 

processed_data/20190101/per_year/1997.tsv.zip
Records this year:  19621 Cumulative total:  65303 

processed_data/20190101/per_year/1998.tsv.zip
Records this year:  24174 Cumulative total:  89477 

processed_data/20190101/per_year/1999.tsv.zip
Records this year:  27694 Cumulative total:  117171 

processed_data/20190101/per_year/2000.tsv.zip
Records this year:  30672 Cumulative total:  147843 

processed_data/20190101/per_year/2001.tsv.zip
Records this year:  33128 Cumulative total:  180971 

processed_data/20190101/per_year/2002.tsv.zip
Records this year:  36103 Cumulative total:  217074 

processe

### Checking merged dataframe

In [9]:
len(df_all)

1480220

In [10]:
df_all = df_all.drop_duplicates()
len(df_all)

1480220

In [11]:
df_all.sample(2).transpose()

Unnamed: 0,768429,123208
abstract,LS I +61 303 is one of only a few high-mass X-...,We present a metamaterial-based random polariz...
acm_class,,
arxiv_id,0802.2363,1108.3954
author_text,"V. A. Acciari, M. Beilicke, G. Blaylock, S. M....","Xiaohui Ling, Hailu Luo, Chujun Zhao, Shuangch..."
categories,astro-ph,physics.optics
comments,accepted for publication in The Astrophysical ...,"5 pages, 6 figures"
created,2008-02-18 00:00:00,2011-08-19 00:00:00
doi,10.1086/587736,10.1364/AO.51.004749
num_authors,77,5
num_categories,1,1


# Export

In [12]:
df_all.abstract.to_csv("processed_data/" + dumpdate + "/arxiv-abstracts-all.txt", index=False)
df_all.title.to_csv("processed_data/" + dumpdate + "/arxiv-titles-all.txt", index=False)

In [13]:
abs_all_fn = "processed_data/" + dumpdate + "/arxiv-abstracts-all.txt"
title_all_fn = "processed_data/" + dumpdate + "/arxiv-titles-all.txt"

os.system("zip " + abs_all_fn + ".zip " + abs_all_fn)
os.system("zip " + title_all_fn + ".zip " + title_all_fn)

0

In [14]:
df_250k = df_all.sample(250000, random_state = 12345)
df_250k.abstract.to_csv("processed_data/" + dumpdate + "/arxiv-abstracts-250k.txt", index=False)
df_250k.title.to_csv("processed_data/" + dumpdate + "/arxiv-titles-250k.txt", index=False)

In [15]:
abs_250k_fn = "processed_data/" + dumpdate + "/arxiv-abstracts-250k.txt"
title_250k_fn = "processed_data/" + dumpdate + "/arxiv-titles-250k.txt"

os.system("zip " + abs_250k_fn + ".zip " + abs_250k_fn)
os.system("zip " + title_250k_fn + ".zip " + title_250k_fn)

0