In [None]:
# Installs feedparser, which the cells below import to parse the arXiv API feed.
# Written for Google Colab; in any other environment this cell can be skipped
# as long as feedparser is already installed.
!pip install feedparser

In [None]:
# Google Colab only - do not run this cell outside Colab (the google.colab import will fail).
# Mounts Google Drive at /content/drive so the CSV can be written under
# '/content/drive/My Drive/...' as suggested by the path comments in later cells.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import urllib.request as libreq
import sys
import time
import os
import csv
import feedparser
import logging
import random

In [None]:

# Archive prefixes to scrape; each one is expanded into a 'cat:<prefix>.*'
# search query by the scraping loop.
category = [
    "cs",
    "eess",
    "econ",
    "stat",
    "math",
    "physics",
    "q-bio",
    "q-fin",
]

# Query endpoint; the URL-encoded query string is appended directly after the '?'.
base_url = 'http://export.arxiv.org/api/query?'

In [None]:
# Opensearch metadata such as totalResults, startIndex,
# and itemsPerPage live in the opensearch namespace.
# Some entry metadata lives in the arXiv namespace.
# Registering both namespace URIs exposes them under the 'opensearch_' and
# 'arxiv_' attribute prefixes.
#
# The parser mixin is a private class whose location depends on the installed
# feedparser release: old releases expose it as feedparser._FeedParserMixin,
# newer ones keep it in the feedparser.mixin submodule. Look it up in both
# places so this cell does not crash with AttributeError on either layout.
_parser_mixin = getattr(feedparser, '_FeedParserMixin', None)
if _parser_mixin is None:
    _parser_mixin = getattr(getattr(feedparser, 'mixin', None),
                            '_FeedParserMixin', None)
if _parser_mixin is not None:
    _parser_mixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
    _parser_mixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

# Start a fresh output file containing only the header row; the scraping loop
# appends data rows to it afterwards.
# When running on Google Colab add the following: '/content/drive/My Drive/ArXiv_Dataset/' path before the csv file
with open('ArXiv_Data.csv', 'w', newline='', encoding='latin1') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(['title', 'abstract'])
# No explicit close needed: the with-statement closes the file on exit.

# Send log records to Details.log, overwriting any previous log (filemode='w').
logging.basicConfig(filename='Details.log',
                    format='%(asctime)s:[%(levelname)s]:%(message)s',
                    filemode='w')
logger = logging.getLogger()

# threshold of logger to DEBUG

logger.setLevel(logging.DEBUG)

In [None]:
# Helper used to detect text containing characters outside the 0-255 code point range.
def is_latin(s):
    """Return True when every character of ``s`` has a code point below 256.

    An empty string yields True because there is no offending character.
    """
    for ch in s:
        if ord(ch) >= 256:
            return False
    return True


In [None]:
print(".....Code is running....PLEASE don't close this window / shutdown the machine.....")

# ---- Tunables -------------------------------------------------------------
RESULTS_PER_REQUEST = 500           # page size requested per API call
MAX_RESULTS_PER_CATEGORY = 200000   # upper bound on records requested per category
MAX_FETCH_RETRIES = 10              # failed attempts tolerated for a single page
RETRY_DELAY_SECONDS = 20            # pause after a failed request
REQUEST_TIMEOUT_SECONDS = 60        # socket timeout so a dead connection cannot hang forever
# A record is kept only if its space counts fall inside these inclusive bounds.
TITLE_SPACES_RANGE = (4, 14)
ABSTRACT_SPACES_RANGE = (79, 249)
# When running on Google Colab add the following: '/content/drive/My Drive/ArXiv_Dataset/' path before the csv file
OUTPUT_CSV = 'ArXiv_Data.csv'


def _fetch_page(query):
    """GET ``base_url + query`` and return the raw response body.

    Retries on IOError (which covers connection failures and socket timeouts)
    after waiting RETRY_DELAY_SECONDS. Once more than MAX_FETCH_RETRIES
    attempts have failed, the last IOError is re-raised so the run stops with
    a visible traceback instead of relying on exit(), which is unreliable
    inside a notebook kernel.
    """
    failures = 0
    while True:
        try:
            with libreq.urlopen(base_url + query,
                                timeout=REQUEST_TIMEOUT_SECONDS) as url:
                return url.read()
        except IOError:
            failures += 1
            if failures > MAX_FETCH_RETRIES:
                logger.error('Too many IOError exceptions. They have likely hung up for good. Stopping.')
                raise
            logger.warning('IOError exception. Likely a connection time out. Trying again in 20s.')
            time.sleep(RETRY_DELAY_SECONDS)


def _single_line(text):
    """Collapse a multi-line string into one line, joining the lines with spaces."""
    return ' '.join(text.splitlines())


def _latin_only(text):
    """Drop every whitespace-separated word that is_latin() rejects.

    The surviving words are re-joined with single spaces so they stay
    separate words in the output.
    """
    return ' '.join(word for word in text.split() if is_latin(word))


for i in category:
    search_query = 'cat:' + i + '.*'
    ctr = 0                                   # rows written for this category
    max_total_results = MAX_RESULTS_PER_CATEGORY
    start = 0
    while start < max_total_results:
        query = 'search_query=%s&start=%i&max_results=%i' \
            % (search_query, start, RESULTS_PER_REQUEST)

        # Parse only a response that was actually received for this page.
        feed = feedparser.parse(_fetch_page(query))

        if start == 0:
            # The first page reports how many records the query matches; use it
            # to stop paging as soon as the category is exhausted. A missing
            # value is treated as 0 so a malformed feed ends the category
            # instead of raising.
            reported_total = int(feed.feed.get('opensearch_totalresults', 0))
            logger.info('\tTotal results for this category: (%s) = %s'
                        % (i, reported_total))
            max_total_results = min(MAX_RESULTS_PER_CATEGORY, reported_total)
        logger.info("\tScraped: %d records so far" % (start + len(feed.entries)))

        with open(OUTPUT_CSV, 'a', newline='', encoding='latin1') as csvFile:
            writer = csv.writer(csvFile)
            for entry in feed.entries:
                title = _single_line(entry.title)
                abstract = _single_line(entry.summary)
                title_spaces, abstract_spaces = title.count(' '), abstract.count(' ')
                if not (TITLE_SPACES_RANGE[0] <= title_spaces <= TITLE_SPACES_RANGE[1]
                        and ABSTRACT_SPACES_RANGE[0] <= abstract_spaces <= ABSTRACT_SPACES_RANGE[1]):
                    continue
                # The file is latin1-encoded, so text with code points >= 256
                # cannot be written as-is; strip the offending words first.
                if not (is_latin(title) and is_latin(abstract)):
                    title, abstract = _latin_only(title), _latin_only(abstract)
                writer.writerow([title, abstract])
                ctr += 1
        logger.info("\tWrote: %d records so far" % ctr)

        # Also stop if the feed reports a smaller page size than was requested.
        reported_page_size = int(feed.feed.get('opensearch_itemsperpage',
                                               RESULTS_PER_REQUEST))
        start += RESULTS_PER_REQUEST
        if start >= max_total_results or reported_page_size < RESULTS_PER_REQUEST:
            break
        # Time delay to avoid server hang up
        time.sleep(random.uniform(5, 10))
    logger.info('Completed category:(%s). %i records Wrote.' % (i, ctr))
    print('Completed category:(%s). %i records Wrote.' % (i, ctr))

print('Finished!')
