#### General Notes

This notebook performs *web scraping* of the arXiv.org website with Python. It supports:
- search by keywords
- search by specific year

by querying the website directly.

The notebook also uses the **arxiv** package for Python, which looks a paper up by its ID and returns information about it, such as the authors, title, publication date, etc. The paper ID itself is obtained by scraping the arXiv.org website.

The results of the search are:
- titles
- link to the paper info page
- link to download the PDF
- link to download the source code folder

which are stored in a *.csv* file at the end of the search. The notebook is interactive: the user chooses the order in which results are displayed:
- relevance
- submission date (newest first)
- submission date (oldest first)

and what to download.

The notebook also provides a *download* function.

##### Update //TODOs

Currently working on:
- implement a search that returns only papers for which more than one version has already been uploaded

In [30]:
from selenium import webdriver
import sys
import os
import pandas as pd
import tarfile
from urllib import request
import arxiv

In [31]:
# Module-level state shared by the search, paging and download functions.

# Maximum number of results handled per results page.
MAX_NUM_PER_PAGE = 50

# Paging counters: results collected so far, size of the page currently being
# processed, and results not yet fetched.
TOTAL_RESULTS = 0
CURRENT_SIZE = 0
REMAINING_ELEMENTS = 0

# Result columns; each list receives one entry per scraped paper, so the same
# index refers to the same paper across all of them.
PAPER_IDS = []
TITLES = []
LINK_INFO = []
LINK_PDFS = []
LINK_SOURCES = []

In [32]:
# arXiv advanced-search form; every search starts from this page.
website = "https://arxiv.org/search/advanced"

# Headless mode: to keep the browser window hidden, uncomment the three lines
# below and use them in place of the plain webdriver.Firefox() call.
#options = webdriver.FirefoxOptions()
#options.add_argument("--headless")
#driver = webdriver.Firefox(options = options)

# Start a Firefox session and load the search form. `driver` is the
# module-level browser handle used by the functions in this notebook.
driver = webdriver.Firefox() 
driver.get(website)

In [33]:
# functions to interact with the user and start the search process

def get_info_from_user():
    """Ask the user for the search parameters and submit the arXiv search form.

    Prompts for keywords, an optional year and a result ordering, then drives
    the page through the module-level ``driver``: fills the form, submits it,
    selects the requested ordering and applies it.

    Any invalid answer (a year that is not a number, a year before 2007, or an
    ordering other than 1/2/3) prints "Try again" and ends the program with
    ``sys.exit(0)``.
    """
    terms = input("Enter keywords for your search: ")
    q1 = input("\nAre you looking for papers of a specific year? (y/n): ")
    if q1 == "y":
        year = input("Enter year (2007-YYYY): ")
        # Validate before touching the form so bad input never reaches the page.
        if not (year.isdigit() and int(year) >= 2007):
            print("Try again")
            sys.exit(0)
        driver.find_element('xpath', '//input[@id="date-year"]').send_keys(year)
    else:
        # No year requested: click the first date-filter option instead
        # (presumably "all dates" -- confirm against the form).
        driver.find_element('xpath', '//input[@id="date-filter_by-0"]').click()

    driver.find_element('xpath', '//input[@id="terms-0-term"]').send_keys(terms)
    driver.find_element('xpath', '//button[@class="button is-link is-medium"]').click()
    select_order = input("\nWhich order do you prefer? Choose an option: \n1. Relevance\n" +
                "2. Submission Date (oldest first)\n3. Submission Date (newest first) \n"
                + "Insert a number: "
                )
    # choose_order() returns None for an unknown choice; calling .click() on it
    # would raise AttributeError, so reject it like the other invalid inputs.
    order_option = choose_order(select_order)
    if order_option is None:
        print("Try again")
        sys.exit(0)
    order_option.click()
    driver.find_element('xpath', '//button[@class="button is-small is-link"]').click()

def choose_order(select_order):
    """Return the ordering ``<option>`` element matching the user's choice.

    Args:
        select_order: the user's answer as a string: '1' (relevance),
            '2' (submission date, oldest first) or
            '3' (submission date, newest first).

    Returns:
        The matching element of the ``order`` drop-down, or ``None`` when
        ``select_order`` is not one of the supported choices.
    """
    option_xpaths = {
        '1': '//select[@id="order"]/option[5]',
        '2': '//select[@id="order"]/option[4]',
        '3': '//select[@id="order"]/option[3]',
    }
    xpath = option_xpaths.get(select_order)
    if xpath is None:
        return None
    # Only the selected option is located; the browser is not queried for the
    # other two, nor at all for an invalid choice.
    return driver.find_element('xpath', xpath)

The following cell contains all the functions needed to manage result pages.
By default, the maximum number of elements per page is 50. If the total number of elements for the whole search is greater than 50, the user is asked whether more results are needed; if so, the next page is loaded, otherwise all the results found up to that point are saved in the .csv file.

In [34]:
# handle the first results page: work out its size, then scrape its papers
def first_page():
    """Size the first results page and extract the papers listed on it."""
    first_page_size()
    # CURRENT_SIZE has just been updated by first_page_size().
    extract_search_results(CURRENT_SIZE)

# utility function to get the size of the first page
def first_page_size():
    """Read the total result count from the page heading and size page one.

    Takes the second-to-last word of the results heading as the total number
    of results, stores it in ``REMAINING_ELEMENTS`` and lets ``get_size()``
    derive ``CURRENT_SIZE`` for the first page.

    If that word is not a number, prints "No results found" and ends the
    program with ``sys.exit(0)``.
    """
    global CURRENT_SIZE
    global REMAINING_ELEMENTS
    heading = driver.find_element('xpath', '/html/body/main/div[1]/div[1]/h1').text
    words = heading.split(" ")
    # Drop thousands separators ("1,234" -> "1234") so large counts still parse;
    # a heading with fewer than two words simply counts as "no number".
    list_size = words[-2].replace(",", "") if len(words) >= 2 else ""
    # isdigit must be *called*: the bare method object is always truthy, which
    # made the "no results" branch unreachable and let int() raise ValueError.
    if list_size.isdigit():
        REMAINING_ELEMENTS = int(list_size)
        get_size(REMAINING_ELEMENTS)
    else:
        CURRENT_SIZE = 0
        print("No results found")
        sys.exit(0)

# move to the following results page and scrape it
def next_page():
    """Click through to the next results page and extract its papers."""
    next_link = driver.find_element('xpath', '/html/body/main/div[2]/nav[1]/a[2]')
    next_link.click()
    # Recompute how many papers this page holds before scraping them.
    get_size(REMAINING_ELEMENTS)
    extract_search_results(CURRENT_SIZE)

def next_page_size():
    """Update the paging counters for the next page from the remaining count."""
    still_to_fetch = REMAINING_ELEMENTS
    get_size(still_to_fetch)

# utility function to compute the size of the current page
def get_size(size):
    """Set ``CURRENT_SIZE`` for the current page and update ``REMAINING_ELEMENTS``.

    Args:
        size: number of results still to be fetched (callers pass
            ``REMAINING_ELEMENTS``).

    When ``size`` exceeds ``MAX_NUM_PER_PAGE`` the page is full and the
    remaining count drops by one page; otherwise the page takes everything
    that is left and nothing remains.
    """
    global CURRENT_SIZE, REMAINING_ELEMENTS
    if size <= MAX_NUM_PER_PAGE:
        # last page: it holds whatever is still outstanding
        CURRENT_SIZE, REMAINING_ELEMENTS = REMAINING_ELEMENTS, 0
        return
    CURRENT_SIZE = MAX_NUM_PER_PAGE
    REMAINING_ELEMENTS = REMAINING_ELEMENTS - MAX_NUM_PER_PAGE

# collect id, title and the info / pdf / source links of each paper on the page
def extract_search_results(size):
    """Scrape the first ``size`` entries of the current results page.

    For each entry the paper ID is read from the entry's link text (the part
    after the last ":"), the title is fetched through the ``arxiv`` package,
    and the ID, title and the three derived URLs are appended to the
    module-level result lists. ``TOTAL_RESULTS`` is increased by ``size``.

    Args:
        size: number of entries to read from the current page.
    """
    global TOTAL_RESULTS
    TOTAL_RESULTS += size

    item_prefix = '/html/body/main/div[2]/ol/li['
    item_suffix = ']/div/p/a'
    # XPath positions are 1-based.
    for position in range(1, size + 1):
        anchors = driver.find_elements('xpath', item_prefix + str(position) + item_suffix)
        paper_id = anchors[0].text.split(":")[-1]
        metadata = next(arxiv.Search(id_list=[paper_id]).results())

        PAPER_IDS.append(paper_id)
        TITLES.append(metadata.title)
        LINK_PDFS.append("https://arxiv.org/pdf/" + paper_id + ".pdf")
        LINK_INFO.append("https://arxiv.org/abs/" + paper_id)
        LINK_SOURCES.append("https://arxiv.org/e-print/" + paper_id)

    print("\nDone! " + str(size) + " results found.")


def ask_for_more():
    """Ask whether to load another results page and act on the answer.

    On "y" the next page is loaded and scraped. On any other answer the
    pagination is ended by setting ``REMAINING_ELEMENTS`` to 0, so that the
    caller's ``while REMAINING_ELEMENTS > 0`` loop terminates; the results are
    then saved by the ``create_db()`` call that follows that loop.
    """
    global REMAINING_ELEMENTS
    q = input("\nDo you want more results? (y/n): ")
    if q == "y":
        next_page()
    else:
        # Previously this branch called create_db() and left the counter
        # untouched: the caller's loop never ended, and the CSV write and
        # driver.quit() were repeated on every further answer.
        REMAINING_ELEMENTS = 0


def create_db():
    """Write the collected results to ``results.csv`` and close the browser.

    Builds a table with one row per paper from the module-level result lists,
    saves it as CSV, reports how many results were saved and quits ``driver``.
    """
    columns = {
        'Paper ID': PAPER_IDS,
        'Title': TITLES,
        'Paper Info': LINK_INFO,
        'Link PDF': LINK_PDFS,
        'Link Source': LINK_SOURCES,
    }
    pd.DataFrame(columns).to_csv('results.csv')
    print(str(TOTAL_RESULTS) + " results saved to results.csv")
    # The search is over once the results are saved, so release the browser.
    driver.quit()

In [35]:
def ask_download():
    """Ask how many papers to download and in which form, then download them.

    The count must be a whole number between 1 and ``TOTAL_RESULTS``; the
    format must be 1 (PDF), 2 (source folder) or 3 (both). Any other answer
    prints "Try again" and ends the program with ``sys.exit(0)``.
    """
    to_print = input("How many files do you want to download? (1-" + str(TOTAL_RESULTS) + "): ")
    # Reject bad counts up front: a non-number would raise ValueError in the
    # download loops, and a count above TOTAL_RESULTS would raise IndexError
    # part-way through the downloads.
    if not (to_print.isdigit() and 1 <= int(to_print) <= TOTAL_RESULTS):
        print("Try again")
        sys.exit(0)

    q = input("\nWhat do you want to download? (1/2/3)\n" + "1. PDF version\n2." +
            " Source code folder\n3. Both (1/2/3)\nInsert number: ")
    handlers = {"1": download_pdf, "2": download_source, "3": download_both}
    handler = handlers.get(q)
    if handler is None:
        print("Try again")
        sys.exit(0)
    # The count is passed on as the original string; the handlers convert it.
    handler(to_print)

def download_pdf(to_print):
    """Download the PDF of the first ``to_print`` collected papers.

    Each file is saved in the working directory as ``<paper id>.pdf``.

    Args:
        to_print: number of papers to download (anything ``int()`` accepts).
    """
    count = int(to_print)
    for idx in range(count):
        request.urlretrieve(LINK_PDFS[idx], PAPER_IDS[idx] + ".pdf")
    print("Download complete!")

def download_source(to_print):
    """Download the source archive of the first ``to_print`` collected papers.

    Each archive is saved as ``<paper id>.tar.gz``. The user is first asked
    whether the archives should also be unpacked; on "y" every archive is
    passed to ``extract()`` right after it has been downloaded.

    Args:
        to_print: number of papers to download (anything ``int()`` accepts).
    """
    should_extract = input("\nDo you also want to extract the folder? (y/n): ") == "y"
    for idx in range(int(to_print)):
        archive = request.urlretrieve(LINK_SOURCES[idx], PAPER_IDS[idx] + ".tar.gz")
        if should_extract:
            # urlretrieve returns (local filename, headers)
            extract(archive[0])
    if should_extract:
        print("Download and extraction complete!")
    else:
        print("Download complete!")
    
def download_both(to_print):
    """Download both the PDF and the source archive of the first papers.

    For each of the first ``to_print`` collected papers, saves
    ``<paper id>.pdf`` and ``<paper id>.tar.gz`` in the working directory,
    then reports completion like the other download functions do.

    Args:
        to_print: number of papers to download (anything ``int()`` accepts).
    """
    for i in range(int(to_print)):
        request.urlretrieve(LINK_PDFS[i], PAPER_IDS[i] + ".pdf")
        request.urlretrieve(LINK_SOURCES[i], PAPER_IDS[i] + ".tar.gz")
    # Without this message this was the only download path that finished
    # silently.
    print("Download complete!")

def extract(filename):
    """Unpack a downloaded ``.tar.gz`` archive into its own folder.

    The archive is extracted to ``../jupyter/<name>``, where ``<name>`` is
    ``filename`` up to its first ``.tar.gz``.

    The archive comes from the network, so members are not trusted: entries
    that would be written outside the destination folder are refused.

    Args:
        filename: path of the ``.tar.gz`` file to unpack.

    Raises:
        tarfile.TarError: if the file is not a readable gzip-compressed tar
            archive, or if a member would escape the destination folder.
    """
    folder_name = filename.split(".tar.gz")[0]
    destination = os.path.join("../jupyter", folder_name)
    with tarfile.open(filename, "r:gz") as tar:
        if hasattr(tarfile, "data_filter"):
            # Extraction filters are available: let tarfile reject unsafe
            # members (paths outside the destination, absolute links, ...).
            tar.extractall(path=destination, filter="data")
        else:
            # Older Python without extraction filters: check manually that
            # every member path stays inside the destination.
            root = os.path.realpath(destination)
            for member in tar.getmembers():
                target = os.path.realpath(os.path.join(root, member.name))
                if os.path.commonpath([root, target]) != root:
                    raise tarfile.TarError(
                        "unsafe path in archive: " + member.name
                    )
            tar.extractall(path=destination)

In [None]:
# main
# Interactive entry point: run a search, page through the results on demand,
# save them to results.csv, then offer to download the papers.

if __name__ == "__main__":
    # Fill in and submit the search form from the user's answers.
    get_info_from_user()
    # Scrape the first page of results.
    first_page()
    # Offer further pages while some results have not been fetched yet.
    # NOTE(review): this loop only ends once REMAINING_ELEMENTS reaches 0, so
    # ask_for_more() has to bring it to 0 when the user declines -- verify.
    while REMAINING_ELEMENTS > 0:
        ask_for_more()
    # Write the collected results to results.csv and close the browser.
    create_db()
    # Finally, let the user download PDFs and/or source archives.
    ask_download()