In [1]:
import ads
import pandas as pd
import requests
# ADS API token used by the `ads` client. It is an empty string in this copy of the
# notebook: fill in your own token before running, and never commit a real one.
# NOTE(review): the ads package may also pick a token up from the environment or a
# key file -- confirm against its documentation before relying on that.
ads.config.token="" 

In [2]:
# Search ADS for refereed papers whose full text mentions "tardis" together with
# either "kerzendorf" or (a Nature bibstem plus "supernova"), restricted by
# `year:2014-` (presumably 2014 onward -- confirm against the ADS search syntax).
# Only the four fields listed in `fl` are requested.
papers = ads.SearchQuery(
    q='(((full:"tardis" AND (full:"kerzendorf" OR (bibstem:"Natur" AND full:"supernova")))) AND year:2014-)+property:refereed',
    sort="date",
    fl=['bibcode', 'title', 'first_author', 'alternate_bibcode'],
)

# Walk the results once, keeping the requested fields of each paper together ...
records = [
    (paper.bibcode, paper.title, paper.first_author, paper.alternate_bibcode)
    for paper in papers
]

# ... then split the records column-wise into four parallel lists (one entry per paper).
bibcodes, titles, first_authors, arxiv_codes = (
    [record[column] for record in records] for column in range(4)
)

In [3]:
def format_arxiv_codes(arxiv_codes):
    """Convert ADS alternate bibcodes into searchable arXiv codes.

    ``paper.alternate_bibcode`` gives, per paper, either ``None`` or a list of
    bibcodes such as ``['2019MNRAS.tmp..239M', '2019arXiv190108582M']``. The
    arXiv entry looks like ``YYYYarXiv<digits><initial>``; this function turns
    it into ``arXiv:YYMM.NNNNN``, the form needed for manual searching. Removing
    the ``arXiv:`` header (needed for building URLs) is done by a separate
    function, because the header is required for manual searching.

    Parameters
    ----------
    arxiv_codes : iterable
        One entry per paper: ``None``, a list/tuple of alternate bibcode
        strings, or a single bibcode string.

    Returns
    -------
    list of str
        Exactly one string per input entry, in the same order: the formatted
        arXiv code, or ``'no arXiv'`` when the entry holds no bibcode
        containing ``'arXiv'``.
    """
    arxiv_codes_proper_format = []
    for alternates in arxiv_codes:
        # Normalise the entry to a sequence of candidate bibcodes. None (no
        # alternates at all) and any other unexpected value mean "no arXiv".
        if isinstance(alternates, str):
            alternates = [alternates]
        elif not isinstance(alternates, (list, tuple)):
            alternates = []

        # The arXiv bibcode can sit at any position in the list (or be absent
        # altogether), so search for it instead of assuming index 0 or 1.
        arxiv_bibcode = next(
            (code for code in alternates if isinstance(code, str) and 'arXiv' in code),
            None,
        )
        if arxiv_bibcode is None:
            arxiv_codes_proper_format.append('no arXiv')
            continue

        code = arxiv_bibcode[4:]  # drop the leading 4-digit year
        if code[5:6] != ':':  # 'arXiv1911...' -> 'arXiv:1911...'
            code = code[:5] + ':' + code[5:]
        if code[10:11] != '.':  # newer 5-digit IDs come without the dot
            code = code[:10] + '.' + code[10:]
        if code and not code[-1].isdigit():
            # Trailing author initial, e.g. the 'C' in '...05209C'.
            code = code[:-1]

        # Always emit a result so the output stays aligned with the input,
        # including when the code already ended in a digit.
        arxiv_codes_proper_format.append(code)

    return arxiv_codes_proper_format

In [4]:
# Formatted codes, one per paper, that still carry the "arXiv" header
# ("no arXiv" where a paper has no arXiv entry).
arxiv_codes_with_header = format_arxiv_codes(arxiv_codes)

def no_arxiv_header(arxiv_codes_with_header):
    """Strip the leading header from each formatted arXiv code.

    Every entry other than the ``"no arXiv"`` placeholder loses its first six
    characters (the ``arXiv:`` prefix of codes such as ``arXiv:1911.05209``),
    leaving the bare identifier that can be used with the requests module.
    Placeholders are passed through unchanged, so the result has one entry
    per input entry, in the same order.
    """
    header_length = 6  # number of leading characters removed from each code
    return [
        code if code == "no arXiv" else code[header_length:]
        for code in arxiv_codes_with_header
    ]


# Bare identifiers (first six characters removed) for building request URLs.
arxiv_codes_to_request = no_arxiv_header(arxiv_codes_with_header)

def get_paper_urls(arxiv_codes_to_request):
    """Build the arXiv PDF URL for every bare arXiv identifier.

    Each identifier is substituted into ``https://arxiv.org/pdf/<id>.pdf``.
    Entries equal to ``'no arXiv'`` are passed through unchanged, so the
    returned list has one element per input element, in the same order.
    """
    # Nothing is fetched here: the URLs are only assembled as strings. An
    # earlier draft resolved each one with
    #     requests.get('https://arxiv.org/pdf/{}.pdf'.format(arxiv_code)).url
    # (reported as very slow); interacting with the PDFs is left for the future.
    url_template = 'https://arxiv.org/pdf/{}.pdf'
    return [
        'no arXiv' if arxiv_code == 'no arXiv' else url_template.format(arxiv_code)
        for arxiv_code in arxiv_codes_to_request
    ]

# Display the PDF URL (or 'no arXiv') built for each paper.
get_paper_urls(arxiv_codes_to_request)

['https://arxiv.org/pdf/1911.05209.pdf',
 'https://arxiv.org/pdf/2005.05972.pdf',
 'https://arxiv.org/pdf/2007.12110.pdf',
 'no arXiv',
 'https://arxiv.org/pdf/2002.00393.pdf',
 'https://arxiv.org/pdf/1912.04313.pdf',
 'https://arxiv.org/pdf/1908.03001.pdf',
 'https://arxiv.org/pdf/2001.09722.pdf',
 'https://arxiv.org/pdf/1912.07603.pdf',
 'https://arxiv.org/pdf/1911.04444.pdf',
 'https://arxiv.org/pdf/1909.04545.pdf',
 'https://arxiv.org/pdf/1910.10510.pdf',
 'https://arxiv.org/pdf/1812.11692.pdf',
 'https://arxiv.org/pdf/1907.09840.pdf',
 'https://arxiv.org/pdf/1712.10091.pdf',
 'https://arxiv.org/pdf/1901.08582.pdf',
 'https://arxiv.org/pdf/1812.08695.pdf',
 'https://arxiv.org/pdf/1902.01904.pdf',
 'https://arxiv.org/pdf/1901.05500.pdf',
 'https://arxiv.org/pdf/1811.02543.pdf',
 'https://arxiv.org/pdf/1810.07165.pdf',
 'https://arxiv.org/pdf/1808.00448.pdf',
 'https://arxiv.org/pdf/1807.05965.pdf',
 'https://arxiv.org/pdf/1705.10340.pdf',
 'https://arxiv.org/pdf/1803.04436.pdf',
 'h

In [5]:
# Assemble one row per paper from the parallel lists (all columns must have the
# same length), then save the table next to the notebook as adslist.csv.
d = {
    'bibcode': bibcodes,
    'title': titles,
    'first author': first_authors,
    'arxiv': format_arxiv_codes(arxiv_codes),
}
df = pd.DataFrame(d)
df.to_csv('adslist.csv')

The `.alternate_bibcode` attribute returns the arXiv code with the year prepended and its usual punctuation removed (for example `2019arXiv191105209C`). To search on the website, the input needs to be in the form `arXiv:####.#####` (for example `arXiv:1911.05209`). The function `format_arxiv_codes` converts the codes into this searchable format.

In [6]:
# Display the assembled table as a visual check of the formatted arXiv column.
df

Unnamed: 0,bibcode,title,first author,arxiv
0,2020ApJS..250...12C,[Artificial Intelligence-Assisted Inversion (A...,"Chen, Xingzhuo",arXiv:1911.05209
1,2020ApJ...898...56M,[The Spectacular Ultraviolet Flash from the Pe...,"Miller, A. A.",arXiv:2005.05972
2,2020MNRAS.497..246G,[AT2018kzr: the merger of an oxygen-neon white...,"Gillanders, J. H.",arXiv:2007.12110
3,2020ApJS..249....8B,[Credit Lost: Two Decades of Software Citation...,"Bouquin, Daina R.",no arXiv
4,2020MNRAS.496.1132T,[Observations of the low-luminosity Type Iax s...,"Tomasella, Lina",arXiv:2002.00393
5,2020MNRAS.494.5811L,[An asymmetric explosion mechanism may explain...,"Livneh, Ran",arXiv:1912.04313
6,2020ApJ...893..143K,[SN 2019ein: New Insights into the Similaritie...,"Kawabata, Miho",arXiv:1908.03001
7,2020ApJ...892L..24S,[The Lowest of the Low: Discovery of SN 2019gs...,"Srivastav, Shubham",arXiv:2001.09722
8,2020A&A...634A..37M,[Determining the <SUP>56</SUP>Ni distribution ...,"Magee, M. R.",arXiv:1912.07603
9,2020A&A...633A..88V,[Spectral modeling of type II supernovae. II. ...,"Vogl, C.",arXiv:1911.04444


In [7]:
# Print each paper's bibcode next to its raw alternate bibcodes, to inspect
# what ADS returned before any formatting is applied.
combo_list = zip(bibcodes, arxiv_codes)
for bibcode, alternates in combo_list:
    print((bibcode, alternates))

('2020ApJS..250...12C', ['2019arXiv191105209C'])
('2020ApJ...898...56M', ['2020arXiv200505972M'])
('2020MNRAS.497..246G', ['2020arXiv200712110G'])
('2020ApJS..249....8B', None)
('2020MNRAS.496.1132T', ['2020arXiv200200393T', '2020MNRAS.tmp.1787T'])
('2020MNRAS.494.5811L', ['2019arXiv191204313L', '2020MNRAS.tmp.1131L'])
('2020ApJ...893..143K', ['2019arXiv190803001K'])
('2020ApJ...892L..24S', ['2020arXiv200109722S'])
('2020A&A...634A..37M', ['2019arXiv191207603M'])
('2020A&A...633A..88V', ['2019arXiv191104444V'])
('2019ApJ...885L..23M', ['2019arXiv190904545M'])
('2019Natur.574..497W', ['2019arXiv191010510W'])
('2019MNRAS.487.2538J', ['2018arXiv181211692J', '2019MNRAS.tmp.1251J'])
('2019LRCA....5....1N', ['2019arXiv190709840N'])
('2019ApJ...876..148C', ['2017arXiv171210091C'])
('2019MNRAS.484.4785M', ['2019MNRAS.tmp..239M', '2019arXiv190108582M'])
('2019A&A...622A.102M', ['2018arXiv181208695M'])
('2019ApJ...871..250H', ['2019arXiv190201904H'])
('2019Natur.565..324I', ['2019arXiv190105500I

In [8]:
# Print the raw alternate bibcodes beside the formatted arXiv code, one pair
# per paper, to check the conversion by eye.
comparison = zip(arxiv_codes, format_arxiv_codes(arxiv_codes))
for raw_codes, formatted_code in comparison:
    print((raw_codes, formatted_code))

(['2019arXiv191105209C'], 'arXiv:1911.05209')
(['2020arXiv200505972M'], 'arXiv:2005.05972')
(['2020arXiv200712110G'], 'arXiv:2007.12110')
(None, 'no arXiv')
(['2020arXiv200200393T', '2020MNRAS.tmp.1787T'], 'arXiv:2002.00393')
(['2019arXiv191204313L', '2020MNRAS.tmp.1131L'], 'arXiv:1912.04313')
(['2019arXiv190803001K'], 'arXiv:1908.03001')
(['2020arXiv200109722S'], 'arXiv:2001.09722')
(['2019arXiv191207603M'], 'arXiv:1912.07603')
(['2019arXiv191104444V'], 'arXiv:1911.04444')
(['2019arXiv190904545M'], 'arXiv:1909.04545')
(['2019arXiv191010510W'], 'arXiv:1910.10510')
(['2018arXiv181211692J', '2019MNRAS.tmp.1251J'], 'arXiv:1812.11692')
(['2019arXiv190709840N'], 'arXiv:1907.09840')
(['2017arXiv171210091C'], 'arXiv:1712.10091')
(['2019MNRAS.tmp..239M', '2019arXiv190108582M'], 'arXiv:1901.08582')
(['2018arXiv181208695M'], 'arXiv:1812.08695')
(['2019arXiv190201904H'], 'arXiv:1902.01904')
(['2019arXiv190105500I'], 'arXiv:1901.05500')
(['2018arXiv181102543V'], 'arXiv:1811.02543')
(['2018arXiv181