In [1]:
import os

import ads
import pandas as pd
import requests
# Read the ADS API token from the ADS_DEV_KEY environment variable rather than
# pasting it into the notebook, so the credential is never saved or committed.
# Falls back to the original empty string when the variable is not set.
ads.config.token = os.environ.get("ADS_DEV_KEY", "")

In [2]:
# Search ADS for refereed papers from 2014 onward whose full text mentions
# "tardis" together with either "kerzendorf" or (bibstem "Natur" and the word
# "supernova"). Results are sorted by date and only the fields in `fl` are
# requested.
papers = ads.SearchQuery(
    q='(((full:"tardis" AND (full:"kerzendorf" OR (bibstem:"Natur" AND full:"supernova")))) AND year:2014-)+property:refereed',
    sort="date",
    fl=['bibcode', 'title', 'first_author', 'alternate_bibcode'],
)

# Parallel lists: position i in each list describes the same paper.
# arxiv_codes holds the raw alternate_bibcode value of each paper, which the
# printed pairs further down show to be either a list of strings or None.
bibcodes, titles, first_authors, arxiv_codes = [], [], [], []
for paper in papers:
    bibcodes.append(paper.bibcode)
    titles.append(paper.title)
    first_authors.append(paper.first_author)
    arxiv_codes.append(paper.alternate_bibcode)

In [3]:
# Write the collected article titles, one per line, to a new file called
# 'article title list'. The file is created in the current working directory
# (the main tardis_impact folder when the notebook is run from there).
#
# The `with` block guarantees the file is closed even if printing fails, and
# the explicit UTF-8 encoding keeps non-ASCII characters in titles from raising
# on platforms whose default encoding is not UTF-8.
with open('article title list', 'w', encoding='utf-8') as article_list:
    for line in titles:
        print(line, file=article_list)

In [4]:
def format_arxiv_codes(arxiv_codes):
    """Convert ADS alternate bibcodes into searchable arXiv identifiers.

    ``paper.alternate_bibcode`` yields entries such as ``'2019arXiv191105209C'``
    (4-digit year, ``arXiv`` header, digits, trailing letter), whereas a search
    needs ``'arXiv:1911.05209'``. The ``arXiv`` header is kept here because it is
    needed for manual searching; removing it for use with the requests module
    is done in a different function.

    Parameters
    ----------
    arxiv_codes : iterable
        One entry per paper: a list of alternate bibcode strings, a single
        bibcode string, or ``None`` when the paper has no alternate bibcodes.

    Returns
    -------
    list of str
        Exactly one string per input entry, in the same order: the formatted
        ``'arXiv:####.#####'`` code, or ``'no arXiv'`` when none of the paper's
        alternate bibcodes contains ``'arXiv'``.
    """
    arxiv_codes_proper_format = []
    for alternates in arxiv_codes:
        if isinstance(alternates, str):
            alternates = [alternates]

        # Pick the first alternate bibcode carrying the arXiv header, wherever
        # it sits in the list. None / empty lists simply yield no match.
        raw_code = next(
            (code for code in (alternates or []) if 'arXiv' in code),
            None,
        )
        if raw_code is None:
            arxiv_codes_proper_format.append('no arXiv')
            continue

        # Drop everything in front of the header (normally the 4-digit year).
        code = raw_code[raw_code.find('arXiv'):]

        # Restore the punctuation ADS strips out: 'arXiv' + ':' + 4 digits + '.'
        if code[5:6] != ':':
            code = code[:5] + ':' + code[5:]
        if code[10:11] != '.':
            code = code[:10] + '.' + code[10:]

        # Remove the trailing non-digit character when present. A code that
        # already ends in a digit is kept as-is (and still appended, so the
        # output always stays aligned with the input).
        if not code[-1].isdigit():
            code = code[:-1]

        arxiv_codes_proper_format.append(code)

    return arxiv_codes_proper_format

In [5]:
# Run the formatter over the alternate bibcodes collected from the ADS query.
arxiv_codes_with_header = format_arxiv_codes(arxiv_codes)

def no_arxiv_header(arxiv_codes_with_header):
    """Strip the leading arXiv header from every formatted code.

    The header-less identifier is the form usable with the requests module.
    Entries equal to "no arXiv" are passed through unchanged; every other
    entry loses its first six characters (the length of ``arXiv:``).
    """
    header_length = 6
    return [
        "no arXiv" if entry == "no arXiv" else entry[header_length:]
        for entry in arxiv_codes_with_header
    ]


# Header-less versions of the formatted codes, used below to build PDF URLs.
arxiv_codes_to_request = no_arxiv_header(arxiv_codes_with_header)

def get_paper_urls(arxiv_codes_to_request):
    """Build the arXiv PDF URL for every header-less arXiv code.

    Entries equal to 'no arXiv' produce 'no arXiv' in the same position, so
    the returned list has one item per input item. No network request is made:
    each URL is only formatted from the code. An earlier approach fetched every
    URL with ``requests.get`` and stored ``r.url``, which was incredibly slow;
    interacting with the fetched PDFs is left for the future.
    """
    pdf_url_template = 'https://arxiv.org/pdf/{}.pdf'
    return [
        pdf_url_template.format(code) if code != 'no arXiv' else 'no arXiv'
        for code in arxiv_codes_to_request
    ]

In [6]:
# One column per collected field plus the PDF link built from the arXiv codes.
# pandas requires every column list to have the same length.
d = {
    'bibcode': bibcodes,
    'title': titles,
    'first author': first_authors,
    'pdf link': get_paper_urls(arxiv_codes_to_request),
}
df = pd.DataFrame(data=d)

# Save the table next to the notebook (the row index is written as well).
df.to_csv('adslist.csv')

`paper.alternate_bibcode` returns each arXiv identifier with the year prepended and its punctuation removed (e.g. `2019arXiv191105209C`). To search on the arXiv website, the identifier must instead have the form `arXiv:####.#####` (e.g. `arXiv:1911.05209`). The function `format_arxiv_codes` converts the codes into this searchable format.

In [7]:
# Sanity check: download the PDF behind the last URL in the list and look at
# its raw contents.
# NOTE: if the last entry happens to be 'no arXiv' it is not a valid URL and
# the request below will fail.
url_list = get_paper_urls(arxiv_codes_to_request)
url_test = url_list[-1]

# requests waits forever by default; a timeout keeps a stalled connection from
# hanging the kernel indefinitely.
r = requests.get(url_test, timeout=30)

# The response body is a binary PDF rather than text. ISO-8859-1 maps every
# byte to a character, so decoding cannot fail, but the result is only useful
# for eyeballing the start of the file (e.g. the '%PDF' marker).
r.encoding = 'ISO-8859-1'
r.text

'%PDF-1.5\n%\x8f\n234 0 obj\n<< /Filter /FlateDecode /Length 6098 >>\nstream\nxÚ¥<É\x92Ü6²w\x7f\x85N\x8aê\x88.\x8a\x00÷7ñ\x0e\x92e?÷xäç\x90:b\x0e3s`U±»h³H\x0eAZ.\x7fýä\x06p\x11JcË\x97\x06\x90X\x08$\x12¹W¿z|ÿ£zñl¾\n_<¿\x08_üßWo\x1e¿zõm¢_(\x15\x14\t\x94\x8fO/b\x1d(¥_da\x1a\x84\x089½øÇNßýëñ¯00[\x0eÔQP(X\x85F|\x7f\x17%»jø\xadjOÝðt·×I¸{ÉÅ\x87ú\x82ÓÝ7é\x83yP¤i\x8cËìy\x1d)h±ñã\x9d\nwÝÝ>ÒáîT?Ýéd÷T\rU;\x02H\x15»þ|\x97ì®¦>\x1a\x1eb¦ÃþÒ\x9d¦¦2÷\x0cyê\x8e\x931uûÌ\x13º\x96Áã¹ª\x07®Ö\x97¾\x1bÆ²=V¸9ØØ^\x85p´\x82wP·¼w\x98À\x15X¾j\x1aZ\x10\x9b\x9d\x1cññ.\x0fw×^\x06=\x94\\\x9a©¯\x86¶»SÉî\x17Ü{)ýÿ\x0c\x93ðÃ\x0fn04U@_WYPD\x11o!\x15\x9c\x7f\x18§S]\x99å÷Ôn1\x9bËÉÈ\x9eÔN½eÐ\x11¶j\x184\x9eËÑ\x02/\x87º\x95ÉCU6µ\x19ë£\x8c\x82ö¸§\x8dì\x97;±È¸\x00â\x11ÓYNû\x88²bWwmý[9Öw\x8aq\x0b\xa0²=ñ\x98\nÏýë±\x1e¡\x9f;s:y\x15<\x07<ôM9Ø\x8ejä²l\x02®¨¢HÿÂÃ~Ä\x85Ê©Ùûnè4\x94Ç3ì?\ri\r,i\r¬È\x1aX}Ótí\x89nÓ7P\x87*Â[àÖ¹¼\x03Âø\x05¿ZÙ\x81\x80\x9aÓ\x95\x1bæÜáe~\x94Å\x04·)\x13\x89g\x87\x95Ðíq¤ÛHå\x12\x93Ý©êËa\x9c\x86JàOCwáZÓ\x1dË\x86\x07

In [8]:
# Bare last expression: the notebook renders the DataFrame as a table.
df

Unnamed: 0,bibcode,title,first author,pdf link
0,2020ApJS..250...12C,[Artificial Intelligence-Assisted Inversion (A...,"Chen, Xingzhuo",https://arxiv.org/pdf/1911.05209.pdf
1,2020ApJ...898...56M,[The Spectacular Ultraviolet Flash from the Pe...,"Miller, A. A.",https://arxiv.org/pdf/2005.05972.pdf
2,2020MNRAS.497..246G,[AT2018kzr: the merger of an oxygen-neon white...,"Gillanders, J. H.",https://arxiv.org/pdf/2007.12110.pdf
3,2020ApJS..249....8B,[Credit Lost: Two Decades of Software Citation...,"Bouquin, Daina R.",no arXiv
4,2020MNRAS.496.1132T,[Observations of the low-luminosity Type Iax s...,"Tomasella, Lina",https://arxiv.org/pdf/2002.00393.pdf
5,2020MNRAS.494.5811L,[An asymmetric explosion mechanism may explain...,"Livneh, Ran",https://arxiv.org/pdf/1912.04313.pdf
6,2020ApJ...893..143K,[SN 2019ein: New Insights into the Similaritie...,"Kawabata, Miho",https://arxiv.org/pdf/1908.03001.pdf
7,2020ApJ...892L..24S,[The Lowest of the Low: Discovery of SN 2019gs...,"Srivastav, Shubham",https://arxiv.org/pdf/2001.09722.pdf
8,2020A&A...634A..37M,[Determining the <SUP>56</SUP>Ni distribution ...,"Magee, M. R.",https://arxiv.org/pdf/1912.07603.pdf
9,2020A&A...633A..88V,[Spectral modeling of type II supernovae. II. ...,"Vogl, C.",https://arxiv.org/pdf/1911.04444.pdf


In [9]:
# Show each paper's bibcode next to the raw alternate bibcodes ADS returned
# for it (a list of strings, or None).
combo_list = zip(bibcodes, arxiv_codes)
for bibcode, alternates in combo_list:
    print((bibcode, alternates))

('2020ApJS..250...12C', ['2019arXiv191105209C'])
('2020ApJ...898...56M', ['2020arXiv200505972M'])
('2020MNRAS.497..246G', ['2020arXiv200712110G'])
('2020ApJS..249....8B', None)
('2020MNRAS.496.1132T', ['2020arXiv200200393T', '2020MNRAS.tmp.1787T'])
('2020MNRAS.494.5811L', ['2019arXiv191204313L', '2020MNRAS.tmp.1131L'])
('2020ApJ...893..143K', ['2019arXiv190803001K'])
('2020ApJ...892L..24S', ['2020arXiv200109722S'])
('2020A&A...634A..37M', ['2019arXiv191207603M'])
('2020A&A...633A..88V', ['2019arXiv191104444V'])
('2019ApJ...885L..23M', ['2019arXiv190904545M'])
('2019Natur.574..497W', ['2019arXiv191010510W'])
('2019MNRAS.487.2538J', ['2018arXiv181211692J', '2019MNRAS.tmp.1251J'])
('2019LRCA....5....1N', ['2019arXiv190709840N'])
('2019ApJ...876..148C', ['2017arXiv171210091C'])
('2019MNRAS.484.4785M', ['2019MNRAS.tmp..239M', '2019arXiv190108582M'])
('2019A&A...622A.102M', ['2018arXiv181208695M'])
('2019ApJ...871..250H', ['2019arXiv190201904H'])
('2019Natur.565..324I', ['2019arXiv190105500I

In [10]:
# Print every raw alternate-bibcode entry beside the code produced for it by
# format_arxiv_codes, to eyeball that the formatting is correct.
comparison = zip(arxiv_codes, format_arxiv_codes(arxiv_codes))
for raw_codes, formatted_code in comparison:
    print((raw_codes, formatted_code))

(['2019arXiv191105209C'], 'arXiv:1911.05209')
(['2020arXiv200505972M'], 'arXiv:2005.05972')
(['2020arXiv200712110G'], 'arXiv:2007.12110')
(None, 'no arXiv')
(['2020arXiv200200393T', '2020MNRAS.tmp.1787T'], 'arXiv:2002.00393')
(['2019arXiv191204313L', '2020MNRAS.tmp.1131L'], 'arXiv:1912.04313')
(['2019arXiv190803001K'], 'arXiv:1908.03001')
(['2020arXiv200109722S'], 'arXiv:2001.09722')
(['2019arXiv191207603M'], 'arXiv:1912.07603')
(['2019arXiv191104444V'], 'arXiv:1911.04444')
(['2019arXiv190904545M'], 'arXiv:1909.04545')
(['2019arXiv191010510W'], 'arXiv:1910.10510')
(['2018arXiv181211692J', '2019MNRAS.tmp.1251J'], 'arXiv:1812.11692')
(['2019arXiv190709840N'], 'arXiv:1907.09840')
(['2017arXiv171210091C'], 'arXiv:1712.10091')
(['2019MNRAS.tmp..239M', '2019arXiv190108582M'], 'arXiv:1901.08582')
(['2018arXiv181208695M'], 'arXiv:1812.08695')
(['2019arXiv190201904H'], 'arXiv:1902.01904')
(['2019arXiv190105500I'], 'arXiv:1901.05500')
(['2018arXiv181102543V'], 'arXiv:1811.02543')
(['2018arXiv181