This notebook explores downloading all the PDFs attached to an Indico timetable page, e.g. for a conference or workshop.

In [91]:
from time import sleep

In [68]:
import os
from collections import namedtuple

In [1]:
import requests
import bs4

In [2]:
# Use the standard view as it includes abstract info.
# The timeout stops the cell hanging forever on a stalled connection.
r = requests.get("https://indico.cern.ch/event/662485/timetable/?view=standard", timeout=60)
r.raise_for_status()  # fail here rather than silently parsing an error page further down

In [16]:
# Parse the fetched timetable HTML with the stdlib-backed parser
soup = bs4.BeautifulSoup(markup=r.text, features='html.parser')

In [48]:
# Page title with the "· Indico" site suffix removed; used further down as the download directory name
event_title = soup.title.get_text().replace("· Indico", "").strip()
print(event_title)

QCD@LHC 2018  (27-31 August 2018)


In [14]:
# Peek at the markup of one contribution to see which tags/classes hold the talk details
soup.find_all('li', class_='timetable-contrib')[1]

<li class="timetable-item timetable-contrib" id="27-determining-the-strong-coup">
<span class="timetable-time nested">
<span class="start-time">
        09:10
    </span>
</span>
<div class="timetable-item-body flexcol">
<div class="timetable-item-header flexrow">
<span class="timetable-title nested" data-anchor="27-determining-the-strong-coup">Determining the strong coupling constant (45' + 10')</span>
<span class="icon-time timetable-duration">55m</span>
<div class="timetable-item-actions">
<div class="toolbar right thin">
</div>
</div>
</div>
<div class="speaker-list">
<span class="label">Speaker</span>:
        <span class="">
<span>Klaus Rabbertz</span><span class="affiliation">
            (<span class="text">KIT - Karlsruhe Institute of  Technology (DE)</span>)</span></span>
</div>
<div class="js-attachment-container">
<div class="material-list">
<div class="attachments-display-container toolbar">
<div class="folder folder-root">
<a class="attachment icon-file-pdf i-button" data

In [57]:
# Record for one talk: title, speaker name, speaker affiliation, attachment URL, abstract text
TalkEntry = namedtuple("TalkEntry", "title speaker affiliation URL abstract")

In [61]:
def get_entries(soup):
    """Get all contributions in the timetable that have an attachment.

    `soup` is the parsed timetable page. Every <li class="timetable-contrib">
    that has both a title and an attachment link yields one entry;
    contributions lacking either are skipped.

    Speaker, affiliation and abstract are optional: when the corresponding
    markup is missing they are left as empty strings instead of raising.

    Returns list of TalkEntry objects that contain info about each talk
    """
    from urllib.parse import urljoin  # local import so this cell runs on its own

    link_stem = "https://indico.cern.ch"
    entries = []

    for entry in soup.find_all('li', 'timetable-contrib'):
        title_tag = entry.find('span', 'timetable-title')
        if title_tag is None:
            continue
        title = title_tag.text

        # No attachment (or an anchor without href) means nothing to download,
        # so skip before doing any further parsing.
        link_tag = entry.find('a', 'attachment')
        href = link_tag.get('href') if link_tag is not None else None
        if not href:
            continue
        # urljoin prefixes site-relative hrefs and leaves absolute URLs untouched.
        link = urljoin(link_stem, href)

        speaker = ""
        affiliation = ""
        speaker_list = entry.find('div', 'speaker-list')
        if speaker_list is not None:
            # The blank class name skips classed spans such as the "Speaker" label
            # and selects the wrapper span holding the name and affiliation.
            speaker_wrapper = speaker_list.find('span', "")
            if speaker_wrapper is not None:
                name_tag = speaker_wrapper.find('span', "")
                if name_tag is not None:
                    speaker = name_tag.text
                affiliation_tag = speaker_wrapper.find('span', 'affiliation')
                if affiliation_tag is not None:
                    affiliation_text = affiliation_tag.find('span', 'text')
                    if affiliation_text is not None:
                        affiliation = affiliation_text.text

        abstract = ""
        abstract_tag = entry.find("div", "contrib-description")
        if abstract_tag is not None:
            paragraph = abstract_tag.find('p')
            if paragraph is not None:
                abstract = paragraph.text
            else:
                # Description without a <p>: fall back to the container's own text.
                abstract = abstract_tag.text.strip()

        entries.append(TalkEntry(title, speaker, affiliation, link, abstract))

    return entries

In [62]:
# Scrape every contribution that has an attachment from the parsed timetable
entries = get_entries(soup)

In [67]:
# Display the scraped entries to eyeball the parsed fields
entries

[TalkEntry(title="Determining the strong coupling constant (45' + 10')", speaker='Klaus Rabbertz', affiliation='KIT - Karlsruhe Institute of  Technology (DE)', URL='https://indico.cern.ch/event/662485/contributions/3050014/attachments/1705148/2747270/alpha-s-from-ATLASCMS_QCDatLHC_2018_KRabbertz_final.pdf', abstract=''),
 TalkEntry(title="Recent developments in PDFs (45' + 10')", speaker='Robert Samuel Thorne', affiliation='University College London (UK)', URL='https://indico.cern.ch/event/662485/contributions/3050020/attachments/1704969/2746951/QCDLHC18.pdf', abstract=''),
 TalkEntry(title="Physics of the top quark and its mass (35' + 10')", speaker='Andre Hoang', affiliation='University of Vienna', URL='https://indico.cern.ch/event/662485/contributions/3050142/attachments/1705084/2747162/Hoang-Top-Plenary.pdf', abstract=''),
 TalkEntry(title="Top quark measurements in CMS (35' + 10')", speaker='Till Michael Arndt', affiliation='Deutsches Elektronen-Synchrotron (DE)', URL='https://ind

In [65]:
# Sanity check: number of contributions that came back with an attachment link
print(len(entries), 'contributions')

75 contributions


In [69]:
# exist_ok lets this cell be re-run (Restart & Run All) once the directory already exists
os.makedirs(event_title, exist_ok=True)

In [89]:
def default_filename(entry):
    """Build the default filename for a talk: "<title>-<speaker>.pdf".

    `entry` is a namedtuple-like object providing `_asdict()` with at least
    `title` and `speaker` keys.

    Path separators ("/" and "\\") in the formatted name are replaced by "_"
    so that the result is always a single path component; otherwise a title
    such as "W/Z + jets" would point into a non-existent subdirectory.

    Returns the filename as a string.
    """
    template = "{title}-{speaker}.pdf"
    filename = template.format(**entry._asdict())
    for separator in ("/", "\\"):
        filename = filename.replace(separator, "_")
    return filename

In [90]:
# Check the generated filename for the first scraped talk
default_filename(entries[0])

"Determining the strong coupling constant (45' + 10')-Klaus Rabbertz.pdf"

In [105]:
def download_file(url, output_filename, timeout=60):
    """Download `url` and write the response body to `output_filename`.

    `timeout` (seconds) is passed to `requests.get` so a stalled server
    cannot hang the notebook indefinitely.

    Failures are reported with a printed message and nothing is written:
    this covers both request errors (connection problems, timeouts, ...)
    and responses whose status code is not 200. Returns None in all cases.
    """
    print("Downloading", url, "to", output_filename)
    try:
        r = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        # Report and move on, mirroring how a bad status code is handled below.
        print("Cannot download", url, "error:", err)
        return

    if r.status_code != 200:
        print("Cannot download", url, "status code:", r.status_code)
        return

    with open(output_filename, 'wb') as f:
        f.write(r.content)

In [108]:
def download_talks(entries, download_dir, filename_generator, pause=5):
    """Download the attachment of every entry into `download_dir`.

    entries: iterable of objects with a `URL` attribute (e.g. TalkEntry).
    download_dir: target directory, created (with parents) if missing.
    filename_generator: callable mapping an entry to a filename; its extension
        is replaced by the one found in the entry's URL.
    pause: seconds to wait after each download; values below 1 are raised
        to 1 so the server is not hammered.

    Files that already exist are skipped, without pausing.
    """
    from urllib.parse import urlparse  # local import so this cell runs on its own

    pause = float(pause)
    if pause < 1:
        print("You should be nice to the server, setting pause to 1 second")
        pause = 1.0  # actually enforce the minimum announced above

    os.makedirs(download_dir, exist_ok=True)

    for entry in entries:
        output_filename = os.path.join(download_dir, filename_generator(entry))
        # replace extension with the one from the URL path, so that any
        # ?query or #fragment does not leak into the filename
        url_extension = os.path.splitext(urlparse(entry.URL).path)[1]
        output_filename = os.path.splitext(output_filename)[0] + url_extension
        if not os.path.isfile(output_filename):
            download_file(entry.URL, output_filename)
            sleep(pause)  # be nice

In [109]:
# Trial run: fetch only the first five talks into the event-named directory
download_talks(entries[:5], event_title, default_filename)