This notebook presents a summary of the HTCondor commands in a more useful form than what is currently available (e.g. at https://research.cs.wisc.edu/htcondor/manual/current/11_Command_Reference.html).

It's also an opportunity to exercise some scraping techniques.

In [15]:
import bs4
import requests
import os
import urlparse
import re
import json

In [3]:
def save_page_offline(url, filename):
    """Download a web page and store a copy of it on disk.

    The raw response bytes are written unchanged, so the saved file keeps
    whatever encoding the server sent.

    url: str
        URL of the page to save.
    filename: str
        File to save it in.

    Raises requests.HTTPError when the server answers with an error status,
    so that an error page is never stored as if it were the document.
    """
    response = requests.get(url)
    response.raise_for_status()

    # Binary mode + response.content: writing the decoded response.text to a
    # text-mode file makes Python 2 encode it implicitly as ASCII, which
    # fails as soon as the page contains a non-ASCII character.
    with open(filename, 'wb') as offline:
        offline.write(response.content)

In [4]:
# Index page of the command reference; every command page is linked from it.
docpage = 'https://research.cs.wisc.edu/htcondor/manual/current/11_Command_Reference.html'
# Everything up to (but excluding) the final '/' of the index page URL.
base_url = docpage.rsplit('/', 1)[0]

In [5]:
# the online way
# r = requests.get(docpage)
# soup_main = bs4.BeautifulSoup(r.text, 'lxml')
# save_page_offline(docpage, 'doc_offline.html')

In [6]:
# Parse the locally saved copy of the index page rather than fetching it again.
with open('doc_offline.html') as offline_page:
    html_text = list(offline_page)  # list of lines, exactly what readlines() gives

index_markup = ''.join(html_text)
soup_main = bs4.BeautifulSoup(index_markup, 'lxml')

In [7]:
# use a dict instead?
class CondorCmd(object):
    """Holds info about a Condor command.

    Plain record type: every constructor argument is stored unchanged on the
    attribute of the same name.
    """

    def __init__(self, name, brief=None, synopsis=None,
                 description=None, options=None, url=None):
        """
        name: str
            command name
        brief: str
            Brief summary of what command does.
        synopsis: str or list of str
            Brief summary of command usage. May be a list with one entry
            per usage form.
        description: str
            Full info about the command.
        options: str
            Flags and values that the command can use.
        url: str
            URL with info about the command.
            Should be the full page URL.
            e.g. 'https://research.cs.wisc.edu/htcondor/manual/current/condor_q.html'
            instead of 'condor_q.html'
        """
        self.name = name
        self.brief = brief
        self.synopsis = synopsis
        self.description = description
        self.options = options
        self.url = url

    def __repr__(self):
        """Compact representation showing the command name and its URL."""
        return '%s(name=%r, url=%r)' % (type(self).__name__, self.name, self.url)

In [8]:
# Store all of our CondorCmd objects.
# Module-level list shared by the cells of this notebook; it starts out empty.
commands = []

In [9]:
# Walk the first <ul> of the index page: each list item links to one command page.
main_list = soup_main.ul
# Start from an empty list so that re-running this cell does not append
# every command a second time.
del commands[:]
for item in main_list.children:
    # .children also yields the bare strings (whitespace) between the tags
    if not isinstance(item, bs4.element.Tag):
        continue
    cmd_tag = item.find('a')
    if cmd_tag is None or not cmd_tag.get('href'):
        continue  # list item without a usable link: nothing to record
    name = cmd_tag.string
    page = cmd_tag['href']
    # urljoin resolves the href against the index page URL. os.path.join is a
    # filesystem helper: it uses the OS path separator and mishandles
    # absolute hrefs.
    commands.append(CondorCmd(name, url=urlparse.urljoin(docpage, page)))

In [10]:
# Quick check of what the crawl collected: one "name url" line per command.
for cmd in commands:
    print('%s %s' % (cmd.name, cmd.url))

bosco_cluster https://research.cs.wisc.edu/htcondor/manual/current/bosco_cluster.html
bosco_findplatform https://research.cs.wisc.edu/htcondor/manual/current/bosco_findplatform.html
bosco_install https://research.cs.wisc.edu/htcondor/manual/current/bosco_install.html
bosco_ssh_start https://research.cs.wisc.edu/htcondor/manual/current/bosco_ssh_start.html
bosco_start https://research.cs.wisc.edu/htcondor/manual/current/bosco_start.html
bosco_stop https://research.cs.wisc.edu/htcondor/manual/current/bosco_stop.html
bosco_uninstall https://research.cs.wisc.edu/htcondor/manual/current/bosco_uninstall.html
condor_advertise https://research.cs.wisc.edu/htcondor/manual/current/condor_advertise.html
condor_check_userlogs https://research.cs.wisc.edu/htcondor/manual/current/condor_check_userlogs.html
condor_checkpoint https://research.cs.wisc.edu/htcondor/manual/current/condor_checkpoint.html
condor_chirp https://research.cs.wisc.edu/htcondor/manual/current/condor_chirp.html
condor_cod https:/

In [39]:
def _clean_body_text(text):
    """Collapse blank lines and strip leading/trailing whitespace on each line."""
    text = re.sub(r'\n\n+', '\n', text)  # get rid of extra lines
    text = re.sub(r'\n[\s\xa0]+', '\n', text)  # remove leading spaces
    text = re.sub(r'[\s\xa0]+\n', '\n', text)  # remove trailing spaces
    return text


def grab_cmd_info(url, cmd):
    """Extract the brief description and the synopsis of a command.

    The page is read from a local file named after the last component of
    `url` (the offline copy), not fetched from the network.

    Returns a dict with two keys:
    'brief': str, or None when no brief description was found.
    'synopsis': list of str (one entry per usage form), or None when no
    synopsis was found.
    A message is printed for every section that could not be found.

    url: str
        Full URL from which to get command information.
    cmd: str
        Condor command
    """
    # get it from offline
    with open(os.path.basename(url)) as rc_offline:
        soup_cmd = bs4.BeautifulSoup(rc_offline.read(), 'lxml')

    # get body text, sanitise
    body = _clean_body_text(soup_cmd.body.text)

    # The command name is matched literally, so neutralise any regex
    # metacharacter it may contain.
    cmd_pattern = re.escape(cmd)

    info = {}

    # get brief description: the text between the command name and the
    # "Synopsis" heading. The character class spells out the characters that
    # are accepted; '-' is placed last so that it is a literal hyphen and not
    # a range operator.
    p_brief = re.compile(cmd_pattern + r"\n([\w.()*+,/ '\n-]*)\nSynopsis",
                         re.IGNORECASE)
    brief_search = p_brief.search(body)
    if brief_search:
        info['brief'] = brief_search.group(1).replace('\n', ' ')
    else:
        print('No brief info for %s' % cmd)
        info['brief'] = None

    # get synopsis: the text between the "Synopsis" and "Description" headings.
    # Non-greedy on purpose: under DOTALL a greedy '.+' would run up to the
    # last line starting with "Description" instead of the first one.
    p_synopsis = re.compile(r'Synopsis\n(' + cmd_pattern + r'.+?)\nDescription',
                            re.IGNORECASE | re.DOTALL)
    synopsis_search = p_synopsis.search(body)
    if synopsis_search:
        synopsis_raw = synopsis_search.group(1)
        # Every usage form starts with the command name, so split on it.
        usages = (part.replace('\n', ' ').strip()
                  for part in synopsis_raw.split(cmd))
        # Filter after stripping so whitespace-only fragments are dropped too.
        info['synopsis'] = [usage for usage in usages if usage]
    else:
        print('No synopsis info for %s' % cmd)
        info['synopsis'] = None

    # TODO: the description and options sections are not extracted yet.
    return info


In [40]:
# Complete every command with what could be scraped from its own page.
for cmd in commands:
    scraped = grab_cmd_info(cmd.url, cmd.name)
    cmd.brief, cmd.synopsis = scraped['brief'], scraped['synopsis']

No brief info for bosco_findplatform
No brief info for bosco_install
No brief info for bosco_ssh_start
No synopsis info for bosco_ssh_start
No synopsis info for bosco_start
No synopsis info for bosco_stop
No synopsis info for bosco_uninstall
No synopsis info for condor_install
No synopsis info for condor_master
No synopsis info for condor_pool_job_report


In [41]:
# Show what was gathered: one "name brief synopsis" line per command.
for cmd in commands:
    print('%s %s %s' % (cmd.name, cmd.brief, cmd.synopsis))

bosco_cluster Manage and configure the clusters to be accessed. [u'[-h || --help]', u'[-l || --list] [-a || --add host schedd] [-r || --remove host] [-s || --status host] [-t || --test host]']
bosco_findplatform None [u'[-h || --help]', u'[-u || --url] [-b || --bit] [-f || --full] [--force=<platformstring>] [-i || --install <installoptions>]']
bosco_install None [u'[--help] | [--usage]', u'[--install[=<path/to/release_dir>]] [--prefix=<path>] [--install-dir=<path>] [--local-dir=<path>] [--make-personal-condor] [--bosco] [--type=<[submit][,execute][,manager]>] [--central-manager=<host>] [--credd] [--owner=<username>] [--maybe-daemon-owner] [--install-log=<file>] [--overwrite] [--env-scripts-dir=<dir>] [--no-env-scripts] [--ignore-missing-libs] [--force] [--backup] [--verbose]']
bosco_ssh_start None None
bosco_start start up the Personal HTCondor installation specific to Bosco None
bosco_stop Shut down HTCondor daemons in a Bosco installation. None
bosco_uninstall uninstall a Bosco insta

In [42]:
# Write the attributes of every command to disk as a JSON list of objects.
with open('dump.json', 'w') as json_file:
    records = [vars(cmd) for cmd in commands]
    json.dump(records, json_file, indent=1)