In [13]:
import requests
from bs4 import BeautifulSoup
import re
import numpy as np

In [14]:
# MBS item number to look up on MBS Online.
item = 66841
url = 'http://www9.health.gov.au/mbs/search.cfm?'
# NOTE(review): 'sopt' presumably selects the search mode; an earlier version
# of this cell queried with sopt=S instead of 'I' -- confirm against the site.
payload = {'q': str(item),
           'sopt': 'I'}
# Without a timeout, requests waits indefinitely on an unresponsive server.
r = requests.get(url, params=payload, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')

In [15]:
# Sanity check: confirm that the parsed page is a BeautifulSoup object.
type(soup)

bs4.BeautifulSoup

In [16]:
# print(soup.prettify())

In [24]:
def get_categories(soup):
    """Extract the category heading from the soup.

    Scans every ``<h3>`` element and uses the first one whose text contains
    the word "category" (case-insensitive).

    Parameters
    ----------
    soup : object
        Parsed page exposing ``find_all`` (e.g. a ``bs4.BeautifulSoup``).

    Returns
    -------
    str or None
        The heading text up to the first ``'<'`` character, or ``None`` when
        no ``<h3>`` element mentions a category.
    """
    # next() over a generator works on Python 2 and 3 alike; indexing the
    # result of filter() only works on Python 2, where filter() returns a list.
    category_elem = next(
        (h3 for h3 in soup.find_all('h3') if 'category' in h3.text.lower()),
        None)
    if category_elem is None:
        return None
    return category_elem.text.split('<')[0]

# Run the category extractor on the parsed page and show the result.
categories = get_categories(soup)
print('Categories: {}'.format(categories))

Categories: Category 6 - PATHOLOGY SERVICES


In [8]:
def get_info(soup):
    """Extract group, subgroup and subheading from the soup.

    The values are read, in document order, from the ``<div>`` elements whose
    ``class`` attribute contains ``span9``: the first one is the group, the
    second (if present) the subgroup and the third (if present) the
    subheading. Any further matching divs are ignored.

    Parameters
    ----------
    soup : object
        Parsed page exposing ``find_all`` (e.g. a ``bs4.BeautifulSoup``).

    Returns
    -------
    tuple
        ``(group, subgroup, subheading)``; entries that are not present on
        the page are ``None``.
    """
    values = [div.text for div in soup.find_all('div')
              if 'span9' in (div.get('class') or [])]
    # Pad with None so that a page without any span9 div yields
    # (None, None, None) instead of raising IndexError on values[0].
    group, subgroup, subheading = (values + [None] * 3)[:3]
    return group, subgroup, subheading

# Run the group/subgroup/subheading extractor on the parsed page; entries
# missing from the page are printed as None.
group, subgroup, subheading = get_info(soup)
print('Group: {}'.format(group))
print('Subgroup: {}'.format(subgroup))
print('Subheading: {}'.format(subheading))

Group: P2 - Chemical
Subgroup: None
Subheading: None


In [13]:
def get_description(soup):
    """Extract the item description from the soup.

    The description is taken from the ``<p>`` elements whose ``align``
    attribute equals ``'justify'``; when several match, the last one wins.

    Parameters
    ----------
    soup : object
        Parsed page exposing ``find_all`` (e.g. a ``bs4.BeautifulSoup``).

    Returns
    -------
    str or None
        The description as a native ``str`` with surrounding whitespace
        removed, or ``None`` (not the string ``'None'``) when the page has no
        justified paragraph.
    """
    description = None
    for elem in soup.find_all('p'):
        if elem.get('align') == 'justify':
            text = elem.text
            # On Python 2 the text is ``unicode``: encode it to a UTF-8 byte
            # string so formatting/printing does not hit the ASCII codec.
            # On Python 3 it is already ``str``; encoding it there and
            # calling str() would produce the literal "b'...'".
            if not isinstance(text, str):
                text = text.encode('utf-8')
            description = text.strip()
    return description

# Run the description extractor on the parsed page and show the result.
description = get_description(soup)
print('Description: {}'.format(description))

Description: Quantitation of HbA1c (glycated haemoglobin) performed for the diagnosis of diabetes in asymptomatic patients at high risk.  (Item is subject to rule 25)


In [14]:
def get_dates(soup):
    """Extract the relevant dates from the soup.

    Looks at every ``<p>`` element whose text mentions "date"
    (case-insensitive). Inside those, ``<div>`` elements with class ``span8``
    supply the labels and ``<div>`` elements with class ``span4`` supply the
    values; labels and values are paired by order of appearance.

    Parameters
    ----------
    soup : object
        Parsed page exposing ``find_all`` (e.g. a ``bs4.BeautifulSoup``).

    Returns
    -------
    dict
        Mapping of label text to value text; empty when nothing matches.
    """
    dates_keys = []
    dates_values = []
    for elem in soup.find_all('p'):
        if 'date' not in elem.text.lower():
            continue
        for div_elem in elem.find_all('div'):
            # A div without a class attribute yields None; fall back to an
            # empty list so the membership tests cannot raise TypeError.
            div_class = div_elem.get('class') or []
            if 'span8' in div_class:
                dates_keys.append(div_elem.text)
            elif 'span4' in div_class:
                dates_values.append(div_elem.text)
    return dict(zip(dates_keys, dates_values))

# Run the date extractor on the parsed page and show the label -> date mapping.
dates = get_dates(soup)
print('Relevant dates: {}'.format(dates))

Relevant dates: {u'Schedule Fee Start Date:': u'01-Nov-2014', u'Description Start Date:': u'01-Nov-2014', u'Item Start Date:': u'01-Nov-2014'}


In [89]:
def _parse_dollars(token):
    """Convert a token such as ``'$1,234.50'`` or ``'=$12.60'`` to a float."""
    # Drop thousands separators so float() accepts amounts of 1,000 and above.
    return float(token.split('$')[1].replace(',', ''))


def get_fees(soup):
    """Extract the fees from the soup.

    Inspects the ``<p>`` elements nested inside other ``<p>`` elements and
    splits their text on spaces. A paragraph containing the token ``'Fee:'``
    supplies, in order, the schedule fee, the 75% benefit and (when present)
    the 85% benefit. A paragraph containing the token ``'Safety'`` supplies
    the safety-net amount, taken from its last dollar amount.

    Parameters
    ----------
    soup : object
        Parsed page exposing ``find_all`` (e.g. a ``bs4.BeautifulSoup``).

    Returns
    -------
    tuple
        ``(fee, benefit75, benefit85, safety_net)`` as floats; any amount not
        found on the page is ``None``.
    """
    fee = None
    benefit75 = None
    benefit85 = None
    safety_net = None
    for elem in soup.find_all('p'):
        for elem_p in elem.find_all('p'):
            if '$' not in elem_p.text:
                continue
            # Collapse runs of spaces so the split does not yield empty tokens.
            splitted = re.sub(' +', ' ', elem_p.text).split(' ')
            # A list comprehension stays indexable on Python 3, unlike
            # filter(); it is never empty here because the text contains '$'.
            amounts = [token for token in splitted if '$' in token]
            if 'Fee:' in splitted:
                # Built-in float replaces np.float, an alias that has been
                # removed from NumPy.
                fee = _parse_dollars(amounts[0])
                if len(amounts) > 1:
                    benefit75 = _parse_dollars(amounts[1])
                if len(amounts) > 2:  # the 85% benefit is not always there
                    benefit85 = _parse_dollars(amounts[2])
            elif 'Safety' in splitted:
                safety_net = _parse_dollars(amounts[-1])
    return fee, benefit75, benefit85, safety_net

# Run the fee extractor on the parsed page; amounts that are not present on
# the page are printed as None.
fee, benefit75, benefit85, safety_net = get_fees(soup)

print('Fee: {} $'.format(fee))
print('Benefit 75%: {} $'.format(benefit75))
print('Benefit 85%: {} $'.format(benefit85))
print('Safety net: {} $'.format(safety_net))

Fee: 16.8 $
Benefit 75%: 12.6 $
Benefit 85%: 14.3 $
Safety net: None $


# Test the MBSOnline class

In [28]:
from mbspbs10pc import mbs_online

# `reload` is a builtin only on Python 2; on Python 3 it lives in importlib.
# The guarded import keeps this cell runnable on both.
try:
    from importlib import reload
except ImportError:
    pass  # Python 2: fall back to the builtin reload
# Re-import the module so edits made on disk are picked up without
# restarting the kernel; the trailing ';' suppresses the cell output.
reload(mbs_online);

In [29]:
# Build the packaged MBSOnline helper for item 52018.
# NOTE(review): set_attributes() presumably fetches the page and fills in the
# fields shown by display() -- confirm in mbspbs10pc.mbs_online.
mbs = mbs_online.MBSOnline(52018)
mbs.set_attributes()

In [30]:
# Show the attributes collected for the item (rendered as the table below).
mbs.display()

                                                                    52018
Category                     Category 4 - ORAL AND MAXILLOFACIAL SERVICES
Group                                                O3 - General Surgery
Subgroup                                                              NaN
Subheading                                                            NaN
Description             FOREIGN BODY IN MUSCLE, TENDON OR OTHER DEEP T...
Fee start date                                                 2012-11-01
Description start date                                         2007-11-01
Item start date                                                1991-12-01
Fee (A$)                                                            276.8
Benefit 75% (A$)                                                    207.6
Benefit 85% (A$)                                                    235.3
Safety Net                                                            NaN
