# Examples

## Fetching C4SA data

In this example, we fetch data from the Code for South Africa API.

In [4]:
from urllib.request import urlopen
from ijson import items

# crime-summary
url = 'http://data.code4sa.org/resource/qtx7-xbrs.json'
# A timeout keeps the cell from hanging indefinitely if the API is unreachable.
f = urlopen(url, timeout=30)
# ijson parses the response incrementally: the 'item' prefix selects each
# element of the top-level JSON array, so records are produced on demand.
# The response is intentionally left open because later cells keep reading
# from `data`.
data = items(f, 'item')
next(data)

{'station': 'Aberdeen', 'sum_2014_2015': '1153'}

In [5]:
# Pull the next record from the same lazy JSON stream.
next(data)

{'station': 'Acornhoek', 'sum_2014_2015': '5047'}

## Reading csv

Now we will read a CSV file.

In [6]:
from csv import DictReader
from io import open
from os import path as p

# Despite its name, `url` holds a local filesystem path here.
url = p.abspath('filtered-crime-stats.csv')
# The csv module expects files opened with newline='' so that newlines
# embedded in quoted fields are parsed correctly.
f = open(url, newline='')
# DictReader takes its field names from the first row and yields one dict per
# following row. The file is intentionally left open because later cells keep
# reading from `data`.
data = DictReader(f)
next(data)

{'Crime': 'All theft not mentioned elsewhere',
 'Incidents': '3397',
 'Police Station': 'Durban Central',
 'Province': 'KZN',
 'Year': '2014'}

In [7]:
# Read the next CSV row from the reader created in the previous cell.
next(data)

{'Crime': 'Drug-related crime',
 'Incidents': '2528',
 'Police Station': 'Durban Central',
 'Province': 'KZN',
 'Year': '2014'}

## Reading excel

Here, we read an xlsx file

In [8]:
from xlrd import open_workbook

# Despite its name, `url` holds a local filesystem path here.
url = p.abspath('filtered-crime-stats.xlsx')
# NOTE(review): xlrd 2.0 and later read only legacy .xls files, so opening
# this .xlsx presumably needs xlrd < 2.0 -- confirm the pinned version.
book = open_workbook(url)
sheet = book.sheet_by_index(0)  # first worksheet in the workbook
sheet.row_values(0)  # first row; the recorded output shows the column names

['Province', 'Police Station', 'Crime', 'Year', 'Incidents']

In [9]:
# Second row of the sheet, i.e. the first data row. The recorded output shows
# the numeric cells coming back as floats (2014.0, 3397.0).
sheet.row_values(1)

['KZN', 'Durban Central', 'All theft not mentioned elsewhere', 2014.0, 3397.0]

## Screen scraping

Now we will fetch an HTML page and read the rows of its first table.

In [10]:
import requests
from bs4 import BeautifulSoup

url = 'https://github.com/reubano/pyconza-tutorial/raw/master/migrants.html'
# Bound the wait, and fail loudly on an HTTP error rather than silently
# parsing an error page.
r = requests.get(url, timeout=30)
r.raise_for_status()
# Use the parser bundled with Python so no extra parser package is needed.
soup = BeautifulSoup(r.text, 'html.parser')

def get_data(table):
    """Yield the cell texts of each row in an HTML table.

    Args:
        table: A BeautifulSoup tag for a ``<table>`` element.

    Yields:
        list of str: The text of every ``<th>`` and ``<td>`` cell in the row,
        in document order. The text is returned as-is (not stripped), and a
        row without any cells yields an empty list.
    """
    for row in table.find_all('tr'):
        # Collect header and data cells together: picking <th> cells *or*
        # <td> cells would drop the values of a row that starts with a <th>
        # label followed by <td> data.
        columns = row.find_all(['th', 'td'])
        yield [c.get_text() for c in columns]

# soup.find returns only the first <table> in the document.
table = soup.find('table')
# get_data is a generator, so rows are extracted only as they are requested.
data = get_data(table)
next(data)  # first row; the recorded output shows the column headings

['\xa0',
 'January',
 'February',
 'March',
 'April',
 'May',
 'June',
 'July',
 'August',
 'September',
 'October',
 'November',
 'December',
 'Total to date']

In [11]:
# Next table row from the generator created in the previous cell.
next(data)

['Mediterranean',
 '82',
 '346',
 '61',
 '1,244',
 '95',
 '\xa010',
 '230',
 '684',
 '268',
 '432',
 '105',
 '203',
 '3,760']

## Aggregating data

Now we will aggregate the records into a single record by summing their amounts.

In [12]:
import itertools as it

# Three sample records that differ only in their amount.
records = [
    {'a': 'item', 'amount': 200},
    {'a': 'item', 'amount': 300},
    {'a': 'item', 'amount': 400},
]

key = 'amount'
first = records[0]

# Total the chosen field; a record that lacks it contributes 0.
value = sum(record.get(key, 0) for record in records)

# Build a copy of the first record whose amount is replaced by the total;
# `first` itself is left unmodified.
{**first, key: value}

{'a': 'item', 'amount': 900}

## Grouping data

Now we will group data by amount

In [13]:
import itertools as it
from operator import itemgetter

# Sample records: two share the amount 200, one has 400.
records = [
    {'item': 'a', 'amount': 200},
    {'item': 'b', 'amount': 200},
    {'item': 'c', 'amount': 400},
]

keyfunc = itemgetter('amount')

# groupby only merges *adjacent* items with equal keys, so order a copy of
# the records by amount first (the sort is stable, so ties keep their order).
sorted_records = list(records)
sorted_records.sort(key=keyfunc)

grouped = it.groupby(sorted_records, key=keyfunc)
# Materialise each group right away: groupby invalidates a group's iterator
# as soon as the outer iterator advances.
data = ((amount, list(members)) for amount, members in grouped)
next(data)

(200, [{'amount': 200, 'item': 'a'}, {'amount': 200, 'item': 'b'}])

In [14]:
# Next (amount, records) group from the generator built in the previous cell.
next(data)

(400, [{'amount': 400, 'item': 'c'}])