# meza demo

## Reading data

In this example, we fetch data from the Code for South Africa API.

In [1]:
from urllib.request import urlopen
from meza.io import read_json

# crime-summary dataset endpoint (JSON); running this cell needs network access.
url = 'http://data.code4sa.org/resource/qtx7-xbrs.json' 
# NOTE(review): the HTTP response `f` is never closed. Presumably `records`
# reads from it lazily, so it has to stay open while iterating — confirm.
f = urlopen(url)
records = read_json(f)
# `records` is an iterator; pull the first record so the cell displays it.
next(records)

{'station': 'Aberdeen', 'sum_2014_2015': '1153'}

In [2]:
# Each call to next() advances the same iterator by one record.
next(records)

{'station': 'Acornhoek', 'sum_2014_2015': '5047'}

Now we will read CSV data from a file-like object

In [3]:
from io import StringIO
from meza.io import read_csv

# In-memory CSV: a header row followed by a single data row.
f = StringIO('greeting,location\nhello,world\n')
records = read_csv(f)
# Display the first (and only) data row.
next(records)

{'greeting': 'hello', 'location': 'world'}

Now we will use the universal reader, `io.read`, on a local JSON file

In [4]:
from os import path as p
# NOTE: from here on, the name `io` in this notebook refers to meza's `io` module.
from meza import io

# Absolute path of a local file, resolved against the current working directory.
url = p.abspath('crime-summary.json')
# Only the path is passed in; no format-specific reader is named here.
records = io.read(url)
next(records)

{'station': 'Aberdeen', 'sum_2014_2015': '1153'}

Now we will read multiple files into one records iterator

In [5]:
# Relies on `p`, `io` and `url` (the JSON file path) defined in the previous cell.
url2 = p.abspath('filtered-crime-stats.csv')
# Combine the JSON file and the CSV file into a single records iterator.
records = io.join(url, url2)
next(records)

{'station': 'Aberdeen', 'sum_2014_2015': '1153'}

## Reading Excel

Here, we read an xlsx file

In [6]:
# NOTE(review): `open` is not used in any visible cell, and on Python 3
# `io.open` is the built-in `open`, so this import looks redundant — confirm.
from io import open
from meza.io import read_xls

# Relies on `p` (os.path) imported in an earlier cell.
url = p.abspath('filtered-crime-stats.xlsx')
# NOTE(review): `sanitize=True` presumably normalizes the column names (the
# displayed keys are lowercase with underscores) — confirm against the meza docs.
records = read_xls(url, sanitize=True)
next(records)

{'crime': 'All theft not mentioned elsewhere',
 'incidents': '3397.0',
 'police_station': 'Durban Central',
 'province': 'KZN',
 'year': '2014.0'}

In [7]:
# Second row of the spreadsheet.
next(records)

{'crime': 'Drug-related crime',
 'incidents': '2528.0',
 'police_station': 'Durban Central',
 'province': 'KZN',
 'year': '2014.0'}

## Screen scraping

Now we will read the first table in an HTML file

In [8]:
from meza.io import read_html

# Relies on `p` (os.path) imported in an earlier cell.
url = p.abspath('migrants.html')
# NOTE(review): cell values come back as raw strings — see the thousands
# separators ('1,244') and non-breaking spaces ('\xa010') in the output.
records = read_html(url, sanitize=True)
next(records)

{'': 'Mediterranean',
 'april': '1,244',
 'august': '684',
 'december': '203',
 'february': '346',
 'january': '82',
 'july': '230',
 'june': '\xa010',
 'march': '61',
 'may': '95',
 'november': '105',
 'october': '432',
 'september': '268',
 'total_to_date': '3,760'}

In [9]:
# Second row of the HTML table.
next(records)

{'': 'Europe',
 'april': '15',
 'august': '81',
 'december': '3',
 'february': '1',
 'january': '0',
 'july': '11',
 'june': '\xa0 \xa02',
 'march': '2',
 'may': '2',
 'november': '5',
 'october': '6',
 'september': '11',
 'total_to_date': '139'}

## Aggregating data

Now we will aggregate data by summing the amounts

In [10]:
from meza.process import aggregate

# Three records that share the same 'a' value and differ only in 'amount'.
records = [{'a': 'item', 'amount': amount} for amount in (200, 300, 400)]

# Collapse the records into one, combining the 'amount' values with sum().
aggregate(records, 'amount', sum)

{'a': 'item', 'amount': 900}

## Grouping data

Now we will group data by amount

In [11]:
from meza.process import group

# Items 'a' and 'b' share the amount 200; item 'c' has 400.
records = [
    {'item': item, 'amount': amount}
    for item, amount in (('a', 200), ('b', 200), ('c', 400))
]

# `grouped` is an iterator; each next() displays one (amount, records) pair.
grouped = group(records, 'amount')
next(grouped)

(200, [{'amount': 200, 'item': 'a'}, {'amount': 200, 'item': 'b'}])

In [12]:
# Next group: the records whose amount is 400.
next(grouped)

(400, [{'amount': 400, 'item': 'c'}])

## Type casting

Now we will detect data types and cast the values to native Python types

In [13]:
from meza.io import read_csv
from meza.process import detect_types, type_cast 

# Relies on `p` (os.path) imported in an earlier cell.
url = p.abspath('filtered-crime-stats.csv')
raw = read_csv(url)
# detect_types returns a pair: the records to keep working with, and a result
# mapping whose 'types' entry lists the detected type of each field.
records, result = detect_types(raw)
result['types']

[{'id': 'Incidents', 'type': 'int'},
 {'id': 'Crime', 'type': 'text'},
 {'id': 'Province', 'type': 'text'},
 {'id': 'Year', 'type': 'int'},
 {'id': 'Police Station', 'type': 'text'}]

In [14]:
# Every entry of `result` (including 'types') is passed to type_cast as a
# keyword argument.
casted = type_cast(records, **result)
next(casted)

{'Crime': 'All theft not mentioned elsewhere',
 'Incidents': 3397,
 'Police Station': 'Durban Central',
 'Province': 'KZN',
 'Year': 2014}

In [15]:
# Second record, with 'Incidents' and 'Year' now cast to int.
next(casted)

{'Crime': 'Drug-related crime',
 'Incidents': 2528,
 'Police Station': 'Durban Central',
 'Province': 'KZN',
 'Year': 2014}

## Normalizing data

Normalize data into a form that can be inserted into a database

In [16]:
from meza.process import normalize

# Wide-format input: one record per color, one column per species.
records = [dict(color=color, setosa=5, versi=6) for color in ('blue', 'red')]

# Columns to unpivot: each becomes its own record, with the column name
# stored under 'species' and the value under 'length'.
rows = ['setosa', 'versi']
data = normalize(records, rows=rows, column='species', data='length')
next(data)

{'color': 'blue', 'length': 5, 'species': 'setosa'}

In [17]:
# Second normalized record: same color, the other species column.
next(data)

{'color': 'blue', 'length': 6, 'species': 'versi'}