In [None]:
from dask.distributed import Client

# Create a dask.distributed Client, asking for 4 workers.
client = Client(n_workers=4)

In [None]:
import dask.bag as db
import json
import os

### Example: Accounts JSON data

We've created a fake dataset of gzipped JSON data in your data directory.  This is like the example used in the `DataFrame` example we will see later, except that it has bundled up all of the entries for each individual `id` into a single record.  This is similar to data that you might collect from a document store database or a web API.

Each line is a JSON encoded dictionary with the following keys

*  id: Unique identifier of the customer
*  name: Name of the customer
*  transactions: List of `transaction-id`, `amount` pairs, one for each transaction for the customer in that file

In [None]:
# Glob pattern for the gzipped accounts files in the data directory.
filename = os.path.join('data', 'accounts.*.json.gz')
# Bag of raw text lines; each line is one JSON-encoded record (not parsed yet).
lines = db.read_text(filename)

In [None]:
# Parse every text line into a Python dict with json.loads.
js = lines.map(json.loads)

### Example with account data
We find the number of people with the same name.

In [None]:
%%time
from operator import add
def incr(tot, _):
    """foldby binary operator that counts records.

    `tot` is the running count; the record passed as `_` is ignored.
    Returns the count increased by one.
    """
    next_count = tot + 1
    return next_count

# Count records per name: `incr` folds the records of each name into a count,
# and `add` merges the partial counts for the same name.
result = js.foldby(
    key='name',
    binop=incr,
    initial=0,
    combine=add,
    combine_initial=0,
)
# Materialize the (name, count) pairs before sorting and printing them.
result = result.compute()
print(sorted(result))

### Exercise: compute total amount per name

We want to group by (or `foldby`) the `name` key, then add up all of the amounts for each name.

Steps

1.  Create a small function that, given a dictionary like 

        {'name': 'Alice', 'transactions': [{'amount': 1, 'id': 123}, {'amount': 2, 'id': 456}]}
        
    produces the sum of the amounts, e.g. `3`
    
2.  Slightly change the binary operator of the `foldby` example above so that the binary operator doesn't count the number of entries, but instead accumulates the sum of the amounts.

In [None]:

# Pam's solution in this cell and the cell below.
# An earlier version of sum_amount built a nested dask bag and called .compute()
# for every single record, which took too long and had a memory problem.
# A plain Python sum over the record's transaction list avoids both.
def sum_amount(d):
    """Return the sum of the `amount` values of one record's transactions.

    Parameters
    ----------
    d : dict
        Record with a 'transactions' key holding a list of dicts, each of
        which has an 'amount' key, e.g.
        {'name': 'Alice', 'transactions': [{'amount': 1, 'id': 123}]}.

    Returns
    -------
    The total of all amounts; 0 when the transaction list is empty.

    Raises
    ------
    KeyError
        If `d` has no 'transactions' key or a transaction has no 'amount' key.
    """
    # d['transactions'] is already an in-memory list, so no dask collection is
    # needed here; this function runs once per record inside the outer bag.
    return sum(transaction['amount'] for transaction in d['transactions'])

def incr(tot, _):
    """foldby binary operator that accumulates transaction amounts.

    `tot` is the running total; `_` is the current record (a dict accepted by
    sum_amount). Returns the running total plus that record's amount sum.
    """
    return tot + sum_amount(_)



In [None]:
%%time
# Total amount per name: `incr` folds each record's transaction total into a
# running sum per name, and `add` merges the partial sums for the same name.
# .compute() is called explicitly, as in the counting example, so that `result`
# is a concrete list of (name, total) pairs instead of a lazy bag that is only
# evaluated implicitly when sorted() iterates over it.
result = js.foldby(key='name',
                   binop=incr,
                   initial=0,
                   combine=add,
                   combine_initial=0).compute()
print(sorted(result))

Confirmation that sum_amount works:

In [None]:
# Sample record with two transactions (amounts 1 and 2); the expected sum is 3.
d = {
    'name': 'Alice',
    'transactions': [
        {'amount': 1, 'id': 123},
        {'amount': 2, 'id': 456},
    ],
}
sum_amount(d)