# Data generation

We are going to download GitHub activity from the [GitHub archive](https://www.githubarchive.org).

Each hour of each day gets a file, in which each line is a JSON-serialized record of an event on GitHub - comments, pull requests, commits, etc.

We define a `fetch_one` function to fetch a single file into our `../data` directory,
and parallelize many calls to `fetch_one` with
our `fetch_range` function via `ThreadPoolExecutor.map`.

In [None]:
from datetime import datetime, timedelta
from glob import glob
import gzip
import os
import shutil

import requests

# Directory the downloaded files are written to (a sibling of the working directory).
data = os.path.join('..', 'data')
# exist_ok=True replaces the check-then-create pattern, which can fail if the
# directory appears between the check and the mkdir (e.g. two kernels starting).
os.makedirs(data, exist_ok=True)


# Templates for the remote archive URL and the local (decompressed) filename.
# Year, month and day are zero-padded; the hour is formatted without padding.
url_t = 'http://data.githubarchive.org/{year:04}-{month:02}-{day:02}-{hour}.json.gz'
fn_t =  os.path.join(data, 'data-{year:04}-{month:02}-{day:02}-{hour}.json')


# Pick 11 July 2015, last year's SciPy sprints
# (midnight of that day; used as the default starting hour for the fetch helpers)
day1 = datetime(year=2015, month=7, day=11)

def fetch_one(timestamp=day1):
    """Fetch a single hour of github data.

    The gzip-compressed archive for the hour of `timestamp` is downloaded,
    decompressed on the fly, and stored as plain JSON lines.

    timestamp: datetime whose year, month, day and hour select the file

    Returns filename downloaded.

    No-op if already downloaded.

    Raises whatever `requests` raises for connection problems or a non-2xx
    response (via `raise_for_status`); in that case no file is left at the
    returned location.
    """
    # build URL and filename from date
    ns = dict(
        year = timestamp.year,
        month = timestamp.month,
        day = timestamp.day,
        hour = timestamp.hour,
    )
    url = url_t.format(**ns)
    fn = fn_t.format(**ns)

    if os.path.exists(fn):
        # already downloaded
        return fn

    # Write to a side file first: an interrupted download must never leave a
    # truncated file at `fn`, because the existence check above would then
    # treat it as complete on every later call.
    tmp_fn = fn + '.part'

    # download the gzip-compressed json data
    resp = requests.get(url, stream=True)
    try:
        resp.raise_for_status()
        with gzip.open(resp.raw, mode='rb') as r, open(tmp_fn, 'wb') as w:
            shutil.copyfileobj(r, w)
    except BaseException:
        # drop the partial output (also on KeyboardInterrupt), then re-raise
        if os.path.exists(tmp_fn):
            os.remove(tmp_fn)
        raise
    finally:
        # gzip does not close a file object it was handed, so release the
        # streamed connection explicitly
        resp.close()

    # publish the finished file under its final name in one step
    os.replace(tmp_fn, fn)
    return fn
    
def fetch_range(ex, start=day1, hours=24):
    """Download a run of consecutive hourly datasets in parallel.

    One `fetch_one` call per hour is handed to the executor's `map`,
    beginning at `start` and covering `hours` hourly steps.

    ex: Executor
    start: datetime of the first hour to fetch
    hours: integer, number of hourly files to fetch

    Returns whatever `ex.map` returns for the `fetch_one` calls.
    """
    one_hour = timedelta(hours=1)
    # lazily produce start, start + 1h, start + 2h, ...
    hourly_timestamps = (start + offset * one_hour for offset in range(hours))
    return ex.map(fetch_one, hourly_timestamps)


In [None]:
from concurrent.futures import ThreadPoolExecutor
# Thread pool handed to fetch_range to run the downloads concurrently;
# created with the library's default number of workers.
ex = ThreadPoolExecutor()

In [None]:
%%time

# Running sum of the sizes of the fetched files, in units of 2**20 bytes.
total = 0

# Fetch the default range (24 hours starting at day1) and report each file's
# size on disk as it becomes available.
for fn in fetch_range(ex):
    MB = os.stat(fn).st_size / 2**20
    print("%6.1f MB %s" % (MB, fn))
    total += MB
print("%6.1f MB total" % total)

# glob returns matches in arbitrary order; sort so that the file list (and in
# particular filenames[0]) is the same on every run and every machine.
filenames = sorted(glob(os.path.join(data, '*.json')))

In [None]:
import json
import pprint

# Preview up to the first 10 events of the first data file.
# The encoding is given explicitly: JSON text is UTF-8, while the platform
# default encoding used by a bare open() may not be.
with open(filenames[0], encoding='utf-8') as f:
    # zip with range(10) stops after 10 lines, or earlier at end of file, so a
    # short file no longer feeds an empty string to json.loads
    for _, line in zip(range(10), f):
        event = json.loads(line)
        event['payload'] = 'redacted' # payload is big
        pprint.pprint(event)