# Solutions

## #1: Display the total number of watchers per language (ignore repos w/o a language)

### Solution

In [1]:
from urllib.request import urlopen
from itertools import groupby
from operator import itemgetter
from ijson import items

url2 = 'https://api.github.com/search/repositories?q=data'

# Used both to drop repos without a language (their `language` is falsy) and
# as the sort/group key.
keyfunc = itemgetter('language')

# The response holds a network connection, so open it as a context manager to
# guarantee it is closed. ijson's `items` yields objects as it reads the
# stream, so everything that reads from it has to run inside the `with` block.
with urlopen(url2) as f:
    repos = items(f, 'items.item')
    cleaned = filter(keyfunc, repos)

    # `groupby` only merges *adjacent* records with equal keys, so sort first.
    # `sorted` also drains the stream before the response gets closed.
    records = sorted(cleaned, key=keyfunc)

grouped = groupby(records, keyfunc)

# Print one "<language> <total watchers>" line per language.
for key, group in grouped:
    cnt = sum(g['watchers'] for g in group)
    print(key, cnt)

C# 35
C++ 64
HTML 352
JavaScript 4702
Jupyter Notebook 5573
PHP 129
Python 16235
R 18


## #2: Language with the most watchers for each combination of `owner_type` and `has_pages`

### Solution

In [22]:
from urllib.request import urlopen
from operator import itemgetter
from functools import partial
from meza import process as pr, fntools as ft
from meza.io import read_json

# Search GitHub for repos matching "data", sorted by stars in descending order.
url4 = 'https://api.github.com/search/repositories?q=data&sort=stars&order=desc'
# NOTE(review): `f` is never closed. Presumably `read_json` reads the response
# lazily, in which case it must stay open until later cells have consumed the
# records -- confirm before wrapping this in a `with` block.
f = urlopen(url4)
# `path` presumably selects each element of the response's top-level `items`
# array (the same 'items.item' prefix solution #1 passes to ijson).
records = read_json(f, path='items.item')

# repos without a language have a value of None, which meza doesn't like,
# so fill the `language` field with an empty string instead
filled = pr.fillempty(records, value='', fields=['language'])
# `peek` hands back the records together with a preview of them;
# `preview[0]` is the first record
filled, preview = pr.peek(filled)
preview[0]

{'archive_url': 'https://api.github.com/repos/d3/d3/{archive_format}{/ref}',
 'assignees_url': 'https://api.github.com/repos/d3/d3/assignees{/user}',
 'blobs_url': 'https://api.github.com/repos/d3/d3/git/blobs{/sha}',
 'branches_url': 'https://api.github.com/repos/d3/d3/branches{/branch}',
 'clone_url': 'https://github.com/d3/d3.git',
 'collaborators_url': 'https://api.github.com/repos/d3/d3/collaborators{/collaborator}',
 'comments_url': 'https://api.github.com/repos/d3/d3/comments{/number}',
 'commits_url': 'https://api.github.com/repos/d3/d3/commits{/sha}',
 'compare_url': 'https://api.github.com/repos/d3/d3/compare/{base}...{head}',
 'contents_url': 'https://api.github.com/repos/d3/d3/contents/{+path}',
 'contributors_url': 'https://api.github.com/repos/d3/d3/contributors',
 'created_at': '2010-09-27T17:22:42Z',
 'default_branch': 'master',
 'deployments_url': 'https://api.github.com/repos/d3/d3/deployments',
 'description': 'Bring data to life with SVG, Canvas and HTML. :bar_chart

In [23]:
# meza doesn't do well with nested dicts, so turn every record into a plain
# single-level dict (presumably this is what exposes the nested owner type as
# the `owner_type` field used below). Both `map` calls are lazy, so nothing is
# read until the records are consumed.
flat = map(dict, map(ft.flatten, filled))
flat, preview = pr.peek(flat)
preview[0]

{'archive_url': 'https://api.github.com/repos/d3/d3/{archive_format}{/ref}',
 'assignees_url': 'https://api.github.com/repos/d3/d3/assignees{/user}',
 'blobs_url': 'https://api.github.com/repos/d3/d3/git/blobs{/sha}',
 'branches_url': 'https://api.github.com/repos/d3/d3/branches{/branch}',
 'clone_url': 'https://github.com/d3/d3.git',
 'collaborators_url': 'https://api.github.com/repos/d3/d3/collaborators{/collaborator}',
 'comments_url': 'https://api.github.com/repos/d3/d3/comments{/number}',
 'commits_url': 'https://api.github.com/repos/d3/d3/commits{/sha}',
 'compare_url': 'https://api.github.com/repos/d3/d3/compare/{base}...{head}',
 'contents_url': 'https://api.github.com/repos/d3/d3/contents/{+path}',
 'contributors_url': 'https://api.github.com/repos/d3/d3/contributors',
 'created_at': '2010-09-27T17:22:42Z',
 'default_branch': 'master',
 'deployments_url': 'https://api.github.com/repos/d3/d3/deployments',
 'description': 'Bring data to life with SVG, Canvas and HTML. :bar_chart

In [24]:
# Positional arguments shared by `pivot` here and by `normalize` in the next cell:
# `watchers` is the pivot field to aggregate by (its values are summed)
# `language` is the pivot field to group by (its values become the new keys)
args = ('watchers', 'language')

# the pivot fields we want to include in each row
rows = ['has_pages', 'owner_type']
# `op=sum` adds up the watchers. Judging by the preview, each pivoted record
# holds the `rows` fields plus one key per language with its watcher total.
pivotted = pr.pivot(flat, *args, rows=rows, op=sum)
pivotted, preview = pr.peek(pivotted)
preview[0]

{'C#': 7772,
 'C++': 58473,
 'Go': 13510,
 'Objective-C': 10702,
 'Ruby': 7504,
 'Swift': 27142,
 'has_pages': False,
 'owner_type': 'Organization'}

In [25]:
# Turn the wide, one-key-per-language records back into narrow records; as the
# preview shows, each one has `has_pages`, `owner_type`, `language` and `watchers`.
# `rows` are the fields we don't want to normalize (since `invert` is true)
kwargs = {'rows': rows, 'invert': True}

# `args` is the same tuple that was passed to `pivot`:
# `watchers` is the field to use for the normalized values
# `language` is the field to use for the normalized key
normal = pr.normalize(pivotted, *args, **kwargs)
normal, preview = pr.peek(normal)
preview[0]

{'has_pages': False,
 'language': 'Objective-C',
 'owner_type': 'Organization',
 'watchers': 10702}

In [26]:
# Ranks the records inside a group: compare them by their `watchers` value.
agg_keyfunc = itemgetter('watchers')


def group_keyfunc(record):
    """Return the record's `rows` values (`has_pages`, `owner_type`) as a tuple."""
    return tuple(record[field] for field in rows)


# Collapse each group to its single record with the highest watcher count.
aggregator = partial(max, key=agg_keyfunc)

# `tupled=False`: only emit the aggregated groups, not the group keys.
kwargs = dict(tupled=False, aggregator=aggregator)

grouped = pr.group(normal, group_keyfunc, **kwargs)
grouped, preview = pr.peek(grouped)
preview[0]

{'has_pages': False,
 'language': 'C++',
 'owner_type': 'Organization',
 'watchers': 58473}

In [27]:
from pprint import pprint

# Materialize the winning record of every group, then order them in place
# from most to fewest watchers.
sgrouped = list(grouped)
sgrouped.sort(key=agg_keyfunc, reverse=True)

# Pretty-print one record per (`has_pages`, `owner_type`) group.
for record in sgrouped:
    pprint(record)

{'has_pages': True,
 'language': 'JavaScript',
 'owner_type': 'Organization',
 'watchers': 130152}
{'has_pages': False,
 'language': 'C++',
 'owner_type': 'Organization',
 'watchers': 58473}
{'has_pages': False,
 'language': 'Python',
 'owner_type': 'User',
 'watchers': 48543}
{'has_pages': True,
 'language': 'JavaScript',
 'owner_type': 'User',
 'watchers': 10285}
