diff --git a/pep-9999.rst b/pep-9999.rst new file mode 100644 index 00000000000..3b1ba9aeae6 --- /dev/null +++ b/pep-9999.rst @@ -0,0 +1,530 @@ +PEP: 9999 +Title: dict-grouping +Author: Michael Selik +Status: Draft +Type: Standards Track +Content-Type: text/x-rst +Created: 28-Jun-2018 +Python-Version: 3.8 +Post-History: 28-Jun-2018 + + + +*Last revised 01-Jul-2018* + + + +Abstract +======== + +This is a proposal for creating a concise, reliable way to group +elements of a sequence, resulting in a dictionary of lists. + + + +Specification +============= + +A new built-in function, ``grouping``, will construct a dictionary of +lists based on an iterable and a key-function. + +:: + + from itertools import groupby as _groupby + + def grouping(iterable, key=None): + ''' + Group elements of an iterable into a dict of lists. + + The ``key`` is a function computing a key value for each + element. Each key corresponds to a group -- a list of elements + in the same order as encountered. By default, the key will be + the element itself. + + >>> grouping('AbBa', key=str.casefold) + {'a': ['A', 'a'], 'b': ['b', 'B']} + + ''' + groups = {} + for k, g in _groupby(iterable, key): + groups.setdefault(k, []).extend(g) + return groups + + + +Motivation +========== + +Grouping, categorizing, classifying, bucketing, or demultiplexing is a +fundamental concept in programming and analysis. It deserves a tool in +Python that is succinct, efficient, and easily accessible. + +:: + + >>> names = ['John', 'Paul', 'George', 'Ringo'] + >>> grouping(names, key=len) + {4: ['John', 'Paul'], 6: ['George'], 5: ['Ringo']} + + +Comparisons +----------- + +:: + + d = grouping(names, key=len) + + +``dict`` +~~~~~~~~ + +A "naive" solution with the built-in ``dict``. + +:: + + d = {} + for value in names: + k = len(value) + if k not in d: + d[k] = [] + d[k].append(value) + + +``dict.setdefault`` +~~~~~~~~~~~~~~~~~~~ + +A better solution with the ``setdefault`` method. + +:: + + d = {} + for value in names: + d.setdefault(len(value), []).append(value) + + +``itertools.groupby`` +~~~~~~~~~~~~~~~~~~~~~ + +While elegent in its use-case, ``groupby`` here runs in O(n log n) time. +It also requires that the keys are comparable. + +:: + + from itertools import groupby + + d = {k: list(group) for k, group in groupby(sorted(names, key=len), key=len)} + + +``collections.defaultdict`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To avoid accidental inserts later in the code, it can be desirable to +convert a ``defaultdict`` to a built-in ``dict`` [#]_. + +:: + + from collections import defaultdict + + d = defaultdict(list) + for value in names: + d[len(value)].append(value) + d = dict(d) + + +Examples +-------- + +Group words by length:: + + grouping(words, key=len) + + +Group names by first initial:: + + grouping(names, key=itemgetter(0)) + + +Group people by city:: + + grouping(contacts, key=itemgetter('city') + + +Group employees by department:: + + grouping(employees, key=itemgetter('department')) + + +Group files by extension:: + + grouping(os.listdir('.'), key=lambda filepath: os.path.splitext(filepath)[1]) + + +Group transactions by type:: + + grouping(transactions, key=lambda v: 'debit' if v > 0 else 'credit') + + +Invert a dictionary, ``d``, without discarding repeated values:: + + grouping(d, key=lambda k: d[k]) + + +Sequences of values that are already paired with their keys can be +easily transformed after grouping. + +:: + + >>> foods = [ + ... ('fruit', 'apple'), + ... ('vegetable', 'broccoli'), + ... ('fruit', 'clementine'), + ... ('vegetable', 'daikon') + ... ] + >>> groups = grouping(foods, key=lambda pair: pair[0]) + >>> {k: [v for _, v in g] for k, g in groups.items()} + {'fruit': ['apple', 'clementine'], 'vegetable': ['broccoli', 'daikon']} + + +Stateful key-functions enable sophisticated structures, such as creating +a transition table (finite state machine) from a sequence of events:: + + t0 = None + def previous(t1): + global t0 + x, t0 = t0, t1 + return x + + transitions = grouping(sequence, key=previous) + + +Aggregation +~~~~~~~~~~~ + +Group averages:: + + from csv import DictReader + from statistics import mean + from operator import itemgetter + + with open('income.csv') as f: + rows = DictReader(f) + by_state = grouping(rows, key=itemgetter('state')) + averages = {state: mean(row['income']) for state, row in by_state.items()} + + +Clustering:: + + clusters = grouping(rows, key=lambda row: nearest(row, centroids)) + + +The map-reduce paradigm is comparable to a group-and-aggregate:: + + g = grouping(sequence, key=mapper) + results = {k: reducer(group) for k, group in g.items()} + + +Uniques within each group:: + + groups = grouping(sequence, keyfunc) + {k: set(g) for k, g in groups.items()} + +Counts within each group:: + + groups = grouping(sequence, keyfunc) + {k: Counter(g) for k, g in groups.items()} + + + +Rationale +========= + +Humans think in taxonomies. In teaching Python, I've found that many +students will ask how to construct groups very early in the process of +learning the language. If they've used SQL, they're used to the GROUP +BY clause. If they've used Excel, they're used to writing row or column +labels as the first step in building a spreadsheet. + +Unfortunately, the three tools currently available for creating groups +in Python -- ``setdefault``, ``defaultdict``, and ``groupby`` -- invite +discussions of concepts that a teacher usually prefers to postpone until +after core skills like sorting and grouping. + +This proposal was inspired by similar tools available in other languages +and in Python community libraries. + +The prevalence of similar tools in so many community libraries suggests +that Python has not yet provided a satisfactory tool and that grouping +is significant enough to belong in the built-ins. + + +Other Languages +--------------- + +Java +~~~~ + +Java's ``Collectors.groupingBy`` [#]_ consumes a stream and creates a +``Map>`` associating keys with lists of values. + + +.NET +~~~~ + +C#'s ``Enumerable.GroupBy`` [#]_ is similar to Python's ``itertools.groupby`` +in that it's an iterator yielding groups that implement the +(``IGrouping``) interface. Despite claiming deferred +execution, ``Enumerable.GroupBy`` emits complete groups even if the +input sequence was not sorted by key. Additionally, it allows a +transform function for the grouped values in addition to a key function. + + +Clojure +~~~~~~~ + +Clojure has ``group-by`` [#]_, which is nearly identical to this proposal: +``(group-by f coll)``. The choice of the name begs a different order for +the parameters as well, as the phrase "group by key" is quite natural, +inviting a curry. + +:: + + user=> (group-by first ["python" "jython" "cython" "pypy" "cpython"]) + {\p ["python" "pypy"], \j ["jython"], \c ["cython" "cpython"]} + + +Community Libraries +------------------- + +Toolz +~~~~~ + +Toolz' ``groupby`` [#]_ requires the key-function as the first positional +parameter and the sequence to be grouped as the second. This order may +be more natural if a key-function is always necessary. However, it +breaks the pattern established by builtins ``sorted``, ``min``, ``max``, +and standard library ``itertools.groupby``, which all have the sequence +as the first parameter. + +:: + + >>> names = ['Alice', 'Bob', 'Charlie', 'Dan', 'Edith', 'Frank'] + >>> groupby(len, names) + {3: ['Bob', 'Dan'], 5: ['Alice', 'Edith', 'Frank'], 7: ['Charlie']} + + +Pandas +~~~~~~ + +While Pandas may be most famous for its ``DataFrame``, the better +comparison in this situation would be ``Series.groupby`` [#]_. + +:: + + In [1]: import pandas as pd + + In [2]: def mod(x): + ...: def modulo(n): + ...: return n % x + ...: return modulo + ...: + + In [3]: pd.Series(range(10)).groupby(mod(2)).groups + Out[3]: + {0: Int64Index([0, 2, 4, 6, 8], dtype='int64'), + 1: Int64Index([1, 3, 5, 7, 9], dtype='int64')} + +As with Clojure, it fits naturally with the phrase, "group by key." +Using ``Series.groupby`` as an unbound method does not read nearly as +well. + +:: + + In [12]: pd.Series.groupby(numbers, mod(2)).groups + Out[12]: + {0: Int64Index([0, 2, 4, 6, 8], dtype='int64'), + 1: Int64Index([1, 3, 5, 7, 9], dtype='int64')} + +The ``DataFrame.groupby`` handles an interesting sub-category of usage, +when each element of the input sequence is itself a sequence with one or +many key-elements and one or many value-elements. In some cases, the +key-elements should be dropped from these sequences when grouping. + +:: + + >>> sequence = [[1, 11, 12], [1, 13, 14], [2, 21, 22], [2, 23, 24]] + >>> grouping(sequence, key=lambda row: row.pop(0)) + {1: [[11, 12], [13, 14]], 2: [[21, 22], [23, 24]]} + + +Rejected Alternatives +--------------------- + + +``key=itemgetter(0)`` +~~~~~~~~~~~~~~~~~~~~~ + +The default equality key-function is not of much practical use. In +``itertools.groupby`` the equality default can be used for finding +"runs" of duplicate values. In ``grouping``, it is nearly a duplicate +of ``collections.Counter``, though it might have a use for some unusual +types of data that compare equal even if some attributes have different +values. + +Using a default key-function of ``itemgetter(0)`` would enable a more +useful default behavior that elegantly handles iterables of ``(key, +value)`` pairs. + +:: + + from itertools import groupby as _groupby + from operator import itemgetter + + def grouping(iterable, key=itemgetter(0)): + ''' + Group elements of an iterable into a dict of lists. + + The ``key`` is a function computing a key value for each + element. Each key corresponds to a group -- a list of elements + in the same order as encountered. + + By default, the key-function gets the 0th index of each element. + + >>> grouping(['apple', 'banana', 'aardvark']) + {'a': ['apple', 'aardvark'], 'b': ['banana']} + + ''' + groups = {} + for k, g in _groupby(iterable, key): + groups.setdefault(k, []).extend(g) + return groups + + + +``dict.groupby`` +~~~~~~~~~~~~~~~~ + +The ``grouping`` function returns a ``dict`` and could be considered an +alternative constructor for the built-in dictionary. This rationale +could be extended to say that built-in functions like ``sorted`` are +``list`` constructors and becomes absurd if taken to the extreme. + +However, using the dict namespace could provide valuable clarity if the +proposed name, "grouping", becomes an issue. The most common +alternative name from other languages, "groupby" would too easily +conflict with ``itertools.groupby`` if made a built-in function. + +While "group-by" is a common choice for programming language designers, +it is more appropriate for languages like SQL in which all operations +are on iterables (rows in SQL's case). The phrase "group by" invites +the key function as the first and preferably only argument. Python has +established a pattern for functions taking similar parameters -- +``sorted``, ``min``, ``max``, and ``itertools.groupby`` -- that the +iterable is the first argument and the key-function is the second. + +The ``sorted`` function suggests using the past participle, "grouped." +The gerund "grouping" is similarly a noun-form of the task, but has the +advantage of feeling more like a verb or action, which is more pleasant +for a function name. + + +``collections.Grouping`` +~~~~~~~~~~~~~~~~~~~~~~~~ + +A new class in the collections module has some advantages. In a sense, +``Grouping`` is a special case of ``defaultdict``, but a general case of +``Counter``. Other possible names are ``Grouper`` or ``GroupBy``. It +could provide ``map`` and ``aggregate`` methods, which define an +interface for classes that provide a different internal data structure. +However, transforming and aggregating the groups can be performed as an +expressive dictionary comprehension, with more flexibility, and perhaps +more clarity than passing a function to a higher-order method. + +:: + + {k: func(g) for k, g in groups.items()} # aggregate + {k: [func(v) for v in g] for k, g in groups.items()} # map + +Merging groupings is not a one-liner, but could be included as a recipe +in the documentation. + +:: + + def merge(*groupings): + 'Combine multiple groupings into a single dict of lists' + groups = {} + for d in groupings: + for k, g in d.items(): + groups.setdefault(k, []).extend(g) + return groups + +It's hard to estimate the frequency with which programmers use the +various built-ins. Grouping is a comparable concept to many tools which +were deemed important enough to belong in the built-ins, such as +``filter`` and ``zip``. + +While importing is easy, so many Pythonistas build groups inefficiently +that ``grouping`` should not be tucked away in a module. + + + +How to Teach This +================= + +I suggest first demonstrating ``sorted`` on a list, then using +``sorted``'s key-function parameter, because sorting a list keeps the +same data type for input and output. + +:: + + >>> actors = ['Graham', 'Eric', 'Terry', 'Terry', 'John', 'Michael'] + >>> sorted(actors) + ['Eric', 'Graham', 'John', 'Michael', 'Terry', 'Terry'] + >>> sorted(actors, key=len) + ['Eric', 'John', 'Terry', 'Terry', 'Graham', 'Michael'] + + +After the students are happy with the idea of ``len`` as a sorting key, +ask them what they think ``grouping`` will do. Give them a moment to +consider the possibilities before demonstrating the results. + +:: + + >>> grouping(actors, key=len) + {4: ['Eric', 'John'], 5: ['Terry', 'Terry'], 6: ['Graham'], 7: ['Michael']} + + +``itertools.groupby`` +--------------------- + +If you have already introduced the concept of generators and/or +iterators, it would be helpful to show the differences between +``grouping`` and ``itertools.groupby``, highlighting that ``groupby`` +may yield the same key twice and that the groups are generators. + + + +References +========== + +.. [#] https://github.com/pytoolz/toolz/blob/2bd9139d0d0e17d3426cb467b5f58b1fb6d8a439/toolz/itertoolz.py#L528 +.. [#] https://docs.oracle.com/javase/8/docs/api/java/util/stream/Collectors.html +.. [#] https://msdn.microsoft.com/en-us/library/bb534304(v=vs.110).aspx +.. [#] https://clojuredocs.org/clojure.core/group-by +.. [#] http://toolz.readthedocs.io/en/latest/api.html#toolz.itertoolz.groupby +.. [#] http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.groupby.html#pandas.Series.groupby + + + +Acknowledgements +================ + +Thanks to David Mertz for suggesting the Grouping class and Chris Barker +for writing a possible implementation of it. Nicolas Rolin suggested +itemgetter(0) for the default key-function. And of course, thank you to +all who read this PEP and participated in discussion. + + + +Copyright +========= + +This document has been placed in the public domain.