## 通过某个字段将记录分组
itertools.groupby()

In [1]:
# Sample records to group: each row is a dict with an 'address' and a 'date' string.
rows = [
    {'address': '5412 N CLARK', 'date': '07/01/2012'},
    {'address': '5148 N CLARK', 'date': '07/04/2012'},
    {'address': '5800 E 58TH', 'date': '07/02/2012'},
    {'address': '2122 N CLARK', 'date': '07/03/2012'},
    {'address': '5645 N RAVENSWOOD', 'date': '07/02/2012'},
    {'address': '1060 W ADDISON', 'date': '07/02/2012'},
    {'address': '4801 N BROADWAY', 'date': '07/01/2012'},
    {'address': '1039 W GRANVILLE', 'date': '07/04/2012'},
]

In [2]:
from operator import itemgetter
from itertools import groupby
# groupby() only groups consecutive items with equal keys, so sort by the
# grouping key first. Note that list.sort() reorders `rows` in place.
rows.sort(key=itemgetter('date'))

In [3]:
rows

[{'address': '5412 N CLARK', 'date': '07/01/2012'},
 {'address': '4801 N BROADWAY', 'date': '07/01/2012'},
 {'address': '5800 E 58TH', 'date': '07/02/2012'},
 {'address': '5645 N RAVENSWOOD', 'date': '07/02/2012'},
 {'address': '1060 W ADDISON', 'date': '07/02/2012'},
 {'address': '2122 N CLARK', 'date': '07/03/2012'},
 {'address': '5148 N CLARK', 'date': '07/04/2012'},
 {'address': '1039 W GRANVILLE', 'date': '07/04/2012'}]

In [7]:
# For every run of rows sharing a date, print the date once and then
# each row of that run, indented by two spaces.
for date, items in groupby(rows, key=itemgetter('date')):
    print(date)
    for row in items:
        print(f'  {row}')

07/01/2012
  {'address': '5412 N CLARK', 'date': '07/01/2012'}
  {'address': '4801 N BROADWAY', 'date': '07/01/2012'}
07/02/2012
  {'address': '5800 E 58TH', 'date': '07/02/2012'}
  {'address': '5645 N RAVENSWOOD', 'date': '07/02/2012'}
  {'address': '1060 W ADDISON', 'date': '07/02/2012'}
07/03/2012
  {'address': '2122 N CLARK', 'date': '07/03/2012'}
07/04/2012
  {'address': '5148 N CLARK', 'date': '07/04/2012'}
  {'address': '1039 W GRANVILLE', 'date': '07/04/2012'}


#### 看看items是什么

In [10]:
# Show each key together with the raw group object itself (without iterating it).
for date, items in groupby(rows, key=itemgetter('date')):
    print(date, items, sep='\n')

07/01/2012
<itertools._grouper object at 0x0000020592D93C18>
07/02/2012
<itertools._grouper object at 0x0000020592D93080>
07/03/2012
<itertools._grouper object at 0x0000020592D93C18>
07/04/2012
<itertools._grouper object at 0x0000020592D93080>


groupby() 函数扫描整个序列，并查找连续的相同值（或者 key 函数返回值相同）的元素序列。在每次迭代时，它会返回一个值和一个迭代器对象，这个迭代器对象可以生成该分组中的所有元素，即键值都等于上面那个值的元素。

In [8]:
# The call itself evaluates to an itertools.groupby object (see the output below),
# not to a list of groups.
groupby(rows, key=itemgetter('date'))

<itertools.groupby at 0x20592d71a98>

一个非常重要的准备步骤是先根据指定的字段将数据排序。因为 groupby() 仅仅检查连续的元素，如果事先没有排好序，同一个键的记录就会被拆分到多个分组中。
下面是不进行排序时的结果:

In [9]:
# The same eight records, deliberately kept in their original, unsorted order.
rows_test = [
    {'address': '5412 N CLARK', 'date': '07/01/2012'},
    {'address': '5148 N CLARK', 'date': '07/04/2012'},
    {'address': '5800 E 58TH', 'date': '07/02/2012'},
    {'address': '2122 N CLARK', 'date': '07/03/2012'},
    {'address': '5645 N RAVENSWOOD', 'date': '07/02/2012'},
    {'address': '1060 W ADDISON', 'date': '07/02/2012'},
    {'address': '4801 N BROADWAY', 'date': '07/01/2012'},
    {'address': '1039 W GRANVILLE', 'date': '07/04/2012'},
]

# Group without sorting first: print each key that groupby() yields,
# followed by the rows of that group indented by two spaces.
for date, items in groupby(rows_test, key=itemgetter('date')):
    print(date)
    for row in items:
        print(f'  {row}')

07/01/2012
  {'address': '5412 N CLARK', 'date': '07/01/2012'}
07/04/2012
  {'address': '5148 N CLARK', 'date': '07/04/2012'}
07/02/2012
  {'address': '5800 E 58TH', 'date': '07/02/2012'}
07/03/2012
  {'address': '2122 N CLARK', 'date': '07/03/2012'}
07/02/2012
  {'address': '5645 N RAVENSWOOD', 'date': '07/02/2012'}
  {'address': '1060 W ADDISON', 'date': '07/02/2012'}
07/01/2012
  {'address': '4801 N BROADWAY', 'date': '07/01/2012'}
07/04/2012
  {'address': '1039 W GRANVILLE', 'date': '07/04/2012'}


### defaultdict 键映射多个值的字典
from collections import defaultdict

In [16]:
from collections import defaultdict
# With list as the default factory, each key collects its values in a list:
# insertion order is kept and duplicates are allowed.
d = defaultdict(list)
for key, value in [('a', 1), ('a', 2), ('b', 6)]:
    d[key].append(value)
d

defaultdict(list, {'a': [1, 2], 'b': [6]})

In [18]:
# With set as the default factory, each key collects its values in a set:
# unordered, and every value is stored only once.
d = defaultdict(set)
for key, value in [('a', 5), ('b', 2), ('b', 6)]:
    d[key].add(value)
d

defaultdict(set, {'a': {5}, 'b': {2, 6}})

setdefault(key, default)：如果键不存在，就先把 default 作为该键的值存入字典；无论键是否已存在，都返回该键当前对应的值。因此普通字典也可以用它实现一个键映射多个值。

In [23]:
# Plain-dict alternative: setdefault() inserts the given default when the key
# is missing and returns the value stored under the key.
d = dict()
d.setdefault('a', list()).append(1)
d.setdefault('b', set()).add(2)  # an empty set must be written set(); {} creates a dict
d

{'a': [1], 'b': {2}}

In [25]:
# 'a' is already present in d, so index it directly and append to its list.
d['a'].append(5)
d

{'a': [1, 5], 'b': {2}}

### 用defaultdict处理按字段将记录分组

In [27]:
from collections import defaultdict
# Bucket the rows by date: each date string maps to the list of rows carrying it.
rows_by_date = defaultdict(list)
for record in rows:
    day = record['date']
    rows_by_date[day].append(record)
rows_by_date

defaultdict(list,
            {'07/01/2012': [{'address': '5412 N CLARK', 'date': '07/01/2012'},
              {'address': '4801 N BROADWAY', 'date': '07/01/2012'}],
             '07/02/2012': [{'address': '5800 E 58TH', 'date': '07/02/2012'},
              {'address': '5645 N RAVENSWOOD', 'date': '07/02/2012'},
              {'address': '1060 W ADDISON', 'date': '07/02/2012'}],
             '07/03/2012': [{'address': '2122 N CLARK', 'date': '07/03/2012'}],
             '07/04/2012': [{'address': '5148 N CLARK', 'date': '07/04/2012'},
              {'address': '1039 W GRANVILLE', 'date': '07/04/2012'}]})