# 群組資料和群組運算
    - 將資料集分組,並對各個群組進行運算,是資料分析的重要一環
    - 完成載入資料、合併資料、清理資料之後,接下來就是進行群組分析,產生樞紐分析表和視覺化圖表
    - pandas提供groupby的介面,使我們可以切割,組合,分析資料
    
## GroupBy 機制
    - split-apply-combine(分割-套用-合併)

![group aggregation](aggregation.png)

    - Grouping key可以有非常多的格式
        * list或者是array, 必需和被分析資料的長度一樣
        * DataFrame的欄位名稱
        * dictionary 或 Series
        * function

In [None]:
import numpy as np
import pandas as pd

# Sample frame: two string key columns and two columns of random floats.
# The pasted outputs below come from one particular random draw, so the
# numbers will differ on every run.
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': np.random.randn(5),
                   'data2': np.random.randn(5)})
df
'''
Out[22]: 
  key1 key2     data1     data2
0    a  one  0.840432  1.395391
1    a  two  0.032048  1.732521
2    b  one  1.309441  0.444729
3    b  two  0.434163 -0.929048
4    a  one -0.748641  0.560896
'''


# Mean of data1 for each value of key1.
# groupby(Series): grouping a Series by another Series returns a GroupBy
# object. Nothing has been computed yet; it is only an intermediate object.

grouped = df['data1'].groupby(df['key1'])
grouped

#Out[23]: <pandas.core.groupby.generic.SeriesGroupBy object at 0x7ff541ddb190>

# Compute the mean.
# The result is a brand-new Series indexed by the unique values of df['key1'].
grouped.mean()
'''
Out[24]: 
key1
a    0.041280
b    0.871802
Name: data1, dtype: float64
'''


# groupby(list of Series)
# Returns a Series with a hierarchical (two-level) index.
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means

'''
Out[25]: 
key1  key2
a     one     0.045896
      two     0.032048
b     one     1.309441
      two     0.434163
Name: data1, dtype: float64
'''

# Pivot the inner index level into columns to get a table.
means.unstack()
'''
Out[26]: 
key2       one       two
key1                    
a     0.045896  0.032048
b     1.309441  0.434163
'''

# groupby([ndarray, ndarray])
# The key arrays are not part of df; they only need to have the same length
# as the data being grouped.
states = np.array(['台北','台中','台中','台北','台北'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby([states, years]).mean()
'''
Out[27]: 
台中  2005    0.032048
    2006    1.309441
台北  2005    0.637298
    2006   -0.748641
Name: data1, dtype: float64

'''

# The most common case: the grouping information lives in a column of the
# same frame, so the column name can be passed directly.
# groupby(column label)
# Careful: this must be called on df, not on df['data1'], because only the
# DataFrame contains a column named 'key1'.
# 'key2' holds strings, so it cannot be averaged. Since pandas 2.0, mean()
# no longer drops such columns silently (it raises TypeError), so
# numeric_only=True is passed to exclude non-numeric columns explicitly.

key1_means = df.groupby('key1').mean(numeric_only=True)
key1_means
'''
Out[28]: 
         data1     data2
key1                    
a     0.041280  1.229603
b     0.871802 -0.242160
'''
# groupby([column label, column label])
# Both string columns are grouping keys here, so only numeric columns remain
# to be aggregated.
df.groupby(['key1','key2']).mean()
'''
Out[29]: 
              data1     data2
key1 key2                    
a    one   0.045896  0.978144
     two   0.032048  1.732521
b    one   1.309441  0.444729
     two   0.434163 -0.929048


'''
# size() reports the number of rows in each group.
# Note: rows whose group key is missing (NaN) are excluded from the result
# by default.
df.groupby(['key1', 'key2']).size()




#### 一個一個讀取groupby物件內的值

In [None]:
# A GroupBy object supports for-in iteration; each step yields a 2-tuple of
# (group name, chunk of data belonging to that group).

df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': np.random.randn(5),
                   'data2': np.random.randn(5)})
df

# Yields the group name and the rows contained in that group (no aggregation
# has been applied yet).
for name,group in df.groupby('key1'):
    print(name)
    print(group)

'''
a
  key1 key2     data1     data2
0    a  one  0.840432  1.395391
1    a  two  0.032048  1.732521
4    a  one -0.748641  0.560896
b
  key1 key2     data1     data2
2    b  one  1.309441  0.444729
3    b  two  0.434163 -0.929048
'''

# With several keys, the group name is a tuple of key values.
for (k1, k2),group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)

'''
('a', 'one')
  key1 key2     data1     data2
0    a  one  0.840432  1.395391
4    a  one -0.748641  0.560896
('a', 'two')
  key1 key2     data1     data2
1    a  two  0.032048  1.732521
('b', 'one')
  key1 key2     data1     data2
2    b  one  1.309441  0.444729
('b', 'two')
  key1 key2     data1     data2
3    b  two  0.434163 -0.929048
'''

# Use list() / dict() to materialize the groups.
list(df.groupby('key1'))
'''
[('a',   key1 key2     data1     data2
  0    a  one -0.054250  2.523135
  1    a  two  0.754894 -0.214982
  4    a  one -0.309069  0.659420), 
  ('b',   key1 key2     data1     data2
  2    b  one -0.721992  0.146259
  3    b  two -0.680506 -1.198810)]
'''

# Mapping of group name -> sub-frame, for direct lookup by key.
pieces = dict(list(df.groupby('key1')))
pieces['b']
'''
Out[32]: 
  key1 key2     data1     data2
2    b  one  1.309441  0.444729
3    b  two  0.434163 -0.929048
'''

pieces['a']
'''
Out[33]: 
  key1 key2     data1     data2
0    a  one  0.840432  1.395391
1    a  two  0.032048  1.732521
4    a  one -0.748641  0.560896
'''

# groupby splits along the rows (axis=0) by default. Grouping the columns
# used to be written groupby(..., axis=1), but that keyword is deprecated
# since pandas 2.1. Instead, transpose the frame so the original columns
# become rows, group those rows, and transpose each group back.
df.dtypes
'''
Out[34]: 
key1      object
key2      object
data1    float64
data2    float64
dtype: object
'''
grouped = df.T.groupby(df.dtypes)
for dtype, group in grouped:
    print(dtype)
    # Transposing a mixed-dtype frame produces object columns;
    # infer_objects() restores the numeric dtype for display.
    print(group.T.infer_objects())
'''
float64
      data1     data2
0  0.840432  1.395391
1  0.032048  1.732521
2  1.309441  0.444729
3  0.434163 -0.929048
4 -0.748641  0.560896
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one
'''



#### groupby後,選取一個欄位或多個欄位

In [8]:
import numpy as np
import pandas as pd

# Demo frame: two label columns plus two columns of random floats.
df = pd.DataFrame(
    {
        'key1': ['a', 'a', 'b', 'b', 'a'],
        'key2': ['one', 'two', 'one', 'two', 'one'],
        'data1': np.random.randn(5),
        'data2': np.random.randn(5),
    }
)
df

# Indexing a GroupBy with a single label selects one column;
# equivalent to df['data1'].groupby(df['key1']).
df.groupby(by='key1')['data1']

'''
<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fb0703e4c40>
'''

# Indexing with a list of labels keeps a frame-shaped selection;
# equivalent to df[['data1']].groupby(df['key1']).
df.groupby(by='key1')[['data1']]
'''
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7ff261b4f890>
'''

# Aggregate only data2 for every (key1, key2) combination.
df.groupby(by=['key1', 'key2'])[['data2']].mean()
'''
Out[27]: 
              data2
key1 key2          
a    one   0.388754
     two   0.995239
b    one   1.204072
     two  -1.617815
'''



key1
a    3
b    2
dtype: int64

#### 使用dictionary and Series群組

In [14]:
# 5x5 frame of random floats with named rows; two cells in the third row are
# then blanked out to show how aggregations treat missing values.
people = pd.DataFrame(np.random.randn(5,5),
                     columns=['a', 'b', 'c', 'd', 'e'],
                     index=['國堂','瑞彤','家渝','建全','子瑜'])
people.iloc[2:3, [1, 2]] = np.nan
people
'''
Out[28]: 
           a         b         c         d         e
國堂  0.831007  0.794457  0.127571 -0.515541 -1.573255
瑞彤  2.047594  0.306150 -1.109582 -1.304595 -0.174706
家渝  0.082868       NaN       NaN -0.759848  0.990540
建全 -0.230918  2.648930 -0.151027  1.017882  0.759116
子瑜 -3.171536  0.650561 -0.487912 -1.027657  1.249068
'''

# Column label -> group name. 'f' does not match any column of `people`;
# the pasted output below shows that such an unused key is simply ignored.
mapping = {'a':'red', 'b':'red', 'c':'blue', 'd':'blue', 'e':'red', 'f':'orange'}

# Group the columns by colour. groupby(..., axis=1) is deprecated since
# pandas 2.1, so transpose first (columns become rows), group the rows with
# the mapping, aggregate, and transpose the result back.
by_column = people.T.groupby(mapping)
column_sums = by_column.sum().T
column_sums
'''
Out[29]: 
        blue       red
國堂 -0.387971  0.052209
瑞彤 -2.414177  2.179037
家渝 -0.759848  1.073408
建全  0.866855  3.177127
子瑜 -1.515569 -1.271908
'''

# The same mapping expressed as a Series (index = column label).
map_series = pd.Series(mapping)
map_series
'''
Out[30]: 
a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object
'''

# count() tallies the non-NaN cells per colour group for every row.
column_counts = people.T.groupby(map_series).count().T
column_counts
'''
Out[31]: 
    blue  red
國堂     2    3
瑞彤     2    3
家渝     1    2
建全     2    3
子瑜     2    3
'''

Unnamed: 0,blue,red
國堂,2,3
瑞彤,2,3
家渝,1,2
建全,2,3
子瑜,2,3


#### 使用function 群組

#### Homework1
[homework1](https://github.com/roberthsu2003/PythonForDataAnalysis/blob/master/%E7%BE%A4%E7%B5%84%E8%B3%87%E6%96%99%E5%92%8C%E7%BE%A4%E7%B5%84%E9%81%8B%E7%AE%97/Homework1.ipynb)