# 群組資料和群組運算
    - 群組資料集和對群組運算是分析資料的重要一環
    - 從載入資料,合併資料,清理資料後,再來就是要群組分析,產生樞紐分析表和視覺圖表
    - pandas提供groupby的介面,使我們可以切割,組合,分析資料
    
## GroupBy 機制
    -split-apply-combine

![group aggregation](aggregation.png)

    - Grouping key可以有非常多的格式
        * list或者是array, 必需和被分析資料的長度一樣
        * DataFrame的欄位名稱
        * dictionary 或 Series
        * function

In [None]:
import numpy as np
import pandas as pd

df = pd.DataFrame({'key1':['a', 'a', 'b', 'b', 'a'],
                  'key2':['one', 'two', 'one', 'two', 'one'],
                  'data1': np.random.randn(5),
                  'data2':np.random.randn(5)})
df
'''
Out[22]: 
  key1 key2     data1     data2
0    a  one  0.840432  1.395391
1    a  two  0.032048  1.732521
2    b  one  1.309441  0.444729
3    b  two  0.434163 -0.929048
4    a  one -0.748641  0.560896
'''


#依據key1求取data1的平均
#groupby(Serial)
#會得到GroupBy物件,目前尚未運算,這只是個中介資料

grouped = df['data1'].groupby(df['key1'])
grouped

#Out[23]: <pandas.core.groupby.generic.SeriesGroupBy object at 0x7ff541ddb190>

#求平均
#產生全新的Serial,索引是依據groupby內的Serial值和df['key1']
grouped.mean()
'''
Out[24]: 
key1
a    0.041280
b    0.871802
Name: data1, dtype: float64
'''


#groupby(list)
#傳回一個Serial和階層索引
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means

'''
Out[25]: 
key1  key2
a     one     0.045896
      two     0.032048
b     one     1.309441
      two     0.434163
Name: data1, dtype: float64
'''

#建立表格式資料
means.unstack()
'''
Out[26]: 
key2       one       two
key1                    
a     0.045896  0.032048
b     1.309441  0.434163
'''

#groupby([ndArray,ndArray])
states = np.array(['台北','台中','台中','台北','台北'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby([states, years]).mean()
'''
Out[27]: 
台中  2005    0.032048
    2006    1.309441
台北  2005    0.637298
    2006   -0.748641
Name: data1, dtype: float64

'''

#一個平常最常遇到的情形,群組化的資料會在同一個資料的欄位內,這時就可以使用欄位名稱
#groupby(欄位標籤名)
#小心,必需使用df,不可使用df['data1'],因為df的dataFrame內才有包含'key1'的欄位名
#會發現結果沒有'key2',原因是key2不是數值化資料,是字串,所以會被排除

df.groupby('key1').mean()
'''
Out[28]: 
         data1     data2
key1                    
a     0.041280  1.229603
b     0.871802 -0.242160
'''
#groupby([欄位標籤名,欄位標籤名])
df.groupby(['key1','key2']).mean()
'''
Out[29]: 
              data1     data2
key1 key2                    
a    one   0.045896  0.978144
     two   0.032048  1.732521
b    one   1.309441  0.444729
     two   0.434163 -0.929048


'''
#使用groupby物件的size()方法,了解目前group的狀況
#注意有任何的值NaN,將會被排除在運算內
df.groupby(['key1', 'key2']).size()




#### 一個一個讀取groupby物件內的值

In [None]:
#groupby物件支援for in迴圈,一般傳出包含2個物件的tuple

df = pd.DataFrame({'key1':['a', 'a', 'b', 'b', 'a'],
                  'key2':['one', 'two', 'one', 'two', 'one'],
                  'data1': np.random.randn(5),
                  'data2':np.random.randn(5)})
df

#會傳出group後的name和group name內保含的值(尚未運算)
for name,group in df.groupby('key1'):
    print(name)
    print(group)

'''
a
  key1 key2     data1     data2
0    a  one  0.840432  1.395391
1    a  two  0.032048  1.732521
4    a  one -0.748641  0.560896
b
  key1 key2     data1     data2
2    b  one  1.309441  0.444729
3    b  two  0.434163 -0.929048
'''

for (k1, k2),group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)
    
'''
('a', 'one')
  key1 key2     data1     data2
0    a  one  0.840432  1.395391
4    a  one -0.748641  0.560896
('a', 'two')
  key1 key2     data1     data2
1    a  two  0.032048  1.732521
('b', 'one')
  key1 key2     data1     data2
2    b  one  1.309441  0.444729
('b', 'two')
  key1 key2     data1     data2
3    b  two  0.434163 -0.929048
'''

#使用list,dict的技巧,取出群組的資料
list(df.groupby('key1'))
'''
[('a',   key1 key2     data1     data2
  0    a  one -0.054250  2.523135
  1    a  two  0.754894 -0.214982
  4    a  one -0.309069  0.659420), 
  ('b',   key1 key2     data1     data2
  2    b  one -0.721992  0.146259
  3    b  two -0.680506 -1.198810)]
'''

pieces = dict(list(df.groupby('key1')))
pieces['b']
'''
Out[32]: 
  key1 key2     data1     data2
2    b  one  1.309441  0.444729
3    b  two  0.434163 -0.929048
'''

pieces['a']
'''
Out[33]: 
  key1 key2     data1     data2
0    a  one  0.840432  1.395391
1    a  two  0.032048  1.732521
4    a  one -0.748641  0.560896
'''

#預設groupby是axis=0(一直列),也可以使用axis=1(一橫欄)
df.dtypes
'''
Out[34]: 
key1      object
key2      object
data1    float64
data2    float64
dtype: object
'''
grouped = df.groupby(df.dtypes, axis=1)
for dtype, group in grouped:
    print(dtype)
    print(group)
'''
float64
      data1     data2
0  0.840432  1.395391
1  0.032048  1.732521
2  1.309441  0.444729
3  0.434163 -0.929048
4 -0.748641  0.560896
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one
'''


#### groupby後,選取一個欄位或多個欄位

In [None]:
import numpy as np
import pandas as pd
df = pd.DataFrame({'key1':['a', 'a', 'b', 'b', 'a'],
                  'key2':['one', 'two', 'one', 'two', 'one'],
                  'data1': np.random.randn(5),
                  'data2':np.random.randn(5)})
df

#等同df['data1'].groupby(df['key1'])
df.groupby('key1')['data1']

'''
<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fb0703e4c40>
'''

#等同df[['data1']].groupby(df['key1'])
df.groupby('key1')[['data1']]
'''
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7ff261b4f890>
'''

df.groupby(['key1', 'key2'])[['data2']].mean()
'''
Out[27]: 
              data2
key1 key2          
a    one   0.388754
     two   0.995239
b    one   1.204072
     two  -1.617815
'''



#### 使用dictionary and Series群組

In [None]:
people = pd.DataFrame(np.random.randn(5,5),
                     columns=['a', 'b', 'c', 'd', 'e'],
                     index=['國堂','瑞彤','家渝','建全','子瑜'])
people.iloc[2:3, [1, 2]] = np.nan
people
'''
Out[28]: 
           a         b         c         d         e
國堂  0.831007  0.794457  0.127571 -0.515541 -1.573255
瑞彤  2.047594  0.306150 -1.109582 -1.304595 -0.174706
家渝  0.082868       NaN       NaN -0.759848  0.990540
建全 -0.230918  2.648930 -0.151027  1.017882  0.759116
子瑜 -3.171536  0.650561 -0.487912 -1.027657  1.249068
'''

mapping = {'a':'red', 'b':'red', 'c':'blue', 'd':'blue', 'e':'red', 'f':'orange'}
by_column = people.groupby(mapping, axis=1)
by_column.sum()
'''
Out[29]: 
        blue       red
國堂 -0.387971  0.052209
瑞彤 -2.414177  2.179037
家渝 -0.759848  1.073408
建全  0.866855  3.177127
子瑜 -1.515569 -1.271908
'''

map_series = pd.Series(mapping)
map_series
'''
Out[30]: 
a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object
'''

people.groupby(map_series, axis=1).count()
'''
Out[31]: 
    blue  red
國堂     2    3
瑞彤     2    3
家渝     1    2
建全     2    3
子瑜     2    3
'''

#### 使用function 群組

In [None]:
people = pd.DataFrame(np.random.randn(5,5),
                     columns=['a', 'b', 'c', 'd', 'e'],
                     index=['徐國堂','瑞彤','家渝','周建全','司馬子瑜'])
people.iloc[2:3, [1, 2]] = np.nan
people
'''
Out[4]: 
             a         b         c         d         e
徐國堂  -2.491357 -0.951911  1.099882  0.719504 -0.702062
瑞彤    2.258345  0.023958 -1.399993  1.052907  1.523407
家渝    1.675287       NaN       NaN -1.991623 -0.983414
周建全  -0.660723 -0.114712 -1.237950  0.088263 -1.195361
司馬子瑜 -0.213963 -0.649956  0.882169 -0.902870  0.502168
'''

#使用len function,傳出索引長度為2,3,4,就以傳出的值作為group name
people.groupby(len).sum()
'''
Out[5]: 
          a         b         c         d         e
2  3.933632  0.023958 -1.399993 -0.938716  0.539993
3 -3.152080 -1.066623 -0.138068  0.807767 -1.897423
4 -0.213963 -0.649956  0.882169 -0.902870  0.502168
'''

#可以function混合(陣列)加入陣列成為索引
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()
'''
Out[6]: 
              a         b         c         d         e
2 one  1.675287  0.023958 -1.399993 -1.991623 -0.983414
3 one -2.491357 -0.951911  1.099882  0.719504 -0.702062
  two -0.660723 -0.114712 -1.237950  0.088263 -1.195361
4 two -0.213963 -0.649956  0.882169 -0.902870  0.502168
'''

#### 使用階層索引群組

In [None]:
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
                                    [1, 3, 5, 1, 3]],
                                   names=['cty', 'tenor'])
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df
hier_df.groupby(level='cty', axis=1).count()
'''
Out[7]: 
cty  JP  US
0     2   3
1     2   3
2     2   3
3     2   3
'''

#### Homework1
[homework1](https://github.com/roberthsu2003/PythonForDataAnalysis/blob/master/%E7%BE%A4%E7%B5%84%E8%B3%87%E6%96%99%E5%92%8C%E7%BE%A4%E7%B5%84%E9%81%8B%E7%AE%97/Homework1.ipynb)

## 群組資料的運算
### groupby物件最佳化的method
    - count()
    - sum()
    - mean()
    - median()
    - std, var
    - min, max
    - prod
    - first, last

In [19]:
df = pd.DataFrame({'key1':['a', 'a', 'b', 'b', 'a'],
                  'key2':['one', 'two', 'one', 'two', 'one'],
                  'data1': np.random.randn(5),
                  'data2':np.random.randn(5)})
df
'''
Out[8]: 
  key1 key2     data1     data2
0    a  one  0.832722  1.542483
1    a  two -0.045988  1.429863
2    b  one  0.082411  0.910685
3    b  two -0.177623  0.115026
4    a  one  1.224924  0.133102
'''
grouped = df.groupby('key1')
grouped['data1'].quantile(0.9)
'''
Out[9]: 
key1
a    1.146483
b    0.056408
Name: data1, dtype: float64
'''

# 使用groupby物件的agg()方法,可以使用自訂function
# 自訂的function必需定義一個參數
# 參數會傳入一個series
# 必需return一個純值
def peak_to_peak(arr):    
    return arr.max() - arr.min()

grouped[['data1','data2']].agg(peak_to_peak)
'''
Out[10]: 
         data1     data2
key1                    
a     1.270911  1.409381
b     0.260034  0.795658
'''

#groupby物件也接受describe()方法
grouped.describe()
'''
Out[11]: 
     data1                      ...     data2                    
     count      mean       std  ...       50%       75%       max
key1                            ...                              
a      3.0  0.670553  0.650790  ...  1.429863  1.486173  1.542483
b      2.0 -0.047606  0.183872  ...  0.512856  0.711770  0.910685

[2 rows x 16 columns]

'''

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,-1.177036,0.144589,-1.285771,-1.25908,-1.232389,-1.122668,-1.012948,3.0,0.570554,0.252611,0.410882,0.424935,0.438987,0.65039,0.861793
b,2.0,1.185472,1.422006,0.179962,0.682717,1.185472,1.688227,2.190982,2.0,-0.404292,0.701043,-0.900004,-0.652148,-0.404292,-0.156436,0.09142


#### homework2
    - 美國球賽門票收入
    - 統計
[homework2](https://github.com/roberthsu2003/PythonForDataAnalysis/blob/master/%E7%BE%A4%E7%B5%84%E8%B3%87%E6%96%99%E5%92%8C%E7%BE%A4%E7%B5%84%E9%81%8B%E7%AE%97/Homework2.ipynb)

#### homework3 
    - 線上及時下載新北市公共自行車租賃系統
    - 統計個區ubike數量,可借數量,可還數量,故障率

[homework3](https://github.com/roberthsu2003/PythonForDataAnalysis/blob/master/%E7%BE%A4%E7%B5%84%E8%B3%87%E6%96%99%E5%92%8C%E7%BE%A4%E7%B5%84%E9%81%8B%E7%AE%97/homework3.ipynb)

## groupby物件使用apply()方法

In [1]:
import numpy as np
import pandas as pd

#美國球賽門票收入
tips = pd.read_csv('tips.csv')
tips
'''
Out[12]: 
     total_bill   tip smoker   day    time  size
0         16.99  1.01     No   Sun  Dinner     2
1         10.34  1.66     No   Sun  Dinner     3
2         21.01  3.50     No   Sun  Dinner     3
3         23.68  3.31     No   Sun  Dinner     2
4         24.59  3.61     No   Sun  Dinner     4
..          ...   ...    ...   ...     ...   ...
239       29.03  5.92     No   Sat  Dinner     3
240       27.18  2.00    Yes   Sat  Dinner     2
241       22.67  2.00    Yes   Sat  Dinner     2
242       17.82  1.75     No   Sat  Dinner     2
243       18.78  3.00     No  Thur  Dinner     2

[244 rows x 6 columns]
'''

#改變欄位
tips.columns = ['總票價', '小費', '吸煙者', '日期', '時間', '大小']
tips
'''
Out[13]: 
     總票價  小費  吸煙者  日期   時間  大小
0    16.99  1.01   No   Sun  Dinner   2
1    10.34  1.66   No   Sun  Dinner   3
2    21.01  3.50   No   Sun  Dinner   3
3    23.68  3.31   No   Sun  Dinner   2
4    24.59  3.61   No   Sun  Dinner   4
..     ...   ...  ...   ...     ...  ..
239  29.03  5.92   No   Sat  Dinner   3
240  27.18  2.00  Yes   Sat  Dinner   2
241  22.67  2.00  Yes   Sat  Dinner   2
242  17.82  1.75   No   Sat  Dinner   2
243  18.78  3.00   No  Thur  Dinner   2

[244 rows x 6 columns]
'''


#增加小費但總費用的比例
tips['小費比例'] = tips['小費'] / tips['總票價']
tips
'''
Out[14]: 
      總票價  小費  吸煙者 日期    時間  大小  小費比例
0    16.99  1.01   No   Sun  Dinner   2  0.059447
1    10.34  1.66   No   Sun  Dinner   3  0.160542
2    21.01  3.50   No   Sun  Dinner   3  0.166587
3    23.68  3.31   No   Sun  Dinner   2  0.139780
4    24.59  3.61   No   Sun  Dinner   4  0.146808
..     ...   ...  ...   ...     ...  ..       ...
239  29.03  5.92   No   Sat  Dinner   3  0.203927
240  27.18  2.00  Yes   Sat  Dinner   2  0.073584
241  22.67  2.00  Yes   Sat  Dinner   2  0.088222
242  17.82  1.75   No   Sat  Dinner   2  0.098204
243  18.78  3.00   No  Thur  Dinner   2  0.159744

[244 rows x 7 columns]
'''


#利用自訂function建立,過濾出小費比例最最高的前6筆
def top(df, n=5, column='小費比例'):
    return df.sort_values(by=column)[-n:]

top(tips, n=6)
'''
Out[15]: 
      總票價  小費 吸煙者 日期  時間    大小  小費比例
109  14.31  4.00  Yes  Sat  Dinner   2  0.279525
183  23.17  6.50  Yes  Sun  Dinner   4  0.280535
232  11.61  3.39   No  Sat  Dinner   2  0.291990
67    3.07  1.00  Yes  Sat  Dinner   1  0.325733
178   9.60  4.00  Yes  Sun  Dinner   2  0.416667
172   7.25  5.15  Yes  Sun  Dinner   2  0.710345
'''

#取出gropy欄位吸煙者的前6筆
#會產生階層式的索引
#內層索引是每一筆的索引值

tips.groupby('吸煙者').apply(top)
'''
Out[16]: 
         總票價  小費  吸煙者  日期    時間  大小   小費比例
吸煙者                                                  
No  88   24.71  5.85   No  Thur   Lunch   2  0.236746
    185  20.69  5.00   No   Sun  Dinner   5  0.241663
    51   10.29  2.60   No   Sun  Dinner   2  0.252672
    149   7.51  2.00   No  Thur   Lunch   2  0.266312
    232  11.61  3.39   No   Sat  Dinner   2  0.291990
Yes 109  14.31  4.00  Yes   Sat  Dinner   2  0.279525
    183  23.17  6.50  Yes   Sun  Dinner   4  0.280535
    67    3.07  1.00  Yes   Sat  Dinner   1  0.325733
    178   9.60  4.00  Yes   Sun  Dinner   2  0.416667
    172   7.25  5.15  Yes   Sun  Dinner   2  0.710345
'''

#apply(function,後面參數可以提供給前面的function)
tips.groupby(['吸煙者', '日期']).apply(top, n=1, column='總票價')
'''
Out[17]: 
              總票價   小費  吸煙者  日期    時間  大小   小費比例
吸煙者 日期                                                     
No  Fri  94   22.75   3.25   No   Fri  Dinner   2  0.142857
    Sat  212  48.33   9.00   No   Sat  Dinner   4  0.186220
    Sun  156  48.17   5.00   No   Sun  Dinner   6  0.103799
    Thur 142  41.19   5.00   No  Thur   Lunch   5  0.121389
Yes Fri  95   40.17   4.73  Yes   Fri  Dinner   4  0.117750
    Sat  170  50.81  10.00  Yes   Sat  Dinner   3  0.196812
    Sun  182  45.35   3.50  Yes   Sun  Dinner   3  0.077178
    Thur 197  43.11   5.00  Yes  Thur   Lunch   4  0.115982
'''

#apply()內的自訂function一定要回傳pandas物件或純值

result = tips.groupby('吸煙者')['小費比例'].describe()
'''
Out[19]: 
     count      mean       std       min       25%       50%       75%       max
吸煙者                                                                             
No   151.0  0.159328  0.039910  0.056797  0.136906  0.155625  0.185014  0.291990
Yes   93.0  0.163196  0.085119  0.035638  0.106771  0.153846  0.195059  0.710345
'''

result.unstack()
'''
Out[20]: 
       吸煙者
count  No     151.000000
       Yes     93.000000
mean   No       0.159328
       Yes      0.163196
std    No       0.039910
       Yes      0.085119
min    No       0.056797
       Yes      0.035638
25%    No       0.136906
       Yes      0.106771
50%    No       0.155625
       Yes      0.153846
75%    No       0.185014
       Yes      0.195059
max    No       0.291990
       Yes      0.710345
dtype: float64
'''

f = lambda x:x.describe()
tips.groupby('吸煙者')['小費比例'].apply(f)

#隱藏GroupKey產生的索引
tips.groupby('吸煙者', group_keys=False).apply(top)
'''
Out[8]: 
    總票價  小費  吸煙者    日期    時間  大小   小費比例
88   24.71  5.85   No  Thur   Lunch   2  0.236746
185  20.69  5.00   No   Sun  Dinner   5  0.241663
51   10.29  2.60   No   Sun  Dinner   2  0.252672
149   7.51  2.00   No  Thur   Lunch   2  0.266312
232  11.61  3.39   No   Sat  Dinner   2  0.291990
109  14.31  4.00  Yes   Sat  Dinner   2  0.279525
183  23.17  6.50  Yes   Sun  Dinner   4  0.280535
67    3.07  1.00  Yes   Sat  Dinner   1  0.325733
178   9.60  4.00  Yes   Sun  Dinner   2  0.416667
172   7.25  5.15  Yes   Sun  Dinner   2  0.710345
'''

Unnamed: 0,總票價,小費,吸煙者,日期,時間,大小,小費比例
88,24.71,5.85,No,Thur,Lunch,2,0.236746
185,20.69,5.0,No,Sun,Dinner,5,0.241663
51,10.29,2.6,No,Sun,Dinner,2,0.252672
149,7.51,2.0,No,Thur,Lunch,2,0.266312
232,11.61,3.39,No,Sat,Dinner,2,0.29199
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


#### 將數值資料分割成為數個範圍

In [8]:
frame = pd.DataFrame({'data1': np.random.randn(1000),
                     'data2':np.random.randn(1000)})
#Categorical傳回
quartiles = pd.cut(frame.data1, 4)
quartiles
'''
Out[9]: 
0       (0.205, 1.812]
1       (0.205, 1.812]
2      (-1.402, 0.205]
3      (-1.402, 0.205]
4      (-1.402, 0.205]
            ...       
995    (-1.402, 0.205]
996     (0.205, 1.812]
997    (-1.402, 0.205]
998     (0.205, 1.812]
999     (0.205, 1.812]
Name: data1, Length: 1000, dtype: category
Categories (4, interval[float64]): [(-3.015, -1.402] < (-1.402, 0.205] < (0.205, 1.812] <
                                    (1.812, 3.419]]

'''
def get_states(group):
    return {'min':group.min(), 'max':group.max(), 'count':group.count, 'mean':group.mean()}

grouped = frame.data2.groupby(quartiles)
grouped.apply(get_states).unstack()
'''
Out[11]: 
                      min      max                                              count       mean
data1                                                                                           
(-3.015, -1.402] -1.94054  2.73615  <bound method Series.count of 29     1.137708\...   0.106789
(-1.402, 0.205]  -2.71184  3.09183  <bound method Series.count of 2      0.168291\... -0.0122795
(0.205, 1.812]   -2.50876  3.28695  <bound method Series.count of 0     -1.245815\...  0.0451062
(1.812, 3.419]   -1.91989  2.83793  <bound method Series.count of 7      1.544971\...  0.0313546
'''

Unnamed: 0_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-3.213, -1.461]",-2.49609,3.45658,<bound method Series.count of 7 -0.167006\...,0.0192501
"(-1.461, 0.284]",-3.2033,3.30168,<bound method Series.count of 3 -2.045291\...,-0.028515
"(0.284, 2.029]",-2.84224,3.11333,<bound method Series.count of 0 2.082226\...,0.00133624
"(2.029, 3.774]",-1.48699,0.959169,<bound method Series.count of 75 -0.854548\...,-0.135733


## 樞紐分析表
    - pivote_table()
    - pandas.pivote_table()


In [6]:
import numpy as np
import pandas as pd

#美國球賽門票收入
tips = pd.read_csv('tips.csv')
tips
'''
Out[12]: 
     total_bill   tip smoker   day    time  size
0         16.99  1.01     No   Sun  Dinner     2
1         10.34  1.66     No   Sun  Dinner     3
2         21.01  3.50     No   Sun  Dinner     3
3         23.68  3.31     No   Sun  Dinner     2
4         24.59  3.61     No   Sun  Dinner     4
..          ...   ...    ...   ...     ...   ...
239       29.03  5.92     No   Sat  Dinner     3
240       27.18  2.00    Yes   Sat  Dinner     2
241       22.67  2.00    Yes   Sat  Dinner     2
242       17.82  1.75     No   Sat  Dinner     2
243       18.78  3.00     No  Thur  Dinner     2

[244 rows x 6 columns]
'''

#改變欄位
tips.columns = ['總票價', '小費', '吸煙者', '日期', '時間', '大小']
tips
'''
Out[1]: 
    總票價    小費  吸煙者  日期  時間  大小
0    16.99  1.01   No   Sun  Dinner   2
1    10.34  1.66   No   Sun  Dinner   3
2    21.01  3.50   No   Sun  Dinner   3
3    23.68  3.31   No   Sun  Dinner   2
4    24.59  3.61   No   Sun  Dinner   4
..     ...   ...  ...   ...     ...  ..
239  29.03  5.92   No   Sat  Dinner   3
240  27.18  2.00  Yes   Sat  Dinner   2
241  22.67  2.00  Yes   Sat  Dinner   2
242  17.82  1.75   No   Sat  Dinner   2
243  18.78  3.00   No  Thur  Dinner   2

[244 rows x 6 columns]
'''

#增加小費但總費用的比例
tips['小費比例'] = tips['小費'] / tips['總票價']
tips
'''
Out[14]: 
      總票價  小費  吸煙者 日期    時間  大小  小費比例
0    16.99  1.01   No   Sun  Dinner   2  0.059447
1    10.34  1.66   No   Sun  Dinner   3  0.160542
2    21.01  3.50   No   Sun  Dinner   3  0.166587
3    23.68  3.31   No   Sun  Dinner   2  0.139780
4    24.59  3.61   No   Sun  Dinner   4  0.146808
..     ...   ...  ...   ...     ...  ..       ...
239  29.03  5.92   No   Sat  Dinner   3  0.203927
240  27.18  2.00  Yes   Sat  Dinner   2  0.073584
241  22.67  2.00  Yes   Sat  Dinner   2  0.088222
242  17.82  1.75   No   Sat  Dinner   2  0.098204
243  18.78  3.00   No  Thur  Dinner   2  0.159744

[244 rows x 7 columns]
'''

#自動群組和求平均
tips.pivot_table(index=['日期', '吸煙者'])
'''
Out[2]: 
            大小        小費    總票價
日期   吸煙者                               
Fri  No   2.250000  2.812500  18.420000
     Yes  2.066667  2.714000  16.813333
Sat  No   2.555556  3.102889  19.661778
     Yes  2.476190  2.875476  21.276667
Sun  No   2.929825  3.167895  20.506667
     Yes  2.578947  3.516842  24.120000
Thur No   2.488889  2.673778  17.113111
     Yes  2.352941  3.030000  19.190588
'''

tips.pivot_table(['小費比例', '大小'], index=['時間','日期'], columns='吸煙者')
'''
Out[4]: 
                     大小               小費比例          
吸煙者            No       Yes        No       Yes
時間     日期                                          
Dinner Fri   2.000000  2.222222  0.139622  0.165347
       Sat   2.555556  2.476190  0.158048  0.147906
       Sun   2.929825  2.578947  0.160113  0.187250
       Thur  2.000000       NaN  0.159744       NaN
Lunch  Fri   3.000000  1.833333  0.187735  0.188937
       Thur  2.500000  2.352941  0.160311  0.163863
'''

tips.pivot_table(['小費比例', '大小'], index=['時間','日期'], columns='吸煙者', margins=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,大小,大小,大小,小費比例,小費比例,小費比例
Unnamed: 0_level_1,吸煙者,No,Yes,All,No,Yes,All
時間,日期,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Dinner,Fri,2.0,2.222222,2.166667,0.139622,0.165347,0.158916
Dinner,Sat,2.555556,2.47619,2.517241,0.158048,0.147906,0.153152
Dinner,Sun,2.929825,2.578947,2.842105,0.160113,0.18725,0.166897
Dinner,Thur,2.0,,2.0,0.159744,,0.159744
Lunch,Fri,3.0,1.833333,2.0,0.187735,0.188937,0.188765
Lunch,Thur,2.5,2.352941,2.459016,0.160311,0.163863,0.161301
All,,2.668874,2.408602,2.569672,0.159328,0.163196,0.160803
