## <font color='red'>什么是分箱操作？</font>

### 等宽分箱

In [13]:
import pandas as pd

data = [0,10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
bins = [0- 0.1, 30, 60, 100 + 0.1] # 自定义宽度
labels = ['Low', 'Medium', 'High']
pd.cut(data,bins,labels=labels,right = True)

['Low', 'Low', 'Low', 'Low', 'Medium', ..., 'Medium', 'High', 'High', 'High', 'High']
Length: 11
Categories (3, object): ['Low' < 'Medium' < 'High']

In [38]:
import pandas as pd

data = [0,10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
bins = 3
labels = ['Low', 'Medium', 'High']
pd.cut(data,bins,labels=labels,right = True,retbins=True)

(['Low', 'Low', 'Low', 'Low', 'Medium', ..., 'Medium', 'High', 'High', 'High', 'High']
 Length: 11
 Categories (3, object): ['Low' < 'Medium' < 'High'],
 array([ -0.1       ,  33.33333333,  66.66666667, 100.        ]))

### 等频分箱

In [23]:
import pandas as pd
import numpy  as np

data = np.random.randint(0,151,size = 200)

quantiles,bins = pd.qcut(data, q=3, labels=['Low', 'Medium', 'High'],retbins=True)
display(quantiles,bins)

['High', 'High', 'Medium', 'Low', 'Low', ..., 'High', 'High', 'Low', 'Low', 'High']
Length: 200
Categories (3, object): ['Low' < 'Medium' < 'High']

array([  0.        ,  46.        ,  89.66666667, 149.        ])

In [20]:
data

array([ 95,  39,  12,  52,   8, 105,  29, 131,  49, 142,  86,  45, 145,
       109,  88,  17, 145, 116,  48,   1,   6,  94, 138, 120,  55,  39,
        10,  64,  49,  15,  81, 108, 139, 103,   0, 121, 100, 111,  13,
       106,  47, 116, 121,  31,  84, 139,  53, 114, 113, 150,  46,  88,
        97,  71,  37,  56, 110,  12, 111,  29, 139,  39, 149, 122,  95,
        98,  30,  95, 141,  51,  56,  53,  68,  13,  78, 120, 125,  30,
        27, 130,  96, 132,  91,  40,  82,  95,  72,  16,  77,  70,  51,
        52,   6,  74, 105,  60,  78,  35, 125, 142,  45, 127,  44, 125,
        90,  42,  48,  53,  17,  76, 137, 101, 111, 102,  95,  71,  17,
        36, 120,  29,  74,  49,   7,  91,  38,   9, 133, 106,  30, 112,
        44,  44,  65,  11,   0,  70,  23,  22,  28,  39,  85,   8,  53,
        38,  24,  52, 139,  93, 108, 115,  79, 112,  64,  18, 137,   9,
        27,  29,  61, 104,  63, 138,  49,  96,  93,  64, 144,  50,  69,
        13,  95,  48, 129,  96,  21,  71, 134, 112,  76, 106,  6

In [21]:
bins

['Medium', 'Low', 'Low', 'Medium', 'Low', ..., 'Medium', 'High', 'Medium', 'Low', 'Low']
Length: 200
Categories (3, object): ['Low' < 'Medium' < 'High']

In [36]:
pd.cut(data,bins,labels =['Low', 'Medium', 'High'])

['High', 'High', 'Medium', 'Low', 'Low', ..., 'High', 'High', 'Low', 'Low', 'High']
Length: 200
Categories (3, object): ['Low' < 'Medium' < 'High']

In [15]:
df = pd.DataFrame({'Python':data,'等级':quantiles})
df

Unnamed: 0,Python,等级
0,29,Low
1,106,High
2,98,Medium
3,140,High
4,34,Low
...,...,...
195,144,High
196,118,High
197,55,Medium
198,34,Low


In [16]:
df['等级'].value_counts()

Low       68
High      67
Medium    65
Name: 等级, dtype: int64

### 自定义分箱

In [44]:
import pandas as pd

# 示例数据：年龄数据
ages = [18, 25, 35, 50, 65, 80, 90]

# 自定义函数，根据年龄进行分箱
def custom_age_binning(age):
    if age <= 30:
        return 'Young'
    elif age <= 60:
        return 'Middle-aged'
    else:
        return 'Senior'

# 将自定义函数应用于年龄数据
age_series = pd.Series(ages)
custom_bins = age_series.apply(custom_age_binning)
custom_bins

0          Young
1          Young
2    Middle-aged
3    Middle-aged
4         Senior
5         Senior
6         Senior
dtype: object

In [41]:
pd.Series(pd.cut(ages,bins = [0,30,60,91],right=True,labels=['Young','Middle-age','Senior']))

0         Young
1         Young
2    Middle-age
3    Middle-age
4        Senior
5        Senior
6        Senior
dtype: category
Categories (3, object): ['Young' < 'Middle-age' < 'Senior']

### 类别型数据

In [45]:
import pandas as pd

# 创建一个包含 Category 数据类型的 Series
data = pd.Categorical(['B', 'A', 'C', 'A', 'B'], categories=['A', 'B', 'C'], ordered=True)
s = pd.Series(data)
s

0    B
1    A
2    C
3    A
4    B
dtype: category
Categories (3, object): ['A' < 'B' < 'C']

In [46]:
type(2)

int

In [47]:
type(3.14)

float

In [48]:
type("hello")

str

In [49]:
s.dtype

CategoricalDtype(categories=['A', 'B', 'C'], ordered=True)

In [51]:
s.sort_values()

1    A
3    A
0    B
4    B
2    C
dtype: category
Categories (3, object): ['A' < 'B' < 'C']

In [55]:
# 修改 Category 的顺序
new_categories = ['C', 'A', 'B']
data = data.set_categories(new_categories)
s2 = pd.Series(data)
display(s2)

# 按照新的 Category 顺序排序
s2.sort_values()

0    B
1    A
2    C
3    A
4    B
dtype: category
Categories (3, object): ['C' < 'A' < 'B']

2    C
1    A
3    A
0    B
4    B
dtype: category
Categories (3, object): ['C' < 'A' < 'B']

## <font color='red'>什么是分组操作？</font>

In [60]:
import numpy as np
import pandas as pd
# 准备数据
df = pd.DataFrame(data = {'sex':np.random.randint(0,2,size = 300), # 0男，1女
                          'class':np.random.randint(1,9,size = 300),#1~8八个班
                          'Python':np.random.randint(0,151,size = 300),#Python成绩
                          'Keras':np.random.randint(0,151,size =300),#Keras成绩
                          'Tensorflow':np.random.randint(0,151,size=300),
                          'Java':np.random.randint(0,151,size = 300),
                          'C++':np.random.randint(0,151,size = 300)})
df['sex'] = df['sex'].map({0:'男',1:'女'}) # 将0，1映射成男女
df

Unnamed: 0,sex,class,Python,Keras,Tensorflow,Java,C++
0,女,1,106,13,71,148,83
1,女,3,11,115,132,85,81
2,男,5,16,95,1,16,43
3,女,1,31,53,138,48,147
4,女,2,140,106,74,67,16
...,...,...,...,...,...,...,...
295,女,8,123,41,70,38,62
296,男,3,74,122,56,139,131
297,男,1,99,48,79,72,109
298,男,7,36,3,132,86,36


In [62]:
g = df.groupby(by = 'sex') # 但条件分组
g

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001E0C7712FA0>

In [64]:
for name,data in g:
    print('------------组名',name)
    print('++++++++++++数据\n',data)

------------组名 女
++++++++++++数据
     sex  class  Python  Keras  Tensorflow  Java  C++
0     女      1     106     13          71   148   83
1     女      3      11    115         132    85   81
3     女      1      31     53         138    48  147
4     女      2     140    106          74    67   16
5     女      4      69     37          52   149   44
..   ..    ...     ...    ...         ...   ...  ...
286   女      8     102    118         135    62   91
287   女      2     140    107          25    36   17
290   女      2      96     25          92    47  105
295   女      8     123     41          70    38   62
299   女      7      16    105          18    32   57

[161 rows x 7 columns]
------------组名 男
++++++++++++数据
     sex  class  Python  Keras  Tensorflow  Java  C++
2     男      5      16     95           1    16   43
6     男      1      10     41         132   116   33
9     男      5      60     21         114    28   15
11    男      7       3     63          28    39    0
12    男  

In [65]:
# 2 * 8 = 16
# 8班全是男生！
g = df.groupby(by = ['sex','class']) # 多条件分组
for name,data in g:
    print('------------组名',name)
    print('++++++++++++数据\n',data)

------------组名 ('女', 1)
++++++++++++数据
     sex  class  Python  Keras  Tensorflow  Java  C++
0     女      1     106     13          71   148   83
3     女      1      31     53         138    48  147
8     女      1      24     20          58    96   72
17    女      1      20    114           8    19   22
19    女      1      90    122          65    54   75
25    女      1      66    149          73   120  145
56    女      1      78    131          32    29   43
58    女      1      98     84         131   106  143
62    女      1      90     17         112    22   25
70    女      1      44    147          86    21   71
79    女      1       0     21           3     9  106
108   女      1      84     37          46    41  109
134   女      1     120    135          65    21   58
163   女      1      84     99         127   116  101
174   女      1      26    105          63   144   62
177   女      1     124      7          22    37   67
183   女      1     148     69         133    90   13
219   

In [67]:
g = df['Python'].groupby(df['class'])
for name,data in g:
    print('------------组名',name)
    print('++++++++++++数据\n',data)

------------组名 1
++++++++++++数据
 0      106
3       31
6       10
8       24
17      20
19      90
25      66
56      78
58      98
62      90
70      44
79       0
93      93
108     84
115     94
117    128
128     70
133     73
134    120
146    148
150     62
163     84
174     26
177    124
183    148
201    132
219    136
232     47
233    150
246     66
253     94
256     53
257     26
263     26
268     96
271    102
273    146
280     51
288     98
291     51
293      4
294    132
297     99
Name: Python, dtype: int32
------------组名 2
++++++++++++数据
 4      140
13      10
14       1
33      92
34     139
39     149
41      13
45       3
82      73
83     108
84      20
98     113
102     61
106     11
107     67
120    109
140    132
151    108
159     88
160     42
164     59
192     32
206     50
210     31
211     82
212     42
216    126
229    103
236     83
242     55
245     69
250     95
264     39
279     12
287    140
290     96
Name: Python, dtype: int32
-----------

In [69]:
g = df[['Python','Java']].groupby([df['class'],df['sex']])
for name,data in g:
    print('------------组名',name)
    print('++++++++++++数据\n',data)

------------组名 (1, '女')
++++++++++++数据
      Python  Java
0       106   148
3        31    48
8        24    96
17       20    19
19       90    54
25       66   120
56       78    29
58       98   106
62       90    22
70       44    21
79        0     9
108      84    41
134     120    21
163      84   116
174      26   144
177     124    37
183     148    90
219     136     7
232      47    18
233     150    24
246      66   143
253      94    56
256      53    60
257      26   135
273     146   125
280      51   128
------------组名 (1, '男')
++++++++++++数据
      Python  Java
6        10   116
93       93   108
115      94    60
117     128   120
128      70   129
133      73   104
146     148    46
150      62    99
201     132   109
263      26    14
268      96   110
271     102    49
288      98    19
291      51    23
293       4    77
294     132    55
297      99    72
------------组名 (2, '女')
++++++++++++数据
      Python  Java
4       140    67
13       10    14
39      149    8

## <font color='red'>pandas中什么是分组聚合？</font>

In [78]:
import numpy as np
import pandas as pd
# 准备数据
df = pd.DataFrame(data = {'sex':np.random.randint(0,2,size = 300), # 0男，1女
                          'class':np.random.randint(1,9,size = 300),#1~8八个班
                          'Python':np.random.randint(0,151,size = 300),#Python成绩
                          'Keras':np.random.randint(0,151,size =300),#Keras成绩
                          'Tensorflow':np.random.randint(0,151,size=300),
                          'Java':np.random.randint(0,151,size = 300),
                          'C++':np.random.randint(0,151,size = 300)})
df['sex'] = df['sex'].map({0:'男',1:'女'}) # 将0，1映射成男女
df['class'] = df['class'].astype('object')
df.dtypes

sex           object
class         object
Python         int32
Keras          int32
Tensorflow     int32
Java           int32
C++            int32
dtype: object

In [79]:
df

Unnamed: 0,sex,class,Python,Keras,Tensorflow,Java,C++
0,男,5,145,137,23,101,145
1,女,7,101,83,100,0,147
2,男,4,62,104,41,114,92
3,女,2,107,5,141,37,102
4,女,1,2,110,146,9,100
...,...,...,...,...,...,...,...
295,女,7,69,25,59,83,69
296,男,4,33,40,18,11,117
297,女,4,128,61,101,25,42
298,男,4,65,68,97,19,118


In [82]:
df.groupby(by = 'sex').max().round(2)

Unnamed: 0_level_0,class,Python,Keras,Tensorflow,Java,C++
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
女,8,150,149,149,150,150
男,8,150,148,150,150,150


In [None]:
pd.Series.to_frame()

In [90]:
df.groupby(by = 'class').size().to_frame(name = '班级人数')

Unnamed: 0_level_0,班级人数
class,Unnamed: 1_level_1
1,31
2,35
3,34
4,40
5,34
6,39
7,47
8,40


In [88]:
df.loc[1,'Python'] = np.nan
df

Unnamed: 0,sex,class,Python,Keras,Tensorflow,Java,C++
0,男,5,145.0,137,23,101,145
1,女,7,,83,100,0,147
2,男,4,62.0,104,41,114,92
3,女,2,107.0,5,141,37,102
4,女,1,2.0,110,146,9,100
...,...,...,...,...,...,...,...
295,女,7,69.0,25,59,83,69
296,男,4,33.0,40,18,11,117
297,女,4,128.0,61,101,25,42
298,男,4,65.0,68,97,19,118


In [None]:
df.loc[]

In [89]:
df.groupby(by = 'class').count()

Unnamed: 0_level_0,sex,Python,Keras,Tensorflow,Java,C++
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,31,31,31,31,31,31
2,35,35,35,35,35,35
3,34,34,34,34,34,34
4,40,40,40,40,40,40
5,34,34,34,34,34,34
6,39,39,39,39,39,39
7,47,46,47,47,47,47
8,40,40,40,40,40,40


In [92]:
df.groupby(by = ['sex','class']).describe().T

Unnamed: 0_level_0,sex,女,女,女,女,女,女,女,女,男,男,男,男,男,男,男,男
Unnamed: 0_level_1,class,1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8
Python,count,16.0,19.0,21.0,23.0,15.0,18.0,25.0,21.0,15.0,16.0,13.0,17.0,19.0,21.0,21.0,19.0
Python,mean,82.3125,73.789474,77.428571,83.782609,83.066667,88.833333,80.36,88.857143,70.933333,71.25,102.076923,70.705882,74.421053,69.47619,79.619048,82.736842
Python,std,52.527731,44.864709,42.124306,38.089555,27.924558,38.091608,39.453432,48.908369,40.3068,50.517984,34.824468,40.262831,48.831133,39.690829,42.090945,35.817193
Python,min,1.0,2.0,0.0,12.0,45.0,12.0,4.0,2.0,6.0,0.0,52.0,3.0,0.0,4.0,8.0,6.0
Python,25%,28.25,37.5,43.0,60.5,56.0,70.25,42.0,57.0,44.5,27.5,69.0,50.0,33.0,44.0,43.0,59.0
Python,50%,98.0,75.0,75.0,82.0,87.0,84.0,83.0,97.0,61.0,80.5,101.0,57.0,66.0,77.0,80.0,81.0
Python,75%,120.75,110.5,104.0,112.5,101.0,122.0,111.0,132.0,93.0,123.25,138.0,100.0,123.0,103.0,118.0,105.5
Python,max,149.0,142.0,149.0,148.0,127.0,144.0,148.0,150.0,140.0,140.0,150.0,145.0,145.0,137.0,148.0,145.0
Keras,count,16.0,19.0,21.0,23.0,15.0,18.0,26.0,21.0,15.0,16.0,13.0,17.0,19.0,21.0,21.0,19.0
Keras,mean,75.375,67.736842,61.47619,60.608696,71.666667,83.722222,73.692308,86.238095,70.133333,51.625,72.538462,86.294118,80.473684,85.952381,73.0,78.578947


In [96]:
df.groupby(by = 'class')[['Python','Java']].quantile([0.25,0.75,0.95])

Unnamed: 0_level_0,Unnamed: 1_level_0,Python,Java
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.25,40.5,48.5
1,0.75,117.0,117.0
1,0.95,141.5,137.5
2,0.25,33.0,48.0
2,0.75,115.5,116.0
2,0.95,140.3,145.9
3,0.25,57.0,42.25
3,0.75,120.5,120.75
3,0.95,147.7,147.0
4,0.25,52.5,23.0


In [94]:
df.groupby(by = 'class').median()

Unnamed: 0_level_0,Python,Keras,Tensorflow,Java,C++
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,89.0,76.0,78.0,83.0,92.0
2,80.0,52.0,69.0,75.0,79.0
3,79.0,67.0,81.0,75.0,69.5
4,70.0,73.5,83.0,56.0,46.5
5,83.5,84.5,72.0,85.5,70.0
6,79.0,90.0,38.0,74.0,81.0
7,80.5,70.0,63.0,61.0,80.0
8,89.5,84.0,73.0,79.0,69.0


In [97]:
g = df.groupby(by = 'sex')
g

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001E0CA0D1AC0>

## <font color='red'>apply分组聚合如何操作？</font>

In [43]:
import pandas as pd

data = {
    'user_id': [1, 2, 3, 4, 5, 6],
    '渠道': ['Alipay', 'WeChat', 'Alipay', 'WeChat', 'Alipay', 'WeChat'],
    '金额': [100, 150, 120, 180, 90, 200]
}
df = pd.DataFrame(data)
df

Unnamed: 0,user_id,渠道,金额
0,1,Alipay,100
1,2,WeChat,150
2,3,Alipay,120
3,4,WeChat,180
4,5,Alipay,90
5,6,WeChat,200


In [18]:
def calculate_mean_payment(group):
    return group['金额'].mean().round(2)
result = df.groupby(by = '渠道').apply(calculate_mean_payment)
result.to_frame(name = '平均金额')

Unnamed: 0_level_0,平均金额
渠道,Unnamed: 1_level_1
Alipay,103.33
WeChat,176.67


In [68]:
def calculate_payment(group):
    display(group)
    res = {'平均':group['金额'].mean().round(2),
           '最大':group['金额'].max(),
           '最小':group['金额'].min()}
    return pd.Series(res)

result = df.groupby(by = '渠道').apply(calculate_payment)
result

Unnamed: 0,user_id,渠道,金额
0,1,Alipay,100
2,3,Alipay,120
4,5,Alipay,90


Unnamed: 0,user_id,渠道,金额
1,2,WeChat,150
3,4,WeChat,180
5,6,WeChat,200


Unnamed: 0_level_0,平均,最大,最小
渠道,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alipay,103.33,120.0,90.0
WeChat,176.67,200.0,150.0


## <font color='red'>transform分组聚合如何操作？</font>

In [52]:
import pandas as pd

data = {
    'user_id': [1, 2, 3, 4, 5, 6],
    '渠道': ['Alipay', 'WeChat', 'Alipay', 'WeChat', 'Alipay', 'WeChat'],
    '金额': [100, 150, 120, 180, 90, 200]
}
df = pd.DataFrame(data)
df

Unnamed: 0,user_id,渠道,金额
0,1,Alipay,100
1,2,WeChat,150
2,3,Alipay,120
3,4,WeChat,180
4,5,Alipay,90
5,6,WeChat,200


In [64]:
import numpy as np
df.groupby('渠道')[['金额']].apply('mean').round(2)

Unnamed: 0_level_0,金额
渠道,Unnamed: 1_level_1
Alipay,103.33
WeChat,176.67


In [60]:
df.groupby('渠道').mean()

Unnamed: 0_level_0,user_id,金额
渠道,Unnamed: 1_level_1,Unnamed: 2_level_1
Alipay,3.0,103.333333
WeChat,4.0,176.666667


In [65]:
res = df.groupby('渠道')[['金额']].transform('mean').round(2)
res.columns = ['平均金额']
res

Unnamed: 0,平均金额
0,103.33
1,176.67
2,103.33
3,176.67
4,103.33
5,176.67


In [67]:
pd.merge(df,res,left_index=True,right_index=True)

Unnamed: 0,user_id,渠道,金额,平均金额
0,1,Alipay,100,103.33
1,2,WeChat,150,176.67
2,3,Alipay,120,103.33
3,4,WeChat,180,176.67
4,5,Alipay,90,103.33
5,6,WeChat,200,176.67


In [72]:
def calculate_payment(group):
#     display(group)
    res = {'平均':group['金额'].mean().round(2),
           '最大':group['金额'].max(),
           '最小':group['金额'].min()}
    return pd.Series(res)

result = df.groupby(by = '渠道').apply(calculate_payment)
result

Unnamed: 0_level_0,平均,最大,最小
渠道,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alipay,103.33,120.0,90.0
WeChat,176.67,200.0,150.0


In [74]:
df

Unnamed: 0,user_id,渠道,金额
0,1,Alipay,100
1,2,WeChat,150
2,3,Alipay,120
3,4,WeChat,180
4,5,Alipay,90
5,6,WeChat,200


In [75]:
pd.merge(df,result,left_on='渠道',right_index=True)

Unnamed: 0,user_id,渠道,金额,平均,最大,最小
0,1,Alipay,100,103.33,120.0,90.0
2,3,Alipay,120,103.33,120.0,90.0
4,5,Alipay,90,103.33,120.0,90.0
1,2,WeChat,150,176.67,200.0,150.0
3,4,WeChat,180,176.67,200.0,150.0
5,6,WeChat,200,176.67,200.0,150.0


## <font color='red'>agg分组聚合如何操作？</font>

In [26]:
import numpy as np
import pandas as pd
# 准备数据
df = pd.DataFrame(data = {'sex':np.random.randint(0,2,size = 300), # 0男，1女
                          'class':np.random.randint(1,9,size = 300),#1~8八个班
                          'Python':np.random.randint(0,151,size = 300),#Python成绩
                          'Math':np.random.randint(0,151,size =300),#Keras成绩
                          'Chinese':np.random.randint(0,151,size=300)})
df['sex'] = df['sex'].map({0:'男',1:'女'}) # 将0，1映射成男女
df

Unnamed: 0,sex,class,Python,Math,Chinese
0,男,5,60,34,79
1,男,5,84,124,8
2,女,6,24,60,18
3,男,8,115,132,129
4,女,5,93,110,27
...,...,...,...,...,...
295,女,5,37,3,10
296,男,3,37,73,113
297,男,3,123,117,46
298,女,6,115,133,88


In [35]:
df.groupby(by = 'class')['Python'].agg([np.median,np.mean]).round(2)

Unnamed: 0_level_0,median,mean
class,Unnamed: 1_level_1,Unnamed: 2_level_1
1,70.0,68.48
2,83.5,84.35
3,96.5,80.95
4,63.0,68.7
5,63.5,65.22
6,37.0,54.18
7,96.0,82.15
8,85.5,81.75


In [39]:
df.groupby(by = 'class')[['Python','Math']].agg([np.median,np.mean]).round(2)

Unnamed: 0_level_0,Python,Python,Math,Math
Unnamed: 0_level_1,median,mean,median,mean
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,70.0,68.48,80.0,77.0
2,83.5,84.35,82.5,78.88
3,96.5,80.95,80.5,76.24
4,63.0,68.7,78.0,73.94
5,63.5,65.22,63.5,68.2
6,37.0,54.18,71.5,76.59
7,96.0,82.15,58.0,72.03
8,85.5,81.75,63.5,66.56


In [41]:
def delta(item):
    return item.max() - item.min()

df2 = df.groupby(by = ['class','sex']).aggregate({'Math': [('最大差值',delta),('平均分',np.mean)],
                                                          'Python':[('最大值',np.max)]}).round(2)
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,Math,Math,Python
Unnamed: 0_level_1,Unnamed: 1_level_1,最大差值,平均分,最大值
class,sex,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,女,146,73.62,150
1,男,143,80.68,133
2,女,122,77.09,142
2,男,142,80.2,148
3,女,142,71.48,141
3,男,130,84.8,150
4,女,121,74.06,91
4,男,135,73.82,140
5,女,139,73.96,140
5,男,149,62.88,147


## <font color='red'>不同分组聚合函数有什么区别？</font>

In [42]:
import pandas as pd

data = {
    'user_id': [1, 2, 3, 4, 5, 6],
    '渠道': ['Alipay', 'WeChat', 'Alipay', 'WeChat', 'Alipay', 'WeChat'],
    '金额': [100, 150, 120, 180, 90, 200]
}
df = pd.DataFrame(data)
df

Unnamed: 0,user_id,渠道,金额
0,1,Alipay,100
1,2,WeChat,150
2,3,Alipay,120
3,4,WeChat,180
4,5,Alipay,90
5,6,WeChat,200


### apply

In [44]:
def convert(item):
    display(item)
    item['x'] = item['金额'] * item['user_id'] # 更加复杂运算
    print('*'*50)
    return item

df.groupby(by = '渠道').apply(convert)

Unnamed: 0,user_id,渠道,金额
0,1,Alipay,100
2,3,Alipay,120
4,5,Alipay,90


**************************************************


Unnamed: 0,user_id,渠道,金额
1,2,WeChat,150
3,4,WeChat,180
5,6,WeChat,200


**************************************************


Unnamed: 0,user_id,渠道,金额,x
0,1,Alipay,100,100
1,2,WeChat,150,300
2,3,Alipay,120,360
3,4,WeChat,180,720
4,5,Alipay,90,450
5,6,WeChat,200,1200


### agg

In [50]:
def convert(item):
    display(item)
#     item['x'] = item['金额'] * item['user_id']
    print('*'*50)
    return item.mean() # 聚合
df.groupby(by = '渠道').aggregate(convert)

0    1
2    3
4    5
Name: user_id, dtype: int64

**************************************************


1    2
3    4
5    6
Name: user_id, dtype: int64

**************************************************


0    100
2    120
4     90
Name: 金额, dtype: int64

**************************************************


1    150
3    180
5    200
Name: 金额, dtype: int64

**************************************************


Unnamed: 0_level_0,user_id,金额
渠道,Unnamed: 1_level_1,Unnamed: 2_level_1
Alipay,3.0,103.333333
WeChat,4.0,176.666667


### transform

In [53]:
def convert(item):
    display(item)
#     item['x'] = item['金额'] * item['user_id']
    print('*'*50)
    return item.mean() # 聚合
df.groupby(by = '渠道').transform(convert)

0    1
2    3
4    5
Name: user_id, dtype: int64

**************************************************


0    100
2    120
4     90
Name: 金额, dtype: int64

**************************************************


Unnamed: 0,user_id,金额
0,1,100
2,3,120
4,5,90


**************************************************


1    2
3    4
5    6
Name: user_id, dtype: int64

**************************************************


1    150
3    180
5    200
Name: 金额, dtype: int64

**************************************************


Unnamed: 0,user_id,金额
0,3.0,103.333333
1,4.0,176.666667
2,3.0,103.333333
3,4.0,176.666667
4,3.0,103.333333
5,4.0,176.666667


## <font color='red'>什么是pivot透视表？</font></font>

In [59]:
import pandas as pd

data = {
    'Category': ['A', 'B', 'A', 'B', 'A', 'B', 'A', 'A'],
    'Region': ['North', 'North', 'South', 'South', 'North', 'South', 'North', 'South'],
    'Revenue': [100, 200, 150, 250, 120, 180, 130, 110]
}

df = pd.DataFrame(data)
df

Unnamed: 0,Category,Region,Revenue
0,A,North,100
1,B,North,200
2,A,South,150
3,B,South,250
4,A,North,120
5,B,South,180
6,A,North,130
7,A,South,110


In [60]:
pivot_result = df.pivot_table(values='Revenue', index='Category', columns='Region', aggfunc='sum')
pivot_result

Region,North,South
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
A,350,260
B,200,430


In [62]:
df.groupby(by = ['Category','Region']).sum().unstack(level=-1)

Unnamed: 0_level_0,Revenue,Revenue
Region,North,South
Category,Unnamed: 1_level_2,Unnamed: 2_level_2
A,350,260
B,200,430


In [63]:
import numpy as np
import pandas as pd
# 准备数据
df = pd.DataFrame(data = {'sex':np.random.randint(0,2,size = 300), # 0男，1女
                          'class':np.random.randint(1,9,size = 300),#1~8八个班
                          'Python':np.random.randint(0,151,size = 300),#Python成绩
                          'Math':np.random.randint(0,151,size =300),#Keras成绩
                          'Chinese':np.random.randint(0,151,size=300)})
df['sex'] = df['sex'].map({0:'男',1:'女'}) # 将0，1映射成男女
df

Unnamed: 0,sex,class,Python,Math,Chinese
0,男,3,48,88,126
1,男,6,8,112,17
2,女,8,37,12,108
3,男,3,138,115,35
4,男,3,68,27,61
...,...,...,...,...,...
295,女,2,88,27,42
296,女,4,53,98,150
297,男,7,17,29,27
298,女,3,59,124,96


In [68]:
# 透视表也是一种分组聚合运算
df.pivot_table(values=['Python','Math','Chinese'],# 要透视分组的值
               index=['class','sex'], # 分组透视指标
               aggfunc={'Python':[('最大值',np.max)], # 聚合运算
                        'Math':[('最小值',np.min),('中位数',np.median)],
                        'Chinese':[('最小值',np.min),('平均值',np.mean),('计数',pd.Series.count)]})

Unnamed: 0_level_0,Unnamed: 1_level_0,Chinese,Chinese,Chinese,Math,Math,Python
Unnamed: 0_level_1,Unnamed: 1_level_1,平均值,最小值,计数,中位数,最小值,最大值
class,sex,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
1,女,81.25,10,16,58.5,7,148
1,男,73.2,10,25,74.0,0,147
2,女,71.65,19,20,71.5,14,137
2,男,62.117647,7,17,90.0,30,147
3,女,54.4,0,25,72.0,3,137
3,男,96.117647,25,17,63.0,1,145
4,女,77.769231,17,13,79.0,11,143
4,男,72.461538,0,13,107.0,31,144
5,女,80.95,10,20,81.5,0,133
5,男,89.6,2,20,88.5,8,142


In [65]:
pd.DataFrame.cumsum

<function pandas.core.frame.DataFrame.count(self, axis: 'Axis' = 0, level: 'Level | None' = None, numeric_only: 'bool' = False)>