In [6]:
import pandas as pd
df =pd.read_excel("team2.xlsx")
df

Unnamed: 0,name,team,Q1,Q2,Q3,Q4,1city,what color
0,Liver,E,89,21,24,1,shanghai,yellew
1,Arry,C,55,37,37,2,nanjin,red
2,Ack,A,57,60,33,3,beijin,purple
3,Eorge,C,93,4,71,4,tianjin,blue
4,Oah,D,93,49,44,5,henan,black
5,Harlie,C,24,13,87,6,hebei,pink


In [None]:
# pipe() 应用于DF和Series
# apply() 应用在DF的行或列，默认为列
# applymap() 应用在DF的每个元素
# map() 应用在Series或DF的一列的每个元素

# 5.7.1 pipe()

In [10]:
def add_mean(rdf, n):
    """Add ``n`` to the quarterly scores and append their row-wise mean.

    Parameters
    ----------
    rdf : pandas.DataFrame
        Frame containing the columns selected by the label slice 'Q1':'Q4'.
    n : scalar
        Value added to every element of those columns.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the shifted 'Q1'..'Q4' columns plus an
        'avg' column with the mean of each row. ``rdf`` is left unmodified.
    """
    # Vectorised addition replaces the element-wise applymap (deprecated since
    # pandas 2.1). The arithmetic already produces a new frame, so the input
    # does not need a defensive copy.
    shifted = rdf.loc[:, 'Q1':'Q4'] + n
    shifted['avg'] = shifted.loc[:, 'Q1':'Q4'].mean(axis=1)
    return shifted

In [11]:
# 所有季度成绩加100，然后增加平均值
df.pipe(add_mean, 100)

Unnamed: 0,Q1,Q2,Q3,Q4,avg
0,189,121,124,101,133.75
1,155,137,137,102,132.75
2,157,160,133,103,138.25
3,193,104,171,104,143.0
4,193,149,144,105,147.75
5,124,113,187,106,132.5


In [16]:
# Keep the rows whose Q1 is at least 80 and whose Q2 is at least 20.
# pipe() forwards the extra positional arguments (80, 20) to the lambda.
df.pipe(
    lambda frame, q1_min, q2_min: frame.loc[(frame['Q1'] >= q1_min) & (frame['Q2'] >= q2_min)],
    80,
    20,
)

Unnamed: 0,name,team,Q1,Q2,Q3,Q4,1city,what color
0,Liver,E,89,21,24,1,shanghai,yellew
4,Oah,D,93,49,44,5,henan,black


# 5.7.2 apply()

In [23]:
import numpy as np
# Label a row 'good' when its team is 'C' and its Q1 is greater than 90; otherwise 'other'.
def fun(x):
    """Classify a single row; intended for ``DataFrame.apply(fun, axis=1)``.

    Parameters
    ----------
    x : row object exposing ``team`` and ``Q1`` attributes (e.g. a Series row).

    Returns
    -------
    str
        'good' if ``x.team == 'C'`` and ``x.Q1 > 90``, else 'other'.
    """
    # A plain conditional returns a real str; np.where on a scalar condition
    # would wrap the label in a 0-d NumPy array.
    return 'good' if (x.team == 'C' and x.Q1 > 90) else 'other'
df.apply(fun,axis=1)

0    other
1    other
2    other
3     good
4    other
5    other
dtype: object

In [20]:
df

Unnamed: 0,name,team,Q1,Q2,Q3,Q4,1city,what color
0,Liver,E,89,21,24,1,shanghai,yellew
1,Arry,C,55,37,37,2,nanjin,red
2,Ack,A,57,60,33,3,beijin,purple
3,Eorge,C,93,4,71,4,tianjin,blue
4,Oah,D,93,49,44,5,henan,black
5,Harlie,C,24,13,87,6,hebei,pink


# 5.7.4 map()

In [24]:
# map() looks each value up in the given correspondence (here a dict) and returns the mapped data.
# Values without a matching key come back as NaN (team 'E' has no entry below).

df.team.map({'A':'一班', 'B':'2班', 'C':'3班', 'D':'4班'})

0    NaN
1     3班
2     一班
3     3班
4     4班
5     3班
Name: team, dtype: object

In [25]:
df.team.map('I am a {}'.format)

0    I am a E
1    I am a C
2    I am a A
3    I am a C
4    I am a D
5    I am a C
Name: team, dtype: object

In [26]:
# na_action='ignore' propagates NaN values without passing them to the mapping
# function (per the pandas Series.map docs). `team` has no missing values here,
# so the result is identical to the call without na_action.
df.team.map('I am a {}'.format, na_action='ignore')

0    I am a E
1    I am a C
2    I am a A
3    I am a C
4    I am a D
5    I am a C
Name: team, dtype: object

In [29]:
# 获取name列名称的字符数量
def f(x):
    """Return the number of characters in the string form of ``x``."""
    text = str(x)
    return len(text)
df['name'].map(f)

0    5
1    4
2    3
3    5
4    3
5    6
Name: name, dtype: int64

# 5.7.5 agg()

In [30]:
# 每列的最大值
df.agg('max')

name              Oah
team                E
Q1                 93
Q2                 60
Q3                 87
Q4                  6
1city         tianjin
what color     yellew
dtype: object

In [31]:
# 将所有列聚合产生sum和min
df.agg(['sum', 'min'])

Unnamed: 0,name,team,Q1,Q2,Q3,Q4,1city,what color
sum,LiverArryAckEorgeOahHarlie,ECACDC,411,184,296,21,shanghainanjinbeijintianjinhenanhebei,yellewredpurpleblueblackpink
min,Ack,A,24,4,24,1,beijin,black


In [33]:
# 序列多个聚合
df.agg({'Q1':['sum', 'min'], 'Q2':['max', 'min']})

Unnamed: 0,Q1,Q2
sum,411.0,
min,24.0,4.0
max,,60.0


In [34]:
# Group by team and take the maximum of every remaining column within each group
# (not only Q1; the text columns name, 1city and what color are included as well).
df.groupby('team').agg('max')

Unnamed: 0_level_0,name,Q1,Q2,Q3,Q4,1city,what color
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A,Ack,57,60,33,3,beijin,purple
C,Harlie,93,37,87,6,tianjin,red
D,Oah,93,49,44,5,henan,black
E,Liver,89,21,24,1,shanghai,yellew


In [35]:
df


Unnamed: 0,name,team,Q1,Q2,Q3,Q4,1city,what color
0,Liver,E,89,21,24,1,shanghai,yellew
1,Arry,C,55,37,37,2,nanjin,red
2,Ack,A,57,60,33,3,beijin,purple
3,Eorge,C,93,4,71,4,tianjin,blue
4,Oah,D,93,49,44,5,henan,black
5,Harlie,C,24,13,87,6,hebei,pink


In [36]:
# 汇总Q1的合计与平均值
df.Q1.agg(['sum','mean'])

sum     411.0
mean     68.5
Name: Q1, dtype: float64

In [37]:
def mymean(x):
    """Return the mean of ``x`` by delegating to its own ``mean()`` method."""
    average = x.mean()
    return average
df.Q2.agg(['sum',mymean])

sum       184.000000
mymean     30.666667
Name: Q2, dtype: float64

In [38]:
# Aggregate each column with its own function using named aggregation:
# result_row_label=(column, function).
# The built-in reductions are given as string names; passing the callables
# max / min / np.mean makes recent pandas emit a FutureWarning because they are
# silently replaced by the pandas method of the same name.
df.agg(
    a=('Q1', 'max'),
    b=('Q2', 'min'),
    c=('Q3', 'mean'),
    d=('Q4', lambda s: s.sum() + 1),  # custom reduction: column total plus one
)

Unnamed: 0,Q1,Q2,Q3,Q4
a,93.0,,,
b,,4.0,,
c,,,49.333333,
d,,,,22.0


In [41]:
# 汇总Q1-Q4的平均值（按行聚合）
df.loc[:, 'Q1':'Q4'].agg("mean", axis="columns")

0    33.75
1    32.75
2    38.25
3    43.00
4    47.75
5    32.50
dtype: float64

In [42]:
# Add 10 to every value in Q1..Q4.
# DataFrame.add is the direct element-wise operation; routing pd.Series.add
# through agg() used an aggregation API for something that reduces nothing.
df.loc[:, 'Q1':'Q4'].add(10)

Unnamed: 0,Q1,Q2,Q3,Q4
0,99,31,34,11
1,65,47,47,12
2,67,70,43,13
3,103,14,81,14
4,103,59,54,15
5,34,23,97,16


# 5.7.6 transform()

In [43]:
# Multiply every column by 2: numeric values are doubled, strings are repeated twice.
df.transform(lambda x: x*2)

Unnamed: 0,name,team,Q1,Q2,Q3,Q4,1city,what color
0,LiverLiver,EE,178,42,48,2,shanghaishanghai,yellewyellew
1,ArryArry,CC,110,74,74,4,nanjinnanjin,redred
2,AckAck,AA,114,120,66,6,beijinbeijin,purplepurple
3,EorgeEorge,CC,186,8,142,8,tianjintianjin,blueblue
4,OahOah,DD,186,98,88,10,henanhenan,blackblack
5,HarlieHarlie,CC,48,26,174,12,hebeihebei,pinkpink


In [44]:
# Apply the square root and the exponential (e**x) to every numeric column.
# Only Q1..Q4 are selected explicitly: np.sqrt / np.exp cannot be evaluated on
# the text columns, so running them on the whole frame either emits a warning
# while dropping those columns or (NOTE(review): expected on current pandas,
# confirm) raises an error.
df.loc[:, 'Q1':'Q4'].transform([np.sqrt, np.exp])

  df.transform([np.sqrt, np.exp])


Unnamed: 0_level_0,Q1,Q1,Q2,Q2,Q3,Q3,Q4,Q4
Unnamed: 0_level_1,sqrt,exp,sqrt,exp,sqrt,exp,sqrt,exp
0,9.433981,4.489613e+38,4.582576,1318816000.0,4.898979,26489120000.0,1.0,2.718282
1,7.416198,7.694785e+23,6.082763,1.171914e+16,6.082763,1.171914e+16,1.414214,7.389056
2,7.549834,5.68572e+24,7.745967,1.142007e+26,5.744563,214643600000000.0,1.732051,20.085537
3,9.643651,2.451246e+40,2.0,54.59815,8.42615,6.837671e+30,2.0,54.59815
4,9.643651,2.451246e+40,7.0,1.907347e+21,6.63325,1.28516e+19,2.236068,148.413159
5,4.898979,26489120000.0,3.605551,442413.4,9.327379,6.07603e+37,2.44949,403.428793


In [45]:
# Absolute value and "+1" for every numeric column, side by side.
# Restricted to Q1..Q4 because neither function is defined for the text columns.
df.loc[:, 'Q1':'Q4'].transform([np.abs, lambda x: x + 1])

  df.transform([np.abs, lambda x:x+1])


Unnamed: 0_level_0,Q1,Q1,Q2,Q2,Q3,Q3,Q4,Q4
Unnamed: 0_level_1,absolute,<lambda>,absolute,<lambda>,absolute,<lambda>,absolute,<lambda>
0,89,90,21,22,24,25,1,2
1,55,56,37,38,37,38,2,3
2,57,58,60,61,33,34,3,4
3,93,94,4,5,71,72,4,5
4,93,94,49,50,44,45,5,6
5,24,25,13,14,87,88,6,7


In [48]:
df.transform({'Q1':np.abs, 'Q2':lambda x:x+1})

Unnamed: 0,Q1,Q2
0,89,22
1,55,38
2,57,61
3,93,5
4,93,50
5,24,14


In [51]:
# Group by team, then sum the numeric columns of each group.
# numeric_only=True excludes the text columns explicitly instead of relying on
# pandas dropping them implicitly (which triggered a warning here).
df.groupby('team').sum(numeric_only=True)

  df.groupby('team').sum()


Unnamed: 0_level_0,Q1,Q2,Q3,Q4
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,57,60,33,3
C,172,54,195,12
D,93,49,44,5
E,89,21,24,1


In [53]:
# transform('sum') computes the per-team sum and broadcasts it back onto the
# original rows: the result keeps the original row index, and every row shows
# the aggregated value of the group it belongs to.
# The numeric columns are selected explicitly, and the string name 'sum' is used
# instead of the built-in sum callable (a deprecated alias in recent pandas).
df.groupby('team')[['Q1', 'Q2', 'Q3', 'Q4']].transform('sum')

  df.groupby('team').transform(sum)


Unnamed: 0,Q1,Q2,Q3,Q4
0,89,21,24,1
1,172,54,195,12
2,57,60,33,3
3,172,54,195,12
4,93,49,44,5
5,172,54,195,12


# 5.7.7 copy()

In [54]:
# deep=True  深拷贝：副本拥有独立的数据和索引，修改副本不会影响原对象（反之亦然）
# deep=False 浅拷贝：副本与原对象共享数据，修改其中一方可能会影响另一方

In [82]:
s = pd.Series([1,2], index=['a','b'])
s

a    1
b    2
dtype: int64

In [83]:
s1 = s
scopy = s.copy(deep=True)

In [84]:
s1 is s

True

In [85]:
scopy is s 

False

In [86]:
scopy['a'] = 5

In [87]:
s

a    1
b    2
dtype: int64