# Pandas介绍
+ 作者Wes McKinney，项目开始于2008年
+ 基于NumPy构建

亮点：
+ 快速高效的数据操作
+ 方便的读写不同数据源
+ 数据对齐和缺失数据处理
+ 数据聚合
+ 合并和连接数据
+ 时间序列处理
+ 应用广泛（金融、经济学、统计学等）

# Pandas数据结构

In [1]:
# Series 创建
import pandas as pd
s = pd.Series([4, 5, 3, 1])


In [2]:
# 获取数值
print(s.values)


[4 5 3 1]


In [3]:
# 获取索引
print(s.index)

RangeIndex(start=0, stop=4, step=1)


In [4]:
# 显式指定索引
s2 = pd.Series([4, 5, 3, 1], index=['a', 'b', 'c', 'd'])
print(s2.index)


Index(['a', 'b', 'c', 'd'], dtype='object')


In [5]:
# 通过索引选取值
print(s2['a'])
s2['a'] = 6
s2[['a','b']]

4


a    6
b    5
dtype: int64

In [6]:
# 数组运算
import numpy as np
s2[s2>0]
s2*2
np.exp(s2)


a    403.428793
b    148.413159
c     20.085537
d      2.718282
dtype: float64

In [7]:
# 判断索引是否存在
'b' in s2
'e' in s2


False

In [8]:
# 通过字典创建Series
data = {'shanghai': 1, 'beijing': 2, 'shenzhen': 3}
s3 = pd.Series(data)

In [9]:
# 检查缺失值
s3['guangzhou'] = np.nan
s3.isnull()


shanghai     False
beijing      False
shenzhen     False
guangzhou     True
dtype: bool

In [10]:
# 按索引自动对齐
s4 = pd.Series({'shanghai': 1, 'beijing': 2, 'chongqing': 3})
s3 + s4


beijing      4.0
chongqing    NaN
guangzhou    NaN
shanghai     2.0
shenzhen     NaN
dtype: float64

In [11]:
# Series的name属性
s3.name = 'population'
s3.index.name = 'city'


In [12]:
# DataFrame 创建
data = {'year': [2017, 2018, 2019],
    'revenue': [10, 20, 30]}
df = pd.DataFrame(data)


In [13]:
# 指定列序列
df = pd.DataFrame(data, columns=['year', 'revenue'])


In [14]:
# 指定索引
df = pd.DataFrame(data, index=[4, 5, 6])


In [15]:
# 设置索引列
df.set_index('year', inplace=True)


In [16]:
# 通过传入Series创建DataFrame
s1 = pd.Series([1,2,3])
s2 = pd.Series([4,5,6])
df = pd.DataFrame({'A': s1, 'B': s2})


In [17]:
# 通过传入ndarray创建DataFrame
a1 = np.array([1,2,3])
a2 = np.array([4,5,6])
df = pd.DataFrame({'A': a1, 'B': a2})

In [18]:
# 创建带时间索引的DataFrame
dates = pd.date_range('2019-01-01', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))


# 练习一 - 见PPT

# 查看数据

In [19]:
# 查看顶部数据
df.head()

Unnamed: 0,A,B,C
2019-01-01,-0.514474,1.303681,-0.180275
2019-01-02,0.388647,0.166522,0.128407
2019-01-03,-0.999601,-1.025495,0.809373
2019-01-04,-1.366051,-1.437558,-1.412334
2019-01-05,0.46898,0.538422,0.128774


In [20]:
# 查看底部数据
df.tail(3)

Unnamed: 0,A,B,C
2019-01-04,-1.366051,-1.437558,-1.412334
2019-01-05,0.46898,0.538422,0.128774
2019-01-06,0.610557,-1.822243,0.718715


In [21]:
# 显示索引
df.index

DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05', '2019-01-06'],
              dtype='datetime64[ns]', freq='D')

In [22]:
# 显示列
df.columns

Index(['A', 'B', 'C'], dtype='object')

In [23]:
# 显示数据的统计摘要
df.describe()


Unnamed: 0,A,B,C
count,6.0,6.0,6.0
mean,-0.235324,-0.379445,0.03211
std,0.841604,1.232244,0.803207
min,-1.366051,-1.822243,-1.412334
25%,-0.878319,-1.334542,-0.103105
50%,-0.062913,-0.429487,0.128591
75%,0.448897,0.445447,0.57123
max,0.610557,1.303681,0.809373


In [24]:
# 转置数据
df.T


Unnamed: 0,2019-01-01,2019-01-02,2019-01-03,2019-01-04,2019-01-05,2019-01-06
A,-0.514474,0.388647,-0.999601,-1.366051,0.46898,0.610557
B,1.303681,0.166522,-1.025495,-1.437558,0.538422,-1.822243
C,-0.180275,0.128407,0.809373,-1.412334,0.128774,0.718715


In [25]:
# 按轴排序
df.sort_index(axis=1, ascending=False)


Unnamed: 0,C,B,A
2019-01-01,-0.180275,1.303681,-0.514474
2019-01-02,0.128407,0.166522,0.388647
2019-01-03,0.809373,-1.025495,-0.999601
2019-01-04,-1.412334,-1.437558,-1.366051
2019-01-05,0.128774,0.538422,0.46898
2019-01-06,0.718715,-1.822243,0.610557


In [26]:
# 按值排序
dates = pd.date_range('2019-01-01', periods=6)
df = pd.DataFrame(np.random.randn(6,3), index=dates, columns=list('ABC'))
df.sort_values(by='B')


Unnamed: 0,A,B,C
2019-01-06,-2.144984,-0.470365,0.066287
2019-01-02,-0.112901,-0.397299,-0.958325
2019-01-01,-1.602139,-0.026714,0.160011
2019-01-05,0.851178,0.374649,0.961546
2019-01-04,0.202018,0.773198,-1.567076
2019-01-03,0.852147,1.366912,-0.458827


# 选择数据

In [27]:
# 选择一个列
df['A']


2019-01-01   -1.602139
2019-01-02   -0.112901
2019-01-03    0.852147
2019-01-04    0.202018
2019-01-05    0.851178
2019-01-06   -2.144984
Freq: D, Name: A, dtype: float64

In [28]:
# 切片操作
df[1:3]
df['2019-01-02':'2019-01-05']


Unnamed: 0,A,B,C
2019-01-02,-0.112901,-0.397299,-0.958325
2019-01-03,0.852147,1.366912,-0.458827
2019-01-04,0.202018,0.773198,-1.567076
2019-01-05,0.851178,0.374649,0.961546


In [29]:
# 选择多列数据
df.loc[:, ['A', 'B']]


Unnamed: 0,A,B
2019-01-01,-1.602139,-0.026714
2019-01-02,-0.112901,-0.397299
2019-01-03,0.852147,1.366912
2019-01-04,0.202018,0.773198
2019-01-05,0.851178,0.374649
2019-01-06,-2.144984,-0.470365


In [30]:
# 在两个轴上切片
df.loc['2019-01-02':'2019-01-05', ['A','B']]


Unnamed: 0,A,B
2019-01-02,-0.112901,-0.397299
2019-01-03,0.852147,1.366912
2019-01-04,0.202018,0.773198
2019-01-05,0.851178,0.374649


In [31]:
# 获取一行
df.loc['2019-01-05']


A    0.851178
B    0.374649
C    0.961546
Name: 2019-01-05 00:00:00, dtype: float64

In [32]:
# 获取某一行的某一列
df.loc['2019-01-02', 'A']
df.at['2019-01-02', 'A']



-0.11290141667128237

In [33]:
# 按位置选择
df.iloc[3]


A    0.202018
B    0.773198
C   -1.567076
Name: 2019-01-04 00:00:00, dtype: float64

In [34]:
# 通过整数切片
df.iloc[3:5, 0:2]


Unnamed: 0,A,B
2019-01-04,0.202018,0.773198
2019-01-05,0.851178,0.374649


In [35]:
# 整行切片
df.iloc[3:5, :]


Unnamed: 0,A,B,C
2019-01-04,0.202018,0.773198,-1.567076
2019-01-05,0.851178,0.374649,0.961546


In [36]:
# 整列切片
df.iloc[:, 0:2]


Unnamed: 0,A,B
2019-01-01,-1.602139,-0.026714
2019-01-02,-0.112901,-0.397299
2019-01-03,0.852147,1.366912
2019-01-04,0.202018,0.773198
2019-01-05,0.851178,0.374649
2019-01-06,-2.144984,-0.470365


In [37]:
# 获取某一行某一列
df.iloc[1, 1]
df.iat[1, 1]


-0.39729874727821896

In [38]:
# 使用单个列的值来选择数据
df[df.A > 0]


Unnamed: 0,A,B,C
2019-01-03,0.852147,1.366912,-0.458827
2019-01-04,0.202018,0.773198,-1.567076
2019-01-05,0.851178,0.374649,0.961546


In [39]:
# 选择满足布尔条件的值
df[df > 0]


Unnamed: 0,A,B,C
2019-01-01,,,0.160011
2019-01-02,,,
2019-01-03,0.852147,1.366912,
2019-01-04,0.202018,0.773198,
2019-01-05,0.851178,0.374649,0.961546
2019-01-06,,,0.066287


In [40]:
# 新增列
df["D"] = [1,2,3,4,5,6]

# 精确匹配索引
s1 = pd.Series([6,5,4,3,2,1], index=pd.date_range('2019-01-01', periods=6))
df["D"] = s1

# 指定列名赋值
df.loc['2019-01-01', 'A'] = 0


In [41]:
# 通过位置赋值
df.iloc[0,1] = 0

# 使用NumPy数组赋值
df.loc[:, 'D'] = np.array([6] * len(df))

# 条件赋值
df[df < 0] = -df

# 练习二 - 见PPT

# 缺失值处理

In [42]:
# Pandas中用np.nan来表示缺失的数据
df.loc['2019-01-02', 'A'] = np.nan
df.loc['2019-01-03', 'B'] = np.nan


In [43]:
# 删除带有缺失值的行
df.dropna(how='any')


Unnamed: 0,A,B,C,D
2019-01-01,0.0,0.0,0.160011,6
2019-01-04,0.202018,0.773198,1.567076,6
2019-01-05,0.851178,0.374649,0.961546,6
2019-01-06,2.144984,0.470365,0.066287,6


In [44]:
# 填充缺失值
df.fillna(value=5)


Unnamed: 0,A,B,C,D
2019-01-01,0.0,0.0,0.160011,6
2019-01-02,5.0,0.397299,0.958325,6
2019-01-03,0.852147,5.0,0.458827,6
2019-01-04,0.202018,0.773198,1.567076,6
2019-01-05,0.851178,0.374649,0.961546,6
2019-01-06,2.144984,0.470365,0.066287,6


In [45]:
# 判断DataFrame中的元素是否nan
pd.isna(df)


Unnamed: 0,A,B,C,D
2019-01-01,False,False,False,False
2019-01-02,True,False,False,False
2019-01-03,False,True,False,False
2019-01-04,False,False,False,False
2019-01-05,False,False,False,False
2019-01-06,False,False,False,False


# 函数操作

In [46]:
# 获取最大值
print(df)
df.max()


                   A         B         C  D
2019-01-01  0.000000  0.000000  0.160011  6
2019-01-02       NaN  0.397299  0.958325  6
2019-01-03  0.852147       NaN  0.458827  6
2019-01-04  0.202018  0.773198  1.567076  6
2019-01-05  0.851178  0.374649  0.961546  6
2019-01-06  2.144984  0.470365  0.066287  6


A    2.144984
B    0.773198
C    1.567076
D    6.000000
dtype: float64

In [47]:
# 获取最小值
df.min()

A    0.000000
B    0.000000
C    0.066287
D    6.000000
dtype: float64

In [48]:
# 获取平均值
df.mean()

A    0.810065
B    0.403102
C    0.695345
D    6.000000
dtype: float64

In [49]:
# 获取标准差
df.std()

A    0.838361
B    0.276057
C    0.572394
D    0.000000
dtype: float64

In [50]:
# 获取最大值的索引
df.idxmax()

A   2019-01-06
B   2019-01-04
C   2019-01-04
D   2019-01-01
dtype: datetime64[ns]

In [51]:
# 累计求和
df.cumsum()

Unnamed: 0,A,B,C,D
2019-01-01,0.0,0.0,0.160011,6.0
2019-01-02,,0.397299,1.118336,12.0
2019-01-03,0.852147,,1.577163,18.0
2019-01-04,1.054165,1.170496,3.144239,24.0
2019-01-05,1.905342,1.545145,4.105785,30.0
2019-01-06,4.050326,2.01551,4.172071,36.0


In [52]:
# 理解axis参数 - 见PPT
df.max(axis=0) 
df.max(axis="index")
df.max(axis=1)
df.max(axis="columns")

2019-01-01    6.0
2019-01-02    6.0
2019-01-03    6.0
2019-01-04    6.0
2019-01-05    6.0
2019-01-06    6.0
Freq: D, dtype: float64

In [53]:
# 计算相关系数
df.corr()

Unnamed: 0,A,B,C,D
A,1.0,0.205536,-0.447228,
B,0.205536,1.0,0.703738,
C,-0.447228,0.703738,1.0,
D,,,,


In [54]:
# 计算每一个值出现的次数
df['A'].value_counts()

0.202018    1
0.852147    1
2.144984    1
0.851178    1
0.000000    1
Name: A, dtype: int64

In [55]:
# 移动数据
df.shift(1)


Unnamed: 0,A,B,C,D
2019-01-01,,,,
2019-01-02,0.0,0.0,0.160011,6.0
2019-01-03,,0.397299,0.958325,6.0
2019-01-04,0.852147,,0.458827,6.0
2019-01-05,0.202018,0.773198,1.567076,6.0
2019-01-06,0.851178,0.374649,0.961546,6.0


In [56]:
# 自定义函数
df.apply(lambda x: x.max()-x.min())


A    2.144984
B    0.773198
C    1.500789
D    0.000000
dtype: float64

# 练习三 - 见PPT

In [57]:
# Moving-window functions (see the slides for how rolling() steps through the data).
# Rolling mean of column A over a 3-row window.
df['A'].rolling(window=3).mean()
# Per-window range (max - min). raw=True passes each window to the callable as
# a NumPy array; stating it explicitly avoids the FutureWarning that older
# pandas releases emit when `raw` is left unspecified.
df['A'].rolling(window=3).apply(lambda x: x.max() - x.min(), raw=True)


  This is separate from the ipykernel package so we can avoid doing imports until


2019-01-01         NaN
2019-01-02         NaN
2019-01-03         NaN
2019-01-04         NaN
2019-01-05    0.650129
2019-01-06    1.942966
Freq: D, Name: A, dtype: float64

# 练习四 - 见PPT

In [58]:
# 移除重复数据
data = pd.DataFrame({'a': ['one']*3+['two']*3,
  'b': [1,1,1,2,2,2]})
data.duplicated()
data.drop_duplicates()
data.drop_duplicates(['b'])
data.drop_duplicates(['b'], keep="last")


Unnamed: 0,a,b
2,one,1
5,two,2


# 数据的连接和合并

In [None]:
# 使用concat连接数据

In [59]:
# 先定义几个DataFrame
df1 = pd.DataFrame({'A':['A0','A1','A2','A3'], 'B':['B0','B1','B2','B3'],
                    'C':['C0','C1','C2','C3'], 'D':['D0','D1','D2','D3']},
                   index=[0,1,2,3])
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [60]:
df2 = pd.DataFrame({'A':['A4','A5','A6','A7'], 'B':['B4','B5','B6','B7'],
                    'C':['C4','C5','C6','C7'], 'D':['D4','D5','D6','D7']},
                   index=[4,5,6,7])
df2

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [61]:
df3 = pd.DataFrame({'A':['A8','A9','A10','A11'], 'B':['B8','B9','B10','B11'],
                    'C':['C8','C9','C10','C11'], 'D':['D8','D9','D10','D11']},
                   index=[8,9,10,11])
df3

Unnamed: 0,A,B,C,D
8,A8,B8,C8,D8
9,A9,B9,C9,D9
10,A10,B10,C10,D10
11,A11,B11,C11,D11


In [62]:
# 相同字段的表首尾相接，图见PPT
pd.concat([df1,df2,df3])

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9
10,A10,B10,C10,D10
11,A11,B11,C11,D11


In [63]:
# To record which source table each row came from when concatenating, pass the
# `keys` argument; each key labels the rows of the corresponding DataFrame.
pd.concat([df1,df2,df3], keys=['x', 'y', 'z'])

Unnamed: 0,Unnamed: 1,A,B,C,D
x,0,A0,B0,C0,D0
x,1,A1,B1,C1,D1
x,2,A2,B2,C2,D2
x,3,A3,B3,C3,D3
y,4,A4,B4,C4,D4
y,5,A5,B5,C5,D5
y,6,A6,B6,C6,D6
y,7,A7,B7,C7,D7
z,8,A8,B8,C8,D8
z,9,A9,B9,C9,D9
z,10,A10,B10,C10,D10
z,11,A11,B11,C11,D11


In [66]:
# 横向连接，行对齐
# 指定axis=1
df4 = pd.DataFrame({'B': ['B2', 'B3', 'B6', 'B7'],
                     'D': ['D2', 'D3', 'D6', 'D7'],
                     'F': ['F2', 'F3', 'F6', 'F7']},
                    index=[2, 3, 6, 7])
pd.concat([df1, df4], axis=1)

Unnamed: 0,A,B,C,D,B,D,F
0,A0,B0,C0,D0,,,
1,A1,B1,C1,D1,,,
2,A2,B2,C2,D2,B2,D2,F2
3,A3,B3,C3,D3,B3,D3,F3
6,,,,,B6,D6,F6
7,,,,,B7,D7,F7


In [None]:
# 使用merge合并(join)数据

In [None]:
# join的几种方式，见PPT

In [74]:
# 在一个key上作inner join，图见PPT
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                      'A': ['A0', 'A1', 'A2', 'A3'],
                      'B': ['B0', 'B1', 'B2', 'B3']})
left

Unnamed: 0,key,A,B
0,K0,A0,B0
1,K1,A1,B1
2,K2,A2,B2
3,K3,A3,B3


In [73]:
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                       'C': ['C0', 'C1', 'C2', 'C3'],
                       'D': ['D0', 'D1', 'D2', 'D3']})
right

Unnamed: 0,key,C,D
0,K0,C0,D0
1,K1,C1,D1
2,K2,C2,D2
3,K3,C3,D3


In [71]:
# 默认作inner join
pd.merge(left, right, on='key')

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K2,A2,B2,C2,D2
3,K3,A3,B3,C3,D3


In [75]:
# 在多个key上做join，图见PPT
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                      'key2': ['K0', 'K1', 'K0', 'K1'],
                      'A': ['A0', 'A1', 'A2', 'A3'],
                      'B': ['B0', 'B1', 'B2', 'B3']})
left


Unnamed: 0,key1,key2,A,B
0,K0,K0,A0,B0
1,K0,K1,A1,B1
2,K1,K0,A2,B2
3,K2,K1,A3,B3


In [76]:
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                       'key2': ['K0', 'K0', 'K0', 'K0'],
                       'C': ['C0', 'C1', 'C2', 'C3'],
                       'D': ['D0', 'D1', 'D2', 'D3']})
right 

Unnamed: 0,key1,key2,C,D
0,K0,K0,C0,D0
1,K1,K0,C1,D1
2,K1,K0,C2,D2
3,K2,K0,C3,D3


In [79]:
pd.merge(left, right, on=['key1', 'key2'])

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K1,K0,A2,B2,C1,D1
2,K1,K0,A2,B2,C2,D2


In [80]:
# left join，图见PPT
pd.merge(left, right, how='left', on=['key1', 'key2'])


Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K0,K1,A1,B1,,
2,K1,K0,A2,B2,C1,D1
3,K1,K0,A2,B2,C2,D2
4,K2,K1,A3,B3,,


In [81]:
# right join，图见PPT
pd.merge(left, right, how='right', on=['key1', 'key2'])


Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K1,K0,A2,B2,C1,D1
2,K1,K0,A2,B2,C2,D2
3,K2,K0,,,C3,D3


In [82]:
# outer join，图见PPT
pd.merge(left, right, how='outer', on=['key1', 'key2'])

Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K0,K1,A1,B1,,
2,K1,K0,A2,B2,C1,D1
3,K1,K0,A2,B2,C2,D2
4,K2,K1,A3,B3,,
5,K2,K0,,,C3,D3


In [83]:
# 通过how参数的指定作inner join，inner join也是默认的join方式，图见PPT
pd.merge(left, right, how='inner', on=['key1', 'key2'])


Unnamed: 0,key1,key2,A,B,C,D
0,K0,K0,A0,B0,C0,D0
1,K1,K0,A2,B2,C1,D1
2,K1,K0,A2,B2,C2,D2


In [87]:
# join的两个DataFrame存在相同列名，pandas会自动对它们重新命名，图见PPT
left = pd.DataFrame({'A': [1, 2], 'B': [2, 2]})
left

Unnamed: 0,A,B
0,1,2
1,2,2


In [88]:
right = pd.DataFrame({'A': [4, 5, 6], 'B': [2, 2, 2]})
right

Unnamed: 0,A,B
0,4,2
1,5,2
2,6,2


In [89]:
pd.merge(left, right, on='B', how='outer')

Unnamed: 0,A_x,B,A_y
0,1,2,4
1,1,2,5
2,1,2,6
3,2,2,4
4,2,2,5
5,2,2,6


In [93]:
# 用索引作join
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                     index=['K0', 'K1', 'K2'])

left

Unnamed: 0,A,B
K0,A0,B0
K1,A1,B1
K2,A2,B2


In [94]:
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                       'D': ['D0', 'D2', 'D3']},
                      index=['K0', 'K2', 'K3'])
right

Unnamed: 0,C,D
K0,C0,D0
K2,C2,D2
K3,C3,D3


In [95]:
# 作left join，图见PPT
left.join(right)

Unnamed: 0,A,B,C,D
K0,A0,B0,C0,D0
K1,A1,B1,,
K2,A2,B2,C2,D2


In [96]:
# 作outer join，图见PPT
left.join(right, how='outer')

Unnamed: 0,A,B,C,D
K0,A0,B0,C0,D0
K1,A1,B1,,
K2,A2,B2,C2,D2
K3,,,C3,D3


In [98]:
# 作inner join，图见PPT
left.join(right, how='inner')
# 用merge也可以实现同样的功能
pd.merge(left, right, left_index=True, right_index=True, how='inner')

Unnamed: 0,A,B,C,D
K0,A0,B0,C0,D0
K2,A2,B2,C2,D2


In [100]:
# 列和索引作join，图见PPT
left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                      'B': ['B0', 'B1', 'B2', 'B3'],
                      'key': ['K0', 'K1', 'K0', 'K1']})
left 


Unnamed: 0,A,B,key
0,A0,B0,K0
1,A1,B1,K1
2,A2,B2,K0
3,A3,B3,K1


In [103]:
right = pd.DataFrame({'C': ['C0', 'C1'],
                       'D': ['D0', 'D1']},
                      index=['K0', 'K1'])
right

Unnamed: 0,C,D
K0,C0,D0
K1,C1,D1


In [105]:
pd.merge(left, right, left_on='key', right_index=True, how='left', sort=False)

Unnamed: 0,A,B,key,C,D
0,A0,B0,K0,C0,D0
1,A1,B1,K1,C1,D1
2,A2,B2,K0,C0,D0
3,A3,B3,K1,C1,D1


# 数据分组与聚合

In [107]:
# groupby机制，见PPT

In [141]:
# Group the rows and compute each group's mean (grouping diagram: see slides).
df = pd.DataFrame({
    'sex': ['F', 'M', 'F', 'M', 'M'],
    'height': [170, 165, 175, 172, 180],
    'weight': [55, 66, 60, 70, 65],
})

# Iterating a GroupBy yields (group key, sub-DataFrame) pairs. The loop
# variables are named after what they hold, so the loop no longer overwrites
# the notebook-level `data` variable.
df_gb = df.groupby("sex")
for sex, group in df_gb:
    print("group: ", sex)
    print("data: \n", group)

# Mean of the remaining columns within each group, reusing the GroupBy above.
df_gb.mean()

group:  F
data: 
   sex  height  weight
0   F     170      55
2   F     175      60
group:  M
data: 
   sex  height  weight
1   M     165      66
3   M     172      70
4   M     180      65


Unnamed: 0_level_0,height,weight
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
F,172.5,57.5
M,172.333333,67.0


In [142]:
# 使用agg作聚合操作
df.groupby(['sex']).agg(['mean', 'std'])   

Unnamed: 0_level_0,height,height,weight,weight
Unnamed: 0_level_1,mean,std,mean,std
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
F,172.5,3.535534,57.5,3.535534
M,172.333333,7.505553,67.0,2.645751


In [148]:
# 对不同列作不同的聚合操作，可以用给agg传入字典形式的参数
df.groupby('sex').agg({'height': ['mean', 'std'], 'weight': ['mean']})

Unnamed: 0_level_0,height,height,weight
Unnamed: 0_level_1,mean,std,mean
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
F,172.5,3.535534,57.5
M,172.333333,7.505553,67.0


In [143]:
# Pass a user-defined function to agg.
def peak_to_peak(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    return x.max() - x.min()

df.groupby(['sex']).agg(peak_to_peak)


Unnamed: 0_level_0,height,weight
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
F,5,5
M,15,5


# 练习五 - 见PPT

# 透视表和交叉表

In [172]:
# Pivot tables
# A pivot table is a common data-summarisation tool: it aggregates the data by
# one or more keys and arranges the results in a grid of row and column groups.
df = pd.DataFrame(
    {
        'sex': ['F', 'M', 'F', 'M', 'M'],
        'height': [150, 155, 175, 172, 180],
        'weight': [55, 66, 60, 70, 65],
        'age': [10, 15, 25, 35, 30],
    }
)

# Discretise age into the intervals (0, 18] and (18, 90]; assign() replaces
# the existing "age" column in place, so the column order is unchanged.
df = df.assign(age=pd.cut(df["age"], bins=[0, 18, 90]))
df

Unnamed: 0,sex,height,weight,age
0,F,150,55,"(0, 18]"
1,M,155,66,"(0, 18]"
2,F,175,60,"(18, 90]"
3,M,172,70,"(18, 90]"
4,M,180,65,"(18, 90]"


In [189]:
# index指定透视表建立时要根据哪些字段进行分组
# values指对哪些字段进行聚合操作
# aggfunc指定聚合函数，默认的聚合函数是mean，也就是求平均值
# 分解步骤见PPT
pd.pivot_table(df, index="sex", values="height", aggfunc='mean')

Unnamed: 0_level_0,height
sex,Unnamed: 1_level_1
F,162.5
M,169.0


In [173]:
# 添加列索引，按不同值分组
# 分解步骤见PPT
pd.pivot_table(df, index="sex", columns="age", values="height", aggfunc='mean')

age,"(0, 18]","(18, 90]"
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
F,150,175
M,155,176


In [175]:
# 添加多个聚合列
pd.pivot_table(df, index="sex", values=["height", "weight"], aggfunc='mean')

Unnamed: 0_level_0,height,weight
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
F,162.5,57.5
M,169.0,67.0


In [176]:
# 多个聚合函数
# aggfunc传入list，则每个聚合函数对每个列都进行一次聚合
pd.pivot_table(df, index="sex", values="height", aggfunc=['mean', 'max'])

Unnamed: 0_level_0,mean,max
Unnamed: 0_level_1,height,height
sex,Unnamed: 1_level_2,Unnamed: 2_level_2
F,162.5,175
M,169.0,180


In [178]:
# aggfunc传入dict，则每个列仅对其指定的函数进行聚合，此时values参数可以不传
pd.pivot_table(df, index="sex", aggfunc={"height": 'mean', "weight": 'max'})

Unnamed: 0_level_0,height,weight
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
F,162.5,60
M,169.0,70


In [182]:
# Add summary (margin) entries.
# margins=True adds totals along the rows and columns; margins_name labels
# them "Total" here, whereas the pandas default label is "All".
pd.pivot_table(df, index="sex", values="height", aggfunc='count', margins=True, margins_name="Total")

Unnamed: 0_level_0,height
sex,Unnamed: 1_level_1
F,2
M,3
Total,5


In [183]:
# 交叉表 cross table
# 交叉表是一种常用的分类汇总表格，用于频数分布统计
# 默认计算频数
pd.crosstab(index=df.sex, columns=df.age)

age,"(0, 18]","(18, 90]"
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
F,1,1
M,1,2


In [184]:
# aggfunc selects the aggregation applied to `values`; without it crosstab
# counts frequencies. The string name 'sum' is used rather than the builtin
# sum, which newer pandas releases flag with a FutureWarning.
pd.crosstab(index=df.sex, columns=df.age, values=df.weight, aggfunc='sum')

age,"(0, 18]","(18, 90]"
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
F,55,60
M,66,135


In [188]:
# 添加汇总项
pd.crosstab(index=df.sex, columns=df.age, margins=True, margins_name="Total")

age,"(0, 18]","(18, 90]",Total
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
F,1,1,2
M,1,2,3
Total,2,3,5


# 练习六 - 见PPT

# pandas读写csv

In [195]:
# 保存dataframe到csv
df = pd.DataFrame({
  'sex':['F','M','F','M','M'],
  'height': [150,155,175,172,180],
  'weight': [55,66,60,70,65]})
df.to_csv("people.csv")

In [200]:
# 读取csv到dataframe
df2 = pd.read_csv("people.csv", index_col=0)
df2

Unnamed: 0,sex,height,weight
0,F,150,55
1,M,155,66
2,F,175,60
3,M,172,70
4,M,180,65


# 实战项目一 - 见PPT

# 实战项目二 - 见PPT