In [1]:
import pandas as pd
import numpy as np

In [2]:
# Hierarchical (multi-level) indexing.
# Passing a list of two label lists as the index builds a two-level MultiIndex:
# the first list supplies the outer labels, the second the inner labels.
# A locally seeded generator keeps the values identical on every
# Restart & Run All instead of drawing from the unseeded global RNG.
data_rng = np.random.default_rng(42)
data = pd.Series(data_rng.standard_normal(5),
                 index=[['a', 'a', 'b', 'b', 'b'], [1, 2, 2, 1, 3]])
data

a  1   -0.177318
   2   -0.897174
b  2   -0.855772
   1    0.108593
   3    2.346756
dtype: float64

In [3]:
# Inspect the two-level MultiIndex that pandas built from the nested index lists.
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 2),
            ('b', 1),
            ('b', 3)],
           )

In [4]:
# Select by the outer-level label 'a'; the result is indexed by the inner level only.
data.loc['a']

1   -0.177318
2   -0.897174
dtype: float64

In [5]:
data.loc[:,1]
# Select, across all outer labels, the entries whose second-level label is 1.

a   -0.177318
b    0.108593
dtype: float64

In [6]:
data.unstack()
# unstack() moves the inner index level into columns, turning the Series into a DataFrame.
# DataFrame.stack() is the inverse: it moves the columns back into an inner index level.

Unnamed: 0,1,2,3
a,-0.177318,-0.897174,
b,0.108593,-0.855772,2.346756


In [7]:
# A DataFrame can also use multi-level indexes on either axis: index=[[...], [...]], columns=[[...], [...]]

In [8]:
# Build a 2x4 frame of integer scores in [80, 120) whose columns form a
# two-level MultiIndex: subject (outer level) x like/dislike (inner level).
# A locally seeded generator keeps the values identical on every
# Restart & Run All instead of drawing from the unseeded global RNG.
df1_rng = np.random.default_rng(8)
df1 = pd.DataFrame(df1_rng.integers(80, 120, size=(2, 4)),
                   index=['girl', 'boy'],
                   columns=[['English', 'English', 'Chinese', 'Chinese'],
                            ['like', 'dislike', 'like', 'dislike']])
# Name the two column levels so later cells can refer to them by name.
df1.columns.names = ['kemu', 'ai']
df1

kemu,English,English,Chinese,Chinese
ai,like,dislike,like,dislike
girl,92,101,87,113
boy,101,93,116,104


In [9]:
df1.swaplevel('kemu','ai',axis=1)
# Swap the two column-index levels; the columns themselves are not reordered.

ai,like,dislike,like,dislike
kemu,English,English,Chinese,Chinese
girl,92,101,87,113
boy,101,93,116,104


In [10]:
df1.sort_index(axis=1,level=0)
# level selects which index level the sort is keyed on (here the outer column level).

kemu,Chinese,Chinese,English,English
ai,dislike,like,dislike,like
girl,113,87,101,92
boy,104,116,93,101


In [11]:
# Per row, sum the columns that share the same second-level ('ai') label.
# DataFrame.sum(level=...) was deprecated in pandas 1.3 and removed in 2.0, so
# group the transposed frame by that level instead and transpose back.
# sort=False keeps the labels in order of first appearance (like, dislike),
# matching what sum(level=1, axis=1) produced.
df1.T.groupby(level=1, sort=False).sum().T

ai,like,dislike
girl,179,214
boy,217,197


In [12]:
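# df.set_index(['a', 'b'], drop=True/False) makes columns a and b the multi-level index; drop controls whether those columns are removed from the frame
# df.reset_index() is the inverse: it moves the index levels back into ordinary columns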

In [13]:
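# Combining and merging datasets
# pd.merge() joins rows on key values
# pd.concat() glues objects together along an axis
# combine_first() splices overlapping data, filling the gaps in one object from another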

In [14]:
# Two small frames sharing a 'key' column: df2 repeats key 'a' and has a key
# 'd' that df3 lacks.
df2 = pd.DataFrame({'key': list('abcda'), 'data': range(5)})
df3 = pd.DataFrame({'key': list('abc'), 'data': range(3)})
# on= names the column to join on; only keys present in both frames are kept.
df2.merge(df3, on='key')

Unnamed: 0,key,data_x,data_y
0,a,0,0
1,a,4,0
2,b,1,1
3,c,2,2


In [15]:
# The default join is 'inner', which keeps the intersection of the keys.
# 'outer' keeps the union of the keys; 'left' keeps every key of the left frame
# and 'right' every key of the right frame.
pd.merge(df2,df3,on='key',how='outer',suffixes=('_df1','_df2'))
# on may also be a list of column names.
# suffixes=('_df1', '_df2') are appended to same-named non-key columns of the left
# and right frames respectively (note: here df2 receives '_df1' and df3 receives '_df2').
# left_on / right_on name the join keys separately for the left and right frames.

Unnamed: 0,key,data_df1,data_df2
0,a,0,0.0
1,a,4,0.0
2,b,1,1.0
3,c,2,2.0
4,d,3,


In [16]:
# right_index=True / left_index=True use that frame's index as its join key.
df3

Unnamed: 0,key,data
0,a,0
1,b,1
2,c,2


In [17]:
# Frame whose row labels are key values, used to demonstrate joining a column
# of one frame against the index of another.
df4 = pd.DataFrame(data={'data4': range(2)}, index=list('ab'))
df4

Unnamed: 0,data4
a,0
b,1


In [18]:
pd.merge(df3,df4,left_on='key',right_index=True,how='outer')
# If df4 had a multi-level index, left_on would be a list of columns (one per level).
# If both left_index and right_index are True, the join is done purely on the indexes.
# df.join(df4) is a shorthand: without on= it joins on the index and defaults to a
# left join. The original note recommends preferring concat where possible.

Unnamed: 0,key,data,data4
0,a,0,0.0
1,b,1,1.0
2,c,2,


In [19]:
# concat glues objects together along an axis.
# Three small Series to concatenate; s2 and s3 share the label 'c'.
s1 = pd.Series({'a': 0, 'b': 1})
s2 = pd.Series({'c': 2, 'd': 3})
s3 = pd.Series({'c': 4, 'e': 5})

In [20]:
# With the default axis=0 the Series are joined end to end; the duplicate label 'c' is kept.
pd.concat([s1,s2,s3])

a    0
b    1
c    2
d    3
c    4
e    5
dtype: int64

In [21]:
pd.concat([s1,s2,s3],keys=['s1','s2','s3'])
# keys adds an outer index level that records which input each row came from.

s1  a    0
    b    1
s2  c    2
    d    3
s3  c    4
    e    5
dtype: int64

In [22]:
# Unstacking the keyed result spreads the inner labels into columns, one row per key.
pd.concat([s1,s2,s3],keys=['s1','s2','s3']).unstack()

Unnamed: 0,a,b,c,d,e
s1,0.0,1.0,,,
s2,,,2.0,3.0,
s3,,,4.0,,5.0


In [23]:
pd.concat([s1,s2,s3],keys=['s1','s2','s3'],axis=1)
# With axis=1 the keys become the column names.

Unnamed: 0,s1,s2,s3
a,0.0,,
b,1.0,,
c,,2.0,4.0
d,,3.0,
e,,,5.0


In [24]:
# Concatenating along axis=1 aligns on the index; without keys the columns are numbered 0, 1.
pd.concat([s2,s3],axis=1)

Unnamed: 0,0,1
c,2.0,4.0
d,3.0,
e,,5.0


In [25]:
pd.concat([s2,s3],axis=1,join='inner')
# join='inner' keeps only the labels present in every input; the default is 'outer'.

Unnamed: 0,0,1
c,2,4


In [26]:
# keys labels each frame's columns with an outer level; names names the two column levels.
pd.concat([df2,df3],axis=1,keys=['df2','df3'],names=['zhou1','zhou2'])

zhou1,df2,df2,df3,df3
zhou2,key,data,key,data
0,a,0,a,0.0
1,b,1,b,1.0
2,c,2,c,2.0
3,d,3,,
4,a,4,,


In [27]:
# With the default axis=0 the frames are stacked vertically, keeping their original row labels.
pd.concat([df2,df3])

Unnamed: 0,key,data
0,a,0
1,b,1
2,c,2
3,d,3
4,a,4
0,a,0
1,b,1
2,c,2


In [28]:
pd.concat([df2,df3],ignore_index=True)
# ignore_index=True discards the original row labels and renumbers the result from 0.

Unnamed: 0,key,data
0,a,0
1,b,1
2,c,2
3,d,3
4,a,4
5,a,0
6,b,1
7,c,2


In [29]:
# combine_first() splices overlapping data together.
# a.combine_first(b) fills the missing values of a with values from b; values already present in a are kept.
# Blank out one key in df3 (in place) so that there is a gap to fill.
df3.loc[0,'key']=np.nan
df3

Unnamed: 0,key,data
0,,0
1,b,1
2,c,2


In [30]:
# Show df2 for reference before it is used as the fill source in the next cell.
df2

Unnamed: 0,key,data
0,a,0
1,b,1
2,c,2
3,d,3
4,a,4


In [31]:
# The gap in df3 is filled from df2, and the rows that exist only in df2 (labels 3 and 4) are added.
df3.combine_first(df2)

Unnamed: 0,key,data
0,a,0.0
1,b,1.0
2,c,2.0
3,d,3.0
4,a,4.0


In [32]:
# Reshaping
# stack(): columns move into an inner row-index level (the result has a multi-level index).
# unstack(): the inner row-index level moves back out into columns.
df3.stack(dropna=True)
# Per the original note, dropna=True is the default, so missing entries are omitted from the result.
# NOTE(review): newer pandas releases rework stack() (future_stack) and may not accept dropna — confirm against the installed version.

0  data    0
1  key     b
   data    1
2  key     c
   data    2
dtype: object

In [33]:
df3.stack().unstack()
# unstack() accepts an argument naming the level to unstack.

Unnamed: 0,key,data
0,,0
1,b,1
2,c,2


In [34]:
# pivot(index, columns, values): spreads one column out into several columns.
# pd.melt(): the reverse, folds several columns into one.
df2[:4]

Unnamed: 0,key,data
0,a,0
1,b,1
2,c,2
3,d,3


In [35]:
pd.melt(df2[:4],id_vars=['key'])
# The data column is folded into variable/value pairs, with key kept as the identifier column;
# value_vars=["column name"] can restrict which columns end up in variable.

Unnamed: 0,key,variable,value
0,a,data,0
1,b,data,1
2,c,data,2
3,d,data,3


In [36]:
# Round trip: melt the first four rows of df2 into long form, then pivot back to wide form.
# pivot's index/columns/values are passed by keyword because pandas 2.0 made them
# keyword-only; the old positional call pivot('key', 'variable', 'value') raises a
# TypeError there.
pd.melt(df2[:4], id_vars=['key']).pivot(index='key', columns='variable', values='value')
# The key values become the row index, the variable values become the column names,
# and the value column supplies the cell contents.

variable,data
key,Unnamed: 1_level_1
a,0
b,1
c,2
d,3
