In [1]:
import numpy as np
import pandas as pd

# 1.根据索引合并

### 1.1 merge()

In [2]:
# Left-hand frame: the string column "key" is later matched against df2's index.
df1 = pd.DataFrame(
    dict(
        key=["a", "b", "a", "a", "b", "c", "b"],
        value=[11, 22, 33, 44, 55, 66, 77],
    )
)
df1

Unnamed: 0,key,value
0,a,11
1,b,22
2,a,33
3,a,44
4,b,55
5,c,66
6,b,77


In [3]:
# Right-hand frame: one integer column whose row labels are the key values.
df2 = pd.DataFrame(
    data=[99, 100],
    index=["a", "b"],
    columns=["value2"],
)
df2

Unnamed: 0,value2
a,99
b,100


In [4]:
# Inner merge (the default): match df1's "key" column against df2's index.
df1.merge(df2, left_on="key", right_index=True)

Unnamed: 0,key,value,value2
0,a,11,99
2,a,33,99
3,a,44,99
1,b,22,100
4,b,55,100
6,b,77,100


In [5]:
# Outer merge: also keep df1 rows whose "key" has no matching label in df2.
df1.merge(
    df2,
    how="outer",
    left_on="key",
    right_index=True,
)

Unnamed: 0,key,value,value2
0,a,11,99.0
2,a,33,99.0
3,a,44,99.0
1,b,22,100.0
4,b,55,100.0
6,b,77,100.0
5,c,66,


#### 当右侧数据为多层索引时，给 left_on 传入与各层索引对应的列名列表即可

In [6]:
# Left-hand frame built row by row; (key1, key2) together form the join key.
df1 = pd.DataFrame(
    [
        ("a", 2000, 11),
        ("a", 2001, 22),
        ("a", 2002, 33),
        ("b", 2001, 44),
        ("b", 2002, 55),
    ],
    columns=["key1", "key2", "value"],
)
df1

Unnamed: 0,key1,key2,value
0,a,2000,11
1,a,2001,22
2,a,2002,33
3,b,2001,44
4,b,2002,55


In [7]:
# Two-level index given as (letter, year) pairs; ("a", 2000) is listed twice.
df2_index = pd.MultiIndex.from_tuples(
    [
        ("b", 2001),
        ("b", 2000),
        ("a", 2000),
        ("a", 2000),
        ("a", 2001),
        ("a", 2002),
    ]
)
df2_index

MultiIndex([('b', 2001),
            ('b', 2000),
            ('a', 2000),
            ('a', 2000),
            ('a', 2001),
            ('a', 2002)],
           )

In [8]:
# Right-hand frame: two integer columns, both labelled by the MultiIndex df2_index.
df2 = pd.DataFrame(
    {
        "val1": [0, 2, 4, 6, 8, 10],
        "val2": [1, 3, 5, 7, 9, 11],
    },
    index=df2_index,
)
df2

Unnamed: 0,Unnamed: 1,val1,val2
b,2001,0,1
b,2000,2,3
a,2000,4,5
a,2000,6,7
a,2001,8,9
a,2002,10,11


In [9]:
# Inner merge: the column pair (key1, key2) is matched against df2's two index levels.
df1.merge(df2, left_on=["key1", "key2"], right_index=True)

Unnamed: 0,key1,key2,value,val1,val2
0,a,2000,11,4,5
0,a,2000,11,6,7
1,a,2001,22,8,9
2,a,2002,33,10,11
3,b,2001,44,0,1


In [10]:
# Outer merge on the same keys: unmatched rows from either side are kept.
df1.merge(
    df2,
    how="outer",
    left_on=["key1", "key2"],
    right_index=True,
)

Unnamed: 0,key1,key2,value,val1,val2
0,a,2000,11.0,4.0,5.0
0,a,2000,11.0,6.0,7.0
1,a,2001,22.0,8.0,9.0
2,a,2002,33.0,10.0,11.0
3,b,2001,44.0,0.0,1.0
4,b,2002,55.0,,
4,b,2000,,2.0,3.0


#### 使用两边的索引进行合并

In [11]:
# The row labels must be an ordered sequence. A set literal has no defined
# order, so which label lands on which row could change between kernel
# restarts. The list below keeps the label order shown in the recorded outputs.
df1 = pd.DataFrame(
    {"key1": [11, 22, 33], "key2": [99, 88, 77]},
    index=["b", "c", "a"],
)
df1

Unnamed: 0,key1,key2
b,11,99
c,22,88
a,33,77


In [12]:
# Right-hand frame given row by row; it shares labels "b" and "c" with df1.
df2 = pd.DataFrame(
    [[9, 1], [10, 2], [11, 3]],
    index=["b", "d", "c"],
    columns=["value1", "value2"],
)
df2

Unnamed: 0,value1,value2
b,9,1
d,10,2
c,11,3


In [13]:
# Inner merge that uses the row labels of both frames as the join key.
df1.merge(df2, left_index=True, right_index=True)

Unnamed: 0,key1,key2,value1,value2
b,11,99,9,1
c,22,88,11,3


In [14]:
# Outer merge on both indexes: labels present in only one frame are kept.
df1.merge(
    df2,
    how="outer",
    left_index=True,
    right_index=True,
)

Unnamed: 0,key1,key2,value1,value2
a,33.0,77.0,,
b,11.0,99.0,9.0,1.0
c,22.0,88.0,11.0,3.0
d,,,10.0,2.0


### 1.2 join()

In [15]:
# The row labels must be an ordered sequence. A set literal has no defined
# order, so which label lands on which row could change between kernel
# restarts. The list below keeps the label order shown in the recorded outputs.
df1 = pd.DataFrame(
    {"key1": [11, 22, 33], "key2": [99, 88, 77]},
    index=["b", "c", "a"],
)
df1

Unnamed: 0,key1,key2
b,11,99
c,22,88
a,33,77


In [16]:
# Frame to be joined onto df1 by row label.
df2 = pd.DataFrame(
    dict(value1=[9, 10, 11], value2=[1, 2, 3]),
    index=["b", "d", "c"],
)
df2

Unnamed: 0,value1,value2
b,9,1
d,10,2
c,11,3


In [17]:
# join() matches on the row labels of both frames; "outer" keeps every label.
df1.join(other=df2, how="outer")

Unnamed: 0,key1,key2,value1,value2
a,33.0,77.0,,
b,11.0,99.0,9.0,1.0
c,22.0,88.0,11.0,3.0
d,,,10.0,2.0


#### join() 支持通过 on 参数，将调用方 DataFrame 的某一列与传入 DataFrame 的索引进行连接

In [18]:
# Calling frame for the column-to-index join: "key" holds the labels to look up.
df1 = pd.DataFrame(
    {
        "key": list("abaabcb"),
        "value": [11, 22, 33, 44, 55, 66, 77],
    }
)
df1

Unnamed: 0,key,value
0,a,11
1,b,22
2,a,33
3,a,44
4,b,55
5,c,66
6,b,77


In [19]:
# Lookup frame: its row labels are the values found in df1["key"].
df2 = pd.DataFrame(dict(value2=[99, 100]), index=["a", "b"])
df2

Unnamed: 0,value2
a,99
b,100


In [20]:
# Join df1's "key" column against df2's index, keeping rows with no match.
df1.join(
    other=df2,
    on="key",
    how="outer",
)

Unnamed: 0,key,value,value2
0,a,11,99.0
2,a,33,99.0
3,a,44,99.0
1,b,22,100.0
4,b,55,100.0
6,b,77,100.0
5,c,66,


#### 传递要连接的 DataFrame 列表

In [21]:
# The row labels must be an ordered sequence. A set literal has no defined
# order, so which label lands on which row could change between kernel
# restarts. The list below keeps the label order shown in the recorded outputs.
df1 = pd.DataFrame(
    {"key1": [11, 22, 33], "key2": [99, 88, 77]},
    index=["b", "c", "a"],
)
df1

Unnamed: 0,key1,key2
b,11,99
c,22,88
a,33,77


In [22]:
# First frame in the list passed to join().
df2 = pd.DataFrame(
    data={
        "value1": [9, 10, 11],
        "value2": [1, 2, 3],
    },
    index=pd.Index(["b", "d", "c"]),
)
df2

Unnamed: 0,value1,value2
b,9,1
d,10,2
c,11,3


In [23]:
# Second frame in the list passed to join(), given column by column.
df3 = pd.DataFrame(
    {
        "score1": [7, 9, 11, 16],
        "score2": [8, 10, 12, 17],
    },
    index=["a", "c", "e", "f"],
)
df3

Unnamed: 0,score1,score2
a,7,8
c,9,10
e,11,12
f,16,17


In [24]:
# Join several frames at once by row label, using join()'s default `how`.
df1.join(other=[df2, df3])

Unnamed: 0,key1,key2,value1,value2,score1,score2
b,11.0,99.0,9.0,1.0,,
c,22.0,88.0,11.0,3.0,9.0,10.0
a,33.0,77.0,,,7.0,8.0


In [25]:
# Same multi-frame join with how="outer", so labels missing from df1 are kept too.
df1.join(
    other=[df2, df3],
    how="outer",
)

Unnamed: 0,key1,key2,value1,value2,score1,score2
b,11.0,99.0,9.0,1.0,,
c,22.0,88.0,11.0,3.0,9.0,10.0
a,33.0,77.0,,,7.0,8.0
d,,,10.0,2.0,,
e,,,,,11.0,12.0
f,,,,,16.0,17.0
