In [2]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

In [3]:
# Official docs: https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html
# merge matches the rows of two tables and joins them into one.
# It supports four join modes: inner, left, right and outer.
# Example: result = pd.merge(left, right, how='inner', on=['key1', 'key2'])


In [4]:
# Load the two workbooks that the merge examples below join together.
# read_excel already returns a DataFrame for the default single sheet, so the
# extra pd.DataFrame(...) wrapper is unnecessary.
loanstats = pd.read_excel('loanStats.xlsx')
member_grade = pd.read_excel('member_grade.xlsx')

In [5]:
# Inner join (intersection): keep only rows whose key appears in both tables.
# The key is named explicitly so the join cannot silently change if the two
# tables ever gain another column with a shared name.
loan_inner = pd.merge(loanstats, member_grade, how='inner', on='member_id')

In [6]:
# Show the inner-join result.
loan_inner

Unnamed: 0,id,member_id,loan_amnt,term,int_rate,grade
0,1077501,1296599,500,36 months,0.1065,B
1,1077175,1313524,2400,36 months,0.1596,C
2,1075269,1311441,5000,36 months,0.079,A
3,1071795,1306957,5600,60 months,0.2128,F


In [7]:
# Left join: keep every row of loanstats and attach the matching member_grade
# columns where a match exists (missing otherwise).
# The key is named explicitly so the join cannot silently change if the two
# tables ever gain another column with a shared name.
loan_left = pd.merge(loanstats, member_grade, how='left', on='member_id')

In [8]:
# Show the left-join result.
loan_left

Unnamed: 0,id,member_id,loan_amnt,term,int_rate,grade
0,1077501,1296599,500,36 months,0.1065,B
1,1077175,1313524,2400,36 months,0.1596,C
2,1075358,1311748,3000,60 months,0.1269,
3,1075269,1311441,5000,36 months,0.079,A
4,1072053,1288686,3000,36 months,0.1864,
5,1071795,1306957,5600,60 months,0.2128,F


In [9]:
# Right join: keep every row of member_grade and attach the matching loanstats
# columns where a match exists (missing otherwise).
# The key is named explicitly so the join cannot silently change if the two
# tables ever gain another column with a shared name.
loan_right = pd.merge(loanstats, member_grade, how='right', on='member_id')

In [10]:
# Show the right-join result.
loan_right

Unnamed: 0,id,member_id,loan_amnt,term,int_rate,grade
0,1077501.0,1296599,500.0,36 months,0.1065,B
1,1077175.0,1313524,2400.0,36 months,0.1596,C
2,,1277178,,,,C
3,1075269.0,1311441,5000.0,36 months,0.079,A
4,,1304742,,,,C
5,1071795.0,1306957,5600.0,60 months,0.2128,F


In [11]:
# Outer join (union): keep the rows of both tables, leaving values missing
# wherever one side has no match.
# The key is named explicitly so the join cannot silently change if the two
# tables ever gain another column with a shared name.
loan_outer = pd.merge(loanstats, member_grade, how='outer', on='member_id')

In [12]:
# Show the outer-join result.
loan_outer

Unnamed: 0,id,member_id,loan_amnt,term,int_rate,grade
0,1077501.0,1296599,500.0,36 months,0.1065,B
1,1077175.0,1313524,2400.0,36 months,0.1596,C
2,1075358.0,1311748,3000.0,60 months,0.1269,
3,1075269.0,1311441,5000.0,36 months,0.079,A
4,1072053.0,1288686,3000.0,36 months,0.1864,
5,1071795.0,1306957,5600.0,60 months,0.2128,F
6,,1277178,,,,C
7,,1304742,,,,C


##### 合并 MultiIndex Series 和 DataFrame

In [13]:
# Small example frame: letters A-C paired with the numbers 1-3.
df = pd.DataFrame({"Let": list("ABC"), "Num": list(range(1, 4))})

In [14]:
# Show the example frame.
df

Unnamed: 0,Let,Num
0,A,1
1,B,2
2,C,3


In [15]:
# Series of six letters indexed by a two-level MultiIndex whose level names
# ("Let", "Num") match the column names of df.
ser_index = pd.MultiIndex.from_arrays(
    [["A", "B", "C"] * 2, list(range(1, 7))],
    names=["Let", "Num"],
)
ser = pd.Series(list("abcdef"), index=ser_index)

In [16]:
# Show the MultiIndex Series.
ser

Let  Num
A    1      a
B    2      b
C    3      c
A    4      d
B    5      e
C    6      f
dtype: object

In [23]:
# Call Series.reset_index() before merging: it turns the Series into a
# DataFrame whose index levels become ordinary columns to join on.
df.merge(
    ser.reset_index(),
    on=["Let", "Num"],
)


Unnamed: 0,Let,Num,0
0,A,1,a
1,B,2,b
2,C,3,c


##### 检查重复键

In [None]:
# The validate argument automatically checks the merge keys for unexpected
# duplicates. Key uniqueness is checked before the merge runs, which should
# guard against memory overflows; it is also a good way to confirm that the
# data has the structure the user expects.

In [20]:
# Left-hand table for the validate example; its "B" values (1, 2) are unique.
left = pd.DataFrame({column: [1, 2] for column in ("A", "B")})
left

Unnamed: 0,A,B
0,1,1
1,2,2


In [21]:
# Right-hand table for the validate example; every row has the same "B" value,
# so the key is duplicated on this side.
right = pd.DataFrame({"A": list(range(4, 7)), "B": [2] * 3})
right

Unnamed: 0,A,B
0,4,2
1,5,2
2,6,2


In [30]:
# Outer join on "B"; validate="one_to_many" asks pandas to check that the
# merge key is unique in the left table before merging.
left.merge(right, on="B", how="outer", validate="one_to_many")

Unnamed: 0,A_x,B,A_y
0,1,1,
1,2,2,4.0
2,2,2,5.0
3,2,2,6.0


##### 合并指示器


In [31]:
# Left-hand table for the indicator example.
df1 = pd.DataFrame(dict(col1=list(range(2)), col_left=list("ab")))
df1

Unnamed: 0,col1,col_left
0,0,a
1,1,b


In [32]:
# Right-hand table for the indicator example; col1 == 2 appears twice.
df2 = pd.DataFrame(dict(col1=[1, 2, 2], col_right=[2] * 3))
df2

Unnamed: 0,col1,col_right
0,1,2
1,2,2
2,2,2


In [33]:
# indicator=True adds a "_merge" column recording whether each row came from
# the left table only, the right table only, or both.
df1.merge(df2, on="col1", how="outer", indicator=True)


Unnamed: 0,col1,col_left,col_right,_merge
0,0,a,,left_only
1,1,b,2.0,both
2,2,,2.0,right_only
3,2,,2.0,right_only


In [34]:
# indicator also accepts a string, which is used as the name of the
# indicator column.
df1.merge(
    df2,
    on="col1",
    how="outer",
    indicator="indicator_column",
)

Unnamed: 0,col1,col_left,col_right,indicator_column
0,0,a,,left_only
1,1,b,2.0,both
2,2,,2.0,right_only
3,2,,2.0,right_only


##### 合并数据类型


In [36]:
# Inspect the column dtypes produced by the outer merge with a named indicator.
(
    df1.merge(df2, on="col1", how="outer", indicator="indicator_column")
    .dtypes
)

col1                   int64
col_left              object
col_right            float64
indicator_column    category
dtype: object

In [37]:
# Draw ten labels at random from "foo"/"bar".
# A seeded generator is used so the draw is identical on every
# Restart & Run All instead of changing with each execution.
X = pd.Series(np.random.default_rng(seed=0).choice(["foo", "bar"], size=(10,)))
X

0    bar
1    bar
2    bar
3    foo
4    foo
5    foo
6    bar
7    bar
8    bar
9    foo
dtype: object

In [38]:
# Convert X to a categorical dtype with the explicit category order foo, bar.
foo_bar_dtype = CategoricalDtype(categories=["foo", "bar"])
X = X.astype(foo_bar_dtype)
X

0    bar
1    bar
2    bar
3    foo
4    foo
5    foo
6    bar
7    bar
8    bar
9    foo
dtype: category
Categories (2, object): ['foo', 'bar']

In [39]:
# Pair the X values with a label column Y drawn at random from three choices.
# A seeded generator is used so Y is identical on every Restart & Run All
# instead of changing with each execution.
left = pd.DataFrame(
    {
        "X": X,
        "Y": np.random.default_rng(seed=1).choice(
            ["one", "two", "three"], size=(10,)
        ),
    }
)
left

Unnamed: 0,X,Y
0,bar,two
1,bar,two
2,bar,three
3,foo,two
4,foo,two
5,foo,one
6,bar,two
7,bar,three
8,bar,one
9,foo,three


In [40]:
# Inspect the column dtypes of the left frame.
left.dtypes

X    category
Y      object
dtype: object

In [41]:
# Right-hand frame: a categorical "X" column (categories foo, bar) and an
# integer "Z" column.
right = pd.DataFrame(
    {
        "X": pd.Series(["foo", "bar"]).astype(CategoricalDtype(["foo", "bar"])),
        "Z": list(range(1, 3)),
    }
)
right

Unnamed: 0,X,Z
0,foo,1
1,bar,2


In [42]:
# Inspect the column dtypes of the right frame.
right.dtypes

X    category
Z       int64
dtype: object

In [43]:
# Outer join of the two frames on "X", the column they share.
# The key is named explicitly so the join cannot silently change if the
# frames ever gain another column with a shared name.
result = pd.merge(left, right, how="outer", on="X")
result

Unnamed: 0,X,Y,Z
0,bar,two,2
1,bar,two,2
2,bar,three,2
3,bar,two,2
4,bar,three,2
5,bar,one,2
6,foo,two,1
7,foo,two,1
8,foo,one,1
9,foo,three,1
