#### Merge: 通过键(列名或索引)来合并DataFrame

In [2]:
from pandas import DataFrame, Series
import pandas as pd
import numpy as np


# Sample tables used by the merge examples in this notebook.

# Left-hand table: five people, keyed by the 'name' column.
data_frame1 = DataFrame(dict(
    name=['John', 'Edward', 'Smith', 'Obama', 'Clinton'],
    ages1=[18, 16, 15, 14, 19],
))

# Shares the 'name' column with data_frame1, but only some of the names
# overlap ('Polly' appears only here; 'Smith' and 'Clinton' do not).
data_frame2 = DataFrame(dict(
    name=['John', 'Edward', 'Polly', 'Obama'],
    ages2=[19, 16, 15, 14],
))

# Has no column name in common with data_frame1: the names are stored
# under 'other_name' instead of 'name'.
data_frame3 = DataFrame(dict(
    other_name=['John', 'Edward', 'Smith', 'Obama', 'Bush'],
    other_ages=[18, 16, 15, 14, 20],
))

In [3]:
# Show the three sample tables, separated by a ten-dash rule.
print(data_frame1, data_frame2, data_frame3, sep='\n' + '-' * 10 + '\n')

   ages1     name
0     18     John
1     16   Edward
2     15    Smith
3     14    Obama
4     19  Clinton
----------
   ages2    name
0     19    John
1     16  Edward
2     15   Polly
3     14   Obama
----------
   other_ages other_name
0          18       John
1          16     Edward
2          15      Smith
3          14      Obama
4          20       Bush


In [5]:
# By default, merge joins two DataFrames using their identically named
# columns as keys, and performs an inner join (inner: keep only the rows
# whose key is present on both sides, i.e. the intersection).
# Here the key column is also named explicitly via on='name'.
# Question: what would the result be if data_frame2's column 'ages2' were
# renamed to 'ages1'?
pd.merge(
    left=data_frame1,
    right=data_frame2,
    on='name',
)

Unnamed: 0,ages1,name,ages2
0,18,John,19
1,16,Edward,16
2,14,Obama,14


In [6]:
# The join type can be switched to an outer join (outer), which keeps the
# rows from both sides (the union); missing data is represented as NaN.
pd.merge(
    left=data_frame1,
    right=data_frame2,
    how='outer',
)

Unnamed: 0,ages1,name,ages2
0,18.0,John,19.0
1,16.0,Edward,16.0
2,15.0,Smith,
3,14.0,Obama,14.0
4,19.0,Clinton,
5,,Polly,15.0


In [7]:
# When the two DataFrames share no column name, a plain merge fails with:
#   MergeError: No common columns to perform merge on
# In that case, state explicitly which column of the left DataFrame is
# matched against which column of the right DataFrame, using the
# left_on and right_on parameters.
pd.merge(
    left=data_frame1,
    right=data_frame3,
    left_on='name',
    right_on='other_name',
)

Unnamed: 0,ages1,name,other_ages,other_name
0,18,John,18,John
1,16,Edward,16,Edward
2,15,Smith,15,Smith
3,14,Obama,14,Obama


In [8]:
# An outer join can be used on top of left_on / right_on as well.
pd.merge(
    left=data_frame1,
    right=data_frame3,
    left_on='name',
    right_on='other_name',
    how='outer',
)

Unnamed: 0,ages1,name,other_ages,other_name
0,18.0,John,18.0,John
1,16.0,Edward,16.0,Edward
2,15.0,Smith,15.0,Smith
3,14.0,Obama,14.0,Obama
4,19.0,Clinton,,
5,,,20.0,Bush


In [9]:
# Besides inner and outer joins there are also left and right joins.
# A left join keeps every row of the left DataFrame; where the right
# DataFrame has no matching row, its columns are filled with NaN.
pd.merge(
    left=data_frame1,
    right=data_frame2,
    how='left',
)

Unnamed: 0,ages1,name,ages2
0,18,John,19.0
1,16,Edward,16.0
2,15,Smith,
3,14,Obama,14.0
4,19,Clinton,


In [10]:
# A right join keeps every row of the right DataFrame; where the left
# DataFrame has no matching row, its columns are filled with NaN.
pd.merge(
    left=data_frame1,
    right=data_frame2,
    how='right',
)

Unnamed: 0,ages1,name,ages2
0,18.0,John,19
1,16.0,Edward,16
2,14.0,Obama,14
3,,Polly,15


In [11]:
# Sometimes the join key of a DataFrame lives in its index.
# In that case pass left_index=True and right_index=True to indicate that
# the index is to be used as the join key.
# Both frames have a 'name' column, so suffixes tells the two apart in the
# result ('name_a' from the left frame, 'name_b' from the right one).
pd.merge(
    left=data_frame1,
    right=data_frame2,
    left_index=True,
    right_index=True,
    suffixes=['_a', '_b'],
)

Unnamed: 0,ages1,name_a,ages2,name_b
0,18,John,19,John
1,16,Edward,16,Edward
2,15,Smith,15,Polly
3,14,Obama,14,Obama
