In [1]:
import pandas as pd
import numpy as np

# DataFrame 합치기
* **merge(join)** : 두 개의 DataFrame을 공통된 컬럼(들)을 기준으로 합치는 것
* **concat** : DataFrame 축(axis)을 따라서 합치는 것.

In [2]:
# Left-hand table for the merge examples.
# 'kind' is the join key; 'val_1' is simply 0..5 so each row is easy to trace.
df1 = pd.DataFrame(
    data={
        'kind': ['a', 'a', 'b', 'b', 'a', 'c'],
        'val_1': range(6),
    }
)
df1

Unnamed: 0,kind,val_1
0,a,0
1,a,1
2,b,2
3,b,3
4,a,4
5,c,5


In [4]:
# Right-hand table for the merge examples.
# It shares the 'kind' key column with df1 but holds one row per kind.
df2 = pd.DataFrame(
    data={
        'kind': ['a', 'c', 'd'],
        'val_2': [10, 20, 30],
    }
)
df2

Unnamed: 0,kind,val_2
0,a,10
1,c,20
2,d,30


## pd.merge()
* **Oracle, R의 join과 같은 기능**
* **`pd.merge(left=df1, right=df2, how='inner', on='kind')`**
* how='inner' : merge의 기본값은 inner join이므로, 생략 가능함
* on : join 시 기준으로 사용되는 컬럼(들). **join 조건 컬럼 이름이 두 DF에서 같은 경우, 생략 가능** (생략하면 두 DF에 공통으로 존재하는 모든 컬럼이 기준 컬럼으로 사용됨)

### inner join

In [5]:
# Neither `how` nor `on` is given: this is an inner join, and the join key
# is picked automatically as the column the two frames share ('kind').
pd.merge(
    left=df1,
    right=df2,
)

Unnamed: 0,kind,val_1,val_2
0,a,0,10
1,a,1,10
2,a,4,10
3,c,5,20


In [6]:
# Alternatively, the same merge can be done by calling the .merge() method on df1.
# Fully spelled out, the call would be: df1.merge(df2, how='inner', on='kind')
df1.merge(right=df2)

Unnamed: 0,kind,val_1,val_2
0,a,0,10
1,a,1,10
2,a,4,10
3,c,5,20


### left join

In [7]:
# Left join: every row of df1 is kept. Rows whose kind has no match in df2
# ('b') get a missing value in val_2.
# Glossary for the missing-value markers:
#   NaN : Not a Number
#   NA  : Not Available
pd.merge(
    left=df1,
    right=df2,
    how='left',
)

Unnamed: 0,kind,val_1,val_2
0,a,0,10.0
1,a,1,10.0
2,b,2,
3,b,3,
4,a,4,10.0
5,c,5,20.0


### right join

In [8]:
# Right join: every row of df2 is kept. Kind 'd' has no match in df1,
# so its val_1 is missing.
pd.merge(
    left=df1,
    right=df2,
    how='right',
)

Unnamed: 0,kind,val_1,val_2
0,a,0.0,10
1,a,1.0,10
2,a,4.0,10
3,c,5.0,20
4,d,,30


### outer join(full join)

In [9]:
# Full outer join: rows from both frames are kept. Kinds found on only one
# side ('b' in df1, 'd' in df2) get a missing value for the other side's column.
pd.merge(
    left=df1,
    right=df2,
    how='outer',
)

Unnamed: 0,kind,val_1,val_2
0,a,0.0,10.0
1,a,1.0,10.0
2,a,4.0,10.0
3,b,2.0,
4,b,3.0,
5,c,5.0,20.0
6,d,,30.0


### merge(join) 기준 변수의 이름이 서로 다른 경우
* left_on, right_on 인수를 사용한다.

In [10]:
# Employee table; 'deptno' refers to a department and is the join key on this side.
emp = pd.DataFrame(
    data={
        'empno': [100, 101],
        'ename': ['Allen', 'Scott'],
        'deptno': [20, 10],
    }
)
emp

Unnamed: 0,empno,ename,deptno
0,100,Allen,20
1,101,Scott,10


In [11]:
# Department table; its key column is named 'dno' rather than 'deptno'.
dept = pd.DataFrame(
    data={
        'dno': [10, 20],
        'dname': ['IT', 'HR'],
    }
)
dept

Unnamed: 0,dno,dname
0,10,IT
1,20,HR


In [12]:
# The key columns are named differently on each side (emp.deptno vs dept.dno),
# so each one is named explicitly. Both key columns appear in the result.
pd.merge(
    left=emp,
    right=dept,
    left_on='deptno',
    right_on='dno',
)

Unnamed: 0,empno,ename,deptno,dno,dname
0,100,Allen,20,20,HR
1,101,Scott,10,10,IT


### row index를 사용한 merge

In [13]:
# Rebind df1 to a frame whose row index holds the labels (a, b, c repeated twice),
# so the labels are not unique.
df1 = pd.DataFrame(
    data={'data1': range(6)},
    index=['a', 'b', 'c'] * 2,
)
df1

Unnamed: 0,data1
a,0
b,1
c,2
a,3
b,4
c,5


In [14]:
# (rows, columns) of df1; the row index labels are not counted as a column.
df1.shape

(6, 1)

In [15]:
# Select by index label: 'a' occurs twice in the index, so two rows come back.
df1.loc['a', :]

Unnamed: 0,data1
a,0
a,3


In [16]:
# Rebind df2 to a frame keyed by a unique row index (a, b, c, d).
df2 = pd.DataFrame(
    data={'data2': [11, 22, 33, 44]},
    index=['a', 'b', 'c', 'd'],
)
df2

Unnamed: 0,data2
a,11
b,22
c,33
d,44


In [17]:
# inner join 시
pd.merge(left = df1, right = df2, how = 'inner',
        left_index = True, right_index = True)

Unnamed: 0,data1,data2
a,0,11
a,3,11
b,1,22
b,4,22
c,2,33
c,5,33


In [18]:
# left join 시
pd.merge(left = df1, right = df2, how = 'left',
        left_index = True, right_index = True)

Unnamed: 0,data1,data2
a,0,11
a,3,11
b,1,22
b,4,22
c,2,33
c,5,33


In [19]:
# full outer join 시
pd.merge(left = df1, right = df2, how = 'outer',
        left_index = True, right_index = True)

Unnamed: 0,data1,data2
a,0.0,11
a,3.0,11
b,1.0,22
b,4.0,22
c,2.0,33
c,5.0,33
d,,44


In [20]:
# Show df1 again for reference; its join key is still the row index.
df1

Unnamed: 0,data1
a,0
b,1
c,2
a,3
b,4
c,5


In [21]:
# Rebind df2 once more: this time the join key is an ordinary column ('key')
# and the row index is the default 0..3.
df2 = pd.DataFrame(
    data={
        'key': ['a', 'b', 'c', 'd'],
        'data2': range(10, 50, 10),
    }
)
df2

Unnamed: 0,key,data2
0,a,10
1,b,20
2,c,30
3,d,40


In [22]:
# Mixed keys: match the row index of df1 against the 'key' column of df2.
pd.merge(
    left=df1,
    right=df2,
    how='inner',
    left_index=True,
    right_on='key',
)

Unnamed: 0,data1,key,data2
0,0,a,10
0,3,a,10
1,1,b,20
1,4,b,20
2,2,c,30
2,5,c,30
