#### 데이터프레임 합성

In [1]:
import numpy as np
import pandas as pd

##### merge 함수

In [2]:
# Customer table: seven rows, one per customer number ('고객번호').
df1 = pd.DataFrame(
    data={
        '고객번호': [1001, 1002, 1003, 1004, 1005, 1006, 1007],
        '이름': ['둘리', '도우너', '또치', '길동', '희동', '마이클', '영희'],
    }
)
df1

Unnamed: 0,고객번호,이름
0,1001,둘리
1,1002,도우너
2,1003,또치
3,1004,길동
4,1005,희동
5,1006,마이클
6,1007,영희


In [3]:
# Purchase table: customer 1001 appears three times, and 1008 is a
# customer number that is not present in the customer table df1.
df2 = pd.DataFrame(
    data={
        '고객번호': [1001, 1001, 1005, 1006, 1008, 1001],
        '금액': [10000, 20000, 15000, 5000, 10000, 30000],
    }
)
df2

Unnamed: 0,고객번호,금액
0,1001,10000
1,1001,20000
2,1005,15000
3,1006,5000
4,1008,10000
5,1001,30000


In [4]:
# Inner join: keep only the customer numbers found in both tables.
# The key is written out explicitly; '고객번호' is the one column that
# df1 and df2 have in common.
df1.merge(df2, how='inner', on='고객번호')

Unnamed: 0,고객번호,이름,금액
0,1001,둘리,10000
1,1001,둘리,20000
2,1001,둘리,30000
3,1005,희동,15000
4,1006,마이클,5000


In [5]:
# Left outer join: every row of df1 is kept; customers with no matching
# purchase in df2 get NaN in '금액'.
df1.merge(df2, how='left', on='고객번호')

Unnamed: 0,고객번호,이름,금액
0,1001,둘리,10000.0
1,1001,둘리,20000.0
2,1001,둘리,30000.0
3,1002,도우너,
4,1003,또치,
5,1004,길동,
6,1005,희동,15000.0
7,1006,마이클,5000.0
8,1007,영희,


In [6]:
# Right outer join: every row of df2 is kept; a purchase whose customer
# number is missing from df1 (1008) gets NaN in '이름'.
df1.merge(df2, how='right', on='고객번호')

Unnamed: 0,고객번호,이름,금액
0,1001,둘리,10000
1,1001,둘리,20000
2,1005,희동,15000
3,1006,마이클,5000
4,1008,,10000
5,1001,둘리,30000


In [7]:
# Full outer join: rows from both tables are kept, with NaN filled in
# wherever one side has no match for the customer number.
df1.merge(df2, how='outer', on='고객번호')

Unnamed: 0,고객번호,이름,금액
0,1001,둘리,10000.0
1,1001,둘리,20000.0
2,1001,둘리,30000.0
3,1002,도우너,
4,1003,또치,
5,1004,길동,
6,1005,희동,15000.0
7,1006,마이클,5000.0
8,1007,영희,
9,1008,,10000.0


##### 동일한 컬럼명이 여러 개인 경우

In [8]:
# Left table for the duplicate-column example. Besides the key '고객명'
# it has a '데이터' column, whose values here are stored as strings.
df1 = pd.DataFrame(
    data={
        '고객명': ['춘향', '춘향', '몽룡'],
        '날짜': ['2018-01-01', '2018-01-02', '2018-01-01'],
        '데이터': ['20000', '30000', '100000'],
    }
)
df1

Unnamed: 0,고객명,날짜,데이터
0,춘향,2018-01-01,20000
1,춘향,2018-01-02,30000
2,몽룡,2018-01-01,100000


In [9]:
# Right table for the duplicate-column example: it also has a '데이터'
# column, so the two frames share two column names.
df2 = pd.DataFrame(
    data={
        '고객명': ['춘향', '몽룡'],
        '데이터': ['여자', '남자'],
    }
)
df2

Unnamed: 0,고객명,데이터
0,춘향,여자
1,몽룡,남자


In [10]:
# Join on '고객명' only. The other shared column, '데이터', is kept from
# both sides and told apart by suffix; ('_x', '_y') is the pandas
# default, written out here to show where the output names come from.
df1.merge(df2, on='고객명', suffixes=('_x', '_y'))

Unnamed: 0,고객명,날짜,데이터_x,데이터_y
0,춘향,2018-01-01,20000,여자
1,춘향,2018-01-02,30000,여자
2,몽룡,2018-01-01,100000,남자


##### 동일한 컬럼명이 없는 경우

In [11]:
# Left table for the "no shared column name" example; the key is '이름'.
df1 = pd.DataFrame(
    data={
        '이름': ['영희', '철수', '철수'],
        '성적': [1, 2, 3],
    }
)
df1

Unnamed: 0,이름,성적
0,영희,1
1,철수,2
2,철수,3


In [12]:
# Right table for the "no shared column name" example; the key is '성명'.
df2 = pd.DataFrame(
    data={
        '성명': ['영희', '영희', '철수'],
        '성적2': [4, 5, 6],
    }
)
df2

Unnamed: 0,성명,성적2
0,영희,4
1,영희,5
2,철수,6


In [13]:
# The key columns have different names on each side, so they are named
# separately; both key columns ('이름' and '성명') appear in the result.
df1.merge(df2, how='inner', left_on='이름', right_on='성명')

Unnamed: 0,이름,성적,성명,성적2
0,영희,1,영희,4
1,영희,1,영희,5
2,철수,2,철수,6
3,철수,3,철수,6


##### join 메서드

In [14]:
# Customer table for the join() examples, indexed by customer number
# because DataFrame.join matches rows on the index by default.
df1 = pd.DataFrame(
    {
        '고객번호': [1001, 1002, 1003, 1004, 1005, 1006, 1007],
        '이름': ['둘리', '도우너', '또치', '길동', '희동', '마이콜', '영희'],
    }
).set_index('고객번호')
df1

Unnamed: 0_level_0,이름
고객번호,Unnamed: 1_level_1
1001,둘리
1002,도우너
1003,또치
1004,길동
1005,희동
1006,마이콜
1007,영희


In [15]:
# Purchase table for the join() examples, also indexed by customer
# number; the index contains 1001 three times.
df2 = pd.DataFrame(
    {
        '고객번호': [1001, 1001, 1005, 1006, 1008, 1001],
        '금액': [10000, 20000, 15000, 5000, 100000, 30000],
    }
).set_index('고객번호')
df2

Unnamed: 0_level_0,금액
고객번호,Unnamed: 1_level_1
1001,10000
1001,20000
1005,15000
1006,5000
1008,100000
1001,30000


In [16]:
# Index-on-index inner join: only customer numbers present in both
# indexes are kept; df1's column comes first.
df1.join(other=df2, how='inner')

Unnamed: 0_level_0,이름,금액
고객번호,Unnamed: 1_level_1,Unnamed: 2_level_1
1001,둘리,10000
1001,둘리,20000
1001,둘리,30000
1005,희동,15000
1006,마이콜,5000


In [18]:
# Same inner join with the roles swapped: the caller's column ('금액')
# now comes first in the result.
df2.join(other=df1, how='inner')

Unnamed: 0_level_0,금액,이름
고객번호,Unnamed: 1_level_1,Unnamed: 2_level_1
1001,10000,둘리
1001,20000,둘리
1001,30000,둘리
1005,15000,희동
1006,5000,마이콜


In [19]:
# Left outer join: how='left' is join()'s default, spelled out here.
# Every df1 row is kept; customers without a purchase get NaN in '금액'.
df1.join(other=df2, how='left')

Unnamed: 0_level_0,이름,금액
고객번호,Unnamed: 1_level_1,Unnamed: 2_level_1
1001,둘리,10000.0
1001,둘리,20000.0
1001,둘리,30000.0
1002,도우너,
1003,또치,
1004,길동,
1005,희동,15000.0
1006,마이콜,5000.0
1007,영희,


##### concat 함수

In [20]:
# Two-element Series labelled A and B.
s1 = pd.Series({'A': 0, 'B': 1})
s1

A    0
B    1
dtype: int64

In [21]:
# Three-element Series; labels A and B overlap with s1.
s2 = pd.Series({'A': 2, 'B': 3, 'C': 4})
s2

A    2
B    3
C    4
dtype: int64

In [23]:
# Stack the two Series end to end along the row axis (axis=0 / 'index',
# which is the default). Repeated labels A and B are kept as they are.
pd.concat([s1, s2], axis='index')

A    0
B    1
A    2
B    3
C    4
dtype: int64

In [24]:
# 3x2 frame holding 0..5, with rows labelled a, b, c.
df1 = pd.DataFrame(
    data=np.arange(6).reshape(3, 2),
    index=['a', 'b', 'c'],
    columns=['데이터1', '데이터2'],
)
df1

Unnamed: 0,데이터1,데이터2
a,0,1
b,2,3
c,4,5


In [25]:
# 2x2 frame holding 5..8, with rows labelled a and c only (no row b).
df2 = pd.DataFrame(
    data=np.arange(5, 9).reshape(2, 2),
    index=['a', 'c'],
    columns=['데이터3', '데이터4'],
)
df2

Unnamed: 0,데이터3,데이터4
a,5,6
c,7,8


In [26]:
# Place the frames side by side, aligning rows on the index labels.
# Row b exists only in df1, so its df2 columns are filled with NaN.
pd.concat(objs=[df1, df2], axis='columns')

Unnamed: 0,데이터1,데이터2,데이터3,데이터4
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0
