# 데이터프레임 합성

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### merge

In [2]:
df1 = pd.DataFrame({
    '고객번호': [1001, 1002, 1003, 1004, 1005, 1006, 1007],
    '이름': ['둘리', '도우너', '또치', '길동', '희동', '마이콜', '영희']
}, columns=['고객번호', '이름'])
df1

Unnamed: 0,고객번호,이름
0,1001,둘리
1,1002,도우너
2,1003,또치
3,1004,길동
4,1005,희동
5,1006,마이콜
6,1007,영희


In [3]:
df2 = pd.DataFrame({
    '고객번호': [1001, 1001, 1005, 1006, 1008, 1001],
    '금액': [10000, 20000, 15000, 5000, 100000, 30000]
}, columns=['고객번호', '금액'])
df2

Unnamed: 0,고객번호,금액
0,1001,10000
1,1001,20000
2,1005,15000
3,1006,5000
4,1008,100000
5,1001,30000


In [4]:
# Inner join
pd.merge(df1, df2, how='inner', on='고객번호')  # how = 'inner'

Unnamed: 0,고객번호,이름,금액
0,1001,둘리,10000
1,1001,둘리,20000
2,1001,둘리,30000
3,1005,희동,15000
4,1006,마이콜,5000


In [11]:
# Left outer join
pd.merge(df1, df2, on='고객번호', how='left')

Unnamed: 0,고객번호,이름,금액
0,1001,둘리,10000.0
1,1001,둘리,20000.0
2,1001,둘리,30000.0
3,1002,도우너,
4,1003,또치,
5,1004,길동,
6,1005,희동,15000.0
7,1006,마이콜,5000.0
8,1007,영희,


In [12]:
# Right outer join
pd.merge(df1, df2, on='고객번호', how='right')

Unnamed: 0,고객번호,이름,금액
0,1001,둘리,10000
1,1001,둘리,20000
2,1001,둘리,30000
3,1005,희동,15000
4,1006,마이콜,5000
5,1008,,100000


In [13]:
# Full outer join
pd.merge(df1, df2, on='고객번호', how='outer')

Unnamed: 0,고객번호,이름,금액
0,1001,둘리,10000.0
1,1001,둘리,20000.0
2,1001,둘리,30000.0
3,1002,도우너,
4,1003,또치,
5,1004,길동,
6,1005,희동,15000.0
7,1006,마이콜,5000.0
8,1007,영희,
9,1008,,100000.0


In [17]:
df1 = pd.DataFrame({
    '이름': ['영희', '철수', '철수'],
    '성적': [1, 2, 3]})
df1

Unnamed: 0,이름,성적
0,영희,1
1,철수,2
2,철수,3


In [18]:
df2 = pd.DataFrame({
    '성명': ['영희', '영희', '철수'],
    '성적2': [4, 5, 6]})
df2

Unnamed: 0,성명,성적2
0,영희,4
1,영희,5
2,철수,6


In [19]:
pd.merge(df1, df2, left_on='이름', right_on='성명')

Unnamed: 0,이름,성적,성명,성적2
0,영희,1,영희,4
1,영희,1,영희,5
2,철수,2,철수,6
3,철수,3,철수,6


## join 메서드
- 인덱스를 같게 해야 한다

In [26]:
df1 = pd.DataFrame({
    '고객번호': [1001, 1002, 1003, 1004, 1005, 1006, 1007],
    '이름': ['둘리', '도우너', '또치', '길동', '희동', '마이콜', '영희']
}, columns=['고객번호', '이름'])
df1.set_index('고객번호', inplace=True)
df1

Unnamed: 0_level_0,이름
고객번호,Unnamed: 1_level_1
1001,둘리
1002,도우너
1003,또치
1004,길동
1005,희동
1006,마이콜
1007,영희


In [27]:
df2 = pd.DataFrame({
    '고객번호': [1001, 1001, 1005, 1006, 1008, 1001],
    '금액': [10000, 20000, 15000, 5000, 100000, 30000]
}, columns=['고객번호', '금액'])
df2.set_index('고객번호', inplace=True)
df2

Unnamed: 0_level_0,금액
고객번호,Unnamed: 1_level_1
1001,10000
1001,20000
1005,15000
1006,5000
1008,100000
1001,30000


In [28]:
df1.join(df2, how='inner')

Unnamed: 0_level_0,이름,금액
고객번호,Unnamed: 1_level_1,Unnamed: 2_level_1
1001,둘리,10000
1001,둘리,20000
1001,둘리,30000
1005,희동,15000
1006,마이콜,5000


In [29]:
df1.join(df2)

Unnamed: 0_level_0,이름,금액
고객번호,Unnamed: 1_level_1,Unnamed: 2_level_1
1001,둘리,10000.0
1001,둘리,20000.0
1001,둘리,30000.0
1002,도우너,
1003,또치,
1004,길동,
1005,희동,15000.0
1006,마이콜,5000.0
1007,영희,


### concat 함수

In [30]:
s1 = pd.Series([0, 1], index=list('ab'))
s1

a    0
b    1
dtype: int64

In [31]:
s2 = pd.Series([2,3,4], index=list('ABC'))
s2

A    2
B    3
C    4
dtype: int64

In [32]:
pd.concat((s1,s2))

a    0
b    1
A    2
B    3
C    4
dtype: int64

In [35]:
pd.concat((s1, s2), axis=1)

Unnamed: 0,0,1
a,0.0,
b,1.0,
A,,2.0
B,,3.0
C,,4.0


In [36]:
df1 = pd.DataFrame(
    np.arange(6).reshape(3, 2),
    index=['a', 'b', 'c'],
    columns=['데이터1', '데이터2'])
df1

Unnamed: 0,데이터1,데이터2
a,0,1
b,2,3
c,4,5


In [37]:
df2 = pd.DataFrame(
    5 + np.arange(4).reshape(2, 2),
    index=['a', 'c'],
    columns=['데이터3', '데이터4'])
df2

Unnamed: 0,데이터3,데이터4
a,5,6
c,7,8


In [39]:
pd.concat((df1, df2), axis=1)  # join의 default는 outer join이다

Unnamed: 0,데이터1,데이터2,데이터3,데이터4
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [40]:
pd.concat((df1, df2), axis=0)

Unnamed: 0,데이터1,데이터2,데이터3,데이터4
a,0.0,1.0,,
b,2.0,3.0,,
c,4.0,5.0,,
a,,,5.0,6.0
c,,,7.0,8.0


어느 회사의 전반기(1월 ~ 6월) 실적을 나타내는 데이터프레임과 후반기(7월 ~ 12월) 실적을 나타내는 데이터프레임을 만든 뒤 합친다. 실적 정보는 “매출”, “비용”, “이익” 으로 이루어진다. (이익 = 매출 - 비용).

또한 1년간의 총 실적을 마지막 행으로 덧붙인다.

In [56]:
df1 = pd.DataFrame(data=np.random.randint(100, 999, size=(6,3)), columns=['매출','비용','이익'], index=['1월','2월','3월','4월','5월','6월'])
df1


Unnamed: 0,매출,비용,이익
1월,836,948,663
2월,523,766,988
3월,383,357,559
4월,138,681,698
5월,707,124,848
6월,233,415,824


In [57]:
df2 = pd.DataFrame(data=np.random.randint(100, 999, size=(6,3)), columns=['매출','비용','이익'], index=['7월','8월','9월','10월','11월','12월'])
df2

Unnamed: 0,매출,비용,이익
7월,602,561,386
8월,159,560,815
9월,236,751,641
10월,385,870,723
11월,134,648,218
12월,566,288,618


In [58]:
all_year = pd.concat((df1, df2), axis=0)
tdf = pd.DataFrame(data=all_year.sum()).T
tdf.index = ['합계']
all_year = pd.concat([all_year, tdf])
all_year

Unnamed: 0,매출,비용,이익
1월,836,948,663
2월,523,766,988
3월,383,357,559
4월,138,681,698
5월,707,124,848
6월,233,415,824
7월,602,561,386
8월,159,560,815
9월,236,751,641
10월,385,870,723


In [59]:
all_year.iloc[:-1, :].sum()

매출    4902
비용    6969
이익    7981
dtype: int64