In [1]:
import numpy as np
import pandas as pd

In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity='all'

# pandas 데이터 재구조화(reshaping)

- 피봇팅(pivoting)
- 스태킹(stacking)과 언스태킹(unstacking)
- 멜팅(melting)과 와이드투롱(wide_to_long)
- 교차표(crosstab)

## 2. 스태킹(stacking)과 언스태킹(unstacking)

: 피봇팅과 유사하지만 계층형 인덱스의 특정 수준도 회전이 가능함

- 스태킹(stacking) : column labels과 그 값을 row index와 값으로 회전시킴
- 언스태킹(unstacking) : row index와 그 값이 column labels과 값으로 회전시킴

![image.png](https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&fname=https%3A%2F%2Ft1.daumcdn.net%2Fcfile%2Ftistory%2F99BBDC48601405E621)

- 출처 : https://rfriend.tistory.com/276

### **1. 스태킹(stacking)**

![image.png](https://pandas.pydata.org/docs/_images/reshaping_stack.png)

- **DataFrame.stack(level=-1, dropna=True)**
    - level : int, str, list, default=-1
        - 스태킹을 적용하는 컬럼 레벨
        - 기본값은 마지막 레벨 : 스태킹 결과는 항상 인덱스의 마지막 레벨로 이동
    - dropna : bool, default True
        - 스태킹 결과 결측치 처리 여부, 기본값은 True로 결측치 제외

- https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.stack.html

#### 예제1. single level columns를 갖는 데이터

In [4]:
# Small frame with single-level columns: two animals (rows) by two measurements (columns).
df = pd.DataFrame(
    data=[[0, 1], [2, 3]],
    index=['cat', 'dog'],
    columns=['weight', 'height'],
)
df

Unnamed: 0,weight,height
cat,0,1
dog,2,3


- stack() : 컬럼이 인덱스 마지막 레벨로 변경 -> 시리즈 데이터로 변환

In [5]:
# Move the column labels into the innermost row-index level; with single-level columns the result is a Series.
df.stack()

cat  weight    0
     height    1
dog  weight    2
     height    3
dtype: int64

#### 예제2. multi-level columns을 갖는 데이터1

In [6]:
# Two-level columns: both share the top label 'weight' and differ only in the unit level.
multicol = pd.MultiIndex.from_tuples(
    [('weight', 'kg'), ('weight', 'pound')]
)
df = pd.DataFrame(
    data=[[0, 1], [2, 3]],
    index=['cat', 'dog'],
    columns=multicol,
)
df

Unnamed: 0_level_0,weight,weight
Unnamed: 0_level_1,kg,pound
cat,0,1
dog,2,3


- stack() : 컬럼의 마지막 레벨이 인덱스의 마지막 레벨로 이동

In [7]:
# Move the last column level (kg/pound) into the innermost row-index level; 'weight' stays as the only column.
df.stack()

Unnamed: 0,Unnamed: 1,weight
cat,kg,0
cat,pound,1
dog,kg,2
dog,pound,3


In [11]:
# Stack column level 0 ('weight') instead of the last level; kg/pound remain as columns.
result = df.stack(0)
result

Unnamed: 0,Unnamed: 1,kg,pound
cat,weight,0,1
dog,weight,2,3


In [12]:
# Swap the two row-index levels so the stacked 'weight' level becomes the outer level.
result.swaplevel(1,0)

Unnamed: 0,Unnamed: 1,kg,pound
weight,cat,0,1
weight,dog,2,3


#### 예제3. multi-level columns을 갖는 데이터2

In [13]:
# Two-level columns where each top-level label has its own unit: (weight, kg) and (height, m).
multicol = pd.MultiIndex.from_tuples(
    [('weight', 'kg'), ('height', 'm')]
)
df = pd.DataFrame(
    data=[[0, 1], [2, 3]],
    index=['cat', 'dog'],
    columns=multicol,
)
df

Unnamed: 0_level_0,weight,height
Unnamed: 0_level_1,kg,m
cat,0,1
dog,2,3


- stack(level=-1)

In [15]:
# All three calls are equivalent: level=-1 (the last column level) is the default.
df.stack(level=-1)
df.stack()
df.stack(-1)

Unnamed: 0,Unnamed: 1,weight,height
cat,kg,0.0,
cat,m,,1.0
dog,kg,2.0,
dog,m,,3.0


Unnamed: 0,Unnamed: 1,weight,height
cat,kg,0.0,
cat,m,,1.0
dog,kg,2.0,
dog,m,,3.0


Unnamed: 0,Unnamed: 1,weight,height
cat,kg,0.0,
cat,m,,1.0
dog,kg,2.0,
dog,m,,3.0


- stack(0) : 컬럼의 첫번째 레벨이 인덱스의 마지막 레벨로 이동

In [16]:
# Stack the first column level (weight/height); the unit level (kg/m) stays as columns,
# and label combinations absent from the original columns show up as NaN.
df.stack(0)

Unnamed: 0,Unnamed: 1,kg,m
cat,height,,1.0
cat,weight,0.0,
dog,height,,3.0
dog,weight,2.0,


- stack([0,1]) : 컬럼의 두 레벨이 인덱스의 마지막 두 레벨로 이동

In [18]:
# Stack both column levels at once; the result is a Series with a three-level row index.
df.stack([0,1])

cat  height  m     1.0
     weight  kg    0.0
dog  height  m     3.0
     weight  kg    2.0
dtype: float64

#### 예제4. multi-level columns을 갖는 데이터3 : 결측치를 포함하는 경우

In [20]:
# Same column layout as the previous example, but the (cat, weight/kg) cell is missing.
multicol = pd.MultiIndex.from_tuples(
    [('weight', 'kg'), ('height', 'm')]
)
df = pd.DataFrame(
    data=[[None, 1], [2, 3]],
    index=['cat', 'dog'],
    columns=multicol,
)
df

Unnamed: 0_level_0,weight,height
Unnamed: 0_level_1,kg,m
cat,,1
dog,2.0,3


- stack()

In [21]:
df.stack() # the (cat, kg) row is absent because every value in it is NaN

Unnamed: 0,Unnamed: 1,weight,height
cat,m,,1.0
dog,kg,2.0,
dog,m,,3.0


- stack(, dropna=True|False)

In [22]:
# dropna=True is the default: rows whose values are all NaN are left out of the result.
df.stack(dropna=True)

Unnamed: 0,Unnamed: 1,weight,height
cat,m,,1.0
dog,kg,2.0,
dog,m,,3.0


In [23]:
# dropna=False keeps the all-NaN (cat, kg) row in the result.
df.stack(dropna=False)

Unnamed: 0,Unnamed: 1,weight,height
cat,kg,,
cat,m,,1.0
dog,kg,2.0,
dog,m,,3.0


### **2. 언스태킹(unstacking)**

![image.png](https://pandas.pydata.org/docs/_images/reshaping_unstack.png)

- **DataFrame.unstack(level=-1, fill_value=None, sort=True)**
    - level : int, str, list, default=-1
        - 언스태킹을 적용하는 인덱스 레벨
        - 기본값은 마지막 레벨 : 언스태킹 결과는 항상 컬럼의 마지막 레벨로 이동
    - fill_value : int, str or dict
        - 언스태킹 결과 생기는 결측치(NaN)를 대체할 값, 기본값은 None으로 NaN 유지
    - sort : bool, default True
        - 결과 멀티인덱스 컬럼의 레벨 정렬 여부

- https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.unstack.html

#### 예제5. 시리즈 데이터

In [25]:
# Two-level row index: outer label (one/two) crossed with inner label (a/b).
index = pd.MultiIndex.from_tuples(
    [('one', 'a'), ('one', 'b'), ('two', 'a'), ('two', 'b')]
)
# Float bounds make np.arange produce float64 values (1.0 .. 4.0).
s = pd.Series(np.arange(1.0, 5.0), index=index)
s

one  a    1.0
     b    2.0
two  a    3.0
     b    4.0
dtype: float64

- unstack() : 마지막레벨로 unstacking

In [28]:
# All three calls are equivalent: the last index level (a/b) becomes the columns.
s.unstack()
s.unstack(level=-1)
s.unstack(-1)

Unnamed: 0,a,b
one,1.0,2.0
two,3.0,4.0


Unnamed: 0,a,b
one,1.0,2.0
two,3.0,4.0


Unnamed: 0,a,b
one,1.0,2.0
two,3.0,4.0


- unstack(level=0)

In [29]:
# Both calls are equivalent: the first index level (one/two) becomes the columns.
s.unstack(0)
s.unstack(level=0)

Unnamed: 0,one,two
a,1.0,3.0
b,2.0,4.0


Unnamed: 0,one,two
a,1.0,3.0
b,2.0,4.0


#### 예제6. 행이 MultiIndex를 갖는 데이터

In [32]:
# Two parallel label arrays: the outer ('first') and inner ('second') row-index levels.
# Named `arrays` so the builtin `list` is not shadowed, and displayed under that same
# name (the previous bare `tuples` was never defined and raised NameError on a fresh kernel).
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
          ['one', 'two'] * 4]
arrays
index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second'])
index

# 8x2 frame of standard-normal draws rounded to two decimals (unseeded, so values differ per run).
df = pd.DataFrame(np.round(np.random.randn(8, 2), 2),
                  index=index, columns=['A', 'B'])
df

[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
 ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.19,-0.18
bar,two,-0.27,1.76
baz,one,-0.39,1.82
baz,two,-0.78,1.73
foo,one,-0.88,-0.02
foo,two,-1.29,1.65
qux,one,0.25,0.3
qux,two,-0.48,0.52


- unstack() : index의 마지막레벨이 컬럼의 마지막 레벨로 이동

In [33]:
# Move the innermost row level ('second') into the columns, beneath A/B.
df.unstack()

Unnamed: 0_level_0,A,A,B,B
second,one,two,one,two
first,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,0.19,-0.27,-0.18,1.76
baz,-0.39,-0.78,1.82,1.73
foo,-0.88,-1.29,-0.02,1.65
qux,0.25,-0.48,0.3,0.52


- unstack(0) : index의 첫번째 레벨이 컬럼의 마지막 레벨로 이동

![image.png](https://pandas.pydata.org/docs/_images/reshaping_unstack_0.png)

In [34]:
# Three equivalent spellings (keyword, positional, level name) that move the 'first' level into the columns.
df.unstack(level=0)
df.unstack(0)
df.unstack('first')

Unnamed: 0_level_0,A,A,A,A,B,B,B,B
first,bar,baz,foo,qux,bar,baz,foo,qux
second,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
one,0.19,-0.39,-0.88,0.25,-0.18,1.82,-0.02,0.3
two,-0.27,-0.78,-1.29,-0.48,1.76,1.73,1.65,0.52


Unnamed: 0_level_0,A,A,A,A,B,B,B,B
first,bar,baz,foo,qux,bar,baz,foo,qux
second,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
one,0.19,-0.39,-0.88,0.25,-0.18,1.82,-0.02,0.3
two,-0.27,-0.78,-1.29,-0.48,1.76,1.73,1.65,0.52


Unnamed: 0_level_0,A,A,A,A,B,B,B,B
first,bar,baz,foo,qux,bar,baz,foo,qux
second,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
one,0.19,-0.39,-0.88,0.25,-0.18,1.82,-0.02,0.3
two,-0.27,-0.78,-1.29,-0.48,1.76,1.73,1.65,0.52


- unstack(1) : index의 두번째 레벨이 컬럼의 마지막 레벨로 이동

![image.png](https://pandas.pydata.org/docs/_images/reshaping_unstack_1.png)

In [37]:
# Four equivalent spellings that select the 'second' level, by position or by name.
df.unstack(1)
df.unstack(level=1)
df.unstack('second')
df.unstack(level = 'second')

Unnamed: 0_level_0,A,A,B,B
second,one,two,one,two
first,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,0.19,-0.27,-0.18,1.76
baz,-0.39,-0.78,1.82,1.73
foo,-0.88,-1.29,-0.02,1.65
qux,0.25,-0.48,0.3,0.52


Unnamed: 0_level_0,A,A,B,B
second,one,two,one,two
first,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,0.19,-0.27,-0.18,1.76
baz,-0.39,-0.78,1.82,1.73
foo,-0.88,-1.29,-0.02,1.65
qux,0.25,-0.48,0.3,0.52


Unnamed: 0_level_0,A,A,B,B
second,one,two,one,two
first,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,0.19,-0.27,-0.18,1.76
baz,-0.39,-0.78,1.82,1.73
foo,-0.88,-1.29,-0.02,1.65
qux,0.25,-0.48,0.3,0.52


Unnamed: 0_level_0,A,A,B,B
second,one,two,one,two
first,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,0.19,-0.27,-0.18,1.76
baz,-0.39,-0.78,1.82,1.73
foo,-0.88,-1.29,-0.02,1.65
qux,0.25,-0.48,0.3,0.52


#### 예제7. 행과 열이 모두 MultiIndex를 갖는 데이터

In [47]:
# Row index: every (first, second) pair from the Cartesian product of the two label sets.
index = pd.MultiIndex.from_product(
    [('bar', 'baz', 'foo', 'qux'), ('one', 'two')],
    names=['first', 'second'],
)
index
# Column index: explicit (exp, animal) pairs, kept in the order listed here.
columns = pd.MultiIndex.from_tuples(
    [('A', 'cat'), ('B', 'dog'), ('B', 'cat'), ('A', 'dog')],
    names=['exp', 'animal'],
)
columns
# 8x4 frame of standard-normal draws rounded to two decimals.
df = pd.DataFrame(
    np.round(np.random.randn(8, 4), 2),
    index=index,
    columns=columns,
)
df

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

MultiIndex([('A', 'cat'),
            ('B', 'dog'),
            ('B', 'cat'),
            ('A', 'dog')],
           names=['exp', 'animal'])

Unnamed: 0_level_0,exp,A,B,B,A
Unnamed: 0_level_1,animal,cat,dog,cat,dog
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
bar,one,0.37,-0.69,-0.93,-0.34
bar,two,-0.74,-0.23,0.58,-1.66
baz,one,-1.03,0.94,-1.38,-0.05
baz,two,0.1,0.96,-1.31,-0.64
foo,one,1.54,0.88,-0.89,0.95
foo,two,1.1,0.6,-0.41,-1.18
qux,one,-1.18,0.11,1.35,-0.07
qux,two,-1.03,0.6,-0.7,0.77


In [44]:
# Move the innermost row level ('second') to the innermost column level, below exp/animal.
df.unstack()

exp,A,A,B,B,B,B,A,A
animal,cat,cat,dog,dog,cat,cat,dog,dog
second,one,two,one,two,one,two,one,two
first,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
bar,-1.14,-0.2,0.03,-1.62,0.55,-1.04,-0.39,-1.82
baz,-0.75,-1.83,0.61,-1.15,1.18,0.48,-2.07,0.07
foo,0.46,-1.24,-0.58,-0.6,0.18,-1.47,0.33,-1.54
qux,1.06,-0.62,-0.55,-1.61,0.37,-0.43,-1.83,-1.17


In [45]:
# Keep rows at positions 0, 1, 4, 7 and columns at positions 1, 2, leaving a frame
# in which not every first/second combination is present.
df2 = df.iloc[[0,1,4,7],[1,2]]
df2

Unnamed: 0_level_0,exp,B,B
Unnamed: 0_level_1,animal,dog,cat
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2
bar,one,0.03,0.55
bar,two,-1.62,-1.04
foo,one,-0.58,0.18
qux,two,-1.61,-0.43


In [48]:
# (first, second) pairs missing from df2 appear as NaN after unstacking.
df2.unstack()

exp,B,B,B,B
animal,dog,dog,cat,cat
second,one,two,one,two
first,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
bar,0.03,-1.62,0.55,-1.04
foo,-0.58,,0.18,
qux,,-1.61,,-0.43


In [49]:
# Move the last column level ('animal') into the innermost row-index level.
df2.stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,exp,B
first,second,animal,Unnamed: 3_level_1
bar,one,cat,0.55
bar,one,dog,0.03
bar,two,cat,-1.04
bar,two,dog,-1.62
foo,one,cat,0.18
foo,one,dog,-0.58
qux,two,cat,-0.43
qux,two,dog,-1.61


- unstack( , **fill_value=**)

In [50]:
# Fill the cells created for missing (first, second) pairs with 0 instead of NaN.
df2.unstack(fill_value=0)

exp,B,B,B,B
animal,dog,dog,cat,cat
second,one,two,one,two
first,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
bar,0.03,-1.62,0.55,-1.04
foo,-0.58,0.0,0.18,0.0
qux,0.0,-1.61,0.0,-0.43


----