In [1]:
import pandas as pd
import numpy as np

In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Set the shell's ast_node_interactivity option to 'all' so that every expression
# statement in a cell is displayed, not only the last one.
InteractiveShell.ast_node_interactivity='all'

# pandas 데이터 재구조화(reshaping)

- 피봇팅(pivoting)
- 스태킹(stacking)과 언스태킹(unstacking)
- 멜팅(melting)과 와이드투롱(wide_to_long)
- 교차표(crosstab)

## 2. 스태킹(stacking)과 언스태킹(unstacking)

: 피벗팅과 유사하지만 계층형 인덱스의 특정 수준도 회전이 가능함

- 스태킹(stacking) : column labels과 그 값을 row index와 값으로 회전시킴
- 언스태킹(unstacking) : row index와 그 값을 column labels과 값으로 회전시킴

### **1. 스태킹(stacking)**

- **DataFrame.stack(level=-1, dropna=True)**
    - level : int, str, list, default=-1
        - 스태킹을 적용하는 컬럼 레벨
        - 기본값은 마지막 레벨 : 스태킹 결과는 항상 인덱스의 마지막 레벨로 이동
    - dropna : bool, default True
        - 스태킹 결과 결측치 처리 여부, 기본값은 True로 결측치 제외

- https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.stack.html

#### 예제1. single level columns를 갖는 데이터

In [13]:
# Ten rows of uniform [0, 1) samples under two flat (single-level) column labels.
data = np.random.rand(10, 2)
df = pd.DataFrame(data=data, columns=['num', 'num1'])

- stack() : 컬럼이 인덱스 마지막 레벨로 변경 -> 시리즈 데이터로 변환

In [14]:
# Show the original wide frame (10 rows, columns 'num' and 'num1') before stacking it.
df

Unnamed: 0,num,num1
0,0.751399,0.765222
1,0.472978,0.215119
2,0.704123,0.598393
3,0.418627,0.73981
4,0.101374,0.641987
5,0.763013,0.979001
6,0.203577,0.54434
7,0.52356,0.364777
8,0.588144,0.299717
9,0.387708,0.939811


In [15]:
# Stack the single column level into the row index; the result is a Series keyed by
# (row label, column label).
df.stack()

0  num     0.751399
   num1    0.765222
1  num     0.472978
   num1    0.215119
2  num     0.704123
   num1    0.598393
3  num     0.418627
   num1    0.739810
4  num     0.101374
   num1    0.641987
5  num     0.763013
   num1    0.979001
6  num     0.203577
   num1    0.544340
7  num     0.523560
   num1    0.364777
8  num     0.588144
   num1    0.299717
9  num     0.387708
   num1    0.939811
dtype: float64

#### 예제2. multi-level columns을 갖는 데이터1

In [22]:
# Two-level column labels: a single outer label 'num' crossed with inner labels 1 and 2.
columns = pd.MultiIndex.from_product(iterables=[['num'], [1, 2]])
data = np.random.rand(10, 2)
df1 = pd.DataFrame(data=data, columns=columns)
df1

Unnamed: 0_level_0,num,num
Unnamed: 0_level_1,1,2
0,0.541267,0.149207
1,0.324621,0.060807
2,0.702815,0.318356
3,0.37918,0.686021
4,0.557478,0.817309
5,0.090806,0.452994
6,0.537792,0.958245
7,0.796839,0.125816
8,0.541091,0.599304
9,0.101982,0.947992


- stack() : 컬럼의 마지막 레벨이 인덱스의 마지막 레벨로 이동

In [23]:
# Stack only the innermost column level (1, 2); the outer label 'num' stays as a column.
df1.stack()

Unnamed: 0,Unnamed: 1,num
0,1,0.541267
0,2,0.149207
1,1,0.324621
1,2,0.060807
2,1,0.702815
2,2,0.318356
3,1,0.37918
3,2,0.686021
4,1,0.557478
4,2,0.817309


#### 예제3. multi-level columns을 갖는 데이터2

In [24]:
# Cross two outer labels with three inner labels to get six two-level columns.
columns = pd.MultiIndex.from_product(iterables=[['num', 'num1'], [1, 2, 3]])
data = np.random.rand(10, 6)
df2 = pd.DataFrame(data=data, columns=columns)
df2

Unnamed: 0_level_0,num,num,num,num1,num1,num1
Unnamed: 0_level_1,1,2,3,1,2,3
0,0.675054,0.729111,0.96796,0.619696,0.285334,0.517051
1,0.078899,0.426505,0.190787,0.991513,0.530692,0.876437
2,0.89815,0.464059,0.272851,0.581235,0.88608,0.158019
3,0.829154,0.580743,0.6055,0.876682,0.187532,0.225861
4,0.615729,0.962956,0.910463,0.230371,0.847893,0.237778
5,0.182875,0.379068,0.756305,0.381416,0.31756,0.136227
6,0.468889,0.011383,0.213875,0.655688,0.93173,0.950715
7,0.480401,0.275467,0.425726,0.219176,0.758009,0.42976
8,0.759509,0.342057,0.039312,0.864689,0.346945,0.484414
9,0.001986,0.325419,0.471894,0.588183,0.930652,0.727128


- stack(level=-1) : 컬럼의 마지막 레벨이 인덱스의 마지막 레벨로 이동 (기본값 stack()과 동일)

In [25]:
# level=-1 names the innermost column level explicitly; 'num' and 'num1' remain as columns.
df2.stack(level=-1)

Unnamed: 0,Unnamed: 1,num,num1
0,1,0.675054,0.619696
0,2,0.729111,0.285334
0,3,0.96796,0.517051
1,1,0.078899,0.991513
1,2,0.426505,0.530692
1,3,0.190787,0.876437
2,1,0.89815,0.581235
2,2,0.464059,0.88608
2,3,0.272851,0.158019
3,1,0.829154,0.876682


- stack(0) : 컬럼의 첫번째 레벨이 인덱스의 마지막 레벨로 이동

In [26]:
# level=0 stacks the outer column level ('num', 'num1'); the inner labels 1, 2, 3 remain as columns.
df2.stack(level=0)

Unnamed: 0,Unnamed: 1,1,2,3
0,num,0.675054,0.729111,0.96796
0,num1,0.619696,0.285334,0.517051
1,num,0.078899,0.426505,0.190787
1,num1,0.991513,0.530692,0.876437
2,num,0.89815,0.464059,0.272851
2,num1,0.581235,0.88608,0.158019
3,num,0.829154,0.580743,0.6055
3,num1,0.876682,0.187532,0.225861
4,num,0.615729,0.962956,0.910463
4,num1,0.230371,0.847893,0.237778


- stack([0,1]) : 컬럼의 두 레벨이 인덱스의 마지막 두 레벨로 이동

In [27]:
# Stacking both column levels leaves no columns, so the result is a Series with a
# three-level index (row label, outer column label, inner column label).
df2.stack([0,1])

0  num   1    0.675054
         2    0.729111
         3    0.967960
   num1  1    0.619696
         2    0.285334
         3    0.517051
1  num   1    0.078899
         2    0.426505
         3    0.190787
   num1  1    0.991513
         2    0.530692
         3    0.876437
2  num   1    0.898150
         2    0.464059
         3    0.272851
   num1  1    0.581235
         2    0.886080
         3    0.158019
3  num   1    0.829154
         2    0.580743
         3    0.605500
   num1  1    0.876682
         2    0.187532
         3    0.225861
4  num   1    0.615729
         2    0.962956
         3    0.910463
   num1  1    0.230371
         2    0.847893
         3    0.237778
5  num   1    0.182875
         2    0.379068
         3    0.756305
   num1  1    0.381416
         2    0.317560
         3    0.136227
6  num   1    0.468889
         2    0.011383
         3    0.213875
   num1  1    0.655688
         2    0.931730
         3    0.950715
7  num   1    0.480401
         2 

### **2. 언스태킹(unstacking)**

- **DataFrame.unstack(level=-1, fill_value=None, sort=True)**
    - level : int, str, list, default=-1
        - 언스태킹을 적용하는 인덱스 레벨
        - 기본값은 마지막 레벨 : 언스태킹 결과는 항상 컬럼의 마지막 레벨로 이동
    - fill_value : int, str or dict
        - 언스태킹 결과 발생하는 결측치(NaN)를 대체할 값
    - sort : bool, default True
        - 결과 멀티인덱스 컬럼의 레벨을 정렬할지 여부

- https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.unstack.html

#### 예제5. 시리즈 데이터

In [40]:
# Collapse both column levels into the row index to obtain a Series with a three-level index.
s1 = df2.stack(level=[0, 1])
s1

0  num   1    0.675054
         2    0.729111
         3    0.967960
   num1  1    0.619696
         2    0.285334
         3    0.517051
1  num   1    0.078899
         2    0.426505
         3    0.190787
   num1  1    0.991513
         2    0.530692
         3    0.876437
2  num   1    0.898150
         2    0.464059
         3    0.272851
   num1  1    0.581235
         2    0.886080
         3    0.158019
3  num   1    0.829154
         2    0.580743
         3    0.605500
   num1  1    0.876682
         2    0.187532
         3    0.225861
4  num   1    0.615729
         2    0.962956
         3    0.910463
   num1  1    0.230371
         2    0.847893
         3    0.237778
5  num   1    0.182875
         2    0.379068
         3    0.756305
   num1  1    0.381416
         2    0.317560
         3    0.136227
6  num   1    0.468889
         2    0.011383
         3    0.213875
   num1  1    0.655688
         2    0.931730
         3    0.950715
7  num   1    0.480401
         2 

- unstack() : 인덱스의 마지막 레벨이 컬럼으로 이동

In [41]:
# Unstack the innermost index level (1, 2, 3) into columns.
s1.unstack()

Unnamed: 0,Unnamed: 1,1,2,3
0,num,0.675054,0.729111,0.96796
0,num1,0.619696,0.285334,0.517051
1,num,0.078899,0.426505,0.190787
1,num1,0.991513,0.530692,0.876437
2,num,0.89815,0.464059,0.272851
2,num1,0.581235,0.88608,0.158019
3,num,0.829154,0.580743,0.6055
3,num1,0.876682,0.187532,0.225861
4,num,0.615729,0.962956,0.910463
4,num1,0.230371,0.847893,0.237778


- unstack(level=0) : 인덱스의 첫번째 레벨이 컬럼으로 이동

In [42]:
# Unstack the outermost index level (the original row labels 0-9) into columns.
s1.unstack(level=0)

Unnamed: 0,Unnamed: 1,0,1,2,3,4,5,6,7,8,9
num,1,0.675054,0.078899,0.89815,0.829154,0.615729,0.182875,0.468889,0.480401,0.759509,0.001986
num,2,0.729111,0.426505,0.464059,0.580743,0.962956,0.379068,0.011383,0.275467,0.342057,0.325419
num,3,0.96796,0.190787,0.272851,0.6055,0.910463,0.756305,0.213875,0.425726,0.039312,0.471894
num1,1,0.619696,0.991513,0.581235,0.876682,0.230371,0.381416,0.655688,0.219176,0.864689,0.588183
num1,2,0.285334,0.530692,0.88608,0.187532,0.847893,0.31756,0.93173,0.758009,0.346945,0.930652
num1,3,0.517051,0.876437,0.158019,0.225861,0.237778,0.136227,0.950715,0.42976,0.484414,0.727128


#### 예제7. 행과 열이 모두 MultiIndex를 갖는 데이터

In [45]:
# Rows: ('a' | 'b') x ('A' | 'B'); columns: ('num' | 'num1') x (1 | 2 | 3).
# Both axes carry a two-level MultiIndex.
index = pd.MultiIndex.from_product([['a', 'b'], ['A', 'B']])
columns = pd.MultiIndex.from_product([['num', 'num1'], [1, 2, 3]])
data = np.random.rand(4, 6)
dfm = pd.DataFrame(data=data, index=index, columns=columns)

In [48]:
# Show the frame whose rows and columns are both two-level MultiIndexes.
dfm

Unnamed: 0_level_0,Unnamed: 1_level_0,num,num,num,num1,num1,num1
Unnamed: 0_level_1,Unnamed: 1_level_1,1,2,3,1,2,3
a,A,0.025318,0.323488,0.102514,0.210796,0.522875,0.586557
a,B,0.754757,0.260093,0.281335,0.162424,0.937954,0.387324
b,A,0.818093,0.904796,0.521263,0.981281,0.206174,0.26384
b,B,0.919475,0.230917,0.826829,0.623699,0.94342,0.654448


In [51]:
# Unstack the inner row level ('A', 'B') into a third column level. fill_value=0 is the
# replacement for missing entries created by unstacking; every (row, column) combination
# exists here, so the output shown contains no filled values.
dfm.unstack(fill_value=0)

Unnamed: 0_level_0,num,num,num,num,num,num,num1,num1,num1,num1,num1,num1
Unnamed: 0_level_1,1,1,2,2,3,3,1,1,2,2,3,3
Unnamed: 0_level_2,A,B,A,B,A,B,A,B,A,B,A,B
a,0.025318,0.754757,0.323488,0.260093,0.102514,0.281335,0.210796,0.162424,0.522875,0.937954,0.586557,0.387324
b,0.818093,0.919475,0.904796,0.230917,0.521263,0.826829,0.981281,0.623699,0.206174,0.94342,0.26384,0.654448


In [53]:
# Unstack twice: first the inner row level, then the remaining one. With no row levels
# left, the result is a Series indexed by the two column levels followed by the moved
# row levels.
dfm.unstack().unstack()

num   1  A  a    0.025318
            b    0.818093
         B  a    0.754757
            b    0.919475
      2  A  a    0.323488
            b    0.904796
         B  a    0.260093
            b    0.230917
      3  A  a    0.102514
            b    0.521263
         B  a    0.281335
            b    0.826829
num1  1  A  a    0.210796
            b    0.981281
         B  a    0.162424
            b    0.623699
      2  A  a    0.522875
            b    0.206174
         B  a    0.937954
            b    0.943420
      3  A  a    0.586557
            b    0.263840
         B  a    0.387324
            b    0.654448
dtype: float64

----