In [1]:
import numpy as np
import pandas as pd

# DataFrame 모양(shape) 변경 

* wide -> long (row)
* long -> wide (column)

## stack/unstack

In [2]:
# 2x3 frame of the integers 1..6; rows are labelled A/B, columns a/b/c.
df = pd.DataFrame(
    np.arange(1, 7).reshape(2, -1),
    index=list('AB'),
    columns=list('abc'),
)
df

Unnamed: 0,a,b,c
A,1,2,3
B,4,5,6


In [3]:
# stack(): move the column labels (a, b, c) into an inner row-index level (wide -> long).
df_stacked = df.stack()
df_stacked

A  a    1
   b    2
   c    3
B  a    4
   b    5
   c    6
dtype: int64

In [4]:
# unstack() with no argument pivots the innermost index level (a, b, c) back into columns.
df_unstacked = df_stacked.unstack()
df_unstacked

Unnamed: 0,a,b,c
A,1,2,3
B,4,5,6


In [5]:
# unstack(level=0) pivots the outer index level (A, B) into columns instead.
df_unstacked2 = df_stacked.unstack(level=0)
df_unstacked2

Unnamed: 0,A,B
a,1,4
b,2,5
c,3,6


In [6]:
# 2x6 frame of 1..12 whose columns form a two-level MultiIndex:
# outer level A/B, inner level a/b/c.
df = pd.DataFrame(
    np.arange(1, 13).reshape(2, -1),
    columns=pd.MultiIndex.from_product([['A', 'B'], ['a', 'b', 'c']]),
)
df

Unnamed: 0_level_0,A,A,A,B,B,B
Unnamed: 0_level_1,a,b,c,a,b,c
0,1,2,3,4,5,6
1,7,8,9,10,11,12


In [7]:
# stack() on two-level columns moves the inner level (a, b, c) into the row index.
df.stack()

Unnamed: 0,Unnamed: 1,A,B
0,a,1,4
0,b,2,5
0,c,3,6
1,a,7,10
1,b,8,11
1,c,9,12


In [8]:
# level=0 stacks the outer column level (A, B) into the row index instead.
df.stack(level=0)

Unnamed: 0,Unnamed: 1,a,b,c
0,A,1,2,3
0,B,4,5,6
1,A,7,8,9
1,B,10,11,12


## pivot/melt

In [9]:
# Long-format sample: keys A (one/two) and B (a/b/c) with two value columns C and D.
df = pd.DataFrame({
    'A': ['one', 'one', 'one', 'two', 'two', 'two'],
    'B': list('abc') * 2,
    'C': np.arange(1, 7),
    'D': 2 * np.arange(1, 7),
})
df

Unnamed: 0,A,B,C,D
0,one,a,1,2
1,one,b,2,4
2,one,c,3,6
3,two,a,4,8
4,two,b,5,10
5,two,c,6,12


`pivot()` 메서드의 parameter:
* `index`: 원본 데이터프레임을 pivoting을 할 때, row index로 사용할 컬럼(들)의 이름(레이블)
* `columns`: 원본 데이터프레임을 pivoting을 할 때, column 이름(레이블)으로 사용할 컬럼(들)의 이름(레이블)
* `values`: 원본 데이터프레임을 pivoting을 할 때, 각 셀에 저장될 값으로 사용할 컬럼(들)의 이름(레이블)

In [10]:
# Wide table: one row per value of A, one column per value of B, cells filled from C.
pivoted1 = df.pivot(index='A', columns='B', values='C')
pivoted1

B,a,b,c
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,1,2,3
two,4,5,6


In [11]:
# melt() with no id_vars unpivots every column; note that the row index (A)
# is not carried into the result.
pivoted1.melt()

Unnamed: 0,B,value
0,a,1
1,a,4
2,b,2
3,b,5
4,c,3
5,c,6


In [12]:
# Rows from A, columns from B, cells filled from D.
pivoted2 = df.pivot(
    index='A',
    columns='B',
    values='D',
)
pivoted2

B,a,b,c
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,2,4,6
two,8,10,12


In [13]:
# Show the source frame again for reference.
df

Unnamed: 0,A,B,C,D
0,one,a,1,2
1,one,b,2,4
2,one,c,3,6
3,two,a,4,8
4,two,b,5,10
5,two,c,6,12


In [14]:
# Passing a list to values keeps both C and D; the value name becomes
# the outer column level above B.
pivoted3 = df.pivot(
    index='A',
    columns='B',
    values=['C', 'D'],
)
pivoted3

Unnamed: 0_level_0,C,C,C,D,D,D
B,a,b,c,a,b,c
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
one,1,2,3,2,4,6
two,4,5,6,8,10,12


In [15]:
# Two yearly observations (2020, 2021) for each of four countries.
countries = [name for name in ('Korea', 'China', 'GB', 'US') for rep in range(2)]
continents = ['Asia', 'Asia', 'Asia', 'Asia', 'Europe', 'Europe', 'America', 'America']
years = [2020, 2021, 2020, 2021, 2020, 2021, 2020, 2021]

# Seed the legacy global NumPy RNG so the random columns are reproducible.
# 'pop' is drawn before 'gdp'; the order of the draws determines the values.
np.random.seed(1)
df = pd.DataFrame({
    'country': countries,
    'continent': continents,
    'year': years,
    'pop': np.random.randint(0, 10000, size=8),
    'gdp': np.random.rand(8),
})

In [16]:
# Display the country/year frame.
df

Unnamed: 0,country,continent,year,pop,gdp
0,Korea,Asia,2020,235,0.18626
1,Korea,Asia,2021,5192,0.345561
2,China,Asia,2020,905,0.396767
3,China,Asia,2021,7813,0.538817
4,GB,Europe,2020,2895,0.419195
5,GB,Europe,2021,5056,0.68522
6,US,America,2020,144,0.204452
7,US,America,2021,4225,0.878117


In [17]:
# Population per country (rows) and year (columns).
df.pivot(index='country', columns='year', values='pop')

year,2020,2021
country,Unnamed: 1_level_1,Unnamed: 2_level_1
China,905,7813
GB,2895,5056
Korea,235,5192
US,144,4225


In [18]:
# Population and GDP per country (rows) and year (columns); the value name
# becomes the outer column level.
# NOTE(review): pop is shown as float here (e.g. 905.0), presumably because it
# is reshaped together with the float gdp values -- confirm if int is needed.
df.pivot(index='country', columns='year', values=['pop', 'gdp'])

Unnamed: 0_level_0,pop,pop,gdp,gdp
year,2020,2021,2020,2021
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
China,905.0,7813.0,0.396767,0.538817
GB,2895.0,5056.0,0.419195,0.68522
Korea,235.0,5192.0,0.18626,0.345561
US,144.0,4225.0,0.204452,0.878117


In [19]:
# pop and gdp with year as the row index; beneath the value name the column
# levels are ordered (country, continent).
df.pivot(index='year', columns=['country', 'continent'], 
         values=['pop', 'gdp'])

Unnamed: 0_level_0,pop,pop,pop,pop,gdp,gdp,gdp,gdp
country,Korea,China,GB,US,Korea,China,GB,US
continent,Asia,Asia,Europe,America,Asia,Asia,Europe,America
year,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
2020,235.0,905.0,2895.0,144.0,0.18626,0.396767,0.419195,0.204452
2021,5192.0,7813.0,5056.0,4225.0,0.345561,0.538817,0.68522,0.878117


In [20]:
# pop and gdp with year as the row index; beneath the value name the column
# levels are ordered (continent, country).
df.pivot(index='year', columns=['continent', 'country'], 
         values=['pop', 'gdp'])

Unnamed: 0_level_0,pop,pop,pop,pop,gdp,gdp,gdp,gdp
continent,Asia,Asia,Europe,America,Asia,Asia,Europe,America
country,Korea,China,GB,US,Korea,China,GB,US
year,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
2020,235.0,905.0,2895.0,144.0,0.18626,0.396767,0.419195,0.204452
2021,5192.0,7813.0,5056.0,4225.0,0.345561,0.538817,0.68522,0.878117


In [21]:
# Six sample bills: gender alternates row by row, the first three rows are
# Lunch and the last three Dinner, and the days cycle Fri/Sat/Sun.
df = pd.DataFrame({
    'gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female'],
    'time': ['Lunch', 'Lunch', 'Lunch', 'Dinner', 'Dinner', 'Dinner'],
    'day': ['Fri', 'Sat', 'Sun', 'Fri', 'Sat', 'Sun'],
    'bill': 10 * np.arange(1, 7),
    'tip': np.arange(1, 7),
})
df

Unnamed: 0,gender,time,day,bill,tip
0,Male,Lunch,Fri,10,1
1,Female,Lunch,Sat,20,2
2,Male,Lunch,Sun,30,3
3,Female,Dinner,Fri,40,4
4,Male,Dinner,Sat,50,5
5,Female,Dinner,Sun,60,6


In [22]:
# pivot() cannot reshape this frame with index='gender' and columns='time':
# several rows share the same (gender, time) pair -- e.g. rows 0 and 2 are both
# ('Male', 'Lunch') -- so one cell would have to hold more than one value.
# pivot() never aggregates; pivot_table() with an aggfunc is the tool for that.
# The ValueError is the point of this cell, so it is caught and displayed
# instead of leaving the call commented out.
try:
    df.pivot(index='gender', columns='time', values='bill')
except ValueError as err:
    print(f'ValueError: {err}')

In [23]:
# Every (gender, day) pair occurs exactly once in df, so pivot() succeeds here.
df.pivot(index='gender', columns='day', values='bill')

day,Fri,Sat,Sun
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,40,20,60
Male,10,50,30


In [24]:
# A list for index builds a two-level row index (gender, time); combinations
# with no matching row show up as NaN.
df.pivot(index=['gender', 'time'], columns='day', values='bill')

Unnamed: 0_level_0,day,Fri,Sat,Sun
gender,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,Dinner,40.0,,60.0
Female,Lunch,,20.0,
Male,Dinner,,50.0,
Male,Lunch,10.0,,30.0


In [25]:
# A list for columns builds two-level columns (day, time); combinations with
# no matching row show up as NaN.
df.pivot(index='gender', columns=['day', 'time'], values='bill')

day,Fri,Sat,Sun,Fri,Sat,Sun
time,Lunch,Lunch,Lunch,Dinner,Dinner,Dinner
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Female,,20.0,,40.0,,60.0
Male,10.0,,30.0,,50.0,


`melt()` 메서드 파라미터: 
* `id_vars`: melting을 할 때, unpivot되지 않을 변수(컬럼)의 이름(들)
* `var_name`: 변수(variable) 컬럼의 이름을 설정
* `value_name`: 값(value) 컬럼의 이름을 설정 
* `value_vars`: melting할 컬럼(들)의 이름 

In [26]:
# Wide-format sample: one row per gender, one column per meal.
df = pd.DataFrame(dict(
    gender=['Female', 'Male'],
    Breakfast=[0, 0],
    Lunch=[1, 3],
    Dinner=[2, 4],
))
df

Unnamed: 0,gender,Breakfast,Lunch,Dinner
0,Female,0,1,2
1,Male,0,3,4


In [27]:
# Keep 'gender' as the identifier; every other column is unpivoted into the
# default 'variable' / 'value' columns.
df.melt(id_vars='gender')

Unnamed: 0,gender,variable,value
0,Female,Breakfast,0
1,Male,Breakfast,0
2,Female,Lunch,1
3,Male,Lunch,3
4,Female,Dinner,2
5,Male,Dinner,4


In [28]:
# Same melt, with custom names for the variable and value columns.
df.melt(id_vars='gender', var_name='time', value_name='size')

Unnamed: 0,gender,time,size
0,Female,Breakfast,0
1,Male,Breakfast,0
2,Female,Lunch,1
3,Male,Lunch,3
4,Female,Dinner,2
5,Male,Dinner,4


In [29]:
# Unpivot only Lunch and Dinner; Breakfast is left out of the result.
df.melt(id_vars='gender', value_vars=['Lunch', 'Dinner'],
        var_name='time', value_name='count')

Unnamed: 0,gender,time,count
0,Female,Lunch,1
1,Male,Lunch,3
2,Female,Dinner,2
3,Male,Dinner,4


## `pivot_table()`
* `index`: pivoting을 할 때, row index로 설정할 컬럼(들)
* `columns`: pivoting을 할 때, 컬럼 이름으로 설정할 컬럼(들)
* `values`: pivoting을 해서 각 셀에 value로 설정할 컬럼(들)
* `aggfunc`: 집계 함수(aggregating function). 기본값은 mean 
    * median, max, min, count, ... 

In [30]:
import seaborn as sns

In [31]:
# Load seaborn's 'tips' example dataset and check its dimensions (rows, columns).
tips = sns.load_dataset('tips')
tips.shape

(244, 7)

In [32]:
# Preview the first five rows.
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [33]:
# Mean tip by sex.
tips.pivot_table(index='sex', values='tip', aggfunc='mean')

Unnamed: 0_level_0,tip
sex,Unnamed: 1_level_1
Male,3.089618
Female,2.833448


In [34]:
# Cross-check of the pivot table: mean tip over the rows where sex is 'Male'.
tips.loc[tips['sex'] == 'Male', 'tip'].mean()

3.0896178343949052

In [35]:
# Cross-check of the pivot table: mean tip over the rows where sex is 'Female'.
tips.loc[tips['sex'] == 'Female', 'tip'].mean()

2.833448275862069

In [36]:
# Mean tip by day (aggfunc is omitted, so the default aggregation is used).
tips.pivot_table(index='day', values='tip')

Unnamed: 0_level_0,tip
day,Unnamed: 1_level_1
Thur,2.771452
Fri,2.734737
Sat,2.993103
Sun,3.255132


In [37]:
# Mean tip (default aggfunc) by sex and time. Both keys are passed to index,
# so the result is long-format with a two-level (sex, time) row index --
# time is NOT placed in the columns here.
tips.pivot_table(index=['sex', 'time'], values='tip')

Unnamed: 0_level_0,Unnamed: 1_level_0,tip
sex,time,Unnamed: 2_level_1
Male,Lunch,2.882121
Male,Dinner,3.144839
Female,Lunch,2.582857
Female,Dinner,3.002115


In [38]:
# Wide layout (sex rows x time columns) followed by stack(): yields the same
# means as a Series indexed by (sex, time).
tips.pivot_table(index='sex', columns='time', values='tip').stack()

sex     time  
Male    Lunch     2.882121
        Dinner    3.144839
Female  Lunch     2.582857
        Dinner    3.002115
dtype: float64

In [39]:
# The long layout obtained directly: both keys in the row index.
tips.pivot_table(index=['sex', 'time'], values='tip')

Unnamed: 0_level_0,Unnamed: 1_level_0,tip
sex,time,Unnamed: 2_level_1
Male,Lunch,2.882121
Male,Dinner,3.144839
Female,Lunch,2.582857
Female,Dinner,3.002115


In [40]:
# Mean tip by sex (rows) and smoker status (columns).
tips.pivot_table(index='sex', columns='smoker', values='tip')

smoker,Yes,No
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,3.051167,3.113402
Female,2.931515,2.773519


In [41]:
# With only columns= given, the keys form two-level columns (sex, smoker) and
# the single result row is labelled 'tip'.
tips.pivot_table(columns=['sex', 'smoker'], values='tip')

sex,Male,Male,Female,Female
smoker,Yes,No,Yes,No
tip,3.051167,3.113402,2.931515,2.773519


In [42]:
# Maximum and minimum tip by sex; a list of aggfuncs adds an outer column
# level (max / min) above the value name 'tip'.
df = tips.pivot_table(index='sex', values='tip', aggfunc=['max', 'min'])
df

Unnamed: 0_level_0,max,min
Unnamed: 0_level_1,tip,tip
sex,Unnamed: 1_level_2,Unnamed: 2_level_2
Male,10.0,1.0
Female,6.5,1.0


In [43]:
# Number of levels in the column index (aggregate name, value name).
df.columns.nlevels

2

In [44]:
# Maximum and minimum tip by sex (rows) and day (columns).
df = tips.pivot_table(index='sex', columns='day', values='tip', aggfunc=['max', 'min'])
df

Unnamed: 0_level_0,max,max,max,max,min,min,min,min
day,Thur,Fri,Sat,Sun,Thur,Fri,Sat,Sun
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Male,6.7,4.73,10.0,6.5,1.44,1.5,1.0,1.32
Female,5.17,4.3,6.5,5.2,1.25,1.0,1.0,1.01


In [45]:
# Unpivot the two-level columns back to long form: ignore_index=False keeps
# 'sex' as the index, and the two column levels become the 'agg' and 'day' columns.
df.melt(ignore_index=False, var_name=['agg', 'day'], value_name='tip')

Unnamed: 0_level_0,agg,day,tip
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,max,Thur,6.7
Female,max,Thur,5.17
Male,max,Fri,4.73
Female,max,Fri,4.3
Male,max,Sat,10.0
Female,max,Sat,6.5
Male,max,Sun,6.5
Female,max,Sun,5.2
Male,min,Thur,1.44
Female,min,Thur,1.25


In [46]:
# Maximum and minimum tip with sex and day both in the row index.
tips.pivot_table(index=['sex', 'day'], values='tip', aggfunc=['max', 'min'])

Unnamed: 0_level_0,Unnamed: 1_level_0,max,min
Unnamed: 0_level_1,Unnamed: 1_level_1,tip,tip
sex,day,Unnamed: 2_level_2,Unnamed: 3_level_2
Male,Thur,6.7,1.44
Male,Fri,4.73,1.5
Male,Sat,10.0,1.0
Male,Sun,6.5,1.32
Female,Thur,5.17,1.25
Female,Fri,4.3,1.0
Female,Sat,6.5,1.0
Female,Sun,5.2,1.01


In [47]:
# Median tip by sex and smoker status (rows) and day (columns).
tips.pivot_table(index=['sex', 'smoker'], columns='day', values='tip', aggfunc='median')

Unnamed: 0_level_0,day,Thur,Fri,Sat,Sun
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Male,Yes,2.78,2.6,3.0,3.5
Male,No,2.405,2.5,2.86,3.0
Female,Yes,2.5,2.5,2.5,3.5
Female,No,2.0,3.125,2.75,3.5


In [48]:
# Median tip by sex and smoker status (rows) and by time and day (two-level columns).
tips.pivot_table(index=['sex', 'smoker'], columns=['time', 'day'], values='tip', aggfunc='median')

Unnamed: 0_level_0,time,Lunch,Lunch,Dinner,Dinner,Dinner,Dinner
Unnamed: 0_level_1,day,Thur,Fri,Thur,Fri,Sat,Sun
sex,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Male,Yes,2.78,1.92,,3.0,3.0,3.5
Male,No,2.405,,,2.5,2.86,3.0
Female,Yes,2.5,2.5,,2.75,2.5,3.5
Female,No,2.0,3.0,3.0,3.25,2.75,3.5


In [49]:
# Total number of guests on Thursday only (sum of the party sizes).
tips.loc[tips['day'] == 'Thur', 'size'].sum()

152

In [50]:
# Total number of guests per day (sum of 'size').
tips.pivot_table(index='day', values='size', aggfunc='sum')

Unnamed: 0_level_0,size
day,Unnamed: 1_level_1
Thur,152
Fri,40
Sat,219
Sun,216


In [51]:
# Total number of guests by day (rows) and time (columns); day/time
# combinations without a value appear as NaN.
tips.pivot_table(index='day', columns='time', values='size', aggfunc='sum')

time,Lunch,Dinner
day,Unnamed: 1_level_1,Unnamed: 2_level_1
Thur,150.0,2.0
Fri,14.0,26.0
Sat,,219.0
Sun,,216.0


In [52]:
# Same totals in long form, with day and time both in the row index.
tips.pivot_table(index=['day', 'time'], values='size', aggfunc='sum')

Unnamed: 0_level_0,Unnamed: 1_level_0,size
day,time,Unnamed: 2_level_1
Thur,Lunch,150
Thur,Dinner,2
Fri,Lunch,14
Fri,Dinner,26
Sat,Dinner,219
Sun,Dinner,216


In [53]:
# Gotcha: attribute access `tips.size` is the DataFrame's element count
# (rows x columns), not the 'size' column -- use tips['size'] for the column.
tips.size

1708

In [54]:
# Rows x columns, which matches the value of tips.size.
tips.shape[0] * tips.shape[1]

1708