# データの変形

## ピボットとアンピボット

### ピボット

In [1]:
import pandas as pd

# Load the tips dataset, reading the four label columns as pandas categoricals.
tips = pd.read_csv(
    "https://raw.githubusercontent.com/plotly/datasets/master/tips.csv",
    dtype=dict.fromkeys(["sex", "smoker", "day", "time"], "category"),
)
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [2]:
# Mean total_bill for each smoker x time combination.
# aggfunc="mean" is pivot_table's default, spelled out here for clarity.
tips.pivot_table(
    values="total_bill",
    index="smoker",
    columns="time",
    aggfunc="mean",
)

time,Dinner,Lunch
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1
No,20.09566,17.050889
Yes,21.859429,17.39913


In [3]:
import numpy as np

# Median total_bill for each smoker x time combination.
tips.pivot_table(
    values="total_bill",
    index="smoker",
    columns="time",
    aggfunc="median",
)

time,Dinner,Lunch
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1
No,18.265,15.95
Yes,20.695,16.0


In [4]:
# Count rows per (day, time) x (smoker, sex).
# All four keys are categoricals, so `observed` decides whether category
# combinations that never occur in the data (e.g. Sat/Lunch) are kept.
# observed=False keeps them as zero-count rows; it is passed explicitly
# because relying on the default emits a FutureWarning on pandas 2.2 and
# the default changes in later releases.
tips.pivot_table(
    index=["day", "time"],
    columns=["smoker", "sex"],
    values=["size"],
    aggfunc="count",
    observed=False,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,size,size
Unnamed: 0_level_1,smoker,No,No,Yes,Yes
Unnamed: 0_level_2,sex,Female,Male,Female,Male
day,time,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
Fri,Dinner,1,2,4,5
Fri,Lunch,1,0,3,3
Sat,Dinner,13,32,15,27
Sat,Lunch,0,0,0,0
Sun,Dinner,14,43,4,15
Sun,Lunch,0,0,0,0
Thur,Dinner,1,0,0,0
Thur,Lunch,24,20,7,10


In [5]:
# Mean party size by time x smoker. margins=True appends an "All" row and
# column holding the overall means; aggfunc and margins_name are the
# pivot_table defaults, spelled out here.
tips.pivot_table(
    values="size",
    index="time",
    columns="smoker",
    aggfunc="mean",
    margins=True,
    margins_name="All",
)

smoker,No,Yes,All
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dinner,2.735849,2.471429,2.630682
Lunch,2.511111,2.217391,2.411765
All,2.668874,2.408602,2.569672


In [6]:
# Number of tip values per day x time. Does the same job as:
# tips.pivot_table(
#     index="day", columns="time", values="tip", aggfunc="count"
# )
pd.crosstab(
    index=tips["day"],
    columns=tips["time"],
    values=tips["tip"],
    aggfunc="count",
)

time,Dinner,Lunch
day,Unnamed: 1_level_1,Unnamed: 2_level_1
Fri,12,7
Sat,87,0
Sun,76,0
Thur,1,61


In [7]:
# Frequency table of day x time expressed as a share of all rows
# (normalize="all" is the spelled-out form of normalize=True).
pd.crosstab(
    index=tips["day"],
    columns=tips["time"],
    normalize="all",
)

time,Dinner,Lunch
day,Unnamed: 1_level_1,Unnamed: 2_level_1
Fri,0.04918,0.028689
Sat,0.356557,0.0
Sun,0.311475,0.0
Thur,0.004098,0.25


### アンピボット

In [8]:
# Build a small wide table (mean total_bill, smoker x time), then move
# `smoker` out of the index into a regular column.
pivoted_tips = (
    tips.pivot_table(values="total_bill", index="smoker", columns="time")
    .reset_index()
)
pivoted_tips

time,smoker,Dinner,Lunch
0,No,20.09566,17.050889
1,Yes,21.859429,17.39913


In [9]:
# Unpivot the wide table back to long form: one row per (smoker, time) pair.
# var_name must be a scalar when the frame has single-level columns; a
# one-element list is rejected with ValueError by recent pandas releases.
pivoted_tips.melt(
    id_vars=["smoker"],
    value_vars=["Dinner", "Lunch"],
    var_name="time",
    value_name="total_bill",
)

Unnamed: 0,smoker,time,total_bill
0,No,Dinner,20.09566
1,Yes,Dinner,21.859429
2,No,Lunch,17.050889
3,Yes,Lunch,17.39913


## スタックとアンスタック

### スタック

In [10]:
# Stack the columns into the innermost index level (level=-1 is the default),
# producing a Series indexed by (row label, column name).
tips_stacked = tips.stack(level=-1)
# 14 entries = the 7 columns of the first two original rows.
tips_stacked.head(n=14)

0  total_bill     16.99
   tip             1.01
   sex           Female
   smoker            No
   day              Sun
   time          Dinner
   size               2
1  total_bill     10.34
   tip             1.66
   sex             Male
   smoker            No
   day              Sun
   time          Dinner
   size               3
dtype: object

In [11]:
# Show the first 14 entries of the stacked Series' index.
tips_stacked.index[:14]

MultiIndex([(0, 'total_bill'),
            (0,        'tip'),
            (0,        'sex'),
            (0,     'smoker'),
            (0,        'day'),
            (0,       'time'),
            (0,       'size'),
            (1, 'total_bill'),
            (1,        'tip'),
            (1,        'sex'),
            (1,     'smoker'),
            (1,        'day'),
            (1,       'time'),
            (1,       'size')],
           )

### アンスタック

In [12]:
# Pivot the innermost index level (level=-1, the default) back into columns.
tips_stacked.unstack(level=-1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.0,Female,Yes,Sat,Dinner,2
241,22.67,2.0,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


### MultiIndexのスタック・アンスタック

In [13]:
# (1) Group by `time`, keep the total_bill and tip columns, and compute the
#     mean and median of each.
groupby_time = (
    tips.groupby("time")[["total_bill", "tip"]]
    .agg(("mean", "median"))
)
# (2) Name the two levels of the resulting column MultiIndex.
groupby_time.columns.names = ["value", "agg"]
groupby_time

value,total_bill,total_bill,tip,tip
agg,mean,median,mean,median
time,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Dinner,20.797159,18.39,3.10267,3.0
Lunch,17.168676,15.965,2.728088,2.25


In [14]:
# Column levels: level=0 is "value", level=1 is "agg".
groupby_time.columns.names

FrozenList(['value', 'agg'])

In [15]:
# Choose the column level to stack by its position.
groupby_time.stack(level=0)

Unnamed: 0_level_0,agg,mean,median
time,value,Unnamed: 2_level_1,Unnamed: 3_level_1
Dinner,tip,3.10267,3.0
Dinner,total_bill,20.797159,18.39
Lunch,tip,2.728088,2.25
Lunch,total_bill,17.168676,15.965


In [16]:
# Choose the column level to stack by its name.
groupby_time.stack(level="agg")

Unnamed: 0_level_0,value,total_bill,tip
time,agg,Unnamed: 2_level_1,Unnamed: 3_level_1
Dinner,mean,20.797159,3.10267
Dinner,median,18.39,3.0
Lunch,mean,17.168676,2.728088
Lunch,median,15.965,2.25


In [17]:
# Move the "value" column level into the row index and keep the result
# for the unstack examples.
groupby_time_stack = groupby_time.stack(level="value")
groupby_time_stack

Unnamed: 0_level_0,agg,mean,median
time,value,Unnamed: 2_level_1,Unnamed: 3_level_1
Dinner,tip,3.10267,3.0
Dinner,total_bill,20.797159,18.39
Lunch,tip,2.728088,2.25
Lunch,total_bill,17.168676,15.965


In [18]:
# Index levels: level=0 is "time", level=1 is "value".
groupby_time_stack.index.names

FrozenList(['time', 'value'])

In [19]:
# Choose the index level to unstack by its position.
groupby_time_stack.unstack(level=1)

agg,mean,mean,median,median
value,tip,total_bill,tip,total_bill
time,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Dinner,3.10267,20.797159,3.0,18.39
Lunch,2.728088,17.168676,2.25,15.965


In [20]:
# Choose the index level to unstack by its name.
groupby_time_stack.unstack(level="time")

agg,mean,mean,median,median
time,Dinner,Lunch,Dinner,Lunch
value,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
tip,3.10267,2.728088,3.0,2.25
total_bill,20.797159,17.168676,18.39,15.965


## ダミー変数

In [21]:
# One indicator column per category of `day`.
pd.get_dummies(tips["day"])

Unnamed: 0,Fri,Sat,Sun,Thur
0,False,False,True,False
1,False,False,True,False
2,False,False,True,False
3,False,False,True,False
4,False,False,True,False
...,...,...,...,...
239,False,True,False,False
240,False,True,False,False
241,False,True,False,False
242,False,True,False,False


In [22]:
# Replace only `smoker` and `time` with indicator columns; the remaining
# columns are passed through unchanged.
pd.get_dummies(
    data=tips,
    columns=["smoker", "time"],
).head()

Unnamed: 0,total_bill,tip,sex,day,size,smoker_No,smoker_Yes,time_Dinner,time_Lunch
0,16.99,1.01,Female,Sun,2,True,False,True,False
1,10.34,1.66,Male,Sun,3,True,False,True,False
2,21.01,3.5,Male,Sun,3,True,False,True,False
3,23.68,3.31,Male,Sun,2,True,False,True,False
4,24.59,3.61,Female,Sun,4,True,False,True,False


## 要素の展開

In [23]:
# One record per student; the course and score cells each hold a list,
# with the two lists in a row having the same length.
students = pd.DataFrame(
    [
        {
            "名前": "寺田",
            "履修科目": ["国語", "英語", "数学"],
            "得点": [78, 65, 89],
        },
        {
            "名前": "辻",
            "履修科目": ["英語", "物理"],
            "得点": [90, 82],
        },
    ]
)
students

Unnamed: 0,名前,履修科目,得点
0,寺田,"[国語, 英語, 数学]","[78, 65, 89]"
1,辻,"[英語, 物理]","[90, 82]"


In [24]:
# Expand each list in the score column into one row per element;
# the original row label is repeated for every element.
students["得点"].explode()

0    78
0    65
0    89
1    90
1    82
Name: 得点, dtype: object

In [25]:
# Explode both list columns together so each course stays paired with its score.
students.explode(column=["履修科目", "得点"])

Unnamed: 0,名前,履修科目,得点
0,寺田,国語,78
0,寺田,英語,65
0,寺田,数学,89
1,辻,英語,90
1,辻,物理,82
