D14 用 pandas 撰寫樞紐分析表

In [1]:
import pandas as pd
import numpy as np

In [11]:
# Build a demo DataFrame that has a two-level row index and two-level columns

# 1. Row index: every (year, visit) combination
index = pd.MultiIndex.from_product(
    [[2013, 2014], [1, 2]],     # outer level values, inner level values
    names=['year', 'visit'],    # level names
)

# 2. Column index: every (subject, type) combination
columns = pd.MultiIndex.from_product(
    [['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
    names=['subject', 'type'],
)

# 3. Mock values: a 4x6 array of standard-normal draws rounded to one decimal.
#    No seed is set, so the numbers differ on every run.
data = np.random.randn(4, 6).round(1)

# 4. Combine the row index, column index and ndarray into one DataFrame
df = pd.DataFrame(data, index=index, columns=columns)
df

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,1.0,-0.6,1.4,0.1,0.7,1.1
2013,2,1.3,2.7,-0.4,-0.3,0.8,0.2
2014,1,0.1,0.5,1.2,0.8,-0.5,-0.4
2014,2,-0.5,-1.6,-2.1,-1.3,-1.4,0.1


## 欄位轉索引

### .stack()：將column轉成index

- .stack() 會由最內層的欄位開始轉換；若原表格的欄位有兩層，且要全部轉成索引，則需使用 df.stack().stack()，依此類推

In [17]:
# Columns -> index: stack the innermost column level (the default, level=-1)
# into the row index
df.stack(level=-1)

Unnamed: 0_level_0,Unnamed: 1_level_0,subject,Bob,Guido,Sue
year,visit,type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013,1,HR,1.0,1.4,0.7
2013,1,Temp,-0.6,0.1,1.1
2013,2,HR,1.3,-0.4,0.8
2013,2,Temp,2.7,-0.3,0.2
2014,1,HR,0.1,1.2,-0.5
2014,1,Temp,0.5,0.8,-0.4
2014,2,HR,-0.5,-2.1,-1.4
2014,2,Temp,-1.6,-1.3,0.1


In [18]:
# Each stack() call moves only the innermost column level into the row index,
# so two chained calls are needed to move both column levels
df.stack(level=-1).stack(level=-1)

year  visit  type  subject
2013  1      HR    Bob        1.0
                   Guido      1.4
                   Sue        0.7
             Temp  Bob       -0.6
                   Guido      0.1
                   Sue        1.1
      2      HR    Bob        1.3
                   Guido     -0.4
                   Sue        0.8
             Temp  Bob        2.7
                   Guido     -0.3
                   Sue        0.2
2014  1      HR    Bob        0.1
                   Guido      1.2
                   Sue       -0.5
             Temp  Bob        0.5
                   Guido      0.8
                   Sue       -0.4
      2      HR    Bob       -0.5
                   Guido     -2.1
                   Sue       -1.4
             Temp  Bob       -1.6
                   Guido     -1.3
                   Sue        0.1
dtype: float64

## 索引轉欄位

In [19]:
# Index -> columns: unstack the innermost row-index level (the default,
# level=-1) into the columns
df.unstack(level=-1)

subject,Bob,Bob,Bob,Bob,Guido,Guido,Guido,Guido,Sue,Sue,Sue,Sue
type,HR,HR,Temp,Temp,HR,HR,Temp,Temp,HR,HR,Temp,Temp
visit,1,2,1,2,1,2,1,2,1,2,1,2
year,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
2013,1.0,1.3,-0.6,2.7,1.4,-0.4,0.1,-0.3,0.7,0.8,1.1,0.2
2014,0.1,-0.5,0.5,-1.6,1.2,-2.1,0.8,-1.3,-0.5,-1.4,-0.4,0.1


## 欄位名稱轉欄位值(wide data -> long data)

#### .melt() 參數
- id_vars：不需要被轉換（保持原樣）的欄位名稱
- value_vars：需要轉換的欄位名稱；如果其餘欄位全部都要轉換，可以省略不寫。


In [20]:
# Build the source dataset; each column is given as a {row label: value} mapping
df = pd.DataFrame({
    'Name': dict(enumerate(['John', 'Bob', 'Shiela'])),
    'Course': dict(enumerate(['Master', 'Graduate', 'Graduate'])),
    'Age': dict(enumerate([27, 23, 21])),
})
df

Unnamed: 0,Name,Course,Age
0,John,Master,27
1,Bob,Graduate,23
2,Shiela,Graduate,21


### 無指定特定欄位，將所有欄位轉成欄位值

In [21]:
# With no id_vars given, every column is melted into (variable, value) rows
pd.melt(df)

Unnamed: 0,variable,value
0,Name,John
1,Name,Bob
2,Name,Shiela
3,Course,Master
4,Course,Graduate
5,Course,Graduate
6,Age,27
7,Age,23
8,Age,21


### 保留 Name 欄位，其餘轉成欄位值

In [22]:
# Keep 'Name' as an identifier column; melt the remaining columns into
# (variable, value) rows
pd.melt(df, id_vars=['Name'])

Unnamed: 0,Name,variable,value
0,John,Course,Master
1,Bob,Course,Graduate
2,Shiela,Course,Graduate
3,John,Age,27
4,Bob,Age,23
5,Shiela,Age,21


## .pivot()：重新組織資料

.pivot() 函數會根據指定的索引／欄位值，重新組織給定的 DataFrame

參數
- index: 作為新資料索引的欄位名稱
- columns: 其值將成為新資料欄位名稱的欄位
- values: 用來填入新資料內容的欄位名稱

In [23]:
# Build the dataset for the pivot examples: two 'fff' groups ('one', 'two'),
# each with one row per 'bbb' key (P, Q, R)
df = pd.DataFrame({
    'fff': ['one'] * 3 + ['two'] * 3,
    'bbb': list('PQR') * 2,
    'baa': list(range(2, 8)),
    'zzz': list('hijklm'),
})
df

Unnamed: 0,fff,bbb,baa,zzz
0,one,P,2,h
1,one,Q,3,i
2,one,R,4,j
3,two,P,5,k
4,two,Q,6,l
5,two,R,7,m


In [24]:
# Reshape with an explicit index, columns and a single values column:
# rows become the distinct 'fff' values, columns the distinct 'bbb' values,
# and each cell holds the matching 'baa' value.
# Note: `index` and `columns` may also be given as lists of column names when
# several key columns are needed. All three arguments must be passed by keyword
# or all in position; a positional argument after keyword arguments is a
# SyntaxError.
df.pivot(index='fff',
         columns='bbb',
         values='baa')

bbb,P,Q,R
fff,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,2,3,4
two,5,6,7


In [25]:
# Pivot with several value columns at once: the result has two-level columns,
# with the value column name on top and the 'bbb' key underneath
pd.pivot(df, index='fff', columns='bbb', values=['baa', 'zzz'])

Unnamed: 0_level_0,baa,baa,baa,zzz,zzz,zzz
bbb,P,Q,R,P,Q,R
fff,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
one,2,3,4,h,i,j
two,5,6,7,k,l,m
