#  <font color=red> Module_07_結合、關聯以及重塑資料</font>

## 串連 (concatenation) 幾個物件的資料

In [1]:
import pandas as pd
import numpy as np
from datetime import date

# Two three-element integer Series. Neither gets an explicit index, so both
# carry the default labels 0..2 and their labels collide when concatenated.
s1, s2 = (pd.Series(np.arange(start, start + 3)) for start in (0, 5))
s1

0    0
1    1
2    2
dtype: int32

In [2]:
s2

0    5
1    6
2    7
dtype: int32

In [3]:
# With the default axis=0, concat stacks the inputs one after another along the index labels.
# Both Series carry the labels 0..2, so the result contains duplicate index labels.
pd.concat([s1, s2])

0    0
1    1
2    2
0    5
1    6
2    7
dtype: int32

In [4]:
# axis=1 puts each Series into its own column and aligns the rows on the union of the index labels.
pd.concat([s1, s2], axis = 1)

Unnamed: 0,0,1
0,0,5
1,1,6
2,2,7


---

In [5]:
# Three Series with mutually disjoint string labels, written as label -> value mappings.
s1 = pd.Series({'a': 0, 'b': 1})
s2 = pd.Series({'c': 2, 'd': 3, 'e': 4})
s3 = pd.Series({'f': 5, 'g': 6})

In [6]:
s1

a    0
b    1
dtype: int64

In [7]:
s2

c    2
d    3
e    4
dtype: int64

In [8]:
s3

f    5
g    6
dtype: int64

In [9]:
pd.concat([s1, s2, s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [10]:
pd.concat([s1, s2, s3], axis = 1)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [11]:
s4 = pd.concat([s1, s3])
s4

a    0
b    1
f    5
g    6
dtype: int64

In [12]:
s1

a    0
b    1
dtype: int64

In [13]:
pd.concat([s1, s4], axis = 1)

Unnamed: 0,0,1
a,0.0,0
b,1.0,1
f,,5
g,,6


In [14]:
# join='outer' is the default: the inputs are stacked along the index labels and the
# other axis is aligned by union. Labels 'a' and 'b' occur in both s1 and s4, so they
# appear twice in the result.
pd.concat([s1, s4])

a    0
b    1
a    0
b    1
f    5
g    6
dtype: int64

In [15]:
 # join='inner' aligns the other axis by intersection instead of union. Series stacked
 # along axis 0 have no second axis to intersect, so the result shown here is the same
 # as with the default join='outer'.
pd.concat([s1, s4], join = 'inner')

a    0
b    1
a    0
b    1
f    5
g    6
dtype: int64

In [16]:
s1

a    0
b    1
dtype: int64

In [17]:
s4

a    0
b    1
f    5
g    6
dtype: int64

In [18]:
pd.concat([s1, s4], axis = 1, join = 'inner')

Unnamed: 0,0,1
a,0,0
b,1,1


---

In [19]:
s1

a    0
b    1
dtype: int64

In [20]:
s3

f    5
g    6
dtype: int64

In [21]:
# keys= labels each input in the result so that its source can be told apart.
# The keys become the outer index level along the concatenation axis.
# The same object (s1 here) may be passed more than once.
result = pd.concat([s1, s1, s3], keys = ['one', 'two', 'three'])
result

one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64

In [22]:
pd.concat([s1, s1, s3], keys = ['one', 'two', 'three'], axis = 1)

Unnamed: 0,one,two,three
a,0.0,0.0,
b,1.0,1.0,
f,,,5.0
g,,,6.0


---

In [23]:
result

one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64

In [24]:
# 解堆疊
result.unstack()

Unnamed: 0,a,b,f,g
one,0.0,1.0,,
two,0.0,1.0,,
three,,,5.0,6.0


In [25]:
pd.concat([s1, s2, s3], keys = ['one', 'two', 'three'], axis = 1)

Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


---

In [26]:
# Two 3x3 integer frames with identical column labels a/b/c;
# df2 continues counting (9..17) where df1 (0..8) stops.
df1, df2 = (
    pd.DataFrame(np.arange(first, first + 9).reshape(3, 3), columns=list('abc'))
    for first in (0, 9)
)
df1

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8


In [27]:
df2

Unnamed: 0,a,b,c
0,9,10,11
1,12,13,14
2,15,16,17


In [28]:
pd.concat([df1, df2]) # default: concatenate along the index labels (axis 0), union-align the columns (axis 1)

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8
0,9,10,11
1,12,13,14
2,15,16,17


In [29]:
pd.concat([df1, df2], axis = 1) # concatenate along the columns (axis 1), union-align the index labels (axis 0)

Unnamed: 0,a,b,c,a.1,b.1,c.1
0,0,1,2,9,10,11
1,3,4,5,12,13,14
2,6,7,8,15,16,17


---

In [30]:
df1 = pd.DataFrame(np.arange(9).reshape(3, 3),
                  columns = ['a', 'b', 'c'])
df2 = pd.DataFrame(np.arange(9, 18).reshape(3, 3),
                  columns = ['a', 'c', 'd'])
df1

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8


In [31]:
df2

Unnamed: 0,a,c,d
0,9,10,11
1,12,13,14
2,15,16,17


In [32]:
pd.concat([df1, df2])

Unnamed: 0,a,b,c,d
0,0,1.0,2,
1,3,4.0,5,
2,6,7.0,8,
0,9,,10,11.0
1,12,,13,14.0
2,15,,16,17.0


In [33]:
c = pd.concat([df1, df2], keys = ['df1', 'df2'])
c

Unnamed: 0,Unnamed: 1,a,b,c,d
df1,0,0,1.0,2,
df1,1,3,4.0,5,
df1,2,6,7.0,8,
df2,0,9,,10,11.0
df2,1,12,,13,14.0
df2,2,15,,16,17.0


In [34]:
c.loc['df2']

Unnamed: 0,a,b,c,d
0,9,,10,11.0
1,12,,13,14.0
2,15,,16,17.0


In [35]:
pd.concat([df1, df2], axis = 1)

Unnamed: 0,a,b,c,a.1,c.1,d
0,0,1,2,9,10,11
1,3,4,5,12,13,14
2,6,7,8,15,16,17


In [36]:
df3 = pd.DataFrame(np.arange(20, 26).reshape(3, 2),
                  columns = ['a', 'd'],
                  index = [2, 3, 4])
df3

Unnamed: 0,a,d
2,20,21
3,22,23
4,24,25


In [37]:
df1

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8


In [38]:
pd.concat([df1, df3], axis = 1)

Unnamed: 0,a,b,c,a.1,d
0,0.0,1.0,2.0,,
1,3.0,4.0,5.0,,
2,6.0,7.0,8.0,20.0,21.0
3,,,,22.0,23.0
4,,,,24.0,25.0


---

In [39]:
df1

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8


In [40]:
df2

Unnamed: 0,a,c,d
0,9,10,11
1,12,13,14
2,15,16,17


In [41]:
df3

Unnamed: 0,a,d
2,20,21
3,22,23
4,24,25


In [42]:
# join='outer' (the default) aligns on the union of the labels.
# With axis=1 and join='inner' the frames are placed side by side and only the
# index labels present in both frames are kept.
pd.concat([df1, df3], axis = 1, join = 'inner') 

Unnamed: 0,a,b,c,a.1,d
2,6,7,8,20,21


In [43]:
df = pd.concat([df1, df2], axis = 1, keys = ['df1', 'df2'])
df

Unnamed: 0_level_0,df1,df1,df1,df2,df2,df2
Unnamed: 0_level_1,a,b,c,a,c,d
0,0,1,2,9,10,11
1,3,4,5,12,13,14
2,6,7,8,15,16,17


In [44]:
# df['df2'] 在這會得到同樣的答案
df.loc[:, 'df2']

Unnamed: 0,a,c,d
0,9,10,11
1,12,13,14
2,15,16,17


In [45]:
# 忽略索引標籤，用預設
pd.concat([df1, df2], ignore_index = True)

Unnamed: 0,a,b,c,d
0,0,1.0,2,
1,3,4.0,5,
2,6,7.0,8,
3,9,,10,11.0
4,12,,13,14.0
5,15,,16,17.0


---

In [46]:
df1

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8


In [47]:
df2

Unnamed: 0,a,c,d
0,9,10,11
1,12,13,14
2,15,16,17


In [48]:
# DataFrame.append() was deprecated in pandas 1.4 and removed in pandas 2.0.
# It only ever stacked rows along the index labels and union-aligned the columns,
# which is exactly what pd.concat() does by default, so call pd.concat() instead.
pd.concat([df1, df2])

Unnamed: 0,a,b,c,d
0,0,1.0,2,
1,3,4.0,5,
2,6,7.0,8,
0,9,,10,11.0
1,12,,13,14.0
2,15,,16,17.0


In [49]:
# Replacement for the removed df1.append(df2, ignore_index=True):
# stack the rows and renumber the index 0..n-1 instead of keeping the original labels.
pd.concat([df1, df2], ignore_index = True)

Unnamed: 0,a,b,c,d
0,0,1.0,2,
1,3,4.0,5,
2,6,7.0,8,
3,9,,10,11.0
4,12,,13,14.0
5,15,,16,17.0


## 合併與連接資料

In [50]:
# Customer table, written row by row: one record per customer with its id, name and address.
customers = pd.DataFrame(
    [
        (10, 'Mike', 'Address for Mike'),
        (11, 'Marcia', 'Address for Marcia'),
    ],
    columns=['CustomerID', 'Name', 'Address'],
)
customers

Unnamed: 0,CustomerID,Name,Address
0,10,Mike,Address for Mike
1,11,Marcia,Address for Marcia


In [51]:
# `date` holds a calendar date only, whereas `datetime` also carries a time of day;
# the two are used in much the same way.
# Three orders placed on the same day; CustomerID 10 appears twice.
orders = pd.DataFrame({
    'CustomerID': [10, 11, 10],
    'OrderDate': [date(2014, 12, 1) for _ in range(3)],
})
orders

Unnamed: 0,CustomerID,OrderDate
0,10,2014-12-01
1,11,2014-12-01
2,10,2014-12-01


In [52]:
# Works like a relational-database join. The key can be given explicitly with on=;
# when it is omitted, the columns common to both frames are used.
# The join key column appears only once in the result.
# The module-level function pd.merge() can be used instead of the method.
customers.merge(orders) 

Unnamed: 0,CustomerID,Name,Address,OrderDate
0,10,Mike,Address for Mike,2014-12-01
1,10,Mike,Address for Mike,2014-12-01
2,11,Marcia,Address for Marcia,2014-12-01


In [53]:
orders.merge(customers)

Unnamed: 0,CustomerID,OrderDate,Name,Address
0,10,2014-12-01,Mike,Address for Mike
1,10,2014-12-01,Mike,Address for Mike
2,11,2014-12-01,Marcia,Address for Marcia


---

In [54]:
# Column data for two frames that share the key columns key1/key2.
# The key2 values differ in the second row ('y' on the left, 'a' on the right).
left_data = dict(key1=list('abc'), key2=list('xyz'), lval1=[0, 1, 2])
right_data = dict(key1=list('abc'), key2=list('xaz'), rval1=[6, 7, 8])
# The row labels are offset (0..2 vs 1..3), so the two indexes only partly overlap.
left, right = (
    pd.DataFrame(columns_by_name, index=row_labels)
    for columns_by_name, row_labels in ((left_data, [0, 1, 2]), (right_data, [1, 2, 3]))
)
left

Unnamed: 0,key1,key2,lval1
0,a,x,0
1,b,y,1
2,c,z,2


In [55]:
right

Unnamed: 0,key1,key2,rval1
1,a,x,6
2,b,a,7
3,c,z,8


In [56]:
left.merge(right)

Unnamed: 0,key1,key2,lval1,rval1
0,a,x,0,6
1,c,z,2,8


In [57]:
# _x 表示來自左物件 _y表示來自右物件 
# 可用 suffixes 修改
left.merge(right, on = 'key1') 

Unnamed: 0,key1,key2_x,lval1,key2_y,rval1
0,a,x,0,x,6
1,b,y,1,a,7
2,c,z,2,z,8


In [58]:
left.merge(right, on = 'key1', suffixes = ['_l', '_r'])

Unnamed: 0,key1,key2_l,lval1,key2_r,rval1
0,a,x,0,x,6
1,b,y,1,a,7
2,c,z,2,z,8


---

In [59]:
left

Unnamed: 0,key1,key2,lval1
0,a,x,0
1,b,y,1
2,c,z,2


In [60]:
right

Unnamed: 0,key1,key2,rval1
1,a,x,6
2,b,a,7
3,c,z,8


In [61]:
left.merge(right, left_on = 'key1', right_on = 'key2') # 也可以對不同行的行名來合併

Unnamed: 0,key1_x,key2_x,lval1,key1_y,key2_y,rval1
0,a,x,0,b,a,7


---

In [62]:
left

Unnamed: 0,key1,key2,lval1
0,a,x,0
1,b,y,1
2,c,z,2


In [63]:
right

Unnamed: 0,key1,key2,rval1
1,a,x,6
2,b,a,7
3,c,z,8


In [64]:
# To merge on the index labels instead of on columns, set both left_index=True and right_index=True.
# Rows are then matched on the intersection of the index labels and the columns are placed
# side by side. The values equal those of pd.concat([left, right], axis=1, join='inner') in
# the next cell; only the naming of the duplicated columns differs (_x/_y suffixes here).
left.merge(right, left_index = True, right_index = True) 

Unnamed: 0,key1_x,key2_x,lval1,key1_y,key2_y,rval1
1,b,y,1,a,x,6
2,c,z,2,b,a,7


In [65]:
pd.concat([left, right], axis = 1, join = 'inner')

Unnamed: 0,key1,key2,lval1,key1.1,key2.1,rval1
1,b,y,1,a,x,6
2,c,z,2,b,a,7


---

In [66]:
left

Unnamed: 0,key1,key2,lval1
0,a,x,0
1,b,y,1
2,c,z,2


In [67]:
right

Unnamed: 0,key1,key2,rval1
1,a,x,6
2,b,a,7
3,c,z,8


In [68]:
left.merge(right, how = 'inner') # how='inner' is the default: keep only the key combinations present in both frames

Unnamed: 0,key1,key2,lval1,rval1
0,a,x,0,6
1,c,z,2,8


In [69]:
left.merge(right, how = 'outer') # how='outer' (not the default, which is 'inner'): keep the union of the key combinations of both frames

Unnamed: 0,key1,key2,lval1,rval1
0,a,x,0.0,6.0
1,b,y,1.0,
2,c,z,2.0,8.0
3,b,a,,7.0


In [70]:
left.merge(right, how = 'left') # keep every key combination of the left frame; unmatched rows get NaN in the right-hand columns

Unnamed: 0,key1,key2,lval1,rval1
0,a,x,0,6.0
1,b,y,1,
2,c,z,2,8.0


In [71]:
left.merge(right, how = 'right') # keep every key combination of the right frame; unmatched rows get NaN in the left-hand columns

Unnamed: 0,key1,key2,lval1,rval1
0,a,x,0.0,6
1,b,a,,7
2,c,z,2.0,8


---

In [72]:
left.merge(right, how = 'cross') 

Unnamed: 0,key1_x,key2_x,lval1,key1_y,key2_y,rval1
0,a,x,0,a,x,6
1,a,x,0,b,a,7
2,a,x,0,c,z,8
3,b,y,1,a,x,6
4,b,y,1,b,a,7
5,b,y,1,c,z,8
6,c,z,2,a,x,6
7,c,z,2,b,a,7
8,c,z,2,c,z,8


---

In [73]:
left

Unnamed: 0,key1,key2,lval1
0,a,x,0
1,b,y,1
2,c,z,2


In [74]:
right

Unnamed: 0,key1,key2,rval1
1,a,x,6
2,b,a,7
3,c,z,8


In [75]:
# Both frames contain the columns 'key1' and 'key2' and no suffix is given, so join()
# cannot name the overlapping columns and raises ValueError. The error is the point of
# this cell; catch and print it so that Restart & Run All does not stop here.
try:
    left.join(right)
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: columns overlap but no suffix specified: Index(['key1', 'key2'], dtype='object')

In [76]:
# DataFrame.join() matches the two frames on their index labels, not on the values in their columns.
# how='left' is the default, so every row label of `left` is kept.

left.join(right, lsuffix='_left', rsuffix='_right') 

Unnamed: 0,key1_left,key2_left,lval1,key1_right,key2_right,rval1
0,a,x,0,,,
1,b,y,1,a,x,6.0
2,c,z,2,b,a,7.0


In [77]:
left.join(right, lsuffix='_left', rsuffix='_right', how = 'inner') 

Unnamed: 0,key1_left,key2_left,lval1,key1_right,key2_right,rval1
1,b,y,1,a,x,6
2,c,z,2,b,a,7


In [78]:
left.join(right, lsuffix='_left', rsuffix='_right', how = 'outer') 

Unnamed: 0,key1_left,key2_left,lval1,key1_right,key2_right,rval1
0,a,x,0.0,,,
1,b,y,1.0,a,x,6.0
2,c,z,2.0,b,a,7.0
3,,,,c,z,8.0


## 合併有重複的資料

In [79]:
a = pd.Series([np.nan, 2.5, 0, 3.5, 4.5, np.nan],
              index = ['f', 'e', 'd', 'c', 'b', 'a'])
b = pd.Series([0, np.nan, 2, np.nan, np.nan, 5],
              index = ['a', 'b', 'c', 'd', 'e', 'f'])
a

f    NaN
e    2.5
d    0.0
c    3.5
b    4.5
a    NaN
dtype: float64

In [80]:
b

a    0.0
b    NaN
c    2.0
d    NaN
e    NaN
f    5.0
dtype: float64

In [81]:
# np.where returns a plain ndarray.
# The choice is made position by position; the index labels of a and b are ignored.
np.where(b.isnull(), a, b) 

array([0. , 2.5, 2. , 3.5, 4.5, 5. ])

In [82]:
# combine_first: keep the caller's values and fill its missing entries (NaN) with the
# value found under the same label in the other object.
# Here the result happens to equal b.fillna(a). combine_first additionally extends the
# result to labels that exist only in the other object, which the DataFrame example in
# the following cells shows.
b.combine_first(a)

a    0.0
b    4.5
c    2.0
d    0.0
e    2.5
f    5.0
dtype: float64

---

In [83]:
df1 = pd.DataFrame({'a': [1., np.nan, 5., np.nan],
                    'b': [np.nan, 2., np.nan, 6.],
                    'c': range(2, 18, 4)})
df2 = pd.DataFrame({'a': [5., 4., np.nan, 3., 7.],
                    'b': [np.nan, 3., 4., 6., 8.]})
df1

Unnamed: 0,a,b,c
0,1.0,,2
1,,2.0,6
2,5.0,,10
3,,6.0,14


In [84]:
df2

Unnamed: 0,a,b
0,5.0,
1,4.0,3.0
2,,4.0
3,3.0,6.0
4,7.0,8.0


In [85]:
df1.combine_first(df2) # 逐欄做一樣的動作

Unnamed: 0,a,b,c
0,1.0,,2.0
1,4.0,2.0,6.0
2,5.0,4.0,10.0
3,3.0,6.0,14.0
4,7.0,8.0,


---

In [86]:
df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]})
df1

Unnamed: 0,A,B
0,,4.0
1,0.0,


In [87]:
df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2])
df2

Unnamed: 0,B,C
1,3,1
2,3,1


In [88]:
df1.combine_first(df2)

Unnamed: 0,A,B,C
0,,4.0,
1,0.0,3.0,1.0
2,,3.0,1.0


## 資料值與索引的樞紐操作

In [89]:
# 資料常存成堆疊格式，或稱為紀錄格式
sensor_readings = pd.read_csv('./mod07/accel.csv')
sensor_readings

Unnamed: 0,interval,axis,reading
0,0,X,0.0
1,0,Y,0.5
2,0,Z,1.0
3,1,X,0.1
4,1,Y,0.4
5,1,Z,0.9
6,2,X,0.2
7,2,Y,0.3
8,2,Z,0.8
9,3,X,0.3


In [90]:
# 如何著手找出特定軸的讀數
sensor_readings[sensor_readings['axis'] == 'X']

Unnamed: 0,interval,axis,reading
0,0,X,0.0
3,1,X,0.1
6,2,X,0.2
9,3,X,0.3


In [91]:
# 更好的辦法是使用樞紐操作
# 資料框的 .pivot() 方法的用法就是要決定 index、columns、values
# 樞紐的動作，也就是將資料框從長格式 (long format) 轉成寬格式 (wide format) 的操作
sensor_readings.pivot(index = 'interval',
                      columns = 'axis',
                      values = 'reading')

axis,X,Y,Z
interval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.0,0.5,1.0
1,0.1,0.4,0.9
2,0.2,0.3,0.8
3,0.3,0.2,0.7


---

In [92]:
# 樞紐操作其實就是在操作堆疊跟解堆疊
sensor_readings.set_index(['interval', 'axis'])

Unnamed: 0_level_0,Unnamed: 1_level_0,reading
interval,axis,Unnamed: 2_level_1
0,X,0.0
0,Y,0.5
0,Z,1.0
1,X,0.1
1,Y,0.4
1,Z,0.9
2,X,0.2
2,Y,0.3
2,Z,0.8
3,X,0.3


In [93]:
sensor_readings.set_index(['interval', 'axis']).unstack()

Unnamed: 0_level_0,reading,reading,reading
axis,X,Y,Z
interval,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,0.0,0.5,1.0
1,0.1,0.4,0.9
2,0.2,0.3,0.8
3,0.3,0.2,0.7


## 堆疊與解堆疊

In [94]:
df = pd.DataFrame({'a': [1, 2]}, index = ['one', 'two'])
df

Unnamed: 0,a
one,1
two,2


In [95]:
stacked1 = df.stack()
stacked1

one  a    1
two  a    2
dtype: int64

In [96]:
# 因為原本的資料框只有 level = 0 的欄位，堆疊後就退化成序列了
type(stacked1)

pandas.core.series.Series

In [97]:
stacked1['one']['a'] # 等於 stacked1['one', 'a']

1

In [98]:
df.unstack()

a  one    1
   two    2
dtype: int64

---

In [99]:
df = pd.DataFrame({'a': [1, 2],
                   'b': [3, 4]},
                   index = ['one', 'two'])
df

Unnamed: 0,a,b
one,1,3
two,2,4


In [100]:
stacked2 = df.stack()
stacked2

one  a    1
     b    3
two  a    2
     b    4
dtype: int64

In [101]:
type(stacked2)

pandas.core.series.Series

In [102]:
stacked2['one']['b'] # 等於 stacked2['one', 'b']

3

In [103]:
df.unstack()

a  one    1
   two    2
b  one    3
   two    4
dtype: int64

---

In [104]:
sensor_readings = pd.read_csv('./mod07/accel.csv')
sensor_readings

Unnamed: 0,interval,axis,reading
0,0,X,0.0
1,0,Y,0.5
2,0,Z,1.0
3,1,X,0.1
4,1,Y,0.4
5,1,Z,0.9
6,2,X,0.2
7,2,Y,0.3
8,2,Z,0.8
9,3,X,0.3


In [105]:
# Derive two per-user copies of the sensor readings, tagged by a new 'who' column.
# Mikael's readings are the original readings scaled by 100 so the users can be told apart.
user1 = sensor_readings.assign(who='Mike')
user2 = sensor_readings.assign(
    who='Mikael',
    reading=sensor_readings['reading'] * 100,
)

user1.head(5)

Unnamed: 0,interval,axis,reading,who
0,0,X,0.0,Mike
1,0,Y,0.5,Mike
2,0,Z,1.0,Mike
3,1,X,0.1,Mike
4,1,Y,0.4,Mike


In [106]:
user2[:5]

Unnamed: 0,interval,axis,reading,who
0,0,X,0.0,Mikael
1,0,Y,50.0,Mikael
2,0,Z,100.0,Mikael
3,1,X,10.0,Mikael
4,1,Y,40.0,Mikael


In [107]:
pd.concat([user1, user2])

Unnamed: 0,interval,axis,reading,who
0,0,X,0.0,Mike
1,0,Y,0.5,Mike
2,0,Z,1.0,Mike
3,1,X,0.1,Mike
4,1,Y,0.4,Mike
5,1,Z,0.9,Mike
6,2,X,0.2,Mike
7,2,Y,0.3,Mike
8,2,Z,0.8,Mike
9,3,X,0.3,Mike


In [108]:
multi_user_sensor_data = pd.concat([user1, user2]).set_index(['who', 'interval', 'axis'])
multi_user_sensor_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,reading
who,interval,axis,Unnamed: 3_level_1
Mike,0,X,0.0
Mike,0,Y,0.5
Mike,0,Z,1.0
Mike,1,X,0.1
Mike,1,Y,0.4
Mike,1,Z,0.9
Mike,2,X,0.2
Mike,2,Y,0.3
Mike,2,Z,0.8
Mike,3,X,0.3


In [109]:
multi_user_sensor_data.loc['Mike']

Unnamed: 0_level_0,Unnamed: 1_level_0,reading
interval,axis,Unnamed: 2_level_1
0,X,0.0
0,Y,0.5
0,Z,1.0
1,X,0.1
1,Y,0.4
1,Z,0.9
2,X,0.2
2,Y,0.3
2,Z,0.8
3,X,0.3


In [110]:
# 在 mod03 有遇到過，複習一下!!
multi_user_sensor_data.xs(1, level = 'interval')

Unnamed: 0_level_0,Unnamed: 1_level_0,reading
who,axis,Unnamed: 2_level_1
Mike,X,0.1
Mike,Y,0.4
Mike,Z,0.9
Mikael,X,10.0
Mikael,Y,40.0
Mikael,Z,90.0


In [111]:
multi_user_sensor_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,reading
who,interval,axis,Unnamed: 3_level_1
Mike,0,X,0.0
Mike,0,Y,0.5
Mike,0,Z,1.0
Mike,1,X,0.1
Mike,1,Y,0.4
Mike,1,Z,0.9
Mike,2,X,0.2
Mike,2,Y,0.3
Mike,2,Z,0.8
Mike,3,X,0.3


In [112]:
# 解堆疊預設的是 level = -1，也就是最裡面的 level，在這題就是 level = 2
# 我們可以把欄位想成是我們想要觀察的主角，這裡就是想知道 X、Y、Z 的 reading
multi_user_sensor_data.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,reading,reading,reading
Unnamed: 0_level_1,axis,X,Y,Z
who,interval,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Mikael,0,0.0,50.0,100.0
Mikael,1,10.0,40.0,90.0
Mikael,2,20.0,30.0,80.0
Mikael,3,30.0,20.0,70.0
Mike,0,0.0,0.5,1.0
Mike,1,0.1,0.4,0.9
Mike,2,0.2,0.3,0.8
Mike,3,0.3,0.2,0.7


---

In [113]:
multi_user_sensor_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,reading
who,interval,axis,Unnamed: 3_level_1
Mike,0,X,0.0
Mike,0,Y,0.5
Mike,0,Z,1.0
Mike,1,X,0.1
Mike,1,Y,0.4
Mike,1,Z,0.9
Mike,2,X,0.2
Mike,2,Y,0.3
Mike,2,Z,0.8
Mike,3,X,0.3


In [114]:
# 我們可以把欄位想成是我們想要觀察的主角，這裡就是想知道 Mikael 與 Mike 分別收集的 reading
multi_user_sensor_data.unstack(level = 0)

Unnamed: 0_level_0,Unnamed: 1_level_0,reading,reading
Unnamed: 0_level_1,who,Mikael,Mike
interval,axis,Unnamed: 2_level_2,Unnamed: 3_level_2
0,X,0.0,0.0
0,Y,50.0,0.5
0,Z,100.0,1.0
1,X,10.0,0.1
1,Y,40.0,0.4
1,Z,90.0,0.9
2,X,20.0,0.2
2,Y,30.0,0.3
2,Z,80.0,0.8
3,X,30.0,0.3


In [115]:
multi_user_sensor_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,reading
who,interval,axis,Unnamed: 3_level_1
Mike,0,X,0.0
Mike,0,Y,0.5
Mike,0,Z,1.0
Mike,1,X,0.1
Mike,1,Y,0.4
Mike,1,Z,0.9
Mike,2,X,0.2
Mike,2,Y,0.3
Mike,2,Z,0.8
Mike,3,X,0.3


In [116]:
unstacked = multi_user_sensor_data.unstack(['who', 'axis'])
unstacked

Unnamed: 0_level_0,reading,reading,reading,reading,reading,reading
who,Mike,Mike,Mike,Mikael,Mikael,Mikael
axis,X,Y,Z,X,Y,Z
interval,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
0,0.0,0.5,1.0,0.0,50.0,100.0
1,0.1,0.4,0.9,10.0,40.0,90.0
2,0.2,0.3,0.8,20.0,30.0,80.0
3,0.3,0.2,0.7,30.0,20.0,70.0


In [117]:
unstacked.stack(level = 'who')

Unnamed: 0_level_0,Unnamed: 1_level_0,reading,reading,reading
Unnamed: 0_level_1,axis,X,Y,Z
interval,who,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,Mikael,0.0,50.0,100.0
0,Mike,0.0,0.5,1.0
1,Mikael,10.0,40.0,90.0
1,Mike,0.1,0.4,0.9
2,Mikael,20.0,30.0,80.0
2,Mike,0.2,0.3,0.8
3,Mikael,30.0,20.0,70.0
3,Mike,0.3,0.2,0.7


---

In [118]:
data = pd.DataFrame({'Name': ['Mike', 'Mikael'],
                     'Height': [6.1, 6.0],
                     'Weight': [220, 185]})
data

Unnamed: 0,Name,Height,Weight
0,Mike,6.1,220
1,Mikael,6.0,185


In [119]:
pd.melt(data, 
        id_vars = 'Name',
        value_vars = ['Height', 'Weight'])

Unnamed: 0,Name,variable,value
0,Mike,Height,6.1
1,Mikael,Height,6.0
2,Mike,Weight,220.0
3,Mikael,Weight,185.0


---

In [120]:
sensor_readings = pd.read_csv('./mod07/accel.csv')
sensor_readings

Unnamed: 0,interval,axis,reading
0,0,X,0.0
1,0,Y,0.5
2,0,Z,1.0
3,1,X,0.1
4,1,Y,0.4
5,1,Z,0.9
6,2,X,0.2
7,2,Y,0.3
8,2,Z,0.8
9,3,X,0.3


In [121]:
pivoted = sensor_readings.pivot(index = 'interval', columns = 'axis', values = 'reading')
pivoted

axis,X,Y,Z
interval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.0,0.5,1.0
1,0.1,0.4,0.9
2,0.2,0.3,0.8
3,0.3,0.2,0.7


In [122]:
# 要融解前要把 interval 放回欄位，不然會產生例外
pivoted = pivoted.reset_index()
pivoted

axis,interval,X,Y,Z
0,0,0.0,0.5,1.0
1,1,0.1,0.4,0.9
2,2,0.2,0.3,0.8
3,3,0.3,0.2,0.7


In [123]:
pd.melt(pivoted, id_vars = 'interval', value_vars = ['X', 'Y', 'Z'] )

Unnamed: 0,interval,axis,value
0,0,X,0.0
1,1,X,0.1
2,2,X,0.2
3,3,X,0.3
4,0,Y,0.5
5,1,Y,0.4
6,2,Y,0.3
7,3,Y,0.2
8,0,Z,1.0
9,1,Z,0.9


In [124]:
# Melt back to long format, then order the rows by interval.
# A stable sort is requested explicitly: it keeps the X, Y, Z order produced by melt
# within each interval, which the default (non-stable) quicksort does not guarantee.
pd.melt(pivoted, id_vars = 'interval', value_vars = ['X', 'Y', 'Z'] ).sort_values(by = 'interval', kind = 'mergesort')

Unnamed: 0,interval,axis,value
0,0,X,0.0
4,0,Y,0.5
8,0,Z,1.0
1,1,X,0.1
5,1,Y,0.4
9,1,Z,0.9
2,2,X,0.2
6,2,Y,0.3
10,2,Z,0.8
3,3,X,0.3


## 綜合應用

In [125]:
# df1 repeats its keys (many rows per key); df2 lists each key once.
# 'c' occurs only in df1 and 'd' only in df2.
df1 = pd.DataFrame(dict(key=list('bbacaab'), data1=range(7)))
df2 = pd.DataFrame(dict(key=list('abd'), data2=range(3)))
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [126]:
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [127]:
# 當然也可以用資料框的 .merge() 方法
pd.merge(df1, df2)

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [128]:
pd.merge(df1, df2, on = 'key')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


---

In [129]:
df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})
df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'],
                    'data2': range(3)})

df3

Unnamed: 0,lkey,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [130]:
df4

Unnamed: 0,rkey,data2
0,a,0
1,b,1
2,d,2


In [131]:
pd.merge(df3, df4, left_on = 'lkey', right_on = 'rkey')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


---

In [132]:
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [133]:
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [134]:
pd.merge(df1, df2, how = 'outer')

Unnamed: 0,key,data1,data2
0,b,0.0,1.0
1,b,1.0,1.0
2,b,6.0,1.0
3,a,2.0,0.0
4,a,4.0,0.0
5,a,5.0,0.0
6,c,3.0,
7,d,,2.0


---

In [135]:
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'data1': range(6)})
df2 = pd.DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],
                    'data2': range(5)})
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [136]:
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,a,2
3,b,3
4,d,4


In [137]:
pd.merge(df1, df2, on = 'key', how = 'left')

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,0,3.0
2,b,1,1.0
3,b,1,3.0
4,a,2,0.0
5,a,2,2.0
6,c,3,
7,a,4,0.0
8,a,4,2.0
9,b,5,1.0


In [138]:
pd.merge(df1, df2, on = 'key', how = 'inner')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,0,3
2,b,1,1
3,b,1,3
4,b,5,1
5,b,5,3
6,a,2,0
7,a,2,2
8,a,4,0
9,a,4,2


---

In [139]:
left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'],
                     'key2': ['one', 'two', 'one'],
                     'lval': [1, 2, 3]})
right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                      'key2': ['one', 'one', 'one', 'two'],
                      'rval': [4, 5, 6, 7]})
left

Unnamed: 0,key1,key2,lval
0,foo,one,1
1,foo,two,2
2,bar,one,3


In [140]:
right

Unnamed: 0,key1,key2,rval
0,foo,one,4
1,foo,one,5
2,bar,one,6
3,bar,two,7


In [141]:
pd.merge(left, right, on = ['key1', 'key2'], how = 'outer')

Unnamed: 0,key1,key2,lval,rval
0,foo,one,1.0,4.0
1,foo,one,1.0,5.0
2,foo,two,2.0,
3,bar,one,3.0,6.0
4,bar,two,,7.0


In [142]:
pd.merge(left, right, on = 'key1')

Unnamed: 0,key1,key2_x,lval,key2_y,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


In [143]:
pd.merge(left, right, on = 'key1', suffixes = ['_left', '_right'])

Unnamed: 0,key1,key2_left,lval,key2_right,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


---

In [144]:
left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],
                      'value': range(6)})
right1 = pd.DataFrame({'group_val': [3.5, 7]}, 
                       index = ['a', 'b'])
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [145]:
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [146]:
# 一邊選欄位，一邊選索引標籤做合併
pd.merge(left1, right1, left_on = 'key', right_index = True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


In [147]:
pd.merge(left1, right1, left_on = 'key', right_index = True, how = 'outer')

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0
5,c,5,


---

In [148]:
# Left side: the composite key (state, year) is stored in the ordinary columns key1/key2.
lefth = pd.DataFrame({
    'key1': ['Ohio'] * 3 + ['Nevada'] * 2,
    'key2': [2000, 2001, 2002, 2001, 2002],
    'data': np.arange(5, dtype=float),
})
# Right side: the same kind of composite key forms a two-level row index;
# the pair ('Ohio', 2000) occurs twice.
righth = pd.DataFrame(
    np.arange(12).reshape(6, 2),
    index=pd.MultiIndex.from_arrays([
        ['Nevada'] * 2 + ['Ohio'] * 4,
        [2001, 2000, 2000, 2000, 2001, 2002],
    ]),
    columns=['event1', 'event2'],
)
lefth

Unnamed: 0,key1,key2,data
0,Ohio,2000,0.0
1,Ohio,2001,1.0
2,Ohio,2002,2.0
3,Nevada,2001,3.0
4,Nevada,2002,4.0


In [149]:
righth

Unnamed: 0,Unnamed: 1,event1,event2
Nevada,2001,0,1
Nevada,2000,2,3
Ohio,2000,4,5
Ohio,2000,6,7
Ohio,2001,8,9
Ohio,2002,10,11


In [150]:
pd.merge(lefth, righth, left_on = ['key1', 'key2'], right_index = True)

Unnamed: 0,key1,key2,data,event1,event2
0,Ohio,2000,0.0,4,5
0,Ohio,2000,0.0,6,7
1,Ohio,2001,1.0,8,9
2,Ohio,2002,2.0,10,11
3,Nevada,2001,3.0,0,1


In [151]:
pd.merge(lefth, righth, left_on = ['key1', 'key2'], right_index = True, how = 'outer')

Unnamed: 0,key1,key2,data,event1,event2
0,Ohio,2000,0.0,4.0,5.0
0,Ohio,2000,0.0,6.0,7.0
1,Ohio,2001,1.0,8.0,9.0
2,Ohio,2002,2.0,10.0,11.0
3,Nevada,2001,3.0,0.0,1.0
4,Nevada,2002,4.0,,
4,Nevada,2000,,2.0,3.0


---

In [152]:
left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],
                     index=['a', 'c', 'e'],
                     columns=['Ohio', 'Nevada'])
right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                      index=['b', 'c', 'd', 'e'],
                      columns=['Missouri', 'Alabama'])
left2

Unnamed: 0,Ohio,Nevada
a,1.0,2.0
c,3.0,4.0
e,5.0,6.0


In [153]:
right2

Unnamed: 0,Missouri,Alabama
b,7.0,8.0
c,9.0,10.0
d,11.0,12.0
e,13.0,14.0


In [154]:
pd.merge(left2, right2, left_index = True, right_index = True, how = 'outer')

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


---

In [155]:
# DataFrame.join() matches the two frames on their index labels, not on the values in their columns.
# Its default is how='left'; how='outer' is passed here so that the result is the same
# as that of the pd.merge(..., left_index=True, right_index=True, how='outer') call in the previous cell.
# .join() is a reduced, index-oriented form of .merge().
left2.join(right2, how = 'outer')

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


---

In [156]:
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [157]:
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [158]:
# DataFrame.join() matches on index labels (how='left' by default), not on column values.
# left1 is labelled 0..5 and right1 is labelled 'a'/'b', so no label matches and
# group_val is missing in every row.
left1.join(right1)

Unnamed: 0,key,value,group_val
0,a,0,
1,b,1,
2,a,2,
3,a,3,
4,b,4,
5,c,5,


In [159]:
# The on= parameter names a column of the calling frame (left1) to use as its join key.
# left1 is therefore matched by its 'key' column, while right1 is still matched by its index labels.
left1.join(right1, on = 'key')

Unnamed: 0,key,value,group_val
0,a,0,3.5
1,b,1,7.0
2,a,2,3.5
3,a,3,3.5
4,b,4,7.0
5,c,5,


In [160]:
# on= refers to a column of the calling frame. right1 has no 'key' column, so this
# raises KeyError. The error is the point of this cell; catch and print it so that
# Restart & Run All does not stop here.
try:
    right1.join(left1, on = 'key')
except KeyError as err:
    print(f'KeyError: {err}')

KeyError: 'key'

---

In [161]:
left2

Unnamed: 0,Ohio,Nevada
a,1.0,2.0
c,3.0,4.0
e,5.0,6.0


In [162]:
right2

Unnamed: 0,Missouri,Alabama
b,7.0,8.0
c,9.0,10.0
d,11.0,12.0
e,13.0,14.0


In [163]:
another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
                       index = ['a', 'c', 'e', 'f'],
                       columns = ['New York', 'Oregon'])
another

Unnamed: 0,New York,Oregon
a,7.0,8.0
c,9.0,10.0
e,11.0,12.0
f,16.0,17.0


In [164]:
left2.join([right2, another])

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1.0,2.0,,,7.0,8.0
c,3.0,4.0,9.0,10.0,9.0,10.0
e,5.0,6.0,13.0,14.0,11.0,12.0


In [165]:
left2.join([right2, another], how = 'outer')

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1.0,2.0,,,7.0,8.0
c,3.0,4.0,9.0,10.0,9.0,10.0
e,5.0,6.0,13.0,14.0,11.0,12.0
b,,,7.0,8.0,,
d,,,11.0,12.0,,
f,,,,,16.0,17.0


---

In [166]:
arr = np.arange(12).reshape(3, 4)
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [167]:
# 這是 numpy 的 np.concatenate() 函式，作用在 ndarray 上
np.concatenate([arr, arr], axis = 1)

array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

---

In [168]:
df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index = ['a', 'b', 'c'],
                   columns = ['one', 'two'])
df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index = ['a', 'c'],
                   columns = ['three', 'four'])
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [169]:
df2

Unnamed: 0,three,four
a,5,6
c,7,8


In [170]:
pd.concat([df1, df2], axis = 1, keys = ['level1', 'level2'])

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [171]:
# 另一種給資料來源的方式
pd.concat({'level1': df1, 'level2': df2 }, axis = 1)

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [172]:
pd.concat([df1, df2], axis = 1, keys = ['level1', 'level2'], names = [ 'upper', 'lower'])

upper,level1,level1,level2,level2
lower,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


---

In [173]:
# Draw the demo values from a locally seeded generator so that the frames (and every
# output derived from them) are identical on each Restart & Run All.
rng = np.random.default_rng(42)
# df2 has fewer columns than df1 and lists them in a different order (b, d, a);
# it has no column 'c'.
df1 = pd.DataFrame(rng.standard_normal((3, 4)), columns = ['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(rng.standard_normal((2, 3)), columns = ['b', 'd', 'a'])
df1

Unnamed: 0,a,b,c,d
0,0.427692,-0.782577,-0.374397,1.196809
1,1.110701,0.580273,-1.16375,-1.096008
2,-1.301159,1.194308,-0.325053,-1.004583


In [174]:
df2

Unnamed: 0,b,d,a
0,0.104476,0.539977,-0.020649
1,1.059525,-0.132106,1.020243


In [175]:
pd.concat([df1, df2], ignore_index = True)

Unnamed: 0,a,b,c,d
0,0.427692,-0.782577,-0.374397,1.196809
1,1.110701,0.580273,-1.16375,-1.096008
2,-1.301159,1.194308,-0.325053,-1.004583
3,-0.020649,0.104476,,0.539977
4,1.020243,1.059525,,-0.132106


---

In [176]:
data = pd.DataFrame(np.arange(6).reshape((2, 3)),
                    index = pd.Index(['Ohio', 'Colorado'], name = 'state'),
                    columns = pd.Index(['one', 'two', 'three'], name = 'number'))
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [177]:
result = data.stack()
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

In [178]:
result.unstack()

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [179]:
result.unstack(0)

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [180]:
result.unstack('state')

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


---

In [181]:
# Two Series whose labels overlap on 'c' and 'd'.
s1 = pd.Series(range(4), index=list('abcd'))
s2 = pd.Series(range(4, 7), index=list('cde'))
# Passing a mapping to concat uses its keys as the outer index level,
# the same as passing a list together with keys=['one', 'two'].
data2 = pd.concat({'one': s1, 'two': s2})
data2

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64

In [182]:
data2.unstack()

Unnamed: 0,a,b,c,d,e
one,0.0,1.0,2.0,3.0,
two,,,4.0,5.0,6.0


In [183]:
data2.unstack().stack() # here .stack() drops the missing values that unstack() introduced

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64

In [184]:
data2.unstack().stack(dropna = False)

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64

---

In [185]:
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

In [186]:
# pd.Index() 函式有參數 name 可以用
df = pd.DataFrame({'left': result, 'right': result + 5},
                  columns = pd.Index(['left', 'right'], name = 'side'))
df

Unnamed: 0_level_0,side,left,right
state,number,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,one,0,5
Ohio,two,1,6
Ohio,three,2,7
Colorado,one,3,8
Colorado,two,4,9
Colorado,three,5,10


In [187]:
df.unstack('state')

side,left,left,right,right
state,Ohio,Colorado,Ohio,Colorado
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


In [188]:
df.unstack('state').stack('side')

Unnamed: 0_level_0,state,Colorado,Ohio
number,side,Unnamed: 2_level_1,Unnamed: 3_level_1
one,left,3,0
one,right,8,5
two,left,4,1
two,right,9,6
three,left,5,2
three,right,10,7


---

In [189]:
data = pd.read_csv('./mod07/macrodata.csv')
data.head()

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
0,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0
1,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74
2,1959.0,3.0,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09
3,1959.0,4.0,2785.204,1753.7,299.356,484.052,1931.3,29.37,140.0,4.33,5.6,179.386,0.27,4.06
4,1960.0,1.0,2847.699,1770.5,331.722,462.199,1955.5,29.54,139.6,3.5,5.2,180.007,2.31,1.19


In [190]:
# Build a quarterly PeriodIndex named 'date' from the year and quarter columns.
# NOTE(review): recent pandas releases deprecate passing year/quarter fields to
# the PeriodIndex constructor (PeriodIndex.from_fields is the replacement) —
# confirm against the pandas version this notebook targets.
periods = pd.PeriodIndex(year = data.year, quarter = data.quarter, name = 'date')
# Column Index named 'item' listing the three series selected in the next cells.
columns = pd.Index(['realgdp', 'infl', 'unemp'], name = 'item')
periods

PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
             '1960Q3', '1960Q4', '1961Q1', '1961Q2',
             ...
             '2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
             '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
            dtype='period[Q-DEC]', name='date', length=203)

In [191]:
# Inspect the named column Index.
columns

Index(['realgdp', 'infl', 'unemp'], dtype='object', name='item')

In [192]:
# Keep only the columns listed in `columns`; the Index name 'item' becomes the
# name of the column axis.
data = data.reindex(columns = columns)
data

item,realgdp,infl,unemp
0,2710.349,0.00,5.8
1,2778.801,2.34,5.1
2,2775.488,2.74,5.3
3,2785.204,0.27,5.6
4,2847.699,2.31,5.2
...,...,...,...
198,13324.600,-3.16,6.0
199,13141.920,-8.79,6.9
200,12925.410,0.94,8.1
201,12901.504,3.37,9.2


In [193]:
# Replace the integer row labels with timestamps derived from the periods.
# how = 'end' selects the last instant of each quarter, which is why the
# displayed labels carry a 23:59:59.999999999 time component.
data.index = periods.to_timestamp(freq = 'D', how = 'end')
data

item,realgdp,infl,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-03-31 23:59:59.999999999,2710.349,0.00,5.8
1959-06-30 23:59:59.999999999,2778.801,2.34,5.1
1959-09-30 23:59:59.999999999,2775.488,2.74,5.3
1959-12-31 23:59:59.999999999,2785.204,0.27,5.6
1960-03-31 23:59:59.999999999,2847.699,2.31,5.2
...,...,...,...
2008-09-30 23:59:59.999999999,13324.600,-3.16,6.0
2008-12-31 23:59:59.999999999,13141.920,-8.79,6.9
2009-03-31 23:59:59.999999999,12925.410,0.94,8.1
2009-06-30 23:59:59.999999999,12901.504,3.37,9.2


---

In [194]:
# Stack the 'item' columns into an inner index level, giving one long Series
# indexed by (date, item).
data.stack()

date                           item   
1959-03-31 23:59:59.999999999  realgdp     2710.349
                               infl           0.000
                               unemp          5.800
1959-06-30 23:59:59.999999999  realgdp     2778.801
                               infl           2.340
                                            ...    
2009-06-30 23:59:59.999999999  infl           3.370
                               unemp          9.200
2009-09-30 23:59:59.999999999  realgdp    12990.341
                               infl           3.560
                               unemp          9.600
Length: 609, dtype: float64

In [195]:
# Turn both index levels into ordinary columns; the unnamed values column is
# labelled 0.
data.stack().reset_index()

Unnamed: 0,date,item,0
0,1959-03-31 23:59:59.999999999,realgdp,2710.349
1,1959-03-31 23:59:59.999999999,infl,0.000
2,1959-03-31 23:59:59.999999999,unemp,5.800
3,1959-06-30 23:59:59.999999999,realgdp,2778.801
4,1959-06-30 23:59:59.999999999,infl,2.340
...,...,...,...
604,2009-06-30 23:59:59.999999999,infl,3.370
605,2009-06-30 23:59:59.999999999,unemp,9.200
606,2009-09-30 23:59:59.999999999,realgdp,12990.341
607,2009-09-30 23:59:59.999999999,infl,3.560


In [196]:
# Long format: one row per (date, item) pair, with the stacked values placed
# in a column called 'value'.
ldata = data.stack().reset_index(name = 'value')

In [197]:
# Display the long-format frame.
ldata

Unnamed: 0,date,item,value
0,1959-03-31 23:59:59.999999999,realgdp,2710.349
1,1959-03-31 23:59:59.999999999,infl,0.000
2,1959-03-31 23:59:59.999999999,unemp,5.800
3,1959-06-30 23:59:59.999999999,realgdp,2778.801
4,1959-06-30 23:59:59.999999999,infl,2.340
...,...,...,...
604,2009-06-30 23:59:59.999999999,infl,3.370
605,2009-06-30 23:59:59.999999999,unemp,9.200
606,2009-09-30 23:59:59.999999999,realgdp,12990.341
607,2009-09-30 23:59:59.999999999,infl,3.560


---

In [198]:
# Work on a copy so `data` keeps its date index; reset_index turns 'date'
# into a regular column of the copy.
data_cp = data.copy().reset_index()
data_cp

item,date,realgdp,infl,unemp
0,1959-03-31 23:59:59.999999999,2710.349,0.00,5.8
1,1959-06-30 23:59:59.999999999,2778.801,2.34,5.1
2,1959-09-30 23:59:59.999999999,2775.488,2.74,5.3
3,1959-12-31 23:59:59.999999999,2785.204,0.27,5.6
4,1960-03-31 23:59:59.999999999,2847.699,2.31,5.2
...,...,...,...,...
198,2008-09-30 23:59:59.999999999,13324.600,-3.16,6.0
199,2008-12-31 23:59:59.999999999,13141.920,-8.79,6.9
200,2009-03-31 23:59:59.999999999,12925.410,0.94,8.1
201,2009-06-30 23:59:59.999999999,12901.504,3.37,9.2


In [199]:
# Melt the wide frame to long format: 'date' is kept as the identifier and the
# three listed columns are folded into (item, value) pairs.
pd.melt(data_cp, id_vars = 'date', value_vars = ['realgdp', 'infl', 'unemp'])

Unnamed: 0,date,item,value
0,1959-03-31 23:59:59.999999999,realgdp,2710.349
1,1959-06-30 23:59:59.999999999,realgdp,2778.801
2,1959-09-30 23:59:59.999999999,realgdp,2775.488
3,1959-12-31 23:59:59.999999999,realgdp,2785.204
4,1960-03-31 23:59:59.999999999,realgdp,2847.699
...,...,...,...
604,2008-09-30 23:59:59.999999999,unemp,6.000
605,2008-12-31 23:59:59.999999999,unemp,6.900
606,2009-03-31 23:59:59.999999999,unemp,8.100
607,2009-06-30 23:59:59.999999999,unemp,9.200


In [200]:
# Melt to long format, then order the rows by date.
# The sort carries the original integer labels along with the rows; the index
# can be replaced afterwards.
my_df = (
    pd.melt(data_cp, id_vars = 'date', value_vars = ['realgdp', 'infl', 'unemp'])
      .sort_values(by = 'date')
)
my_df

Unnamed: 0,date,item,value
0,1959-03-31 23:59:59.999999999,realgdp,2710.349
203,1959-03-31 23:59:59.999999999,infl,0.000
406,1959-03-31 23:59:59.999999999,unemp,5.800
1,1959-06-30 23:59:59.999999999,realgdp,2778.801
204,1959-06-30 23:59:59.999999999,infl,2.340
...,...,...,...
404,2009-06-30 23:59:59.999999999,infl,3.370
201,2009-06-30 23:59:59.999999999,realgdp,12901.504
202,2009-09-30 23:59:59.999999999,realgdp,12990.341
405,2009-09-30 23:59:59.999999999,infl,3.560


In [201]:
# Discard the out-of-order labels left by the sort and number the rows from 0.
# The result is only displayed; my_df itself is not reassigned.
my_df.reset_index(drop = True)

Unnamed: 0,date,item,value
0,1959-03-31 23:59:59.999999999,realgdp,2710.349
1,1959-03-31 23:59:59.999999999,infl,0.000
2,1959-03-31 23:59:59.999999999,unemp,5.800
3,1959-06-30 23:59:59.999999999,realgdp,2778.801
4,1959-06-30 23:59:59.999999999,infl,2.340
...,...,...,...
604,2009-06-30 23:59:59.999999999,infl,3.370
605,2009-06-30 23:59:59.999999999,realgdp,12901.504
606,2009-09-30 23:59:59.999999999,realgdp,12990.341
607,2009-09-30 23:59:59.999999999,infl,3.560


---

In [202]:
# Long-format input for the pivot that follows.
ldata

Unnamed: 0,date,item,value
0,1959-03-31 23:59:59.999999999,realgdp,2710.349
1,1959-03-31 23:59:59.999999999,infl,0.000
2,1959-03-31 23:59:59.999999999,unemp,5.800
3,1959-06-30 23:59:59.999999999,realgdp,2778.801
4,1959-06-30 23:59:59.999999999,infl,2.340
...,...,...,...
604,2009-06-30 23:59:59.999999999,infl,3.370
605,2009-06-30 23:59:59.999999999,unemp,9.200
606,2009-09-30 23:59:59.999999999,realgdp,12990.341
607,2009-09-30 23:59:59.999999999,infl,3.560


In [203]:
pivoted = ldata.pivot(index = 'date', columns = 'item', values = 'value') # pivot the long format into a wide format
pivoted

item,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-03-31 23:59:59.999999999,0.00,2710.349,5.8
1959-06-30 23:59:59.999999999,2.34,2778.801,5.1
1959-09-30 23:59:59.999999999,2.74,2775.488,5.3
1959-12-31 23:59:59.999999999,0.27,2785.204,5.6
1960-03-31 23:59:59.999999999,2.31,2847.699,5.2
...,...,...,...
2008-09-30 23:59:59.999999999,-3.16,13324.600,6.0
2008-12-31 23:59:59.999999999,-8.79,13141.920,6.9
2009-03-31 23:59:59.999999999,0.94,12925.410,8.1
2009-06-30 23:59:59.999999999,3.37,12901.504,9.2


In [204]:
# Add a second value column of standard-normal noise so the following cells
# can pivot more than one value column at a time.
# A dedicated, seeded RandomState makes the numbers identical on every
# Restart & Run All without touching NumPy's global random state.
ldata['value2'] = np.random.RandomState(12345).randn(len(ldata))
ldata.head(10)

Unnamed: 0,date,item,value,value2
0,1959-03-31 23:59:59.999999999,realgdp,2710.349,-0.553177
1,1959-03-31 23:59:59.999999999,infl,0.0,-1.131478
2,1959-03-31 23:59:59.999999999,unemp,5.8,-0.206069
3,1959-06-30 23:59:59.999999999,realgdp,2778.801,0.840009
4,1959-06-30 23:59:59.999999999,infl,2.34,-0.389901
5,1959-06-30 23:59:59.999999999,unemp,5.1,1.643061
6,1959-09-30 23:59:59.999999999,realgdp,2775.488,-0.914611
7,1959-09-30 23:59:59.999999999,infl,2.74,-0.408285
8,1959-09-30 23:59:59.999999999,unemp,5.3,-0.137281
9,1959-12-31 23:59:59.999999999,realgdp,2785.204,2.201281


In [205]:
# With `values` left out, every remaining column is pivoted, producing
# two-level columns (value / value2 on top, item underneath).
pivoted = ldata.pivot(index = 'date', columns = 'item') # values may be omitted
pivoted[:10]

Unnamed: 0_level_0,value,value,value,value2,value2,value2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-03-31 23:59:59.999999999,0.0,2710.349,5.8,-1.131478,-0.553177,-0.206069
1959-06-30 23:59:59.999999999,2.34,2778.801,5.1,-0.389901,0.840009,1.643061
1959-09-30 23:59:59.999999999,2.74,2775.488,5.3,-0.408285,-0.914611,-0.137281
1959-12-31 23:59:59.999999999,0.27,2785.204,5.6,0.190345,2.201281,0.451095
1960-03-31 23:59:59.999999999,2.31,2847.699,5.2,-0.649373,-1.572722,2.236344
1960-06-30 23:59:59.999999999,0.14,2834.39,5.2,-0.055455,-1.285118,0.600856
1960-09-30 23:59:59.999999999,2.7,2839.022,5.6,0.544794,-0.600085,2.452978
1960-12-31 23:59:59.999999999,1.21,2802.616,6.3,0.074479,-1.063593,-0.861568
1961-03-31 23:59:59.999999999,-0.4,2819.264,6.8,0.263003,-0.126695,0.484215
1961-06-30 23:59:59.999999999,1.47,2872.005,7.0,0.595578,0.224938,-1.226792


In [206]:
# Select the 'value' block from the two-level columns and show its first five rows.
pivoted['value'][:5]

item,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-03-31 23:59:59.999999999,0.0,2710.349,5.8
1959-06-30 23:59:59.999999999,2.34,2778.801,5.1
1959-09-30 23:59:59.999999999,2.74,2775.488,5.3
1959-12-31 23:59:59.999999999,0.27,2785.204,5.6
1960-03-31 23:59:59.999999999,2.31,2847.699,5.2


---

In [207]:
# ldata now carries the extra 'value2' column.
ldata

Unnamed: 0,date,item,value,value2
0,1959-03-31 23:59:59.999999999,realgdp,2710.349,-0.553177
1,1959-03-31 23:59:59.999999999,infl,0.000,-1.131478
2,1959-03-31 23:59:59.999999999,unemp,5.800,-0.206069
3,1959-06-30 23:59:59.999999999,realgdp,2778.801,0.840009
4,1959-06-30 23:59:59.999999999,infl,2.340,-0.389901
...,...,...,...,...
604,2009-06-30 23:59:59.999999999,infl,3.370,-1.021488
605,2009-06-30 23:59:59.999999999,unemp,9.200,-1.373707
606,2009-09-30 23:59:59.999999999,realgdp,12990.341,-0.066042
607,2009-09-30 23:59:59.999999999,infl,3.560,0.742919


In [208]:
# Use (date, item) as a two-level row index.
ldata.set_index(['date', 'item'])

Unnamed: 0_level_0,Unnamed: 1_level_0,value,value2
date,item,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-03-31 23:59:59.999999999,realgdp,2710.349,-0.553177
1959-03-31 23:59:59.999999999,infl,0.000,-1.131478
1959-03-31 23:59:59.999999999,unemp,5.800,-0.206069
1959-06-30 23:59:59.999999999,realgdp,2778.801,0.840009
1959-06-30 23:59:59.999999999,infl,2.340,-0.389901
...,...,...,...
2009-06-30 23:59:59.999999999,infl,3.370,-1.021488
2009-06-30 23:59:59.999999999,unemp,9.200,-1.373707
2009-09-30 23:59:59.999999999,realgdp,12990.341,-0.066042
2009-09-30 23:59:59.999999999,infl,3.560,0.742919


In [209]:
unstacked = ldata.set_index(['date', 'item']).unstack('item') # equivalent to the .pivot() call above
unstacked

Unnamed: 0_level_0,value,value,value,value2,value2,value2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-03-31 23:59:59.999999999,0.00,2710.349,5.8,-1.131478,-0.553177,-0.206069
1959-06-30 23:59:59.999999999,2.34,2778.801,5.1,-0.389901,0.840009,1.643061
1959-09-30 23:59:59.999999999,2.74,2775.488,5.3,-0.408285,-0.914611,-0.137281
1959-12-31 23:59:59.999999999,0.27,2785.204,5.6,0.190345,2.201281,0.451095
1960-03-31 23:59:59.999999999,2.31,2847.699,5.2,-0.649373,-1.572722,2.236344
...,...,...,...,...,...,...
2008-09-30 23:59:59.999999999,-3.16,13324.600,6.0,-0.132448,-0.662626,-1.021218
2008-12-31 23:59:59.999999999,-8.79,13141.920,6.9,1.527985,-1.342494,1.281614
2009-03-31 23:59:59.999999999,0.94,12925.410,8.1,-0.079319,0.854617,-0.706719
2009-06-30 23:59:59.999999999,3.37,12901.504,9.2,-1.021488,-1.067597,-1.373707


---

In [210]:
# Small wide-format frame: 'key' identifies each row, A/B/C hold the numbers.
df = pd.DataFrame(
    [('foo', 1, 4, 7),
     ('bar', 2, 5, 8),
     ('baz', 3, 6, 9)],
    columns = ['key', 'A', 'B', 'C'],
)
df

Unnamed: 0,key,A,B,C
0,foo,1,4,7
1,bar,2,5,8
2,baz,3,6,9


In [211]:
# Reshape from wide format to long format.
# With value_vars omitted, every column other than the id_vars is melted.
melted = df.melt(id_vars = ['key'])
melted

Unnamed: 0,key,variable,value
0,foo,A,1
1,bar,A,2
2,baz,A,3
3,foo,B,4
4,bar,B,5
5,baz,B,6
6,foo,C,7
7,bar,C,8
8,baz,C,9


---

In [212]:
# Pivot the long frame back to wide form: one row per 'key', one column per
# 'variable', cells filled from 'value'.
# The arguments are passed by keyword because pandas 2.0 made pivot()'s
# parameters keyword-only; the positional form raises TypeError there.
reshaped = melted.pivot(index = 'key', columns = 'variable', values = 'value')
reshaped

variable,A,B,C
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,2,5,8
baz,3,6,9
foo,1,4,7


In [213]:
# Move 'key' back from the index into a regular column; the column axis keeps
# its 'variable' name.
reshaped.reset_index()

variable,key,A,B,C
0,bar,2,5,8
1,baz,3,6,9
2,foo,1,4,7


---

In [214]:
# Show the wide frame again.
df

Unnamed: 0,key,A,B,C
0,foo,1,4,7
1,bar,2,5,8
2,baz,3,6,9


In [215]:
# Melt only columns A and B; column C is left out of the result.
pd.melt(df, id_vars = ['key'], value_vars = ['A', 'B'])

Unnamed: 0,key,variable,value
0,foo,A,1
1,bar,A,2
2,baz,A,3
3,foo,B,4
4,bar,B,5
5,baz,B,6


In [216]:
# Without id_vars the result has no identifier column.
pd.melt(df, value_vars = ['A', 'B', 'C'])

Unnamed: 0,variable,value
0,A,1
1,A,2
2,A,3
3,B,4
4,B,5
5,B,6
6,C,7
7,C,8
8,C,9


In [217]:
# Any column may be listed in value_vars, including 'key'; its strings end up
# in the same 'value' column as the numbers from A and B.
pd.melt(df, value_vars = ['key', 'A', 'B'])

Unnamed: 0,variable,value
0,key,foo
1,key,bar
2,key,baz
3,A,1
4,A,2
5,A,3
6,B,4
7,B,5
8,B,6
