#  <font color=red> Module_07_結合、關聯以及重塑資料</font>

## 串連 (concatenation) 幾個物件的資料

In [None]:
import pandas as pd
import numpy as np
from datetime import date

# Two three-element integer Series; no index is passed, so both get the default 0..2 labels.
s1 = pd.Series(np.arange(3))
s2 = pd.Series(np.arange(3) + 5)
s1

In [None]:
s2

In [None]:
# 預設對索引標籤做串連，對欄位做聯集對齊
# 此做法會導致重複的索引標籤
pd.concat([s1, s2])

In [None]:
# 對欄位做串連，對索引標籤做聯集對齊
pd.concat([s1, s2], axis = 1)

---

In [None]:
# Three Series whose index labels do not overlap (a-b, c-e, f-g).
s1 = pd.Series(data=[0, 1], index=list('ab'))
s2 = pd.Series(data=[2, 3, 4], index=list('cde'))
s3 = pd.Series(data=[5, 6], index=list('fg'))

In [None]:
s1

In [None]:
s2

In [None]:
s3

In [None]:
pd.concat([s1, s2, s3])

In [None]:
pd.concat([s1, s2, s3], axis = 1)

In [None]:
s4 = pd.concat([s1, s3])
s4

In [None]:
s1

In [None]:
pd.concat([s1, s4], axis = 1)

In [None]:
# 預設的 join ='outer'，在這裡也就是對索引標籤串連，對欄位取【聯集】對齊
pd.concat([s1, s4])

In [None]:
 # 當 join = 'inner'，在這裡也就是對索引標籤串連，對欄位取【交集】對齊
pd.concat([s1, s4], join = 'inner')

In [None]:
s1

In [None]:
s4

In [None]:
pd.concat([s1, s4], axis = 1, join = 'inner')

---

In [None]:
s1

In [None]:
s3

In [None]:
# The keys parameter names the source of each piece in the result.
# The keys are attached along the concatenation axis; here (axis = 0) they become the outer level of the index.
# Concatenating the same object more than once is allowed.
result = pd.concat([s1, s1, s3], keys = ['one', 'two', 'three'])
result

In [None]:
pd.concat([s1, s1, s3], keys = ['one', 'two', 'three'], axis = 1)

---

In [None]:
result

In [None]:
# 解堆疊
result.unstack()

In [None]:
pd.concat([s1, s2, s3], keys = ['one', 'two', 'three'], axis = 1)

---

In [None]:
# Two 3x3 integer frames with the same column labels a, b, c;
# df2 continues counting (9..17) where df1 (0..8) stops.
df1 = pd.DataFrame(np.arange(9).reshape(3, -1), columns=list('abc'))
df2 = pd.DataFrame(np.arange(9, 18).reshape(3, -1), columns=list('abc'))
df1

In [None]:
df2

In [None]:
pd.concat([df1, df2]) # 預設對索引標籤 (第 0 軸) 做串連，對欄位 (第 1 軸) 做聯集對齊

In [None]:
pd.concat([df1, df2], axis = 1) # 對欄位 (第 1 軸) 做串聯，對索引標籤 (第 0 軸) 做聯集對齊

---

In [None]:
df1 = pd.DataFrame(np.arange(9).reshape(3, 3),
                  columns = ['a', 'b', 'c'])
df2 = pd.DataFrame(np.arange(9, 18).reshape(3, 3),
                  columns = ['a', 'c', 'd'])
df1

In [None]:
df2

In [None]:
pd.concat([df1, df2])

In [None]:
c = pd.concat([df1, df2], keys = ['df1', 'df2'])
c

In [None]:
c.loc['df2']

In [None]:
pd.concat([df1, df2], axis = 1)

In [None]:
df3 = pd.DataFrame(np.arange(20, 26).reshape(3, 2),
                  columns = ['a', 'd'],
                  index = [2, 3, 4])
df3

In [None]:
df1

In [None]:
pd.concat([df1, df3], axis = 1)

---

In [None]:
df1

In [None]:
df2

In [None]:
df3

In [None]:
# 預設是 join = 'outer'，也就是聯集對齊
# 當 join = 'inner'，在這裡就是對欄位串連，對索引標籤取交集對齊
pd.concat([df1, df3], axis = 1, join = 'inner') 

In [None]:
df = pd.concat([df1, df2], axis = 1, keys = ['df1', 'df2'])
df

In [None]:
# df['df2'] 在這會得到同樣的答案
df.loc[:, 'df2']

In [None]:
# 忽略索引標籤，用預設
pd.concat([df1, df2], ignore_index = True)

---

In [None]:
df1

In [None]:
df2

In [None]:
# DataFrame.append() was deprecated in pandas 1.4 and removed in pandas 2.0,
# where calling it raises AttributeError.
# pd.concat() produces the same result: rows are concatenated along the index labels
# and the columns are aligned by taking their union.
pd.concat([df1, df2])

In [None]:
# Replacement for the removed DataFrame.append(..., ignore_index = True):
# concatenate the rows and renumber the index labels from 0.
pd.concat([df1, df2], ignore_index = True)

## 合併與連接資料

In [None]:
# Customer master table: one row per customer, identified by CustomerID.
customers = pd.DataFrame({
    'CustomerID': [10, 11],
    'Name': ['Mike', 'Marcia'],
    'Address': ['Address for Mike', 'Address for Marcia'],
})
customers

In [None]:
# date holds a calendar date only, while datetime also carries a time of day; both are used in a similar way.
# Three orders, all placed on the same day; customer 10 appears twice.
orders = pd.DataFrame({
    'CustomerID': [10, 11, 10],
    'OrderDate': [date(2014, 12, 1) for _ in range(3)],
})
orders

In [None]:
# Works like a relational-database join. `on` can be given explicitly; by default the columns common to both frames are used.
# Each join column appears only once in the result, at the position it has in the left (calling) frame.
# The module-level function pd.merge() can be used instead.
customers.merge(orders) 

In [None]:
orders.merge(customers)

---

In [None]:
# Two frames with the same key1/key2 column names but different index labels
# (left: 0-2, right: 1-3); their key2 values differ only in the second row ('y' vs 'a').
left_data = dict(key1=['a', 'b', 'c'],
                 key2=['x', 'y', 'z'],
                 lval1=[0, 1, 2])
right_data = dict(key1=['a', 'b', 'c'],
                  key2=['x', 'a', 'z'],
                  rval1=[6, 7, 8])
left = pd.DataFrame(data=left_data, index=[0, 1, 2])
right = pd.DataFrame(data=right_data, index=[1, 2, 3])
left

In [None]:
right

In [None]:
left.merge(right)

In [None]:
# _x 表示來自左物件 _y表示來自右物件 
# 可用 suffixes 修改
left.merge(right, on = 'key1') 

In [None]:
left.merge(right, on = 'key1', suffixes = ['_l', '_r'])

---

In [None]:
left

In [None]:
right

In [None]:
left.merge(right, left_on = 'key1', right_on = 'key2') # 也可以對不同行的行名來合併

---

In [None]:
left

In [None]:
right

In [None]:
# To merge on the index labels instead of on columns, set both left_index = True and right_index = True.
# Rows are then matched on the intersection of the two indexes and the columns are placed side by side,
# so the values are the same as in the inner pd.concat along axis = 1 in the next cell.
# The column labels differ, though: merge renames the overlapping key1/key2 columns with _x/_y suffixes,
# whereas pd.concat keeps the duplicated names.
left.merge(right, left_index = True, right_index = True) 

In [None]:
pd.concat([left, right], axis = 1, join = 'inner')

---

In [None]:
left

In [None]:
right

In [None]:
left.merge(right, how = 'inner') # 預設 how = 'inner'，兩個 Dataframe 物件的共同欄位的值的交集

In [None]:
left.merge(right, how = 'outer') # how = 'outer' is NOT the default (the default is 'inner'); it keeps the union of the values in the shared key columns of both DataFrames

In [None]:
left.merge(right, how = 'left') # 只使用來自左方 Dataframe 物件

In [None]:
left.merge(right, how = 'right') # 只使用來自右方 Dataframe 物件

---

In [None]:
left.merge(right, how = 'cross') 

---

In [None]:
left

In [None]:
right

In [None]:
# NOTE: this call raises ValueError. left and right both have key1 and key2 columns,
# and .join() needs lsuffix/rsuffix to tell overlapping column names apart.
left.join(right)

In [None]:
# 資料框的 .join() 方法是對兩個 Dataframe 物件的索引標籤進行連結操作，而非裡面的資料值
# how = 'left' 是預設

left.join(right, lsuffix='_left', rsuffix='_right') 

In [None]:
left.join(right, lsuffix='_left', rsuffix='_right', how = 'inner') 

In [None]:
left.join(right, lsuffix='_left', rsuffix='_right', how = 'outer') 

## 合併有重複的資料

In [None]:
# Two float Series over the same six labels, stored in opposite order (a: f..a, b: a..f),
# each with missing values at different labels.
a = pd.Series(data=[np.nan, 2.5, 0, 3.5, 4.5, np.nan], index=list('fedcba'))
b = pd.Series(data=[0, np.nan, 2, np.nan, np.nan, 5], index=list('abcdef'))
a

In [None]:
b

In [None]:
# 返回 ndarray 
# 跟索引標籤無關
np.where(b.isnull(), a, b) 

In [None]:
# Update null elements with value in the same location in 'other'
# 以左邊的資料為主，只要兩邊有一邊是 NaN，就會用另一邊的值合併
# 這題剛好跟 b.fillna(a) 結果一樣， 但是 .combine_first() 方法是合併有重複的資料，下題可看出不同的地方
b.combine_first(a)

---

In [None]:
# df1 has four rows and a column 'c' that df2 lacks; df2 has five rows.
# Both have missing values in columns 'a' and 'b'.
df1 = pd.DataFrame({
    'a': [1.0, np.nan, 5.0, np.nan],
    'b': [np.nan, 2.0, np.nan, 6.0],
    'c': [2, 6, 10, 14],
})
df2 = pd.DataFrame({
    'a': [5.0, 4.0, np.nan, 3.0, 7.0],
    'b': [np.nan, 3.0, 4.0, 6.0, 8.0],
})
df1

In [None]:
df2

In [None]:
df1.combine_first(df2) # 逐欄做一樣的動作

---

In [None]:
df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]})
df1

In [None]:
df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2])
df2

In [None]:
df1.combine_first(df2)

## 資料值與索引的樞紐操作

In [None]:
# 資料常存成堆疊格式，或稱為紀錄格式
sensor_readings = pd.read_csv('./mod07/accel.csv')
sensor_readings

In [None]:
# 如何著手找出特定軸的讀數
sensor_readings[sensor_readings['axis'] == 'X']

In [None]:
# 更好的辦法是使用樞紐操作
# 資料框的 .pivot() 方法的用法就是要決定 index、columns、values
# 樞紐的動作，也就是將資料框從長格式 (long format) 轉成寬格式 (wide format) 的操作
sensor_readings.pivot(index = 'interval',
                      columns = 'axis',
                      values = 'reading')

---

In [None]:
# 樞紐操作其實就是在操作堆疊跟解堆疊
sensor_readings.set_index(['interval', 'axis'])

In [None]:
sensor_readings.set_index(['interval', 'axis']).unstack()

## 堆疊與解堆疊

In [None]:
df = pd.DataFrame({'a': [1, 2]}, index = ['one', 'two'])
df

In [None]:
stacked1 = df.stack()
stacked1

In [None]:
# 因為原本的資料框只有 level = 0 的欄位，堆疊後就退化成序列了
type(stacked1)

In [None]:
stacked1['one']['a'] # 等於 stacked1['one', 'a']

In [None]:
df.unstack()

---

In [None]:
df = pd.DataFrame({'a': [1, 2],
                   'b': [3, 4]},
                   index = ['one', 'two'])
df

In [None]:
stacked2 = df.stack()
stacked2

In [None]:
type(stacked2)

In [None]:
stacked2['one']['b'] # 等於 stacked2['one', 'b']

In [None]:
df.unstack()

---

In [None]:
sensor_readings = pd.read_csv('./mod07/accel.csv')
sensor_readings

In [None]:
user1 = sensor_readings.copy()
user2 = sensor_readings.copy()
user1['who'] = 'Mike'
user2['who'] = 'Mikael'
user2['reading'] *= 100

user1[:5]

In [None]:
user2[:5]

In [None]:
pd.concat([user1, user2])

In [None]:
multi_user_sensor_data = pd.concat([user1, user2]).set_index(['who', 'interval', 'axis'])
multi_user_sensor_data

In [None]:
multi_user_sensor_data.loc['Mike']

In [None]:
# 在 mod03 有遇到過，複習一下!!
multi_user_sensor_data.xs(1, level = 'interval')

In [None]:
multi_user_sensor_data

In [None]:
# 解堆疊預設的是 level = -1，也就是最裡面的 level，在這題就是 level = 2
# 我們可以把欄位想成是我們想要觀察的主角，這裡就是想知道 X、Y、Z 的 reading
multi_user_sensor_data.unstack()

---

In [None]:
multi_user_sensor_data

In [None]:
# 我們可以把欄位想成是我們想要觀察的主角，這裡就是想知道 Mikael 與 Mike 分別收集的 reading
multi_user_sensor_data.unstack(level = 0)

In [None]:
multi_user_sensor_data

In [None]:
unstacked = multi_user_sensor_data.unstack(['who', 'axis'])
unstacked

In [None]:
unstacked.stack(level = 'who')

---

In [None]:
data = pd.DataFrame({'Name': ['Mike', 'Mikael'],
                     'Height': [6.1, 6.0],
                     'Weight': [220, 185]})
data

In [None]:
pd.melt(data, 
        id_vars = 'Name',
        value_vars = ['Height', 'Weight'])

---

In [None]:
sensor_readings = pd.read_csv('./mod07/accel.csv')
sensor_readings

In [None]:
pivoted = sensor_readings.pivot(index = 'interval', columns = 'axis', values = 'reading')
pivoted

In [None]:
# 要融解前要把 interval 放回欄位，不然會產生例外
pivoted = pivoted.reset_index()
pivoted

In [None]:
pd.melt(pivoted, id_vars = 'interval', value_vars = ['X', 'Y', 'Z'] )

In [None]:
pd.melt(pivoted, id_vars = 'interval', value_vars = ['X', 'Y', 'Z'] ).sort_values(by = 'interval')

## 綜合應用

In [None]:
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                    'data2': range(3)})
df1

In [None]:
df2

In [None]:
# 當然也可以用資料框的 .merge() 方法
pd.merge(df1, df2)

In [None]:
pd.merge(df1, df2, on = 'key')

---

In [None]:
df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})
df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'],
                    'data2': range(3)})

df3

In [None]:
df4

In [None]:
pd.merge(df3, df4, left_on = 'lkey', right_on = 'rkey')

---

In [None]:
df1

In [None]:
df2

In [None]:
pd.merge(df1, df2, how = 'outer')

---

In [None]:
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'data1': range(6)})
df2 = pd.DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],
                    'data2': range(5)})
df1

In [None]:
df2

In [None]:
pd.merge(df1, df2, on = 'key', how = 'left')

In [None]:
pd.merge(df1, df2, on = 'key', how = 'inner')

---

In [None]:
left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'],
                     'key2': ['one', 'two', 'one'],
                     'lval': [1, 2, 3]})
right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                      'key2': ['one', 'one', 'one', 'two'],
                      'rval': [4, 5, 6, 7]})
left

In [None]:
right

In [None]:
pd.merge(left, right, on = ['key1', 'key2'], how = 'outer')

In [None]:
pd.merge(left, right, on = 'key1')

In [None]:
pd.merge(left, right, on = 'key1', suffixes = ['_left', '_right'])

---

In [None]:
left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],
                      'value': range(6)})
right1 = pd.DataFrame({'group_val': [3.5, 7]}, 
                       index = ['a', 'b'])
left1

In [None]:
right1

In [None]:
# 一邊選欄位，一邊選索引標籤做合併
pd.merge(left1, right1, left_on = 'key', right_index = True)

In [None]:
pd.merge(left1, right1, left_on = 'key', right_index = True, how = 'outer')

---

In [None]:
lefth = pd.DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio',
                               'Nevada', 'Nevada'],
                      'key2': [2000, 2001, 2002, 2001, 2002],
                      'data': np.arange(5.)})
righth = pd.DataFrame(np.arange(12).reshape((6, 2)),
                      index = [['Nevada', 'Nevada', 'Ohio', 'Ohio',
                              'Ohio', 'Ohio'],
                             [2001, 2000, 2000, 2000, 2001, 2002]],
                      columns = ['event1', 'event2'])
lefth

In [None]:
righth

In [None]:
pd.merge(lefth, righth, left_on = ['key1', 'key2'], right_index = True)

In [None]:
pd.merge(lefth, righth, left_on = ['key1', 'key2'], right_index = True, how = 'outer')

---

In [None]:
left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],
                     index=['a', 'c', 'e'],
                     columns=['Ohio', 'Nevada'])
right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                      index=['b', 'c', 'd', 'e'],
                      columns=['Missouri', 'Alabama'])
left2

In [None]:
right2

In [None]:
pd.merge(left2, right2, left_index = True, right_index = True, how = 'outer')

---

In [None]:
# 資料框的 .join() 方法是對兩個 Dataframe 物件的索引標籤進行連結操作，而非裡面的資料值
# how = 'left' 是預設
# 答案跟上面一樣
# .join 方法是 .merge() 方法的弱化
left2.join(right2, how = 'outer')

---

In [None]:
left1

In [None]:
right1

In [None]:
# 資料框的 .join() 方法是對兩個 Dataframe 物件的索引標籤進行連結操作，而非裡面的資料值
# how = 'left' 是預設
left1.join(right1)

In [None]:
# 有 on 參數可選，on 屬性是給 left1 的
# 所以現在 left1 看 'key'，right1 還是看索引標籤
left1.join(right1, on = 'key')

In [None]:
right1.join(left1, on = 'key') # raises KeyError: `on` names a column of the calling frame, and right1 has no 'key' column

---

In [None]:
left2

In [None]:
right2

In [None]:
another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
                       index = ['a', 'c', 'e', 'f'],
                       columns = ['New York', 'Oregon'])
another

In [None]:
left2.join([right2, another])

In [None]:
left2.join([right2, another], how = 'outer')

---

In [None]:
arr = np.arange(12).reshape(3, 4)
arr

In [None]:
# 這是 numpy 的 np.concatenate() 函式，作用在 ndarray 上
np.concatenate([arr, arr], axis = 1)

---

In [None]:
df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index = ['a', 'b', 'c'],
                   columns = ['one', 'two'])
df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index = ['a', 'c'],
                   columns = ['three', 'four'])
df1

In [None]:
df2

In [None]:
pd.concat([df1, df2], axis = 1, keys = ['level1', 'level2'])

In [None]:
# 另一種給資料來源的方式
pd.concat({'level1': df1, 'level2': df2 }, axis = 1)

In [None]:
pd.concat([df1, df2], axis = 1, keys = ['level1', 'level2'], names = [ 'upper', 'lower'])

---

In [None]:
df1 = pd.DataFrame(np.random.randn(3, 4), columns = ['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(2, 3), columns = ['b', 'd', 'a'])
df1

In [None]:
df2

In [None]:
pd.concat([df1, df2], ignore_index = True)

---

In [None]:
data = pd.DataFrame(np.arange(6).reshape((2, 3)),
                    index = pd.Index(['Ohio', 'Colorado'], name = 'state'),
                    columns = pd.Index(['one', 'two', 'three'], name = 'number'))
data

In [None]:
result = data.stack()
result

In [None]:
result.unstack()

In [None]:
result.unstack(0)

In [None]:
result.unstack('state')

---

In [None]:
s1 = pd.Series([0, 1, 2, 3], index = ['a', 'b', 'c', 'd'])
s2 = pd.Series([4, 5, 6], index = ['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys = ['one', 'two'])
data2

In [None]:
data2.unstack()

In [None]:
data2.unstack().stack() # .stack() 方法會自動濾掉遺失值

In [None]:
data2.unstack().stack(dropna = False)

---

In [None]:
result

In [None]:
# pd.Index() 函式有參數 name 可以用
df = pd.DataFrame({'left': result, 'right': result + 5},
                  columns = pd.Index(['left', 'right'], name = 'side'))
df

In [None]:
df.unstack('state')

In [None]:
df.unstack('state').stack('side')

---

In [None]:
data = pd.read_csv('./mod07/macrodata.csv')
data.head()

In [None]:
# Build a quarterly PeriodIndex from the year and quarter columns, and a named Index of the columns to keep.
# NOTE(review): passing year=/quarter= keywords to the PeriodIndex constructor is deprecated since pandas 2.2
# in favour of pd.PeriodIndex.from_fields(year=..., quarter=...) — confirm against the pandas version in use.
periods = pd.PeriodIndex(year = data.year, quarter = data.quarter, name = 'date')
columns = pd.Index(['realgdp', 'infl', 'unemp'], name = 'item')
periods

In [None]:
columns

In [None]:
data = data.reindex(columns = columns)
data

In [None]:
data.index = periods.to_timestamp(freq = 'D', how = 'end')
data

---

In [None]:
data.stack()

In [None]:
data.stack().reset_index()

In [None]:
ldata = data.stack().reset_index().rename(columns = {0: 'value'})

In [None]:
ldata

---

In [None]:
data_cp = data.copy()
data_cp = data_cp.reset_index()
data_cp

In [None]:
pd.melt(data_cp, id_vars = 'date', value_vars = ['realgdp', 'infl', 'unemp'])

In [None]:
# 可以再把 index 更換一下!
my_df = pd.melt(data_cp, id_vars = 'date', value_vars = ['realgdp', 'infl', 'unemp']).sort_values(by = 'date')
my_df

In [None]:
my_df.reset_index(drop = True)

---

In [None]:
ldata

In [None]:
pivoted = ldata.pivot(index = 'date', columns = 'item', values = 'value') # 長格式旋轉成寬格式
pivoted

In [None]:
ldata['value2'] = np.random.randn(len(ldata))
ldata[:10]

In [None]:
pivoted = ldata.pivot(index = 'date', columns = 'item') # 值可以不給
pivoted[:10]

In [None]:
pivoted['value'][:5]

---

In [None]:
ldata

In [None]:
ldata.set_index(['date', 'item'])

In [None]:
unstacked = ldata.set_index(['date', 'item']).unstack('item') # 跟上面的 .pivot() 方法是等效的
unstacked

---

In [None]:
df = pd.DataFrame({'key': ['foo', 'bar', 'baz'],
                   'A': [1, 2, 3],
                   'B': [4, 5, 6],
                   'C': [7, 8, 9]})
df

In [None]:
# 寬格式旋轉成長格式
# value_vars 沒給會全部轉
melted = pd.melt(df, id_vars = ['key'] ) 
melted

---

In [None]:
# Pivot the long-format frame back to wide format: one row per key, one column per variable.
# DataFrame.pivot() accepts keyword arguments only since pandas 2.0, so index/columns/values are named explicitly.
reshaped = melted.pivot(index = 'key', columns = 'variable', values = 'value')
reshaped

In [None]:
reshaped.reset_index()

---

In [None]:
df

In [None]:
pd.melt(df, id_vars = ['key'], value_vars = ['A', 'B'])

In [None]:
# 沒有給 id_vars，就沒有識別行
pd.melt(df, value_vars = ['A', 'B', 'C'])

In [None]:
pd.melt(df, value_vars = ['key', 'A', 'B'])