#  <font color=red> Module_08_資料聚合</font>

## 拆開資料

In [None]:
import pandas as pd
import numpy as np

df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                   'key2' : ['one', 'two', 'one', 'two', 'one'],
                   'data1' : np.random.randn(5),
                   'data2' : np.random.randn(5)})
df

In [None]:
# 分組的 key 可以是序列
# 對應關係要注意
# 返回的是一個 GroupBy 的物件
grouped = df['data1'].groupby(df['key1'])  
grouped 

In [None]:
# 一個序列
# 分組的 key 會當索引標籤
grouped.mean() 

---

In [None]:
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means

In [None]:
means.unstack()

---

In [None]:
df

In [None]:
# 分組的 key 也可以是長度相符的陣列
# 分組的 key 會當索引標籤
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby([states, years]).mean()

---

In [None]:
df

In [None]:
# Group the DataFrame itself: the key can be a column name, in which case it
# must be a column of the frame being grouped.
# `key2` is not numeric, so it is excluded explicitly with numeric_only = True;
# since pandas 2.0 a bare .mean() raises TypeError on such a column.
# Selecting the wanted columns before aggregating is still the clearer style.
df.groupby('key1').mean(numeric_only = True)

In [None]:
# 這樣會是比較好的寫法
# 先 df.groupby('key1') 得到 GroupBy 的物件
# 再用 index 的方式表達要顯示那些欄位 df.groupby('key1')[['data1', 'data2']]
# 最後套用聚合方法
df.groupby('key1')[['data1', 'data2']].mean() 

---

In [None]:
df

In [None]:
df.groupby(['key1', 'key2']).mean()

In [None]:
# 返回序列
# 資料列的個數，是不是遺失值都要算
df.groupby(['key1', 'key2']).size() 

---

In [None]:
df

In [None]:
df.groupby('key1') # GroupBy 物件

In [None]:
# name 就是分到的每個組
# group 就是滿足這個組的所有資料 
for name, group in df.groupby('key1'): # GroupBy 物件支援疊代，會產生 2-tuple 的序列 
    print(name)
    print(group)

In [None]:
df

In [None]:
for name, group in df.groupby(['key1', 'key2']):
    print(name)
    print(group)

---

In [None]:
# 放進函式 list() 看會變成如何
# 裝成 (key, value) 的 tuple
list(df.groupby('key1'))

In [None]:
pieces = dict(list(df.groupby('key1')))
pieces

In [None]:
# 可以用 key 找 value
pieces['b']

---

In [None]:
df

In [None]:
df.dtypes

In [None]:
# The default axis = 0 groups rows; axis = 1 groups columns.
# NOTE(review): the `axis` keyword of DataFrame.groupby is deprecated in
# recent pandas releases (2.1+) - confirm the pandas version this notebook targets.
grouped = df.groupby(df.dtypes, axis = 1)

In [None]:
for name, group in grouped:
    print(name)
    print(group)

---

In [None]:
df

In [None]:
df.groupby('key1')['data1'].mean()

In [None]:
# 注意上面那個跟這裡的寫法，兩個出來同一個效果
df['data1'].groupby(df['key1']).mean()

In [None]:
# 兩個框框就變資料框了
df.groupby('key1')[['data1']].mean()

In [None]:
df[['data1']].groupby(df['key1']).mean()

---

In [None]:
df

In [None]:
df.groupby(['key1', 'key2'])['data2'].mean() # 返回序列

In [None]:
df.groupby(['key1', 'key2'])[['data2']].mean() # 返回資料框

---

In [None]:
# 分組的 key 也可以是字典或序列 
# 對應關係要注意
people = pd.DataFrame(np.random.randn(5, 5),
                      columns = ['a', 'b', 'c', 'd', 'e'],
                      index = ['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people

In [None]:
people.iloc[2:3, [1, 2]] = np.nan
people

In [None]:
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f' : 'orange'}

In [None]:
# An unused grouping key such as 'f' in the mapping is harmless,
# whereas a list of the wrong length would raise an exception.
# Most aggregation methods skip missing values.
# groupby(axis = 1) is deprecated (pandas 2.1+), so group the transposed
# frame by its row labels and transpose the aggregated result back.
by_column = people.T.groupby(mapping)
by_column.mean().T

---

In [None]:
people

In [None]:
map_Series = pd.Series(mapping)
map_Series

In [None]:
# groupby(axis = 1) is deprecated (pandas 2.1+): group the transposed frame
# and transpose back. .count() does not count missing values.
people.T.groupby(map_Series).count().T

---

In [None]:
people

In [None]:
people.groupby(len)

In [None]:
# 分組的 key 也可以是函式
# 會作用在每個索引標籤上
# 若加入參數 axis = 1 ，就會作用在欄位上
people.groupby(len).sum()

In [None]:
# Apply len to the column labels instead of the row labels.
# groupby(axis = 1) is deprecated (pandas 2.1+), so group the transposed
# frame and transpose the result back.
people.T.groupby(len).sum().T

In [None]:
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()

---

In [None]:
# 也可以對索引標籤做分組
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
                                    [1, 3, 5, 1, 3]],
                                    names = ['cty', 'tenor'])
hier_df = pd.DataFrame(np.random.randn(4, 5), columns = columns)
hier_df

In [None]:
# Group the columns by the 'cty' level of their MultiIndex.
# groupby(axis = 1) is deprecated (pandas 2.1+), so group the transposed
# frame by that index level and transpose the result back.
hier_df.T.groupby(level = 'cty').count().T

In [None]:
hier_df.groupby(level = 0).count()

---

In [None]:
sensor_data = pd.read_csv('./mod08/sensors.csv')
sensor_data

In [None]:
group_by_sensor = sensor_data.groupby('sensor')
group_by_sensor # 返回一個 GroupBy 物件

In [None]:
# 如果想看分組數目可用 ngroups 屬性
group_by_sensor.ngroups

In [None]:
group_by_sensor.groups

---

In [None]:
for name, group in group_by_sensor:
    print(name)
    print(group)

In [None]:
# Wrapping the loop in a function makes it reusable; the argument is meant to
# be a GroupBy object (anything that yields (name, group) pairs works).
def print_groups(group_object):
    """Print each group's name followed by the first five entries of the group."""
    for label, members in group_object:
        # One print call with a newline separator emits the same two lines.
        print(label, members[:5], sep = '\n')

---

In [None]:
print_groups(group_by_sensor)

In [None]:
# 返回序列
# 資料列的個數，是不是遺失值都要算
group_by_sensor.size()

In [None]:
group_by_sensor.count() # 對列作操作數個數，排除遺失值

In [None]:
group_by_sensor.get_group('accel') # 可以用 GroupBy 的 .get_group() 方法提取特定分組

In [None]:
group_by_sensor.head(3) # GroupBy 物件的方法，顯示每個分組的前三筆資料

In [None]:
group_by_sensor.tail(3) # GroupBy 物件的方法，顯示每個分組的後三筆資料

In [None]:
group_by_sensor.nth(1) # GroupBy 物件的方法，傳回第二項

In [None]:
# GroupBy 物件的方法，對每一組做敘述性統計量
# axis 不是數值資料，所以沒有被顯示
group_by_sensor.describe() 

---

In [None]:
sensor_data

In [None]:
mcg = sensor_data.groupby(['sensor', 'axis']) # 因為指定了好幾行，所以名字變成了 tuple
print_groups(mcg)

---

In [None]:
mi = sensor_data.copy()
mi = mi.set_index(['sensor', 'axis'])
mi

In [None]:
print_groups(mi.groupby(level = 0)) # 對索引標籤進行分組

In [None]:
print_groups(mi.groupby(level = ['sensor', 'axis']))

## 套用聚合函數

In [None]:
s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9])
s

In [None]:
# Return value at the given quantile.
# 找出分位數
s.quantile(0.5)

In [None]:
s.quantile([0.25, 0.5, 0.75])

---

In [None]:
s = pd.Series([1, 2, 3, 4])
s

In [None]:
s.quantile(0.5)

In [None]:
# 第一步看位置，四個數共三格
# 在 1 + 0.25*3 = 1.75 的位置 
# 第 1 的數是 1 ，第 2 的數是 2，回推第 1.75位置的數是多少
s.quantile([0.25, 0.5, 0.75, 0.9])

---

In [None]:
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                   'key2' : ['one', 'two', 'one', 'two', 'one'],
                   'data1' : np.random.randn(5),
                   'data2' : np.random.randn(5)})
df

In [None]:
# 如果沒寫 [['data1', 'data2']]，會出現警告，因為非數值的 key2 沒辦法算分位數
grouped = df.groupby('key1')[['data1', 'data2']]
grouped.quantile(0.5)

In [None]:
grouped.mean()

In [None]:
# 另一種寫法
# 用 .agg() 方法，傳入聚合的函數
grouped.agg(np.mean)

In [None]:
# 也可以傳入字串，但有些沒辦法被辨識出
grouped.agg('mean')

---

In [None]:
print_groups(grouped)

In [None]:
# Note: when used with GroupBy.agg, `arr` is the Series of one group.
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum."""
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

In [None]:
# 傳入自定義的函數
grouped.agg(peak_to_peak)

In [None]:
def max_Series(arr):
    """Return the largest absolute value in ``arr``, skipping missing values.

    The vectorised ``abs().max()`` replaces the builtin ``max`` over a mapped
    Series: the builtin compares element by element, so a leading NaN made the
    whole result NaN, and it also iterated in pure Python.
    """
    return arr.abs().max()

In [None]:
# 傳入自定義的函數
grouped.agg(max_Series)

---

In [None]:
grouped.describe()

---

In [None]:
tips = pd.read_csv('./mod08/tips.csv')
tips

In [None]:
# 加入小費佔總結帳金額的比例
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips[:6]

In [None]:
grouped = tips.groupby(['day', 'smoker'])
print_groups(grouped)

In [None]:
# 關注欄位 tip_pct
grouped_pct = grouped['tip_pct']
print_groups(grouped_pct)

In [None]:
grouped_pct.agg('mean')

In [None]:
grouped_pct.agg(['mean', 'std', peak_to_peak])

In [None]:
grouped_pct.agg([('foo', 'mean'), ('bar', np.std)]) # 傳入 (name, function) 的 tuple

---

In [None]:
print_groups(grouped)

In [None]:
result = grouped[['tip_pct', 'total_bill']].agg(['count', 'mean', 'max'])
result

In [None]:
result['tip_pct']

In [None]:
# Durchschnitt 是德語的「平均」；Abweichung 是德語的「偏差」，這裡借用來當作變異數 (np.var) 的欄位名稱
grouped[['tip_pct', 'total_bill']].agg([('Durchschnitt', 'mean'), ('Abweichung', np.var)])

---

In [None]:
print_groups(grouped)

In [None]:
# Apply a different function to each column. String names are used instead of
# NumPy callables, whose use inside .agg() is deprecated in pandas 2.1+.
grouped.agg({'tip': 'max', 'size': 'sum'})

In [None]:
grouped.agg({'tip_pct' : ['min', 'max', 'mean', 'std'],
             'size' : 'sum'})

---

In [None]:
tips

In [None]:
# With as_index = False, day and smoker stay ordinary columns instead of
# becoming index labels.
# numeric_only = True skips non-numeric columns; since pandas 2.0 a bare
# .mean() raises TypeError when such columns are present.
tips.groupby(['day', 'smoker'], as_index = False).mean(numeric_only = True)

---

In [None]:
sensor_data = pd.read_csv('./mod08/sensors.csv')
sensor_data[:5]

In [None]:
mi = sensor_data.copy()
mi = mi.set_index(['sensor', 'axis'])
mi[:5]

In [None]:
sensor_axis_grouping = mi.groupby(level = ['sensor', 'axis'])
print_groups(sensor_axis_grouping)

In [None]:
sensor_axis_grouping.agg(np.mean)

---

In [None]:
sensor_data

In [None]:
sensor_data.groupby(['sensor', 'axis']).agg(np.mean) # 預設 as_index = True

In [None]:
sensor_data.groupby(['sensor', 'axis'], as_index = False).agg(np.mean) 

---

In [None]:
print_groups(sensor_axis_grouping)

In [None]:
sensor_axis_grouping.mean()

In [None]:
sensor_axis_grouping.agg(np.mean)

In [None]:
# Several aggregations at once. The string names keep pandas' own sum/std
# (std with ddof = 1); passing np.sum / np.std to .agg() is deprecated in
# pandas 2.1+ and np.std would eventually be called directly (ddof = 0).
sensor_axis_grouping.agg(['sum', 'std'])

In [None]:
# Apply a different function to each column.
# len returns the length of each group's Series; the string 'mean' selects
# pandas' own mean (passing np.mean to .agg() is deprecated in pandas 2.1+).
sensor_axis_grouping.agg({'interval': len, 'reading': 'mean'})

In [None]:
sensor_axis_grouping['reading'].mean() # 返回序列

In [None]:
sensor_axis_grouping[['reading']].mean() # 返回資料框

## 轉換分組資料

In [None]:
df = pd.DataFrame({'A': 'a b a'.split(),
                   'B': [1, 2, 3],
                   'C': [4, 6, 5]})
df

In [None]:
g = df.groupby('A')

In [None]:
print_groups(g)

---

In [None]:
g[['B', 'C']].mean()

In [None]:
g[['B', 'C']].apply(lambda x: x + 2) 

In [None]:
g[['B', 'C']].apply(lambda x: x / x.sum())

In [None]:
print_groups(g)

In [None]:
g[['B', 'C']].apply(lambda x: x.max() - x.min())

In [None]:
g.apply(lambda x: x.C.max())

---

In [None]:
tips = pd.read_csv('./mod08/tips.csv')
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips[:6]

In [None]:
def top(df, n = 5, column = 'tip_pct'):
    """Return the last ``n`` rows of ``df`` after sorting ascending by ``column``.

    Args:
        df: DataFrame to select rows from.
        n: Number of rows to keep (default 5).
        column: Label of the column to sort by (default 'tip_pct').

    Returns:
        A DataFrame with the selected rows, still in ascending order.
    """
    # .tail(n) instead of [-n:]: with n = 0 the slice [-0:] is [0:] and would
    # return the whole frame rather than no rows.
    return df.sort_values(by = column).tail(n)

top(tips, n = 6)

---

In [None]:
print_groups(tips.groupby('smoker'))

In [None]:
# 這裡有聚合的效果，每看到一個群組就會呼叫 top 函式
# 最後每個結果會用類似 pd.concat() 的方式串連起來，用分組名稱標示每塊資料，所以最後出來的結果帶有階層式索引
tips.groupby('smoker').apply(top)

---

In [None]:
print_groups(tips.groupby(['smoker', 'day']))

In [None]:
# 有其他參數或關鍵字要傳的話，將他們寫在函式名稱後面即可
tips.groupby(['smoker', 'day']).apply(top, n = 1, column = 'total_bill')

---

In [None]:
print_groups(tips.groupby('smoker'))

In [None]:
result = tips.groupby('smoker')['tip_pct'].describe()
result

In [None]:
result.stack()

---

In [None]:
tips

In [None]:
print_groups(tips.groupby('smoker')['tip_pct'])

In [None]:
tips.groupby('smoker')['tip_pct'].apply(lambda x: x.describe())

---

In [None]:
print_groups(tips.groupby('smoker'))

In [None]:
tips.groupby('smoker').apply(top)

In [None]:
# 可以比較一下 as_index = False 與 group_keys = False 的差別
tips.groupby('smoker', group_keys = False).apply(top) # 關閉分組索引製作功能

---

In [None]:
# 分位數與分桶 (bucket) 分析
frame = pd.DataFrame({'data1': np.random.randn(1000),
                      'data2': np.random.randn(1000)})
frame

In [None]:
quartiles = pd.cut(frame.data1, 4)
quartiles[:10]

In [None]:
grouped = frame.data2.groupby(quartiles)
print_groups(grouped)

In [None]:
grouped.max()

In [None]:
def get_stats(group):
    """Summarise ``group`` as a Series holding its min, max, count and mean."""
    labels = ['min', 'max', 'count', 'mean']
    values = [group.min(), group.max(), group.count(), group.mean()]
    return pd.Series(values, index = labels)

grouped.apply(get_stats)

In [None]:
# Returning a plain dict has the same effect as returning a Series: pandas
# converts each dict to a Series before concatenating the group results.
def get_stats(group):
    """Return the min, max, count and mean of ``group`` as a dict."""
    stat_names = ('min', 'max', 'count', 'mean')
    # Each name is also the name of the method that computes it.
    return {stat: getattr(group, stat)() for stat in stat_names}

grouped.apply(get_stats)

In [None]:
grouped.apply(get_stats).unstack()

---

In [None]:
frame

In [None]:
 pd.qcut(frame.data1, 10)

In [None]:
# 如果加入參數 labels = False，會把桶子名改成數值，從 0 開始算起。
grouping = pd.qcut(frame.data1, 10, labels = False)
grouping

In [None]:
grouped = frame.data2.groupby(grouping)
print_groups(grouped)

In [None]:
grouped.apply(get_stats)

In [None]:
grouped.apply(get_stats).unstack()

---

In [None]:
df = pd.DataFrame({'key': ['a', 'b', 'c']*4,
                   'value': np.arange(12.)})
df

In [None]:
g = df.groupby('key')['value']

In [None]:
print_groups(g)

In [None]:
g.mean()

In [None]:
g.apply(lambda x: x.mean())

In [None]:
g.apply(lambda x: np.mean(x))

---

In [None]:
df

In [None]:
g = df.groupby('key')['value']
print_groups(g)

In [None]:
# 注意 GroupBy 物件的 .transform() 方法的效果，與 .apply() 的差別
# 可以產生一個常數，用於廣播到所有分組，把值帶回原始的資料框
# 可以產生跟輸入分組一樣大小的一個資料，再把值帶回原始的資料框
# 傳入的函式不可以就地修改 (mutate) 它的輸入
g.transform(lambda x: x.mean())

In [None]:
g.transform('mean')

---

In [None]:
df

In [None]:
g = df.groupby('key')['value']
print_groups(g)

In [None]:
# 這時跟用 .apply() 方法同效果
g.transform(lambda x: x*2)

In [None]:
g.apply(lambda x: x*2)

In [None]:
g.transform(lambda x: x.rank(ascending = False))

---

In [None]:
df

In [None]:
g = df.groupby('key')['value']
print_groups(g)

In [None]:
def normailze(x):
    """Standardise ``x``: subtract its mean, then divide by its standard deviation."""
    centered = x - x.mean()
    spread = x.std()
    return centered / spread

In [None]:
# 沒有聚合效果，所以跟 .apply() 產生同一的結果
g.transform(normailze)

In [None]:
g.apply(normailze)

---

In [None]:
df

In [None]:
g = df.groupby('key')['value']
print_groups(g)

In [None]:
g.transform('mean')

In [None]:
g.transform('std')

In [None]:
#　這種方法叫做未包裝分組 (unwrapped)
(df['value'] - g.transform('mean'))/g.transform('std')

---

In [None]:
transform_data = pd.DataFrame({ 'Label': ['A', 'C', 'B', 'A', 'C'],
                                'Values': [0, 1, 2, 3, 4],
                                'Values2': [5, 6, 7, 8, 9],
                                'Other': ['foo', 'bar', 'baz', 'fiz', 'buz']}, index = list('VWXYZ'))
transform_data

In [None]:
grouped_by_label = transform_data.groupby('Label')
print_groups(grouped_by_label)

In [None]:
grouped_by_label[['Values', 'Values2']].transform(lambda x: x + 10)

---

In [None]:
df = pd.DataFrame({ 'Label': list("ABABAB"),
                    'Values': [10, 20, 11, np.nan, 12, 22]})
df

In [None]:
grouped = df.groupby('Label')
print_groups(grouped)

In [None]:
grouped.mean()

In [None]:
# 遺失值按照分組的平均值來填
filled_NaNs = grouped.transform(lambda x: x.fillna(x.mean()))
filled_NaNs

---

In [None]:
# 從平均值 0.5，標準差 2 的常態分佈選出 365*3 個數
np.random.seed(123456)
data = pd.Series(np.random.normal(0.5, 2, 365*3), 
                 index = pd.date_range('2013-01-01', periods = 365*3))
data

In [None]:
# window: 窗要多大
# min_periods: 窗裡面最少有幾個值就算出結果
periods = 100
data.rolling(window = periods, min_periods = periods).mean()

In [None]:
periods = 100
rolling = data.rolling(window = periods, min_periods = periods).mean().dropna()
rolling

In [None]:
rolling.plot(); # 之後會有專門的章節介紹

---

In [None]:
rolling

In [None]:
# 注意現在的索引標籤是 datetime 資料型態
group_key = lambda x: x.year
groups = rolling.groupby(group_key) # 別忘了 .groupby() 方法也可帶入函式，會作用在每個索引標籤上
print_groups(groups)

In [None]:
groups.agg([np.mean, np.std])

---

In [None]:
print_groups(rolling.groupby(group_key))

In [None]:
zscore = lambda x: (x - x.mean())/x.std()
normed = rolling.groupby(group_key).transform(zscore)
normed

In [None]:
normed.groupby(group_key).agg([np.mean, np.std])

---

In [None]:
compared = pd.DataFrame({'Original': rolling,
                         'Normed': normed})
compared

In [None]:
# 標準化完的資料再視覺化，更容易理解也更有利於分析
compared.plot();

## 過濾分組資料

In [None]:
df = pd.DataFrame({'Label': list('AABCCC'),
                   'Values': [1, 2, 3, 4, np.nan, 8]})
df

In [None]:
print_groups(df.groupby('Label'))

In [None]:
# GroupBy 物件的 .filter() 方法裡面放函式，函式返回的是布林值
# 把分完組的資料框或序列，一個一個帶入函式，True 的留下來，False 的會被過濾掉，再返回原始資料
f = lambda x: x.Values.count() > 1 # 序列的 .count() 方法沒算遺失值
df.groupby('Label').filter(f)

In [None]:
# 只要組裡有遺失值，整組就被刪了!
f = lambda x: x.Values.isnull().sum() == 0
df.groupby('Label').filter(f)

---

In [None]:
df

In [None]:
grouped = df.groupby('Label')
print_groups(grouped)

In [None]:
grouped.mean()

In [None]:
# 先算出每組的平均，再取平均
group_mean = grouped.mean().mean()
group_mean

In [None]:
# 比組的平均差距高過 2 留下，其他全組過濾掉!
f = lambda x: abs(x.Values.mean() - group_mean) > 2.0
df.groupby('Label').filter(f)

## 綜合應用

### 依分組指定填充遺失值

In [None]:
s = pd.Series(np.random.randn(6))
s[::2] = np.nan
s

In [None]:
# 就序列而言，把遺失值填入平均值還蠻常見
s.fillna(s.mean())

---

In [None]:
states = ['Ohio', 'New York', 'Vermont', 'Florida',
          'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4
data = pd.Series(np.random.randn(8), index = states)
data

In [None]:
group_key

In [None]:
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
data

In [None]:
print_groups(data.groupby(group_key))

In [None]:
data.groupby(group_key).mean()

In [None]:
# 這題用 .transform() 方法會得到一樣的結果
fill_mean = lambda g: g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)

---

In [None]:
print_groups(data.groupby(group_key))

In [None]:
# 這題用 .transform() 方法會得到一樣的結果
fill_values = {'East': 0.5, 'West': -1}
fill_func = lambda g: g.fillna(fill_values[g.name])
data.groupby(group_key).apply(fill_func)

### 隨機取樣和排列

In [None]:
# 紅心 (Heart)、黑桃 (Spade)、梅花 (club)、方塊 (Diamond)
suits = ['H', 'S', 'C', 'D']
card_val = (list(range(1, 11)) + [10]*3)*4
base_names = ['A'] + list(range(2, 11)) + ['J', 'Q', 'K']
suits

In [None]:
card_val

In [None]:
base_names

In [None]:
# 用 list comprehension (串列生成式) 來顯示紅心的牌
[str(num) + 'H' for num in base_names]

In [None]:
# 真正來製作一副撲克牌
cards = []
for suit in ['H', 'S', 'C', 'D']:
    cards.extend([str(num) + suit for num in base_names])
    
deck = pd.Series(card_val, index = cards)
deck

In [None]:
# Helper that picks how many cards to take from the deck.
def draw(deck, n = 5):
    """Return ``n`` randomly sampled entries of ``deck`` (5 by default)."""
    hand = deck.sample(n)
    return hand

draw(deck)

In [None]:
# 從每種花色隨機抽兩張牌
get_suit = lambda card: card[-1]
deck.groupby(get_suit).apply(draw, n = 2)

In [None]:
# 當然也可以把匿名函數寫在裡面
deck.groupby(get_suit).apply(lambda x: x.sample(2))

In [None]:
deck.groupby(get_suit, group_keys = False).apply(draw, n = 2)

### 加權平均和關聯性

In [None]:
df = pd.DataFrame(np.array([[1, 2], [3, 4], [5, 4]]),
                  columns = ['data', 'weights'])
df

注意加權平均是要自己乘上【權重/權重相加】<br/>
這題會是 $$1\times\frac{2}{10}+3\times\frac{4}{10}+5\times\frac{4}{10} = 3.4$$

In [None]:
# 可調用 np.average() 函式幫我們處理
np.average(df.data, weights = df.weights)

---

In [None]:
df = pd.DataFrame({'category': ['a', 'a', 'a', 'a',
                                'b', 'b', 'b', 'b'],
                   'data': np.random.randn(8),
                   'weights': np.random.rand(8)})
df

In [None]:
grouped = df.groupby('category')
get_wavg = lambda g: np.average(g['data'], weights = g['weights'])
grouped.apply(get_wavg)

---

In [None]:
close_px = pd.read_csv('./mod08/stock_px_2.csv', 
                       parse_dates = True,
                       index_col = 0)
close_px

In [None]:
# 資料框的 .info() 方法可以簡略看出資料框的資訊
close_px.info()

In [None]:
close_px[:5]

In [None]:
# 計算每個欄位的變化百分比
rets = close_px.pct_change()
rets

In [None]:
# 去除遺失值
rets = rets.dropna()
rets

In [None]:
get_year = lambda x: x.year
by_year  = rets.groupby(get_year)
print_groups(by_year)

In [None]:
# 分完組後，去看每個股票跟 SPX 的相關係數
spx_corr = lambda x: x.corrwith(x['SPX'])
by_year.apply(spx_corr)

In [None]:
# 分完組後，去看 APPLE 股價跟 MSFT 股價的相關係數
by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))

---

In [None]:
import statsmodels.api as sm

def regress(data, yvar, xvars):
    """Fit an ordinary-least-squares model of ``yvar`` on ``xvars`` plus a constant.

    Args:
        data: DataFrame holding the response and regressor columns.
        yvar: Column label of the response variable.
        xvars: List of column labels used as regressors.

    Returns:
        The ``params`` attribute of the fitted statsmodels OLS result.
    """
    Y = data[yvar]
    # Copy the selection so that adding the constant column neither touches
    # `data` nor triggers pandas' SettingWithCopyWarning.
    X = data[xvars].copy()
    X['intercept'] = 1.
    result = sm.OLS(Y, X).fit()
    return result.params

In [None]:
by_year.apply(regress, 'AAPL', ['SPX'])

### 樞紐關係表和交叉表

In [None]:
tips = pd.read_csv('./mod08/tips.csv')
tips['tip_pct'] = tips['tip'] / tips['total_bill']
tips[:6]

In [None]:
# Like .pivot(), pivot_table takes values, index and columns.
# Its default aggregation is the mean of each group, so `values` is limited
# to the numeric columns: averaging non-numeric columns raises TypeError
# since pandas 2.0 instead of silently dropping them.
tips.pivot_table(values = list(tips.select_dtypes(include = 'number').columns),
                 index = ['day', 'smoker'])

In [None]:
tips.pivot_table(values = ['tip_pct', 'size'], index = ['time', 'day'], columns = 'smoker')

In [None]:
# 注意 margins 參數的效果
tips.pivot_table(values = ['tip_pct', 'size'], index = ['time', 'day'], columns = 'smoker', margins = True)

In [None]:
# 跟上面比對一下
tips['size'].mean()

In [None]:
# 跟上上面比對一下
tips.groupby('smoker')[['size', 'tip_pct']].mean()

---

In [None]:
# 第一個位置參數就是 values
tips.pivot_table('tip_pct', index = ['time', 'smoker'], 
                  columns = 'day',
                  aggfunc= len,
                  margins = True)

In [None]:
# 跟上上面比對一下
grp = tips.groupby(['time', 'smoker', 'day'])['tip_pct']
print_groups(grp)

---

In [None]:
tips.pivot_table('tip_pct', index = ['time', 'size', 'smoker'],
                 columns = 'day', aggfunc = 'mean')

In [None]:
tips.pivot_table('tip_pct', index = ['time', 'size', 'smoker'],
                 columns = 'day', aggfunc = 'mean', fill_value = 0)

---

In [None]:
from io import StringIO

# Whitespace-aligned sample table, kept as text so it can be parsed below.
data = """\
Sample  Nationality  Handedness
1   USA  Right-handed
2   Japan    Left-handed
3   USA  Right-handed
4   Japan    Right-handed
5   Japan    Left-handed
6   Japan    Right-handed
7   USA  Right-handed
8   USA  Left-handed
9   Japan    Right-handed
10  USA  Right-handed"""
# The columns are separated by runs of whitespace. The pattern is a raw string
# so that \s reaches the regex engine instead of being treated as an (invalid)
# string escape, which newer Python versions warn about.
data = pd.read_table(StringIO(data), sep = r'\s+')

In [None]:
data

In [None]:
# 交叉表是一種樞紐分析表的特殊例子，專用來計算分組的頻率
# pd.crosstab 是全域函式，所以參數沒辦法帶入像 'Nationality' 之類的字串
# 只帶入 index 與 columns 參數，會幫你算每組的個數
pd.crosstab(index = data.Nationality, columns = data.Handedness, margins = True)

In [None]:
data.pivot_table('Sample', index = 'Nationality', columns = 'Handedness', margins= True, aggfunc = 'count')

In [None]:
grp = data.groupby(['Nationality', 'Handedness'])
print_groups(grp)

In [None]:
grp.count()

---

In [None]:
pd.crosstab(index = [tips.time, tips.day], columns = tips.smoker, margins = True)

---

In [None]:
data = """\
Sample  Nationality  Handedness
1   USA  Right-handed
2   Japan    Left-handed
3   USA  Right-handed
4   Japan    Right-handed
5   Japan    Left-handed
6   Japan    Right-handed
7   USA  Right-handed
8   USA  Left-handed
9   Japan    Right-handed
10  USA  Right-handed"""

In [None]:
lst = data.split('\n')

In [None]:
lst

In [None]:
pd.Series(lst)

In [None]:
# The fields are separated by runs of whitespace, hence the regex r'\s+'
# (a raw string, so that \s is not parsed as an invalid string escape).
s = pd.Series(lst).str.split(r'\s+')
s

In [None]:
# Build the empty frame first. It uses object dtype because the rows filled
# in afterwards are strings, which float columns could only take by upcasting
# (deprecated in pandas 2.1+). The column count comes from the header row
# rather than a hard-coded 3.
my_df = pd.DataFrame(np.zeros((len(s) - 1, len(s[0])), dtype = object), columns = s[0])
my_df

In [None]:
for i, val in enumerate(s[1:]):
    my_df.iloc[i] = val

In [None]:
my_df