<a href="https://colab.research.google.com/github/sasuraibito1125/google_colab/blob/main/F%E5%88%86%E5%B8%83%E8%A1%A8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 関数定義

In [1]:
# @title #### 四捨五入関数 { vertical-output: true, display-mode: "form" }
#@markdown 関数名: `round_up`
#@markdown * 引数：`precision=4`, `digit=None`
#@markdown * 戻り値：lambda function
def round_up(precision=4, digit=None):
  '''Build a function that rounds a number half-up (not banker's rounding).

  ex)
  df.map(round_up(digit='0.0001'))
  df.map(round_up(digit='1E1'))
  df.map(round_up(-1))   # same as digit='1E1'

  Parameters
  ----------
  precision: number of decimal places to keep. default is 4.
    0 rounds to an integer; a negative value rounds to tens, hundreds, ...
  digit: string giving the rounding quantum (e.g. '0.01', '1E1').
    default is None (not used). When given, it takes priority over precision.

  Returns
  -------
  function
    one-argument function that returns the rounded value as float.
    NaN and +/-inf are returned unchanged (as float).
  '''
  from decimal import Decimal, ROUND_HALF_UP

  # Build the quantum once instead of on every call of the returned function.
  if digit:
    quantum = Decimal(digit)
  else:
    # 10 ** -precision; scaleb also covers negative precision (1E+1, 1E+2, ...).
    quantum = Decimal(1).scaleb(-int(precision))

  def round_half_up(x):
    # Go through str() so the value is rounded as it is displayed
    # (2.675 -> '2.675'), not as its binary float expansion.
    value = Decimal(str(x))
    if not value.is_finite():
      # quantize() raises InvalidOperation for infinities; pass NaN/inf through.
      return float(value)
    return float(value.quantize(quantum, rounding=ROUND_HALF_UP))

  return round_half_up

In [2]:
# @title #### F分布表生成関数 { vertical-output: true, display-mode: "form" }
# @markdown 関数名：`generate_fd_table`
# @markdown * 引数：`alpha`, `v1=None`, `v2=None`, `precision=4`
# @markdown * 戻り値：`DataFrame`
# @markdown * 依存関数：`round_up`
def generate_fd_table(alpha, v1=None, v2=None, precision=4):
  '''Generate a table of upper-tail critical values of the F distribution.

  Parameters
  ----------
  alpha: upper-tail probability (e.g. 0.05 for the upper 5% point).
  v1: list of first degrees of freedom (table columns).
    if not defined (None or empty), values are 1 to 30 and 40 to 100 per 10 steps
  v2: list of second degrees of freedom (table rows).
    if not defined (None or empty), values are 1 to 30 and 40 to 100 per 10 steps
  precision: number of decimal places of the F-values. default 4.

  Returns
  -------
  DataFrame
    table of F distribution. Rows are labelled by v2 and columns by v1,
    both as strings formatted with "%d".
  '''
  import numpy as np
  import pandas as pd
  from scipy.stats import f

  def generate_default_v():
    # 1..30 followed by 40, 50, ..., 100
    return np.concatenate((np.arange(1, 31, dtype=int),
                           np.linspace(40, 100, 7, dtype=int)))

  def ensure_v(v):
    # Test for None/empty explicitly: `if v` raises ValueError when v is a
    # numpy array with more than one element.
    if v is None:
      return generate_default_v()
    arr = np.atleast_1d(v).ravel()
    return arr if arr.size else generate_default_v()

  v1 = ensure_v(v1)
  v2 = ensure_v(v2)

  # Broadcast v1 (row vector) against v2 (column vector) to fill the grid.
  # Labels are built from the 1-D arrays so each element is a true scalar.
  df = pd.DataFrame(f.isf(alpha, v1, v2.reshape(-1, 1)),
                    columns=["%d" % x for x in v1],
                    index=["%d" % x for x in v2])
  df.index.name = 'v2↓v1→'

  # DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1);
  # fall back to applymap on older pandas.
  elementwise = df.map if hasattr(df, 'map') else df.applymap
  return elementwise(round_up(precision))


In [3]:
# @title #### 表の行と列を強調する関数
# @markdown 関数名：`highlight_subject`
# @markdown * 引数：`df`, `row=None`, `col=None`, `row_color='#FFD0FF'`, `col_color='#D1FDFF'`, `cross_color='#C7B5FF'`
# @markdown * 戻り値：`Styler`
def highlight_subject(df, row=None, col=None,
                      row_color='#FFD0FF',
                      col_color='#D1FDFF',
                      cross_color='#C7B5FF'):
  '''highlight row, column and cross point of DataFrame.

  Parameters
  ----------
  df: a target pandas DataFrame to be highlighted
  row: a target row to be highlighted
  col: a target column to be highlighted
  row_color: a highlight color of row
  col_color: a highlight color of column
  cross_color: a highlight color of cross point of highligh row and column

  Returns
  -------
  Styler
    pandas Styler object to be highlighted
  '''
  import pandas as pd

  idx = pd.IndexSlice
  styler = df.style
  if row:
    styler = styler.set_properties(**{'color':'black', 'background-color': row_color},
                                   subset=idx[idx[row], :])
  if col:
    styler = styler.set_properties(**{'color':'black', 'background-color': col_color},
                                   subset=idx[col])
  if col and row:
    styler = styler.set_properties(**{'color':'black', 'background-color': cross_color},
                                   subset=idx[idx[row], idx[col]])
  return styler


In [4]:
# @title #### F分布表から指定された自由度の $F$ 値を取得する関数 { vertical-output: true, display-mode: "form" }
# @markdown 関数名：`find_f_in_fd_table`
# @markdown * 引数：`fd`, `v1`, `v2`
# @markdown * 戻り値：`float`
def find_f_in_fd_table(fd, v1, v2):
  '''Look up the F-value for a pair of degrees of freedom in an F distribution table.

  Parameters
  ----------
  fd: pandas DataFrame of F distribution table
    (rows labelled by v2, columns labelled by v1, both as integer strings).
  v1: first degrees of freedom (integer).
  v2: second degrees of freedom (integer).

  Returns
  -------
  float
    the table cell at row v2, column v1.
  '''
  # The table is addressed by string labels, so format the integers first.
  row_label = format(v2, 'd')
  col_label = format(v1, 'd')
  return fd.at[row_label, col_label]


In [5]:
# @title #### 一元配置分散分析の分散分析表生成関数 { vertical-output: true, display-mode: "form" }
# @markdown 関数名：`to_anova`
# @markdown * 引数：`data`, `precision=3`
# @markdown * 戻り値：`DataFrame`
def to_anova(data, precision=3):
  '''Convert original data to a one-way analysis of variance table.

  Each row of data is treated as one group (level of the factor); missing
  values are ignored. Intermediate results are rounded half-up at every step.

  Parameters
  ----------
  data: DataFrame of original data.
  precision: precision of values calculated except only F-value is precision + 2.

  Returns
  -------
  DataFrame
    analysis of variance table with rows 要因 / 残差 / 全体 and columns
    平方和 / 自由度 / 平均平方 / F値. Cells that do not apply are ''.
  '''
  import numpy as np
  import pandas as pd

  # Named rnd (not round) so the builtin is not shadowed.
  rnd = round_up(precision)
  rnd_f = round_up(precision + 2)
  np_rnd = np.frompyfunc(rnd, 1, 1)

  grand_mean = np_rnd(np.nanmean(data))

  # Sums of squares: total, and between groups (rows).
  ss_total = rnd(((data - grand_mean)**2).sum().sum())
  group_means = data.mean(axis=1, numeric_only=True).apply(rnd)
  group_sizes = data.count(axis=1, numeric_only=True)
  ss_factor = np_rnd(((group_means - grand_mean)**2 * group_sizes).sum())

  dof_factor = len(data.index) - 1
  dof_total = data.count().sum() - 1

  anova = pd.DataFrame({
                      '平方和': [ss_factor, rnd(ss_total - ss_factor), ss_total],
                      '自由度': [dof_factor, dof_total - dof_factor, dof_total],
                    }, index=['要因', '残差', '全体'])

  mean_sq = (anova['平方和'] / anova['自由度']).apply(rnd)
  f_value = rnd_f(mean_sq['要因'] / mean_sq['残差'])

  # Use object dtype for the columns that mix numbers with blank cells;
  # writing '' into a float column relies on deprecated silent upcasting.
  anova['平均平方'] = mean_sq.astype(object)
  anova.loc['全体', '平均平方'] = ''
  anova['F値'] = pd.Series([f_value, '', ''], index=anova.index, dtype=object)

  # Blank out any remaining NaN cells.
  return anova.fillna('')


In [None]:
# @title #### 二元配置分散分析の分散分析表生成関数 { vertical-output: true, display-mode: "form" }
# @markdown 関数名：`to_2w_anova`
# @markdown * 引数：`data`, `factors`, `repeating=True`, `precision=5`
# @markdown * 戻り値：`tuple<DataFrame, DataFrame>`
# @markdown * 依存関数：`round_up`
def to_2w_anova(data, factors, repeating=True, precision=5):
  '''Convert original data to a 2-way analysis of variance table.

  Columns of data are the levels of the first factor and the (level 0) index
  labels are the levels of the second factor. With repeating=True, the rows
  that share an index label are the repeated observations of that level.
  Intermediate results are rounded half-up at every step.

  Parameters
  ----------
  data: DataFrame of original data.
  factors: list of names of factors: [column factor, row factor].
  repeating: with repeating data or not. default value is True.
  precision: precision of values calculated. default value is 5.

  Returns
  -------
  tuple
    2-way analysis of variance table and table of various means.
    The interaction row is dropped when repeating is False.
  '''
  import pandas as pd
  import numpy as np

  # Named rnd (not round) so the builtin is not shadowed.
  rnd = round_up(precision)

  ## common values
  df_cols = data.columns
  df_idx = data.index.unique(0)
  # number of rows under the first index label (used only when repeating)
  idx_ndups = len(data.loc[df_idx[0]].index)

  ## various means: per cell, per column, per row and overall
  df_means = pd.DataFrame({c: [data.loc[i, c].mean() for i in df_idx] for c in df_cols},
                          index=df_idx)
  df_means.loc['全体'] = data.mean()
  df_means['全体'] = pd.Series({i: np.nanmean(data.loc[i]) for i in df_idx})
  df_means.loc['全体', '全体'] = np.nanmean(data)
  # DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1);
  # fall back to applymap on older pandas.
  elementwise = df_means.map if hasattr(df_means, 'map') else df_means.applymap
  df_means = elementwise(rnd)

  df_mean_all = df_means.loc['全体', '全体']
  df_col_means = df_means.loc['全体']
  df_row_means = df_means['全体']
  df_pair_means = df_means.iloc[:-1, :-1]

  ## sum of squares
  sos_all = rnd(((data - df_mean_all)**2).sum().sum())
  sos_col = rnd(((df_col_means - df_mean_all)**2 * len(data.index)).sum().sum())
  sos_row = rnd(sum([(df_row_means[i] - df_mean_all)**2 * (len(df_cols) * (len(data.loc[i]) if repeating else 1)) for i in df_idx]))
  sos_pair = rnd(((df_pair_means - df_mean_all)**2 * idx_ndups).sum().sum() - sos_col - sos_row) if repeating else 0.0
  # Round the residual like every other sum of squares so float noise from
  # the subtraction does not leak into the table.
  sos_re = rnd(sos_all - sos_col - sos_row - sos_pair)

  ## degrees of freedom
  dof_all = data.count().sum() - 1
  dof_col = len(df_cols) - 1
  dof_row = len(df_idx) - 1
  dof_pair = dof_col * dof_row if repeating else 0
  dof_re = dof_all - dof_col - dof_row - dof_pair

  ## mean squares
  ms_col = rnd(sos_col / dof_col)
  ms_row = rnd(sos_row / dof_row)
  ms_pair = rnd(sos_pair / dof_pair) if repeating else 0.0
  ms_re = rnd(sos_re / dof_re)

  ## F-values
  f_col = rnd(ms_col / ms_re)
  f_row = rnd(ms_row / ms_re)
  f_pair = rnd(ms_pair / ms_re) if repeating else 0.0

  interaction = f'{factors[0]}×{factors[1]}'
  anova = pd.DataFrame({
      '平方和': [sos_col, sos_row, sos_pair, sos_re, sos_all],
      '自由度': [dof_col, dof_row, dof_pair, dof_re, dof_all],
      '平均平方': [ms_col, ms_row, ms_pair, ms_re, ''],
      'F値': [f_col, f_row, f_pair, '', ''],
  }, index=[*factors, interaction, '残差', '全体'])
  if not repeating:
    # Without repetition the interaction cannot be separated from the residual.
    anova = anova.drop(index=interaction)
  return anova, df_means

In [None]:
# @title #### 分散分析におけるF検定の判定 { vertical-output: true }
# @markdown 関数名：`decide_f_test`
# @markdown * 引数：`avt2d`, `alpha`
# @markdown * 戻り値：`DataFrame`
# @markdown * 依存関数：`generate_fd_table`, `find_f_in_fd_table`
def decide_f_test(avt2d, alpha):
  '''Decide the F-test for each factor row of an analysis of variance table.

  Parameters
  ----------
  avt2d: pandas DataFrame of an analysis of variance table. The degrees of
    freedom (自由度) must be the 2nd column and the F-value (F値) the 4th,
    and the table must contain a 残差 (residual) row.
  alpha: upper-tail probability used as the significance level.

  Returns
  -------
  DataFrame
    one row per factor with columns
    自由度 / 残差の自由度(v2) / 棄却域 / F値 / 帰無仮説.
  '''
  # Test every row except the residual and total rows, so tables with or
  # without an interaction row both work. copy() because columns are added.
  is_factor = ~avt2d.index.isin(['残差', '全体'])
  result = avt2d.loc[is_factor].iloc[:, [1, 3]].copy() # pick dof and f-value (as base DataFrame)

  v2 = int(avt2d.loc['残差', '自由度'])
  result['残差の自由度(v2)'] = v2

  # Build the F table only for the degrees of freedom that are looked up.
  v1_values = sorted({int(v) for v in result['自由度']})
  fd = generate_fd_table(alpha, v1_values, [v2])
  result['棄却域'] = [find_f_in_fd_table(fd, int(v1), v2) for v1 in result['自由度']]

  # Reject the null hypothesis when F exceeds the critical value.
  result['帰無仮説'] = result.apply(lambda row: '棄却する' if row['棄却域'] < row['F値'] else '棄却しない', axis=1)
  return result.iloc[:, [0, 2, 3, 1, 4]]

# 利用例

In [6]:
# @title #### 利用例：`generate_fd_table` { run: "auto", display-mode: "both" }
# @markdown F分布表の表示（抜粋）
# @markdown
# @markdown 上側$\alpha\%$点を指定
# Upper-tail probability for the table (editable through the Colab form).
alpha = 0.05 #@param {type:"number"}
# Build the table with the default degrees of freedom; `fd` is reused by later cells.
fd = generate_fd_table(alpha)
# Show the first 15 rows and the first 10 columns.
fd.iloc[:15, :10]

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10
v2↓v1→,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,161.4476,199.5,215.7073,224.5832,230.1619,233.986,236.7684,238.8827,240.5433,241.8817
2,18.5128,19.0,19.1643,19.2468,19.2964,19.3295,19.3532,19.371,19.3848,19.3959
3,10.128,9.5521,9.2766,9.1172,9.0135,8.9406,8.8867,8.8452,8.8123,8.7855
4,7.7086,6.9443,6.5914,6.3882,6.2561,6.1631,6.0942,6.041,5.9988,5.9644
5,6.6079,5.7861,5.4095,5.1922,5.0503,4.9503,4.8759,4.8183,4.7725,4.7351
6,5.9874,5.1433,4.7571,4.5337,4.3874,4.2839,4.2067,4.1468,4.099,4.06
7,5.5914,4.7374,4.3468,4.1203,3.9715,3.866,3.787,3.7257,3.6767,3.6365
8,5.3177,4.459,4.0662,3.8379,3.6875,3.5806,3.5005,3.4381,3.3881,3.3472
9,5.1174,4.2565,3.8625,3.6331,3.4817,3.3738,3.2927,3.2296,3.1789,3.1373
10,4.9646,4.1028,3.7083,3.478,3.3258,3.2172,3.1355,3.0717,3.0204,2.9782


In [7]:
# @title #### 利用例：`generate_fd_table` { run: "auto", display-mode: "both" }
# @markdown F分布表の表示（上側$\alpha\%$点および自由度$v_1, v_2$を指定）
# @markdown
# @markdown $\alpha$ と自由度
# Upper-tail probability and explicit lists of degrees of freedom
# (all editable through the Colab form).
alpha = 0.05 #@param {type:"number"}
v1 = [1, 2, 3, 4, 5, 6, 7] #@param {type:"raw"}
v2 = [1, 2, 3, 4, 5, 10, 15, 20, 50, 100] #@param {type:"raw"}

# Table restricted to the requested v1 (columns) and v2 (rows).
generate_fd_table(alpha=alpha, v1=v1, v2=v2)

Unnamed: 0_level_0,1,2,3,4,5,6,7
v2↓v1→,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,161.4476,199.5,215.7073,224.5832,230.1619,233.986,236.7684
2,18.5128,19.0,19.1643,19.2468,19.2964,19.3295,19.3532
3,10.128,9.5521,9.2766,9.1172,9.0135,8.9406,8.8867
4,7.7086,6.9443,6.5914,6.3882,6.2561,6.1631,6.0942
5,6.6079,5.7861,5.4095,5.1922,5.0503,4.9503,4.8759
10,4.9646,4.1028,3.7083,3.478,3.3258,3.2172,3.1355
15,4.5431,3.6823,3.2874,3.0556,2.9013,2.7905,2.7066
20,4.3512,3.4928,3.0984,2.8661,2.7109,2.599,2.514
50,4.0343,3.1826,2.79,2.5572,2.4004,2.2864,2.1992
100,3.9361,3.0873,2.6955,2.4626,2.3053,2.1906,2.1025


In [8]:
# @title #### 利用例：`highlight_subject` { run: "auto", display-mode: "both" }
# @markdown F分布表の指定された自由度の $F$ 値を強調表示
# @markdown
# @markdown 自由度
# Degrees of freedom whose row/column should be highlighted.
v1 = 5 #@param {type:"integer"}
v2 = 10 #@param {type:"integer"}
# The table labels are strings, so format the integers before passing them:
# v2 selects the row, v1 selects the column.
highlight_subject(fd.head(15).iloc[:, :10], f'{v2:d}', f'{v1:d}')

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10
v2↓v1→,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,161.4476,199.5,215.7073,224.5832,230.1619,233.986,236.7684,238.8827,240.5433,241.8817
2,18.5128,19.0,19.1643,19.2468,19.2964,19.3295,19.3532,19.371,19.3848,19.3959
3,10.128,9.5521,9.2766,9.1172,9.0135,8.9406,8.8867,8.8452,8.8123,8.7855
4,7.7086,6.9443,6.5914,6.3882,6.2561,6.1631,6.0942,6.041,5.9988,5.9644
5,6.6079,5.7861,5.4095,5.1922,5.0503,4.9503,4.8759,4.8183,4.7725,4.7351
6,5.9874,5.1433,4.7571,4.5337,4.3874,4.2839,4.2067,4.1468,4.099,4.06
7,5.5914,4.7374,4.3468,4.1203,3.9715,3.866,3.787,3.7257,3.6767,3.6365
8,5.3177,4.459,4.0662,3.8379,3.6875,3.5806,3.5005,3.4381,3.3881,3.3472
9,5.1174,4.2565,3.8625,3.6331,3.4817,3.3738,3.2927,3.2296,3.1789,3.1373
10,4.9646,4.1028,3.7083,3.478,3.3258,3.2172,3.1355,3.0717,3.0204,2.9782


In [9]:
# @title 利用例：`find_f_in_fd_table` { run: "auto", vertical-output: true, display-mode: "both" }
# @markdown 自由度を指定して $F$ 値を計算
# @markdown
# @markdown 自由度
# Degrees of freedom to look up in the table `fd` built in an earlier cell.
v1 = 5 #@param {type:"integer"}
v2 = 10 #@param {type:"integer"}
find_f_in_fd_table(fd, v1, v2)

3.3258

In [10]:
# @title #### 利用例：`generate_fd_table`、`find_f_in_fd_table` { run: "auto", vertical-output: true, display-mode: "both" }
# @markdown $\alpha$ と自由度を指定して $F$ 値を計算
# @markdown
# @markdown $\alpha$ と自由度
# Upper-tail probability and degrees of freedom (editable through the Colab form).
alpha = 0.025 # @param {type:"number"}
v1 = 5 # @param {type:"integer"}
v2 = 3 # @param {type:"integer"}
# Build a fresh default table for this alpha and read a single F-value from it.
find_f_in_fd_table(generate_fd_table(alpha), v1, v2)

14.8848

In [11]:
# @title #### 利用例：`to_anova` { run: "auto", vertical-output: true, display-mode: "both" }
# @markdown 一元配置分散分析の分散分析表の作成
import pandas as pd
# Sample data: one row per region (see `index`), columns '1'..'9' hold the
# observations; None marks a missing observation.
data = {
      '1': [21.0, 92.0,  2.0,  8.0,  8.0,  2.0, 13.0],
      '2': [12.0, 15.0, 18.0, 20.0, 13.0,  3.0,  2.0],
      '3': [ 8.0,  6.0,  5.0, 11.0,  5.0,  3.0,  2.0],
      '4': [11.0,  5.0,  5.0, 34.0,  0.0,  1.0,  5.0],
      '5': [ 6.0,  5.0, 21.0,  4.0,  1.0, None,  5.0],
      '6': [ 3.0,  5.0,  6.0,  0.0, None, None,  9.0],
      '7': [ 4.0,  5.0,  0.0, 10.0, None, None,  4.0],
      '8': [None, None,  2.0, None, None, None,  7.0],
      '9': [None, None, 14.0, None, None, None, None]
    }
index = ['北海道・東北', '関東', '中部', '近畿', '中国', '四国', '九州・沖縄']
# Build the one-way ANOVA table with the default precision.
to_anova(pd.DataFrame(data, index=index))

Unnamed: 0,平方和,自由度,平均平方,F値
要因,1108.27,6,184.712,0.92624
残差,7976.836,40,199.421,
全体,9085.106,46,,


In [None]:
# @title #### 利用例：`to_2w_anova` { run: "auto", vertical-output: true, display-mode: "both" }
# @markdown 二元配置分散分析の分散分析表の作成
import pandas as pd
# Sample data with repetition: columns are the fertilizer amounts, and the
# rows sharing an index label are the repeated observations for that soil.
data = {
      '肥料\u3000100g': [14.5, 15.1, 14.1, 16.2, 15.3, 17.5],
      '肥料\u3000200g': [16.5, 16.1, 15.0, 18.6, 16.9, 18.6],
      '肥料\u3000300g': [17.8, 19.0, 15.2, 21.7, 20.5, 19.4],
      '肥料\u3000400g': [18.1, 20.2, 17.2, 23.6, 24.9, 25.5]
      }
index = ['土A', '土A', '土A', '土B', '土B', '土B']
# Named input_df (not `input`) so the builtin input() is not shadowed
# for the rest of the notebook session.
input_df = pd.DataFrame(data, index=index)
anova_2w, means = to_2w_anova(input_df, ['肥料の量', '土の種類'], precision=2)


In [None]:
# @title ##### 平均値の表 { vertical-output: true, display-mode: "form" }
# Display the table of means returned by to_2w_anova in the previous cell.
means

Unnamed: 0,肥料　100g,肥料　200g,肥料　300g,肥料　400g,全体
土A,14.57,15.87,17.33,18.5,16.57
土B,16.33,18.03,20.53,24.67,19.89
全体,15.45,16.95,18.93,21.58,18.23


In [None]:
# @title ##### 分散分析表 { display-mode: "form" }
# Display the 2-way ANOVA table (with repetition) computed above.
anova_2w

Unnamed: 0,平方和,自由度,平均平方,F値
肥料の量,126.48,3,42.16,29.48
土の種類,66.13,1,66.13,46.24
肥料の量×土の種類,18.18,3,6.06,4.24
残差,22.88,16,1.43,
全体,233.67,23,,


In [None]:
# @title #### 利用例：`to_2w_anova` { run: "auto", vertical-output: true, display-mode: "both" }
# @markdown 二元配置分散分析の分散分析表の作成
import pandas as pd
# Sample data without repetition: one observation per (soil, fertilizer) cell.
data = {
      '肥料\u3000100g': [14.5, 16.2],
      '肥料\u3000200g': [16.5, 18.6],
      '肥料\u3000300g': [17.8, 21.7],
      '肥料\u3000400g': [18.1, 23.6]
      }
index = ['土A', '土B']
# Named input_df (not `input`) so the builtin input() is not shadowed
# for the rest of the notebook session.
input_df = pd.DataFrame(data, index=index)
# repeating=False, precision=2
anova_2w_2, means_2 = to_2w_anova(input_df, ['肥料の量', '土の種類'], False, 2)

In [None]:
# @title ##### 平均値の表 { vertical-output: true, display-mode: "form" }
# Display the table of means for the data without repetition.
means_2

Unnamed: 0,肥料　100g,肥料　200g,肥料　300g,肥料　400g,全体
土A,14.5,16.5,17.8,18.1,16.73
土B,16.2,18.6,21.7,23.6,20.03
全体,15.35,17.55,19.75,20.85,18.38


In [None]:
# @title ##### 分散分析表 { display-mode: "form" }
# Display the 2-way ANOVA table for the data without repetition.
anova_2w_2

Unnamed: 0,平方和,自由度,平均平方,F値
肥料の量,35.7,3,11.9,7.78
土の種類,21.78,1,21.78,14.24
残差,4.6,3,1.53,
全体,62.08,7,,


In [None]:
# @title #### 利用例：`decide_f_test` { run: "auto", vertical-output: true, display-mode: "both" }
# @markdown 分散分析におけるF検定の判定（分散分析表は前の利用例の結果を利用）
# @markdown
# @markdown $\alpha$
# Upper-tail probability for the test (editable through the Colab form).
alpha = 0.05 # @param {type:"number"}
# Judge each factor of the ANOVA table `anova_2w` produced by an earlier cell.
decide_f_test(anova_2w, alpha)

Unnamed: 0,自由度,残差の自由度(v2),棄却域,F値,帰無仮説
肥料の量,3,16,3.2389,29.48,棄却する
土の種類,1,16,4.494,46.24,棄却する
肥料の量×土の種類,3,16,3.2389,4.24,棄却する
