In [1]:
import numpy as np
import pandas as pd


### サンプルデータ作成

In [2]:
# Sample-data setup: a seeded generator plus the underlying positive-class
# probability of each sample, evenly spaced from 0 to 1.
rand = np.random.RandomState(seed=71)
train_y_prob = np.linspace(0, 1.0, 10000)

# Assume the ground truth and the model output are train_y and train_pred_prob below.
# Labels: a sample is positive when a uniform draw falls below its probability.
# Predictions: the probability scaled by exp(0.3 * standard-normal noise),
# then clipped back into [0, 1].
# NOTE: the uniform draw must stay ahead of the normal draw so the random
# stream (and therefore the data) is unchanged.
train_y = pd.Series(
    train_y_prob > rand.uniform(low=0.0, high=1.0, size=len(train_y_prob))
)
train_pred_prob = (
    train_y_prob * np.exp(0.3 * rand.standard_normal(size=train_y_prob.shape))
).clip(min=0.0, max=1.0)

In [3]:
# Sanity check: number of labels vs. number of underlying probabilities.
train_y.shape[0], train_y_prob.shape[0]

(10000, 10000)

### 閾値の確認

In [4]:
from sklearn.metrics import f1_score

In [5]:
# Scan 19 evenly spaced cut-offs between 0.1 and 1.0 (steps of 0.05) and
# report the F1 score obtained when predictions are binarised at each one.
candidate_thresholds = np.linspace(0.1, 1.0, 19)
for threshold in candidate_thresholds:
    predicted_labels = train_pred_prob >= threshold
    score = f1_score(train_y, predicted_labels)
    print(f'th: {threshold:.2f} => score: {score}')

th: 0.10 => score: 0.7048616090635307
th: 0.15 => score: 0.7209527353926313
th: 0.20 => score: 0.7350096711798839
th: 0.25 => score: 0.7449333871618893
th: 0.30 => score: 0.7535669058674547
th: 0.35 => score: 0.7553557259984133
th: 0.40 => score: 0.7523291209298035
th: 0.45 => score: 0.7410358565737052
th: 0.50 => score: 0.7224831529507862
th: 0.55 => score: 0.6983546617915904
th: 0.60 => score: 0.6699931957359946
th: 0.65 => score: 0.6357552581261952
th: 0.70 => score: 0.596690665656183
th: 0.75 => score: 0.5498867723458106
th: 0.80 => score: 0.5026543727298128
th: 0.85 => score: 0.44897959183673475
th: 0.90 => score: 0.3956956187548039
th: 0.95 => score: 0.3467007509186771
th: 1.00 => score: 0.29940318302387264


### 最適な閾値を求める

In [6]:
from scipy.optimize import minimize

In [7]:
def f1_opt(x):
    """Objective function for threshold optimisation.

    Binarises ``train_pred_prob`` at threshold ``x`` and returns the F1 score
    against ``train_y`` with its sign flipped, so that a minimiser ends up
    maximising F1.
    """
    predicted_labels = train_pred_prob >= x
    return -f1_score(train_y, predicted_labels)

In [8]:
# Starting point for the threshold search.
init_threshold = 0.5

# Minimise the negated F1 with Nelder-Mead, starting from a one-element array.
start_point = np.array([init_threshold])
result = minimize(fun=f1_opt, x0=start_point, method='Nelder-Mead')

# The solution is a one-element array; unwrap it into a plain Python scalar.
best_threshold = result.x.item()

# Re-score at the optimum to report the (positive) F1 value.
best_predicted_labels = train_pred_prob >= best_threshold
score = f1_score(train_y, best_predicted_labels)
print(f'th: {best_threshold:.2f} => score: {score}')

th: 0.32 => score: 0.7557317703844165
