# 202_6.2_行動数_2_s-learner_q使用.ipynb

## 6.2 プラットフォーム全体で観測される報酬を最適化する方策学習

In [38]:
import warnings
warnings.filterwarnings('ignore')

import torch
import numpy as np
import pandas as pd
from pandas import DataFrame
from tqdm import tqdm
from sklearn.utils import check_random_state
import seaborn as sns
import matplotlib.pyplot as plt
import japanize_matplotlib
plt.style.use('ggplot')

from dataset import generate_synthetic_data2
from policylearners import IPSBasedGradientPolicyLearner, CateBasedGradientPolicyLearner
from utils import softmax

import math
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import rankdata
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from xgboost import XGBRegressor

### ログデータ(トレーニングデータ)のサイズ$n$を変化させたときの方策性能の変化

In [39]:
## シミュレーション設定
num_runs = 50 # シミュレーションの繰り返し回数
dim_x = 5 # 特徴量xの次元
num_actions = 10 # 行動数, |A|
beta = -0.1 # データ収集方策のパラメータ
max_iter = 21 # エポック数
test_data_size = 50000 # テストデータのサイズ
random_state = 12345
torch.manual_seed(random_state)
random_ = check_random_state(random_state)
num_actions_list = [2, 10] # 行動数
num_data_list = [100, 500, 1000, 2000] # トレーニングデータのサイズ

In [40]:
num_actions = num_actions_list[1]
## 期待報酬関数を定義するためのパラメータを抽出
theta_1 = random_.normal(size=(dim_x, num_actions))
M_1 = random_.normal(size=(dim_x, num_actions))
b_1 = random_.normal(size=(1, num_actions))
theta_0 = random_.normal(size=(dim_x, num_actions))
M_0 = random_.normal(size=(dim_x, num_actions))
b_0 = random_.normal(size=(1, num_actions))
## 学習された方策の真の性能を近似するためのテストデータを生成
test_data = generate_synthetic_data2(
    num_data=test_data_size, beta=beta,
    theta_1=theta_1, M_1=M_1, b_1=b_1, theta_0=theta_0, M_0=M_0, b_0=b_0,
    dim_context=dim_x, num_actions=num_actions, random_state = random_state
)
pi_0, q_x_a_1, q_x_a_0 = test_data["pi_0"], test_data["q_x_a_1"], test_data["q_x_a_0"]
pi_0_value = (pi_0 * q_x_a_1 + (1. - pi_0) * q_x_a_0).sum(1).mean()

In [41]:
num_data = num_data_list[3]
test_policy_value_list = []
_ = 0

## データ収集方策が形成する分布に従いログデータを生成
offline_logged_data = generate_synthetic_data2(
    num_data=num_data, beta=beta,
    theta_1=theta_1, M_1=M_1, b_1=b_1, theta_0=theta_0, M_0=M_0, b_0=b_0,
    dim_context=dim_x, num_actions=num_actions, random_state = _
)

true_value_of_learned_policies = dict()
true_value_of_learned_policies["logging"] = pi_0_value

## ログデータを用いてオフ方策学習を実行する
### 勾配ベースのアプローチ (IPS推定量で方策勾配を推定)
ips = IPSBasedGradientPolicyLearner(
    dim_x=dim_x, num_actions=num_actions, max_iter=max_iter, random_state=random_state + _
)
ips.fit(offline_logged_data, test_data)
pi_ips = ips.predict(test_data)
true_value_of_learned_policies["ips-pg"] = (pi_ips * q_x_a_1 + (1 - pi_ips) * q_x_a_0).sum(1).mean()
ips_pg = (pi_ips * q_x_a_1 + (1 - pi_ips) * q_x_a_0).sum(1).mean()
### 勾配ベースのアプローチ (新たに開発した推定量で方策勾配を推定)
cate = CateBasedGradientPolicyLearner(
    dim_x=dim_x, num_actions=num_actions, max_iter=max_iter, random_state=random_state + _
)
cate.fit(offline_logged_data, test_data)
pi_cate = cate.predict(test_data)
true_value_of_learned_policies["cate-pg"] = (pi_cate * q_x_a_1 + (1 - pi_cate) * q_x_a_0).sum(1).mean()
cate_pg = (pi_cate * q_x_a_1 + (1 - pi_cate) * q_x_a_0).sum(1).mean()

test_policy_value_list.append(true_value_of_learned_policies)

## 各方策・報酬期待値理解
### 旧方策

In [42]:
print(f'旧方策報酬: {pi_0_value:.3f}')

旧方策報酬: 2.127


### IPS(勾配)

In [43]:
print(f'IPS(勾配)による方策報酬: {ips_pg:.3f}')

IPS(勾配)による方策報酬: 1.895


### Newによる方策報酬

In [44]:
print(f'Newによる方策報酬: {cate_pg:.3f}')

Newによる方策報酬: 3.589


## ここから佐々木の方で方策学習を行い、それによる報酬期待値を得る
### ちなみにランダムで推薦アイテムを決めた場合の期待報酬を計算してみる

In [45]:
pi_random = np.full((50000, num_actions), 1/num_actions)
radom_pg = (pi_random * q_x_a_1 + (1 - pi_random) * q_x_a_0).sum(1).mean()
print(f'ランダム割り当てによる方策報酬: {radom_pg:.3f}')

ランダム割り当てによる方策報酬: 2.280


(まさかのIPS以上New未満...)

### qを使用したs-learnerでやってみる

In [46]:
test_data['q_x_a_0']

array([[ -1.73087096,   0.93077985,  -0.17424915, ...,   2.17762938,
          2.35939733,   0.77054432],
       [ -0.19577258,   1.44716031,   1.26053243, ...,   1.13698243,
          1.58204252,  -1.65177736],
       [ -2.01676091,  -0.5535038 ,  -0.28647523, ...,   1.52757023,
          1.43643854,   0.88138895],
       ...,
       [ -1.97569397,   1.20505265,   0.40875011, ...,   1.13278874,
          1.28206285,   1.32277244],
       [ -1.26091916,   1.22896602,  -1.00946736, ...,   0.19725775,
          1.56322632,   1.11542478],
       [-11.12456726,  -4.79121056,   2.68104132, ...,  -1.1109626 ,
         -1.31255622,   1.04566184]])

In [47]:
# 仮に test_data, q_x_a_0, q_x_a_1 を定義しているとする
x = test_data['x']           # (サンプル数, 5)
q_x_a_0 = test_data['q_x_a_0']  # (サンプル数, 10)
q_x_a_1 = test_data['q_x_a_1']  # (サンプル数, 10)

# データを横に結合
combined = np.hstack([x, q_x_a_0, q_x_a_1])

# カラム名作成
num_x = x.shape[1]           # 5
num_q = q_x_a_0.shape[1]     # 10

columns = (
    [f'x_{i}' for i in range(num_x)] +
    [f'q_x_a_0_{i}' for i in range(num_q)] +
    [f'q_x_a_1_{i}' for i in range(num_q)]
)

# データフレームに
test_data_df = pd.DataFrame(combined, columns=columns)

test_data_df

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,q_x_a_0_0,q_x_a_0_1,q_x_a_0_2,q_x_a_0_3,q_x_a_0_4,...,q_x_a_1_0,q_x_a_1_1,q_x_a_1_2,q_x_a_1_3,q_x_a_1_4,q_x_a_1_5,q_x_a_1_6,q_x_a_1_7,q_x_a_1_8,q_x_a_1_9
0,-0.204708,0.478943,-0.519439,-0.555730,1.965781,-1.730871,0.930780,-0.174249,-2.621963,1.466922,...,-0.488781,2.027100,0.251114,1.344162,1.487972,-0.288106,1.037183,0.494436,1.519047,0.949349
1,1.393406,0.092908,0.281746,0.769023,1.246435,-0.195773,1.447160,1.260532,-1.867514,-1.145665,...,-0.004809,1.423241,1.078085,-0.313926,-0.690111,-0.121923,0.565838,1.014066,1.276770,-1.071410
2,1.007189,-1.296221,0.274992,0.228913,1.352917,-2.016761,-0.553504,-0.286475,-1.159568,1.428666,...,-1.212284,-0.001168,-0.045710,-0.438177,0.762250,0.773166,0.572019,0.856757,0.623008,1.116884
3,0.886429,-2.001637,-0.371843,1.669025,-0.438570,-2.306275,2.584195,-1.235351,-0.073965,1.785856,...,-1.698291,1.325477,-0.490443,0.401324,0.904540,-0.399031,1.730341,2.203897,0.143039,0.993705
4,-0.539741,0.476985,3.248944,-1.021228,-0.577087,-4.533773,-7.732926,8.485108,1.458492,-0.016358,...,-4.890683,3.240235,2.384734,3.293543,3.887484,-1.231654,-1.726301,-3.040396,-2.824536,0.872268
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,-1.502279,0.338190,-0.573258,-0.377969,-1.742207,-1.162177,1.367664,0.736091,-1.514574,1.649956,...,0.040293,0.751431,0.859534,-0.234282,1.114649,0.307736,0.734194,-0.653770,-0.461428,-0.054665
49996,-0.238790,0.361659,0.050803,1.136654,-1.520213,-1.339460,1.556244,-1.029334,-0.953110,0.441678,...,-0.207958,0.874277,-0.214056,-0.528424,0.618222,1.173456,0.738369,0.532347,0.880538,0.751032
49997,0.111191,1.349551,-0.887112,1.432936,-0.347411,-1.975694,1.205053,0.408750,-1.084714,-1.097528,...,-0.812760,0.770219,0.025407,-0.283583,-0.185410,0.263245,0.812201,0.564545,0.628867,0.885215
49998,0.438447,0.399579,-0.526843,0.779586,-1.400417,-1.260919,1.228966,-1.009467,-0.833212,0.329541,...,-0.404052,1.104270,-0.728647,-0.570909,0.565349,1.057961,0.852332,0.268694,0.853737,0.705998


In [48]:
# r_hat_{action}を取得する
# -> action=actionの時の報酬期待値を取得する
for action in range(num_actions):
    pi_ = np.zeros((test_data_size, num_actions))
    pi_[np.arange(test_data_size), action] = 1
    r_hat_ = (pi_ * q_x_a_1 + (1 - pi_) * q_x_a_0).sum(1)
    test_data_df[f'r_hat_{action}'] = r_hat_

test_data_df.head(10)

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,q_x_a_0_0,q_x_a_0_1,q_x_a_0_2,q_x_a_0_3,q_x_a_0_4,...,r_hat_0,r_hat_1,r_hat_2,r_hat_3,r_hat_4,r_hat_5,r_hat_6,r_hat_7,r_hat_8,r_hat_9
0,-0.204708,0.478943,-0.519439,-0.55573,1.965781,-1.730871,0.93078,-0.174249,-2.621963,1.466922,...,3.293237,3.147467,2.47651,6.017271,2.072197,4.173168,1.805246,0.367953,1.210796,2.229951
1,1.393406,0.092908,0.281746,0.769023,1.246435,-0.195773,1.44716,1.260532,-1.867514,-1.145665,...,1.296995,1.082112,0.923583,2.659619,1.561586,1.358389,0.757546,0.983115,0.800759,1.686399
2,1.007189,-1.296221,0.274992,0.228913,1.352917,-2.016761,-0.553504,-0.286475,-1.159568,1.428666,...,3.088828,2.836688,2.525117,3.005743,1.617936,2.212615,2.674677,1.613539,1.470921,2.519847
3,0.886429,-2.001637,-0.371843,1.669025,-0.43857,-2.306275,2.584195,-1.235351,-0.073965,1.785856,...,5.960801,4.094098,6.097725,5.828105,4.471501,5.909858,5.411785,5.642637,4.119882,5.753515
4,-0.539741,0.476985,3.248944,-1.021228,-0.577087,-4.533773,-7.732926,8.485108,1.458492,-0.016358,...,8.911218,20.241289,3.167754,11.103179,13.17197,5.932885,0.081024,6.284769,7.423221,7.060538
5,0.124121,0.302614,0.523772,0.00094,1.34381,0.963083,0.305102,-1.213054,-1.292537,1.171039,...,2.431769,2.635534,2.705298,3.151094,2.229607,2.899464,2.632842,1.994616,1.997355,2.678958
6,-0.713544,-0.831154,-2.370232,-1.860761,-0.860757,0.934126,2.764626,0.465882,-3.13214,-2.234631,...,1.899695,-0.682711,0.101168,5.080606,2.539266,2.117535,1.071177,-0.599854,1.894749,-0.235274
7,0.560145,-1.265934,0.119827,-1.063512,0.332883,-1.56291,0.431416,-1.428281,1.397335,1.134066,...,3.140132,2.752111,3.775288,2.696229,2.625229,2.516573,3.426545,2.364956,2.735688,2.951209
8,-2.359419,-0.199543,-1.541996,-0.970736,-1.30703,-0.503943,1.503961,2.097014,-2.397746,1.961392,...,2.858219,1.4196,1.39147,4.842075,-0.014585,3.011774,1.872695,2.335877,2.817581,0.534831
9,0.28635,0.377984,-0.753887,0.331286,1.349742,-0.07065,-0.859512,1.080807,-1.428652,0.039274,...,2.602031,3.042874,2.100937,3.663313,2.726694,3.138818,2.457004,1.963728,2.041186,2.587326


In [68]:
r_hat_cols = [f"r_hat_{i}" for i in range(num_actions)]
test_data_df = pd.DataFrame(
    np.random.rand(test_data_size, num_actions),
    columns=r_hat_cols
)

# r_hat列だけ取り出す
r_hat_values = test_data_df[r_hat_cols].values  # (行数, num_actions)

# スケール＆正規化する関数
def normalize_row_extreme(row, power=3.0):
    min_val = row.min()
    max_val = row.max()
    if max_val - min_val > 1e-6:
        scaled = (row - min_val) / (max_val - min_val)
    else:
        scaled = np.ones_like(row) / len(row)
    
    # ここで格差をつける（スケール後にべき乗）
    scaled = np.power(scaled, power)

    # 合計1に正規化
    return scaled / scaled.sum()

# 各行に適用
policy_probs = np.apply_along_axis(normalize_row_extreme, axis=1, arr=r_hat_values)

# DataFrameにまとめる
policy_df = pd.DataFrame(policy_probs, columns=[f"pi_{i}" for i in range(num_actions)])

# 確認
policy_df.iloc[0:1, :]

Unnamed: 0,pi_0,pi_1,pi_2,pi_3,pi_4,pi_5,pi_6,pi_7,pi_8,pi_9
0,0.358525,0.171299,0.128675,0.002801,0.016976,0.275792,0.00792,0.037995,1.7e-05,0.0


In [69]:
pi_sasaken = np.array(policy_df)
pi_sasaken[0:5]

array([[3.58524629e-01, 1.71298968e-01, 1.28674688e-01, 2.80113080e-03,
        1.69758006e-02, 2.75792298e-01, 7.92043778e-03, 3.79949095e-02,
        1.71387993e-05, 0.00000000e+00],
       [4.37142336e-02, 0.00000000e+00, 4.88243838e-03, 1.67977842e-07,
        8.33951965e-02, 3.62396184e-01, 3.09887107e-01, 1.34073413e-03,
        5.29910810e-05, 1.94330946e-01],
       [2.81283618e-02, 0.00000000e+00, 8.70318767e-06, 1.25876218e-01,
        2.94845783e-01, 2.63220386e-01, 1.07568367e-02, 1.57250693e-01,
        9.70936404e-02, 2.28193790e-02],
       [8.72103042e-02, 4.45247812e-02, 6.06823076e-01, 4.29770558e-02,
        1.35418593e-03, 3.92779720e-03, 6.76654196e-03, 0.00000000e+00,
        6.27220121e-02, 1.43694246e-01],
       [1.16910596e-01, 2.53348085e-04, 1.81647181e-01, 1.89779770e-02,
        0.00000000e+00, 2.38882399e-01, 1.00276525e-01, 1.25752720e-02,
        3.00257430e-01, 3.02192725e-02]])

In [70]:
# 報酬期待値を計算
sasaken_pg = (pi_sasaken * q_x_a_1 + (1 - pi_sasaken) * q_x_a_0).sum(1).mean()
print(f'ささけん割り当てによる方策報酬: {sasaken_pg:.3f}')

ささけん割り当てによる方策報酬: 2.281


旧方策・IPS以上ランダム未満、まだまだかな、、、