# 205_6.2_行動数_2_qを使用せず観測可能データから方策学習実施_num_actions_汎用.ipynb

## モジュールインポート

In [19]:
import warnings
warnings.filterwarnings('ignore')

import torch
import numpy as np
import pandas as pd
from pandas import DataFrame
from tqdm import tqdm
from sklearn.utils import check_random_state
import seaborn as sns
import matplotlib.pyplot as plt
import japanize_matplotlib
plt.style.use('ggplot')

from dataset import generate_synthetic_data2
# from policylearners import IPSBasedGradientPolicyLearner, CateBasedGradientPolicyLearner
from policylearners_sasaken_edit import IPSBasedGradientPolicyLearner, CateBasedGradientPolicyLearner_by_sk
from utils import softmax

import math
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import rankdata
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from xgboost import XGBRegressor

## 行動数10・ログデータ数2,000のデータを用意する

In [36]:
## Simulation settings
num_runs = 50 # number of simulation repetitions
dim_x = 5 # dimension of the feature vector x
num_actions = 10 # number of actions, |A|
beta = -0.1 # parameter of the data-collection (logging) policy
max_iter = 21 # number of epochs
test_data_size = 50000 # size of the test data
random_state = 12345
torch.manual_seed(random_state)
random_ = check_random_state(random_state)
num_actions_list = [2, 10] # candidate numbers of actions
num_data_list = [100, 500, 1000, 2000] # candidate training-data sizes

# Re-binds num_actions to the second candidate (10), replacing the literal set above.
num_actions = num_actions_list[1]
## Draw the parameters that define the expected-reward functions.
## NOTE: the six draws consume `random_` one after another, so their order
## determines the resulting values.
theta_1 = random_.normal(size=(dim_x, num_actions))
M_1 = random_.normal(size=(dim_x, num_actions))
b_1 = random_.normal(size=(1, num_actions))
theta_0 = random_.normal(size=(dim_x, num_actions))
M_0 = random_.normal(size=(dim_x, num_actions))
b_0 = random_.normal(size=(1, num_actions))
## Generate test data used to approximate the true performance of learned policies
test_data = generate_synthetic_data2(
    num_data=test_data_size, beta=beta,
    theta_1=theta_1, M_1=M_1, b_1=b_1, theta_0=theta_0, M_0=M_0, b_0=b_0,
    dim_context=dim_x, num_actions=num_actions, random_state = random_state
)
pi_0, q_x_a_1, q_x_a_0 = test_data["pi_0"], test_data["q_x_a_1"], test_data["q_x_a_0"]
# Value of the logging policy on the test data: pi_0 * q_x_a_1 + (1 - pi_0) * q_x_a_0,
# summed over axis 1 (presumably the action axis — confirm) and averaged over rows.
pi_0_value = (pi_0 * q_x_a_1 + (1. - pi_0) * q_x_a_0).sum(1).mean()

# Use the last candidate size (2000) for the logged training data.
num_data = num_data_list[3]
test_policy_value_list = []
# `_` serves as the random_state of the logged data below and is added to the
# policy learner's seed later in the notebook.
_ = 0

## Generate logged data following the distribution induced by the logging policy
offline_logged_data = generate_synthetic_data2(
    num_data=num_data, beta=beta,
    theta_1=theta_1, M_1=M_1, b_1=b_1, theta_0=theta_0, M_0=M_0, b_0=b_0,
    dim_context=dim_x, num_actions=num_actions, random_state = _
)

# Registry of policy values, keyed by policy name; starts with the logging policy.
true_value_of_learned_policies = dict()
true_value_of_learned_policies["logging"] = pi_0_value
print(f'旧方策報酬: {pi_0_value:.3f}')

旧方策報酬: 2.127


## RCT（各コンテンツを等確率 1/num_actions で推薦するランダム方策）

In [37]:
def calc_random_pg(num_actions, test_data):
    """Return the value of a uniform-random recommendation policy.

    Every entry of the policy matrix equals ``1 / num_actions``. The value is
    ``pi * q_x_a_1 + (1 - pi) * q_x_a_0`` summed along axis 1 and then
    averaged over rows.

    Args:
        num_actions: Number of columns of the uniform policy matrix.
        test_data: Mapping that provides ``"q_x_a_1"``, ``"q_x_a_0"`` and
            ``"num_data"`` (the number of rows of the policy matrix).

    Returns:
        The row-averaged policy value.
    """
    reward_if_recommended = test_data["q_x_a_1"]
    reward_if_not_recommended = test_data["q_x_a_0"]
    n_rows = test_data['num_data']

    uniform_pi = np.full((n_rows, num_actions), 1/num_actions)
    per_row_value = (
        uniform_pi * reward_if_recommended
        + (1 - uniform_pi) * reward_if_not_recommended
    ).sum(1)
    return per_row_value.mean()

# Evaluate the uniform-random baseline on the test data and report it.
random_pg = calc_random_pg(num_actions, test_data)
print(f'ランダム割り当てによる方策報酬: {random_pg:.3f}')

ランダム割り当てによる方策報酬: 2.280


## ログデータからtest_dataのq_x_a予測値を生成し、それを元に

- 予測値が高い方を優先して推薦する方法
- 方策勾配を使った方策学習による結果

の比較を行う

### test_dataのq_x_aを予測する

In [39]:
# Reshape the logged data into a single DataFrame.
x, a_mat = offline_logged_data['x'], offline_logged_data['a_mat']
# There is no noise, but the viewing time of every content is assumed to be
# available in the log, both with and without a recommendation.
_q_x_a_0, _q_x_a_1 = offline_logged_data['q_x_a_0'], offline_logged_data['q_x_a_1']

num_actions = _q_x_a_0.shape[1]

# Column names, in the same left-to-right order as the stacked arrays below.
columns = [
    *(f'x_{i}' for i in range(x.shape[1])),
    *(f'a_{i}' for i in range(a_mat.shape[1])),
    *(f'q_x_a_contents_{i}_not_recommended' for i in range(num_actions)),
    *(f'q_x_a_contents_{i}_recommended' for i in range(num_actions)),
]

# Stack features, action indicators and both reward matrices side by side.
combined = np.hstack([x, a_mat, _q_x_a_0, _q_x_a_1])
df = pd.DataFrame(combined, columns=columns)

df.head()

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,a_0,a_1,a_2,a_3,a_4,...,q_x_a_contents_0_recommended,q_x_a_contents_1_recommended,q_x_a_contents_2_recommended,q_x_a_contents_3_recommended,q_x_a_contents_4_recommended,q_x_a_contents_5_recommended,q_x_a_contents_6_recommended,q_x_a_contents_7_recommended,q_x_a_contents_8_recommended,q_x_a_contents_9_recommended
0,1.764052,0.400157,0.978738,2.240893,1.867558,0.0,0.0,0.0,0.0,0.0,...,0.610305,1.519782,3.099037,2.636847,1.595949,1.177914,-1.07924,1.697214,1.874099,0.666065
1,-0.977278,0.950088,-0.151357,-0.103219,0.410599,0.0,0.0,0.0,0.0,0.0,...,0.899631,0.833017,1.057889,-0.832202,0.796212,0.558681,-0.080191,-1.164529,-0.180623,-1.232541
2,0.144044,1.454274,0.761038,0.121675,0.443863,0.0,0.0,0.0,0.0,0.0,...,-1.01791,-0.971159,-0.001332,1.042398,0.152209,0.579347,-0.75343,0.236275,0.55745,0.810329
3,0.333674,1.494079,-0.205158,0.313068,-0.854096,0.0,0.0,0.0,0.0,0.0,...,-0.528433,-0.701272,0.089529,0.472129,-0.570712,0.583948,1.198806,0.192241,0.521053,0.497581
4,-2.55299,0.653619,0.864436,-0.742165,2.269755,0.0,0.0,0.0,0.0,1.0,...,-0.022087,2.580284,2.215281,2.163039,-1.134082,-0.41638,1.261683,-0.051449,0.548445,0.409844


In [40]:
# Create one observed-reward column per content, initialised to NaN.
for content in range(num_actions):
    df[f'observed_r_contents_{content}'] = np.nan

# For the rows where content `shown` was recommended (a_<shown> == 1), the
# observed reward of content j is its "recommended" value when j == shown and
# its "not recommended" value otherwise. Rows with no a_i == 1 stay NaN.
for shown in range(num_actions):
    rows = df[f'a_{shown}'] == 1
    for j in range(num_actions):
        if shown == j:
            source_col = f'q_x_a_contents_{j}_recommended'
        else:
            source_col = f'q_x_a_contents_{j}_not_recommended'
        df.loc[rows, f'observed_r_contents_{j}'] = df.loc[rows, source_col]

df.head()

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,a_0,a_1,a_2,a_3,a_4,...,observed_r_contents_0,observed_r_contents_1,observed_r_contents_2,observed_r_contents_3,observed_r_contents_4,observed_r_contents_5,observed_r_contents_6,observed_r_contents_7,observed_r_contents_8,observed_r_contents_9
0,1.764052,0.400157,0.978738,2.240893,1.867558,0.0,0.0,0.0,0.0,0.0,...,-1.930769,3.95948,2.176297,-4.42904,-0.989263,-3.223142,-1.07924,1.955981,3.022632,-0.579877
1,-0.977278,0.950088,-0.151357,-0.103219,0.410599,0.0,0.0,0.0,0.0,0.0,...,1.029789,1.004168,1.485089,-1.388836,0.951395,0.790578,-0.046726,-1.164529,-0.254305,-1.319611
2,0.144044,1.454274,0.761038,0.121675,0.443863,0.0,0.0,0.0,0.0,0.0,...,-1.843026,-1.437606,0.042343,1.567839,-0.029299,1.200594,-0.75343,0.848258,0.911116,1.038333
3,0.333674,1.494079,-0.205158,0.313068,-0.854096,0.0,0.0,0.0,0.0,0.0,...,-2.20266,-1.290043,-0.024899,0.477447,-0.814997,0.583948,1.442285,1.005841,1.208849,1.065806
4,-2.55299,0.653619,0.864436,-0.742165,2.269755,0.0,0.0,0.0,0.0,1.0,...,-1.515896,1.121202,1.417819,-3.8791,-1.134082,-4.019438,0.220571,1.790231,0.751986,1.256526


In [41]:
# Fit one reward model per content. Each model sees the context columns plus
# the indicator of whether that content was recommended, and predicts the
# observed reward of that content.
#
# The context column names are read from `df` instead of being hard-coded as
# x_0..x_4, so the cell keeps working when the feature dimension changes.
context_cols = [col for col in df.columns if col.startswith('x_')]

model_list = []
for a_val in range(num_actions):

    # Explanatory variables: context columns + a_<a_val> (treatment indicator)
    feature_cols = context_cols + [f'a_{a_val}']
    X = df[feature_cols]
    y = df[f'observed_r_contents_{a_val}']  # target variable

    # Training (fixed seed so that the fitted forests are reproducible)
    tmp_model = RandomForestRegressor(random_state=42)
    tmp_model.fit(X, y)

    model_list.append(tmp_model)

In [42]:
# Convert the test contexts into a DataFrame. Column names follow the
# x_0, x_1, ... convention and are derived from the array width rather than
# being hard-coded to five features.
context_cols = [f'x_{i}' for i in range(test_data['x'].shape[1])]
test_data_df = pd.DataFrame(test_data['x'], columns=context_cols)

# Context-only frame reused for every content. Building the model inputs from
# it avoids copying the whole (growing) result frame twice per content.
base_features = test_data_df[context_cols]

for a_val in range(num_actions):
    treat_col = f'a_{a_val}'
    predict = model_list[a_val].predict

    # Prediction for content a_val when it is NOT recommended (indicator = 0).
    # `assign` appends the indicator as the last column, i.e. the same
    # [context..., a_<a_val>] order that was used for training.
    test_data_df[f'q_x_a_hat_contents_{a_val}_when_a_{a_val}_not_recommended'] = predict(
        base_features.assign(**{treat_col: 0})
    )

    # Prediction for content a_val when it IS recommended (indicator = 1).
    test_data_df[f'q_x_a_hat_contents_{a_val}_when_a_{a_val}_recommended'] = predict(
        base_features.assign(**{treat_col: 1})
    )

test_data_df

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,q_x_a_hat_contents_0_when_a_0_not_recommended,q_x_a_hat_contents_0_when_a_0_recommended,q_x_a_hat_contents_1_when_a_1_not_recommended,q_x_a_hat_contents_1_when_a_1_recommended,q_x_a_hat_contents_2_when_a_2_not_recommended,...,q_x_a_hat_contents_5_when_a_5_not_recommended,q_x_a_hat_contents_5_when_a_5_recommended,q_x_a_hat_contents_6_when_a_6_not_recommended,q_x_a_hat_contents_6_when_a_6_recommended,q_x_a_hat_contents_7_when_a_7_not_recommended,q_x_a_hat_contents_7_when_a_7_recommended,q_x_a_hat_contents_8_when_a_8_not_recommended,q_x_a_hat_contents_8_when_a_8_recommended,q_x_a_hat_contents_9_when_a_9_not_recommended,q_x_a_hat_contents_9_when_a_9_recommended
0,-0.204708,0.478943,-0.519439,-0.555730,1.965781,-1.036084,-0.155480,0.993990,1.032234,0.233815,...,-2.344124,-0.807315,1.129807,0.918757,2.039891,0.441597,2.358494,1.425908,0.108056,0.021327
1,1.393406,0.092908,0.281746,0.769023,1.246435,0.449739,0.474625,1.012219,0.932045,1.345067,...,-0.432266,-0.405606,0.779285,0.281495,1.210359,0.955967,1.466166,1.020075,-1.228943,-1.188389
2,1.007189,-1.296221,0.274992,0.228913,1.352917,-2.000574,-1.522229,0.076199,0.099107,-0.258592,...,0.541869,0.531418,0.451720,0.361327,1.554583,1.119914,1.470413,0.465306,0.526183,0.486066
3,0.886429,-2.001637,-0.371843,1.669025,-0.438570,-1.585345,-1.321169,2.174911,0.971027,-0.273528,...,-0.016829,0.560326,0.943329,0.649627,0.949234,0.961130,1.371051,0.343471,0.804986,0.747737
4,-0.539741,0.476985,3.248944,-1.021228,-0.577087,-3.273082,-3.156161,-7.741715,-7.741715,7.726534,...,1.510836,0.883106,4.987563,1.851962,0.680711,0.892366,0.270501,0.279058,1.777752,0.917708
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,-1.502279,0.338190,-0.573258,-0.377969,-1.742207,-1.250623,-0.110257,1.101412,1.011637,0.546112,...,0.639619,0.296144,2.532134,0.699091,-1.159499,-1.026064,-0.184876,-0.183675,0.765176,0.165020
49996,-0.238790,0.361659,0.050803,1.136654,-1.520213,-1.351243,-0.326142,1.467523,0.874938,-1.028707,...,0.982552,0.525083,1.940515,0.865519,-0.310822,-0.236870,1.495217,0.899569,1.066347,0.550767
49997,0.111191,1.349551,-0.887112,1.432936,-0.347411,-2.001012,-0.783807,1.427972,0.404952,1.399133,...,-1.057038,-0.778032,1.337400,0.644591,1.037696,0.806708,1.230924,0.778807,0.760530,0.650736
49998,0.438447,0.399579,-0.526843,0.779586,-1.400417,-1.279674,-0.292509,0.649047,0.629204,-0.688249,...,0.580772,0.264897,1.961483,0.860023,0.566893,0.459064,1.486413,0.862329,1.154346,0.531465


In [43]:
# Estimated uplift of each content: prediction when recommended minus
# prediction when not recommended.
for a_val in range(num_actions):
    pred_recommended = test_data_df[f'q_x_a_hat_contents_{a_val}_when_a_{a_val}_recommended']
    pred_not_recommended = test_data_df[f'q_x_a_hat_contents_{a_val}_when_a_{a_val}_not_recommended']
    test_data_df[f'uplift_contents_{a_val}_by_recommend'] = pred_recommended - pred_not_recommended
test_data_df

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,q_x_a_hat_contents_0_when_a_0_not_recommended,q_x_a_hat_contents_0_when_a_0_recommended,q_x_a_hat_contents_1_when_a_1_not_recommended,q_x_a_hat_contents_1_when_a_1_recommended,q_x_a_hat_contents_2_when_a_2_not_recommended,...,uplift_contents_0_by_recommend,uplift_contents_1_by_recommend,uplift_contents_2_by_recommend,uplift_contents_3_by_recommend,uplift_contents_4_by_recommend,uplift_contents_5_by_recommend,uplift_contents_6_by_recommend,uplift_contents_7_by_recommend,uplift_contents_8_by_recommend,uplift_contents_9_by_recommend
0,-0.204708,0.478943,-0.519439,-0.555730,1.965781,-1.036084,-0.155480,0.993990,1.032234,0.233815,...,0.880604,0.038243,0.088181,3.510305,0.031263,1.536809,-0.211050,-1.598294,-0.932586,-0.086729
1,1.393406,0.092908,0.281746,0.769023,1.246435,0.449739,0.474625,1.012219,0.932045,1.345067,...,0.024886,-0.080174,-0.098337,1.329648,0.809511,0.026660,-0.497790,-0.254392,-0.446091,0.040554
2,1.007189,-1.296221,0.274992,0.228913,1.352917,-2.000574,-1.522229,0.076199,0.099107,-0.258592,...,0.478346,0.022908,0.023016,0.912213,-0.132914,-0.010451,-0.090393,-0.434669,-1.005106,-0.040117
3,0.886429,-2.001637,-0.371843,1.669025,-0.438570,-1.585345,-1.321169,2.174911,0.971027,-0.273528,...,0.264177,-1.203884,0.003197,0.157313,-0.950640,0.577155,-0.293702,0.011897,-1.027580,-0.057249
4,-0.539741,0.476985,3.248944,-1.021228,-0.577087,-3.273082,-3.156161,-7.741715,-7.741715,7.726534,...,0.116921,0.000000,-2.891344,0.603302,0.087367,-0.627730,-3.135601,0.211655,0.008557,-0.860044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,-1.502279,0.338190,-0.573258,-0.377969,-1.742207,-1.250623,-0.110257,1.101412,1.011637,0.546112,...,1.140366,-0.089774,-0.004122,1.195666,-0.838874,-0.343475,-1.833043,0.133435,0.001201,-0.600156
49996,-0.238790,0.361659,0.050803,1.136654,-1.520213,-1.351243,-0.326142,1.467523,0.874938,-1.028707,...,1.025101,-0.592585,0.634365,0.252443,0.067043,-0.457469,-1.074996,0.073953,-0.595648,-0.515580
49997,0.111191,1.349551,-0.887112,1.432936,-0.347411,-2.001012,-0.783807,1.427972,0.404952,1.399133,...,1.217205,-1.023020,-0.697212,0.427653,1.085110,0.279006,-0.692809,-0.230987,-0.452118,-0.109794
49998,0.438447,0.399579,-0.526843,0.779586,-1.400417,-1.279674,-0.292509,0.649047,0.629204,-0.688249,...,0.987165,-0.019843,0.466686,0.091724,0.133043,-0.315875,-1.101460,-0.107829,-0.624084,-0.622880


### 予測値が高い方を優先して推薦する方法にて方策を導出し、得られる報酬を計算する

In [45]:
def softmax(x):
    """Apply softmax along axis 1 (row-wise) of ``x``.

    The row maximum is subtracted before exponentiating to prevent overflow;
    this does not change the result.

    Args:
        x: Two-dimensional array-like of scores.

    Returns:
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    weights = np.exp(x - row_max)
    normaliser = weights.sum(axis=1, keepdims=True)
    return weights / normaliser

# Turn the estimated uplifts into a stochastic policy with a row-wise softmax.
uplift_cols = [f"uplift_contents_{a_val}_by_recommend" for a_val in range(num_actions)]
q_hat = test_data_df[uplift_cols].values
pi_method_1 = softmax(q_hat)
# Bare expression kept from the original cell; its value is not assigned or printed.
pi_method_1[0:5]

# Policy value on the test data, using the same formula as for the other policies.
reward_under_method_1 = pi_method_1 * q_x_a_1 + (1. - pi_method_1) * q_x_a_0
method_1_pg = reward_under_method_1.sum(1).mean()
print(f'方策①による方策報酬: {method_1_pg:.3f}')

方策①による方策報酬: 3.155


<span style="color:red">→旧方策(2.127)以上 かつ ランダム(2.280)より上！</span>

### 方策勾配を使った方策学習を行う

In [46]:
### Gradient-based policy learning driven by the predictions of the fitted reward models
recommended_cols = [
    f"q_x_a_hat_contents_{a_val}_when_a_{a_val}_recommended" for a_val in range(num_actions)
]
not_recommended_cols = [
    f"q_x_a_hat_contents_{a_val}_when_a_{a_val}_not_recommended" for a_val in range(num_actions)
]

cate = CateBasedGradientPolicyLearner_by_sk(
    dim_x=dim_x, num_actions=num_actions, max_iter=max_iter, random_state=random_state + _
)
# NOTE(review): the learner receives test_data and predictions made on the test
# contexts, and is evaluated on the same test_data below — confirm this is intended.
cate.fit(
    offline_logged_data,
    test_data,
    np.array(test_data_df[recommended_cols]),
    np.array(test_data_df[not_recommended_cols]),
)
pi_cate = cate.predict(test_data)

# Policy value on the test data, using the same formula as for the other policies.
cate_pg = (pi_cate * q_x_a_1 + (1 - pi_cate) * q_x_a_0).sum(1).mean()
print(f'Newによる方策報酬: {cate_pg:.3f}')

Newによる方策報酬: 3.589


<span style="color:red">→方策①(S-learner, 3.155)よりも上！</span>