# 204_6.2_行動数_2_qを使用せず観測可能データから方策学習実施.ipynb

## モジュールインポート

In [1]:
import warnings
warnings.filterwarnings('ignore')

import torch
import numpy as np
import pandas as pd
from pandas import DataFrame
from tqdm import tqdm
from sklearn.utils import check_random_state
import seaborn as sns
import matplotlib.pyplot as plt
import japanize_matplotlib
plt.style.use('ggplot')

from dataset import generate_synthetic_data2
# from policylearners import IPSBasedGradientPolicyLearner, CateBasedGradientPolicyLearner
from policylearners_sasaken_edit import IPSBasedGradientPolicyLearner, CateBasedGradientPolicyLearner_by_sk
from utils import softmax

import math
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import rankdata
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from xgboost import XGBRegressor

## 行動数2・ログデータ数2,000のデータを用意する

In [2]:
## Simulation settings
num_runs = 50 # number of simulation repetitions
dim_x = 5 # dimension of the context features x
num_actions = 10 # number of actions, |A| (overwritten below by num_actions_list[0])
beta = -0.1 # parameter of the data-collection (logging) policy
max_iter = 21 # number of epochs
test_data_size = 50000 # size of the test data
random_state = 12345
torch.manual_seed(random_state)
random_ = check_random_state(random_state)
num_actions_list = [2, 10] # candidate numbers of actions
num_data_list = [100, 500, 1000, 2000] # candidate training-data sizes

# This notebook uses the first entry of num_actions_list, i.e. 2 actions.
num_actions = num_actions_list[0]
## Draw the parameters that define the expected reward functions
# NOTE: the six draws below come sequentially from the same generator `random_`,
# so reordering these lines changes the sampled parameter values.
theta_1 = random_.normal(size=(dim_x, num_actions))
M_1 = random_.normal(size=(dim_x, num_actions))
b_1 = random_.normal(size=(1, num_actions))
theta_0 = random_.normal(size=(dim_x, num_actions))
M_0 = random_.normal(size=(dim_x, num_actions))
b_0 = random_.normal(size=(1, num_actions))
## Generate test data used to approximate the true performance of learned policies
test_data = generate_synthetic_data2(
    num_data=test_data_size, beta=beta,
    theta_1=theta_1, M_1=M_1, b_1=b_1, theta_0=theta_0, M_0=M_0, b_0=b_0,
    dim_context=dim_x, num_actions=num_actions, random_state = random_state
)
pi_0, q_x_a_1, q_x_a_0 = test_data["pi_0"], test_data["q_x_a_1"], test_data["q_x_a_0"]
# Value of the logging policy on the test data: for every sample, sum over actions
# pi_0 * q_x_a_1 + (1 - pi_0) * q_x_a_0, then average over samples.
pi_0_value = (pi_0 * q_x_a_1 + (1. - pi_0) * q_x_a_0).sum(1).mean()

# Use the last entry of num_data_list (2000) as the logged-data size.
num_data = num_data_list[3]
test_policy_value_list = []
# `_` is the seed for the logged data below and is also added to random_state
# when the policy learner is constructed in a later cell.
_ = 0

## Generate logged data following the distribution induced by the data-collection policy
offline_logged_data = generate_synthetic_data2(
    num_data=num_data, beta=beta,
    theta_1=theta_1, M_1=M_1, b_1=b_1, theta_0=theta_0, M_0=M_0, b_0=b_0,
    dim_context=dim_x, num_actions=num_actions, random_state = _
)

# Record the logging-policy value under the key "logging" and report it.
true_value_of_learned_policies = dict()
true_value_of_learned_policies["logging"] = pi_0_value
print(f'旧方策報酬: {pi_0_value:.3f}')

旧方策報酬: 0.487


## RCT（0を推薦:1を推薦=5:5）

In [3]:
def calc_random_pg(num_actions, test_data):
    """Return the value of the uniform-random policy on ``test_data``.

    Every action gets probability ``1 / num_actions`` for every sample. The
    value is the sample mean of
    ``sum_a [pi(a|x) * q_x_a_1 + (1 - pi(a|x)) * q_x_a_0]``.

    Parameters
    ----------
    num_actions : int
        Number of actions (columns of the policy matrix).
    test_data : dict
        Must provide ``"q_x_a_1"``, ``"q_x_a_0"`` and ``"num_data"``; the two
        reward arrays are combined element-wise with a
        ``(num_data, num_actions)`` policy matrix.

    Returns
    -------
    float
        Estimated value of the uniform-random policy.
    """
    reward_recommended = test_data["q_x_a_1"]
    reward_not_recommended = test_data["q_x_a_0"]
    uniform_policy = np.full((test_data['num_data'], num_actions), 1 / num_actions)
    recommended_part = uniform_policy * reward_recommended
    not_recommended_part = (1 - uniform_policy) * reward_not_recommended
    return (recommended_part + not_recommended_part).sum(1).mean()

# Evaluate the uniform-random assignment on the test data and report its value.
random_pg = calc_random_pg(num_actions, test_data)
print(f'ランダム割り当てによる方策報酬: {random_pg:.3f}')

ランダム割り当てによる方策報酬: 1.287


## ログデータからtest_dataのq_x_a予測値を生成し、それを元に

- 予測値が高い方を優先して推薦する方法
- 方策勾配を使った方策学習による結果

の比較を行う

### test_dataのq_x_aを予測する

In [4]:
# データ整形: 観測される情報のみを持つDataFrameを生成する
x = offline_logged_data['x']
a = offline_logged_data['a']
_q_x_a_0 = offline_logged_data['q_x_a_0'] # ノイズはないがログデータとして各コンテンツの試聴時間は得られているものとする
_q_x_a_1 = offline_logged_data['q_x_a_1'] # ノイズはないがログデータとして各コンテンツの試聴時間は得られているものとする
combined = np.hstack([x, a.reshape(-1, 1), _q_x_a_0, _q_x_a_1])

# データフレームに変換
df = pd.DataFrame(combined, columns=[
    'x_0',
    'x_1',
    'x_2',
    'x_3',
    'x_4',
    'a',
    'q_x_a_contents_0_not_recommended',
    'q_x_a_contents_1_not_recommended',
    'q_x_a_contents_0_recommended',
    'q_x_a_contents_1_recommended',
])

# 初期化（NaNで埋める）
df['observed_r_contents_0'] = np.nan
df['observed_r_contents_1'] = np.nan

# a=0 のとき → q_x_a_0系の値を観測された報酬として使う
df.loc[df['a'] == 0, 'observed_r_contents_0'] = df.loc[df['a'] == 0, 'q_x_a_contents_0_recommended']
df.loc[df['a'] == 0, 'observed_r_contents_1'] = df.loc[df['a'] == 0, 'q_x_a_contents_1_not_recommended']

# a=1 のとき → q_x_a_1系の値を観測された報酬として使う
df.loc[df['a'] == 1, 'observed_r_contents_0'] = df.loc[df['a'] == 1, 'q_x_a_contents_0_not_recommended']
df.loc[df['a'] == 1, 'observed_r_contents_1'] = df.loc[df['a'] == 1, 'q_x_a_contents_1_recommended']

df.head()

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,a,q_x_a_contents_0_not_recommended,q_x_a_contents_1_not_recommended,q_x_a_contents_0_recommended,q_x_a_contents_1_recommended,observed_r_contents_0,observed_r_contents_1
0,1.764052,0.400157,0.978738,2.240893,1.867558,0.0,13.501137,2.322846,-3.144051,-10.975893,-3.144051,2.322846
1,-0.977278,0.950088,-0.151357,-0.103219,0.410599,1.0,3.860589,0.004691,1.305038,-1.686152,3.860589,-1.686152
2,0.144044,1.454274,0.761038,0.121675,0.443863,1.0,-3.273465,0.57571,-0.231863,2.422044,-3.273465,2.422044
3,0.333674,1.494079,-0.205158,0.313068,-0.854096,1.0,-1.589338,0.166794,-0.32851,0.883631,-1.589338,0.883631
4,-2.55299,0.653619,0.864436,-0.742165,2.269755,1.0,-4.535771,7.64017,6.378315,-1.107807,-4.535771,-1.107807


In [5]:
# Explanatory variables: context features x_0..x_4 plus the treatment variable a.
feature_cols = ['x_0', 'x_1', 'x_2', 'x_3', 'x_4', 'a']
X = df[feature_cols]


def fit_reward_model(features, target):
    """Fit a random-forest regressor that predicts ``target`` from ``features``.

    Parameters
    ----------
    features : pandas.DataFrame
        Design matrix (context columns plus the action column).
    target : pandas.Series
        Observed reward of one content.

    Returns
    -------
    RandomForestRegressor
        The fitted model (fixed ``random_state=42`` for reproducibility).
    """
    model = RandomForestRegressor(random_state=42)
    model.fit(features, target)
    return model


# One reward model per content; both share the same features and hyper-parameters.
model_contents_0 = fit_reward_model(X, df['observed_r_contents_0'])
model_contents_1 = fit_reward_model(X, df['observed_r_contents_1'])

In [6]:
# Convert the test contexts into a DataFrame.
test_data_df = pd.DataFrame(test_data['x'], columns=[
    'x_0',
    'x_1',
    'x_2',
    'x_3',
    'x_4',
])
# One copy of the test contexts per candidate action, with the action stored in
# column 'a'. A comprehension is used so that the loop variable does not leak
# into the notebook namespace and overwrite the logged-action array `a`.
test_data_df_list = [
    test_data_df.assign(a=action) for action in range(num_actions)
]
display(test_data_df_list[0].head())
display(test_data_df_list[1].head())

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,a
0,-0.204708,0.478943,-0.519439,-0.55573,1.965781,0
1,1.393406,0.092908,0.281746,0.769023,1.246435,0
2,1.007189,-1.296221,0.274992,0.228913,1.352917,0
3,0.886429,-2.001637,-0.371843,1.669025,-0.43857,0
4,-0.539741,0.476985,3.248944,-1.021228,-0.577087,0


Unnamed: 0,x_0,x_1,x_2,x_3,x_4,a
0,-0.204708,0.478943,-0.519439,-0.55573,1.965781,1
1,1.393406,0.092908,0.281746,0.769023,1.246435,1
2,1.007189,-1.296221,0.274992,0.228913,1.352917,1
3,0.886429,-2.001637,-0.371843,1.669025,-0.43857,1
4,-0.539741,0.476985,3.248944,-1.021228,-0.577087,1


In [7]:
# Predict each content's reward on the test contexts under every candidate action.
# Column q_x_a_hat_contents_{c}_at_a_{k} holds the prediction for content c when
# the action column is set to k.
model_input_cols = ['x_0', 'x_1', 'x_2', 'x_3', 'x_4', 'a']
# The loop variable is named `action` so it does not overwrite the logged-action array `a`.
for action in range(num_actions):
    model_input = test_data_df_list[action][model_input_cols]
    test_data_df[f'q_x_a_hat_contents_0_at_a_{action}'] = model_contents_0.predict(model_input)
    test_data_df[f'q_x_a_hat_contents_1_at_a_{action}'] = model_contents_1.predict(model_input)
test_data_df

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,q_x_a_hat_contents_0_at_a_0,q_x_a_hat_contents_1_at_a_0,q_x_a_hat_contents_0_at_a_1,q_x_a_hat_contents_1_at_a_1
0,-0.204708,0.478943,-0.519439,-0.555730,1.965781,5.785202,-2.304773,-1.580996,-2.447885
1,1.393406,0.092908,0.281746,0.769023,1.246435,2.610365,-2.416919,5.476263,-2.728987
2,1.007189,-1.296221,0.274992,0.228913,1.352917,3.236249,-2.527941,0.499500,-0.880847
3,0.886429,-2.001637,-0.371843,1.669025,-0.438570,-4.475344,-0.075743,2.909205,-2.754333
4,-0.539741,0.476985,3.248944,-1.021228,-0.577087,15.065243,13.030085,6.211654,13.142777
...,...,...,...,...,...,...,...,...,...
49995,-1.502279,0.338190,-0.573258,-0.377969,-1.742207,-1.899513,2.357365,10.714795,-4.332164
49996,-0.238790,0.361659,0.050803,1.136654,-1.520213,-1.165175,-0.385124,5.600932,-3.458045
49997,0.111191,1.349551,-0.887112,1.432936,-0.347411,-2.475923,0.343688,1.851102,-2.024944
49998,0.438447,0.399579,-0.526843,0.779586,-1.400417,-0.154383,-0.847052,5.092318,-3.409458


In [8]:
# Estimated uplift of each content from being recommended:
# (prediction when that content's own action is taken) - (prediction under the other action).
uplift_specs = {
    'uplift_contents_0_by_recommend': ('q_x_a_hat_contents_0_at_a_0', 'q_x_a_hat_contents_0_at_a_1'),
    'uplift_contents_1_by_recommend': ('q_x_a_hat_contents_1_at_a_1', 'q_x_a_hat_contents_1_at_a_0'),
}
for uplift_col, (recommended_col, not_recommended_col) in uplift_specs.items():
    test_data_df[uplift_col] = test_data_df[recommended_col] - test_data_df[not_recommended_col]
test_data_df

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,q_x_a_hat_contents_0_at_a_0,q_x_a_hat_contents_1_at_a_0,q_x_a_hat_contents_0_at_a_1,q_x_a_hat_contents_1_at_a_1,uplift_contents_0_by_recommend,uplift_contents_1_by_recommend
0,-0.204708,0.478943,-0.519439,-0.555730,1.965781,5.785202,-2.304773,-1.580996,-2.447885,7.366198,-0.143112
1,1.393406,0.092908,0.281746,0.769023,1.246435,2.610365,-2.416919,5.476263,-2.728987,-2.865898,-0.312068
2,1.007189,-1.296221,0.274992,0.228913,1.352917,3.236249,-2.527941,0.499500,-0.880847,2.736749,1.647094
3,0.886429,-2.001637,-0.371843,1.669025,-0.438570,-4.475344,-0.075743,2.909205,-2.754333,-7.384549,-2.678590
4,-0.539741,0.476985,3.248944,-1.021228,-0.577087,15.065243,13.030085,6.211654,13.142777,8.853589,0.112692
...,...,...,...,...,...,...,...,...,...,...,...
49995,-1.502279,0.338190,-0.573258,-0.377969,-1.742207,-1.899513,2.357365,10.714795,-4.332164,-12.614309,-6.689529
49996,-0.238790,0.361659,0.050803,1.136654,-1.520213,-1.165175,-0.385124,5.600932,-3.458045,-6.766107,-3.072921
49997,0.111191,1.349551,-0.887112,1.432936,-0.347411,-2.475923,0.343688,1.851102,-2.024944,-4.327025,-2.368632
49998,0.438447,0.399579,-0.526843,0.779586,-1.400417,-0.154383,-0.847052,5.092318,-3.409458,-5.246701,-2.562406


### 予測値が高い方を優先して推薦する方法にて方策を導出し、得られる報酬を計算する

In [9]:
def softmax(x):
    """Apply a numerically stable softmax to each row of a 2-D input.

    The row maximum is subtracted before exponentiation to avoid overflow;
    this does not change the result.

    Parameters
    ----------
    x : array-like of shape (n_rows, n_cols)
        Scores to normalise row by row.

    Returns
    -------
    numpy.ndarray
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    exp_shifted = np.exp(x - row_max)
    normalizer = exp_shifted.sum(axis=1, keepdims=True)
    return exp_shifted / normalizer

# Turn the estimated uplifts into a stochastic policy with a row-wise softmax:
# column 0 is the probability of recommending content 0, column 1 of content 1.
q_hat = test_data_df[["uplift_contents_0_by_recommend", "uplift_contents_1_by_recommend"]].to_numpy()
pi_method_1 = softmax(q_hat)

# Policy value on the test data: per sample, sum over actions
# pi * q_x_a_1 + (1 - pi) * q_x_a_0, then average over samples.
method_1_pg = (pi_method_1 * q_x_a_1 + (1. - pi_method_1) * q_x_a_0).sum(1).mean()
print(f'方策①による方策報酬: {method_1_pg:.3f}')

方策①による方策報酬: 3.104


<span style="color:red">→旧方策(0.487)とランダム割り当て(1.287)のどちらも上回る！</span>

### 方策勾配を使った方策学習を行う

In [12]:
### Gradient-based policy learning that uses the predictions produced by the reward models
cate = CateBasedGradientPolicyLearner_by_sk(
    dim_x=dim_x,
    num_actions=num_actions,
    max_iter=max_iter,
    random_state=random_state + _,
)

# Predicted reward of each content when its own action is taken (a=0 for content 0, a=1 for content 1) ...
q_hat_own_action = np.array(
    test_data_df[["q_x_a_hat_contents_0_at_a_0", "q_x_a_hat_contents_1_at_a_1"]]
)
# ... and when the other action is taken.
q_hat_other_action = np.array(
    test_data_df[["q_x_a_hat_contents_0_at_a_1", "q_x_a_hat_contents_1_at_a_0"]]
)
# NOTE(review): how the learner consumes these two arrays is not visible here — confirm against its fit() signature.
cate.fit(offline_logged_data, test_data, q_hat_own_action, q_hat_other_action)

# Evaluate the learned policy on the test data with the same value formula used for the other policies.
pi_cate = cate.predict(test_data)
cate_pg = (pi_cate * q_x_a_1 + (1 - pi_cate) * q_x_a_0).sum(1).mean()
print(f'Newによる方策報酬: {cate_pg:.3f}')

Newによる方策報酬: 3.135


<span style="color:red">→方策①（S-learnerの予測値に基づく方策, 3.104）よりも上！</span>