# 202_6.2_各方策・報酬期待値理解.ipynb

## 6.2 プラットフォーム全体で観測される報酬を最適化する方策学習

In [2]:
import warnings
warnings.filterwarnings('ignore')

import torch
import numpy as np
import pandas as pd
from pandas import DataFrame
from tqdm import tqdm
from sklearn.utils import check_random_state
import seaborn as sns
import matplotlib.pyplot as plt
import japanize_matplotlib
plt.style.use('ggplot')

from dataset import generate_synthetic_data2
from policylearners import IPSBasedGradientPolicyLearner, CateBasedGradientPolicyLearner
from utils import softmax

import math
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC

### ログデータ(トレーニングデータ)のサイズ$n$を変化させたときの方策性能の変化

In [3]:
## Simulation settings
num_runs = 50 # number of simulation repetitions
dim_x = 5 # dimension of the feature vector x
num_actions = 10 # number of actions, |A|
beta = -0.1 # parameter of the data-collection (logging) policy
max_iter = 21 # number of epochs
test_data_size = 50000 # size of the test data
random_state = 12345 # seed passed to torch.manual_seed and check_random_state below
torch.manual_seed(random_state)
random_ = check_random_state(random_state)
num_actions_list = [2, 10] # candidate numbers of actions
num_data_list = [100, 500, 1000, 2000] # candidate training-data sizes

In [4]:
# Pick the number of actions for this experiment from the candidate list.
num_actions = num_actions_list[1]

## Sample the parameters that define the expected reward functions.
## The six draws consume `random_` sequentially, so their order
## (theta, M, b for suffix 1, then theta, M, b for suffix 0) must be kept.
weight_shape = (dim_x, num_actions)
bias_shape = (1, num_actions)
theta_1 = random_.normal(size=weight_shape)
M_1 = random_.normal(size=weight_shape)
b_1 = random_.normal(size=bias_shape)
theta_0 = random_.normal(size=weight_shape)
M_0 = random_.normal(size=weight_shape)
b_0 = random_.normal(size=bias_shape)

## Generate the test data used to approximate the true performance of
## the learned policies.
test_data = generate_synthetic_data2(
    num_data=test_data_size,
    beta=beta,
    theta_1=theta_1,
    M_1=M_1,
    b_1=b_1,
    theta_0=theta_0,
    M_0=M_0,
    b_0=b_0,
    dim_context=dim_x,
    num_actions=num_actions,
    random_state=random_state,
)
pi_0 = test_data["pi_0"]
q_x_a_1 = test_data["q_x_a_1"]
q_x_a_0 = test_data["q_x_a_0"]

# Value of the logging policy pi_0 on the test data: per row, sum over
# actions of pi * q_x_a_1 + (1 - pi) * q_x_a_0, then average over rows.
pi_0_value = (pi_0 * q_x_a_1 + (1. - pi_0) * q_x_a_0).sum(1).mean()

In [5]:
num_data = num_data_list[2]
test_policy_value_list = []
# `_` acts as the run index: it seeds the logged-data generation and offsets
# the learners' seeds. The name is kept because it is a notebook-level variable.
_ = 0

## Generate logged data following the distribution induced by the
## data-collection policy.
offline_logged_data = generate_synthetic_data2(
    num_data=num_data, beta=beta,
    theta_1=theta_1, M_1=M_1, b_1=b_1, theta_0=theta_0, M_0=M_0, b_0=b_0,
    dim_context=dim_x, num_actions=num_actions, random_state = _
)

true_value_of_learned_policies = dict()
true_value_of_learned_policies["logging"] = pi_0_value

## Run off-policy learning on the logged data.
## NOTE(review): test_data is also passed to fit(); presumably only for
## monitoring during training -- confirm against policylearners.

### Gradient-based approach (policy gradient estimated with the IPS estimator)
ips = IPSBasedGradientPolicyLearner(
    dim_x=dim_x, num_actions=num_actions, max_iter=max_iter, random_state=random_state + _
)
ips.fit(offline_logged_data, test_data)
pi_ips = ips.predict(test_data)
# Evaluate the policy value once and reuse it for both the dict entry and the
# standalone variable, instead of evaluating the same expression twice.
ips_pg = (pi_ips * q_x_a_1 + (1 - pi_ips) * q_x_a_0).sum(1).mean()
true_value_of_learned_policies["ips-pg"] = ips_pg

### Gradient-based approach (policy gradient estimated with the newly developed estimator)
cate = CateBasedGradientPolicyLearner(
    dim_x=dim_x, num_actions=num_actions, max_iter=max_iter, random_state=random_state + _
)
cate.fit(offline_logged_data, test_data)
pi_cate = cate.predict(test_data)
cate_pg = (pi_cate * q_x_a_1 + (1 - pi_cate) * q_x_a_0).sum(1).mean()
true_value_of_learned_policies["cate-pg"] = cate_pg

test_policy_value_list.append(true_value_of_learned_policies)

## 各方策・報酬期待値理解
### 旧方策

In [6]:
# Report the value of the logging (old) policy, rounded to three decimals.
print(f'旧方策報酬: {pi_0_value:.3f}')

旧方策報酬: 2.127


### IPS(勾配)

In [7]:
# Report the value of the policy learned with the IPS-based policy gradient.
print(f'IPS(勾配)による方策報酬: {ips_pg:.3f}')

IPS(勾配)による方策報酬: 1.962


### Newによる方策報酬

In [8]:
# Report the value of the policy learned with CateBasedGradientPolicyLearner.
print(f'Newによる方策報酬: {cate_pg:.3f}')

Newによる方策報酬: 3.528


## ここから佐々木の方で方策学習を行い、それによる報酬期待値を得る
### ちなみにランダムで推薦アイテムを決めた場合の期待報酬を計算してみる

In [9]:
# Uniform-random baseline: every action gets probability 1 / num_actions.
# The row count is read from q_x_a_1 instead of being hard-coded, so the
# baseline stays aligned with the test data if its size changes.
pi_random = np.full((q_x_a_1.shape[0], num_actions), 1 / num_actions)
# The variable name `radom_pg` (sic) is kept so any later reference still works.
radom_pg = (pi_random * q_x_a_1 + (1 - pi_random) * q_x_a_0).sum(1).mean()
print(f'ランダム割り当てによる方策報酬: {radom_pg:.3f}')

ランダム割り当てによる方策報酬: 2.280


(まさかのIPS以上New未満...)

### シンプルに、同じようなxを持つユーザー同士で比較したときに、rが高ければその方策は良いよね、という考えでやってみる

In [10]:
# Data shaping: build a DataFrame that holds only the observable quantities
# of the logged data (features, chosen action, reward, propensity score).
x = offline_logged_data['x']
a = offline_logged_data['a']
r = offline_logged_data['r']
pscore = offline_logged_data['pscore']
# Stacking into one array gives every column a common dtype, which is why
# the action column is displayed as a float (e.g. 6.0) in the frame.
combined = np.hstack([x, a.reshape(-1, 1), r.reshape(-1, 1), pscore.reshape(-1, 1)])

# Derive the feature column names from the width of x rather than listing
# x_0..x_4 by hand, so the cell keeps working for other feature dimensions.
feature_columns = [f'x_{i}' for i in range(x.shape[1])]

# Convert to a DataFrame
df = pd.DataFrame(combined, columns=feature_columns + ['a', 'r', 'pscore'])
df

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,a,r,pscore
0,1.764052,0.400157,0.978738,2.240893,1.867558,6.0,1.412246,0.137188
1,-0.977278,0.950088,-0.151357,-0.103219,0.410599,7.0,-1.932179,0.098761
2,0.144044,1.454274,0.761038,0.121675,0.443863,6.0,-0.568106,0.096473
3,0.333674,1.494079,-0.205158,0.313068,-0.854096,5.0,1.117016,0.105284
4,-2.552990,0.653619,0.864436,-0.742165,2.269755,4.0,-0.344649,0.162006
...,...,...,...,...,...,...,...,...
995,1.711489,-1.820816,0.163495,-0.813117,-0.605355,1.0,1.831552,0.104161
996,-1.327524,-0.644172,1.908883,-0.563545,1.082473,5.0,-0.547455,0.103926
997,-1.951911,2.441216,-0.017285,0.912282,1.239658,9.0,0.157768,0.124637
998,-0.573367,0.424889,-0.271260,-0.683568,-1.537438,2.0,-1.325663,0.092624


In [11]:
# Historical note: an earlier variant shifted r by ceil(-min(r)) so that the
# reward was non-negative and stored it as 'pseudo_r' for use as the weight.
# It was replaced by the scaling below.

# Reward divided by the propensity score of the logged action.
df['sample_weight'] = df['r'] / df['pscore']

col = 'sample_weight'  # name of the column to rescale
# Min-max scale the column into [0, 1] so it can serve as a non-negative
# sample weight; the row with the smallest value ends up with weight 0.
lowest = df[col].min()
highest = df[col].max()
df['pseudo_' + col] = (df[col] - lowest) / (highest - lowest)
df

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,a,r,pscore,sample_weight,pseudo_sample_weight
0,1.764052,0.400157,0.978738,2.240893,1.867558,6.0,1.412246,0.137188,10.294210,0.431190
1,-0.977278,0.950088,-0.151357,-0.103219,0.410599,7.0,-1.932179,0.098761,-19.564157,0.196003
2,0.144044,1.454274,0.761038,0.121675,0.443863,6.0,-0.568106,0.096473,-5.888762,0.303721
3,0.333674,1.494079,-0.205158,0.313068,-0.854096,5.0,1.117016,0.105284,10.609596,0.433675
4,-2.552990,0.653619,0.864436,-0.742165,2.269755,4.0,-0.344649,0.162006,-2.127385,0.333348
...,...,...,...,...,...,...,...,...,...,...
995,1.711489,-1.820816,0.163495,-0.813117,-0.605355,1.0,1.831552,0.104161,17.583792,0.488609
996,-1.327524,-0.644172,1.908883,-0.563545,1.082473,5.0,-0.547455,0.103926,-5.267736,0.308613
997,-1.951911,2.441216,-0.017285,0.912282,1.239658,9.0,0.157768,0.124637,1.265819,0.360076
998,-0.573367,0.424889,-0.271260,-0.683568,-1.537438,2.0,-1.325663,0.092624,-14.312241,0.237371


In [12]:
# Fit a classifier that predicts the logged action `a` from the features
# x_0 .. x_{dim_x - 1}, weighting each logged row by `pseudo_sample_weight`.

# Explanatory variables and target.
# The feature count comes from dim_x instead of a hard-coded 5.
X = np.array(df[[f'x_{i}' for i in range(dim_x)]])  # x_0 ~ x_{dim_x - 1}
y = np.array(df['a'])
sample_weight = np.array(df['pseudo_sample_weight'])

# Build the model; the commented-out lines are alternative classifiers that
# can be swapped in for comparison.
model = LogisticRegression()
# model = lgb.LGBMClassifier()
# model = HistGradientBoostingClassifier(max_iter=100)
# model = LGBMClassifier(max_depth=5, num_leaves=31)
# model = CatBoostClassifier(iterations=100, verbose=0)
# model = RandomForestClassifier(n_estimators=100, max_depth=5)
# model = SVC(kernel='rbf', C=1.0, probability=True)
# model = lgb.LGBMClassifier(objective='multiclass', num_class=num_actions, verbose=1)
model.fit(X, y, sample_weight=sample_weight)

In [13]:
# Apply the fitted model to the test contexts.
# predict_proba returns one column per entry of model.classes_, i.e. only for
# actions that appeared in the training data. Scatter those columns into a
# full (n, num_actions) matrix so that column j always refers to action j and
# lines up with q_x_a_1 / q_x_a_0; actions never seen in training get
# probability 0. When every action was observed this equals predict_proba.
proba = model.predict_proba(test_data['x'])
pi_sasaken = np.zeros((proba.shape[0], num_actions))
pi_sasaken[:, np.asarray(model.classes_).astype(int)] = proba
pi_sasaken[0:5]

array([[0.12273328, 0.1733627 , 0.11446115, 0.09034823, 0.10266293,
        0.05648197, 0.10479457, 0.0938461 , 0.07998743, 0.06132165],
       [0.06935579, 0.12850473, 0.15311006, 0.09578087, 0.11342036,
        0.064886  , 0.07128453, 0.12436774, 0.11005795, 0.06923198],
       [0.07021218, 0.13092849, 0.15641193, 0.09191569, 0.09704682,
        0.05951085, 0.08936938, 0.08927335, 0.14057937, 0.07475194],
       [0.04796592, 0.10567143, 0.16916526, 0.07703493, 0.08917362,
        0.06995643, 0.06498505, 0.10566113, 0.17308327, 0.09730296],
       [0.05564602, 0.03677888, 0.08168259, 0.09369834, 0.08806116,
        0.11788666, 0.17089659, 0.07096247, 0.07248267, 0.21190462]])

In [14]:
# Fingers crossed that this turns out well.
# Policy value of pi_sasaken on the test data, computed with the same
# expression used for the other policies in this notebook.
sasaken_pg = (pi_sasaken * q_x_a_1 + (1 - pi_sasaken) * q_x_a_0).sum(1).mean()
print(f'ささけん割り当てによる方策報酬: {sasaken_pg:.3f}')
# Values noted by hand, presumably one per classifier tried:
# logi: 2.228
# rf  : 2.232
# hist: 2.164

ささけん割り当てによる方策報酬: 2.230


旧方策・IPS以上ランダム未満、まだまだかな、、、

In [81]:
# 21:40

In [16]:
# Sharpening temperature: the smaller it is, the more extreme the
# resulting distribution becomes.
temperature = 0.3

# Sharpen by raising every probability to the power 1 / temperature, then
# renormalise each row so that it sums to 1 again.
sharpened = np.power(pi_sasaken, 1 / temperature)
sharpened = sharpened / sharpened.sum(axis=1, keepdims=True)
sharpened

array([[0.13736674, 0.43436949, 0.10885978, ..., 0.05615579, 0.03296701,
        0.01359523],
       [0.0219956 , 0.17183978, 0.30813365, ..., 0.15408224, 0.10251753,
        0.02186499],
       [0.02212273, 0.1765676 , 0.31942101, ..., 0.0492648 , 0.22380467,
        0.02726081],
       ...,
       [0.03312743, 0.14952296, 0.13568165, ..., 0.38256727, 0.03260991,
        0.04633233],
       [0.01679641, 0.08985739, 0.12538274, ..., 0.18549207, 0.1097077 ,
        0.14796269],
       [0.09122264, 0.04976045, 0.02654771, ..., 0.32553926, 0.00402996,
        0.17270305]])

In [17]:
# Fingers crossed that this turns out well.
# Policy value of the sharpened policy on the test data.
sharpened_pg = (sharpened * q_x_a_1 + (1 - sharpened) * q_x_a_0).sum(1).mean()
print(f'ささけん割り当てによる方策報酬: {sharpened_pg:.3f}')
# NOTE(review): the figures below are identical to the ones listed in the
# unsharpened evaluation cell and look copied over; the recorded output of
# this cell is 2.128. Confirm or refresh them.
# logi: 2.228
# rf  : 2.232
# hist: 2.164

ささけん割り当てによる方策報酬: 2.128
