# 201_6.2_データ理解.ipynb

## 6.2 プラットフォーム全体で観測される報酬を最適化する方策学習

In [41]:
import warnings
warnings.filterwarnings('ignore')

import torch
import numpy as np
import pandas as pd
from pandas import DataFrame
from tqdm import tqdm
from sklearn.utils import check_random_state
import seaborn as sns
import matplotlib.pyplot as plt
import japanize_matplotlib
plt.style.use('ggplot')

from dataset import generate_synthetic_data2
from policylearners import IPSBasedGradientPolicyLearner, CateBasedGradientPolicyLearner
from utils import softmax

### ログデータ(トレーニングデータ)のサイズ$n$を変化させたときの方策性能の変化

In [42]:
## Simulation settings

# --- reproducibility: seed torch and build the NumPy random state ---
random_state = 12345
torch.manual_seed(random_state)
random_ = check_random_state(random_state)

# --- problem size ---
dim_x = 5  # dimension of the feature vector x
num_actions = 10  # number of actions, |A|
num_actions_list = [2, 10]  # numbers of actions
num_data_list = [100, 500, 1000, 2000]  # training-data sizes
test_data_size = 50_000  # size of the test data

# --- experiment / optimisation ---
num_runs = 50  # number of simulation repetitions
beta = -0.1  # parameter of the data-collection (logging) policy
max_iter = 21  # number of epochs

In [43]:
## Use the first entry of the action-count list for this experiment
num_actions = num_actions_list[0]

## Draw the parameters that define the expected-reward functions.
## The draws happen in the fixed order theta_1, M_1, b_1, theta_0, M_0, b_0,
## so the random stream of `random_` is consumed exactly as with six
## separate statements.
weight_shape = (dim_x, num_actions)
bias_shape = (1, num_actions)
theta_1, M_1, b_1, theta_0, M_0, b_0 = [
    random_.normal(size=shape)
    for shape in (weight_shape, weight_shape, bias_shape) * 2
]

## Generate the test data used to approximate the true performance of
## learned policies
test_data = generate_synthetic_data2(
    num_data=test_data_size,
    beta=beta,
    theta_1=theta_1,
    M_1=M_1,
    b_1=b_1,
    theta_0=theta_0,
    M_0=M_0,
    b_0=b_0,
    dim_context=dim_x,
    num_actions=num_actions,
    random_state=random_state,
)

# NOTE(review): q_x_a_1 / q_x_a_0 are presumably the expected rewards when an
# action is / is not chosen -- confirm against generate_synthetic_data2.
pi_0 = test_data["pi_0"]
q_x_a_1 = test_data["q_x_a_1"]
q_x_a_0 = test_data["q_x_a_0"]

## Value of the logging policy pi_0 on the test data: weight q_x_a_1 by pi_0
## and q_x_a_0 by (1 - pi_0), sum over axis 1, then average over samples
reward_under_pi_0 = pi_0 * q_x_a_1 + (1. - pi_0) * q_x_a_0
pi_0_value = reward_under_pi_0.sum(1).mean()

In [44]:
## Use the first training-data size of the list for this single run
num_data = num_data_list[0]
test_policy_value_list = []
# `_` is used as the seed of the logged data and as the offset added to the
# learners' seeds (presumably a leftover loop index -- only one run happens here)
_ = 0


def _value_on_test_data(pi):
    """Return the value of policy `pi` on the test data.

    Weights `q_x_a_1` by `pi` and `q_x_a_0` by `1 - pi`, sums over axis 1 and
    averages over the samples.
    """
    return (pi * q_x_a_1 + (1 - pi) * q_x_a_0).sum(1).mean()


## Generate logged data from the distribution induced by the logging policy
offline_logged_data = generate_synthetic_data2(
    num_data=num_data,
    beta=beta,
    theta_1=theta_1,
    M_1=M_1,
    b_1=b_1,
    theta_0=theta_0,
    M_0=M_0,
    b_0=b_0,
    dim_context=dim_x,
    num_actions=num_actions,
    random_state=_,
)

# Values are stored in insertion order: logging, ips-pg, cate-pg
true_value_of_learned_policies = {"logging": pi_0_value}

## Run off-policy learning on the logged data
learner_seed = random_state + _

### Gradient-based approach (policy gradient estimated with the IPS estimator)
ips = IPSBasedGradientPolicyLearner(
    dim_x=dim_x,
    num_actions=num_actions,
    max_iter=max_iter,
    random_state=learner_seed,
)
ips.fit(offline_logged_data, test_data)
pi_ips = ips.predict(test_data)
true_value_of_learned_policies["ips-pg"] = _value_on_test_data(pi_ips)

### Gradient-based approach (policy gradient estimated with the newly
### developed estimator)
cate = CateBasedGradientPolicyLearner(
    dim_x=dim_x,
    num_actions=num_actions,
    max_iter=max_iter,
    random_state=learner_seed,
)
cate.fit(offline_logged_data, test_data)
pi_cate = cate.predict(test_data)
true_value_of_learned_policies["cate-pg"] = _value_on_test_data(pi_cate)

test_policy_value_list.append(true_value_of_learned_policies)

In [45]:
# List the fields stored in the logged-data dictionary
offline_logged_data.keys()

dict_keys(['num_data', 'num_actions', 'x', 'a', 'r', 'a_mat', 'r_mat', 'pi_0', 'pscore', 'pscore_mat', 'q_x_a_1', 'q_x_a_0', 'cate_x_a'])

In [46]:
# First five rows of the feature vectors x; the trailing note is the full
# shape, num_data x dim_x for this run
offline_logged_data['x'][0:5] # 100 x 5

array([[ 1.76405235,  0.40015721,  0.97873798,  2.2408932 ,  1.86755799],
       [-0.97727788,  0.95008842, -0.15135721, -0.10321885,  0.4105985 ],
       [ 0.14404357,  1.45427351,  0.76103773,  0.12167502,  0.44386323],
       [ 0.33367433,  1.49407907, -0.20515826,  0.3130677 , -0.85409574],
       [-2.55298982,  0.6536186 ,  0.8644362 , -0.74216502,  2.26975462]])

In [47]:
# First five entries of `a`, presumably the index of the logged action per
# sample (consistent with the one-hot rows of 'a_mat'); full shape is (100,)
offline_logged_data['a'][0:5] # 100,

array([0, 1, 1, 1, 1])

In [48]:
# First five entries of `r`, presumably the reward observed for the logged
# action; in the rows shown it equals r_mat[i, a[i]]. Full shape is (100,)
offline_logged_data['r'][0:5] # 100,

array([-2.7613185 , -1.92036756,  1.84077523, -0.68413718,  0.19362076])

In [49]:
# First five rows of `a_mat`; in the rows shown it is the one-hot encoding of
# `a`. The trailing note is the full shape, num_data x num_actions
offline_logged_data['a_mat'][0:5] # 100 x 2

array([[1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1]])

In [50]:
# First five rows of `r_mat`; in the rows shown, column a[i] of row i equals
# r[i]. NOTE(review): the other column is presumably the reward observed for
# the action that was not chosen -- confirm against generate_synthetic_data2
offline_logged_data['r_mat'][0:5] # 100 x 2

array([[-2.7613185 ,  2.28860353],
       [ 4.95693577, -1.92036756],
       [-3.62091517,  1.84077523],
       [-3.22197227, -0.68413718],
       [-5.71492915,  0.19362076]])

In [51]:
# First five rows of `pi_0`, the logging policy's action probabilities; each
# row shown sums to 1. The trailing note is the full shape
offline_logged_data['pi_0'][0:5] # 100 x 2

array([[0.58288913, 0.41711087],
       [0.52160426, 0.47839574],
       [0.47015381, 0.52984619],
       [0.48640359, 0.51359641],
       [0.12279695, 0.87720305]])

In [52]:
# First five entries of `pscore`; in the rows shown it equals pi_0[i, a[i]],
# i.e. the logging-policy probability of the logged action. Full shape is (100,)
offline_logged_data['pscore'][0:5] # 100,

array([0.58288913, 0.47839574, 0.52984619, 0.51359641, 0.87720305])

In [53]:
# First five rows of `pscore_mat`; in the rows shown it is `pscore` repeated
# across the action columns. The trailing note is the full shape
offline_logged_data['pscore_mat'][0:5] # 100 x 2

array([[0.58288913, 0.58288913],
       [0.47839574, 0.47839574],
       [0.52984619, 0.52984619],
       [0.51359641, 0.51359641],
       [0.87720305, 0.87720305]])

In [54]:
# First five rows of `q_x_a_1`.
# NOTE(review): presumably the expected reward of each action when that
# action is chosen -- confirm against generate_synthetic_data2
offline_logged_data['q_x_a_1'][0:5] # 100 x 2

array([[ -3.14405093, -10.9758933 ],
       [  1.30503785,  -1.68615175],
       [ -0.23186256,   2.42204371],
       [ -0.32851033,   0.88363055],
       [  6.37831465,  -1.10780731]])

In [55]:
# First five rows of `q_x_a_0`.
# NOTE(review): presumably the expected reward of each action when that
# action is NOT chosen -- confirm against generate_synthetic_data2
offline_logged_data['q_x_a_0'][0:5] # 100 x 2

array([[ 1.35011374e+01,  2.32284581e+00],
       [ 3.86058892e+00,  4.69063050e-03],
       [-3.27346452e+00,  5.75710223e-01],
       [-1.58933775e+00,  1.66793683e-01],
       [-4.53577122e+00,  7.64016963e+00]])

In [56]:
# First five rows of `cate_x_a`; in the rows shown it equals
# q_x_a_1 - q_x_a_0 element-wise. The trailing note is the full shape
offline_logged_data['cate_x_a'][0:5] # 100 x 2

array([[-16.64518835, -13.29873911],
       [ -2.55555107,  -1.69084238],
       [  3.04160196,   1.84633348],
       [  1.26082741,   0.71683686],
       [ 10.91408587,  -8.74797694]])