# 勾配ベース理解

In [1]:
import warnings
warnings.filterwarnings('ignore')

import torch
import numpy as np
import pandas as pd
from pandas import DataFrame
from tqdm import tqdm
from sklearn.utils import check_random_state
import seaborn as sns
import matplotlib.pyplot as plt
import japanize_matplotlib
plt.style.use('ggplot')

from dataset import generate_synthetic_data
from policylearners import RegBasedPolicyLearner, GradientBasedPolicyLearner, POTEC
from utils import softmax

In [2]:
## Simulation settings
num_runs = 100  # number of simulation repetitions
dim_x = 5  # dimension of the feature vector x
num_actions = 2  # number of actions, |A|
num_clusters = 2  # number of action clusters, |C|
lambda_ = 0.5  # mixing ratio between the cluster effect and the residual effect
max_iter = 31  # number of epochs
test_data_size = 50000  # size of the test data
random_state = 12345
torch.manual_seed(random_state)
random_ = check_random_state(random_state)
num_data_list = [100, 200, 500, 1000, 2000]  # candidate training-data sizes

# Training-data size used in this notebook: 100.
# (Switch to num_data_list[4] to use 2,000 instead.)
num_data = num_data_list[0]

## Draw the parameters that define the expected reward function.
# The draws all consume the same `random_` stream, so their order is kept fixed.
phi_a = random_.choice(num_clusters, size=num_actions)
theta_g = random_.normal(size=(dim_x, num_clusters))
M_g = random_.normal(size=(dim_x, num_clusters))
b_g = random_.normal(size=(1, num_clusters))
theta_h = random_.normal(size=(dim_x, num_actions))
M_h = random_.normal(size=(dim_x, num_actions))
b_h = random_.normal(size=(1, num_actions))

# Arguments shared by both data-generation calls below; only `num_data` differs.
# NOTE(review): the test data and the logged data are generated with the same
# `random_state` (the earlier per-run seed `_` was replaced by a fixed one).
# If generate_synthetic_data seeds its sampling from this value, the logged
# samples may coincide with part of the test data -- confirm this is intended.
shared_generator_kwargs = dict(
    lambda_=lambda_,
    theta_g=theta_g, M_g=M_g, b_g=b_g,
    theta_h=theta_h, M_h=M_h, b_h=b_h,
    phi_a=phi_a,
    dim_context=dim_x,
    num_actions=num_actions,
    num_clusters=num_clusters,
    random_state=random_state,
)

## Generate test data used to approximate the true performance of learned policies.
test_data = generate_synthetic_data(num_data=test_data_size, **shared_generator_kwargs)
# Row-wise sum of q_x_a * pi_0, averaged over the test rows
# (presumably the value of the logging policy pi_0 -- confirm against `dataset`).
pi_0_value = (test_data["q_x_a"] * test_data["pi_0"]).sum(1).mean()

## Generate logged data following the distribution induced by the data-collection policy.
offline_logged_data = generate_synthetic_data(num_data=num_data, **shared_generator_kwargs)

In [3]:
# The logged data is a dict, which is hard to read at a glance,
# so flatten it into a single DataFrame (offline_logged_data -> df).
data = offline_logged_data.copy()
logged_num_actions = data['num_actions']

# Context features: one column per dimension of x.
context_columns = [f'x_{i}' for i in range(data['x'].shape[1])]
df = pd.DataFrame(data['x'], columns=context_columns)
df['a'] = data['a']
df['r'] = data['r']

# One column per action for q_x_a and for pi_0.
ex_reward_df = pd.DataFrame(
    data['q_x_a'],
    columns=[f'q_x_{i}' for i in range(logged_num_actions)],
)
pi_b_df = pd.DataFrame(
    data['pi_0'].reshape(data['num_data'], logged_num_actions),
    columns=[f'pi_0_{i}' for i in range(logged_num_actions)],
)

# Append both per-action tables column-wise, then the pscore column last.
df = pd.concat([df, ex_reward_df, pi_b_df], axis=1)
df['pscore'] = data['pscore']
df

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,a,r,q_x_0,q_x_1,pi_0_0,pi_0_1,pscore
0,-0.204708,0.478943,-0.519439,-0.555730,1.965781,1,0,0.175144,0.042578,0.431547,0.568453,0.568453
1,1.393406,0.092908,0.281746,0.769023,1.246435,0,0,0.357402,0.026896,0.545722,0.454278,0.545722
2,1.007189,-1.296221,0.274992,0.228913,1.352917,0,0,0.086599,0.165448,0.561380,0.438620,0.561380
3,0.886429,-2.001637,-0.371843,1.669025,-0.438570,0,0,0.000486,0.777752,0.381532,0.618468,0.381532
4,-0.539741,0.476985,3.248944,-1.021228,-0.577087,1,1,1.000000,1.000000,0.440605,0.559395,0.559395
...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.398092,-0.916935,-0.082650,-1.939691,1.407994,1,1,0.000950,0.456561,0.442579,0.557421,0.557421
96,1.512406,0.526493,-0.266931,0.862284,0.083803,0,1,0.728565,0.313117,0.620969,0.379031,0.620969
97,-1.872339,-0.962791,0.080067,0.128726,-0.479120,1,1,0.728029,0.998228,0.442754,0.557246,0.557246
98,-0.640281,0.745974,-0.622547,0.936289,0.750018,1,1,0.920900,0.913424,0.549833,0.450167,0.450167


In [4]:
# df.to_csv('offline_logged_data.csv', index=False)

In [5]:
### Gradient-based approach (policy gradient estimated with the IPS estimator)
ips = GradientBasedPolicyLearner(
    dim_x=dim_x,
    num_actions=num_actions,
    max_iter=max_iter,
    batch_size=10,
)
# Keep what fit() returns so the notebook can display it below.
_list = ips.fit(offline_logged_data, test_data)
_list

[tensor(0.2291, grad_fn=<NegBackward0>),
 tensor(0.5751, grad_fn=<NegBackward0>),
 tensor(0.4930, grad_fn=<NegBackward0>),
 tensor(0.3968, grad_fn=<NegBackward0>),
 tensor(0.4378, grad_fn=<NegBackward0>),
 tensor(0.5375, grad_fn=<NegBackward0>),
 tensor(0.3900, grad_fn=<NegBackward0>),
 tensor(0.4003, grad_fn=<NegBackward0>),
 tensor(0.1012, grad_fn=<NegBackward0>),
 tensor(0.3841, grad_fn=<NegBackward0>),
 tensor(0.1568, grad_fn=<NegBackward0>),
 tensor(0.4551, grad_fn=<NegBackward0>),
 tensor(0.3906, grad_fn=<NegBackward0>),
 tensor(0.2970, grad_fn=<NegBackward0>),
 tensor(0.3825, grad_fn=<NegBackward0>),
 tensor(0.4308, grad_fn=<NegBackward0>),
 tensor(0.3459, grad_fn=<NegBackward0>),
 tensor(0.3285, grad_fn=<NegBackward0>),
 tensor(0.0934, grad_fn=<NegBackward0>),
 tensor(0.3073, grad_fn=<NegBackward0>),
 tensor(0.1116, grad_fn=<NegBackward0>),
 tensor(0.3572, grad_fn=<NegBackward0>),
 tensor(0.3088, grad_fn=<NegBackward0>),
 tensor(0.2225, grad_fn=<NegBackward0>),
 tensor(0.3478, 