# 勾配ベース理解

In [1]:
import warnings
warnings.filterwarnings('ignore')

import torch
import numpy as np
import pandas as pd
from pandas import DataFrame
from tqdm import tqdm
from sklearn.utils import check_random_state
import seaborn as sns
import matplotlib.pyplot as plt
import japanize_matplotlib
plt.style.use('ggplot')

from dataset import generate_synthetic_data
# from policylearners import RegBasedPolicyLearner, GradientBasedPolicyLearner, POTEC
from utils import softmax

In [2]:
## シミュレーション設定
num_runs = 100 # シミュレーションの繰り返し回数
dim_x = 5 # 特徴量xの次元
num_actions = 2 # 行動数, |A|
num_clusters = 2 # 行動クラスタ数, |C|
lambda_ = 0.5 # クラスタ効果と残差効果の配合率
max_iter = 31 # エポック数
test_data_size = 50000 # テストデータのサイズ
random_state = 12345
torch.manual_seed(random_state)
random_ = check_random_state(random_state)
num_data_list = [100, 200, 500, 1000, 2000] # トレーニングデータのサイズ

# num_data = num_data_list[4] # 2,000
num_data = num_data_list[0] # 100

## 期待報酬関数を定義するためのパラメータを抽出
phi_a = random_.choice(num_clusters, size=num_actions)
theta_g = random_.normal(size=(dim_x, num_clusters))
M_g = random_.normal(size=(dim_x, num_clusters))
b_g = random_.normal(size=(1, num_clusters))
theta_h = random_.normal(size=(dim_x, num_actions))
M_h = random_.normal(size=(dim_x, num_actions))
b_h = random_.normal(size=(1, num_actions))

## 学習された方策の真の性能を近似するためのテストデータを生成
test_data = generate_synthetic_data(
    num_data=test_data_size, lambda_=lambda_,
    theta_g=theta_g, M_g=M_g, b_g=b_g, theta_h=theta_h, M_h=M_h, b_h=b_h, phi_a=phi_a,
    dim_context=dim_x, num_actions=num_actions, num_clusters=num_clusters, random_state = random_state
)
pi_0_value = (test_data["q_x_a"] * test_data["pi_0"]).sum(1).mean()

## データ収集方策が形成する分布に従いログデータを生成
offline_logged_data = generate_synthetic_data(
    num_data=num_data, lambda_=lambda_,
    theta_g=theta_g, M_g=M_g, b_g=b_g, theta_h=theta_h, M_h=M_h, b_h=b_h, phi_a=phi_a,
    dim_context=dim_x, num_actions=num_actions, num_clusters=num_clusters,
    # random_state = _
    random_state = random_state
)

In [3]:
# 辞書型で直観的にわかりづらいのでdfに変換
# log_data->log_data_df
data = offline_logged_data.copy()
df = pd.DataFrame(data['x'], columns=['x_'+str(i) for i in range(data['x'].shape[1])])
df['a'] = data['a']
df['r'] = data['r']
ex_reward_df = pd.DataFrame(data['q_x_a'], columns=['q_x_'+str(i) for i in range(data['num_actions'])])
df = pd.concat([df, ex_reward_df], axis=1)
pi_b_df = pd.DataFrame(data['pi_0'].reshape(data['num_data'], data['num_actions']), columns=['pi_0_'+str(i) for i in range(data['num_actions'])])
df = pd.concat([df, pi_b_df], axis=1)
df['pscore'] = data['pscore']
df

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,a,r,q_x_0,q_x_1,pi_0_0,pi_0_1,pscore
0,-0.204708,0.478943,-0.519439,-0.555730,1.965781,1,0,0.175144,0.042578,0.431547,0.568453,0.568453
1,1.393406,0.092908,0.281746,0.769023,1.246435,0,0,0.357402,0.026896,0.545722,0.454278,0.545722
2,1.007189,-1.296221,0.274992,0.228913,1.352917,0,0,0.086599,0.165448,0.561380,0.438620,0.561380
3,0.886429,-2.001637,-0.371843,1.669025,-0.438570,0,0,0.000486,0.777752,0.381532,0.618468,0.381532
4,-0.539741,0.476985,3.248944,-1.021228,-0.577087,1,1,1.000000,1.000000,0.440605,0.559395,0.559395
...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.398092,-0.916935,-0.082650,-1.939691,1.407994,1,1,0.000950,0.456561,0.442579,0.557421,0.557421
96,1.512406,0.526493,-0.266931,0.862284,0.083803,0,1,0.728565,0.313117,0.620969,0.379031,0.620969
97,-1.872339,-0.962791,0.080067,0.128726,-0.479120,1,1,0.728029,0.998228,0.442754,0.557246,0.557246
98,-0.640281,0.745974,-0.622547,0.936289,0.750018,1,1,0.920900,0.913424,0.549833,0.450167,0.450167


In [4]:
# df.to_csv('offline_logged_data.csv', index=False)

In [25]:
from copy import copy
from collections import OrderedDict
from dataclasses import dataclass
import numpy as np
from sklearn.utils import check_random_state
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ExponentialLR
from utils import softmax, RegBasedPolicyDataset, GradientBasedPolicyDataset

# class
dim_x: int = dim_x
num_actions: int = num_actions
hidden_layer_size: tuple = (30, 30, 30)
activation: str = "elu"
batch_size: int = 10
learning_rate_init: float = 0.005
gamma: float = 0.98
alpha: float = 1e-6
imit_reg: float = 0.0
log_eps: float = 1e-10
solver: str = "adagrad"
max_iter: int = 30
random_state: int = 12345

# init
layer_list = []
input_size = dim_x

if activation == "tanh":
    activation_layer = nn.Tanh
elif activation == "relu":
    activation_layer = nn.ReLU
elif activation == "elu":
    activation_layer = nn.ELU

for i, h in enumerate(hidden_layer_size):
    layer_list.append(("l{}".format(i), nn.Linear(input_size, h)))
    layer_list.append(("a{}".format(i), activation_layer()))
    input_size = h
layer_list.append(("output", nn.Linear(input_size, num_actions)))

nn_model = nn.Sequential(OrderedDict(layer_list))

random_ = check_random_state(random_state)
train_loss = []
train_value = []
test_value = []

# _create_train_data_for_opl
def _create_train_data_for_opl(
    x: np.ndarray,
    a: np.ndarray,
    r: np.ndarray,
    pscore: np.ndarray,
    q_hat: np.ndarray,
    pi_0: np.ndarray,
) -> tuple:
    dataset = GradientBasedPolicyDataset(
        torch.from_numpy(x).float(),
        torch.from_numpy(a).long(),
        torch.from_numpy(r).float(),
        torch.from_numpy(pscore).float(),
        torch.from_numpy(q_hat).float(),
        torch.from_numpy(pi_0).float(),
    )

    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
    )

    return data_loader

# _estimate_policy_gradient
def _estimate_policy_gradient(
    a: torch.Tensor,
    r: torch.Tensor,
    pscore: torch.Tensor,
    q_hat: torch.Tensor,
    pi: torch.Tensor,
    pi_0: torch.Tensor,
) -> torch.Tensor:
    current_pi = pi.detach()
    log_prob = torch.log(pi + log_eps)
    idx = torch.arange(a.shape[0], dtype=torch.long)

    q_hat_factual = q_hat[idx, a]
    iw = current_pi[idx, a] / pscore
    estimated_policy_grad_arr = iw * (r - q_hat_factual) * log_prob[idx, a]
    estimated_policy_grad_arr += torch.sum(q_hat * current_pi * log_prob, dim=1)

    # imitation regularization
    estimated_policy_grad_arr += imit_reg * log_prob[idx, a]

    return estimated_policy_grad_arr

# predict
def predict(dataset_test: np.ndarray) -> np.ndarray:

    nn_model.eval()
    x = torch.from_numpy(dataset_test["x"]).float()
    return nn_model(x).detach().numpy()

# fit
dataset = offline_logged_data
dataset_test = test_data
q_hat = None

x, a, r = dataset["x"], dataset["a"], dataset["r"]
pscore, pi_0 = dataset["pscore"], dataset["pi_0"]

In [26]:
if q_hat is None:
    q_hat = np.zeros((r.shape[0], num_actions))

if solver == "adagrad":
    optimizer = optim.Adagrad(
        nn_model.parameters(),
        lr=learning_rate_init,
        weight_decay=alpha,
    )
elif solver == "adam":
    optimizer = optim.AdamW(
        nn_model.parameters(),
        lr=learning_rate_init,
        weight_decay=alpha,
    )
else:
    raise NotImplementedError("`solver` must be one of 'adam' or 'adagrad'")

training_data_loader = _create_train_data_for_opl(
    x,
    a,
    r,
    pscore,
    q_hat,
    pi_0,
)

In [29]:
# start policy training
scheduler = ExponentialLR(optimizer, gamma=gamma)
q_x_a_train, q_x_a_test = dataset["q_x_a"], dataset_test["q_x_a"]
# for _ in range(max_iter):
_ = 0
loss_epoch = 0.0
nn_model.train()
for x_, a_, r_, p, q_hat_, pi_0_ in training_data_loader:
    optimizer.zero_grad()
    pi = nn_model(x_)
    print(nn_model)
    print(pi)
    loss = -_estimate_policy_gradient(
        a=a_,
        r=r_,
        pscore=p,
        q_hat=q_hat_,
        pi_0=pi_0_,
        pi=pi,
    ).mean()
    print(loss)
    print('==============================')
    loss.backward()
    optimizer.step()
    loss_epoch += loss.item()
train_loss.append(loss_epoch)
scheduler.step()
pi_train = predict(dataset)
train_value.append((q_x_a_train * pi_train).sum(1).mean())
pi_test = predict(dataset_test)
test_value.append((q_x_a_test * pi_test).sum(1).mean())


Sequential(
  (l0): Linear(in_features=5, out_features=30, bias=True)
  (a0): ELU(alpha=1.0)
  (l1): Linear(in_features=30, out_features=30, bias=True)
  (a1): ELU(alpha=1.0)
  (l2): Linear(in_features=30, out_features=30, bias=True)
  (a2): ELU(alpha=1.0)
  (output): Linear(in_features=30, out_features=2, bias=True)
)
tensor([[1.0508, 1.6958],
        [0.8254, 1.0537],
        [1.0413, 1.4871],
        [0.7560, 0.9081],
        [0.8688, 1.3850],
        [0.9912, 1.4644],
        [0.7446, 1.4035],
        [0.9548, 1.5378],
        [0.7649, 1.3943],
        [0.8751, 1.2964]], grad_fn=<AddmmBackward0>)
tensor(-0.3537, grad_fn=<NegBackward0>)
Sequential(
  (l0): Linear(in_features=5, out_features=30, bias=True)
  (a0): ELU(alpha=1.0)
  (l1): Linear(in_features=30, out_features=30, bias=True)
  (a1): ELU(alpha=1.0)
  (l2): Linear(in_features=30, out_features=30, bias=True)
  (a2): ELU(alpha=1.0)
  (output): Linear(in_features=30, out_features=2, bias=True)
)
tensor([[0.9176, 1.2740],
     

In [25]:
for x_, a_, r_, p, q_hat_, pi_0_ in training_data_loader:
    optimizer.zero_grad()
    pi = nn_model(x_)

    a=a_
    r=r_
    pscore=p
    q_hat=q_hat_
    pi_0=pi_0_
    pi=pi
    
    current_pi = pi.detach()
    log_prob = torch.log(pi + log_eps)
    idx = torch.arange(a.shape[0], dtype=torch.long)
    print(current_pi)
    print('====================')

    q_hat_factual = q_hat[idx, a]
    iw = current_pi[idx, a] / pscore
    estimated_policy_grad_arr = iw * (r - q_hat_factual) * log_prob[idx, a]
    estimated_policy_grad_arr += torch.sum(q_hat * current_pi * log_prob, dim=1)

    # imitation regularization
    estimated_policy_grad_arr += imit_reg * log_prob[idx, a]

    # return estimated_policy_grad_arr

    # optimizer.zero_grad()
    # pi = nn_model(x_)
    # loss = -_estimate_policy_gradient(
    #     a=a_,
    #     r=r_,
    #     pscore=p,
    #     q_hat=q_hat_,
    #     pi_0=pi_0_,
    #     pi=pi,
    # ).mean()
    # loss.backward()
    # optimizer.step()
    # loss_epoch += loss.item()

train_loss.append(loss_epoch)
scheduler.step()
pi_train = predict(dataset)
train_value.append((q_x_a_train * pi_train).sum(1).mean())
pi_test = predict(dataset_test)
test_value.append((q_x_a_test * pi_test).sum(1).mean())


tensor([[-0.0596, -0.0799],
        [-0.0249, -0.0351],
        [ 0.0053, -0.0109],
        [ 0.2124,  0.0857],
        [-0.3257, -0.1027],
        [-0.0983, -0.0927],
        [ 0.0868, -0.0966],
        [-0.0470, -0.0554],
        [ 0.0816, -0.1098],
        [-0.0009, -0.0409]])
tensor([[ 0.0074, -0.0447],
        [ 0.1448,  0.0025],
        [-0.0004, -0.0306],
        [ 0.1130, -0.0888],
        [-0.0588, -0.1069],
        [ 0.0268, -0.0602],
        [-0.0208, -0.0281],
        [-0.0571, -0.0420],
        [ 0.0383, -0.0757],
        [ 0.0616, -0.0149]])
tensor([[-0.0190, -0.0801],
        [-0.1829, -0.1297],
        [-0.1816, -0.1232],
        [ 0.0910, -0.0658],
        [ 0.0972, -0.0279],
        [-0.0017, -0.0779],
        [ 0.0076, -0.0085],
        [-0.0736, -0.0966],
        [-0.1470, -0.1289],
        [ 0.0332, -0.0378]])
tensor([[-0.1282, -0.1312],
        [ 0.0358, -0.0466],
        [ 0.1903,  0.0130],
        [ 0.0206, -0.0025],
        [-0.0259, -0.0749],
        [ 0.1316,