In [1]:
# IPS推定量方策勾配推定勉強_custom.ipynb

In [2]:
import warnings
warnings.filterwarnings('ignore')

import torch
import numpy as np
import pandas as pd
from pandas import DataFrame
from tqdm import tqdm
from sklearn.utils import check_random_state
import seaborn as sns
import matplotlib.pyplot as plt
import japanize_matplotlib
plt.style.use('ggplot')

from dataset import generate_synthetic_data
from policylearners_custom import RegBasedPolicyLearner, GradientBasedPolicyLearner, POTEC
from utils import softmax

In [3]:
# データ

In [4]:
## Simulation settings
# NOTE(review): num_runs and num_data_list are not referenced by any cell visible
# in this notebook; only num_data is used for the single logged-data draw.
num_runs = 10 # number of simulation repetitions
dim_x = 5 # dimension of the feature vector x
num_actions = 3 # number of actions, |A|
num_clusters = 1 # number of action clusters, |C|
lambda_ = 0.5 # mixing ratio between the cluster effect and the residual effect
max_iter = 31 # number of epochs
test_data_size = 50000 # size of the test data
random_state = 12345
torch.manual_seed(random_state)
random_ = check_random_state(random_state)
num_data_list = [100, 200, 500, 1000, 2000] # training-data sizes
num_data = 1000

## Draw the parameters that define the expected reward function
# All draws below consume the same RandomState sequentially, so their order
# determines the resulting parameter values.
# phi_a holds one cluster index per action; with num_clusters = 1 every entry is 0.
phi_a = random_.choice(num_clusters, size=num_actions)
theta_g = random_.normal(size=(dim_x, num_clusters))
M_g = random_.normal(size=(dim_x, num_clusters))
b_g = random_.normal(size=(1, num_clusters))
theta_h = random_.normal(size=(dim_x, num_actions))
M_h = random_.normal(size=(dim_x, num_actions))
b_h = random_.normal(size=(1, num_actions))

## Generate test data used to approximate the true performance of the learned policy
test_data = generate_synthetic_data(
    num_data=test_data_size, lambda_=lambda_,
    theta_g=theta_g, M_g=M_g, b_g=b_g, theta_h=theta_h, M_h=M_h, b_h=b_h, phi_a=phi_a,
    dim_context=dim_x, num_actions=num_actions, num_clusters=num_clusters, random_state = random_state
)

In [5]:
def arrays_to_df_named(**arrays):
    """
    Stack several arrays column-wise into one DataFrame, naming the columns
    after the keyword each array was passed under.

    Parameters
    ----------
    **arrays : array-like
        Each value is converted with ``np.asarray`` and must be 1-D or 2-D.
        A 1-D array becomes a single column called ``name``; a 2-D array with
        one column is also called ``name``; a 2-D array with several columns
        becomes ``name_0``, ``name_1``, ...

    Returns
    -------
    pandas.DataFrame
        One row per input row, columns in keyword order. The arrays are joined
        with ``np.hstack``, so all columns share a single common dtype
        (e.g. integer columns become float when mixed with float columns).
        With no arguments an empty DataFrame is returned.

    Raises
    ------
    ValueError
        If an array is neither 1-D nor 2-D, or if the arrays do not all have
        the same number of rows.
    """
    # np.hstack([]) fails with an error unrelated to the caller's mistake,
    # so handle "nothing to stack" explicitly.
    if not arrays:
        return pd.DataFrame()

    col_list = []
    col_names = []
    n_rows = None

    for name, arr in arrays.items():
        arr = np.asarray(arr)
        if arr.ndim == 1:
            arr = arr.reshape(-1, 1)
        elif arr.ndim != 2:
            raise ValueError(f"Unsupported array shape: {arr.shape}")

        # Check row counts here so the error names the offending array
        # instead of surfacing as a generic np.hstack failure.
        if n_rows is None:
            n_rows = arr.shape[0]
        elif arr.shape[0] != n_rows:
            raise ValueError(
                f"Row count mismatch for '{name}': expected {n_rows}, got {arr.shape[0]}"
            )

        col_list.append(arr)

        # Column names: plain `name` for one column, name_0, name_1, ... otherwise
        if arr.shape[1] == 1:
            col_names.append(name)
        else:
            col_names.extend([f"{name}_{i}" for i in range(arr.shape[1])])

    concat_arr = np.hstack(col_list)
    return pd.DataFrame(concat_arr, columns=col_names)

In [6]:
# Bind the test-data arrays to module-level names in one step.
x, a, r, pi_0, pscore, q_x_a = (
    test_data[key] for key in ('x', 'a', 'r', 'pi_0', 'pscore', 'q_x_a')
)

# Flatten everything into one table; multi-column arrays get _0, _1, ... suffixes.
test_data_df = arrays_to_df_named(
    x=x,
    a=a,
    r=r,
    pi_0=pi_0,
    pscore=pscore,
    q_x_a=q_x_a,
)
test_data_df

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,a,r,pi_0_0,pi_0_1,pi_0_2,pscore,q_x_a_0,q_x_a_1,q_x_a_2
0,-0.204708,0.478943,-0.519439,-0.555730,1.965781,2.0,1.0,0.284894,0.402661,0.312445,0.312445,0.500317,0.508623,0.999674
1,1.393406,0.092908,0.281746,0.769023,1.246435,1.0,1.0,0.127650,0.454540,0.417810,0.454540,0.500254,0.518734,0.922592
2,1.007189,-1.296221,0.274992,0.228913,1.352917,0.0,0.0,0.259994,0.301837,0.438169,0.259994,0.485081,0.594697,0.852854
3,0.886429,-2.001637,-0.371843,1.669025,-0.438570,0.0,1.0,0.400166,0.455727,0.144106,0.400166,0.487238,0.986511,0.987238
4,-0.539741,0.476985,3.248944,-1.021228,-0.577087,1.0,1.0,0.266914,0.309795,0.423290,0.309795,0.655953,1.000000,0.500287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,-1.502279,0.338190,-0.573258,-0.377969,-1.742207,0.0,1.0,0.471300,0.268550,0.260151,0.471300,0.498053,0.109837,0.500888
49996,-0.238790,0.361659,0.050803,1.136654,-1.520213,1.0,1.0,0.290778,0.463101,0.246121,0.463101,0.452106,0.041226,0.506799
49997,0.111191,1.349551,-0.887112,1.432936,-0.347411,1.0,1.0,0.437081,0.440002,0.122917,0.440002,0.991077,0.995526,0.995287
49998,0.438447,0.399579,-0.526843,0.779586,-1.400417,2.0,1.0,0.390347,0.170914,0.438739,0.438739,0.502425,0.168708,0.506787


In [7]:
# Expected reward of the old (logging) policy on the test data:
# per row, sum over actions of pi_0_k * q_x_a_k, then average over rows.
# The column lists follow num_actions instead of spelling out three terms,
# so this cell keeps working when the number of actions changes.
# (Summation order differs from the hand-written sum, so the value can differ
# in the last floating-point digits.)
pi_0_cols = [f"pi_0_{i}" for i in range(num_actions)]
q_cols = [f"q_x_a_{i}" for i in range(num_actions)]
pi_0_value = (
    test_data_df[pi_0_cols].to_numpy() * test_data_df[q_cols].to_numpy()
).sum(axis=1).mean()
pi_0_value

0.7046519746193252

In [8]:
# Generate the logged data
# Seed for the logged-data draw, under a descriptive name. It used to be bound
# to `_`, which IPython also uses for "output of the previous cell"; assigning
# to it hides that feature and makes the seed easy to overwrite by accident.
logged_data_seed = 123
## Generate logged data following the distribution induced by the data-collection policy
offline_logged_data = generate_synthetic_data(
    num_data=num_data, lambda_=lambda_,
    theta_g=theta_g, M_g=M_g, b_g=b_g, theta_h=theta_h, M_h=M_h, b_h=b_h, phi_a=phi_a,
    dim_context=dim_x, num_actions=num_actions, num_clusters=num_clusters,
    random_state=logged_data_seed,
)

# NOTE: these rebind the module-level names x / a / r / pi_0 / pscore that an
# earlier cell set from test_data.
x = offline_logged_data['x']
a = offline_logged_data['a']
r = offline_logged_data['r']
pi_0 = offline_logged_data['pi_0']
pscore = offline_logged_data['pscore']

df = arrays_to_df_named(x=x, a=a, r=r, pi_0=pi_0, pscore=pscore)
df

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,a,r,pi_0_0,pi_0_1,pi_0_2,pscore
0,-1.085631,0.997345,0.282978,-1.506295,-0.578600,1.0,1.0,0.269929,0.468966,0.261105,0.468966
1,1.651437,-2.426679,-0.428913,1.265936,-0.866740,1.0,1.0,0.112969,0.502632,0.384399,0.502632
2,-0.678886,-0.094709,1.491390,-0.638902,-0.443982,0.0,0.0,0.244829,0.337527,0.417644,0.244829
3,-0.434351,2.205930,2.186786,1.004054,0.386186,1.0,1.0,0.474907,0.421363,0.103730,0.421363
4,0.737369,1.490732,-0.935834,1.175829,-1.253881,2.0,1.0,0.271040,0.264836,0.464124,0.464124
...,...,...,...,...,...,...,...,...,...,...,...
995,0.842827,-0.158087,0.894803,1.912632,-0.723939,0.0,0.0,0.336784,0.263921,0.399295,0.336784
996,0.615427,0.874359,1.250426,0.134459,-0.896068,2.0,1.0,0.208205,0.387376,0.404420,0.404420
997,-1.533461,0.737953,1.020214,0.991191,0.662288,1.0,0.0,0.283604,0.393775,0.322621,0.393775
998,-1.040656,0.953849,-1.910184,0.778448,-1.057939,0.0,1.0,0.154338,0.456767,0.388896,0.154338


In [9]:
# Observed mean reward in the logged data
logged_data_r = df.loc[:, 'r'].mean()
logged_data_r

0.704

In [10]:
# Gradient-based approach (policy gradient estimated with the IPS estimator)
# Derive the column names from the configured sizes instead of hard-coding
# five features and three actions, so changing dim_x / num_actions in the
# settings cell does not silently train on the wrong columns.
# Assumes dim_x > 1 and num_actions > 1: arrays_to_df_named leaves a
# single-column array unsuffixed ("x", "pi_0").
x_cols = [f"x_{i}" for i in range(dim_x)]
pi_0_cols = [f"pi_0_{i}" for i in range(num_actions)]

ips = GradientBasedPolicyLearner(dim_x=dim_x, num_actions=num_actions, max_iter=max_iter)
ips.fit(
    x=np.array(df[x_cols]),
    a=np.array(df['a']),
    r=np.array(df['r']),
    pscore=np.array(df['pscore']),
    pi_0=np.array(df[pi_0_cols]),
)
# Action probabilities of the learned policy for the test contexts
pi_ips = ips.predict(dataset_test_x=test_data['x'])

In [11]:
# Rebind the module-level names to the test-data arrays in one step.
x, a, r, pi_0, pscore, q_x_a = (
    test_data[key] for key in ('x', 'a', 'r', 'pi_0', 'pscore', 'q_x_a')
)

# Same table as the test data, extended with the learned policy's probabilities.
test_data_with_pi_ips_df = arrays_to_df_named(
    x=x,
    a=a,
    r=r,
    pi_0=pi_0,
    pscore=pscore,
    q_x_a=q_x_a,
    pi_ips=pi_ips,
)
test_data_with_pi_ips_df

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,a,r,pi_0_0,pi_0_1,pi_0_2,pscore,q_x_a_0,q_x_a_1,q_x_a_2,pi_ips_0,pi_ips_1,pi_ips_2
0,-0.204708,0.478943,-0.519439,-0.555730,1.965781,2.0,1.0,0.284894,0.402661,0.312445,0.312445,0.500317,0.508623,0.999674,0.000144,0.997135,0.002722
1,1.393406,0.092908,0.281746,0.769023,1.246435,1.0,1.0,0.127650,0.454540,0.417810,0.454540,0.500254,0.518734,0.922592,0.000063,0.996939,0.002998
2,1.007189,-1.296221,0.274992,0.228913,1.352917,0.0,0.0,0.259994,0.301837,0.438169,0.259994,0.485081,0.594697,0.852854,0.000008,0.998642,0.001349
3,0.886429,-2.001637,-0.371843,1.669025,-0.438570,0.0,1.0,0.400166,0.455727,0.144106,0.400166,0.487238,0.986511,0.987238,0.000106,0.993014,0.006880
4,-0.539741,0.476985,3.248944,-1.021228,-0.577087,1.0,1.0,0.266914,0.309795,0.423290,0.309795,0.655953,1.000000,0.500287,0.127459,0.173333,0.699209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,-1.502279,0.338190,-0.573258,-0.377969,-1.742207,0.0,1.0,0.471300,0.268550,0.260151,0.471300,0.498053,0.109837,0.500888,0.901295,0.004608,0.094097
49996,-0.238790,0.361659,0.050803,1.136654,-1.520213,1.0,1.0,0.290778,0.463101,0.246121,0.463101,0.452106,0.041226,0.506799,0.685600,0.048188,0.266212
49997,0.111191,1.349551,-0.887112,1.432936,-0.347411,1.0,1.0,0.437081,0.440002,0.122917,0.440002,0.991077,0.995526,0.995287,0.308347,0.528280,0.163373
49998,0.438447,0.399579,-0.526843,0.779586,-1.400417,2.0,1.0,0.390347,0.170914,0.438739,0.438739,0.502425,0.168708,0.506787,0.543739,0.184899,0.271362


In [12]:
# Expected reward of the new (learned) policy on the test data:
# per row, sum over actions of pi_ips_k * q_x_a_k, then average over rows.
# The column lists follow num_actions instead of spelling out three terms,
# so this cell keeps working when the number of actions changes.
# (Summation order differs from the hand-written sum, so the value can differ
# in the last floating-point digits.)
pi_ips_cols = [f"pi_ips_{i}" for i in range(num_actions)]
q_cols = [f"q_x_a_{i}" for i in range(num_actions)]
pi_ips_value = (
    test_data_with_pi_ips_df[pi_ips_cols].to_numpy()
    * test_data_with_pi_ips_df[q_cols].to_numpy()
).sum(axis=1).mean()
pi_ips_value

0.7549930563036533

In [13]:
# オフライン評価で評価してみる

In [14]:
# Pull the needed columns out as NumPy arrays.
# The column list follows num_actions rather than a literal 3, so it stays in
# sync with the settings cell.
pi_ips_values = test_data_with_pi_ips_df[[f"pi_ips_{i}" for i in range(num_actions)]].to_numpy()  # shape = (n, num_actions)
a_indices = test_data_with_pi_ips_df["a"].astype(int).to_numpy()  # shape = (n,)

# For each row, pick the learned policy's probability of the action stored in "a"
test_data_with_pi_ips_df["pscore_pi_ips"] = pi_ips_values[np.arange(len(test_data_with_pi_ips_df)), a_indices]
test_data_with_pi_ips_df

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,a,r,pi_0_0,pi_0_1,pi_0_2,pscore,q_x_a_0,q_x_a_1,q_x_a_2,pi_ips_0,pi_ips_1,pi_ips_2,pscore_pi_ips
0,-0.204708,0.478943,-0.519439,-0.555730,1.965781,2.0,1.0,0.284894,0.402661,0.312445,0.312445,0.500317,0.508623,0.999674,0.000144,0.997135,0.002722,0.002722
1,1.393406,0.092908,0.281746,0.769023,1.246435,1.0,1.0,0.127650,0.454540,0.417810,0.454540,0.500254,0.518734,0.922592,0.000063,0.996939,0.002998,0.996939
2,1.007189,-1.296221,0.274992,0.228913,1.352917,0.0,0.0,0.259994,0.301837,0.438169,0.259994,0.485081,0.594697,0.852854,0.000008,0.998642,0.001349,0.000008
3,0.886429,-2.001637,-0.371843,1.669025,-0.438570,0.0,1.0,0.400166,0.455727,0.144106,0.400166,0.487238,0.986511,0.987238,0.000106,0.993014,0.006880,0.000106
4,-0.539741,0.476985,3.248944,-1.021228,-0.577087,1.0,1.0,0.266914,0.309795,0.423290,0.309795,0.655953,1.000000,0.500287,0.127459,0.173333,0.699209,0.173333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,-1.502279,0.338190,-0.573258,-0.377969,-1.742207,0.0,1.0,0.471300,0.268550,0.260151,0.471300,0.498053,0.109837,0.500888,0.901295,0.004608,0.094097,0.901295
49996,-0.238790,0.361659,0.050803,1.136654,-1.520213,1.0,1.0,0.290778,0.463101,0.246121,0.463101,0.452106,0.041226,0.506799,0.685600,0.048188,0.266212,0.048188
49997,0.111191,1.349551,-0.887112,1.432936,-0.347411,1.0,1.0,0.437081,0.440002,0.122917,0.440002,0.991077,0.995526,0.995287,0.308347,0.528280,0.163373,0.528280
49998,0.438447,0.399579,-0.526843,0.779586,-1.400417,2.0,1.0,0.390347,0.170914,0.438739,0.438739,0.502425,0.168708,0.506787,0.543739,0.184899,0.271362,0.271362


In [15]:
# Importance weight per row: learned-policy probability of the recorded action
# divided by the recorded pscore.
test_data_with_pi_ips_df['iw'] = test_data_with_pi_ips_df['pscore_pi_ips'].div(
    test_data_with_pi_ips_df['pscore']
)
# IPS-style estimate: average of (importance weight * observed reward).
offline_pi_ips_value = (
    test_data_with_pi_ips_df['iw'].mul(test_data_with_pi_ips_df['r']).mean()
)
offline_pi_ips_value

0.76085401499816