In [35]:
import numpy as np
import pandas as pd

In [36]:
def get_best_policy(reward_matrix):
    """Return the greedy policy for a reward matrix.

    Each row of the result puts all of its probability mass on the column(s)
    holding that row's maximum reward. Ties split the mass equally, so every
    row sums to 1.
    """
    row_best = np.max(reward_matrix, axis=1, keepdims=True)
    is_best = np.equal(reward_matrix, row_best).astype(float)
    n_best = np.sum(is_best, axis=1, keepdims=True)

    return is_best / n_best


def get_perceived_score(logged_reward_matrix, policy, n_interactions):
    """Score a policy directly against logged rewards, with no propensity correction.

    Weights every entry of ``logged_reward_matrix`` by the matching entry of
    ``policy``, sums the result and divides by ``n_interactions``.
    """
    weighted_rewards = policy * logged_reward_matrix
    total_reward = np.sum(weighted_rewards)

    return total_reward / n_interactions


def get_logged_reward(reward_matrix, cust_segments, policy, n_interactions):
    """Compute the rewards and assignment counts produced by running a policy.

    Returns a ``(logged_reward_matrix, assignment_matrix)`` tuple, where
    ``assignment_matrix[x, a] = n_interactions * cust_segments[x] * policy[x, a]``
    and ``logged_reward_matrix = assignment_matrix * reward_matrix``.
    """
    # Interactions per segment, shaped as a column so it broadcasts across actions.
    segment_totals = n_interactions * cust_segments[:, None]
    assignment_matrix = segment_totals * policy

    return assignment_matrix * reward_matrix, assignment_matrix


def get_true_score(reward_matrix, cust_segments, policy):
    """Return the reward per interaction of a policy, rounded to 4 decimals.

    Calls ``get_logged_reward`` with ``n_interactions=1`` and sums the
    resulting reward matrix.
    """
    per_interaction_reward = get_logged_reward(reward_matrix, cust_segments, policy, 1)[0]
    total_reward = np.sum(per_interaction_reward)

    return np.round(total_reward, 4)

In [37]:
# Reward matrix indexed [x, a]; passed as `reward_matrix` to the helper functions.
# Multiplied by assignment counts it yields the logged reward totals, so each
# entry acts as a per-interaction reward rate for that (x, a) pair.
q_x_a = np.array([
    [.4, .6],
    [.7, .5]
])

# Share of interactions belonging to each x; passed as `cust_segments`.
p_x = np.array([.4 , .6])

# Logging policy indexed [x, a]; each row sums to 1.
policy_0 = np.array([
    [.7, .3],
    [.2, .8]
])

# Number of logged interactions to generate.
n_interactions = 1000

In [38]:
n_x, n_a = q_x_a.shape
logged_reward_matrix_0, assignment_matrix_0 = get_logged_reward(q_x_a, p_x, policy_0, n_interactions)

# Expand the aggregated matrices into one row per interaction:
# x = row index, a = column index, r = 1 for a rewarded interaction, else 0.
record_columns = {'x': [], 'a': [], 'r': []}
for x in range(n_x):
    for a in range(n_a):
        # The matrices are float products, so a count may come out as e.g.
        # 279.99999999999997. Round instead of truncating with int().
        n_assigned = int(round(assignment_matrix_0[x, a]))
        n_rewarded = min(n_assigned, int(round(logged_reward_matrix_0[x, a])))

        record_columns['x'] += [x] * n_assigned
        record_columns['a'] += [a] * n_assigned
        # Derive the zero count from n_assigned so 'r' always has the same
        # length as 'x' and 'a'.
        record_columns['r'] += [1] * n_rewarded + [0] * (n_assigned - n_rewarded)

records_0 = pd.DataFrame(record_columns)

In [39]:
# Number of interactions assigned to each (x, a) pair under policy_0.
assignment_matrix_0

array([[280., 120.],
       [120., 480.]])

In [40]:
# Total reward logged for each (x, a) pair: assignment counts times q_x_a.
print('logged_reward_matrix for policy_0')
print(logged_reward_matrix_0)

logged_reward_matrix for policy_0
[[112.  72.]
 [ 84. 240.]]


In [41]:
# Greedy policy with respect to the reward matrix q_x_a.
policy_trueoptimal = get_best_policy(q_x_a)
# Greedy policy with respect to the raw logged reward totals. Those totals
# also depend on how often policy_0 played each action, not only on q_x_a.
policy_percieveoptimal = get_best_policy(logged_reward_matrix_0)


In [42]:
# Show the two greedy policies side by side for comparison.
print('true optimal policy')
print(policy_trueoptimal)
print('percieved optimal policy')
print(policy_percieveoptimal)

true optimal policy
[[0. 1.]
 [1. 0.]]
percieved optimal policy
[[1. 0.]
 [0. 1.]]


In [43]:
# Score each policy with get_true_score, which uses q_x_a and p_x directly
# rather than the logged data.
print('true optimal policy true score:', get_true_score(q_x_a, p_x, policy_trueoptimal))
print('percieved optimal policy true score:', get_true_score(q_x_a, p_x, policy_percieveoptimal))
print('policy_0 true score:', get_true_score(q_x_a, p_x, policy_0))

true optimal policy true score: 0.66
percieved optimal policy true score: 0.46
policy_0 true score: 0.508


In [44]:
# Score each policy against the rewards logged under policy_0, without any
# propensity correction.
# NOTE(review): the labels say "score bias", but the printed values are the
# perceived scores returned by get_perceived_score, not a difference from the
# true scores.
print('true optimal policy score bias:', get_perceived_score(logged_reward_matrix_0, policy_trueoptimal, n_interactions))
print('percieved optimal policy score bias:', get_perceived_score(logged_reward_matrix_0, policy_percieveoptimal, n_interactions))
print('policy_0 score bias:', get_perceived_score(logged_reward_matrix_0, policy_0, n_interactions))

true optimal policy score bias: 0.156
percieved optimal policy score bias: 0.352
policy_0 score bias: 0.30879999999999996


In [45]:
# Compare the rewards that would be logged by running policy_trueoptimal with
# the rewards logged under policy_0 after reweighting each (x, a) entry by
# policy_trueoptimal / policy_0.
# The element-wise division is safe here because every entry of policy_0 is
# non-zero.
print('logged reward generated under optimal policy')
print(get_logged_reward(q_x_a, p_x, policy_trueoptimal, n_interactions)[0])
print('evaluated reward of optimal policy using original logged reward with propensity score adjustment')
print(policy_trueoptimal * logged_reward_matrix_0 / policy_0)

logged reward generated under optimal policy
[[  0. 240.]
 [420.   0.]]
evaluated reward of optimal policy using original logged reward with propensity score adjustment
[[  0. 240.]
 [420.   0.]]


Adjusting the evaluated reward by the propensity score transforms the distribution of the logged reward so that it looks as if it had been generated by the evaluated policy.

In [46]:
# Dividing the logged rewards by the logging policy should match
# n_interactions * p_x * q_x_a, a matrix that no longer depends on policy_0.
print('logged_reward_matrix_0 / policy_0')
print(logged_reward_matrix_0 / policy_0)
print('n_interactions * p_x * q_x_a')
print(n_interactions * p_x[:, None] * q_x_a)

logged_reward_matrix_0 / policy_0
[[160. 240.]
 [420. 300.]]
n_interactions * p_x * q_x_a
[[160. 240.]
 [420. 300.]]


Adjusting logged_reward_matrix_0 by the propensity score yields a reward matrix that looks as if it had not been subjected to any policy (i.e. as if all actions were played at the same time).