# Contextual Bandits Agent with Policy Gradient Method (Deterministic) in Prediction Markets Problem
---
This program simulates an agent that trades in a prediction market. The prediction market aims to predict the true distribution of a random variable, which we define here as the colour of a bucket. The problem design comes from a human-subject experiment on decision markets.

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from main import deterministic_training_notebook
from Environment import ScoreFunction, DecisionRule

In [None]:
# ---- Optimiser settings -------------------------------------------------
# Algorithm: adam, momentum, regular
algorithm = 'regular'
learning_rate_theta = 3e-4
learning_rate_wv = 1e-3
learning_rate_wq = 1e-1
decay_rate = 0
beta1 = 0.9
beta2 = 0.9999

# ---- Replay / training schedule -----------------------------------------
memory_size = 16
batch_size = 16
training_episodes = 1_000_000
evaluation_step = 1

# ---- Exploration --------------------------------------------------------
fixed_std = 0.3
explorer_learning = False
# NOTE: learning_std is assigned here but is not passed to the training call below.
learning_std = False

# ---- Bucket parameters --------------------------------------------------
pr_red_ball_red_bucket = 2/3
pr_red_ball_blue_bucket = 1/3
# prior_red_list = [0.7, 0.3]
prior_red_list = None

# ---- Agents and features ------------------------------------------------
# total agent number will be (sq_agent_num +  3) * action_num
sq_agent_num = 1
action_num = 2
feature_num = 3
# NOTE: score_func is assigned here but is not passed to the training call below.
score_func = ScoreFunction.LOG
# Passed in empty; rebound to the first element returned by the training call.
agent_list = []

# Run training. Arguments are positional, so their order must match the
# signature of deterministic_training_notebook.
(
    agent_list,
    pd_outcome_list,
    prior_outcome_list,
    nb_outcome_list,
    loss_list,
) = deterministic_training_notebook(
    agent_list,
    feature_num,
    action_num,
    learning_rate_theta,
    learning_rate_wv,
    learning_rate_wq,
    memory_size,
    batch_size,
    training_episodes,
    decay_rate,
    beta1,
    beta2,
    algorithm,
    pr_red_ball_red_bucket,
    pr_red_ball_blue_bucket,
    prior_red_list,
    sq_agent_num,
    explorer_learning,
    fixed_std,
    evaluation_step,
)


In [None]:
# Print the mean of each outcome list returned by training, in the order:
# prior, pd, nb.
for outcome_values in (prior_outcome_list, pd_outcome_list, nb_outcome_list):
    print(np.mean(outcome_values))

In [None]:
# Print the sum of all recorded loss values, then plot them in order.
total_loss = np.sum(loss_list)
print(total_loss)
plt.plot(loss_list)

In [None]:
def plot_agent_diagnostics(agent):
    """Call the agent's reward, gradient and weight history plotting methods."""
    agent.reward_history_plot()
    agent.mean_gradients_history_plot()
    agent.mean_gradients_successive_dot_product_plot()
    # agent.mean_history_plot()
    agent.mean_weights_history_plot()


# agent_list is iterated as a list of lists: one inner list of agents per entry.
for action_agent_list in agent_list:
    for agent in action_agent_list:
        plot_agent_diagnostics(agent)

In [6]:
# agent_list[0].gradients_history_plot('q')
# agent_list[0].gradients_successive_dot_product_plot('q')
# #     agent.mean_history_plot()
# agent_list[0].weights_history_plot('q')

In [7]:
# Display the w_q attribute of the first entry of agent_list.
# NOTE(review): the plotting cell earlier in this notebook iterates agent_list
# as a list of lists of agents, whereas this cell treats agent_list[0] as a
# single agent -- confirm which structure the training call returns.
agent_list[0].w_q

array([[ 0.01613107,  0.06266245],
       [ 0.0065514 , -0.02498846],
       [ 0.05713842,  0.0127856 ],
       [-0.01092689, -0.06594342],
       [-0.04668794, -0.03421238],
       [-0.09612956, -0.00364618]])

In [8]:
# Display the w_v attribute of the first entry of agent_list.
agent_list[0].w_v

array([[ 0.03737971, -0.00155561],
       [ 0.00611195, -0.00452361],
       [-0.01692968,  0.00832946],
       [-0.01241121,  0.03555763],
       [-0.00171688,  0.06363951],
       [-0.01242466, -0.00942074]])

In [9]:
from Environment import BucketColour, Ball
import numpy as np

In [10]:
# Build a signal with the first agent's signal_encode, using the arguments
# (0, Ball.RED, 3/4). The meaning of each argument is defined by
# signal_encode, which is not shown in this notebook.
signal = agent_list[0].signal_encode(0, Ball.RED, 3/4)

In [11]:
# A single row holding the value 0.1 in each of two columns.
a = np.array([[0.1, 0.1]])
# Matrix product of the transposed signal with that row.
phi_array = signal.T @ a

In [12]:
# Display phi_array as the cell output.
phi_array

array([[0.1       , 0.1       ],
       [0.        , 0.        ],
       [0.10986123, 0.10986123],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.10986123, 0.10986123]])

In [31]:
# Matrix product of the signal with the first agent's w_v weights.
v_array = signal @ agent_list[0].w_v

In [32]:
# Display v_array as the cell output.
v_array

array([[0.03632203, 0.01079974]])

In [33]:
# Element-wise product of phi_array with the first agent's w_q weights,
# summed down the rows (axis 0) while keeping the result two-dimensional.
weighted_phi = phi_array * agent_list[0].w_q
q_array = weighted_phi.sum(axis=0, keepdims=True)
q_array

array([[-0.00066399,  0.00138547]])