### Test

In [1]:
from utils import * 
from algorithms import *
from agents import *
from env import *

In [2]:
# HYPERPARAMS
# All values except `num_states` and `verbose` are forwarded positionally to
# PPO1 through `algoArgs`; `num_states` and `verbose` are additionally passed to
# the Reinforcer agent. Meanings below are inferred from the names only --
# TODO confirm against the PPO1 implementation.

# -- optimiser / return estimation --
learning_rate = 0.0003  # presumably the optimiser step size
gamma = 0.9             # presumably the reward discount factor
lmbda = 0.9             # presumably the GAE lambda

# -- PPO update --
eps_clip = 0.2          # presumably the clipping range of the PPO surrogate
K_epoch = 10            # presumably optimisation passes per training call

# -- data collection --
rollout_len = 3         # transitions gathered before a rollout is handed to the model
buffer_size = 10
minibatch_size = 32

# -- misc --
verbose = 0
num_states = 2

In [3]:
# ENVIRONMENT PARAMS
# Game dimensions. The same counts are packed into `gameData` / `metaData`
# below and are also used to size the agent rosters, so they only need to be
# edited here.
numRounds, numPeriods, numSteps = 1, 10000, 9
numBuyers, numSellers, numTokens = 3, 6, 4
gameTypes, seed = '1111', 42
disclosure = []
depth = 0
gameData = [gameTypes, numBuyers, numSellers, numTokens, numRounds, numPeriods, numSteps, seed]
numStates = num_states  # camelCase alias of the hyperparameter, used for the agent's algoArgs
log = Log(gameData, disclosure)

# ALGORITHM
# Positional argument order must match the PPO1 constructor.
algoArgs = [num_states, learning_rate, gamma, lmbda, eps_clip, K_epoch, rollout_len, buffer_size, minibatch_size, verbose]
model = PPO1(*algoArgs)
score = 0.0           # running reward sum, consumed by the training cell
print_interval = 100  # episodes between progress reports
rollout = []          # transitions collected so far, consumed by the training cell

# AGENTS
# Buyer 0 is the Reinforcer; every other trader is a ZeroIntelligence agent.
# The rosters are sized from numBuyers / numSellers so they cannot silently
# drift from the counts handed to Log and GymEnv.
# NOTE(review): every agent is constructed with index=0 -- confirm that GymEnv
# assigns the real per-agent indices, otherwise these should be unique.
buyers = [Reinforcer(gameData, disclosure, index=0, buyer=1, reinforcer=0, depth = depth, verbose=verbose, numStates = num_states, algo = 'BASE', algoArgs=[numStates])]
buyers += [ZeroIntelligence(gameData, disclosure, index=0, buyer=1, reinforcer=0)
           for _ in range(numBuyers - 1)]
sellers = [ZeroIntelligence(gameData, disclosure, index=0, buyer=0, reinforcer=0)
           for _ in range(numSellers)]

# INITIALIZE
metaData = [gameTypes, numBuyers, numSellers, numTokens, numRounds, numPeriods, numSteps, seed, disclosure, buyers, sellers]
env = GymEnv(metaData, buyers, sellers, log)

# Sanity check: show the first round's buyer values / seller costs and the
# Reinforcer's initial state.
print(log.roundData.iloc[0].buyerValues)
print(log.roundData.iloc[0].sellerCosts)
print(buyers[0].state)

[[ 99.25003886  82.45671643  77.66026229  65.21989865]
 [100.          84.05672672  63.22103594  62.96150214]
 [ 78.67729377  60.07550004  59.31575489  58.17739974]]
[[11.47061757 16.75562681 39.29472894 54.31626246]
 [22.83601084 29.76199996 33.6351073  56.56277446]
 [22.02002285 45.1952035  52.5790546  78.76449945]
 [27.9216024  38.27685302 51.64216474 57.74193037]
 [36.9662783  38.72188629 43.40438254 87.08807828]
 [24.4482233  26.90662066 47.53168931 66.02949848]]
[0, 0]


In [None]:
# TRAINING LOOP
# Collects transitions from `env` with the current policy in chunks of
# `rollout_len`, hands each full chunk to the model with `put_data`, calls
# `train_net` after every chunk, and prints a progress line every
# `print_interval` episodes.
num_episodes = 10000          # number of env.reset() episodes to run
max_steps_per_episode = 200   # step cap checked between rollouts

# Re-initialise the accumulators here so that re-running this cell on its own
# does not inherit a stale score or a half-filled rollout from an earlier
# (possibly interrupted) run.
score = 0.0
rollout = []
episodes_since_report = 0

for n_epi in range(num_episodes):
    s, _ = env.reset()
    done = False
    count = 0
    a_list = []  # actions taken in this episode, only used for the progress line
    while count < max_steps_per_episode and not done:
        for t in range(rollout_len):
            # Acting only: nothing is back-propagated through these tensors
            # (the log-prob is stored as a plain float), so skip graph building.
            with torch.no_grad():
                mu, std = model.pi(torch.from_numpy(np.array(s)).float())
                dist = Normal(mu, std)
                a = dist.sample()
                log_prob = dist.log_prob(a)
            a_list.append(a.item())
            # NOTE(review): `truncated` is ignored, as in the original loop;
            # confirm whether GymEnv ever sets it.
            s_prime, r, done, truncated, info = env.step(a.item())
            rollout.append((s, a, r, s_prime, log_prob.item(), done))
            if len(rollout) == rollout_len:
                model.put_data(rollout)
                rollout = []
            s = s_prime
            score += r
            count += 1
            if done:
                # Never step an environment that has already finished. A
                # partially filled rollout is kept and completed by the next
                # episode; each transition carries its own `done` flag.
                break
        model.train_net()
    episodes_since_report += 1

    if n_epi%print_interval==0 and n_epi!=0:
        # Average over the episodes actually accumulated since the last
        # report (the first report covers episodes 0..print_interval).
        print("# of episode :{}, avg score : {:.1f}, optmization step: {}, mu:{:.3f}, std:{:.3f}, mean a:{:.3f}, std a:{:.3f}".format(n_epi,
                                                                                          score/episodes_since_report,
                                                                                          model.optimization_step,
                                                                                          mu.item(), std.item(), 
                                                                                          np.mean(a_list), np.std(a_list)
                                                                                         ))
        score = 0.0
        episodes_since_report = 0

# of episode :100, avg score : 93.4, optmization step: 0, 0.027, 0.593, 0.114, 0.588
# of episode :200, avg score : 100.1, optmization step: 100, 0.126, 0.528, 0.041, 0.517
# of episode :300, avg score : 104.6, optmization step: 200, 0.168, 0.446, 0.159, 0.324
# of episode :400, avg score : 109.6, optmization step: 300, 0.097, 0.425, -0.065, 0.211
# of episode :500, avg score : 116.3, optmization step: 400, 0.036, 0.371, 0.193, 0.362
# of episode :600, avg score : 117.7, optmization step: 500, 0.039, 0.383, 0.086, 0.426
# of episode :700, avg score : 114.9, optmization step: 600, 0.099, 0.341, 0.128, 0.306


In [None]:
# Plot the training history recorded in the environment log for buyer 0.
env.log.graphTraining(
    maxNumPeriods=3000,
    skip=20,
    trackBuyersIdx=[0],
)

In [None]:
# Plot the learning curve for buyer 0 only (no sellers tracked), using a
# rolling window of 100 and the 'period' type.
env.log.graphLearning(
    trackBuyersIdx=[0],
    trackSellersIdx=[],
    rolling_window=100,
    type='period',
)

In [None]:
# Show the last 10 entries of the per-step log as the cell output.
env.log.stepData.tail(10)

In [None]:
# Plot sales for buyer 0 only.
# NOTE(review): the positional arguments are presumably (round, period) --
# confirm against Log.graphSales; 4887 is a hand-picked index.
env.log.graphSales(
    0,
    4887,
    trackBuyersIdx=[0],
    trackSellersIdx=[],
)

In [None]:
# Plot offers for buyer 0 only.
# NOTE(review): the positional arguments are presumably (round, period) --
# confirm against Log.graphOffers; 4888 is a hand-picked index and differs
# from the one used for graphSales.
env.log.graphOffers(
    0,
    4888,
    trackBuyersIdx=[0],
    trackSellersIdx=[],
)