### Test

In [3]:
from utils import * 
from algorithms import *
from agents import *
from env import *

In [4]:
# HYPERPARAMS
# All of these except the ones noted are forwarded positionally to PPO1 via
# `algoArgs` in the setup cell, so the names (not the order here) matter.

# Optimiser / objective settings (meanings presumably follow the usual PPO
# conventions -- confirm against the PPO1 implementation in `algorithms`).
learning_rate = 3e-4
gamma = 0.9
lmbda = 0.9
eps_clip = 0.2
K_epoch = 10

# Data-collection settings; `rollout_len` is also used by the training loop
# as the number of environment steps gathered between `model.train()` calls.
rollout_len = 3
buffer_size = 10
minibatch_size = 32

# Passed to both PPO1 and the Reinforcer agent.
verbose = 1
num_states = 2

In [5]:
# ENVIRONMENT PARAMS
# These values are packed, in a fixed order, into `gameData` / `metaData`
# in the setup cell and unpacked again inside GymEnv.__init__.

# Game horizon.
numRounds = 1
numPeriods = 10000
numSteps = 9

# Market composition.
numBuyers = 3
numSellers = 6
numTokens = 4

# Game configuration string and RNG seed handed to the simulation helpers.
gameTypes = '1111'
seed = 42

# Disclosure list passed to Log and to every agent; empty here.
disclosure = list()

class GymEnv(gym.Env):
    """Gym wrapper that exposes the market simulation to a single learner.

    The learning agent is ``buyers[0]``: on every call to :meth:`step` its
    bid (``bids[0]``) is overwritten with a value derived from the action,
    while all remaining offers come from ``collectOffers``. One ``step`` is
    one market step; the episode ends when ``buyers[0].done`` is truthy, at
    which point ``endPeriods`` is called and the period counter advances.

    Args:
        metaData: Sequence whose first eight entries are ``gameTypes,
            numBuyers, numSellers, numTokens, numRounds, numPeriods,
            numSteps, seed`` and whose ninth entry is the disclosure list.
            Entries after the ninth are ignored; the agent lists are taken
            from the explicit ``buyers`` / ``sellers`` arguments.
        buyers: List of buyer agents; index 0 is the learner.
        sellers: List of seller agents.
        log: Logger object; ``addStep`` and ``disclose`` are called on it.
    """

    def __init__(self, metaData, buyers, sellers, log):
        super().__init__()
        self.gameData = metaData[0:8]
        (self.gameTypes, self.numBuyers, self.numSellers, self.numTokens,
         self.numRounds, self.numPeriods, self.numSteps, self.seed) = self.gameData
        self.disclosure = metaData[8]
        self.buyers = buyers
        self.sellers = sellers
        self.log = log

        # Position counters written to the log on every step.
        self.rnd = 0
        self.period = 0
        self.Step = 0

        self.action_space = spaces.Box(-1, 1, (1,), dtype=np.float32)
        self.numStates = len(generateState(self.buyers[0]))
        # NOTE(review): the upper bound 9 is hard-coded; it happens to equal
        # the notebook's numSteps -- confirm whether it should track it.
        self.observation_space = spaces.Box(-1, 9, (self.numStates,), dtype=np.float32)
        startRounds(self.gameData, self.log, self.buyers, self.sellers, self.rnd)

    def reset(self, seed=None, options=None):
        """Start a new period and return ``(observation, info)``.

        Args:
            seed: Forwarded to ``gym.Env.reset`` so the base class can seed
                its own RNG; the market helpers called here do not receive it.
            options: Accepted for Gym API compatibility; unused.

        Returns:
            Tuple of the learner's state and an empty info dict.
        """
        super().reset(seed=seed)
        # A fresh period starts at step 0 even if the previous episode was
        # abandoned by the caller before `done` was reached.
        self.Step = 0
        startPeriods(self.buyers, self.sellers)
        generateState(self.buyers[0])
        return self.buyers[0].state, {}

    def step(self, action):
        """Run one market step with the learner bidding according to ``action``.

        Args:
            action: Scalar or length-1 array. It is clipped to ``[-1, 1]``
                (the declared action space) and mapped linearly onto
                ``[0.01, 1.5] * buyers[0].stepTokenValue``.

        Returns:
            ``(newState, reward, done, truncated, info)`` where ``reward`` is
            the learner's step profit if it was the winning bidder in a
            trade (NaN replaced by 0) and 0.0 otherwise, ``truncated`` is
            always ``False`` and ``info`` is always empty.
        """
        learner = self.buyers[0]

        # start step -> reset step counts, update token values, generate state
        startSteps(self.buyers, self.sellers)
        print('\n', learner.state, learner.done)

        # trade: replace first bid with action
        bids, asks = collectOffers(self.buyers, self.sellers)
        lowBid = learner.stepTokenValue * 0.01
        highBid = learner.stepTokenValue * 1.5
        # A Gaussian policy can sample outside the Box(-1, 1) action space;
        # without clipping the bid would leave [lowBid, highBid] and could
        # even turn negative.
        scalarAction = float(np.asarray(action, dtype=np.float64).reshape(-1)[0])
        clipped = float(np.clip(scalarAction, -1.0, 1.0))
        frac = (clipped + 1) / 2
        bids[0] = lowBid * (1 - frac) + frac * highBid

        # transact
        currentAsk, currentAskIdx, currentBid, currentBidIdx = bestOffers(bids, asks)
        price, buy, sell = trade(self.buyers, self.sellers, currentAsk, currentAskIdx, currentBid, currentBidIdx)
        traded = price > 0

        # obtain profits
        bprofit, sprofit = 0, 0
        if traded:
            self.buyers[currentBidIdx].transact(price)
            self.sellers[currentAskIdx].transact(price)
            bprofit = self.buyers[currentBidIdx].stepProfits
            sprofit = self.sellers[currentAskIdx].stepProfits

        # update log, disclose information and update states
        self.log.addStep([self.rnd, self.period, self.Step, bids, asks, currentBid, currentBidIdx,
                          currentAsk, currentAskIdx, buy, sell, price, traded, bprofit, sprofit])
        observe(self.buyers, self.sellers, self.log.disclose())  # add disclosure to personal data
        agentsObserve(self.buyers, self.sellers)  # reinforcers update state and store data into algorithm
        print(learner.state, learner.done)

        # compute reward, newState, done -- captured before endSteps/endPeriods run
        newState = learner.state
        done = learner.done
        reward = 0.0
        if traded and currentBidIdx == 0:
            reward = np.nan_to_num(bprofit, nan=0)

        # train agent
        agentsTrain(self.buyers, self.sellers)

        # end step: add to period counters
        endSteps(self.buyers, self.sellers)

        # if done with episode, end period
        self.Step += 1
        if done:
            endPeriods(self.buyers, self.sellers)
            self.period += 1
            self.Step = 0
        print(learner.state, learner.done)
        return newState, reward, done, False, {}
# Shared configuration consumed by the log, the agents and the environment.
depth = 0
numStates = num_states
gameData = [gameTypes, numBuyers, numSellers, numTokens, numRounds, numPeriods, numSteps, seed]
log = Log(gameData, disclosure)

# ALGORITHM
# PPO1 receives its settings positionally, so the order of this list matters.
algoArgs = [num_states, learning_rate, gamma, lmbda, eps_clip, K_epoch,
            rollout_len, buffer_size, minibatch_size, verbose]
model = PPO1(*algoArgs)
score, print_interval = 0.0, 100
rollout = []

# OTHER AGENTS
# buyers[0] is the Reinforcer whose bid the environment overrides; every other
# trader is a ZeroIntelligence agent.
# NOTE(review): every agent is built with index=0 -- if `index` is meant to
# identify the agent's row of token values this is probably unintended; verify
# against the agent classes before changing it.
buyers = [Reinforcer(gameData, disclosure, index=0, buyer=1, reinforcer=1, depth=depth,
                     verbose=verbose, numStates=num_states, algo='BASE', algoArgs=[numStates])]
buyers += [ZeroIntelligence(gameData, disclosure, index=0, buyer=1, reinforcer=0)
           for _ in range(numBuyers - 1)]
sellers = [ZeroIntelligence(gameData, disclosure, index=0, buyer=0, reinforcer=0)
           for _ in range(numSellers)]

# INITIALIZE
# metaData is gameData followed by the disclosure list and both agent lists.
metaData = gameData + [disclosure, buyers, sellers]
env = GymEnv(metaData, buyers, sellers, log)

# Show the first round's token values/costs and the learner's initial state.
first_round = log.roundData.iloc[0]
print(first_round.buyerValues)
print(first_round.sellerCosts)
print(buyers[0].state)

[[ 99.25003886  82.45671643  77.66026229  65.21989865]
 [100.          84.05672672  63.22103594  62.96150214]
 [ 78.67729377  60.07550004  59.31575489  58.17739974]]
[[11.47061757 16.75562681 39.29472894 54.31626246]
 [22.83601084 29.76199996 33.6351073  56.56277446]
 [22.02002285 45.1952035  52.5790546  78.76449945]
 [27.9216024  38.27685302 51.64216474 57.74193037]
 [36.9662783  38.72188629 43.40438254 87.08807828]
 [24.4482233  26.90662066 47.53168931 66.02949848]]
[0, 0]


In [6]:
# TRAINING LOOP
# Collect experience in chunks of `rollout_len` steps, hand each full chunk to
# the model, and call `model.train()` after every chunk.
NUM_EPISODES = 1000
MAX_STEPS_PER_EPISODE = 200

for n_epi in range(NUM_EPISODES):
    s, _ = env.reset()
    done = False
    count = 0
    a_list = []  # actions taken this episode, only used for the progress report
    while count < MAX_STEPS_PER_EPISODE and not done:
        for _ in range(rollout_len):
            # Sample an action from the Gaussian policy for the current state.
            mu, std = model.pi(torch.from_numpy(np.array(s)).float())
            dist = Normal(mu, std)
            a = dist.sample()
            a_list.append(a.item())
            log_prob = dist.log_prob(a)

            s_prime, r, done, truncated, info = env.step(a.item())
            rollout.append((s, a, r, s_prime, log_prob.item(), done))
            print('\t', n_epi, s, a, r, s_prime, log_prob.item(), done)
            if len(rollout) == rollout_len:
                model.put_data(rollout)
                rollout = []

            s = s_prime
            score += r
            count += 1

            # Stop stepping once the episode has ended; the environment has
            # already closed the period and must be reset before the next
            # step. A partially filled `rollout` is kept and completed by the
            # next episode.
            if done:
                break
        model.train()

    # Periodic progress report; `mu`/`std` are from the last policy call and
    # `a_list` covers only the most recent episode.
    if n_epi % print_interval == 0 and n_epi != 0:
        print("# of episode :{}, avg score : {:.1f}, optmization step: {}, mu:{:.3f}, std:{:.3f}, mean a:{:.3f}, std a:{:.3f}".format(
            n_epi,
            score / print_interval,
            model.optimization_step,
            mu.item(), std.item(),
            np.mean(a_list), np.std(a_list),
        ))
        score = 0.0


 [0, 0] False
state:[0, 0], action:-0.8508987126404584, reward:40.217007446117435, newstate:[0, 1],done:False
[0, 1] False
[0, 1] False
	 0 [0, 0] tensor([0.4008]) 40.217007446117435 [0, 1] -0.6705466508865356 False

 [1, 0] False
state:[1, 0], action:-0.8519106965318193, reward:29.02425278165539, newstate:[1, 1],done:False
[1, 1] False
[1, 1] False
	 0 [0, 1] tensor([0.2606]) 29.02425278165539 [1, 1] -0.5982328057289124 False

 [2, 0] False
state:[2, 0], action:0.45921235667612814, reward:0, newstate:[2, 0],done:False
[2, 0] False
[2, 0] False
	 0 [1, 1] tensor([-0.6985]) 0.0 [2, 0] -0.7507069110870361 False

 [3, 0] False
state:[3, 0], action:-0.012408807271218514, reward:0, newstate:[3, 0],done:False
[3, 0] False
[3, 0] False
	 0 [2, 0] tensor([-1.6414]) 0.0 [3, 0] -3.044830799102783 False

 [4, 0] False
state:[4, 0], action:0.815132947852186, reward:19.74311305114852, newstate:[4, 1],done:False
[4, 1] False
[4, 1] False
	 0 [3, 0] tensor([0.3803]) 19.74311305114852 [4, 1] -0.91833

KeyboardInterrupt: 

In [None]:
# Plot the log's training graph, tracking buyer index 0 (the learner).
# NOTE(review): maxNumPeriods/skip presumably cap and subsample the plotted
# periods -- confirm against Log.graphTraining.
env.log.graphTraining(maxNumPeriods=3000, skip = 20, trackBuyersIdx = [0])

In [None]:
# Plot the log's learning graph for buyer index 0 only (no sellers tracked).
# NOTE(review): rolling_window=100 with type='period' presumably smooths over
# 100 periods -- confirm against Log.graphLearning.
env.log.graphLearning(trackBuyersIdx = [0], trackSellersIdx = [], rolling_window=100, type = 'period')

In [None]:
# Show the last 10 entries of the per-step log (presumably a pandas DataFrame
# of the rows passed to log.addStep -- confirm against Log).
env.log.stepData.tail(10)

In [None]:
# Plot sales, tracking buyer index 0.
# NOTE(review): the arguments 0 and 4887 look like (round, period); period 4887
# is hard-coded and only exists if training ran that far -- verify.
env.log.graphSales(0, 4887, trackBuyersIdx=[0], trackSellersIdx=[])

In [None]:
# Plot offers, tracking buyer index 0.
# NOTE(review): the arguments 0 and 4888 look like (round, period); period 4888
# is hard-coded and only exists if training ran that far -- verify.
env.log.graphOffers(0, 4888, trackBuyersIdx=[0], trackSellersIdx=[])