In [70]:
from utils import * 
# import dependencies
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.distributions import Categorical
import numpy as np
from copy import deepcopy
import gym
import itertools
from collections import deque

def loadAlgo(algo, numStates, numActions, algoArgs=None):
    """Instantiate the learning algorithm identified by ``algo``.

    Parameters
    ----------
    algo : str
        One of 'VPG', 'DQN' or 'PPO'.
    numStates : int
        Length of the state vector handed to the algorithm.
    numActions : int
        Number of discrete actions the algorithm chooses between.
    algoArgs : sequence, optional
        Extra positional arguments forwarded to the constructor. For 'VPG'
        they follow ``numStates`` (so the first one is ``episodeLength``);
        for the other algorithms they follow ``numActions``.

    Returns
    -------
    The constructed algorithm object.

    Raises
    ------
    ValueError
        If ``algo`` is not one of the recognised names (previously this
        silently returned None and failed later on first use).
    """
    extraArgs = () if algoArgs is None else tuple(algoArgs)

    # Keep the branch-by-branch lookup: the class names are resolved only when
    # their branch runs, so an algorithm that is not defined/imported in this
    # session does not break the ones that are.
    if algo == 'VPG':
        # VPG is declared as (numStates, episodeLength, numActions). Passing
        # numActions positionally put it into episodeLength and left the
        # policy with its default of 10 actions, so bind it by keyword.
        return VPG(numStates, *extraArgs, numActions=numActions)
    if algo == 'DQN':
        return DQN(numStates, numActions, *extraArgs)
    if algo == 'PPO':
        return PPO(numStates, numActions, *extraArgs)
    raise ValueError(f"Unknown algorithm {algo!r}; expected 'VPG', 'DQN' or 'PPO'")

class PolicyNetwork(nn.Module):
    """Two-layer perceptron that turns a batch of states into action probabilities.

    The input must be at least 2-D (batch first): the softmax is taken over
    dimension 1, so each row of the output sums to one.
    """

    def __init__(self, numStates, numActions=10):
        super().__init__()
        hiddenUnits = 256
        self.fc1 = nn.Linear(numStates, hiddenUnits)
        self.fc2 = nn.Linear(hiddenUnits, numActions)

    def forward(self, x):
        """Return the per-row probability distribution over the actions."""
        hidden = F.leaky_relu(self.fc1(x))
        logits = self.fc2(hidden)
        return F.softmax(logits, dim=1)

class VPG:
    """Policy-gradient learner (REINFORCE-style) over a discrete action set.

    ``act`` samples an action and stores its log-probability, ``observe``
    stores the reward for that action, and ``train`` performs one gradient
    step on ``-log_prob * discounted_return`` once an episode has finished and
    enough rewards are buffered.

    Parameters
    ----------
    numStates : int
        Length of the state vector.
    episodeLength : int
        Minimum number of buffered rewards required before ``train`` updates.
    numActions : int
        Number of discrete actions.
    """

    def __init__(self, numStates, episodeLength = 10, numActions=10):
        self.rewards = []
        self.log_probs = []
        self.numActions = numActions
        self.numStates = numStates
        self.gamma = 0.99
        self.episodeLength = episodeLength
        self.policy = PolicyNetwork(numStates, numActions)
        self.optimizer = optim.Adam(self.policy.parameters())
        self.done = False
        # Only used by the (currently disabled) return normalisation in train().
        self.eps = np.finfo(np.float32).eps.item()

    def act(self, state):
        """Sample an action index for ``state`` and remember its log-probability."""
        state = torch.tensor(state).unsqueeze(0).float()
        probs = self.policy(state)
        multinomial = Categorical(probs)
        action = multinomial.sample()
        self.log_probs.append(multinomial.log_prob(action))
        return action.item()

    def observe(self, state, action, reward, newState, done):
        """Buffer the reward of the latest action; only ``reward`` and ``done`` are used."""
        # A single NaN reward makes the loss NaN, which poisons every gradient
        # and then every weight. GymEnv.step already maps NaN profit to 0, so
        # apply the same convention here.
        self.rewards.append(0.0 if np.isnan(reward) else reward)
        self.done = done

    def train(self, state, action, reward, newState, done):
        """Run one policy-gradient update and clear the buffers.

        Nothing happens unless ``done`` is true and at least ``episodeLength``
        rewards are buffered. The other arguments are accepted for interface
        compatibility and ignored.
        """
        if not (done and len(self.rewards) >= self.episodeLength):
            return

        # NOTE(review): if an episode ends before episodeLength rewards are
        # buffered, the buffers carry over and the discounting below runs
        # straight across the episode boundary -- confirm this is intended.
        discounted = []
        running = 0
        for r in reversed(self.rewards):
            running = r + self.gamma * running
            discounted.append(running)
        discounted.reverse()
        returns = torch.tensor(discounted, dtype=torch.float32)
        #returns = (returns - returns.mean()) / (returns.std() + self.eps)

        # zip() stops at the shorter buffer if act/observe calls were unbalanced.
        policy_loss = [-log_prob * ret for log_prob, ret in zip(self.log_probs, returns)]
        self.optimizer.zero_grad()
        policy_loss = torch.cat(policy_loss).sum()
        policy_loss.backward()
        self.optimizer.step()
        del self.rewards[:]
        del self.log_probs[:]

In [71]:
#from utils import *
#from algorithms import * 

import pandas as pd

class Trader:
    """Base market participant that tracks tokens, trades and profits.

    Counters are kept at four nested levels (game, round, period, step); the
    ``start*``/``end*`` hooks reset a level or roll it up into its parent.

    Parameters
    ----------
    gameData : sequence of 8 items
        (gameType, numBuyers, numSellers, numTokens, numRounds, numPeriods,
        numSteps, seed).
    disclosure : list
        Column names of the per-agent ``df`` frame.
    index, buyer, reinforcer
        Stored as-is on the instance.
    """

    def __init__(self, gameData, disclosure, index, buyer, reinforcer):
        (self.gameType, self.numBuyers, self.numSellers, self.numTokens,
         self.numRounds, self.numPeriods, self.numSteps, self.seed) = gameData
        self.index, self.buyer, self.reinforcer = index, buyer, reinforcer
        self.df = pd.DataFrame(columns=disclosure)
        self.disclosure = disclosure
        self.gameTokens = []
        # Every counter starts at zero.
        self.gameTrades = self.gameProfits = self.gameRounds = self.gameSteps = 0
        self.roundTrades = self.roundProfits = self.roundPeriods = 0
        self.periodTrades = self.periodProfits = self.periodSteps = 0
        self.stepTrades = self.stepProfits = self.stepTokenValue = 0

    def startRound(self, tokenValues):
        """Install the token values for a new round and zero the round counters."""
        self.roundTokens = tokenValues
        self.roundTrades = self.roundProfits = self.roundPeriods = 0

    def endRound(self):
        """Fold the finished round into the game-level totals."""
        self.gameTokens.append(self.roundTokens)
        self.gameTrades += self.roundTrades
        self.gameProfits += self.roundProfits
        self.gameRounds += 1

    def startPeriod(self):
        """Begin a period with the round's tokens and zeroed period counters."""
        self.periodTokens = self.roundTokens
        self.periodTrades = self.periodProfits = self.periodSteps = 0

    def endPeriod(self):
        """Fold the finished period into the round-level totals."""
        self.roundProfits += self.periodProfits
        self.roundTrades += self.periodTrades
        self.roundPeriods += 1

    def startStep(self):
        """Reset step counters and select the token to trade this step.

        The token value is NaN once all ``numTokens`` tokens have been traded
        in the current period.
        """
        self.stepProfits = 0
        self.stepTrades = 0
        tokensLeft = self.periodTrades < self.numTokens
        self.stepTokenValue = self.periodTokens[self.periodTrades] if tokensLeft else np.nan

    def endStep(self):
        """Advance the step counters and fold the step into the period totals."""
        self.gameSteps += 1
        self.periodSteps += 1
        self.periodProfits += self.stepProfits
        self.periodTrades += self.stepTrades

    def buy(self, currentBid, currentAsk):
        """Accept the ask when the current token is worth at least that much."""
        self.acceptSale = bool(self.stepTokenValue >= currentAsk)
        return self.acceptSale

    def sell(self, currentBid, currentAsk):
        """Accept the bid when it is at least the current token's value."""
        self.acceptSale = bool(self.stepTokenValue <= currentBid)
        return self.acceptSale

    def transact(self, price):
        """Record one trade at ``price``; the profit comes from the external ``profit`` helper."""
        self.stepTrades = 1
        self.stepProfits = profit(self.stepTokenValue,price,self.buyer)

class TruthTeller(Trader):
    """Trader whose bid and ask are always exactly its current token value."""

    def __init__(self, gameData, disclosure, index, buyer, reinforcer):
        super().__init__(gameData, disclosure, index, buyer, reinforcer)

    def bid(self):
        """Store and return the current token value as this step's bid."""
        quote = self.stepTokenValue
        self.stepBid = quote
        return self.stepBid

    def ask(self):
        """Store and return the current token value as this step's ask."""
        quote = self.stepTokenValue
        self.stepAsk = quote
        return self.stepAsk

class ZeroIntelligence(Trader):
    """Trader that quotes a uniformly random price on the profitable side of its token value.

    Bids are drawn from [0.1 * value, value) and asks from [value, 1.9 * value),
    both rounded to one decimal. With no usable token value (NaN or negative)
    the quote is NaN.
    """

    def __init__(self, gameData, disclosure, index, buyer, reinforcer):
        super().__init__(gameData, disclosure, index, buyer, reinforcer)

    def bid(self):
        """Draw, store and return this step's bid."""
        tokenValue = self.stepTokenValue
        if tokenValue >= 0:
            draw = np.random.uniform(tokenValue*0.1, tokenValue, 1).item()
            self.stepBid = np.round(draw, 1)
        else:
            self.stepBid = np.nan
        return np.round(self.stepBid, 1)

    def ask(self):
        """Draw, store and return this step's ask."""
        tokenValue = self.stepTokenValue
        if tokenValue >= 0:
            draw = np.random.uniform(tokenValue, tokenValue*1.9, 1).item()
            self.stepAsk = np.round(draw, 1)
        else:
            self.stepAsk = np.nan
        return self.stepAsk

def generateAgents(gameData,buyerStrategies,sellerStrategies,disclosure):
    """Build the buyer and seller populations from lists of strategy names.

    Parameters
    ----------
    gameData : sequence
        Game configuration forwarded to every trader constructor.
    buyerStrategies, sellerStrategies : iterable of str
        One strategy name per trader; the position in the list becomes the
        trader's ``index``.
    disclosure : list
        Disclosure columns forwarded to every trader constructor.

    Returns
    -------
    (buyers, sellers) : tuple of lists
        Unrecognised strategy names are skipped without error.
    """
    # NOTE(review): the learning-agent branches below pass Trader-style
    # keywords (index/buyer/reinforcer) straight to the algorithm classes. The
    # VPG class defined in this notebook does not accept them, and the buyer
    # branch additionally passes gameData[7], which is the seed, as
    # episodeLength. These calls are left exactly as written; they probably
    # should construct a Reinforcer(..., algo=name) instead -- confirm.
    buyers, sellers = [], []
    for idx, strategy in enumerate(buyerStrategies):
        if strategy == 'TruthTeller':
            # Fixed: the keyword was misspelt `reinforer`, which raised TypeError.
            buyers.append(TruthTeller(gameData, disclosure, index=idx, buyer=1, reinforcer=0))
        elif strategy == 'ZeroIntelligence':
            # Fixed: same `reinforer` misspelling as above.
            buyers.append(ZeroIntelligence(gameData, disclosure, index=idx, buyer=1, reinforcer=0))
        elif strategy == 'VPG':
            buyers.append(VPG(gameData, disclosure, index=idx, buyer=1, reinforcer=1, episodeLength = gameData[7]))
        elif strategy == 'PPO':
            buyers.append(PPO(gameData, disclosure, index=idx, buyer=1, reinforcer=1))
        elif strategy == 'SAC':
            buyers.append(SAC(gameData, disclosure, index=idx, buyer=1, reinforcer=1))
        elif strategy == 'DQN':
            buyers.append(DQN(gameData, disclosure, index=idx, buyer=1, reinforcer=1))
        elif strategy == 'DDPG':
            buyers.append(DDPG(gameData, disclosure, index=idx, buyer=1, reinforcer=1))

    for idx, strategy in enumerate(sellerStrategies):
        if strategy == 'TruthTeller':
            sellers.append(TruthTeller(gameData, disclosure, index=idx, buyer=0, reinforcer=0))
        elif strategy == 'ZeroIntelligence':
            sellers.append(ZeroIntelligence(gameData, disclosure, index=idx, buyer=0, reinforcer=0))
        elif strategy == 'VPG':
            sellers.append(VPG(gameData, disclosure, index=idx, buyer=0, reinforcer=1))
        elif strategy == 'PPO':
            sellers.append(PPO(gameData, disclosure, index=idx, buyer=0, reinforcer=1))
        elif strategy == 'SAC':
            sellers.append(SAC(gameData, disclosure, index=idx, buyer=0, reinforcer=1))
        elif strategy == 'DQN':
            sellers.append(DQN(gameData, disclosure, index=idx, buyer=0, reinforcer=1))
        elif strategy == 'DDPG':
            sellers.append(DDPG(gameData, disclosure, index=idx, buyer=0, reinforcer=1))
    return buyers, sellers

    
def generateState(agent):
    """Build the flat observation vector for ``agent``.

    The vector is ``[periodSteps]`` followed by the last ``depth`` rows of
    ``agent.df`` (most recent first). For each row the scalar disclosure
    columns come first, then the contents of the 'bids' list and then of the
    'asks' list when those columns are disclosed. While fewer than ``depth``
    game steps have elapsed the history part is filled with -9, and NaN
    entries are replaced by -9 as well.

    Parameters
    ----------
    agent
        Object exposing ``periodSteps`` and ``disclosure``; when history is
        requested also ``gameSteps``, ``df``, ``numBuyers`` and ``numSellers``.
        A missing ``depth`` attribute is treated as 0 (no history), so plain
        traders no longer raise AttributeError.

    Returns
    -------
    list
        The cleaned state vector.

    Side effects
    ------------
    Sets ``agent.disclosureCopy`` (the disclosure list without 'bids'/'asks')
    whenever real history is read.
    """
    missing = -9
    counters = [agent.periodSteps] #agent.stepTrades, agent.stepProfits, agent.stepTokenValue]
    depth = getattr(agent, 'depth', 0)
    disclosureLength = len(agent.disclosure)
    bidsDisclose = 'bids' in agent.disclosure
    asksDisclose = 'asks' in agent.disclosure

    if disclosureLength == 0 or depth == 0:
        activityLog = []
    elif agent.gameSteps >= depth:
        # 'bids' and 'asks' hold one value per trader, so they are taken out
        # of the scalar column list and appended element by element instead.
        agent.disclosureCopy = deepcopy(agent.disclosure)
        if bidsDisclose:
            agent.disclosureCopy.remove('bids')
        if asksDisclose:
            agent.disclosureCopy.remove('asks')

        activityLog = []
        for i in range(1, depth+1):
            row = agent.df.iloc[-i]
            activityLog += row[agent.disclosureCopy].tolist()
            if bidsDisclose:
                activityLog += row.bids
            if asksDisclose:
                activityLog += row.asks
    else:
        # Not enough history yet: pad to the length the real history will have.
        if bidsDisclose:
            disclosureLength -= 1
        if asksDisclose:
            disclosureLength -= 1
        activityLog = [missing] * (disclosureLength*depth + bidsDisclose*depth*agent.numBuyers+asksDisclose*depth*agent.numSellers)

    state = counters + activityLog
    return [missing if np.isnan(x) else x for x in state]

class Reinforcer(Trader):
    """Trader whose quotes are chosen by a learning algorithm.

    The algorithm picks a discrete action index which is mapped linearly onto
    a price between 0 and 100.

    Parameters
    ----------
    gameData : sequence
        Game configuration, forwarded to ``Trader``.
    disclosure : list, optional
        Disclosed columns. Defaults to
        ['currentBid', 'currentAsk', 'buy', 'sell', 'price', 'price'].
    index, buyer, reinforcer
        Forwarded to ``Trader``.
    numActions : int
        Number of discrete price levels.
    algo : str
        Algorithm name passed to ``loadAlgo``.
    algoArgs : sequence, optional
        Extra constructor arguments, handed to ``loadAlgo`` as one list.
    depth : int
        Number of past disclosure rows included in the state.
    verbose : int
        When 1, the initial state and every observed transition are printed.
    """

    def __init__(self, gameData, disclosure=None,
                 index=0, buyer=1, reinforcer=1, numActions=10, algo='VPG', algoArgs=None, depth = 0, verbose = 0):
        if disclosure is None:
            # Fresh list per instance instead of a shared mutable default.
            # NOTE(review): 'price' is listed twice; contents kept unchanged.
            disclosure = ['currentBid', 'currentAsk', 'buy', 'sell', 'price', 'price']
        super().__init__(gameData, disclosure, index, buyer, reinforcer)
        self.depth = depth
        self.disclosure = disclosure
        self.verbose = verbose
        # The initial state is generated once, only to learn the vector length.
        initialState = generateState(self)
        if self.verbose == 1:
            print(initialState)
        self.numStates = len(initialState)
        self.numActions = numActions
        self.state = [-1]*self.numStates
        # loadAlgo takes the extra arguments as ONE list parameter; splatting
        # them (the old `*algoArgs`) broke for anything but an empty list.
        self.algo = loadAlgo(algo, self.numStates, self.numActions,
                             [] if algoArgs is None else list(algoArgs))
        self.done = False

    def observe(self):
        """Generate the new state and report the transition to the algorithm."""
        self.newState = generateState(self)
        # Decide whether this is the last step of the period BEFORE reporting
        # the transition. Previously the flag was updated afterwards, so the
        # final transition of a period was reported with done=False and the
        # first transition of the next period with done=True.
        self.done = (self.periodSteps == self.numSteps-1) # & (self.roundPeriods == self.numPeriods-1)
        self.algo.observe(self.state, self.action, self.stepProfits, self.newState, self.done)
        if self.verbose == 1:
            print(self.state, self.action, self.stepProfits, self.newState, self.done)
        self.state = self.newState

    def train(self):
        """Let the algorithm update itself from the latest transition."""
        self.algo.train(self.state, self.action, self.stepProfits, self.newState, self.done)

    def _quote(self):
        """Pick an action and map it to a price; NaN when there is no usable token."""
        # The algorithm is queried on every step, even when no quote is made.
        self.action = self.algo.act(self.state)
        if self.stepTokenValue >= 0:
            return (self.action/(self.numActions-1)) * 100
        return np.nan

    def bid(self):
        """Store and return this step's bid."""
        self.stepBid = self._quote()
        return self.stepBid

    def ask(self):
        """Store and return this step's ask."""
        self.stepAsk = self._quote()
        return self.stepAsk

In [72]:
import gymnasium as gym
from gymnasium import spaces

class GymEnv(gym.Env):
    """Gymnasium environment in which the external agent sets buyer 0's bid.

    One ``step`` is one market step; an episode ends when buyer 0 reports
    ``done``. The market mechanics (``startRounds``, ``collectOffers``,
    ``trade`` and similar) are helpers defined outside this cell.

    Parameters
    ----------
    metaData : sequence of 11 items
        The eight game-data values followed by disclosure, buyers and sellers.
    buyers, sellers
        Accepted for interface compatibility; the traders actually used are
        the ones inside ``metaData``.
    log
        Log object that receives every step and supplies the disclosure.
    """

    def __init__(self, metaData, buyers, sellers, log):
        self.gameData = metaData[0:8]
        self.gameTypes, self.numBuyers, self.numSellers, self.numTokens, self.numRounds, self.numPeriods, self.numSteps, self.seed = self.gameData
        self.disclosure, self.buyers, self.sellers = metaData[8:]
        self.log = log
        self.rnd = 0
        self.period = 0
        self.Step = 0
        self.action_space = spaces.Box(-1,1,(1,),dtype=np.float32)
        self.numStates = len(generateState(self.buyers[0]))
        self.observation_space = spaces.Box(-1000,1000,(self.numStates,),dtype=np.float32)
        startRounds(self.gameData, self.log, self.buyers, self.sellers, self.rnd)

    def reset(self, seed = None, options = None):
        """Start a new period and return ``(observation, info)``.

        ``options`` is accepted for Gymnasium API compatibility and ignored.
        """
        # Lets Gymnasium seed self.np_random. The traders in this file draw
        # from the global NumPy RNG, so this alone does not make runs
        # reproducible.
        super().reset(seed=seed)
        print('\nreset')
        startPeriods(self.buyers, self.sellers)
        # A generateState() call whose result was discarded used to sit here;
        # the observation returned is buyer 0's current state attribute.
        return self.buyers[0].state, {}

    def step(self, action):
        """Advance the market by one step using ``action`` as buyer 0's bid.

        Returns ``(observation, reward, terminated, truncated, info)``. The
        reward is buyer 0's profit if it traded this step, otherwise 0.0;
        ``truncated`` is always False.
        """
        print('\nstep')
        startSteps(self.buyers, self.sellers)
        bids, asks = collectOffers(self.buyers, self.sellers)
        # The action lies in [-1, 1] per action_space, so the bid is in [-100, 100].
        bids[0] = action.item() * 100
        currentAsk, currentAskIdx, currentBid, currentBidIdx = bestOffers(bids, asks)
        # Use the environment's own traders and log, not the notebook globals
        # that happen to share the same names.
        price, buy, sell = trade(self.buyers, self.sellers, currentAsk, currentAskIdx, currentBid, currentBidIdx)
        bprofit, sprofit = 0, 0
        if price > 0:
            self.buyers[currentBidIdx].transact(price)
            self.sellers[currentAskIdx].transact(price)
            bprofit = self.buyers[currentBidIdx].stepProfits
            sprofit = self.sellers[currentAskIdx].stepProfits
        self.log.addStep([self.rnd, self.period, self.Step, bids, asks, currentBid, currentBidIdx, currentAsk, currentAskIdx, buy, sell, price, price>0, bprofit, sprofit])
        observe(self.buyers, self.sellers, self.log.disclose())
        reward = 0.0
        if price > 0 and currentBidIdx == 0:
            # Profit can be NaN (Trader.startStep sets the token value to NaN
            # once all tokens are traded), hence the conversion.
            reward = np.nan_to_num(bprofit,nan=0)
        updateStates(self.buyers, self.sellers)
        newState = self.buyers[0].state
        print(self.buyers[0].state)
        done = self.buyers[0].done
        updatePolicy(self.buyers, self.sellers)
        endSteps(self.buyers, self.sellers)
        print('step:', self.Step, 'periodtrades:', self.buyers[0].periodTrades)
        self.Step += 1
        if done:
            print('\t', self.period)
            endPeriods(self.buyers, self.sellers)
            self.period += 1
            self.Step = 0
        return newState, reward, done, False, {}

import warnings
warnings.filterwarnings('ignore')
import torch as th
from stable_baselines3 import PPO, SAC, DDPG
from stable_baselines3.ppo.policies import MlpPolicy
from stable_baselines3.common.base_class import BaseAlgorithm

# Makes autograd raise as soon as a backward function produces NaN.
th.autograd.set_detect_anomaly(True)

# Game configuration.
gameTypes, seed = '1001', 42
numBuyers, numSellers, numTokens = 3, 3, 3
numRounds, numPeriods, numSteps = 1, 10000, 9
disclosure = ['asks', 'currentBid', 'currentBidIdx', 'currentAsk', 'currentAskIdx',
              'buy', 'sell', 'price', 'sale']
#disclosure = []
depth = 1
gameData = [gameTypes, numBuyers, numSellers, numTokens, numRounds, numPeriods, numSteps, seed]

# Traders: one learning buyer against scripted opponents
# (scripted strategies available: ZeroIntelligence, TruthTeller).
# NOTE(review): every trader is created with index=0 -- confirm that the
# index is not meant to be unique per side.
# NOTE(review): `seed` is stored in gameData but no RNG is seeded in this cell.
buyers = [
    Reinforcer(gameData, disclosure, index=0, buyer=1, reinforcer=1, numActions = 20, verbose = 1, depth = depth),
] + [
    ZeroIntelligence(gameData, disclosure, index=0, buyer=1, reinforcer=0)
    for _ in range(2)
]
sellers = [
    ZeroIntelligence(gameData, disclosure, index=0, buyer=0, reinforcer=0)
    for _ in range(3)
]

# Environment and model.
log = Log(gameData, disclosure)
metaData = gameData + [disclosure, buyers, sellers]
env = GymEnv(metaData, buyers, sellers, log)
#print(env.log.roundData.iloc[0].buyerValues.item())
#policy_kwargs = dict(net_arch=dict(pi=[128, 128], qf=[128, 128]))
model = SAC("MlpPolicy", env, verbose=1,)
#model = PPO("MlpPolicy", env, verbose=1)

[0, -9, -9, -9, -9, -9, -9, -9, -9, -9, -9, -9]
[0, -9, -9, -9, -9, -9, -9, -9, -9, -9, -9, -9]
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [73]:
# Show the learning buyer's current state vector. Right after construction it
# is the all -1 placeholder written by Reinforcer.__init__.
buyers[0].state

[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]

In [74]:
# Train the SAC model for 1000 environment steps. The recorded run aborted
# with "RuntimeError: Function 'MulBackward0' returned nan values", i.e. a NaN
# reached a backward pass while anomaly detection was enabled.
model.learn(1000, progress_bar = False)


reset

step
[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1] 4 0 [0, -9, -9, -9, -9, -9, -9, -9, -9, -9, -9, -9] False
[0, -9, -9, -9, -9, -9, -9, -9, -9, -9, -9, -9]
step: 0 periodtrades: 0

step
[0, -9, -9, -9, -9, -9, -9, -9, -9, -9, -9, -9] 6 0 [1, 47.4, 1, 14.7, 0, True, True, 31.049999999999997, True, 14.7, 49.6, 23.0] False
[1, 47.4, 1, 14.7, 0, True, True, 31.049999999999997, True, 14.7, 49.6, 23.0]
step: 1 periodtrades: 0

step
[1, 47.4, 1, 14.7, 0, True, True, 31.049999999999997, True, 14.7, 49.6, 23.0] 3 0 [2, 44.5, 2, 20.3, 2, True, True, 32.4, True, 32.4, 55.8, 20.3] False
[2, 44.5, 2, 20.3, 2, True, True, 32.4, True, 32.4, 55.8, 20.3]
step: 2 periodtrades: 0

step
[2, 44.5, 2, 20.3, 2, True, True, 32.4, True, 32.4, 55.8, 20.3] 3 0 [3, 19.6, 1, 24.3, 2, True, False, 24.3, True, 46.1, 45.8, 24.3] False
[3, 19.6, 1, 24.3, 2, True, False, 24.3, True, 46.1, 45.8, 24.3]
step: 3 periodtrades: 0

step
[3, 19.6, 1, 24.3, 2, True, False, 24.3, True, 46.1, 45.8, 24.3] 3 0 [4, 38.0,

RuntimeError: Function 'MulBackward0' returned nan values in its 0th output.

In [None]:
# Show the last 8 rows of the log's stepData table (presumably one row per
# market step -- confirm against the Log class). This cell has no recorded run.
log.stepData.tail(8)

In [38]:
# NOTE(review): this cell's execution count (38) is lower than that of the
# cells defining `env` (70-72), so its recorded output comes from an earlier
# kernel state and will not be reproduced by Restart & Run All.
import numpy as np
from sb3_contrib import RecurrentPPO
from stable_baselines3.common.evaluation import evaluate_policy
# Replaces the SAC `model` with a recurrent PPO agent on the same environment.
model = RecurrentPPO("MlpLstmPolicy", env, verbose=1)
# Train for 20 timesteps; the recorded run hit the same NaN RuntimeError as SAC.
model.learn(20)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.

reset

step
periodtrades 0 numtokens 3
[-1] 0 0 [0] False
[0]
step: 0 periodtrades: 0

step
periodtrades 0 numtokens 3
[0] 6 0 [1] False
[1]
step: 1 periodtrades: 0

step
periodtrades 0 numtokens 3
[1] 11 0 [2] False
[2]
step: 2 periodtrades: 0

step
periodtrades 0 numtokens 3
[2] 10 0 [3] False
[3]
step: 3 periodtrades: 0

step
periodtrades 0 numtokens 3
[3] 4 45.38761405944824 [4] False
[4]
step: 4 periodtrades: 1

step
periodtrades 1 numtokens 3
[4] 19 23.102818965911865 [5] False
[5]
step: 5 periodtrades: 2

step
periodtrades 2 numtokens 3
[5] 6 0 [6] False
[6]
step: 6 periodtrades: 2

step
periodtrades 2 numtokens 3
[6] 3 0 [7] False
[7]
step: 7 periodtrades: 2

step
periodtrades 2 numtokens 3
[7] 1 -10.949999999999996 [8] False
[8]
step: 8 periodtrades: 3
	 0

reset

step
periodtrades 0 numtokens 3
[8] 2 0 [0] True
[0]
step: 0 periodtrades: 0

step
periodtrades 0 numtokens 3
[0] 14 0 [1

RuntimeError: Function 'MulBackward0' returned nan values in its 0th output.