In [18]:
from itertools import count
import torch
import math
import torch.optim as optim 
import torch.nn as nn
import torch.nn.functional as F
import random
from collections import namedtuple, deque
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (30, 10)
plt.rcParams['font.size']=20

In [19]:
# parameters of model
gamma = 0.9
u = 40
v = 1
w = 4
n = 1
sigma = 0.05
rho = 0.1

In [20]:
def price(Q,u,v):
    return u-v*Q

def profit(P,Q):
    return
           
class cournot:
    def __init__(self,u,v,w,n):
        self.u=u
        self.v=v
        self.w=w
        self.n=n
        self.Qw=(u-w)/v
        self.Qc=n*(u-w)/v/(n+1)
        self.Qm=(u-w)/2*v
        self.Pw = price(self.Qw,u,v)
        self.Pc = price(self.Qc,u,v)
        self.Pm = price(self.Qm,u,v)
        self.Πw = 0
        self.Πc = (self.Pc-w)*self.Qc
        self.Πm = (self.Pm-w)*self.Qm
        self.qw = self.Qw/n
        self.qc = self.Qc/n
        self.qm = self.Qm/n
        self.πw = 0
        self.πc = self.Πc/n
        self.πm = self.Πm/n
    
    def price(self,Q):
        return self.u-v*(Q)
    
    def profit(self,Q):
        return (self.price(Q)-w)*Q
    
def whichidx(value, array):
    return np.argmin(np.abs(array-value))

game1 = cournot(u,v,w,n)

In [37]:
game1.profit(9)

243

In [21]:
Transition = namedtuple('Transition',('state','next_state','action','reward'))

class ReplayMemory(object):
    # Holds transitions from experience and gives a random batch of transitions for training
    def __init__(self,capacity):
        self.memory = deque([], maxlen=capacity)
        
    def push(self,*args):
        self.memory.append(Transition(*args))
        
    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)
    
    def __len__(self):
        return len(self.memory)

In [22]:
class DQN(nn.Module):
    # Takes in a vector of states and gives out valuations for each action
    def __init__(self, n_obs, n_actions):
        super(DQN,self).__init__()
        self.layer1 = nn.Linear(n_obs, 128)
        self.layer2 = nn.Linear(128,128)
        self.layer3 = nn.Linear(128, n_actions)
    
    def forward(self,x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)  

In [23]:
def getProbs(agent, BETA):
    return torch.exp(agent.policy_net(agent.state)/BETA)/torch.sum(torch.exp(agent.policy_net(agent.state)/BETA))

In [24]:
class Agent:
    def __init__(self, N_STATES, N_ACTIONS, N_MEMORY=10000, BATCH_SIZE=512, GAMMA=0, TAU = 0.0001, LR = 1e-5,
                 EPS_START = 0.9, EPS_END = 0.05, EPS_DECAY = 1000, 
                 BETA_START = 0.1, BETA_END = 0.01, BETA_DECAY = 1000):
        self.N_STATES = N_STATES
        self.N_ACTIONS = N_ACTIONS
        self.BATCH_SIZE = BATCH_SIZE
        self.GAMMA = GAMMA
        self.STEPS = 0
        self.EPS_START = EPS_START
        self.EPS_END = EPS_END
        self.EPS_DECAY = EPS_DECAY
        self.EPS_THRESHOLD = EPS_END + (EPS_START-EPS_END)*math.exp(-1.*self.STEPS/EPS_DECAY)
        self.BETA_START = BETA_START
        self.BETA_END = BETA_END
        self.BETA_DECAY = BETA_DECAY
        self.BETA_THRESHOLD = BETA_END + (BETA_START-BETA_END)*math.exp(-1.*self.STEPS/BETA_DECAY)
        self.LR = LR
        self.TAU = TAU
        self.policy_net = DQN(N_STATES,N_ACTIONS)
        self.target_net = DQN(N_STATES,N_ACTIONS)
        self.state = torch.zeros(1,N_STATES)
        self.actions = torch.tensor(np.arange(N_ACTIONS))
        self.memory = ReplayMemory(N_MEMORY)
        self.optimizer = optim.AdamW(self.policy_net.parameters(),lr=LR,amsgrad=True)
        self.action_history = []
        self.loss = torch.tensor(1000)

In [26]:
def select_action(agent):
    # Given state selects action either from the DQN (Q values become probs) or randomly. 
    sample = random.random()
    agent.EPS_THRESHOLD = agent.EPS_END + (agent.EPS_START-agent.EPS_END)*math.exp(-1.*agent.STEPS/agent.EPS_DECAY)
    agent.BETA_THRESHOLD = agent.BETA_END + (agent.BETA_START-agent.BETA_END)*math.exp(-1.*agent.STEPS/agent.BETA_DECAY)
    agent.STEPS += 1

    if sample>agent.EPS_THRESHOLD:
        with torch.no_grad():
            probs = getProbs(agent, agent.BETA_THRESHOLD)     
            index = probs.multinomial(num_samples=1, replacement=True)
            choice = agent.actions[index]
            return torch.tensor([[choice]],dtype=torch.long)
    else:
        randchoice = random.choice(np.arange(agent.N_ACTIONS))
        return torch.tensor([[randchoice]],dtype=torch.long)

In [27]:
def update_target_net(agent):
    target_net_state_dict = agent.target_net.state_dict()
    policy_net_state_dict = agent.policy_net.state_dict()
    for key in policy_net_state_dict:
        target_net_state_dict[key] = policy_net_state_dict[key]*agent.TAU+target_net_state_dict[key]*(1-agent.TAU)
    agent.target_net.load_state_dict(target_net_state_dict)

In [28]:
def optimize_model(agent):
    if len(agent.memory)<agent.BATCH_SIZE:
        return
    
    # Load data
    transitions = agent.memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))

    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)
    state_batch = torch.cat(batch.state)
    
    # final state is after simulation is done
    # these are the "s(t+1)" from the transitions
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)),dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
    
    # For these batch states we compute optimal policy according to our policy-network
    # Compute Q(s,a) and get actions - these are the actions that would have been taken for each batch state 
    state_action_values = agent.policy_net(state_batch).gather(1,action_batch)
    
    # Compute V(s_t+1)
    next_state_values = torch.zeros(agent.BATCH_SIZE)
    with torch.no_grad():
        # Use the "old" target_net to obtain value (Expected Return) on optimal actions from sampled s(t+1)
        # This is Q_old(s',a')
        next_state_values[non_final_mask] = agent.target_net(non_final_next_states).max(1)[0]
        
    # Here we have add r + max Q_old(s',a')
    expected_state_action_values = (next_state_values*agent.GAMMA)+reward_batch

    # Compute the loss
    # Q_new(s,a) - r + max Q_old(s',a'): is the error
    # Huber loss function
    criterion = nn.SmoothL1Loss()
    agent.loss = criterion(state_action_values,expected_state_action_values.unsqueeze(1))
    agent.optimizer.zero_grad()
    agent.loss.backward()
    torch.nn.utils.clip_grad_value_(agent.policy_net.parameters(),100)
    agent.optimizer.step()   

In [29]:
def find_nearest(array, value):
    array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return array[idx] 

In [30]:
def track(agent, q1c):
    with torch.no_grad():
        test = torch.tensor([[q1c]], dtype = torch.float32)
        q = agent1.policy_net(test)
        print(f'Loss:{agent.loss.item()},BETA:{round(agent.BETA_THRESHOLD,2)},EPS:{round(agent.EPS_THRESHOLD,2)},P:{getProbs(agent,agent.BETA_THRESHOLD)},Q:{q}')

In [51]:
N_ACTIONS = 10
Qgrid = np.linspace(game1.qm*0.75, game1.qc*1.5, N_ACTIONS)
print(Qgrid, game1.qm)

[13.5 15.  16.5 18.  19.5 21.  22.5 24.  25.5 27. ] 18.0


In [54]:
BATCH_SIZE = 512 # number of transitions sampled from replay buffer
GAMMA = 0.9 # Discount factor
EPS_START = 0.9 # Initial value of Epsilon
EPS_END = 0.01 # End value of Epsilon
EPS_DECAY = 1000 # controls decay rate of Epsilon
TAU = 0.005 # update rate of target network
LR = 1e-4 # Learning rate of Adam
BETA_START = 0.9 # temperature of the greedy-exploratory policy
BETA_END = 0.1 # End value of temperature
BETA_DECAY = 1000 # controls decay rate of temperature
N_MEMORY = 10000
N_STATES = 1

agent1 = Agent(N_STATES, N_ACTIONS, N_MEMORY, BATCH_SIZE, GAMMA, TAU, LR,
                    EPS_START, EPS_END, EPS_DECAY,
                    BETA_START, BETA_END, BETA_DECAY)

In [58]:
prices = []
quantities = []

epochs = 10000
for t in range(epochs):
    print('\n Iteration:',t)

    # Take action
    a1 = select_action(agent1)
    q1 = Qgrid[a1]
    
    print(q1)
    
    # Obtain Reward
    r1 = game1.profit(q1)
    r1 = torch.tensor([r1])
    
    # Compute next state
    next_state1 = agent1.state 
    
    # store memory in transition
    agent1.memory.push(agent1.state, next_state1, a1, r1)

    # move to next state
    agent1.state = next_state1
    
    # optimize
    optimize_model(agent1)

    # soft update target_net
    update_target_net(agent1)
    
    # record actions
    agent1.action_history.append(q1)
    quantities.append(q1)
    
    # print loss
    #track(agent1, 1)



 Iteration: 0


RuntimeError: invalid multinomial distribution (encountering probability entry < 0)

In [67]:
getProbs(agent1, agent1.BETA_THRESHOLD)     

tensor([[0., 0., 0., 0., 0., 0., 0., nan, 0., 0.]], grad_fn=<DivBackward0>)

In [66]:
agent1.policy_net(agent1.state)/agent1.BETA_THRESHOLD/torch.sum(torch.exp(agent1.policy_net(agent1.state)/agent1.BETA_THRESHOLD))

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], grad_fn=<DivBackward0>)

In [56]:
    a1 = select_action(agent1)
    q1 = Qgrid[a1]
    

RuntimeError: invalid multinomial distribution (encountering probability entry < 0)

In [57]:
q1, game1.qm

(15.0, 18.0)