In [1]:
from learningAgent import LearningAlgorithm
from environment import Model, AdversaryModes
import numpy as np

In [2]:
# Seed NumPy's global random generator so anything drawing from it is repeatable.
# NOTE(review): presumably LearningAlgorithm samples through np.random — confirm.
np.random.seed(10)

In [3]:
# Configuration handed to the environment model.
total_Demand = 400
tuple_Costs = [57, 71]
total_Stages = 25
# Initial state: two equal halves of total_Demand.
init_State = [total_Demand / 2] * 2
adversary_Mode = AdversaryModes.myopic

# Build the environment from the settings above.
game = Model(
    total_Demand,
    tuple_Costs,
    total_Stages,
    init_State,
    adversary_Mode,
)

In [4]:
# Entry 0 of tuple_Costs is used as the agent's cost, entry 1 as the adversary's.
agent_Cost, adv_Cost = tuple_Costs[0], tuple_Costs[1]
num_Actions = 10
# Number of rows of the Q-table: the cost gap, plus two per action, plus two.
num_States = 2 * num_Actions + abs(adv_Cost - agent_Cost) + 2

In [5]:
# Zero-initialised table with one row per state and one column per action.
Qtable = np.zeros((num_States, num_Actions))

In [6]:
# Hyper-parameters passed to LearningAlgorithm; discountFactor is also the
# weight used when the error cell rebuilds the one-step target.
numberEpisodes = 10_000
discountFactor = 0.99

In [7]:
# Bind the environment, the Q-table and the hyper-parameters into the learner.
# NOTE(review): Qtable is presumably updated in place by the learner, since the
# error cell reads it after solver() runs — confirm in learningAgent.
algorithm = LearningAlgorithm(game, Qtable, numberEpisodes, discountFactor)

In [8]:
# Run the learner.
# NOTE(review): presumably trains for numberEpisodes episodes and fills Qtable — confirm.
algorithm.solver()

In [9]:
# Debug aid: uncomment to dump the learned Q-table.
# print(Qtable)

In [10]:
# Calculate Error
# For every (state, action) cell, rebuild the one-step target
#     (1 - discountFactor) * reward + discountFactor * max_a' Q[next_state, a']
# from the transition written out below, and store its relative deviation from
# the learned Q-value.
Qtable_error = np.zeros((num_States, num_Actions))

# Bounds of the tabulated states, centred on half of the total demand.
# They are loop-invariant, so compute them once instead of on every iteration.
# The literals 200 / 400 used previously are assumed to be total_Demand / 2 and
# total_Demand (identical values for total_Demand = 400).
lowestState = int(total_Demand/2 - num_States/2)
highestState = int(total_Demand/2 + num_States/2 - 1)

for s in range(num_States):
    state = s + lowestState

    # Both quantities depend on the state only, not on the agent's action.
    monopoly_price = int((state + agent_Cost)/2) + 1
    adv_action = int((total_Demand - state + adv_Cost)/2) + 1

    for a in range(num_Actions):
        # Column a maps to a price in the num_Actions-wide window ending at monopoly_price.
        action = a + monopoly_price - num_Actions + 1

        reward = (state - action) * (action - agent_Cost)
        next_state = int(state + (adv_action - action)/2)
        #print(state,monopoly_price,action,reward,adv_action, next_state)

        ns = next_state - lowestState
        # A negative row index would silently wrap around to the other end of
        # the table and yield a meaningless error value, so fail loudly instead.
        if not 0 <= ns < num_States:
            raise IndexError(
                f"next_state {next_state} reached from state {state} with action {action} "
                f"is outside the tabulated range [{lowestState}, {lowestState + num_States - 1}]"
            )

        opt_value_next = max(Qtable[ns])
        new_value = (1-discountFactor)*reward + discountFactor * opt_value_next
        Qtable_error[s,a] = (new_value - Qtable[s,a])/new_value

In [11]:
# Show the relative-error matrix (rows: states, columns: actions).
print(Qtable_error)

[[0.03 0.03 0.06 0.03 0.01 0.03 0.03 0.02 0.01 0.  ]
 [0.01 0.01 0.01 0.02 0.02 0.03 0.01 0.01 0.03 0.01]
 [0.02 0.03 0.02 0.01 0.03 0.02 0.02 0.01 0.04 0.02]
 [0.   0.02 0.03 0.03 0.01 0.03 0.01 0.02 0.01 0.01]
 [0.02 0.01 0.03 0.   0.   0.03 0.04 0.01 0.03 0.03]
 [0.02 0.03 0.02 0.01 0.04 0.   0.08 0.05 0.01 0.01]
 [0.01 0.03 0.03 0.01 0.02 0.04 0.01 0.02 0.03 0.01]
 [0.01 0.02 0.01 0.02 0.01 0.06 0.08 0.01 0.04 0.04]
 [0.13 0.03 0.01 0.03 0.08 0.02 0.   0.06 0.01 0.03]
 [0.02 0.01 0.01 0.02 0.02 0.05 0.02 0.01 0.02 0.02]
 [0.   0.02 0.01 0.02 0.01 0.01 0.01 0.01 0.   0.06]
 [0.03 0.02 0.02 0.01 0.03 0.02 0.01 0.01 0.04 0.02]
 [0.02 0.   0.   0.02 0.02 0.03 0.02 0.05 0.01 0.02]
 [0.   0.01 0.06 0.02 0.03 0.04 0.05 0.02 0.05 0.02]
 [0.06 0.   0.   0.01 0.01 0.04 0.06 0.03 0.03 0.07]
 [0.03 0.04 0.01 0.04 0.02 0.01 0.04 0.02 0.12 0.02]
 [0.01 0.04 0.04 0.03 0.01 0.03 0.   0.02 0.05 0.04]
 [0.05 0.03 0.01 0.03 0.04 0.   0.03 0.   0.02 0.03]
 [0.03 0.02 0.04 0.03 0.03 0.   0.   0.01 0.01