In [1]:
from learningAgent import LearningAlgorithm
from environment import Model, AdversaryModes
from Qtable import QTable
from test import Test
import numpy as np

In [2]:
# Fix the state of NumPy's global random generator so repeated runs start from
# the same point. NOTE(review): this only covers code that draws from
# np.random; confirm the project modules do not use another random source.
np.random.seed(seed=10)

In [3]:
# --- Game configuration ---
total_Demand = 400
agent_Cost = 57
adv_Cost = 71
# Built from the two names above so the cost pair cannot drift out of sync
# with the individual values.
tuple_Costs = [agent_Cost, adv_Cost]
total_Stages = 7
# Both entries start at half of the total demand.
init_State = [total_Demand / 2] * 2

# One weight per adversary mode; all of the weight is placed on mode 0.
# We only train against strategies that do not take into account the
# previous actions (myopic or constant strategies).
adversaryProbs = [0 for _ in range(len(AdversaryModes))]
adversaryProbs[0] = 1

game = Model(
    total_Demand,
    [agent_Cost, adv_Cost],
    total_Stages,
    adversaryProbs,
)

In [4]:
# Dimensions handed to the Q-table constructor.
num_Actions = 6
# Cost gap between the two players plus 2 * (num_Actions + 1).
# NOTE(review): the reasoning behind this state count is not documented here;
# confirm against the environment's state definition.
num_States = abs(agent_Cost - adv_Cost) + 2 * (num_Actions + 1)

In [5]:
# The learning rate is given as [numerator, denominator], which yields a
# learning-rate function of numerator / (n + denominator).
Qtable = QTable(
    num_States,
    num_Actions,
    total_Stages,
    learning_Rate=[490000, 500000],
)

In [6]:
# Training hyper-parameters.
discountFactor = 0.99     # passed to both LearningAlgorithm and Test
numberEpisodes = 200_000  # passed to LearningAlgorithm

In [7]:
# Bundle the game, the Q-table and the training hyper-parameters into the learner.
algorithm = LearningAlgorithm(
    game,
    Qtable,
    numberEpisodes,
    discountFactor,
)

In [8]:
# Run the learning algorithm configured above. The cell shows no output, so the
# results are presumably written into the Qtable object that was handed to the
# constructor — TODO confirm against LearningAlgorithm.solver.
algorithm.solver()

In [9]:
# Dump the learned values. The printed output is abbreviated with "..." because
# the table is large; it appears to be a 3-D array (axis meaning not shown here).
print(Qtable.Q_table)

[[[367.56 311.89 255.79 ... 144.72  91.21  41.3 ]
  [366.81 311.15 255.05 ... 143.98  90.6   41.4 ]
  [366.89 311.23 255.13 ... 144.06  90.68  41.48]
  [365.54 309.87 253.77 ... 142.84  90.05  41.54]
  [365.58 309.91 253.81 ... 142.88  90.09  41.58]
  [364.77 309.1  253.   ... 142.07  89.42  41.6 ]]

 [[368.26 312.59 256.49 ... 145.42  91.91  42.  ]
  [368.35 312.68 256.58 ... 145.51  92.    42.09]
  [367.57 311.91 255.81 ... 144.74  91.36  42.16]
  [367.62 311.96 255.86 ... 144.79  91.41  42.21]
  [366.24 310.57 254.47 ... 143.54  90.75  42.24]
  [366.25 310.58 254.48 ... 143.55  90.76  42.25]]

 [[370.88 315.21 259.12 ... 147.31  93.21  42.6 ]
  [368.96 313.29 257.19 ... 146.12  92.61  42.7 ]
  [369.04 313.37 257.27 ... 146.2   92.69  42.78]
  [368.25 312.59 256.49 ... 145.42  92.04  42.84]
  [368.29 312.63 256.53 ... 145.46  92.08  42.88]
  [366.9  311.23 255.13 ... 144.2   91.41  42.9 ]]

 ...

 [[401.76 346.09 289.86 ... 175.69 117.74  59.04]
  [401.85 346.18 289.95 ... 175.78 117

In [10]:
# Qtable.Q_table.mean()

In [11]:
# We can test the Q-Table against any strategy. Here all of the weight is put
# on adversary mode 0, the same distribution that was used to build the game.
adversaryProbs = [0 for _ in range(len(AdversaryModes))]
adversaryProbs[0] = 1

In [12]:
# Build the evaluation helper for the trained Q-table against the chosen
# adversary distribution.
result = Test(
    game,
    Qtable,
    discountFactor,
    adversaryProbs,
)

In [13]:
#import numpy as np
#import matplotlib.pyplot as plt

In [14]:
# The following plots what the Q-Table says is the best action in each state
#states, bestResponses = result.bestResponses()
#plt.scatter(states,bestResponses)
#plt.show()

In [15]:
# Returns the optimal payoff and actions according to the Qtable
# payoff, advPayoff, actions, advActions, demandPotential = result.totalPayoff()
# print(payoff)
# print(advPayoff)
# print(actions)
# print(advActions)
# print(demandPotential)

In [16]:
# The percentage error of the Q-table. This has to be measured against the same
# opponent that it was trained against, as it is a measure of how 'complete'
# the training is. (`result` was built with adversaryProbs putting all weight on
# mode 0, matching the training setup.)
error = result.error()

In [17]:
# Mean of the error values returned by result.error(); as the last expression
# of the cell it is displayed as the cell's output.
error.mean()

-3.4578751899138774e-18

In [18]:
# The following calculates the payoff that the Q-Table gives against the different opponents.
# It may reach a state in which the Q-Table was not trained. This will cause an error saying 
# either 'max action reached' or 'min action reached'.
# for i in range(len(AdversaryModes)):
#     print(AdversaryModes(i))
#     adversaryProbs=[0]*len(AdversaryModes)
#     adversaryProbs[i]=1
#     result = Test(game, Qtable, discountFactor, adversaryProbs)
#     payoff, advPayoff, actions, advActions, demandPotential = result.totalPayoff()
#     print(payoff)
