In [1]:
from learningAgent import LearningAlgorithm
from environment import Model, AdversaryModes
from Qtable import QTable
from test import Test
import numpy as np

In [2]:
# Seed NumPy's global random generator so that repeated runs draw the same random numbers.
# NOTE(review): this only makes training reproducible if the imported project modules draw
# their randomness from np.random — confirm.
np.random.seed(10)

In [3]:
# Game parameters: total demand, the agent's and adversary's costs, and the number of stages.
total_demand = 400
agent_cost, adversary_cost = 57, 71
costs = [agent_cost, adversary_cost]
total_stages = 25
initial_state = [total_demand / 2] * 2  # each entry is half of total_demand

# Weight per adversary mode: every mode gets 0 except index 10, which gets 1.
# We only train against strategies that do not take the previous actions into account
# (myopic or constant strategies).
adversary_probabilities = [0 for _ in range(len(AdversaryModes))]
adversary_probabilities[10] = 1

game = Model(total_demand, costs, total_stages, adversary_probabilities)

In [4]:
# Sizes handed to the Q-table: the number of actions and the number of demand values.
# NOTE(review): the rationale for the demand formula (cost gap + 10 * actions + 2) is not
# visible in this notebook — confirm against the Q-table / environment code.
number_actions = 150
number_demands = 10 * number_actions + abs(agent_cost - adversary_cost) + 2

In [5]:
# The learning rate is specified as [numerator, denominator], which yields the learning
# rate function numerator / (n + denominator).
Qtable = QTable(
    number_demands,
    number_actions,
    total_stages,
    learning_rate=[490_000, 500_000],
)

In [6]:
# Training hyperparameters: how many episodes to run and the discount factor.
number_episodes = 10 ** 6
discount_factor = 0.9999

In [7]:
# Build the learner from the game, the Q-table, the episode count and the discount factor.
# NOTE(review): argument roles are inferred from the variable names — confirm against
# learningAgent.LearningAlgorithm.
algorithm = LearningAlgorithm(game, Qtable, number_episodes, discount_factor)

In [8]:
# Presumably runs the training loop that fills the Q-table — confirm against
# LearningAlgorithm.solver. With number_episodes = 1_000_000 this cell may take a long time.
algorithm.solver()

In [9]:
# print(Qtable.Q_table)

In [10]:
# Qtable.Q_table.mean()

In [11]:
# Adversary weights for testing: every mode gets 0 except index 10, which gets 1.
# The Q-table can be tested against any strategy by moving the 1 to another index.
adversaryProbs = [0 for _ in range(len(AdversaryModes))]
adversaryProbs[10] = 1

In [12]:
# Evaluate the trained Q-table against the adversary mix chosen for testing (adversaryProbs)
# instead of the training mix, so that editing adversaryProbs actually changes the opponent.
# Note: result.error() is only meaningful when the test opponent equals the training
# opponent, so keep adversaryProbs equal to adversary_probabilities when using it.
result = Test(game, Qtable, discount_factor, adversaryProbs)

In [13]:
import numpy as np
import matplotlib.pyplot as plt

In [14]:
# Get the optimal payoff and actions according to the Q-table, then print both payoffs,
# both action sequences and the demand potential, one value per line.
(payoff, adversary_payoff,
 actions, adversary_actions,
 demand_potential) = result.total_payoff()
print(payoff, adversary_payoff, actions, adversary_actions, demand_potential, sep="\n")

66058.47203847661
92132.55061550891
[120 112 135 123 113 112 101 107  92 102  92  95  94 100  94  90  93  93
  98  89  97  96  87  88 103]
[132 132  94  94  94  94  94  94  94  94  94  94  94  94  94  94  94  94
  94  94  94  94  94  94 161]
[200. 206. 216. 195. 180. 170. 161. 157. 150. 151. 147. 148. 147. 147.
 144. 144. 146. 146. 146. 144. 146. 144. 143. 146. 149.]


In [15]:
# The percentage error of the Q-table. This has to be measured against the same opponent
# that it was trained against, as it is a measure of how 'complete' the training is.
# NOTE(review): the meaning of the argument 1000 is not visible in this notebook (possibly
# a number of sampled evaluations) — confirm against Test.error.
result.error(1000)

0.029335938230328894

In [16]:
# The following calculates the payoff that the Q-table achieves against each of the different
# opponents. It may reach a state for which the Q-table was not trained. This will cause an
# error saying either 'max action reached' or 'min action reached'.
# for i in range(len(AdversaryModes)):
#     print(AdversaryModes(i))
#     adversaryProbs=[0]*len(AdversaryModes)
#     adversaryProbs[i]=1
#     result = Test(game, Qtable, discount_factor, adversaryProbs)
#     payoff, advPayoff, actions, advActions, demandPotential = result.total_payoff()
#     print(payoff)
