In [1]:
from learningAgent import LearningAlgorithm
from environment import Model, AdversaryModes
from Qtable import QTable
from test import Test
import numpy as np

In [2]:
# Fix NumPy's global random state so that repeated runs draw the same numbers.
np.random.seed(seed=10)

In [3]:
# --- Game parameters ---------------------------------------------------------
total_demand = 400
total_stages = 25

# Per-player costs, ordered [agent, adversary].
agent_cost, adversary_cost = 57, 71
costs = [agent_cost, adversary_cost]

# Two equal entries, each half of the total demand.
initial_state = [total_demand / 2] * 2

# One weight per AdversaryModes member, with all of the weight on index 0.
# We only train against strategies that do not take the previous actions into
# account (myopic or constant strategies).
adversary_probabilities = [0 for _ in range(len(AdversaryModes))]
adversary_probabilities[0] = 1

game = Model(total_demand, costs, total_stages, adversary_probabilities)

In [4]:
# Size of the action set handed to the Q-table.
number_actions = 50
# Size of the state set: the absolute cost gap between the two players plus
# 2 * number_actions + 2.
# NOTE(review): the rationale for this formula is not visible here; confirm
# against the QTable / Model implementation.
number_states = 2 * (number_actions + 1) + abs(agent_cost - adversary_cost)

In [5]:
# The learning rate is passed as [numerator, denominator], which gives a
# learning-rate function of numerator / (n + denominator).
Qtable = QTable(
    number_states,
    number_actions,
    total_stages,
    learning_rate=[490_000, 500_000],
)

In [6]:
# Training settings handed to LearningAlgorithm (and, for the discount factor,
# to Test as well).
discount_factor = 0.9999
number_episodes = 50_000

In [7]:
# Bundle the game, the Q-table and the training settings into the learner.
algorithm = LearningAlgorithm(
    game,
    Qtable,
    number_episodes,
    discount_factor,
)

In [8]:
# Run the learning algorithm.
# NOTE(review): presumably this trains Qtable in place over number_episodes
# episodes (later cells read Qtable after this call) — confirm in learningAgent.
algorithm.solver()

In [9]:
# print(Qtable.Q_table)

In [10]:
# Qtable.Q_table.mean()

In [11]:
# Opponent mix used for testing: one weight per AdversaryModes member, with all
# of the weight on index 0. We can test the Q-Table against any strategy.
adversaryProbs = [0 for _ in range(len(AdversaryModes))]
adversaryProbs[0] = 1

In [12]:
# Evaluate the trained Q-table against the opponent mix chosen for testing.
# Pass adversaryProbs (the test mix defined in the cell above) rather than the
# training list adversary_probabilities, so that editing the test mix actually
# changes the opponent. Both lists currently put all weight on index 0, so the
# results are unchanged.
result = Test(game, Qtable, discount_factor, adversaryProbs)

In [13]:
#import numpy as np
#import matplotlib.pyplot as plt

In [14]:
# The following plots what the Q-Table says is the best action in each state
#states, bestResponses = result.bestResponses()
#plt.scatter(states,bestResponses)
#plt.show()

In [15]:
# Play the game following the Q-table and collect the payoffs, the actions of
# both players and the demand potential that total_payoff() reports.
(
    payoff,
    adversary_payoff,
    actions,
    adversary_actions,
    demand_potential,
) = result.total_payoff()

# One value per line, in the order they were returned.
print(
    payoff,
    adversary_payoff,
    actions,
    adversary_actions,
    demand_potential,
    sep="\n",
)

169997.2470224009
58430.845981412385
[111 101 105 120 113 124 105 128 115 134  93 123 130  98 137 126 105 113
 112 108 113 131  97 140 138]
[135 129 122 118 119 117 119 116 119 118 122 115 117 120 115 120 122 118
 117 116 114 114 118 113 120]
[200. 212. 226. 234. 233. 236. 232. 239. 233. 235. 227. 241. 237. 230.
 241. 230. 227. 235. 237. 239. 243. 243. 234. 244. 230.]


In [16]:
# The percentage error of the Q-table. It has to be measured against the same
# opponent that the table was trained against, because it is a measure of how
# 'complete' the training is.
# NOTE(review): the exact definition of the error lives in Test.error(), which
# is not visible here.
error = result.error()

In [17]:
# Mean of the error values computed above; as the last expression of the cell
# it is shown as the cell's output.
error.mean()

0.7821538466161266

In [18]:
# The following calculates the payoff that the Q-Table gives against the different opponents. 
# It may reach a state in which the Q-Table was not trained. This will cause an error saying 
# either 'max action reached' or 'min action reached'.
# for i in range(len(AdversaryModes)):
#     print(AdversaryModes(i))
#     adversaryProbs=[0]*len(AdversaryModes)
#     adversaryProbs[i]=1
#     result = Test(game, Qtable, discount_factor, adversaryProbs)
#     payoff, advPayoff, actions, advActions, demandPotential = result.total_payoff()
#     print(payoff)
