In [1]:
from learningAgent import LearningAlgorithm
from environment import Model, AdversaryModes
from Qtable import QTable
from test import Test
import numpy as np

In [2]:
# Fix the seed of NumPy's legacy global random generator so that anything
# drawing from np.random after this cell is repeatable between runs.
np.random.seed(seed=10)

In [3]:
# --- Game configuration -----------------------------------------------------
# Per-player costs; the agent's cost is listed first wherever both are passed.
agent_Cost = 57
adv_Cost = 71
tuple_Costs = [agent_Cost, adv_Cost]  # evaluates to [57, 71]

total_Demand = 400
total_Stages = 25

# Half of total_Demand in each of the two slots.
init_State = [total_Demand / 2] * 2

# One entry per member of AdversaryModes, with all of the weight on index 0.
# We only train against strategies that do not take into account the
# previous actions (myopic or constant strategies).
# NOTE(review): presumably Model reads these as per-mode probabilities --
# confirm in environment.Model.
adversaryProbs = [0] * len(AdversaryModes)
adversaryProbs[0] = 1

# Build the game from demand, the [agent, adversary] costs, the number of
# stages, and the adversary weights.
game = Model(total_Demand, [agent_Cost, adv_Cost], total_Stages, adversaryProbs)

In [4]:
# Dimensions handed to QTable when the table is constructed.
num_Actions = 50
# Two states per action, plus two, plus one per unit of absolute cost
# difference between the two players.
num_States = 2 * (num_Actions + 1) + abs(agent_Cost - adv_Cost)

In [5]:
# The learning rate is given as [numerator, denominator], which yields a
# learning-rate function of numerator / (n + denominator).
Qtable = QTable(
    num_States,
    num_Actions,
    total_Stages,
    learning_Rate=[490000, 500000],
)

In [6]:
# Training hyperparameters: both are passed to LearningAlgorithm, and
# discountFactor is additionally passed to Test.
numberEpisodes, discountFactor = 50_000_000, 0.9999

In [7]:
# Bundle the game, the Q-table and the two training hyperparameters
# (episode count, discount factor) into the learner object.
algorithm = LearningAlgorithm(game, Qtable, numberEpisodes, discountFactor)

In [8]:
# Run the learner.
# NOTE(review): presumably this trains Qtable in place for numberEpisodes
# (50,000,000) episodes and is therefore long-running -- confirm in
# learningAgent.LearningAlgorithm.solver.
algorithm.solver()

In [9]:
# print(Qtable.Q_table)

In [10]:
# Qtable.Q_table.mean()

In [11]:
# Adversary weights used for testing: one entry per member of AdversaryModes.
# Here all of the weight is again placed on index 0, the same choice that was
# used when the game was built for training.
adversaryProbs=[0]*len(AdversaryModes)
adversaryProbs[0]=1 # We can test the Q-Table against any strategy.

In [12]:
# Build the evaluation helper from the game, the Q-table, the discount
# factor and the adversary weights chosen for testing.
result = Test(game, Qtable, discountFactor, adversaryProbs)

In [13]:
#import numpy as np
#import matplotlib.pyplot as plt

In [14]:
# The following plots what the Q-Table says is the best action in each state
#states, bestResponses = result.bestResponses()
#plt.scatter(states,bestResponses)
#plt.show()

In [15]:
# totalPayoff() returns the optimal payoff and actions according to the
# Qtable; it is unpacked here into the agent's payoff, the adversary's payoff,
# both players' action sequences and the demand-potential sequence.
payoff, advPayoff, actions, advActions, demandPotential = result.totalPayoff()

# Print the five results one after another, in the order they were unpacked.
print(payoff, advPayoff, actions, advActions, demandPotential, sep="\n")

180214.8627812602
41262.33031321998
[ 89  92 100 104 110 110 106 109 107 106 108 108 108 108 108 108 108 108
 108 108 112 113 118 130 147]
[135 124 116 112 110 110 110 109 109 108 108 108 108 108 108 108 108 108
 108 108 108 109 110 112 116]
[200. 223. 239. 247. 251. 251. 251. 253. 253. 254. 255. 255. 255. 255.
 255. 255. 255. 255. 255. 255. 255. 253. 251. 247. 238.]


In [16]:
# The percentage error of the Qtable. It has to be measured against the same
# opponent the table was trained against, because it is a measure of how
# 'complete' the training is. (In this notebook both the training and the
# testing adversaryProbs put all of their weight on index 0.)
error = result.error()

In [17]:
# Mean of the error values; left as the cell's last expression so that the
# notebook displays it.
error.mean()

0.0006385809554732589

In [18]:
# The following calculates the payoff that the Q-Table gives against each of the different
# opponents. It may reach a state on which the Q-Table was not trained. This will cause an
# error saying either 'max action reached' or 'min action reached'.
# for i in range(len(AdversaryModes)):
#     print(AdversaryModes(i))
#     adversaryProbs=[0]*len(AdversaryModes)
#     adversaryProbs[i]=1
#     result = Test(game, Qtable, discountFactor, adversaryProbs)
#     payoff, advPayoff, actions, advActions, demandPotential = result.totalPayoff()
#     print(payoff)
