#!/usr/bin/env python3
import asyncio
import collections
import copy
import math
import numpy as np
import random
import sys
from game import Game
import moves
#Exp3
class Exp3Agent:
def __init__(self, teams, format, gamma=0.3, verbose=False):
self.teams = teams
self.format = format
self.verbose = verbose
self.mcData = []
for i in range(2):
data = {
'gamma': gamma,
'countTable': collections.defaultdict(int),
'expValueTable': collections.defaultdict(int),
'seenStates': {},
}
self.mcData.append(data)
async def search(self, ps, pid=0, limit=100, seed=None, initActions=[[],[]]):
await mcSearchExp3(
ps,
self.format,
self.teams,
self.mcData,
limit=limit,
seed=seed,
p1InitActions=initActions[0],
p2InitActions=initActions[1],
verbose=self.verbose)
def getProbs(self, player, state, actions):
return getProbsExp3(self.mcData[player], state, actions)
def combine(self):
self.mcData = combineExp3Data([self.mcData])[0]
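#typical use of the agent (a sketch; ps is the simulator handle that gets
#passed through to Game, and the surrounding async driver is assumed here):
#  agent = Exp3Agent(teams, format, gamma=0.3)
#  await agent.search(ps, limit=100)
#  probs = agent.getProbs(0, stateHash, legalActions)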
#initActions is a list of initial actions that will be blindly taken
#mcData has countTable, which maps (state, action) to count
#mcData has expValueTable, which maps (state, action) to an expected value
#both should be defaultdicts to 0
#mcData has gamma, a number in [0,1], the probability of picking a random move
#iter is the iteration number, which may be used to compute gamma
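#for reference, a single player's mcData is shaped like this (illustrative
#values; the real keys are state hashes and action strings from moves.getMoves):
#  {'gamma': 0.3,
#   'countTable': {(state, action): 12, ...},
#   'expValueTable': {(state, action): 7.5, ...},
#   'seenStates': {state: True, ...}}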
async def mcExp3Impl(requestQueue, cmdQueue, cmdHeader, mcData,
format, iter=0, initActions=[], verbose=False):
countTable = mcData['countTable']
expValueTable = mcData['expValueTable']
gamma = mcData['gamma']
seenStates = mcData['seenStates']
    #history of (state, action, prob) so we can update the count and
    #expected-value tables once the game ends
history = []
#we're going to be popping off this
initActions = copy.deepcopy(initActions)
running = True
inInitActions = True
while running:
request = await requestQueue.get()
if verbose:
print(cmdHeader, 'got request', request)
if request[0] == Game.REQUEST or request[0] == Game.ERROR:
req = request[1]
state = req['stateHash']
seenStates[state] = True
actions = moves.getMoves(format, req)
#check if we ran out of initActions on the previous turn
#if so, we need to change the PRNG
if inInitActions and len(initActions) == 0:
inInitActions = False
#no problem if both players reset the PRNG
await cmdQueue.put('>resetPRNG')
#calculate a probability for each action
#need the probs from the initActions so we can update,
#so we always calculate this
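            #for reference, this is the standard Exp3 distribution over K
            #actions: p_i = (1-gamma) * x_i / sum_j(x_j) + gamma/K,
            #with x_i = exp(eta * V_i) and eta = gamma/K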
            eta = gamma / len(actions)
            expValues = [expValueTable[(state, action)] for action in actions]
            #subtract the max before exponentiating for numerical stability
            maxExpValue = max(expValues)
            xs = [math.exp(eta * (v - maxExpValue)) for v in expValues]
            xSum = np.sum(xs)
            probs = np.array([(1-gamma) * x / xSum + gamma / len(actions) for x in xs])
            #defensively clamp any negative entries to 0, then renormalize
            probs = np.maximum(probs, 0)
            probs = probs / np.sum(probs)
if len(initActions) > 0:
#blindly pick init action
bestAction = initActions[0]
bestActionIndex = actions.index(bestAction)
bestActionProb = probs[bestActionIndex]
initActions = initActions[1:]
else:
#pick action based on probs
bestActionIndex = np.random.choice(len(actions), p=probs)
bestAction = actions[bestActionIndex]
bestActionProb = probs[bestActionIndex]
#save our action
history.append((state, bestAction, bestActionProb))
if verbose:
print('picked', cmdHeader + bestAction)
await cmdQueue.put(cmdHeader + bestAction)
elif request[0] == Game.END:
            #update countTable and expValueTable with our history + result
reward = request[1]
#rescale reward from [-1,1] to [0,1]
reward = (reward + 1) / 2
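            #dividing each reward by the probability of the action taken is
            #the usual Exp3 importance-weighted estimate: an action picked
            #with probability p contributes reward/p about a p fraction of
            #the time, so the accumulated estimate stays unbiased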
for state, action, prob in history:
countTable[(state, action)] += 1
expValueTable[(state,action)] += reward / prob
running = False
#Exp3
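#plays `limit` complete games; the two players update their own halves of
#mcData, and every game contributes one Exp3 update for each (state, action)
#pair visited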
async def mcSearchExp3(ps, format, teams, mcData, limit=100,
seed=None, p1InitActions=[], p2InitActions=[], verbose=False):
    for i in range(limit):
        print(f'\rTurn Progress: {i}/{limit}', end='', file=sys.stderr)
game = Game(ps, teams, format=format, seed=seed, verbose=verbose)
await game.startGame()
await asyncio.gather(
mcExp3Impl(game.p1Queue, game.cmdQueue,
">p1", mcData=mcData[0], format=format,
initActions=p1InitActions, verbose=verbose),
mcExp3Impl(game.p2Queue, game.cmdQueue,
">p2", mcData=mcData[1], format=format,
initActions=p2InitActions, verbose=verbose))
print(file=sys.stderr)
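#merges per-search mcData sets into one pair of tables, dropping entries for
#states that were not seen in the most recent iteration (and letting
#valueModel purge against the seen set, when one is given)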
def combineExp3Data(mcDatasets, valueModel=None):
num = len(mcDatasets)
#record which states were seen in the last iteration
seenStates = {}
for data in mcDatasets:
for j in range(2):
seen = data[j]['seenStates']
for state in seen:
seenStates[state] = True
if valueModel:
valueModel.purge(seenStates)
if num == 1:
for data in mcDatasets:
for j in range(2):
countTable = data[j]['countTable']
expValueTable = data[j]['expValueTable']
keys = list(countTable)
for state, action in keys:
if state not in seenStates:
del countTable[(state, action)]
if (state, action) in expValueTable:
del expValueTable[(state, action)]
return mcDatasets
#combine data on states that were seen in any search
#in the last iteration
    combMcData = [{
        'countTable': collections.defaultdict(int),
        'expValueTable': collections.defaultdict(int),
        'seenStates': {},
        #.get, not [], because the __init__ above never sets avgGamma and a
        #direct index would raise a KeyError here
        'avgGamma': mcDatasets[0][j].get('avgGamma'),
        'gamma': mcDatasets[0][j]['gamma']} for j in range(2)]
for data in mcDatasets:
for j in range(2):
countTable = data[j]['countTable']
expValueTable = data[j]['expValueTable']
for state, action in countTable:
if state in seenStates:
combMcData[j]['countTable'][(state, action)] += countTable[(state, action)]
combMcData[j]['expValueTable'][(state, action)] += expValueTable[(state, action)]
#copy the combined data back into the datasets
return [copy.deepcopy(combMcData) for j in range(num)]
#returns the final probabilities of each action in the state
def getProbsExp3(mcData, state, actions):
countTable = mcData['countTable']
counts = [countTable[(state, action)] for action in actions]
expValueTable = mcData['expValueTable']
totalCount = np.sum(counts)
    #subtract the uniform exploration share (gamma/K of the total plays) from
    #each count, so the final policy reflects only the exploitation plays;
    #not certain this adjustment is required, but experiments seem to show
    #that it helps
gamma = mcData['gamma']
probs = np.array([max(0, c - gamma * totalCount / len(actions)) for c in counts])
probs = probs / np.sum(probs)
return probs
#should return the expected value for the state
#expValueTable accumulates reward/prob, so x / c is an importance-weighted
#average reward for a move; weighting that by the move's current probability
#(x * p / c) and combining across moves approximates the state's value under
#the current policy (the math here is heuristic, but the numbers seem to
#work out)
def getExpValueExp3(mcData, state, actions, probs):
countTable = mcData['countTable']
counts = [countTable[(state, action)] for action in actions]
expValueTable = mcData['expValueTable']
xvs = []
for i in range(len(actions)):
action = actions[i]
if counts[i] == 0:
continue
xv = expValueTable[(state, action)] * probs[i] / counts[i]
xvs.append(xv)
    if not xvs:
        #no recorded plays for any action in this state; fall back to 0
        return 0
    return np.mean(xvs)
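#quick sanity check of the pure Exp3 bookkeeping above, using toy data
#(a sketch: the state hash and action names are made up, not produced by the
#game, and the counts/values were chosen by hand)
if __name__ == '__main__':
    toy = {
        'gamma': 0.3,
        'countTable': collections.defaultdict(int),
        'expValueTable': collections.defaultdict(int),
        'seenStates': {},
    }
    state = 12345
    actions = ['move 1', 'move 2']
    #pretend 'move 1' was played far more often and earned most of the reward
    toy['countTable'][(state, 'move 1')] = 90
    toy['countTable'][(state, 'move 2')] = 10
    toy['expValueTable'][(state, 'move 1')] = 80.0
    toy['expValueTable'][(state, 'move 2')] = 2.0
    probs = getProbsExp3(toy, state, actions)
    print('final probs:', probs)
    print('state value:', getExpValueExp3(toy, state, actions, probs))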