#!/usr/bin/env python3
import asyncio
import collections
import copy
import math
import numpy as np
import random
#only needed when tableType == DB
try:
    from sqlitedict import SqliteDict
except ImportError:
    SqliteDict = None
import sys
from game import Game
import model
import moves
#MC Regret Matching (technically MCCFR with Outcome Sampling, I think)
#options for how to store the table
#memory is faster, but db allows for more data
MEMORY = 1
DB = 2
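
#For intuition (worked example, not from the original code): regret matching
#plays each action in proportion to its positive cumulative regret. With
#cumulative regrets [2, 0, 6] over three actions, the strategy is
#[0.25, 0.0, 0.75]; if no regret is positive, it falls back to uniform.
#During search, exploration mixes in uniform noise: with exploration=0.3 that
#strategy becomes 0.7 * [0.25, 0.0, 0.75] + 0.1 = [0.275, 0.1, 0.625]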
class RegretMatchAgent:
    #exploration is the probability of taking a random action instead of
    #following the regret matching distribution
    #initExpVal is the expected value assigned to (state, action, action)
    #triples that haven't been seen before
    #the value applies to both players instead of being zero sum
    #DCFR parameters:
    #posReg is whether to floor cumulative regret at 0
    #probScaling determines how much weight to give to later iterations'
    #contributions to the final strategy's probability
    #regScaling is like probScaling but for contributions to cumulative regret
    #(see the worked example after __init__)
    #tableType determines how we store our data
    #dbLocation is the sqlite db file
    #dbClear is whether to clear the db before running
    #this should be done if any parameters change, or if we just want a fresh
    #start
    def __init__(self, teams, format,
            exploration=0.3, initExpVal=0,
            posReg=False, probScaling=0, regScaling=0,
            tableType=MEMORY,
            dbLocation='./rm-agent', dbClear=True,
            verbose=False):

        self.teams = teams
        self.format = format
        self.exploration = exploration
        self.initExpVal = initExpVal

        #DCFR parameters
        self.posReg = posReg
        self.probScaling = probScaling
        self.regScaling = regScaling

        self.verbose = verbose

        self.tableType = tableType
        #I'm hardcoding reward and count as tables
        #There hasn't been enough success with models to justify
        #abstracting them out
        if tableType == MEMORY:
            self.regretTables = [{}, {}]
            self.probTables = [{}, {}]
            self.rewardTable = {}
            self.countTable = {}
        elif tableType == DB:
            #I was planning on leaving autocommit off, but there are some bugs
            autocommit = True
            #cumulative regret, (state, action) -> regret
            self.regretTables = []
            #cumulative probability, (state, action) -> (non-normalized) probability
            self.probTables = []
            for i in range(2):
                self.regretTables.append(
                    SqliteDict(dbLocation, tablename='regret' + str(i), autocommit=autocommit))
                self.probTables.append(
                    SqliteDict(dbLocation, tablename='prob' + str(i), autocommit=autocommit))
                if dbClear:
                    self.regretTables[i].clear()
                    self.probTables[i].clear()

            #cumulative reward, (state, action1, action2) -> p1's cumulative reward
            self.rewardTable = SqliteDict(dbLocation, tablename='reward', autocommit=autocommit)
            #count, (state, action1, action2) -> count
            self.countTable = SqliteDict(dbLocation, tablename='count', autocommit=autocommit)
            if dbClear:
                self.rewardTable.clear()
                self.countTable.clear()
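
    #Worked example of the DCFR discounting above (illustrative numbers, not
    #from the original code): at iteration iter=3 with probScaling=2, the
    #stored average-strategy weight is first multiplied by ((3+1)/(3+2))**2 =
    #0.64 before this iteration's probabilities are added, so early iterations
    #fade from the average strategy; with regScaling=1.5, cumulative regret is
    #multiplied by 4**1.5 / (4**1.5 + 1) ~= 0.889 before new regret is added.
    #probScaling=0 and regScaling=0 leave plain undiscounted sums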
    def close(self):
        if self.tableType == DB:
            for i in range(2):
                self.regretTables[i].close()
                self.probTables[i].close()
            self.rewardTable.close()
            self.countTable.close()

    async def search(self, ps, pid=0, limit=100, seed=None, initActions=[[], []]):
        #this throws away some good data, but I'm not sure it will matter too much
        #right now memory usage is our biggest enemy
        #if self.tableType == MEMORY:
            #self.regretTables = [{}, {}]
            #self.probTables = [{}, {}]
            #self.rewardTable = {}
            #self.countTable = {}

        #turn init actions into a useful history
        history = [(None, a1, a2) for a1, a2 in zip(*initActions)]
        #insert the seed in the first turn
        if len(history) > 0:
            _, a1, a2 = history[0]
            history[0] = (seed, a1, a2)

        print(end='', file=sys.stderr)
        for i in range(limit):
            print('\rTurn Progress: ' + str(i) + '/' + str(limit), end='', file=sys.stderr)
            game = Game(ps, self.teams, format=self.format, seed=seed, verbose=self.verbose)
            await game.startGame()
            await game.applyHistory(history)
            await self.rmRecur(ps, game, seed, i)
        print(file=sys.stderr)

    def combine(self):
        #no need to combine when we're using a DB
        pass

    #for getting final action probabilities
    def getProbs(self, player, state, actions):
        pt = self.probTables[player]
        probs = np.array([dictGet(pt, (state, a)) for a in actions])
        pSum = np.sum(probs)
        if pSum > 0:
            return probs / pSum
        else:
            return np.array([1 / len(actions) for a in actions])
    #recursive implementation of rm
    #assumes the game is running and is caught up
    #returns p1's expected reward for the playthrough
    #(which is just the actual reward for rm)
    async def rmRecur(self, ps, game, startSeed, iter, depth=0):
        cmdHeaders = ['>p1', '>p2']
        queues = [game.p1Queue, game.p2Queue]
        #all the actions both players can pick
        playerActions = [[], []]
        #the probability of picking each action
        playerProbs = [[], []]
        #the indices of the actions actually picked
        pickedActions = [0, 0]
        #both players make their move
        for i in range(2):
            request = (await queues[i].get())
            if request[0] == Game.END:
                winner = await game.winner
                #have to clear the results out of the queues
                while not game.p1Queue.empty():
                    await game.p1Queue.get()
                while not game.p2Queue.empty():
                    await game.p2Queue.get()
                if winner == 'bot1':
                    return 1
                else:
                    return 0

            req = request[1]
            state = req['stateHash']
            #get rm probs for the actions
            actions = moves.getMoves(self.format, req)
            playerActions[i] = actions
            probs = self.regretMatch(i, state, actions)
            #add exploration, which adds a chance to play randomly
            exploreProbs = probs * (1 - self.exploration) + self.exploration / len(probs)
            playerProbs[i] = probs
            #and sample one action
            pickedActions[i] = np.random.choice(len(actions), p=exploreProbs)

        #apply the picked actions to the game
        seed = Game.getSeed()
        await game.cmdQueue.put('>resetPRNG ' + str(seed))
        for i in range(2):
            pickedAction = pickedActions[i]
            action = playerActions[i][pickedAction]
            await game.cmdQueue.put(cmdHeaders[i] + action)

        #get the reward so we can update our regrets
        reward = await self.rmRecur(ps, game, startSeed, iter, depth=depth+1)

        #save the reward
        a1 = playerActions[0][pickedActions[0]]
        a2 = playerActions[1][pickedActions[1]]
        self.addReward(state, a1, a2, reward)

        #need to update both players' regret and strategy
        for i in range(2):
            #update each action's regret and probability in average strategy
            rt = self.regretTables[i]
            pt = self.probTables[i]
            actions = playerActions[i]
            for j in range(len(actions)):
                #update strategy with this iteration's strategy
                #which just means adding the current probability of each action
                probScale = ((iter+1) / (iter+2))**self.probScaling
                prob = dictGet(pt, (state, actions[j]))
                pt[hash((state, actions[j]))] = probScale * prob + playerProbs[i][j]

                #immediate regret of picked actions is 0, so just skip
                if j == pickedActions[i]:
                    continue

                #get existing regret so we can add to it
                regret = dictGet(rt, (state, actions[j]))
                if self.regScaling != 0:
                    regret *= ((iter+1)**self.regScaling) / ((iter+1)**self.regScaling + 1)
                #get i's possible action and -i's actual action
                #in player order
                if i == 0:
                    a1 = actions[j]
                    a2 = playerActions[1][pickedActions[1]]
                    myReward = reward
                else:
                    a1 = playerActions[0][pickedActions[0]]
                    a2 = actions[j]
                    myReward = 1 - reward
                #get expected value for the potential turn
                expValue = self.getExpValue(i, state, a1, a2)
                #add immediate regret
                if self.posReg:
                    rt[hash((state, actions[j]))] = max(regret + expValue - myReward, 0)
                else:
                    rt[hash((state, actions[j]))] = regret + expValue - myReward

        #pass the actual reward up
        return reward
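
    #Worked example of the update above (illustrative numbers, not from the
    #original code): suppose the sampled playthrough returned reward 0 (p1
    #lost) and the tabular estimate for one of p1's unplayed actions against
    #p2's sampled action is 0.6. That action's cumulative regret grows by
    #0.6 - 0 = 0.6, so it will be sampled more often on later iterations.
    #For p2 the same update uses myReward = 1 - reward, since stored rewards
    #are from p1's perspective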
    #generates probabilities for each action
    def regretMatch(self, player, state, actions):
        rt = self.regretTables[player]
        regrets = np.array([max(0, dictGet(rt, (state, a))) for a in actions])
        rSum = np.sum(regrets)
        if rSum > 0:
            return regrets / rSum
        else:
            return np.array([1 / len(actions) for a in actions])

    #saves the actual reward so it can be used by getExpValue later
    def addReward(self, state, a1, a2, reward):
        rewardSum = dictGet(self.rewardTable, (state, a1, a2))
        count = dictGet(self.countTable, (state, a1, a2))
        self.rewardTable[hash((state, a1, a2))] = rewardSum + reward
        self.countTable[hash((state, a1, a2))] = count + 1

    #returns expected reward for the (state, action, action) tuple
    #which is just the tabular value
    def getExpValue(self, player, state, a1, a2):
        reward = dictGet(self.rewardTable, (state, a1, a2))
        count = dictGet(self.countTable, (state, a1, a2))
        if count == 0:
            return self.initExpVal
        else:
            expValue = reward / count
        #player 2 is the minimizing player
        if player == 1:
            expValue = 1 - expValue
        return expValue
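
    #For example (illustrative numbers, not from the original code): if the
    #stored average reward for a (state, a1, a2) triple is 0.8, getExpValue
    #returns 0.8 for player 0 and 1 - 0.8 = 0.2 for player 1, since rewards
    #are tracked from p1's perspective and the game is treated as zero sum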
#convenience method, treats dict like defaultdict(int)
#which is needed for sqlitedict
#there's probably a better way
def dictGet(table, key):
    #sqlite is stricter about keys, so we have to use a hash
    key = hash(key)
    if key not in table:
        table[key] = 0
    return table[key]
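
#Minimal usage sketch (not part of the original file): this only exercises the
#in-memory tables, so no simulator process is needed. The state hash and action
#strings below are hypothetical placeholders, and teams/format are left as None
#since __init__ only stores them. A full search run would additionally need the
#simulator handle `ps` that Game expects, e.g.
#    await agent.search(ps, limit=100, seed=Game.getSeed())
if __name__ == '__main__':
    agent = RegretMatchAgent(teams=None, format=None, tableType=MEMORY)
    state = 12345
    actions = [' move 1', ' move 2']
    #with empty tables both helpers fall back to the uniform distribution
    print(agent.regretMatch(0, state, actions))
    print(agent.getProbs(0, state, actions))
    agent.close()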