policyGradExp.py
from QTPAgent import JointQTPAgent, OptimalPolicyQueryAgent
from policyGradientAgents import PolicyGradientQueryAgent,\
AprilAgent, SamplingAgent
from actionQueryAgents import MILPActionAgent
from trajAgents import BeliefChangeTrajAgent, RandomTrajAgent, DisagreeTrajAgent,\
MILPTrajAgent
import CMPExp
import util
import sys
import random
import getopt
import config
from cmp import QueryType
import numpy
from tabularNavigation import ThreeStateToy
from driving import Driving
"""
Experiments that need function approximation, currently including mountain car,
driving, and some toy domains. Command-line arguments select the policy gradient
algorithm and the candidate feature extractors.
"""
flags = "r:l:s:d:a:vq:P:t:k:n:y:x"
def experiment(cmp, feat, featLength, rewardSet, initialPhi):
# discount factor
gamma = 1
agentName = 'MILP-POLICY'
try:
opts, args = getopt.getopt(sys.argv[1:], flags)
except getopt.GetoptError:
print 'unknown flag encountered'
sys.exit(2)
for opt, arg in opts:
if opt == '-r':
random.seed(int(arg))
numpy.random.seed(int(arg))
elif opt == '-a':
agentName = arg
elif opt == '-v':
config.VERBOSE = True
elif opt == '-x':
config.DEBUG = True
# the true reward function is chosen according to initialPhi
trueRewardIdx = util.sample(initialPhi, range(len(rewardSet)))
if config.VERBOSE:
print 'true reward', trueRewardIdx
# continue initializing the cmp object
cmp.decorate(gamma, None, trueRewardIdx, rewardSet[trueRewardIdx])
cmp.setPossibleRewardValues([0, 1])
  if agentName in ('JQTP', 'NQ', 'WAIT'):
    queryType = QueryType.ACTION
    agent = JointQTPAgent(cmp, rewardSet, initialPhi, queryType, gamma)
  elif agentName == "MILP":
    # MILP optimizes action queries, so it shares the ACTION query type
    queryType = QueryType.ACTION
    agent = MILPActionAgent(cmp, rewardSet, initialPhi, queryType, gamma)
elif agentName == "OPT-POLICY":
queryType = QueryType.POLICY
agent = OptimalPolicyQueryAgent(cmp, rewardSet, initialPhi, queryType, gamma)
elif agentName == "MILP-POLICY":
queryType = QueryType.POLICY
agent = PolicyGradientQueryAgent(cmp, rewardSet, initialPhi, queryType, feat, featLength, gamma)
elif agentName == "SAMPLE-POLICY":
queryType = QueryType.POLICY
agent = SamplingAgent(cmp, rewardSet, initialPhi, queryType, feat, featLength, gamma)
elif agentName == "RAND-POLICY":
config.SAMPLE_TIMES = 1
queryType = QueryType.POLICY
agent = SamplingAgent(cmp, rewardSet, initialPhi, queryType, feat, featLength, gamma)
elif agentName == "MILP-SIMILAR":
queryType = QueryType.SIMILAR
agent = MILPTrajAgent(cmp, rewardSet, initialPhi, queryType, feat, featLength, gamma)
elif agentName == "SIMILAR-DISAGREE":
queryType = QueryType.SIMILAR
agent = DisagreeTrajAgent(cmp, rewardSet, initialPhi, queryType, feat, featLength, gamma)
elif agentName == "SIMILAR-VARIATION":
queryType = QueryType.SIMILAR
agent = BeliefChangeTrajAgent(cmp, rewardSet, initialPhi, queryType, feat, featLength, gamma)
elif agentName == "SIMILAR-RANDOM":
queryType = QueryType.SIMILAR
agent = RandomTrajAgent(cmp, rewardSet, initialPhi, queryType, gamma)
else:
raise Exception("Unknown Agent " + agentName)
if config.VERBOSE:
print "Query type:", queryType
ret, qValue, time = CMPExp.experiment(cmp, agent, gamma, rewardSet, queryType, horizon=cmp.horizon)
print ret, qValue, time
if config.PRINT == 'perf':
    f = open(agentName + str(config.opts) + '.out', 'a')
f.write(str(qValue) + ' ' + str(time) + '\n')
f.close()
def mountainCarExp(Domain):
  def feat((x, v), a):
    """
    The returned feature vector has length vecLen * |A|: the features for each
    action occupy a disjoint block of the vector, and the remaining blocks are zero.
    """
vec = (x, v, x**2, x**3, x * v, x**2 * v, x**3 * v, v**2, 1)
#vec = (x, v, x**2, v**2, x * v, 1)
#vec = (x,)
vecLen = len(vec)
if a == -1:
return numpy.array(vec + (0,) * 2 * vecLen)
elif a == 0:
return numpy.array((0,) * vecLen + vec + (0,) * vecLen)
elif a == 1:
return numpy.array((0,) * 2 * vecLen + vec)
else:
raise Exception('unknown action. different domain?')
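  # illustration: vecLen == 9 here, so e.g. feat((x, v), 0) is
  # (0,) * 9 + (x, v, x**2, ...) + (0,) * 9, with only the block for action 0 active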
  featLength = len(feat((0, 0), 1)) # use an arbitrary state-action pair to determine the feature length
horizon = 20
  def makeReward(loc0, loc1, reward):
    def r(s, a):
      # rewards are defined on states; the action a is a dummy argument here
      if loc0 < s[0] < loc1 and -0.02 <= s[1] <= 0.02:
        return reward
      else:
        return 0
    return r
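  # e.g. makeReward(-1.0, -0.9, 10) pays 10 whenever -1.0 < x < -0.9 and |v| <= 0.02;
  # below, the two extreme regions pay 10 and the intermediate ones pay 1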
rewardSet = [makeReward(i, i + 0.1, 10 if i == -1.0 or i == 0.9 else 1) for i in [-1.0, -0.7, -0.4, 0.3, 0.6, 0.9]]
rewardCandNum = len(rewardSet)
initialPhi = [1.0 / rewardCandNum] * rewardCandNum
terminalReward = util.Counter()
cmp = Domain(0, horizon, terminalReward)
experiment(cmp, feat, featLength, rewardSet, initialPhi)
def drivingExp():
numOfLanes = 3
length = 5
numOfCars = 5
carLength = 0.5
#cars = [(random.random() * length, random.randint(0, numOfLanes - 1)) for _ in range(numOfCars)]
cars = [(1, 1), (2, 0), (3, 2), (4, 0), (4, 1)]
print cars
def feat(s, a):
# binary vector to indicate the current lane of the robot
vec = [1 if s[1] == i else 0 for i in range(numOfLanes)]
    # for the left, current, and right lanes, mark whether some car ahead in
    # that lane is within carLength, within unit distance, or farther away
    for i in [s[1] - 1, s[1], s[1] + 1]:
      laneFeat = [0, 0, 0]
      for car in cars:
        if car[1] == i and car[0] - s[0] > 0: # only consider cars ahead of the robot
          if car[0] - s[0] < carLength:
            laneFeat[0] = 1
          elif car[0] - s[0] < 1:
            laneFeat[1] = 1
          else:
            laneFeat[2] = 1
      vec = vec + laneFeat
vec = tuple(vec)
vecLen = len(vec)
if a == 'W':
return numpy.array(vec + (0,) * 2 * vecLen)
elif a == 'E':
return numpy.array((0,) * vecLen + vec + (0,) * vecLen)
elif a == 'N':
return numpy.array((0,) * 2 * vecLen + vec)
else:
raise Exception('unknown action ' + str(a) + '. different domain?')
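  # with 3 lanes this yields 3 lane indicators + 3 x 3 distance bins = 12
  # features per action, i.e. a 36-dimensional vector over the actions W, E, N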
  featLength = len(feat((0, 1), 'N')) # use an arbitrary state-action pair to determine the feature length
horizon = 50
terminalReward = util.Counter()
cmp = Driving(0, horizon, terminalReward, length=length, lanes=numOfLanes)
drivers = {
'niceDriver': {'cars': -1, 'lane': None},
'nastyDriver': {'cars': 1, 'lane': None},
#'dangerousDriver': {'cars': -1, 'backOfCars': 1, 'lane': None},
'rightNiceDriver': {'cars': -1, 'lane': 0},
#'rightNastyDriver': {'cars': 1, 'lane': 0},
'leftNiceDriver': {'cars': -1, 'lane': numOfLanes - 1},
#'leftNastyDriver': {'cars': 1, 'lane': numOfLanes - 1},
#'middleNiceDriver': {'cars': -1, 'lane': numOfLanes / 2},
#'middleNastyDriver': {'cars': 1, 'lane': numOfLanes / 2},
#'leftLaneDriver': {'cars': 0, 'lane': 0},
'middleLaneDriver': {'cars': 0, 'lane': numOfLanes / 2},
#'rightLaneDriver': {'cars': 0, 'lane': numOfLanes - 1},
}
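  # each spec above is turned into a reward function by makeReward below: 'cars'
  # is the reward for driving right behind a car (negative for nice drivers,
  # positive for nasty ones) and 'lane' is the preferred lane, or None for no
  # lane preference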
  def makeReward(spec):
    def r(s, a):
      reward = 0
      for car in cars:
        # reward or penalize being right behind a car in the same lane
        if car[1] == s[1] and s[0] < car[0] < s[0] + carLength:
          reward += spec['cars']
        if 'backOfCars' in spec:
          if car[1] == s[1] and s[0] - carLength < car[0] < s[0]:
            reward += spec['backOfCars']
      # small bonus for staying in the preferred lane; compare with `is not None`
      # so that lane 0 (rightNiceDriver's preference) is not silently ignored
      if spec['lane'] is not None and s[1] == spec['lane']:
        reward += 0.01
      return reward
    return r
  rewardSet = [makeReward(driverSpec) for driverSpec in drivers.values()]
rewardCandNum = len(rewardSet)
initialPhi = [1.0 / rewardCandNum] * rewardCandNum
experiment(cmp, feat, featLength, rewardSet, initialPhi)
def threeStateExp():
  # a simple domain for sanity-checking that our policy gradient implementation works
featLength = 3
if config.POLICY_TYPE == 'linear':
# direct policy control.
# (normalized) theta1, theta2, theta3 are the prob. of reaching three terminal states
# this is a very simple problem to solve. check this first when debugging!
feat = None
elif config.POLICY_TYPE == 'softmax':
# theta1 .. theta3 are the values of states
    def feat(s, a):
      if s != (0, 0): vec = (0, 0, 0)
      elif a == (1, 0): vec = (1, 0, 0)
      elif a == (-1, 0): vec = (0, 1, 0)
      elif a == (0, 1): vec = (0, 0, 1)
      else: raise Exception('should be unreachable')
      return numpy.array(vec)
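    # under 'softmax', the query policy is pi(a | s) proportional to
    # exp(theta . feat(s, a)); with these indicator features, theta_1, theta_2,
    # theta_3 act as the values of the three terminal states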
horizon = 1
rewardSet = [lambda s, a: 1 if a == (-1, 0) else (.6 if a == (0, 1) else 0),\
lambda s, a: 1 if a == (1, 0) else (.6 if a == (0, 1) else 0)]
rewardCandNum = len(rewardSet)
initialPhi = [1.0 / rewardCandNum] * rewardCandNum
terminalReward = util.Counter()
Domain = ThreeStateToy
cmp = Domain(0, horizon, terminalReward)
experiment(cmp, feat, featLength, rewardSet, initialPhi)
if __name__ == '__main__':
  # uncomment one of the alternatives below to run a different experiment
  # (the mountain car domains would also need MountainCar / MountainCarToy imported)
  #mountainCarExp(MountainCar)
  #mountainCarExp(MountainCarToy)
  #threeStateExp()
  drivingExp()