/
result.py
504 lines (441 loc) · 18.6 KB
/
result.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
# This file is part of BenchExec, a framework for reliable benchmarking:
# https://github.com/sosy-lab/benchexec
#
# SPDX-FileCopyrightText: 2007-2020 Dirk Beyer <https://www.sosy-lab.org>
#
# SPDX-License-Identifier: Apache-2.0
import collections
import functools
import os
from benchexec import BenchExecException
# CONSTANTS
# categorization of a run result
# 'correct' and 'wrong' refer to whether the tool's result matches the expected result.
# 'confirmed' and 'unconfirmed' refer to whether the tool's result was confirmed (e.g., by witness validation)
CATEGORY_CORRECT = "correct"
"""run result given by tool is correct (we use 'correct' instead of 'correct-confirmed')"""
CATEGORY_CORRECT_UNCONFIRMED = "correct-unconfirmed"
"""run result given by tool is correct but not confirmed"""
CATEGORY_WRONG = "wrong"
"""run result given by tool is wrong (we use 'wrong' instead of 'wrong-unconfirmed')"""
CATEGORY_UNKNOWN = "unknown"
"""run result given by tool is "unknown" (i.e., no answer)"""
CATEGORY_ERROR = "error"
"""tool failed, crashed, or hit a resource limit"""
CATEGORY_MISSING = "missing"
"""BenchExec could not determine whether run result was correct or wrong
because no property was defined, and no other categories apply."""
# internal property names used in this module (should not contain spaces)
# previously used by SV-COMP (http://sv-comp.sosy-lab.org/2014/rules.php)
_PROP_LABEL = "unreach-label"
# currently used by SV-COMP (http://sv-comp.sosy-lab.org/2016/rules.php)
_PROP_CALL = "unreach-call"
_PROP_TERMINATION = "termination"
_PROP_OVERFLOW = "no-overflow"
_PROP_DEADLOCK = "no-deadlock"
_PROP_DEREF = "valid-deref"
_PROP_FREE = "valid-free"
_PROP_MEMTRACK = "valid-memtrack"
_PROP_MEMCLEANUP = "valid-memcleanup"
# for Java verification:
_PROP_ASSERT = "assert"
# specification given as an automaton:
_PROP_AUTOMATON = "observer-automaton"
# for solvers:
_PROP_SAT = "sat"
# internal meta property
# (conjunction of valid-deref, valid-free, and valid-memtrack; cf. _MEMSAFETY_SUBPROPERTIES below)
_PROP_MEMSAFETY = "valid-memsafety"
# possible run results (output of a tool)
RESULT_DONE = "done"
"""tool terminated properly and true/false does not make sense"""
RESULT_UNKNOWN = "unknown"
"""tool could not find out an answer due to incompleteness"""
RESULT_ERROR = "ERROR" # or any other value not listed here
"""tool could not complete due to an error
(it is recommended to instead use a string with more details about the error)"""
RESULT_TRUE_PROP = "true"
"""property holds"""
RESULT_FALSE_PROP = "false"
"""property does not hold"""
RESULT_FALSE_REACH = RESULT_FALSE_PROP + "(" + _PROP_CALL + ")"
# old spelling "false(reach)", kept so that results produced with it are still
# recognized (it is part of RESULT_LIST below)
_RESULT_FALSE_REACH_OLD = RESULT_FALSE_PROP + "(reach)"
"""SV-COMP reachability property violated"""
RESULT_FALSE_TERMINATION = RESULT_FALSE_PROP + "(" + _PROP_TERMINATION + ")"
"""SV-COMP termination property violated"""
RESULT_FALSE_OVERFLOW = RESULT_FALSE_PROP + "(" + _PROP_OVERFLOW + ")"
"""SV-COMP overflow property violated"""
RESULT_FALSE_DEADLOCK = RESULT_FALSE_PROP + "(" + _PROP_DEADLOCK + ")"
"""deadlock property violated""" # not yet part of SV-COMP
RESULT_FALSE_DEREF = RESULT_FALSE_PROP + "(" + _PROP_DEREF + ")"
"""SV-COMP valid-deref property violated"""
RESULT_FALSE_FREE = RESULT_FALSE_PROP + "(" + _PROP_FREE + ")"
"""SV-COMP valid-free property violated"""
RESULT_FALSE_MEMTRACK = RESULT_FALSE_PROP + "(" + _PROP_MEMTRACK + ")"
"""SV-COMP valid-memtrack property violated"""
RESULT_FALSE_MEMCLEANUP = RESULT_FALSE_PROP + "(" + _PROP_MEMCLEANUP + ")"
"""SV-COMP valid-memcleanup property violated"""
RESULT_WITNESS_CONFIRMED = "witness confirmed"
"""SV-COMP property violated and witness confirmed"""
RESULT_SAT = "sat"
"""task is satisfiable"""
RESULT_UNSAT = "unsat"
"""task is unsatisfiable"""
# List of all possible results.
# If a result is not in this list, it is handled as RESULT_CLASS_OTHER.
RESULT_LIST = [
    RESULT_TRUE_PROP,
    RESULT_FALSE_PROP,
    RESULT_FALSE_REACH,
    _RESULT_FALSE_REACH_OLD,
    RESULT_FALSE_TERMINATION,
    RESULT_FALSE_DEREF,
    RESULT_FALSE_FREE,
    RESULT_FALSE_MEMTRACK,
    RESULT_FALSE_MEMCLEANUP,
    RESULT_WITNESS_CONFIRMED,
    RESULT_SAT,
    RESULT_UNSAT,
    RESULT_FALSE_OVERFLOW,
    RESULT_FALSE_DEADLOCK,
]
RESULT_LIST_OTHER = [RESULT_DONE, RESULT_ERROR, RESULT_UNKNOWN]
"""list of unspecific standard results besides true/false"""
# Classification of results
RESULT_CLASS_TRUE = "true"
RESULT_CLASS_FALSE = "false"
RESULT_CLASS_OTHER = "other"
# This maps content of property files to property name.
# Keys are characteristic substrings (or full contents) of a property file;
# cf. Property.create, which looks these up per line.
_PROPERTY_NAMES = {
    "LTL(G ! label(": _PROP_LABEL,
    "LTL(G ! call(__VERIFIER_error()))": _PROP_CALL,
    "LTL(F end)": _PROP_TERMINATION,
    "LTL(G valid-free)": _PROP_FREE,
    "LTL(G valid-deref)": _PROP_DEREF,
    "LTL(G valid-memtrack)": _PROP_MEMTRACK,
    "LTL(G valid-memcleanup)": _PROP_MEMCLEANUP,
    "OBSERVER AUTOMATON": _PROP_AUTOMATON,
    "SATISFIABLE": _PROP_SAT,
    "LTL(G ! overflow)": _PROP_OVERFLOW,
    "LTL(G ! deadlock)": _PROP_DEADLOCK,
}
# This maps a possible result substring of a file name
# to the expected result string of the tool and the set of properties
# for which this result is relevant.
_FILE_RESULTS = {
    "_true-unreach-label": (RESULT_TRUE_PROP, {_PROP_LABEL}),
    "_true-unreach-call": (RESULT_TRUE_PROP, {_PROP_CALL}),
    "_true_assert": (RESULT_TRUE_PROP, {_PROP_ASSERT}),
    "_true-termination": (RESULT_TRUE_PROP, {_PROP_TERMINATION}),
    "_true-valid-deref": (RESULT_TRUE_PROP, {_PROP_DEREF}),
    "_true-valid-free": (RESULT_TRUE_PROP, {_PROP_FREE}),
    "_true-valid-memtrack": (RESULT_TRUE_PROP, {_PROP_MEMTRACK}),
    "_true-valid-memcleanup": (RESULT_TRUE_PROP, {_PROP_MEMCLEANUP}),
    "_true-valid-memsafety": (
        RESULT_TRUE_PROP,
        {_PROP_DEREF, _PROP_FREE, _PROP_MEMTRACK},
    ),
    "_true-no-overflow": (RESULT_TRUE_PROP, {_PROP_OVERFLOW}),
    "_true-no-deadlock": (RESULT_TRUE_PROP, {_PROP_DEADLOCK}),
    "_false-unreach-label": (RESULT_FALSE_REACH, {_PROP_LABEL}),
    "_false-unreach-call": (RESULT_FALSE_REACH, {_PROP_CALL}),
    "_false_assert": (RESULT_FALSE_REACH, {_PROP_ASSERT}),
    "_false-termination": (RESULT_FALSE_TERMINATION, {_PROP_TERMINATION}),
    "_false-valid-deref": (RESULT_FALSE_DEREF, {_PROP_DEREF}),
    "_false-valid-free": (RESULT_FALSE_FREE, {_PROP_FREE}),
    "_false-valid-memtrack": (RESULT_FALSE_MEMTRACK, {_PROP_MEMTRACK}),
    "_false-valid-memcleanup": (RESULT_FALSE_MEMCLEANUP, {_PROP_MEMCLEANUP}),
    "_false-no-overflow": (RESULT_FALSE_OVERFLOW, {_PROP_OVERFLOW}),
    "_false-no-deadlock": (RESULT_FALSE_DEADLOCK, {_PROP_DEADLOCK}),
    "_sat": (RESULT_SAT, {_PROP_SAT}),
    "_unsat": (RESULT_UNSAT, {_PROP_SAT}),
}
_MEMSAFETY_SUBPROPERTIES = {_PROP_DEREF, _PROP_FREE, _PROP_MEMTRACK}
# Map a property to all possible results for it.
_VALID_RESULTS_PER_PROPERTY = {
    _PROP_ASSERT: {RESULT_TRUE_PROP, RESULT_FALSE_PROP, RESULT_FALSE_REACH},
    _PROP_LABEL: {RESULT_TRUE_PROP, RESULT_FALSE_PROP, RESULT_FALSE_REACH},
    _PROP_CALL: {RESULT_TRUE_PROP, RESULT_FALSE_PROP, RESULT_FALSE_REACH},
    _PROP_AUTOMATON: {RESULT_TRUE_PROP, RESULT_FALSE_PROP, RESULT_FALSE_REACH},
    _PROP_DEREF: {RESULT_TRUE_PROP, RESULT_FALSE_PROP, RESULT_FALSE_DEREF},
    _PROP_FREE: {RESULT_TRUE_PROP, RESULT_FALSE_PROP, RESULT_FALSE_FREE},
    _PROP_MEMTRACK: {RESULT_TRUE_PROP, RESULT_FALSE_PROP, RESULT_FALSE_MEMTRACK},
    _PROP_MEMSAFETY: {
        RESULT_TRUE_PROP,
        RESULT_FALSE_DEREF,
        RESULT_FALSE_FREE,
        RESULT_FALSE_MEMTRACK,
    },
    _PROP_MEMCLEANUP: {RESULT_TRUE_PROP, RESULT_FALSE_PROP, RESULT_FALSE_MEMCLEANUP},
    _PROP_OVERFLOW: {RESULT_TRUE_PROP, RESULT_FALSE_PROP, RESULT_FALSE_OVERFLOW},
    _PROP_DEADLOCK: {RESULT_TRUE_PROP, RESULT_FALSE_PROP, RESULT_FALSE_DEADLOCK},
    _PROP_TERMINATION: {RESULT_TRUE_PROP, RESULT_FALSE_PROP, RESULT_FALSE_TERMINATION},
    _PROP_SAT: {RESULT_SAT, RESULT_UNSAT},
}
# Score values taken from http://sv-comp.sosy-lab.org/
# (use values 0 to disable scores completely for a given property).
_SCORE_CORRECT_TRUE = 2
_SCORE_CORRECT_UNCONFIRMED_TRUE = 1
_SCORE_CORRECT_FALSE = 1
_SCORE_CORRECT_UNCONFIRMED_FALSE = 0
_SCORE_UNKNOWN = 0
_SCORE_WRONG_FALSE = -16
_SCORE_WRONG_TRUE = -32
# Fields: result is True/False if the property is expected to hold/be violated
# (None if unknown), subproperty names the violated memsafety subproperty if
# applicable (cf. expected_results_of_file).
ExpectedResult = collections.namedtuple("ExpectedResult", "result subproperty")
"""Stores the expected result and respective information for a task"""
class Property(object):
    """Stores information about a property (the specification checked by a tool).

    Attributes:
    filename: path of the property file this instance was created from, or None
    is_well_known: whether the property is one of the properties known to this module
    is_svcomp: whether the SV-COMP format and scoring scheme apply
    name: property name (a well-known internal name, or derived from the file name)
    subproperties: list of sub-property names for meta properties (e.g. memsafety), or None
    """

    def __init__(self, filename, is_well_known, is_svcomp, name, subproperties):
        self.filename = filename
        self.is_well_known = is_well_known
        self.is_svcomp = is_svcomp
        self.name = name
        self.subproperties = subproperties

    @property
    def names(self):
        """Return the list of (sub)property names covered by this property."""
        return self.subproperties or [self.name]

    def compute_score(self, category, result):
        """Return the achieved score for a result, or None if scoring does not apply."""
        if not self.is_svcomp:
            return None
        return _svcomp_score(category, result)

    def max_score(self, expected_result):
        """
        Return the maximum possible score for a task that uses this property.
        @param expected_result:
            an ExpectedResult indicating whether the property is expected to hold for the task
        """
        if not self.is_svcomp or not expected_result:
            return 0
        return _svcomp_max_score(expected_result.result)

    def __repr__(self):
        return "{}({self.filename!r}, {self.is_well_known!r}, {self.is_svcomp!r}, {self.name!r}, {self.subproperties!r})".format(
            self.__class__.__name__, self=self
        )

    def __str__(self):
        return (
            ("SV-COMP-" if self.is_svcomp else "")
            + "Property "
            + (self.name if self.is_well_known else "from " + self.filename)
        )

    @classmethod
    @functools.lru_cache()  # cache because it reads files
    def create(cls, propertyfile, allow_unknown):
        """
        Create a Property instance by attempting to parse the given property file.
        @param propertyfile: A file name of a property file
        @param allow_unknown: Whether to accept unknown properties
        @raise BenchExecException: if the file contains no known property
            and allow_unknown is False
        """
        with open(propertyfile) as f:
            content = f.read().strip()

        # parse content for known properties
        is_svcomp = False
        known_properties = []
        only_known_svcomp_property = True
        if content == "OBSERVER AUTOMATON" or content == "SATISFIABLE":
            known_properties = [_PROPERTY_NAMES[content]]
        elif content.startswith("CHECK"):
            is_svcomp = True
            for line in filter(None, content.splitlines()):
                # BUGFIX: each individual line must be tested here, not the whole
                # content (which always starts with "CHECK" in this branch, so the
                # non-SV-COMP fallback below was unreachable and non-CHECK lines
                # were misparsed as CHECK clauses).
                if line.startswith("CHECK"):
                    # SV-COMP property, either a well-known one or a new one
                    props_in_line = [
                        prop
                        for (substring, prop) in _PROPERTY_NAMES.items()
                        if substring in line
                    ]
                    if len(props_in_line) == 1:
                        known_properties.append(props_in_line[0])
                    else:
                        only_known_svcomp_property = False
                else:
                    # not actually an SV-COMP property file
                    is_svcomp = False
                    known_properties = []
                    break

        # check if some known property content was found
        subproperties = None
        if only_known_svcomp_property and len(known_properties) == 1:
            is_well_known = True
            name = known_properties[0]
        elif (
            only_known_svcomp_property
            and set(known_properties) == _MEMSAFETY_SUBPROPERTIES
        ):
            # the three memsafety subproperties together form the meta property
            is_well_known = True
            name = _PROP_MEMSAFETY
            subproperties = list(known_properties)
        else:
            if not allow_unknown:
                raise BenchExecException(
                    'File "{0}" does not contain a known property.'.format(propertyfile)
                )
            is_well_known = False
            # fall back to the base name of the property file as property name
            name = os.path.splitext(os.path.basename(propertyfile))[0]

        return cls(propertyfile, is_well_known, is_svcomp, name, subproperties)

    @classmethod
    def create_from_names(cls, property_names):
        """
        Create a Property instance from a list of well-known property names
        @param property_names: a non-empty list of property names
        """
        assert property_names
        if len(property_names) == 1:
            name = property_names[0]
            subproperties = None
        else:
            name = (
                _PROP_MEMSAFETY
                if set(property_names) == _MEMSAFETY_SUBPROPERTIES
                else "unknown property"
            )
            subproperties = list(property_names)
        is_well_known = name in _VALID_RESULTS_PER_PROPERTY.keys()
        is_svcomp = is_well_known and (_PROP_SAT not in property_names)
        return cls(None, is_well_known, is_svcomp, name, subproperties)
def expected_results_of_file(filename):
    """Create a dict of property->ExpectedResult from information encoded in a filename."""
    results = {}
    for marker, (tool_result, relevant_props) in _FILE_RESULTS.items():
        if marker not in filename:
            continue
        # Translate the canonical tool result into a boolean "property holds".
        result_class = get_result_classification(tool_result)
        assert result_class in {RESULT_CLASS_TRUE, RESULT_CLASS_FALSE}
        holds = result_class == RESULT_CLASS_TRUE
        subproperty = None
        if len(relevant_props) > 1:
            # Only the memsafety meta property maps to several subproperties,
            # and only for the expected result "true".
            assert relevant_props == _MEMSAFETY_SUBPROPERTIES
            assert holds
            prop = _PROP_MEMSAFETY
        else:
            prop = next(iter(relevant_props))
            if prop in _MEMSAFETY_SUBPROPERTIES and not holds:
                # a violated memsafety subproperty is recorded under the meta property
                subproperty = prop
                prop = _PROP_MEMSAFETY
        if prop in results:
            raise BenchExecException(
                "Duplicate property {} in filename {}".format(prop, filename)
            )
        results[prop] = ExpectedResult(holds, subproperty)
    return results
def _svcomp_max_score(expected_result):
    """
    Return the maximum possible score for a task according to the SV-COMP scoring scheme.
    @param expected_result: whether the property is fulfilled for the task or not
    """
    # Identity checks on purpose: only real booleans earn a score
    # (e.g. None means the expected verdict is unknown).
    if expected_result is not True and expected_result is not False:
        return 0
    return _SCORE_CORRECT_TRUE if expected_result else _SCORE_CORRECT_FALSE
def _svcomp_score(category, result):
    """
    Return the achieved score of a task according to the SV-COMP scoring scheme.
    @param category: result category as determined by get_result_category
    @param result: the result given by the tool
    """
    assert result is not None
    result_class = get_result_classification(result)

    # Scored categories must have a true/false result class; anything else
    # is an internal inconsistency.
    if category == CATEGORY_CORRECT:
        if result_class == RESULT_CLASS_TRUE:
            return _SCORE_CORRECT_TRUE
        assert result_class == RESULT_CLASS_FALSE, result
        return _SCORE_CORRECT_FALSE

    if category == CATEGORY_CORRECT_UNCONFIRMED:
        if result_class == RESULT_CLASS_TRUE:
            return _SCORE_CORRECT_UNCONFIRMED_TRUE
        assert result_class == RESULT_CLASS_FALSE
        return _SCORE_CORRECT_UNCONFIRMED_FALSE

    if category == CATEGORY_WRONG:
        if result_class == RESULT_CLASS_TRUE:
            return _SCORE_WRONG_TRUE
        assert result_class == RESULT_CLASS_FALSE
        return _SCORE_WRONG_FALSE

    # unknown/error/missing categories do not score
    return _SCORE_UNKNOWN
def score_for_task(properties, category, result):
    """
    Return the possible score of task, depending on whether the result is correct or not.
    """
    assert result is not None
    if not properties:
        return None
    # Scoring applies only to SV-COMP-style properties.
    prop = Property.create_from_names(properties)
    return _svcomp_score(category, result) if prop.is_svcomp else None
def get_result_classification(result):
    """
    Classify the given result into "true" (property holds),
    "false" (property does not hold), "unknown", and "error".
    @param result: The result given by the tool (needs to be one of the RESULT_* strings to be recognized).
    @return One of RESULT_CLASS_* strings
    """
    if result in RESULT_LIST:
        if result in (RESULT_TRUE_PROP, RESULT_SAT):
            return RESULT_CLASS_TRUE
        # everything else in RESULT_LIST is some form of property violation
        return RESULT_CLASS_FALSE
    # Unlisted results of the shape "false(...)" still count as violations.
    if result and result.startswith(RESULT_FALSE_PROP + "(") and result.endswith(")"):
        return RESULT_CLASS_FALSE
    return RESULT_CLASS_OTHER
def get_result_category(expected_results, result, properties):
    """
    This function determines the relation between actual result and expected result
    for the given task and properties.
    @param expected_results: dict that maps property file names to ExpectedResult instances
    @param result: The result given by the tool (needs to be one of the RESULT_* strings to be recognized).
    @param properties: The list of Property instances to check.
    @return One of the CATEGORY_* strings.
    """
    result_class = get_result_classification(result)
    if result_class == RESULT_CLASS_OTHER:
        if result == RESULT_UNKNOWN:
            return CATEGORY_UNKNOWN
        if result == RESULT_DONE:
            return CATEGORY_MISSING
        return CATEGORY_ERROR

    if not properties:
        # Without property we cannot return correct or wrong results.
        return CATEGORY_MISSING
    # For now, we have at most one property
    assert len(properties) == 1, properties
    prop = properties[0]

    expected_result = expected_results.get(prop.filename)
    if not expected_result or expected_result.result is None:
        # expected result of task is unknown
        return CATEGORY_MISSING

    subproperty = expected_result.subproperty
    if prop.is_well_known:
        # for well-known properties, only support hard-coded results
        is_valid_result = result in _VALID_RESULTS_PER_PROPERTY[prop.name]
    elif subproperty:
        is_valid_result = result in {
            RESULT_TRUE_PROP,
            RESULT_FALSE_PROP + "(" + subproperty + ")",
        }
    else:
        is_valid_result = result == RESULT_TRUE_PROP or result.startswith(
            RESULT_FALSE_PROP
        )
    if not is_valid_result:
        return CATEGORY_UNKNOWN  # result does not match property

    if expected_result.result:
        # property expected to hold: only a "true"-class answer is correct
        return CATEGORY_CORRECT if result_class == RESULT_CLASS_TRUE else CATEGORY_WRONG
    if subproperty:
        # property violated via a specific subproperty: the tool must name exactly it
        expected = RESULT_FALSE_PROP + "(" + subproperty + ")"
        return CATEGORY_CORRECT if result == expected else CATEGORY_WRONG
    # property violated without subproperty: any "false"-class answer is correct
    return CATEGORY_CORRECT if result_class == RESULT_CLASS_FALSE else CATEGORY_WRONG