/
datasetconstruction.py
528 lines (441 loc) · 23.7 KB
/
datasetconstruction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
""" Functions for creating datasets """
from __future__ import division, print_function, absolute_import, unicode_literals
#***************************************************************************************************
# Copyright 2015, 2019 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
# Under the terms of Contract DE-NA0003525 with NTESS, the U.S. Government retains certain rights
# in this software.
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0 or in the LICENSE file in the root pyGSTi directory.
#***************************************************************************************************
import numpy as _np
import numpy.random as _rndm
import warnings as _warnings
import collections as _collections
import itertools as _itertools
from ..objects import dataset as _ds
from ..objects import labeldicts as _ld
from ..baseobjs import label as _lbl
from ..tools import compattools as _compat
from . import circuitconstruction as _gstrc
from pprint import pprint
def generate_fake_data(modelOrDataset, circuit_list, nSamples,
                       sampleError="multinomial", seed=None, randState=None,
                       aliasDict=None, collisionAction="aggregate",
                       recordZeroCnts=True, comm=None, memLimit=None, times=None):
    """Creates a DataSet using the probabilities obtained from a model.

    Parameters
    ----------
    modelOrDataset : Model or DataSet object
        If a Model, the model whose probabilities generate the data.
        If a DataSet, the data set whose frequencies generate the data.

    circuit_list : list of (tuples or Circuits) or None
        Each tuple or Circuit contains operation labels and
        specifies a gate sequence whose counts are included
        in the returned DataSet. e.g. ``[ (), ('Gx',), ('Gx','Gy') ]``

    nSamples : int or list of ints or None
        The simulated number of samples for each operation sequence. This only has
        effect when ``sampleError == "binomial"`` or ``"multinomial"``. If an
        integer, all operation sequences have this number of total samples. If a list,
        integer elements specify the number of samples for the corresponding
        operation sequence. If ``None``, then `modelOrDataset` must be a
        :class:`~pygsti.objects.DataSet`, and total counts are taken from it
        (on a per-circuit basis).

    sampleError : string, optional
        What type of sample error is included in the counts. Can be:

        - "none" - no sample error: counts are floating point numbers such
          that the exact probabilty can be found by the ratio of count / total.
        - "clip" - no sample error, but clip probabilities to [0,1] so, e.g.,
          counts are always positive.
        - "round" - same as "clip", except counts are rounded to the nearest
          integer.
        - "binomial" - the number of counts is taken from a binomial
          distribution. Distribution has parameters p = (clipped) probability
          of the operation sequence and n = number of samples. This can only be used
          when there are exactly two SPAM labels in modelOrDataset.
        - "multinomial" - counts are taken from a multinomial distribution.
          Distribution has parameters p_k = (clipped) probability of the gate
          string using the k-th SPAM label and n = number of samples.

    seed : int, optional
        If not ``None``, a seed for numpy's random number generator, which
        is used to sample from the binomial or multinomial distribution.

    randState : numpy.random.RandomState
        A RandomState object to generate samples from. Can be useful to set
        instead of `seed` if you want reproducible distribution samples across
        multiple random function calls but you don't want to bother with
        manually incrementing seeds between those calls.

    aliasDict : dict, optional
        A dictionary mapping single operation labels into tuples of one or more
        other operation labels which translate the given operation sequences before values
        are computed using `modelOrDataset`. The resulting Dataset, however,
        contains the *un-translated* operation sequences as keys.

    collisionAction : {"aggregate", "keepseparate"}
        Determines how duplicate operation sequences are handled by the resulting
        `DataSet`. Please see the constructor documentation for `DataSet`.

    recordZeroCnts : bool, optional
        Whether zero-counts are actually recorded (stored) in the returned
        DataSet. If False, then zero counts are ignored, except for
        potentially registering new outcome labels.

    comm : mpi4py.MPI.Comm, optional
        When not ``None``, an MPI communicator for distributing the computation
        across multiple processors and ensuring that the *same* dataset is
        generated on each processor.

    memLimit : int, optional
        A rough memory limit in bytes which is used to determine job allocation
        when there are multiple processors.

    times : iterable, optional
        When not None, a list of time-stamps at which data should be sampled.
        `nSamples` samples will be simulated at each time value, meaning that
        each circuit in `circuit_list` will be evaluated with the given time
        value as its *start time*.

    Returns
    -------
    DataSet
        A static data set filled with counts for the specified operation sequences.
    """
    NTOL = 10
    # BUGFIX: was `TOL = 1 / (10**-NTOL)` which equals 1e10 (not 1e-10), making
    # every tolerance comparison below trivially true/false and silently
    # disabling the out-of-bounds warnings and sum-to-1 renormalization.
    TOL = 10**-NTOL

    if isinstance(modelOrDataset, _ds.DataSet):
        dsGen = modelOrDataset
        gsGen = None
        dataset = _ds.DataSet(collisionAction=collisionAction,
                              outcomeLabelIndices=dsGen.olIndex)  # keep same outcome labels
    else:
        gsGen = modelOrDataset
        dsGen = None
        dataset = _ds.DataSet(collisionAction=collisionAction)

    if aliasDict:
        aliasDict = {_lbl.Label(ky): tuple((_lbl.Label(el) for el in val))
                     for ky, val in aliasDict.items()}  # convert to use Labels

    if gsGen and times is None:
        # Compute all circuit probabilities up front (more efficient than per-circuit)
        trans_circuit_list = [_gstrc.translate_circuit(s, aliasDict)
                              for s in circuit_list]
        all_probs = gsGen.bulk_probs(trans_circuit_list, comm=comm, memLimit=memLimit)

    if comm is None or comm.Get_rank() == 0:  # only root rank computes
        if sampleError in ("binomial", "multinomial"):
            if randState is None:
                rndm = _rndm.RandomState(seed)  # ok if seed is None
            else:
                rndm = randState

        for k, s in enumerate(circuit_list):
            trans_s = _gstrc.translate_circuit(s, aliasDict)
            circuit_times = times if times is not None else ["N/A dummy"]
            counts_list = []
            for tm in circuit_times:
                if gsGen:
                    if times is None:
                        ps = all_probs[trans_s]
                    else:
                        ps = gsGen.probs(trans_s, time=tm)

                    if sampleError in ("binomial", "multinomial"):
                        # Clip probabilities into [0,1]; warn only when the
                        # violation exceeds the numerical tolerance.
                        for ol in ps:
                            if ps[ol] < 0:
                                if ps[ol] < -TOL: _warnings.warn("Clipping probs < 0 to 0")
                                ps[ol] = 0.0
                            elif ps[ol] > 1:
                                if ps[ol] > (1 + TOL): _warnings.warn("Clipping probs > 1 to 1")
                                ps[ol] = 1.0
                else:
                    ps = _collections.OrderedDict([(ol, frac) for ol, frac
                                                   in dsGen[trans_s].fractions.items()])

                if gsGen and sampleError in ("binomial", "multinomial"):
                    # Binomial/multinomial random calls assume sum(probs) == 1,
                    # so check (and nudge, if needed) before sampling.
                    psum = sum(ps.values())
                    adjusted = False
                    if psum > 1 + TOL:
                        adjusted = True
                        _warnings.warn("Adjusting sum(probs) > 1 to 1")
                    if psum < 1 - TOL:
                        adjusted = True
                        _warnings.warn("Adjusting sum(probs) < 1 to 1")

                    OVERTOL = 1.0 + TOL
                    UNDERTOL = 1.0 - TOL

                    def normalized(): return UNDERTOL <= sum(ps.values()) <= OVERTOL
                    if not normalized():
                        m = max(ps.values())
                        ps = {lbl: round(p / m, NTOL) for lbl, p in ps.items()}
                        # BUGFIX: removed leftover debug `print(sum(ps.values()))`
                    assert normalized(), 'psum={}'.format(sum(ps.values()))
                    if adjusted:
                        _warnings.warn('Adjustment finished')

                if nSamples is None and dsGen is not None:
                    N = dsGen[trans_s].total  # use the number of samples from the generating dataset
                    # Note: total() accounts for other intermediate-measurment branches automatically
                else:
                    try:
                        N = nSamples[k]  # try to treat nSamples as a list
                    except TypeError:  # BUGFIX: was a bare `except:` (swallowed e.g. KeyboardInterrupt)
                        N = nSamples  # if not indexable, nSamples should be a single number

                nWeightedSamples = N
                counts = {}  # don't use an ordered dict here - add_count_dict will sort keys
                labels = [ol for ol, _ in sorted(list(ps.items()), key=lambda x: x[1])]
                # "outcome labels" - sort by prob for consistent generation
                if sampleError == "binomial":
                    if len(labels) == 1:  # Special case when labels[0] == 1.0 (100%)
                        counts[labels[0]] = nWeightedSamples
                    else:
                        assert(len(labels) == 2)
                        ol0, ol1 = labels[0], labels[1]
                        counts[ol0] = rndm.binomial(nWeightedSamples, ps[ol0])
                        counts[ol1] = nWeightedSamples - counts[ol0]
                elif sampleError == "multinomial":
                    countsArray = rndm.multinomial(nWeightedSamples,
                                                   [ps[ol] for ol in labels], size=1)  # well-ordered list of probs
                    for i, ol in enumerate(labels):
                        counts[ol] = countsArray[0, i]
                else:
                    for outcomeLabel, p in ps.items():
                        pc = _np.clip(p, 0, 1)  # Note: *not* used in "none" case
                        if sampleError == "none":
                            counts[outcomeLabel] = float(nWeightedSamples * p)
                        elif sampleError == "clip":
                            counts[outcomeLabel] = float(nWeightedSamples * pc)
                        elif sampleError == "round":
                            counts[outcomeLabel] = int(round(nWeightedSamples * pc))
                        else:
                            # BUGFIX: message previously omitted the valid 'clip' option
                            raise ValueError(
                                "Invalid sample error parameter: '%s' "
                                "Valid options are 'none', 'clip', 'round', "
                                "'binomial', or 'multinomial'" % sampleError
                            )
                counts_list.append(counts)

            if times is None:
                assert(len(counts_list) == 1)
                dataset.add_count_dict(s, counts_list[0], recordZeroCnts=recordZeroCnts)
            else:
                dataset.add_series_data(s, counts_list, times, recordZeroCnts=recordZeroCnts)

        dataset.done_adding_data()

    if comm is not None:  # broadcast to non-root procs
        dataset = comm.bcast(dataset if (comm.Get_rank() == 0) else None, root=0)
    return dataset
def merge_outcomes(dataset, label_merge_dict, recordZeroCnts=True):
    """
    Build a new DataSet in which groups of outcomes of `dataset` are summed
    into single outcomes -- e.g. to collapse a 2-qubit 4-outcome DataSet
    down to a 1-qubit 2-outcome one.

    Parameters
    ----------
    dataset : DataSet object
        The input DataSet whose results will be simplified according to the rules
        set forth in label_merge_dict

    label_merge_dict : dictionary
        Keys are the outcome labels of the new DataSet; each value is the list
        of original outcome labels whose counts are summed to produce that new
        outcome.  E.g. to aggregate out the second qubit of a two-qubit DataSet
        with outcomes "00", "01", "10", "11", use
        ``{'0': ['00','01'], '1': ['10','11']}``.  Consider
        :function:`filter_qubits` instead if the operation sequences should be
        updated as well.

    recordZeroCnts : bool, optional
        Whether zero-counts are stored in the returned (merged) DataSet.  If
        False, zero counts are dropped, except for potentially registering new
        outcome labels.

    Returns
    -------
    merged_dataset : DataSet object
        The DataSet with outcomes merged according to the rules given in label_merge_dict.
    """
    # Normalize keys and values of label_merge_dict to tuple outcome labels
    to_outcome = _ld.OutcomeLabelDict.to_outcome  # shorthand
    label_merge_dict = {to_outcome(new): [to_outcome(old) for old in olds]
                        for new, olds in label_merge_dict.items()}
    new_outcomes = label_merge_dict.keys()
    merged_dataset = _ds.DataSet(outcomeLabels=new_outcomes)

    # Every outcome of the original dataset must appear in some merge group
    covered_old_outcomes = [old for olds in label_merge_dict.values() for old in olds]
    if not set(dataset.get_outcome_labels()).issubset(covered_old_outcomes):
        raise ValueError(
            "`label_merge_dict` must account for all the outcomes in original dataset."
            " It's missing directives for:\n%s" %
            '\n'.join(set(map(str, dataset.get_outcome_labels())) - set(map(str, covered_old_outcomes)))
        )

    # Merge counts timestamp-by-timestamp so time-series data is preserved.
    for circuit in dataset.keys():
        times, linecounts = dataset[circuit].get_timeseries()
        merged_count_dicts = []
        for cnts in linecounts:
            merged = {new: sum(cnts.get(old, 0) for old in label_merge_dict[new])
                      for new in new_outcomes}
            if recordZeroCnts is False:
                merged = {new: n for new, n in merged.items() if n != 0}
            merged_count_dicts.append(merged)
        merged_dataset.add_series_data(circuit, merged_count_dicts, times,
                                       aux=dataset[circuit].aux)

    merged_dataset.done_adding_data()
    return merged_dataset
def create_qubit_merge_dict(nQubits, qubits_to_keep):
    """
    Build a dictionary suitable for :function:`merge_outcomes` that keeps only
    the outcomes of `qubits_to_keep`, aggregating over all other qubits, when
    the outcome labels are `nQubits`-character bit strings.

    Parameters
    ----------
    nQubits : int
        The total number of qubits

    qubits_to_keep : list
        A list of integers specifying which qubits should be kept, that is,
        *not* aggregated, when the returned dictionary is passed to
        `merge_outcomes`.

    Returns
    -------
    dict
    """
    # All 2**nQubits bit strings, e.g. '00', '01', '10', '11' for nQubits == 2
    all_outcomes = [''.join(str(bit) for bit in bits)
                    for bits in _itertools.product([0, 1], repeat=nQubits)]
    return create_merge_dict(qubits_to_keep, all_outcomes)
def create_merge_dict(indices_to_keep, outcome_labels):
    """
    Build a dictionary suitable for :function:`merge_outcomes` that aggregates
    all but the character positions given by `indices_to_keep`.

    Each element of `outcome_labels` should be an n-character string (or a
    1-tuple holding one).  Each key of the returned dictionary is the string
    obtained by keeping only the characters at `indices_to_keep`; the
    corresponding value lists every original outcome label that reduces to
    that key.

    For example, if `outcome_labels == ['00','01','10','11']` and
    `indices_to_keep == [1]` then this function returns the dict
    `{'0': ['00','10'], '1': ['01','11'] }`.

    Note: if the elements of `outcome_labels` are 1-tuples then so are the
    elements of the returned dictionary's values.

    Parameters
    ----------
    indices_to_keep : list
        A list of integer indices specifying which character positions should be
        kept (i.e. *not* aggregated together by `merge_outcomes`).

    outcome_labels : list
        A list of the outcome labels to potentially merge. This can be a list
        of strings or of 1-tuples containing strings.

    Returns
    -------
    dict
    """
    groups = _collections.defaultdict(list)
    for lbl in outcome_labels:
        if _compat.isstr(lbl):
            key = ''.join(lbl[i] for i in indices_to_keep)
        else:
            assert(len(lbl) == 1)  # expect a 1-tuple containing the string
            key = (''.join(lbl[0][i] for i in indices_to_keep),)  # keep tuple-ness
        groups[key].append(lbl)
    return dict(groups)  # return a *normal* dict
def filter_dataset(dataset, sectors_to_keep, sindices_to_keep=None,
                   new_sectors=None, idle=((),), recordZeroCnts=True,
                   filtercircuits=True):
    """
    Build the restriction of `dataset` to the sectors named by
    `sectors_to_keep`.

    Outcomes that differ only in sectors (usually qubits) *not* in
    `sectors_to_keep` are aggregated (summed), and operation labels that act
    solely on removed sectors are dropped from the circuits (a gate whose
    `.sslbls` is None acts on *all* sectors and is kept).

    "Sectors" are state-space labels appearing in the operation sequences of
    `dataset`, each corresponding to one character position of the outcome
    labels, which must all be 1-tuples containing a single n-character string.
    Integer state-space labels can double as letter positions, in which case
    `sindices_to_keep` may be omitted.  `new_sectors` optionally renames the
    kept labels in the returned DataSet's circuits, e.g. to "rebase" qubits
    [4,5,6] onto [0,1,2].

    Parameters
    ----------
    dataset : DataSet object
        The input DataSet whose data will be processed.

    sectors_to_keep : list or tuple
        The state-space labels (strings or integers) of the "sectors" to keep in
        the returned DataSet.

    sindices_to_keep : list or tuple, optional
        The 0-based positions, within each outcome string, of the letters
        corresponding to `sectors_to_keep`.  May be left as `None` when the
        state-space labels are integers that are also letter positions.  E.g.
        with outcome strings '00','01','10','11' where position 0 is qubit
        "Q1" and position 1 is qubit "Q2", extracting "Q2" data requires
        `sectors_to_keep == ["Q2"]` and `sindices_to_keep == [1]`.

    new_sectors : list or tuple, optional
        New sector names onto which `sectors_to_keep` are mapped in the output
        DataSet's circuits; None leaves the labels unchanged.

    idle : string or Label, optional
        The operation label to be used when there are no kept components of a
        "layer" (element) of a circuit.

    recordZeroCnts : bool, optional
        Whether zero-counts present in the original `dataset` are stored in
        the returned (filtered) DataSet.  If False, such zero counts are
        dropped, except for potentially registering new outcome labels.

    filtercircuits : bool, optional
        Whether or not to "filter" the circuits, by removing gates that act
        outside of the `sectors_to_keep`.

    Returns
    -------
    filtered_dataset : DataSet object
        The DataSet with outcomes and operation sequences filtered as described above.
    """
    if sindices_to_keep is None:
        sindices_to_keep = sectors_to_keep  # integer labels double as positions

    # Aggregate outcomes over the discarded letter positions
    merge_rules = create_merge_dict(sindices_to_keep, dataset.get_outcome_labels())
    filtered = dataset.merge_outcomes(merge_rules, recordZeroCnts=recordZeroCnts)
    filtered = filtered.copy_nonstatic()

    if filtercircuits:
        # Strip gates acting outside the kept sectors (and optionally rename)
        filtered.process_circuits(
            lambda c: _gstrc.filter_circuit(c, sectors_to_keep, new_sectors, idle),
            aggregate=True)

    filtered.done_adding_data()
    return filtered
def trim_to_constant_numtimesteps(ds):
    """
    Returns a new dataset that has data for the same number of time steps for
    every circuit. This is achieved by discarding all time-series data for every
    circuit with a time step index beyond 'min-time-step-index', where
    'min-time-step-index' is the minimum number of time steps over circuits.

    Parameters
    ----------
    ds : DataSet
        The dataset to trim.

    Returns
    -------
    DataSet
        The trimmed dataset, obtained by potentially discarding some of the data.
    """
    trimmedds = ds.copy_nonstatic()
    numtimes = [ds[circuit].get_number_of_times() for circuit in ds.keys()]
    # BUGFIX: guard the empty-dataset case -- `min([])` raises ValueError.
    if numtimes:
        minnumtimes = min(numtimes)
        for circuit in ds.keys():
            times, series = ds[circuit].get_timeseries()
            # Keep only the first `minnumtimes` time steps for this circuit
            trimmedds.add_series_data(circuit, series[0:minnumtimes],
                                      times[0:minnumtimes], aux=ds.auxInfo[circuit])
    trimmedds.done_adding_data()
    return trimmedds