/
dataset.py
2493 lines (2098 loc) · 102 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
""" Defines the DataSet class and supporting classes and functions """
#***************************************************************************************************
# Copyright 2015, 2019 National Technology & Engineering Solutions of Sandia, LLC (NTESS).
# Under the terms of Contract DE-NA0003525 with NTESS, the U.S. Government retains certain rights
# in this software.
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0 or in the LICENSE file in the root pyGSTi directory.
#***************************************************************************************************
import numpy as _np
import numbers as _numbers
import uuid as _uuid
#import scipy.special as _sps
#import scipy.fftpack as _fft
#from scipy.integrate import quad as _quad
#from scipy.interpolate import interp1d as _interp1d
import pickle as _pickle
import copy as _copy
import warnings as _warnings
import bisect as _bisect
import itertools as _itertools
from collections import OrderedDict as _OrderedDict
from collections import defaultdict as _defaultdict
from ..tools import listtools as _lt
from ..tools import compattools as _compat
from . import circuit as _cir
from . import labeldicts as _ld
#from . import dataset as _ds
Oindex_type = _np.uint32
Time_type = _np.float64
Repcount_type = _np.float32
DATAROW_AUTOCACHECOUNT_THRESHOLD = 256
# thought: _np.uint16 but doesn't play well with rescaling
class DataSetKVIterator(object):
    """ Iterator class for op_string,DataSetRow pairs of a DataSet """

    def __init__(self, dataset, strip_occurrence_tags=False):
        self.dataset = dataset

        # Key iterator: optionally strips trailing "#<n>" occurrence tags.
        if strip_occurrence_tags:
            self.gsIter = map(DataSet.strip_occurence_tag, dataset.cirIndex.keys())
        else:
            self.gsIter = iter(dataset.cirIndex)

        oli_rows = dataset.oliData
        time_rows = dataset.timeData
        rep_rows = dataset.repData
        aux = dataset.auxInfo

        def lookup_cache(opstr):
            # Only static datasets maintain per-circuit count caches.
            return dataset.cnt_cache[opstr] if dataset.bStatic else None

        # Note: `idx` below is an integer index for a non-static dataset
        # and a slice for a static one.
        self.tupIter = (
            (oli_rows[idx], time_rows[idx],
             None if rep_rows is None else rep_rows[idx],
             lookup_cache(opstr), aux[opstr])
            for opstr, idx in dataset.cirIndex.items()
        )

    def __iter__(self):
        return self

    def __next__(self):
        return next(self.gsIter), DataSetRow(self.dataset, *next(self.tupIter))
    next = __next__
class DataSetValueIterator(object):
    """ Iterator class for DataSetRow values of a DataSet """

    def __init__(self, dataset):
        self.dataset = dataset

        oli_rows = dataset.oliData
        time_rows = dataset.timeData
        rep_rows = dataset.repData
        aux = dataset.auxInfo

        def lookup_cache(opstr):
            # Only static datasets maintain per-circuit count caches.
            return dataset.cnt_cache[opstr] if dataset.bStatic else None

        # Note: `idx` below is an integer index for a non-static dataset
        # and a slice for a static one.
        self.tupIter = (
            (oli_rows[idx], time_rows[idx],
             None if rep_rows is None else rep_rows[idx],
             lookup_cache(opstr), aux[opstr])
            for opstr, idx in dataset.cirIndex.items()
        )

    def __iter__(self):
        return self

    def __next__(self):
        return DataSetRow(self.dataset, *next(self.tupIter))
    next = __next__
class DataSetRow(object):
    """
    Encapsulates DataSet time series data for a single circuit. Outwardly
    looks similar to a list with `(outcome_label, time_index, repetition_count)`
    tuples as the values.
    """

    def __init__(self, dataset, row_oli_data, row_time_data, row_rep_data,
                 cached_cnts, aux):
        self.dataset = dataset
        self.oli = row_oli_data  # outcome-label indices (keys of dataset.ol)
        self.time = row_time_data  # one timestamp per element of self.oli
        self.reps = row_rep_data  # repetition counts, or None => 1 rep each
        self._cntcache = cached_cnts  # count cache (static datasets only)
        self.aux = aux  # auxiliary-information dict for this circuit

    @property
    def outcomes(self):
        """
        Returns this row's sequence of outcome labels, one per "bin" of repetition
        counts (returned by :method:`get_counts`).
        """
        return [self.dataset.ol[i] for i in self.oli]

    @outcomes.setter
    def outcomes(self, value):
        raise ValueError("outcomes property is read-only")

    def get_expanded_ol(self):
        """
        Returns this row's sequence of outcome labels, with repetition counts
        expanded, so there's one element in the returned list for *each* count.
        """
        if self.reps is not None:
            ol = []
            for oli, _, nreps in zip(self.oli, self.time, self.reps):
                nreps = _round_int_repcnt(nreps)
                ol.extend([self.dataset.ol[oli]] * nreps)
            return ol
        else: return self.outcomes

    def get_expanded_oli(self):
        """
        Returns this row's sequence of outcome label indices, with repetition counts
        expanded, so there's one element in the returned list for *each* count.
        """
        if self.reps is not None:
            inds = []
            for oli, _, nreps in zip(self.oli, self.time, self.reps):
                nreps = _round_int_repcnt(nreps)
                inds.extend([oli] * nreps)
            return _np.array(inds, dtype=self.dataset.oliType)
        else: return self.oli.copy()

    def get_expanded_times(self):
        """
        Returns this row's sequence of time stamps, with repetition counts
        expanded, so there's one element in the returned list for *each* count.
        """
        if self.reps is not None:
            times = []
            for _, time, nreps in zip(self.oli, self.time, self.reps):
                nreps = _round_int_repcnt(nreps)
                times.extend([time] * nreps)
            return _np.array(times, dtype=self.dataset.timeType)
        else: return self.time.copy()

    def get_times(self):
        """
        Returns a list containing the unique data collection times
        at which there is at least one measurement result.
        """
        # Assumes self.time is sorted into contiguous runs of equal timestamps.
        times = []
        last_time = None
        for t in self.time:
            if t != last_time:
                times.append(t)
                last_time = t
        return times

    def get_timeseries_for_outcomes(self):
        """
        Returns data in a time-series format. This can be a much less
        succinct format than returned by `get_timeseries`. E.g., it is
        highly inefficient for many-qubit data.

        Returns
        -------
        times : list
            The time steps, containing the unique data collection times.

        reps : dict
            A dictionary of lists containing the number of times each
            measurement outcome was observed at the unique data collection
            times in `times`.
        """
        times = []
        last_time = None
        seriesDict = {self.dataset.olIndex[ol]: [] for ol in self.dataset.get_outcome_labels()}

        if self.reps is None:
            reps = _np.ones(len(self.time), int)
        else: reps = self.reps

        for t, oli, rep in zip(self.time, self.oli, reps):

            if t != last_time:
                # New time step: start a new bin (count for `oli`, 0 elsewhere).
                times.append(t)
                last_time = t

                for sd_oli in seriesDict.keys():
                    if sd_oli == oli: seriesDict[sd_oli].append(rep)
                    else: seriesDict[sd_oli].append(0)
            else:
                seriesDict[oli][-1] += rep

        return times, {ol: seriesDict[oli] for ol, oli in self.dataset.olIndex.items()}

    def get_timeseries(self):
        """
        Returns data in a time-series format.

        Returns
        -------
        times : list
            The time steps, containing the unique data collection times.

        reps : list
            A list of dictionaries containing the counts dict corresponding
            to the list of unique data collection times in `times`.
        """
        times = []
        series = []
        last_time = None

        if self.reps is None:
            reps = list(_np.ones(len(self.time), int))
        else: reps = self.reps

        for t, outcome_label, rep in zip(self.time, self.outcomes, reps):

            if t != last_time:
                times.append(t)
                last_time = t
                series.append({outcome_label: rep})

            else:
                if outcome_label in series[-1]:
                    series[-1][outcome_label] += rep
                else:
                    series[-1][outcome_label] = rep

        return times, series

    def get_reps_timeseries(self):
        """
        The number of measurement results at each data collection time.

        Returns
        -------
        times : list
            The time steps.

        reps : list
            The total number of counts at each time step.
        """
        times = []
        reps = []
        last_time = None

        if self.reps is None:
            return list(self.time), list(_np.ones(len(self.time), int))

        else:
            for t, rep in zip(self.time, self.reps):
                if t != last_time:
                    times.append(t)
                    last_time = t
                    reps.append(rep)
                else:
                    reps[-1] += rep

            return times, reps

    def get_number_of_times(self):
        """
        Returns the number of data collection times.
        """
        return len(self.get_times())

    def has_constant_totalcounts(self):
        """
        Returns True if the numbers of counts is the same at
        all data collection times. Otherwise returns False.
        """
        times, reps = self.get_reps_timeseries()
        firstrep = reps[0]
        fixedtotalcounts = all([firstrep == i for i in reps])

        return fixedtotalcounts

    def get_totalcounts_per_timestep(self):
        """
        Returns the number of total counts per time-step, when this
        is constant. If it varies over the times that there is at least
        one measurement result for then this function will raise an error.
        """
        times, reps = self.get_reps_timeseries()
        firstrep = reps[0]
        assert(all([firstrep == i for i in reps])), "The total counts is not the same at all time steps!"
        return firstrep

    def get_meantimestep(self):
        """
        Returns the mean time-step. Will raise an error for data that is
        a trivial time-series (i.e., data all at one time).
        """
        times = _np.array(self.get_times())
        assert(len(times) >= 2), "Mean time-step is ill-defined when there is not multiple data times!"

        return _np.mean(_np.diff(times))

    def __iter__(self):
        # Yields (outcome_label, time, repetition_count) triples.
        if self.reps is not None:
            return ((self.dataset.ol[i], t, n) for (i, t, n) in zip(self.oli, self.time, self.reps))
        else:
            return ((self.dataset.ol[i], t, 1) for (i, t) in zip(self.oli, self.time))

    def __contains__(self, outcome_label):
        """ Checks whether data counts for `outcomelabel` are available."""
        return outcome_label in self.counts

    def __getitem__(self, index_or_outcome_label):
        if isinstance(index_or_outcome_label, _numbers.Integral):  # raw index
            i = index_or_outcome_label
            if self.reps is not None:
                return (self.dataset.ol[self.oli[i]], self.time[i], self.reps[i])
            else:
                return (self.dataset.ol[self.oli[i]], self.time[i], 1)
        elif isinstance(index_or_outcome_label, _numbers.Real):  # timestamp
            return self.counts_at_time(index_or_outcome_label)
        else:
            if len(self.dataset.olIndex) > DATAROW_AUTOCACHECOUNT_THRESHOLD:
                #There are a lot of outcomes in this dataset - it's not worth computing
                # and caching *all* of the counts just to extract the one being asked for now.
                outcome_label = _ld.OutcomeLabelDict.to_outcome(index_or_outcome_label)
                if outcome_label not in self.dataset.olIndex:
                    raise KeyError("%s is not an index, timestamp, or outcome label!"
                                   % str(index_or_outcome_label))
                return self._get_single_count(outcome_label)
            else:
                #Compute and cache *all* of the counts, since there aren't so many of them.
                try:
                    return self.counts[index_or_outcome_label]
                except KeyError:
                    # if outcome label isn't in counts but *is* in the dataset's
                    # outcome labels then return 0 (~= return self.allcounts[...])
                    key = _ld.OutcomeLabelDict.to_outcome(index_or_outcome_label)
                    if key in self.dataset.get_outcome_labels(): return 0
                    raise KeyError("%s is not an index, timestamp, or outcome label!"
                                   % str(index_or_outcome_label))

    def __setitem__(self, index_or_outcome_label, val):
        if isinstance(index_or_outcome_label, _numbers.Integral):
            index = index_or_outcome_label; tup = val
            assert(len(tup) in (2, 3)), "Must set to a (<outcomeLabel>,<time>[,<repetitions>]) value"
            ol = _ld.OutcomeLabelDict.to_outcome(tup[0])  # strings -> tuple outcome labels
            self.oli[index] = self.dataset.olIndex[ol]
            self.time[index] = tup[1]

            if self.reps is not None:
                self.reps[index] = tup[2] if len(tup) == 3 else 1
            else:
                assert(len(tup) == 2 or tup[2] == 1), "Repetitions must == 1 (not tracking reps)"
        else:
            outcomeLbl = _ld.OutcomeLabelDict.to_outcome(index_or_outcome_label)  # strings -> tuple outcome labels
            count = val

            assert(all([t == self.time[0] for t in self.time])), \
                "Cannot set outcome counts directly on a DataSet with non-trivially timestamped data"
            assert(self.reps is not None), \
                "Cannot set outcome counts directly on a DataSet without repetition data"

            outcomeIndxToLookFor = self.dataset.olIndex.get(outcomeLbl, None)
            for i, outcomeIndx in enumerate(self.oli):
                if outcomeIndx == outcomeIndxToLookFor:
                    self.reps[i] = count; break
            else:  # need to add a new label & entry to reps[]
                raise NotImplementedError("Cannot create new outcome labels by assignment")

    def _get_single_count(self, outcome_label, timestamp=None):
        # Count (or summed reps) for one outcome label, optionally restricted
        # to times numerically close to `timestamp`.
        if timestamp is not None:
            tslc = _np.where(_np.isclose(self.time, timestamp))[0]
        else: tslc = slice(None)

        if self.reps is None:
            i = self.dataset.olIndex[outcome_label]
            return float(_np.count_nonzero(_np.equal(self.oli[tslc], i)))
        else:
            i = self.dataset.olIndex[outcome_label]
            inds = _np.nonzero(_np.equal(self.oli[tslc], i))[0]
            if len(inds) > 0:
                return float(sum(self.reps[tslc][inds]))
            else:
                return 0.0

    def _get_counts(self, timestamp=None, all_outcomes=False):
        """
        Returns this row's sequence of "repetition counts", that is, the number of
        repetitions of each outcome label in the `outcomes` list, or
        equivalently, each outcome label index in this rows `.oli` member.
        """
        #Note: when all_outcomes == False we don't add outcome labels that
        # aren't present for any of this row's elements (i.e. the #summed
        # is zero)
        cntDict = _ld.OutcomeLabelDict()
        if timestamp is not None:
            tslc = _np.where(_np.isclose(self.time, timestamp))[0]
        else: tslc = slice(None)

        nOutcomes = len(self.dataset.olIndex)
        nIndices = len(self.oli[tslc])
        if nOutcomes <= nIndices or all_outcomes:
            # Loop over outcome labels (fewer of them than data elements).
            if self.reps is None:
                for ol, i in self.dataset.olIndex.items():
                    cnt = float(_np.count_nonzero(_np.equal(self.oli[tslc], i)))
                    if all_outcomes or cnt > 0:
                        cntDict.set_unsafe(ol, cnt)
            else:
                for ol, i in self.dataset.olIndex.items():
                    inds = _np.nonzero(_np.equal(self.oli[tslc], i))[0]
                    if all_outcomes or len(inds) > 0:
                        cntDict.set_unsafe(ol, float(sum(self.reps[tslc][inds])))
        else:
            # Loop over data elements (fewer of them than outcome labels).
            if self.reps is None:
                for ol_index in self.oli[tslc]:
                    ol = self.dataset.ol[ol_index]
                    cntDict.set_unsafe(ol, 1.0 + cntDict.get_unsafe(ol, 0.0))
            else:
                for ol_index, reps in zip(self.oli[tslc], self.reps[tslc]):
                    ol = self.dataset.ol[ol_index]
                    cntDict.set_unsafe(ol, reps + cntDict.get_unsafe(ol, 0.0))

        return cntDict

    @property
    def counts(self):
        # Aggregated {outcome_label: count} dict; uses/fills the per-row
        # cache when this row belongs to a static DataSet.
        if self._cntcache: return self._cntcache  # if not None *and* len > 0
        ret = self._get_counts()
        if self._cntcache is not None:  # == and empty dict {}
            self._cntcache.update(ret)
        return ret

    @property
    def allcounts(self):
        # Like `counts`, but includes zero-count outcome labels (uncached).
        return self._get_counts(all_outcomes=True)

    @property
    def fractions(self, all_outcomes=False):
        """
        An ordered dictionary mapping each outcome label to the *fraction*
        of this row's total counts having that label.  (As a property this
        is always evaluated with `all_outcomes=False`.)
        """
        # Bugfix: `all_outcomes` was previously passed *positionally* to
        # _get_counts, landing in its `timestamp` parameter; since
        # `False is not None`, the row was silently filtered to times ~= 0.
        cnts = self._get_counts(all_outcomes=all_outcomes)
        total = sum(cnts.values())
        return _OrderedDict([(k, cnt / total) for k, cnt in cnts.items()])

    @property
    def total(self):
        """ Returns the total number of counts contained in this row."""
        if self.reps is None:
            return float(len(self.oli))
        else:
            return sum(self.reps)

    #TODO: remove in favor of fractions property?
    def fraction(self, outcomelabel):
        """ Returns the fraction of total counts for `outcomelabel`."""
        d = self.counts
        if outcomelabel not in d:
            return 0.0  # Note: similar to an "all_outcomes=True" default
        total = sum(d.values())
        return d[outcomelabel] / total

    def counts_at_time(self, timestamp):
        """ Returns a dictionary of counts at a particular time """
        return self._get_counts(timestamp)

    def timeseries(self, outcomelabel, timestamps=None):
        """
        Returns timestamps and counts for a single outcome label
        or for aggregated counts if `outcomelabel == "all"`.

        Parameters
        ----------
        outcomelabel : str or tuple
            The outcome label to extract a series for.  If the special value
            `"all"` is used, total (aggregated over all outcomes) counts are
            returned.

        timestamps : list or array, optional
            If not None, an array of time stamps to extract counts for,
            which will also be returned as `times`.  Times at which
            there is no data will be returned as zero-counts.

        Returns
        -------
        times, counts : numpy.ndarray
        """
        if outcomelabel == 'all':
            olis = list(self.dataset.olIndex.values())
        else:
            outcomelabel = _ld.OutcomeLabelDict.to_outcome(outcomelabel)
            olis = [self.dataset.olIndex[outcomelabel]]

        times = []
        counts = []
        last_t = -1e100
        tsIndx = 0
        for i, (t, oli) in enumerate(zip(self.time, self.oli)):
            if timestamps is not None:
                # Emit zero-counts for requested timestamps with no data.
                while tsIndx < len(timestamps) and t > timestamps[tsIndx] \
                        and not _np.isclose(t, timestamps[tsIndx], rtol=0., atol=1e-12):
                    times.append(timestamps[tsIndx])
                    counts.append(0)
                    tsIndx += 1

            if oli in olis and (timestamps is None or _np.isclose(t, timestamps[tsIndx], rtol=0., atol=1e-12)):
                if not _np.isclose(t, last_t, rtol=0., atol=1e-12):
                    times.append(t); tsIndx += 1
                    counts.append(0)
                    last_t = t
                counts[-1] += 1 if (self.reps is None) else self.reps[i]

        if timestamps is not None:
            # Pad out any remaining requested timestamps with zero-counts.
            while tsIndx < len(timestamps):
                times.append(timestamps[tsIndx])
                counts.append(0)
                tsIndx += 1

        return _np.array(times, self.dataset.timeType), \
            _np.array(counts, self.dataset.repType)

    def scale(self, factor):
        """ Scales all the counts of this row by the given factor """
        if self.dataset.bStatic: raise ValueError("Cannot scale rows of a *static* DataSet.")
        if self.reps is None:
            raise ValueError(("Cannot scale a DataSet without repetition "
                              "counts. Call DataSet.build_repetition_counts()"
                              " and try this again."))
        for i, cnt in enumerate(self.reps):
            self.reps[i] = cnt * factor

    def as_dict(self):
        """ Returns the (outcomeLabel,count) pairs as a dictionary."""
        return dict(self.counts)

    def to_str(self, mode="auto"):
        """
        Render this DataSetRow as a string.

        Parameters
        ----------
        mode : {"auto","time-dependent","time-independent"}
            Whether to display the data as time-series of outcome counts
            (`"time-dependent"`) or to report per-outcome counts aggregated over
            time (`"time-independent"`).  If `"auto"` is specified, then the
            time-independent mode is used only if all time stamps in the
            DataSetRow are equal (trivial time dependence).

        Returns
        -------
        str
        """
        if mode == "auto":
            if all([t == self.time[0] for t in self.time]):
                mode = "time-independent"
            else: mode = "time-dependent"

        assert(mode in ('time-dependent', 'time-independent')), "Invalid `mode` argument: %s" % mode

        if mode == "time-dependent":
            s = "Outcome Label Indices = " + str(self.oli) + "\n"
            s += "Time stamps = " + str(self.time) + "\n"
            if self.reps is not None:
                s += "Repetitions = " + str(self.reps) + "\n"
            else:
                s += "( no repetitions )\n"
            return s
        else:  # time-independent
            return str(self.as_dict())

    def __str__(self):
        return self.to_str()

    def __len__(self):
        return len(self.oli)
def _round_int_repcnt(nreps):
""" Helper function to localize warning message """
if float(nreps).is_integer():
return int(nreps)
else:
_warnings.warn("Rounding fractional repetition count to next lowest whole number!")
return int(round(nreps))
class DataSet(object):
"""
The DataSet class associates circuits with counts or time series of
counts for each outcome label, and can be thought of as a table with gate
strings labeling the rows and outcome labels and/or time labeling the
columns. It is designed to behave similarly to a dictionary of
dictionaries, so that counts are accessed by:
`count = dataset[circuit][outcomeLabel]`
in the time-independent case, and in the time-dependent case, for *integer*
time index `i >= 0`,
`outcomeLabel = dataset[circuit][i].outcome`
`count = dataset[circuit][i].count`
`time = dataset[circuit][i].time`
"""
@classmethod
def strip_occurence_tag(cls, circuit):
return circuit[:-1] if (len(circuit) > 0 and circuit[-1].name.startswith("#")) else circuit
    def __init__(self, oli_data=None, time_data=None, rep_data=None,
                 circuits=None, circuit_indices=None,
                 outcome_labels=None, outcome_label_indices=None,
                 static=False, file_to_load_from=None, collision_action="aggregate",
                 comment=None, aux_info=None):
        """
        Initialize a DataSet.

        Parameters
        ----------
        oli_data : list or numpy.ndarray
            When `static == True`, a 1D numpy array containing outcome label
            indices (integers), concatenated for all sequences.  Otherwise, a
            list of 1D numpy arrays, one array per gate sequence.  In either
            case, this quantity is indexed by the values of `circuit_indices`
            or the index of `circuits`.

        time_data : list or numpy.ndarray
            Same format as `oli_data` except stores floating-point timestamp
            values.

        rep_data : list or numpy.ndarray
            Same format as `oli_data` except stores integer repetition counts
            for each "data bin" (i.e. (outcome,time) pair).  If all repetitions
            equal 1 ("single-shot" timestamped data), then `rep_data` can be
            `None` (no repetitions).

        circuits : list of (tuples or Circuits)
            Each element is a tuple of operation labels or a Circuit object.  Indices for these strings
            are assumed to ascend from 0.  These indices must correspond to the time series of spam-label
            indices (above).   Only specify this argument OR circuit_indices, not both.

        circuit_indices : ordered dictionary
            An OrderedDict with keys equal to circuits (tuples of operation labels) and values equal to
            integer indices associating a row/element of counts with the circuit.  Only
            specify this argument OR circuits, not both.

        outcome_labels : list of strings or int
            Specifies the set of spam labels for the DataSet.  Indices for the spam labels
            are assumed to ascend from 0, starting with the first element of this list.  These
            indices will associate each element of `timeseries` with a spam label.  Only
            specify this argument OR outcome_label_indices, not both.  If an int, specifies that
            the outcome labels should be those for a standard set of this many qubits.

        outcome_label_indices : ordered dictionary
            An OrderedDict with keys equal to spam labels (strings) and value equal to
            integer indices associating a spam label with given index.  Only
            specify this argument OR outcome_labels, not both.

        static : bool
            When True, create a read-only, i.e. "static" DataSet which cannot be modified. In
              this case you must specify the timeseries data, circuits, and spam labels.
            When False, create a DataSet that can have time series data added to it.  In this case,
              you only need to specify the spam labels.

        file_to_load_from : string or file object
            Specify this argument and no others to create a static DataSet by loading
            from a file (just like using the load(...) function).

        collision_action : {"aggregate","overwrite","keepseparate"}
            Specifies how duplicate circuits should be handled.  "aggregate"
            adds duplicate-sequence counts to the same circuit's data at the
            next integer timestamp.  "overwrite" only keeps the latest given
            data for a circuit.  "keepseparate" tags duplicate-sequences by
            appending a final "#<number>" operation label to the duplicated gate
            sequence, which can then be accessed via the `get_row` and `set_row`
            functions.

        comment : string, optional
            A user-specified comment string that gets carried around with the
            data.  A common use for this field is to attach to the data details
            regarding its collection.

        aux_info : dict, optional
            A user-specified dictionary of per-circuit auxiliary information.
            Keys should be the circuits in this DataSet and value should
            be Python dictionaries.

        Returns
        -------
        DataSet
           a new data set object.
        """
        # uuid for efficient hashing (set when done adding data or loading from file)
        self.uuid = None

        #Optionally load from a file
        if file_to_load_from is not None:
            # loading from file is mutually exclusive with every other data argument
            assert(oli_data is None and time_data is None and rep_data is None
                   and circuits is None and circuit_indices is None
                   and outcome_labels is None and outcome_label_indices is None)
            self.load(file_to_load_from)
            return

        # self.cirIndex  :  Ordered dictionary where keys = Circuit objects,
        #   values = slices into oli, time, & rep arrays (static case) or
        #            integer list indices (non-static case)
        if circuit_indices is not None:
            self.cirIndex = _OrderedDict([(opstr if isinstance(opstr, _cir.Circuit) else _cir.Circuit(opstr), i)
                                          for opstr, i in circuit_indices.items()])
            #convert keys to Circuits if necessary
        elif not static:
            if circuits is not None:
                dictData = [(opstr if isinstance(opstr, _cir.Circuit) else _cir.Circuit(opstr), i)
                            for (i, opstr) in enumerate(circuits)]  # convert to Circuits if necessary
                self.cirIndex = _OrderedDict(dictData)
            else:
                self.cirIndex = _OrderedDict()
        else: raise ValueError("Must specify circuit_indices when creating a static DataSet")

        # self.olIndex  :  Ordered dictionary where
        #                  keys = outcome labels (strings or tuples),
        #                  values = integer indices mapping oli_data (integers) onto
        #                           the outcome labels.
        if outcome_label_indices is not None:
            self.olIndex = outcome_label_indices
            self.olIndex_max = max(self.olIndex.values()) if len(self.olIndex) > 0 else -1
        elif outcome_labels is not None:
            if isinstance(outcome_labels, int):
                # an integer means: use the standard 2^n bit-string outcomes for n qubits
                nqubits = outcome_labels
                tup_outcomeLabels = [("".join(x),) for x in _itertools.product(*([('0', '1')] * nqubits))]
            else:
                tup_outcomeLabels = [_ld.OutcomeLabelDict.to_outcome(ol)
                                     for ol in outcome_labels]  # strings -> tuple outcome labels
            self.olIndex = _OrderedDict([(ol, i) for (i, ol) in enumerate(tup_outcomeLabels)])
            self.olIndex_max = len(tup_outcomeLabels) - 1
        else:
            self.olIndex = _OrderedDict()  # OK, as outcome labels are added as they appear
            self.olIndex_max = -1

        # self.ol :  Ordered dictionary where keys = integer indices, values = outcome
        #            labels (strings or tuples) -- just the reverse of self.olIndex
        self.ol = _OrderedDict([(i, ol) for (ol, i) in self.olIndex.items()])

        # sanity checks that indices are >= 0
        if not static:  # otherwise values() below are slices
            if self.cirIndex: assert(min(self.cirIndex.values()) >= 0)
            if self.olIndex: assert(min(self.olIndex.values()) >= 0)

        # self.oliData : when static == True a 1D numpy array containing concatenated outcome label indices.
        #                when static == False a list of 1D numpy arrays, one array per gate sequence.

        # self.timeData : when static == True a 1D numpy array containing concatenated time stamps.
        #                 when static == False a list of 1D numpy arrays, one array per gate sequence.

        # self.repData : when static == True a 1D numpy array containing concatenated repetition counts.
        #                when static == False a list of 1D numpy arrays, one array per gate sequence.
        #                (can be None, in which case no repetitions are assumed)

        if oli_data is not None:

            # check that sizes/lengths all match
            assert(len(time_data) == len(oli_data)), "time_data must be same size as oli_data"
            if rep_data is not None:
                assert(len(rep_data) == len(oli_data)), "rep_data must be same size as oli_data"

            self.oliData = oli_data
            self.timeData = time_data
            self.repData = rep_data

            if len(self.cirIndex) > 0:
                maxOlIndex = self.olIndex_max
                if static:
                    assert(max([_np.amax(self.oliData[i]) if (len(self.oliData[i]) > 0) else 0
                                for i in self.cirIndex.values()]) <= maxOlIndex)
                    # self.oliData.shape[0] > maxIndex doesn't make sense since cirIndex holds slices
                else:
                    #Note: for non-static datasets, assume *all* data in self.oliData is "in" this data set, i.e.,
                    # it can't be that this is a truncated dataset with pointers to more data than it actually owns.
                    maxIndex = max(self.cirIndex.values())
                    assert(len(self.oliData) > maxIndex)
                    if len(self.oliData) > 0:
                        assert(all([max(oliSeries) <= maxOlIndex for oliSeries in self.oliData]))
            #else cirIndex has length 0 so there are no circuits in this dataset (even though oli_data can contain data)

        elif not static:
            assert(time_data is None), "time_data must be None when oli_data is"
            assert(rep_data is None), "rep_data must be None when oli_data is"
            assert(len(self.cirIndex) == 0), "circuit specified without data!"
            self.oliData = []
            self.timeData = []
            self.repData = None

        else:
            raise ValueError("Series data must be specified when creating a static DataSet")

        # self.bStatic
        self.bStatic = static

        # collision action
        assert(collision_action in ('aggregate', 'overwrite', 'keepseparate'))
        self.collisionAction = collision_action

        # comment
        self.comment = comment

        # self.ffdata : fourier filtering data
        self.ffdata = {}

        #data types - should stay in sync with MultiDataSet
        self.oliType = Oindex_type
        self.timeType = Time_type
        self.repType = Repcount_type

        #auxiliary info
        if aux_info is None:
            self.auxInfo = _defaultdict(dict)
        else:
            self.auxInfo = _defaultdict(dict, aux_info)

        # count cache (only used when static; not saved/loaded from disk)
        if static:
            self.cnt_cache = {opstr: _ld.OutcomeLabelDict() for opstr in self.cirIndex}
        else:
            self.cnt_cache = None
def __iter__(self):
return self.cirIndex.__iter__() # iterator over circuits
    def __len__(self):
        # Number of circuits (rows) held by this DataSet.
        return len(self.cirIndex)
def __contains__(self, circuit):
"""
Test whether data set contains a given circuit.
Parameters
----------
circuit : tuple or Circuit
A tuple of operation labels or a Circuit instance
which specifies the the circuit to check for.
Returns
-------
bool
whether circuit was found.
"""
if not isinstance(circuit, _cir.Circuit):
circuit = _cir.Circuit(circuit)
return circuit in self.cirIndex
def __hash__(self):
if self.uuid is not None:
return hash(self.uuid)
else:
raise TypeError('Use digest hash')
    def __getitem__(self, circuit):
        # Dict-style access: equivalent to get_row(circuit) with occurrence=0.
        return self.get_row(circuit)
def __setitem__(self, circuit, outcome_dict_or_series):
ca = self.collisionAction
self.collisionAction = 'overwrite' # overwrite data when assigning (this seems mose natural)
try:
ret = self.set_row(circuit, outcome_dict_or_series)
finally:
self.collisionAction = ca
return ret
def __delitem__(self, circuit):
if not isinstance(circuit, _cir.Circuit):
circuit = _cir.Circuit(circuit)
self._remove([self.cirIndex[circuit]])
def get_row(self, circuit, occurrence=0):
"""
Get a row of data from this DataSet. This gives the same
functionality as [ ] indexing except you can specify the
occurrence number separately from the gate sequence.
Parameters
----------
circuit : Circuit or tuple
The gate sequence to extract data for.
occurrence : int, optional
0-based occurrence index, specifying which occurrence of
a repeated gate sequence to extract data for.
Returns
-------
DataSetRow
"""
#Convert to circuit - needed for occurrence > 0 case and
# because name-only Labels still don't hash the same as strings
# so key lookups need to be done at least with tuples of Labels.
if not isinstance(circuit, _cir.Circuit):
circuit = _cir.Circuit.fromtup(circuit)
if occurrence > 0:
circuit = circuit + _cir.Circuit(("#%d" % occurrence,))
#Note: cirIndex value is either an int (non-static) or a slice (static)
repData = self.repData[self.cirIndex[circuit]] \
if (self.repData is not None) else None
return DataSetRow(self, self.oliData[self.cirIndex[circuit]],
self.timeData[self.cirIndex[circuit]], repData,
self.cnt_cache[circuit] if self.bStatic else None,
self.auxInfo[circuit])
def set_row(self, circuit, outcome_dict_or_series, occurrence=0):
"""
Set the counts for a row of this DataSet. This gives the same
functionality as [ ] indexing except you can specify the
occurrence number separately from the gate sequence.
Parameters
----------
circuit : Circuit or tuple
The gate sequence to extract data for.
countDict : dict
The dictionary of counts (data).
occurrence : int, optional
0-based occurrence index, specifying which occurrence of
a repeated gate sequence to extract data for.
"""
if not isinstance(circuit, _cir.Circuit):
circuit = _cir.Circuit(circuit)
if occurrence > 0:
circuit = _cir.Circuit(circuit) + _cir.Circuit(("#%d" % occurrence,))
if isinstance(outcome_dict_or_series, dict): # a dict of counts
self.add_count_dict(circuit, outcome_dict_or_series)
else: # a tuple of lists
assert(len(outcome_dict_or_series) >= 2), \
"Must minimally set with (outcome-label-list, time-stamp-list)"
self.add_raw_series_data(circuit, *outcome_dict_or_series)