packages/pygsti/algorithms/core.py

#*****************************************************************
#    pyGSTi 0.9:  Copyright 2015 Sandia Corporation              
#    This Software is released under the GPL license detailed    
#    in the file "license.txt" in the top-level pyGSTi directory 
#*****************************************************************
""" Core GST algorithms """

import sys as _sys
import numpy as _np
import scipy.optimize as _spo
import scipy.stats as _stats
import warnings as _warnings

from .. import optimize as _opt
from .. import tools as _tools
from .. import objects as _objs

#Note on where 4x4 or possibly other integral-qubit dimensions are needed:
# 1) Need to use the Jamiolkowski isomorphism to contract to CPTP or even gauge optimize to CPTP,
#       since we need to know a Choi matrix basis to perform the Jamiolkowski isomorphism
# 2) Need pauliVector <=> matrix in contractToValidSpam
# 3) Use the Jamiolkowski isomorphism in print_gateset_info(...)


###################################################################################
#                 Linear Inversion GST (LGST)
###################################################################################

def do_lgst(dataset, specs, targetGateset=None, gateLabels=None, gateLabelAliases={},
           spamDict=None, guessGatesetForGauge=None, svdTruncateTo=0, identityVec=None, verbosity=0):
  """
  Performs Linear-inversion Gate Set Tomography on the dataset.

  Parameters
  ----------
  dataset : DataSet
      The data used to generate the LGST estimates
      
  specs : 2-tuple
      A (rhoSpecs,ESpecs) tuple usually generated by calling build_spam_specs(...)
  
  targetGateset : GateSet, optional
      A gateset used to specify which gate labels should be estimated, a guess
       for the in which gauge these estimates should be returned, and the SPAM
       labels used to connect the dataset values to rhoVec and EVec indices.

  gateLabels : list, optional
      A list of which gate labels (or aliases) should be estimated.
      Defaults to the gate labels in targetGateset.
      e.g. ['Gi','Gx','Gy','Gx2']

  gateLabelAliases : dictionary, optional
      Dictionary whose keys are gate label "aliases" and whose values are tuples
      corresponding to what that gate label should be expanded into before querying
      the dataset.
      Defaults to the empty dictionary (no aliases defined)
      e.g. gateLabelAliases['Gx^3'] = ('Gx','Gx','Gx')

  spamDict : dictionary, optional
      Dictionary mapping (rhoVec_index,EVec_index) integer tuples to string spam labels.
      Defaults to the spam dictionary of targetGateset
      e.g. spamDict[(0,0)] == "plus"

  guessGatesetForGauge : GateSet, optional
      A gateset used to compute a gauge transformation that is applied to
      the LGST estimates before they are returned.  This gauge transformation
      is computed such that if the estimated gates matched the gateset given,
      then the gate matrices would match, i.e. the gauge would be the same as
      the gateset supplied.
      Defaults to targetGateset.

  svdTruncateTo : int, optional
      The Hilbert space dimension to truncate the gate matrices to using
      a SVD to keep only the largest svdToTruncateTo singular values of
      the I_tildle LGST matrix.
      Defaults to 0 (no truncation)
      
  identityVec : numpy array, optional
      The vectorized identity density matrix in whatever basis is being
      used.  Size should be [ dmDim^2, 1], e.g. numpy.array([[1.41],[0],[0],[0]]).
      Defaults to that of the targetGateset.

  verbosity : int, optional
      How much detail to send to stdout.

  Returns
  -------
  Gateset
      A gateset containing all of the estimated labels (or aliases)
  """

  #Notes:
  # We compute,
  # I_tilde = AB   (trunc,trunc), where trunc <= K = min(nRhoSpecs,nESpecs)
  # X_tilde = AXB  (trunc,trunc)
  # and  A, B for *target* gateset. (but target gateset may need dimension increase to get to trunc... and then A,B are rank deficient)
  # We would like to get X or it's gauge equivalent.
  #  We do:       1)  (I^-1)*AXB ~= B^-1 X B := Xhat -- we solve Ii*A*B = identity for Ii
  #               2) B * Xhat * B^-1 ==> X  (but what if B is non-invertible -- say rectangular) Want B*(something) ~ identity ??
     # for lower rank target gatesets, want a gauge tranformation that brings Xhat => X of "increased dim" gateset
     # want "B^-1" such that B(gsDim,nRhoSpecs) "B^-1"(nRhoSpecs,gsDim) ~ Identity(gsDim)
     #   Ub,sb,Vb = svd(B) so B = Ub*diag(sb)*Vb  where Ub = (gsDim,M), s = (M,M), Vb = (M,rhoSpecs)
     #   if B^-1 := VbT*sb^-1*Ub^-1 then B*B^-1 = I(gsDim)
     # similarly, can get want "A^-1" such that "A^-1"(gsDim,nESpecs) A(nESpecs,gsDim) ~ Identity(gsDim)
     # or do we want not Ii*A*B = I but B*Ii*A = I(gsDim), so something like Ii = (B^-1)(A^-1) using pseudoinversese above. 
     #   (but we can't do this, since we only have AB, not A and B separately)
  # A is (trunc, gsDim)
  # B is (gsDim, trunc)

  # With no svd truncation (but we always truncate; this is just for reference)
  # AXB     = (nESpecs, nRhoSpecs)
  # I (=AB) = (nESpecs, nRhoSpecs)
  # A       = (nESpecs, gsDim)
  # B       = (gsDim, nRhoSpecs)

  #Process input parameters
  rhoSpecs, ESpecs = specs
  K = min(len(rhoSpecs), len(ESpecs))

  if gateLabels is not None:
    gateLabelsToEstimate = gateLabels
  elif targetGateset is not None:
    gateLabelsToEstimate = targetGateset.keys()
  else: raise ValueError("do_lgst cannot determine gate labels to estimate from supplied parameters")

  if spamDict is None:
    if targetGateset is not None:
      spamDict = targetGateset.get_spam_label_dict()
    else: raise ValueError("do_lgst cannot determine SPAM dictionary from supplied parameters")

  if guessGatesetForGauge is None:
    guessGatesetForGauge = targetGateset # (which may also be None)

  if identityVec is None and guessGatesetForGauge is not None:
      identityVec = guessGatesetForGauge.identityVec
  if identityVec is None: #check again in case targetGateset.identityVec == None
    for (ir,ie) in spamDict.keys():
      if ie == -1 and ir != -1: #then identityVec is required b/c this spamlabel represents Evec = identityVec - sum(other_Evecs)
        raise ValueError("do_lgst cannot determine the identity vector from supplied parameters")
    #otherwise identityVec is not required, so OK if it's None


  #OLD
  #if guessGatesetForGauge is not None:  # apply svdTruncation necessary for gauge transformation if guessGatesetForGauge is given
  #  truncNeeded = len(guessGatesetForGauge.rhoVecs[0])
  #  if svdTruncateTo > 0 and svdTruncateTo != truncNeeded:
  #    raise ValueError("svdTruncateTo == %d is not equal to the gateset dimension (%d) used to guess the LGST gauge" % (svdTruncateTo,truncNeeded))
  #  svdTruncateTo = truncNeeded

  #assert(len(ESpecs) == len(rhoSpecs)) #specify the same number of rho's and E's (for now)
  lgstGateset = _objs.GateSet()

  #Create truncation projector -- just trims columns (Pj) or rows (Pjt) of a matrix.
  # note K = min(nRhoSpecs,nESpecs), and dot(Pjt,Pj) == identity(trunc)
  trunc = svdTruncateTo if svdTruncateTo > 0 else K
  assert(trunc <= K)
  Pj = _np.zeros( (K,trunc), 'd') # shape = (K, trunc) projector with only trunc columns
  for i in range(trunc): Pj[i,i] = 1.0
  Pjt = _np.transpose(Pj)         # shape = (trunc, K)

  ABMat = _constructAB(rhoSpecs, ESpecs, spamDict, dataset)  # shape = (nESpecs, nRhoSpecs)

  U,s,V = _np.linalg.svd(ABMat, full_matrices=False)  
  if verbosity > 2: print "LGST: Singular values of I_tilde (truncating to first %d of %d) = \n" % (trunc,len(s)) ,s
  Ud,Vd = _np.transpose(_np.conjugate(U)), _np.transpose(_np.conjugate(V))  # Udagger, Vdagger
  ABMat_p = _np.dot(Pjt, _np.dot(_np.diag(s), Pj)) #truncate ABMat => ABMat' (note diag(s) = Ud*ABMat*Vd), shape = (trunc, trunc)
  # U shape = (nESpecs, K)
  # V shape = (K, nRhoSpecs)
  # Ud shape = (K, nESpecs)
  # Vd shape = (nRhoSpecs, K)

  #print "DEBUG: dataset = ",dataset
  #print "DEBUG: ABmat = \n",ABMat
  #print "DEBUG: Evals(ABmat) = \n",_np.linalg.eigvals(ABMat)
  rankAB = _np.linalg.matrix_rank(ABMat_p)
  if rankAB < ABMat_p.shape[0]:
    raise ValueError("LGST AB matrix is rank %d < %d. Choose better rhoSpecs and/or ESpecs, or decrease svdTruncateTo" \
                          % (rankAB, ABMat_p.shape[0]))

  invABMat_p = _np.dot(Pjt, _np.dot(_np.diag(1.0/s), Pj)) # (trunc,trunc)
  assert( _np.linalg.norm( _np.linalg.inv(ABMat_p) - invABMat_p ) < 1e-8 ) #check inverse is correct (TODO: comment out later)
  assert( len( (_np.isnan(invABMat_p)).nonzero()[0] ) == 0 )

  for gateLabel in gateLabelsToEstimate:
    gateLabelTuple = gateLabelAliases.get(gateLabel, (gateLabel,))
    X = _constructXMatrix(rhoSpecs, ESpecs, spamDict, gateLabelTuple, dataset)  # shape (nESpecs, nRhoSpecs)
    X2 = _np.dot(Ud, _np.dot(X, Vd)) # shape (K,K) this should be close to rank "svdTruncateTo" (which is <= K) -- TODO: check this

    if svdTruncateTo > 0 and verbosity > 4: 
      print "LGST DEBUG: %s before trunc to first %d row and cols = \n" % (gateLabel,svdTruncateTo)
      _tools.print_mx(X2)
    X_p = _np.dot(Pjt, _np.dot(X2, Pj)) #truncate X => X', shape (trunc, trunc)
    lgstGateset.set_gate(gateLabel, _objs.FullyParameterizedGate(_np.dot(invABMat_p,X_p))) # shape (trunc,trunc)
    #print "DEBUG: X(%s) = \n" % gateLabel,X
    #print "DEBUG: Evals(X) = \n",_np.linalg.eigvals(X)
    #print "DEBUG: %s = \n" % gateLabel,lgstGateset[ gateLabel ]

  # Form EVecs
  nEVecs = max( [ b for (a,b) in spamDict.keys() ] ) + 1
  for iEVec in range(nEVecs):
    EVec = _np.zeros( (1,len(rhoSpecs)) )  # shape (1,nRhoSpecs)
    for i,rhospec in enumerate(rhoSpecs):
      gateString = rhospec.str; spamLabel = spamDict[ (rhospec.i,iEVec) ]
      dsRow = dataset[ gateString ]
      EVec[0,i] = dsRow.fraction(spamLabel)
    EVec_p = _np.dot( _np.dot(EVec, Vd), Pj ) #truncate Evec => Evec', shape (1,trunc)
    lgstGateset.set_evec( _np.transpose(EVec_p), iEVec )

  # Form rhoVecs
  nrhoVecs = max( [ a for (a,b) in spamDict.keys() ] ) + 1
  for irhoVec in range(nrhoVecs):
    rhoVec = _np.zeros((len(ESpecs),1)) # shape (nESpecs,1)
    for i,espec in enumerate(ESpecs):
      gateString = espec.str; spamLabel = spamDict[ (irhoVec, espec.i) ]
      dsRow = dataset[ gateString ]
      rhoVec[i] = dsRow.fraction(spamLabel)
    rhoVec_p = _np.dot( Pjt, _np.dot(Ud, rhoVec) ) #truncate rhoVec => rhoVec', shape (trunc, 1)
    rhoVec_p = _np.dot(invABMat_p,rhoVec_p)      
    lgstGateset.set_rhovec( rhoVec_p, irhoVec )

  # Add identity vector to gateset (needed before adding spam labels)
  #  Pad with zeros if needed (ROBIN - is this correct?)
  if identityVec is not None:
    Idim = identityVec.shape[0]
    assert(Idim <= trunc)
    if Idim < trunc:
      padded_identityVec = _np.concatenate( (identityVec, _np.zeros( (trunc-Idim,1), 'd')) )
    else:
      padded_identityVec = identityVec
    lgstGateset.set_identity_vec( padded_identityVec )

  # Add SPAM label info to gateset
  for (rhoIndex, EIndex) in spamDict.keys():
    lgstGateset.add_spam_label( rhoIndex, EIndex, spamDict[ (rhoIndex,EIndex) ] )

  # Perform "guess" gauge transformation by computing the "B" matrix
  #  assuming rhos, Es, and gates are those of a guesstimate of the gateset
  if guessGatesetForGauge is not None:  
    guessTrunc = len(guessGatesetForGauge.rhoVecs[0]) # dimension of guess gateset == the truncation to apply to it's B matrix
    assert(guessTrunc <= trunc)  # the dimension of the gateset for gauge guessing cannot exceed the dimension of the gateset being estimated

    guessPj = _np.zeros( (K,guessTrunc), 'd') # shape = (K, guessTrunc) projector with only trunc columns
    for i in range(guessTrunc): guessPj[i,i] = 1.0
    guessPjt = _np.transpose(guessPj)         # shape = (guessTrunc, K)

    AMat = _constructA(ESpecs, guessGatesetForGauge)    # shape = (nESpecs, gsDim)
    AMat_p = _np.dot( guessPjt, _np.dot(Ud, AMat)) #truncate Evec => Evec', shape (guessTrunc,gsDim) (square!)

    BMat = _constructB(rhoSpecs, guessGatesetForGauge)  # shape = (gsDim, nRhoSpecs)
    BMat_p = _np.dot( _np.dot(BMat, Vd), guessPj ) #truncate Evec => Evec', shape (gsDim,guessTrunc) (square!)

    if verbosity > 3:
      guess_ABMat = _np.dot(AMat,BMat)
      guess_U,guess_s,guess_V = _np.linalg.svd(guess_ABMat, full_matrices=False)  
      print "LGST: Singular values of target I_tilde (truncating to first %d of %d) = \n" % (guessTrunc,len(guess_s)) ,guess_s

    if guessTrunc < trunc:  # if the dimension of the gauge-guess gateset is smaller than the matrices being estimated, pad B with identity
      if verbosity > 2: 
        print "LGST: Padding target B with sqrt of low singular values of I_tilde: \n", s[guessTrunc:trunc]

      BMat_p_padded = _np.identity(trunc, 'd')
      BMat_p_padded[0:guessTrunc, 0:guessTrunc] = BMat_p
      for i in range(guessTrunc,trunc):
        BMat_p_padded[i,i] = _np.sqrt( s[i] ) #set diagonal as sqrt of actual AB matrix's singular values
      lgstGateset.transform( S=_np.linalg.inv(BMat_p_padded), Si=BMat_p_padded )
    else:
      lgstGateset.transform( S=_np.linalg.inv(BMat_p), Si=BMat_p )

    # RESET identity vector after lgstGateset.transform since this transforms gateset back to what we think is
    #   close to the basis of guessGatesetForGauge.  The only reason we set it earlier is as a placeholder
    #   so that lgstGateset.add_spam_label doesn't fail.
    lgstGateset.set_identity_vec( padded_identityVec )
    
    # Force lgstGateset to have gates parameterized in the same was as those in guessGatesetForGauge
    for gateLabel in gateLabelsToEstimate:
      if gateLabel in guessGatesetForGauge:
        new_gate = guessGatesetForGauge.get_gate(gateLabel).copy()
        _objs.gate.optimize_gate( new_gate, lgstGateset.get_gate(gateLabel), bG0=True )
        lgstGateset.set_gate( gateLabel, new_gate )

    #inv_BMat_p = _np.dot(invABMat_p, AMat_p) # should be equal to inv(BMat_p) when trunc == gsDim ?? check??
    #lgstGateset.transform( S=_np.dot(invABMat_p, AMat_p), Si=BMat_p ) # lgstGateset had dim trunc, so after transform is has dim gsDim

  lgstGateset.log("Created by LGST", {'rhoSpecs': rhoSpecs, 'ESpecs': ESpecs })

  if verbosity > 2: print ""
  if verbosity > 1: print "--- LGST ---"
  if verbosity > 4:
    print "Resulting gate set:\n", lgstGateset

  return lgstGateset


def _constructAB(rhoSpecs, ESpecs, spamDict, dataset):
  AB = _np.empty( (len(ESpecs),len(rhoSpecs)) )
  for i,espec in enumerate(ESpecs):
    for j,rhospec in enumerate(rhoSpecs):
      gateLabelString = rhospec.str + espec.str # LEXICOGRAPHICAL VS MATRIX ORDER
      spamLabel = spamDict[ (rhospec.i,espec.i) ]
      dsRow = dataset[gateLabelString]
      AB[i,j] = dsRow.fraction(spamLabel)
      #print "DEBUG: AB[%d,%d] = (" % (i,j), espec + rhospec, ") = ", AB[i,j] #DEBUG
  return AB    

def _constructXMatrix(rhoSpecs, ESpecs, spamDict, gateLabelTuple, dataset):
  """
  Build the LGST "X" matrix of observed frequencies for one gate string.

  Element [i,j] is the fraction, within the dataset row for the gate string
  rhoSpecs[j].str + GateString(gateLabelTuple) + ESpecs[i].str, of the SPAM
  label spamDict[ (rhoSpecs[j].i, ESpecs[i].i) ].

  Returns
  -------
  numpy array
      Array of shape (len(ESpecs), len(rhoSpecs)).

  Raises
  ------
  KeyError
      If looking up a required gate string in the dataset fails.
  """
  X = _np.empty( (len(ESpecs),len(rhoSpecs)) )
  for i,espec in enumerate(ESpecs):
    for j,rhospec in enumerate(rhoSpecs):
      gateLabelString = rhospec.str + _objs.GateString(gateLabelTuple) + espec.str # LEXICOGRAPHICAL VS MATRIX ORDER
      spamLabel = spamDict[ (rhospec.i,espec.i) ]
      try:
        dsRow = dataset[gateLabelString]
      except Exception:
        # Any ordinary lookup failure is reported as missing data.  A bare
        #  "except:" would also swallow KeyboardInterrupt and SystemExit and
        #  disguise them as a KeyError, so those are left to propagate.
        raise KeyError("Missing data needed to construct X matrix for " + str(gateLabelTuple) \
                           + ": gate string " + str(gateLabelString))
      X[i,j] = dsRow.fraction(spamLabel)
  return X

def _constructA(ESpecs, gs):
  n = len(ESpecs); dim = gs.get_dimension()
  A = _np.empty( (n,dim) )
  for k,espec in enumerate(ESpecs):
    #Build fiducial < E_k | := < EVec[ ESpec[0] ] | Gatestring(ESpec[1:]) 
    st = _np.dot( _np.transpose( gs.EVecs[ espec.i ] ), gs.product(espec.str) ) # 1xN vector
    A[k,:] = st[0,:] # E_k == kth row of A
  return A

def _constructB(rhoSpecs, gs):
  n = len(rhoSpecs); dim = gs.get_dimension()
  B = _np.empty( (dim,n) )
  for k,rhospec in enumerate(rhoSpecs):
    #Build fiducial | rho_k > := Gatestring(rhoSpec[0:-1]) | rhoVec[ rhoSpec[-1] ] >
    st = _np.dot( gs.product(rhospec.str), gs.rhoVecs[ rhospec.i ] ) # Nx1 vector
    B[:,k] = st[:,0] # rho_k == kth column of B
  return B


def gram_rank_and_evals(dataset, specs, targetGateset=None, spamDict=None):
  """
  Returns the rank and singular values of the Gram matrix for a dataset.

  Parameters
  ----------
  dataset : DataSet
      The data used to populate the Gram matrix

  specs : 2-tuple
      A (rhoSpecs,ESpecs) tuple usually generated by calling build_spam_specs(...)

  targetGateset : GateSet, optional
      A gateset used to specify the SPAM labels used to connect
      the dataset values to rhoVec and EVec indices.

  spamDict : dictionary, optional
      Dictionary mapping (rhoVec_index,EVec_index) integer tuples to string spam labels.
      Defaults to the spam dictionary of targetGateset
      e.g. spamDict[(0,0)] == "plus"

  Returns
  -------
  rank : int
      the rank of the Gram matrix
  eigenvalues : numpy array
      the singular values of the Gram matrix (the name is historical; these
      come from a singular value decomposition, not an eigendecomposition)

  Raises
  ------
  ValueError
      If neither spamDict nor targetGateset is given.
  """

  rhoSpecs, ESpecs = specs

  if spamDict is None:
    if targetGateset is None:
      raise ValueError("gram_rank_and_evals cannot determine SPAM dictionary from supplied parameters")
    spamDict = targetGateset.get_spam_label_dict()

  ABMat = _constructAB(rhoSpecs, ESpecs, spamDict, dataset)
  # only the singular values are needed, so skip computing U and V
  singularValues = _np.linalg.svd(ABMat, compute_uv=False)
  return _np.linalg.matrix_rank(ABMat), singularValues


###################################################################################
#                 Extended Linear GST (ExLGST)
##################################################################################

#Given dataset D
# Chi2 statistic = sum_k (p_k-f_k)^2/ (N f_kt(1-f_kt) ) where f_kt ~ f_k with +1/+2 to avoid zero denom

def do_exlgst(dataset, startGateset, gateStringsToUseInEstimation, specs, 
             targetGateset=None, spamDict=None, guessGatesetForGauge=None,
             svdTruncateTo=0, maxiter=100000, maxfev=None, tol=1e-6, 
             opt_gates=True, opt_G0=True, regularizeFactor=0, verbosity=0,
             check_jacobian=False):
  """
  Performs Extended Linear-inversion Gate Set Tomography on the dataset.

  LGST estimates are computed for each of the given gate strings, and a copy
  of startGateset is then least-squares fit so that the products of its gate
  matrices match those estimates.

  Parameters
  ----------
  dataset : DataSet
      The data used to generate Extended-LGST estimates

  startGateset : GateSet
      The GateSet used as a starting point for the least-squares
      optimization.  It is copied, not modified.

  gateStringsToUseInEstimation : list of (tuples or GateStrings)
      Each element of this list specifies a gate string that is 
      estimated using LGST and used in the overall least-squares 
      fit that determines the final "extended LGST" gateset.
      e.g. [ (), ('Gx',), ('Gx','Gy') ] 

  specs : 2-tuple
      A (rhoSpecs,ESpecs) tuple usually generated by calling build_spam_specs(...)

  targetGateset : GateSet, optional
      A gateset used to provide a guess for gauge in which LGST estimates should be returned,
      and the SPAM labels used to connect the dataset values to rhoVec and EVec indices.

  spamDict : dictionary, optional
      Dictionary mapping (rhoVec_index,EVec_index) integer tuples to string spam labels.
      Defaults to the spam dictionary of targetGateset
      e.g. spamDict[(0,0)] == "plus"

  guessGatesetForGauge : GateSet, optional
      A gateset used to compute a gauge transformation that is applied to
      the LGST estimates before they are returned.
      Defaults to targetGateset.

  svdTruncateTo : int, optional
      The Hilbert space dimension to truncate the gate matrices to using
      a SVD to keep only the largest svdTruncateTo singular values of
      the I_tilde LGST matrix.
      Defaults to 0 (no truncation)

  maxiter : int, optional
      Maximum number of iterations for the least squares optimization

  maxfev : int, optional
      Maximum number of function evaluations for the least squares optimization
      Defaults to maxiter

  tol : float, optional
      The tolerance for the least squares optimization.

  opt_gates : bool, optional
      Whether the gate matrices should be optimized
      Defaults to True

  opt_G0 : bool, optional
      Whether the first row of gate matrices should be optimized.  If False, then
      when the startGateset has TP gates this will not be changed during the
      optimization and the resulting gates are guaranteed to be TP.

  regularizeFactor : float, optional
      Multiplicative prefactor of L2-like regularization term that penalizes gateset entries
      which have absolute value greater than 1.  When set to 0, no regularization is applied.

  verbosity : int, optional
      How much detail to send to stdout.

  check_jacobian : bool, optional
      If True, compare the analytic jacobian with a forward finite difference jacobian
      and print warning messages if there is disagreement.  Defaults to False.

  Returns
  -------
  numpy array
      The minimum error vector v = f(x_min), where f(x)**2 is the function being minimized.
      Regularization terms, if any, are not included.
  Gateset
      The gateset containing all of the estimated labels.
  """

  if maxfev is None: maxfev = maxiter
  opt_SPAM = opt_SP0 = False #no point in optimizing SPAM gate since it never enters objective function

  gs = startGateset.copy() # work on a copy so the caller's gateset is left untouched
  gate_dim = len(gs.rhoVecs[0])

  if verbosity > 2: print ""
  if verbosity > 1:
    print "--- eLGST (least squares) ---"

  #convert list of GateStrings to list of raw tuples since that's all we'll need
  # (only the first element's type is inspected, so the list must be homogeneous)
  if len(gateStringsToUseInEstimation) > 0 and isinstance(gateStringsToUseInEstimation[0],_objs.GateString):
    gateStringsToUseInEstimation = [ gstr.tup for gstr in gateStringsToUseInEstimation ]

  #Setup and solve a least-squares problem where each element of each 
  # (lgst_estimated_process - process_estimate_using_current_gateset)  difference is a least-squares
  # term and the optimization is over the elements of the "current_gateset".  Note that: 
  #   lgst_estimated_process = LGST estimate for a gate string in gateStringsToUseInEstimation
  #   process_estimate_using_current_gateset = process mx you get from multiplying together the gate matrices of the current gateset

  #Step 1: get the lgst estimates for each of the "gate strings to use in estimation" list
  # Each gate string is registered under the alias "estimator<i>" so that do_lgst
  #  estimates the whole string as if it were a single gate.
  gateLabelAliases = {}
  for (i,gateStrTuple) in enumerate(gateStringsToUseInEstimation):
    gateLabelAliases["estimator%d" % i] = gateStrTuple
  
  lgstEstimates = do_lgst(dataset, specs, targetGateset, gateLabelAliases.keys(),
                         gateLabelAliases, spamDict, guessGatesetForGauge, svdTruncateTo,
                         verbosity=0) #override verbosity

  # Stack the LGST estimates into a single array, indexed by gate string
  estimates = _np.empty( (len(gateStringsToUseInEstimation), gate_dim, gate_dim), 'd')
  for (i,gateStr) in enumerate(gateStringsToUseInEstimation):
    estimates[i] = lgstEstimates[ "estimator%d" % i ]
    
  evTree = gs.bulk_evaltree(gateStringsToUseInEstimation)
  maxGateStringLength = max([len(x) for x in gateStringsToUseInEstimation]) # only used in the verbose printout below

  #Step 2: create objective function for least squares optimization
  # Separate variants are defined so that the quiet path carries no printing
  #  and no per-call regularization check.
  if verbosity <= 2:

    if regularizeFactor == 0:
      def objective_func(vectorGS):
        gs.from_vector(vectorGS,G0=opt_G0, SP0=opt_SP0, SPAM=opt_SPAM, gates=opt_gates)    
        prods = gs.bulk_product(evTree)
        ret = (prods - estimates).flatten()
        #assert( len( (_np.isnan(ret)).nonzero()[0] ) == 0 )
        return ret
    else:
      def objective_func(vectorGS):
        gs.from_vector(vectorGS,G0=opt_G0, SP0=opt_SP0, SPAM=opt_SPAM, gates=opt_gates)    
        prods = gs.bulk_product(evTree)
        # one penalty term per parameter: regularizeFactor * max(0, |x|-1)
        gsVecNorm = regularizeFactor * _np.array( [ max(0,absx-1.0) for absx in map(abs,vectorGS) ], 'd')
        ret = _np.concatenate( ((prods - estimates).flatten(), gsVecNorm) )
        #assert( len( (_np.isnan(ret)).nonzero()[0] ) == 0 )
        return ret

  else:
    # verbose variant: same residuals, plus a progress line on every evaluation
    def objective_func(vectorGS):
      gs.from_vector(vectorGS,G0=opt_G0, SP0=opt_SP0, SPAM=opt_SPAM, gates=opt_gates)    
      prods = gs.bulk_product(evTree)
      ret = (prods - estimates).flatten()

      #OLD (uncomment to check)
      #errvec = []
      #for (i,gateStr) in enumerate(gateStringsToUseInEstimation):
      #  term1 = lgstEstimates[ "estimator%d" % i ]
      #  term2 = gs.product(gateStr)
      #  if _np.linalg.norm(term2 - prods[i]) > 1e-6: 
      #    print "term 2 = \n",term2
      #    print "prod = \n",prods[i]
      #    print "Check failed for product %d: %s : %g" % (i,str(gateStr[0:10]),_np.linalg.norm(term2 - prods[i]))
      #  diff = (term2 - term1).flatten()
      #  errvec += list(diff)      
      #ret_chk = _np.array(errvec)
      #if _np.linalg.norm( ret - ret_chk ) > 1e-6:
      #  raise ValueError("Check failed with diff = %g" % _np.linalg.norm( ret - ret_chk ))

      # NOTE(review): this variant tests "> 0" whereas the quiet variants test "== 0",
      #  so a negative regularizeFactor is handled differently here -- confirm intent.
      if regularizeFactor > 0:
        gsVecNorm = regularizeFactor * _np.array( [ max(0,absx-1.0) for absx in map(abs,vectorGS) ], 'd')
        ret = _np.concatenate( (ret, gsVecNorm) )

      retSq = sum(ret*ret)
      print "%g: objfn vec in (%g,%g),  gs in (%g,%g), maxLen = %d" % \
            (retSq, _np.min(ret), _np.max(ret), _np.min(vectorGS), _np.max(vectorGS), maxGateStringLength)
      #assert( len( (_np.isnan(ret)).nonzero()[0] ) == 0 )
      return ret

    
  # Analytic jacobian of objective_func.  Note the verbosity threshold here (3)
  #  differs from the one used for the objective function above (2).
  if verbosity <= 3:
    if regularizeFactor == 0:
      def jacobian(vectorGS):
        gs.from_vector(vectorGS,G0=opt_G0, SP0=opt_SP0, SPAM=opt_SPAM, gates=opt_gates)
        jac = gs.bulk_dproduct(evTree, gates=opt_gates, G0=opt_G0, flat=True) # shape == nGateStrings*nFlatGate, nDerivCols
        if check_jacobian: _opt.check_jac(objective_func, vectorGS, jac, tol=1e-3, eps=1e-6, errType='abs')
        return jac
    else:
      def jacobian(vectorGS):
        gs.from_vector(vectorGS,G0=opt_G0, SP0=opt_SP0, SPAM=opt_SPAM, gates=opt_gates)
        # derivative of the penalty regularizeFactor*max(0,|x|-1): diagonal, nonzero only where |x| > 1
        gsVecGrad = _np.diag( [ (regularizeFactor * _np.sign(x) if abs(x) > 1.0 else 0.0) for x in vectorGS ] )
        jac = gs.bulk_dproduct(evTree, gates=opt_gates, G0=opt_G0, flat=True) # shape == nGateStrings*nFlatGate, nDerivCols
        jac = _np.concatenate( (jac, gsVecGrad), axis=0 )  # shape == nGateStrings*nFlatGate+nDerivCols, nDerivCols
        if check_jacobian: _opt.check_jac(objective_func, vectorGS, jac, tol=1e-3, eps=1e-6, errType='abs')
        return jac
      #OLD return _np.concatenate( [ gs.dproduct(gateStr, G0=opt_G0, gates=opt_gates, flat=True) \
      #                           for gateStr in gateStringsToUseInEstimation ], axis=0 )

  else:
    # verbose variant: additionally reports the result of the jacobian check
    def jacobian(vectorGS):
      gs.from_vector(vectorGS,G0=opt_G0, SP0=opt_SP0, SPAM=opt_SPAM, gates=opt_gates)
      jac = gs.bulk_dproduct(evTree, gates=opt_gates, G0=opt_G0, flat=True) # shape == nGateStrings*nFlatGate, nDerivCols
      if regularizeFactor > 0:
        gsVecGrad = _np.diag( [ (regularizeFactor * _np.sign(x) if abs(x) > 1.0 else 0.0) for x in vectorGS ] )
        jac = _np.concatenate( (jac, gsVecGrad), axis=0 )

      if check_jacobian:
        errSum, errs, fd_jac = _opt.check_jac(objective_func, vectorGS, jac, tol=1e-3, eps=1e-6, errType='abs')
        print "Jacobian has error %g and %d of %d indices with error > tol" % (errSum, len(errs), jac.shape[0]*jac.shape[1])      
        if len(errs) > 0:
          i,j = errs[0][0:2]; maxabs = _np.max(_np.abs(jac))
          print " ==> Worst index = %d,%d. Analytic jac = %g, Fwd Diff = %g" % (i,j, jac[i,j], fd_jac[i,j])
          print " ==> max err = ", errs[0][2]
          print " ==> max err/max = ", max([ x[2]/maxabs for x in errs ])

      return jac
      #OLD return _np.concatenate( [ gs.dproduct(gateStr, G0=opt_G0, gates=opt_gates, flat=True) \
      #                           for gateStr in gateStringsToUseInEstimation ], axis=0 )
    

  #def checked_jacobian(vectorGS):
  #  def obj_i(x, i): return objective_func(x)[i]
  #  def jac_i(x, i): return (jacobian(x))[i]
  #  y = objective_func(vectorGS)
  #  jac = jacobian(vectorGS); nJ = _np.linalg.norm(jac)
  #  for i in range(len(y)):
  #    err = _spo.check_grad(obj_i, jac_i, vectorGS, i)
  #    if err/nJ > 1e-6: print "Jacobian(%d) Error = %g (jac norm = %g)" % (i,err,nJ)
  #  return jac

  #Step 3: solve least squares minimization problem
  x0 = gs.to_vector(G0=opt_G0, SP0=opt_SP0, SPAM=opt_SPAM, gates=opt_gates)
  # maxfev is scaled by the number of parameters plus one before being handed to leastsq
  opt_x, opt_jac, info, msg, flag = \
      _spo.leastsq( objective_func, x0, xtol=tol, ftol=tol, gtol=tol,
               maxfev=maxfev*(len(x0)+1), full_output=True, Dfun=jacobian)
  full_minErrVec = objective_func(opt_x)
  # the regularization terms are the last len(x0) entries of the objective vector
  minErrVec = full_minErrVec if regularizeFactor == 0 else full_minErrVec[0:-len(x0)] #don't include regularization terms

  #DEBUG: check without using our jacobian
  #opt_x_chk, opt_jac_chk, info_chk, msg_chk, flag_chk = \
  #    _spo.leastsq( objective_func, x0, xtol=tol, ftol=tol, gtol=tol,
  #             maxfev=maxfev*(len(x0)+1), full_output=True, epsfcn=1e-30)
  #minErrVec_chk = objective_func(opt_x_chk)

  # load the optimal parameters into the gateset that is returned
  gs.from_vector(opt_x,G0=opt_G0, SP0=opt_SP0, SPAM=opt_SPAM, gates=opt_gates)
  gs.log("ExLGST", { 'method': "leastsq", 'tol': tol,  'maxiter': maxiter, 
                     'opt_G0': opt_G0, 'opt_SP0': opt_SP0 } )

  if verbosity > 1:
      print "  Sum of minimum least squares error (w/out reg terms) = %g" % sum([x**2 for x in minErrVec])
      #try: print "   log(likelihood) = ", _tools.logl(gs, dataset)
      #except: pass
      # the distance to the target is only reported when the two gatesets have the same dimension
      if targetGateset is not None and len(targetGateset.rhoVecs[0]) == len(gs.rhoVecs[0]):
        print "   frobenius distance to target = ", gs.frobeniusdist(targetGateset)

      #DEBUG
      #print "  Sum of minimum least squares error check = %g" % sum([x**2 for x in minErrVec_chk])
      #print "DEBUG : opt_x diff = ", _np.linalg.norm( opt_x - opt_x_chk )
      #print "DEBUG : opt_jac diff = ", _np.linalg.norm( opt_jac - opt_jac_chk )
      #print "DEBUG : flags (1,2,3,4=OK) = %d, check = %d" % (flag, flag_chk)

  return minErrVec, gs    


def do_iterative_exlgst(dataset, startGateset, specs, gateStringSetsToUseInEstimation, 
                      targetGateset=None, spamDict=None, guessGatesetForGauge=None,
                      svdTruncateTo=0, maxiter=100000, maxfev=None, tol=1e-6, 
                      opt_gates=True, opt_G0=True, regularizeFactor=0,
                      returnErrorVec=False, returnAll=False,
                      gateStringSetLabels=None, verbosity=0, check_jacobian=False):
  """
  Performs Iterated Extended Linear-inversion Gate Set Tomography on the dataset.

  Extended LGST (do_exlgst) is run once per gate string set, in order, with
  the gateset produced by one iteration used as the starting point of the
  next.  Gate string sets that are None or empty are skipped (no
  optimization is run for them and nothing is appended to the returned
  lists).

  Parameters
  ----------
  dataset : DataSet
      The data used to generate Extended-LGST estimates

  startGateset : GateSet
      The GateSet used as a starting point for the least-squares
      optimization.  It is copied, never modified.

  specs : 2-tuple
      A (rhoSpecs,ESpecs) tuple usually generated by calling build_spam_specs(...)

  gateStringSetsToUseInEstimation : list of lists of (tuples or GateStrings)
      The i-th element is a list of the gate strings to be used in the i-th iteration 
      of extended-LGST.  Each element of these lists is a gate string, specified as
      either a GateString object or as a tuple of gate labels (but all must be specified
      using the same type; the type is detected from the first non-empty list).
      e.g. [ [ (), ('Gx',) ], [ (), ('Gx',), ('Gy',) ], [ (), ('Gx',), ('Gy',), ('Gx','Gy') ]  ]

  targetGateset : GateSet, optional
      A gateset used to provide a guess for gauge in which LGST estimates should be returned,
      and the SPAM labels used to connect the dataset values to rhoVec and EVec indices.

  spamDict : dictionary, optional
      Dictionary mapping (rhoVec_index,EVec_index) integer tuples to string spam labels.
      Defaults to the spam dictionary of targetGateset
      e.g. spamDict[(0,0)] == "plus"

  guessGatesetForGauge : GateSet, optional
      A gateset used to compute a gauge transformation that is applied to
      the LGST estimates before they are returned.
      Defaults to targetGateset.

  svdTruncateTo : int, optional
      The Hilbert space dimension to truncate the gate matrices to using
      a SVD to keep only the largest svdTruncateTo singular values of
      the I_tilde LGST matrix.
      Defaults to 0 (no truncation)

  maxiter : int, optional
      Maximum number of iterations in each of the least squares optimizations

  maxfev : int, optional
      Maximum number of function evaluations for each of the least squares optimizations
      Defaults to maxiter

  tol : float, optional
      The tolerance for each of the least squares optimizations.

  opt_gates : bool, optional
      Whether the gate matrices should be optimized

  opt_G0 : bool, optional
      Whether the first row of gate matrices should be optimized.  If False, then
      when the startGateset has TP gates this will not be changed during the
      optimization and the resulting gates are guaranteed to be TP.

  regularizeFactor : float, optional
      Multiplicative prefactor of L2-like regularization term that penalizes gateset entries
      which have absolute value greater than 1.  When set to 0, no regularization is applied.

  returnErrorVec : bool, optional
      If True, return (errorVec, gateset), or (errorVecs, gatesets) if
      returnAll == True, instead of just the gateset or gatesets.

  returnAll : bool, optional
      If True return a list of gatesets (and errorVecs if returnErrorVec == True),
      one per (non-skipped) iteration, instead of the results from just the final iteration.

  gateStringSetLabels : list of strings, optional
      An identification label for each of the gate string sets (used for displaying
      progress).  Must be the same length as gateStringSetsToUseInEstimation.

  verbosity : int, optional
      How much detail to send to stdout.

  check_jacobian : boolean, optional
      If True, compare the analytic jacobian with a forward finite difference jacobian
      and print warning messages if there is disagreement.

  Returns
  -------
  gateset               if returnAll == False and returnErrorVec == False
  gatesets              if returnAll == True  and returnErrorVec == False
  (errorVec, gateset)   if returnAll == False and returnErrorVec == True
  (errorVecs, gatesets) if returnAll == True  and returnErrorVec == True
      where errorVec is a numpy array of minimum error values v = f(x_min), where f(x)**2 is
      the function being minimized, gateset is the GateSet containing the final estimated gates.
      In cases when returnAll == True, gatesets and errorVecs are lists whose elements are the
      errorVec and gateset resulting from each iteration that was actually run.
      When returnAll == False and no iteration was run at all (every gate string set was
      None or empty), errorVec is None and gateset is an unmodified copy of startGateset.
  """

# Parameter to add later??
#    whenCannotEstimate : string
#        What to do when a gate string to be estimated by LGST cannot because there isn't enough data.
#        Allowed values are:
#          'stop'   - stop algorithm and report an error (Default)
#          'warn'   - skip string, print a warning to stdout, and proceed
#          'ignore' - skip string silently and proceed

  #Detect the gate string type from the first set that actually holds a string:
  # probing element [0][0] directly fails on a None set and misses GateStrings
  # whenever the first set happens to be empty.
  firstNonEmpty = None
  for gsList in gateStringSetsToUseInEstimation:
    if gsList is not None and len(gsList) > 0:
      firstNonEmpty = gsList
      break

  #convert lists of GateStrings to lists of raw tuples since that's all we'll need
  if firstNonEmpty is not None and isinstance(firstNonEmpty[0],_objs.GateString):
    gateStringLists = [ (None if gsList is None else [gstr.tup for gstr in gsList])
                        for gsList in gateStringSetsToUseInEstimation ]
  else:
    gateStringLists = gateStringSetsToUseInEstimation 

  #Run extended eLGST iteratively on given sets of estimatable strings
  elgstGatesets = [ ]; minErrs = [ ] #for returnAll == True case
  elgstGateset = startGateset.copy(); nIters = len(gateStringLists)
  minErr = None #stays None if every set is skipped, so the final return cannot hit an unbound name
  for (i,stringsToEstimate) in enumerate(gateStringLists):
    #count strings *before* reporting progress so that a None set is reported as empty
    nStrings = 0 if stringsToEstimate is None else len(stringsToEstimate)

    #single-argument print(...) calls produce the same output under Python 2's print statement
    if verbosity > 1: print("") #newline if we have more info to print
    if verbosity > 0:
      setLabel = ("(%s) " % gateStringSetLabels[i]) if gateStringSetLabels else ""
      print("--- Iterative eLGST: Beginning iter %d of %d %s: %d gate strings ---"
            % (i+1, nIters, setLabel, nStrings))
      _sys.stdout.flush()

    if nStrings == 0: continue #nothing to estimate in this iteration
    minErr, elgstGateset = do_exlgst( dataset, elgstGateset, stringsToEstimate, specs,
                                     targetGateset, spamDict, guessGatesetForGauge,
                                     svdTruncateTo, maxiter, maxfev, tol, 
                                     opt_gates, opt_G0, regularizeFactor, verbosity,
                                     check_jacobian )
    if returnAll: 
      elgstGatesets.append(elgstGateset)
      minErrs.append(minErr)
      
  if returnErrorVec:
    return (minErrs, elgstGatesets) if returnAll else (minErr, elgstGateset)
  else:
    return elgstGatesets if returnAll else elgstGateset


###################################################################################
#                 Least-squares GST (LSGST)
##################################################################################

def do_mc2gst(dataset, startGateset, gateStringsToUse, 
            maxiter=100000, maxfev=None, tol=1e-6, 
            opt_gates=True, opt_G0=True, opt_SPAM=True, opt_SP0=True,
            cptp_penalty_factor=0, minProbClipForWeighting=1e-4, probClipInterval=None,
            useFreqWeightedChiSq=False, regularizeFactor=0, verbosity=0,
            check=False, check_jacobian=False, gatestringWeights=None, gateLabelAliases=None,
            memLimit=None):
  """
  Performs Least-Squares Gate Set Tomography on the dataset.

  A copy of startGateset is optimized with scipy.optimize.leastsq so that the
  weighted differences between its predicted probabilities and the
  frequencies found in dataset are minimized.

  Parameters
  ----------
  dataset : DataSet
      The dataset to obtain counts from.

  startGateset : GateSet
      The GateSet used as a starting point for the least-squares
      optimization.  It is copied, never modified.

  gateStringsToUse : list of (tuples or GateStrings)
      Each tuple contains gate labels and specifies a gate string whose
      probabilities are considered when trying to least-squares-fit the 
      probabilities given in the dataset.
      e.g. [ (), ('Gx',), ('Gx','Gy') ] 

  maxiter : int, optional
      Maximum number of iterations for the least squares optimization.

  maxfev : int, optional
      Maximum number of function evaluations for the least squares optimization.
      Defaults to maxiter.

  tol : float, optional
      The tolerance for the least squares optimization.

  opt_gates : bool, optional
      Whether the gate matrices should be optimized.

  opt_G0 : bool, optional
      Whether the first row of gate matrices should be optimized.  If False, then
      when the startGateset has TP gates this will not be changed during the
      optimization and the resulting gates are guaranteed to be TP.

  opt_SPAM : bool, optional
      Whether the rhoVecs and EVecs should be optimized

  opt_SP0 : bool, optional
      Whether the first element of the state preparation vectors
      (i.e. the rhoVecs) should be optimized.  If False, then rhoVecs
      in startGateset that are trace == 1 will remain trace == 1 
      after the optimization.

  cptp_penalty_factor : float, optional
      Intended multiplier of CPTP penalty terms which penalize non-CPTP-ness of
      the gateset being optimized.  Not implemented yet: any nonzero value
      raises NotImplementedError.

  minProbClipForWeighting : float, optional
      Sets the minimum and maximum probability p allowed in the chi^2 weights
      (each residual is scaled by sqrt(N/p)) by clipping probability p values
      to lie within the interval
      [ minProbClipForWeighting, 1-minProbClipForWeighting ].

  probClipInterval : 2-tuple or None, optional
     (min,max) values used to clip the probabilities predicted by gatesets during LSGST's
     least squares search for an optimal gateset (if not None).  Defaults to no clipping.

  useFreqWeightedChiSq : bool, optional
      If True, objective function uses only an approximate chi^2 weighting:  N/(f*(1-f)) 
      where f is the (regularized) frequency obtained from the dataset, instead of
      weights built from the predicted probabilities p.  Defaults to False, and
      only should use True for backward compatibility.

  regularizeFactor : float, optional
      Multiplicative prefactor of L2-like regularization term that penalizes gateset entries
      which have absolute value greater than 1.  When set to 0, no regularization is applied.

  verbosity : int, optional
      How much detail to send to stdout.  Values above 2 select a slower
      objective function that prints per-evaluation diagnostics; values above
      3 do the same for the jacobian.

  check : boolean, optional
      If True, perform extra checks within code to verify correctness.  Used
      for testing, and runs much slower when True.

  check_jacobian : boolean, optional
      If True, compare the analytic jacobian with a forward finite difference jacobian
      and print warning messages if there is disagreement.  Defaults to False.

  gatestringWeights : numpy array, optional
      An array of length len(gateStringsToUse).  Each element scales the
      least-squares term of the corresponding gate string in gateStringsToUse.
      The default is no weight scaling at all.

  gateLabelAliases : dictionary, optional
      Dictionary whose keys are gate label "aliases" and whose values are tuples
      corresponding to what that gate label should be expanded into before querying
      the dataset. Defaults to None (no aliases defined)
      e.g. gateLabelAliases['Gx^3'] = ('Gx','Gx','Gx')

  memLimit : int, optional
      A rough memory limit in bytes which restricts the amount of intermediate
      values that are computed and stored.

  Returns
  -------
  errorVec : numpy array
      Minimum error values v = f(x_best), where f(x)**2 is the function being minimized
      (regularization terms, if any, are not included).
  gateset : GateSet
      GateSet containing the estimated gates.

  Raises
  ------
  NotImplementedError
      If cptp_penalty_factor is nonzero.
  MemoryError
      If memLimit is smaller than the estimated size of the final results.
  ValueError
      If an alias that occurs in a gate string expands to a string containing itself.
  """

  #Fail fast on the unsupported option, *before* copying the gateset, reading
  # the dataset and building the evaluation tree.
  #TODO: upgrade the earlier implementation (restricted to a single 'plus' spam
  # label), which appended cptp_penalty_factor-scaled penalty terms
  # (_tools.mags_of_negative_choi_evals, _tools.rhovec_penalty,
  # _tools.evec_penalty) to the residual vector and used no analytic jacobian.
  if cptp_penalty_factor != 0:
    raise NotImplementedError("CPTP-penalized LSGST not implemented yet.")

  gs = startGateset.copy()
  if maxfev is None: maxfev = maxiter

  #NOTE: all output uses single-argument print(...) calls, which produce the
  # same text under Python 2's print statement and under Python 3.
  if verbosity > 2: print("")
  if verbosity > 1:
    print("--- Least Squares GST ---")

  #convert list of GateStrings to list of raw tuples since that's all we'll need
  if len(gateStringsToUse) > 0 and isinstance(gateStringsToUse[0],_objs.GateString):
    gateStringsToUse = [ gstr.tup for gstr in gateStringsToUse ]

  spamLabels = gs.get_spam_labels() #this list fixes the ordering of the spam labels
  spam_lbl_rows = { sl:i for (i,sl) in enumerate(spamLabels) }
  vec_gs_len = len(gs.to_vector(G0=opt_G0, SP0=opt_SP0, SPAM=opt_SPAM, gates=opt_gates))
  KM = len(spamLabels)*len(gateStringsToUse) #shorthand for this combined dimension used below

  if gateLabelAliases is not None: #then find & replace aliased gate labels with their expanded form
    dsGateStringsToUse = []
    for s in gateStringsToUse:
      for label,expandedStr in gateLabelAliases.items():
        expansion = tuple(expandedStr)
        while label in tuple(s):
          if label in expansion: #each replacement would re-introduce the alias => endless loop
            raise ValueError("Gate label alias %s expands to a string containing itself" % str(label))
          i = tuple(s).index(label)
          s = tuple(s)[:i] + expansion + tuple(s)[i+1:]
      dsGateStringsToUse.append(s)
  else:
    dsGateStringsToUse = gateStringsToUse # no difference in the strings used by the alias
    
  #Buffers that the objective function and jacobian below fill in place on every call
  probs  = _np.empty( (len(spamLabels),len(gateStringsToUse)) )
  dprobs = _np.empty( (len(spamLabels),len(gateStringsToUse),vec_gs_len) )

  N = _np.array( [ dataset[gateStr].total() for gateStr in dsGateStringsToUse ], 'd')
  f = _np.empty( (len(spamLabels),len(gateStringsToUse)) )
  fweights = _np.empty( (len(spamLabels),len(gateStringsToUse)) )
  z = _np.zeros( (len(spamLabels),len(gateStringsToUse)) ) #always zeros - used for derivative below

  #Memory estimates - maybe make GateSet methods to get intermediate memory estimates
  ns = len(spamLabels); ng = len(gateStringsToUse); ne = vec_gs_len; gd = len(gs.rhoVecs[0])
  persistentMem = 8* (ng*(ns + ns*ne + 1 + 3*ns)) # Memory needed by final results in bytes
  intermedMem   = 8* (ng*(1 + gd**2 * (1 + ne))) # Memory needed by intermediate results in bytes (now just ~ that of dproduct)
  C = 1.0/1024.0**3 # bytes -> GB conversion factor

  maxEvalSubTreeSize = None
  if memLimit is not None:
    if memLimit < persistentMem:
      raise MemoryError("Memory limit (%g GB) is < memory required to hold final results (%g GB)" % (memLimit*C, persistentMem*C))
    if memLimit < intermedMem:
      #shrink the number of gate strings evaluated at once by the factor we are over the limit
      reductionFactor = float(intermedMem) / float(memLimit)
      maxEvalSubTreeSize = int(ng / reductionFactor)

  if verbosity > 2:
    print("Peristent Memory estimate: %d spam labels, %d gate strings, %d gateset params" % (ns,ng,ne))
    print("    ==> %g GB (p) + %g GB (dp) + %g GB (other) = %g GB (total)" %
          (8*ns*ng*C, 8*ns*ng*ne*C,8*(ng+3*ns*ng)*C, persistentMem*C))
    print("Intermediate Memory estimate: %d gate strings, %d gate dimension, %d gateset params" % (ng,gd,ne))
    print("    ==> %g GB (p) + %g GB (dp) + %g GB (other) = %g GB (total)" %
          (8*ng*gd*gd*C, 8*ng*gd*gd*ne*C,8*ng*C, intermedMem*C))
    if memLimit is not None: print("Memory limit = %g GB" % (memLimit*C))
    if maxEvalSubTreeSize is not None: print("Maximum eval sub-tree size = %d" % maxEvalSubTreeSize)


  #NOTE on chi^2 expressions:
  #in general case:   chi^2 = sum (p_i-f_i)^2/p_i  (for i summed over outcomes)
  #in 2-outcome case: chi^2 = (p+ - f+)^2/p+ + (p- - f-)^2/p-
  #                         = (p - f)^2/p + (1-p - (1-f))^2/(1-p)
  #                         = (p - f)^2 * (1/p + 1/(1-p))
  #                         = (p - f)^2 * ( ((1-p) + p)/(p*(1-p)) )
  #                         = 1/(p*(1-p)) * (p - f)^2 

  #Observed frequencies f, and frequency-based weights computed from the
  # regularized frequency f2 = (n+1)/(N+2), which is never exactly 0 or 1.
  for (i,gateStr) in enumerate(dsGateStringsToUse):
    for k,sl in enumerate(spamLabels):
      n = float(dataset[gateStr][sl])
      f[k,i] = n / N[i];  f2 = (n+1)/(N[i]+2)
      fweights[k,i] = _np.sqrt( N[i] / (f2*(1-f2)) )

  if gatestringWeights is not None: 
    fweights *= gatestringWeights[None,:] #b/c we necessarily used unweighted N[i]'s above
    N *= gatestringWeights #multiply N's by weights

  evTree = gs.bulk_evaltree(gateStringsToUse)
  maxGateStringLength = max([len(x) for x in gateStringsToUse])  #only used in diagnostic output

  if maxEvalSubTreeSize is not None:
    evTree.split(maxEvalSubTreeSize)
    if verbosity > 2: 
      print("Memory limit has imposed a division of the evaluation tree:")
      evTree.print_analysis()

  if useFreqWeightedChiSq:
    def get_weights(p):
      return fweights
    def get_dweights(p,wts):
      return z
  else:
    def get_weights(p):
      cp = _np.clip(p,minProbClipForWeighting,1-minProbClipForWeighting)
      return _np.sqrt(N / cp)  # nSpamLabels x nGateStrings array (K x M)
    def get_dweights(p,wts):  #derivative of weights w.r.t. p
      cp = _np.clip(p,minProbClipForWeighting,1-minProbClipForWeighting)
      dw = -0.5 * wts / cp   # nSpamLabels x nGateStrings array (K x M)
      dw[ _np.logical_or(p < minProbClipForWeighting, p>(1-minProbClipForWeighting)) ] = 0.0 #clipped => constant weight
      return dw


  #Objective Function
  if verbosity <= 2:  # Fast versions of functions
    if regularizeFactor == 0:
      def objective_func(vectorGS):
        gs.from_vector(vectorGS,G0=opt_G0, SP0=opt_SP0, SPAM=opt_SPAM, gates=opt_gates)
        gs.bulk_fill_probs(probs, spam_lbl_rows, evTree, probClipInterval, check) 
        v = (probs-f)*get_weights(probs) # dims K x M (K = nSpamLabels, M = nGateStrings)
        return v.reshape([KM])

    else:
      def objective_func(vectorGS):
        gs.from_vector(vectorGS,G0=opt_G0, SP0=opt_SP0, SPAM=opt_SPAM, gates=opt_gates)
        gs.bulk_fill_probs(probs, spam_lbl_rows, evTree, probClipInterval, check) 
        weights = get_weights(probs)
        v = (probs-f)*weights # dims K x M (K = nSpamLabels, M = nGateStrings)
        #one extra residual per parameter: penalizes the amount by which |x| exceeds 1
        gsVecNorm = regularizeFactor * _np.array( [ max(0,absx-1.0) for absx in map(abs,vectorGS) ], 'd')
        return _np.concatenate( (v.reshape([KM]), gsVecNorm) )

  else:  # Verbose (DEBUG) version of objective_func
    
    def objective_func(vectorGS):
      gs.from_vector(vectorGS,G0=opt_G0, SP0=opt_SP0, SPAM=opt_SPAM, gates=opt_gates)
      gs.bulk_fill_probs(probs, spam_lbl_rows, evTree, probClipInterval, check) 
      weights = get_weights(probs)

      v = (probs - f) * weights;  chisq = _np.sum(v*v)
      nClipped = len((_np.logical_or(probs < minProbClipForWeighting, probs > (1-minProbClipForWeighting))).nonzero()[0])
      print("%g: p in (%g,%g),  weights in (%g,%g),  gs in (%g,%g), maxLen = %d, nClipped=%d" %
            (chisq, _np.min(probs), _np.max(probs), _np.min(weights), _np.max(weights), _np.min(vectorGS),
             _np.max(vectorGS), maxGateStringLength, nClipped))

      if regularizeFactor > 0:
        gsVecNorm = regularizeFactor * _np.array( [ max(0,absx-1.0) for absx in map(abs,vectorGS) ], 'd')
        return _np.concatenate( (v.reshape([KM]), gsVecNorm) )  
      else: return v.reshape([KM])

    
  # Jacobian function
  #  jac[k*M+m, l] = d( (probs-f)*weights )[k,m] / d vectorGS[l]
  #                = dprobs[k,m,l] * ( weights + (probs-f)*dweights/dp )[k,m]
  if verbosity <= 3: # Fast versions of functions
    if regularizeFactor == 0: # Fast un-regularized version
      def jacobian(vectorGS):
        gs.from_vector(vectorGS,G0=opt_G0, SP0=opt_SP0, SPAM=opt_SPAM, gates=opt_gates)
        gs.bulk_fill_dprobs(dprobs, spam_lbl_rows, evTree, 
                           G0=opt_G0, SP0=opt_SP0, SPAM=opt_SPAM, gates=opt_gates,
                           prMxToFill=probs, clipTo=probClipInterval, check=check)
        weights  = get_weights( probs )
        jac = dprobs * (weights+(probs-f)*get_dweights(probs, weights ))[:,:,None]  # (K,M,N) * (K,M,1)   (N = dim of vectorized gateset)
        jac = jac.reshape( [KM,vec_gs_len] )
        if check_jacobian: _opt.check_jac(objective_func, vectorGS, jac, tol=1e-3, eps=1e-6, errType='abs')

        # return shape == (KM, N)
        return jac

    else:
      def jacobian(vectorGS): # Fast regularized version
        gs.from_vector(vectorGS,G0=opt_G0, SP0=opt_SP0, SPAM=opt_SPAM, gates=opt_gates)
        gs.bulk_fill_dprobs(dprobs, spam_lbl_rows, evTree, 
                           G0=opt_G0, SP0=opt_SP0, SPAM=opt_SPAM, gates=opt_gates,
                           prMxToFill=probs, clipTo=probClipInterval, check=check)
        weights  = get_weights( probs )
        #derivative of the regularization residuals: diagonal, nonzero only where |x| > 1
        gsVecGrad = _np.diag( [ (regularizeFactor * _np.sign(x) if abs(x) > 1.0 else 0.0) for x in vectorGS ] ) # (N,N)
        jac = dprobs * (weights+(probs-f)*get_dweights( probs, weights ))[:,:,None]  # (K,M,N) * (K,M,1)   (N = dim of vectorized gateset)
        jac = _np.concatenate( (jac.reshape( [KM,vec_gs_len] ), gsVecGrad), axis=0 ) # (KM,N) + (N,N) = (KM+N,N)
        if check_jacobian: _opt.check_jac(objective_func, vectorGS, jac, tol=1e-3, eps=1e-6, errType='abs')

        # return shape == (KM+N, N)
        return jac

  else: # Verbose (DEBUG) version
    def jacobian(vectorGS):
      gs.from_vector(vectorGS,G0=opt_G0, SP0=opt_SP0, SPAM=opt_SPAM, gates=opt_gates)
      gs.bulk_fill_dprobs(dprobs, spam_lbl_rows, evTree, 
                         G0=opt_G0, SP0=opt_SP0, SPAM=opt_SPAM, gates=opt_gates,
                         prMxToFill=probs, clipTo=probClipInterval, check=check)
      weights  = get_weights( probs )

      #NOTE: zeroing the weights of clipped probabilities, zeroing insignificant
      # jacobian entries and rescaling large jacobians were all tried as ways
      # to tame leastsq and did not help, so none of them is done here.
  
      dPr_prefactor = (weights+(probs-f)*get_dweights( probs, weights )) # (K,M)
      jac = dprobs * dPr_prefactor[:,:,None] #  (K,M,N) * (K,M,1) = (K,M,N)  (N = dim of vectorized gateset)
      jac = jac.reshape( [KM,vec_gs_len] )

      if regularizeFactor > 0:
        gsVecGrad = _np.diag( [ (regularizeFactor * _np.sign(x) if abs(x) > 1.0 else 0.0) for x in vectorGS ] )
        jac = _np.concatenate( (jac, gsVecGrad), axis=0 ) # (KM,N) + (N,N) = (KM+N,N)

      nClipped = len((_np.logical_or(probs < minProbClipForWeighting, probs > (1-minProbClipForWeighting))).nonzero()[0])
      print("jac in (%g,%g), pr in (%g,%g),  dpr in (%g,%g), prefactor in (%g,%g), gs in (%g,%g) maxLen = %d, nClipped = %d" %
            (_np.min(jac), _np.max(jac), _np.min(probs), _np.max(probs), _np.min(dprobs), _np.max(dprobs), _np.min(dPr_prefactor), _np.max(dPr_prefactor),
             _np.min(vectorGS), _np.max(vectorGS), maxGateStringLength, nClipped))

      if check_jacobian:
        errSum, errs, fd_jac = _opt.check_jac(objective_func, vectorGS, jac, tol=1e-3, eps=1e-6, errType='abs')
        print("Jacobian has error %g and %d of %d indices with error > tol" % (errSum, len(errs), jac.shape[0]*jac.shape[1]))
        if len(errs) > 0:
          i,j = errs[0][0:2]; maxabs = _np.max(_np.abs(jac))
          print(" ==> Worst index = %d,%d. p=%g,  Analytic jac = %g, Fwd Diff = %g" % (i,j, (probs.reshape([KM]))[i], jac[i,j], fd_jac[i,j]))
          #two spaces before %s reproduce the separator of the former two-item print statements
          print(" ==> max err =  %s" % (errs[0][2],))
          print(" ==> max err/max =  %s" % (max([ x[2]/maxabs for x in errs ]),))

      return jac


  #Step 3: solve least squares minimization problem
  x0 = gs.to_vector(G0=opt_G0, SP0=opt_SP0, SPAM=opt_SPAM, gates=opt_gates)
  opt_x, opt_jac, info, msg, flag = \
      _spo.leastsq( objective_func, x0, xtol=tol, ftol=tol, gtol=tol,
               maxfev=maxfev*(len(x0)+1), full_output=True, Dfun=jacobian ) 
  full_minErrVec = objective_func(opt_x)  #note: calls gs.from_vector(opt_x,...) so don't need to call this again
  minErrVec = full_minErrVec if regularizeFactor == 0 else full_minErrVec[0:-len(x0)] #don't include regularization terms
  soln_gs = gs.copy()
  soln_gs.log("LSGST", { 'method': "leastsq", 'tol': tol,  'maxiter': maxiter, 
                     'opt_G0': opt_G0, 'opt_SP0': opt_SP0 } )

  if verbosity > 1:
    nGateStrings = len(gateStringsToUse)
    nDataParams  = nGateStrings*(len(dataset.get_spam_labels())-1) #number of independent parameters
                                                                 # in dataset (max. model # of params)
    try:
      nModelParams = gs.num_nongauge_params(opt_gates, opt_G0, opt_SPAM, opt_SP0) #len(x0)
    except Exception: #numpy can throw a LinAlgError; don't swallow KeyboardInterrupt/SystemExit
      print("Warning: could not obtain number of *non-gauge* parameters - using total params instead")
      nModelParams = gs.num_params(opt_gates, opt_G0, opt_SPAM, opt_SP0) #just use total number of params
      
    totChi2 = sum([x**2 for x in minErrVec])
    pvalue = 1.0 - _stats.chi2.cdf(totChi2,nDataParams-nModelParams) # reject GST model if p-value < threshold (~0.05?)
    print("  Sum of Chi^2 = %g (%d data params - %d model params = expected mean of %g; p-value = %g)" %
          (totChi2, nDataParams,  nModelParams, nDataParams-nModelParams, pvalue))
  
  return minErrVec, soln_gs

def do_mc2gst_with_model_selection(dataset, startGateset, dimDelta, gateStringsToUse,
                              maxiter=100000, maxfev=None, tol=1e-6,
                              opt_gates=True, opt_G0=True, opt_SPAM=True, opt_SP0=True,
                              cptp_penalty_factor=0, minProbClipForWeighting=1e-4, probClipInterval=None,
                              useFreqWeightedChiSq=False, regularizeFactor=0, verbosity=0,
                              check=False, check_jacobian=False, gatestringWeights=None, memLimit=None):
  """
  Performs Least-Squares Gate Set Tomography on the dataset, followed by
  model selection over the dimension of the gateset.

  A fit is first performed starting from startGateset.  Candidate models whose
  dimension is repeatedly decreased by dimDelta are then fit, and each is
  selected over the current best model when its chi^2 plus twice its number of
  parameters is lower.  Only if no decreased-dimension model was selected are
  models of increased dimension tried in the same way (stopping once a
  candidate has more parameters than there are gate strings).

  Parameters
  ----------
  dataset : DataSet
      The dataset to obtain counts from.

  startGateset : GateSet
      The GateSet used as a starting point for the least-squares
      optimization.

  dimDelta : integer
      Amount by which to increment or decrement the dimension of
      current gateset (initially startGateset) to obtain candidate
      alternative models for performing model selection.

  gateStringsToUse : list of (tuples or GateStrings)
      Each tuple contains gate labels and specifies a gate string whose
      probabilities are considered when trying to least-squares-fit the
      probabilities given in the dataset.
      e.g. [ (), ('Gx',), ('Gx','Gy') ]

  maxiter : int, optional
      Maximum number of iterations for the least squares optimization.

  maxfev : int, optional
      Maximum number of function evaluations for the least squares optimization.
      Defaults to maxiter.

  tol : float, optional
      The tolerance for the least squares optimization.

  opt_gates : bool, optional
      Whether the gate matrices should be optimized

  opt_G0 : bool, optional
      Whether the first row of gate matrices should be optimized.  If False, then
      when the startGateset has TP gates this will not be changed during the
      optimization and the resulting gates are guaranteed to be TP.

  opt_SPAM : bool, optional
      Whether the rhoVecs and EVecs should be optimized

  opt_SP0 : bool, optional
      Whether the first element of the state preparation vectors
      (i.e. the rhoVecs) should be optimized.  If False, then rhoVecs
      in startGateset that are trace == 1 will remain trace == 1
      after the optimization.

  cptp_penalty_factor : float, optional
      If greater than zero, the least squares optimization also contains CPTP penalty
      terms which penalize non-CPTP-ness of the gateset being optimized.  This factor
      multiplies these CPTP penalty terms.

  minProbClipForWeighting : float, optional
      Sets the minimum and maximum probability p allowed in the chi^2 weights: N/(p*(1-p))
      by clipping probability p values to lie within the interval
      [ minProbClipForWeighting, 1-minProbClipForWeighting ].

  probClipInterval : 2-tuple or None, optional
     (min,max) values used to clip the probabilities predicted by gatesets during LSGST's
     least squares search for an optimal gateset (if not None).  Defaults to no clipping.

  useFreqWeightedChiSq : bool, optional
      If True, objective function uses only an approximate chi^2 weighting:  N/(f*(1-f))
      where f is the frequency obtained from the dataset, instead of the true chi^2: N/(p*(1-p))
      where p is a predicted probability.  Defaults to False, and only should use
      True for backward compatibility.

  regularizeFactor : float, optional
      Multiplicative prefactor of L2-like regularization term that penalizes gateset entries
      which have absolute value greater than 1.  When set to 0, no regularization is applied.

  verbosity : int, optional
      How much detail to send to stdout.  Note that the one-line summary of each
      candidate model is printed regardless of this value.

  check : boolean, optional
      If True, perform extra checks within code to verify correctness.  Used
      for testing, and runs much slower when True.  Defaults to False.

  check_jacobian : boolean, optional
      If True, compare the analytic Jacobian with a forward finite difference Jacobian
      and print warning messages if there is disagreement.  Defaults to False.

  gatestringWeights : numpy array, optional
      An array of length len(gateStringsToUse).  Each element scales the
      least-squares term of the corresponding gate string in gateStringsToUse.
      The default is no weight scaling at all.

  memLimit : int, optional
      A rough memory limit in bytes which restricts the amount of intermediate
      values that are computed and stored.


  Returns
  -------
  errorVec : numpy array
      Minimum error values v = f(x_best), where f(x)**2 is the function being minimized,
      for the selected model.
  gateset : GateSet
      GateSet containing the estimated gates of the selected model.
  """

  dim = len(startGateset.rhoVecs[0])
  nStrings = len(gateStringsToUse)

  #Run do_mc2gst multiple times - one for the starting Gateset and one for the starting gateset
  # with increased or decreased dimension as per dimDelta
  if verbosity > 2: print("")
  if verbosity > 1:
    print("--- Least Squares GST with model selection (starting dim = %d) ---" % dim)

  #convert list of GateStrings to list of raw tuples since that's all we'll need
  if len(gateStringsToUse) > 0 and isinstance(gateStringsToUse[0],_objs.GateString):
    gateStringsToUse = [ gstr.tup for gstr in gateStringsToUse ]

  def fit(gateset):
    """ Run a single least-squares fit starting from `gateset` with this call's settings. """
    # NOTE(review): the literal None fills a positional do_mc2gst parameter that sits between
    #  gatestringWeights and memLimit; its meaning is not visible from here.
    return do_mc2gst(dataset, gateset, gateStringsToUse, maxiter, maxfev, tol,
                     opt_gates, opt_G0, opt_SPAM, opt_SP0, cptp_penalty_factor,
                     minProbClipForWeighting, probClipInterval,
                     useFreqWeightedChiSq, regularizeFactor, verbosity,
                     check, check_jacobian, gatestringWeights, None, memLimit)

  def count_params(gateset):
    """ Number of parameters of `gateset` that are being optimized. """
    return len(gateset.to_vector(G0=opt_G0, SP0=opt_SP0, SPAM=opt_SPAM, gates=opt_gates))

  def report(msResult, candDim, chiSq, nParams, chi2diff, paramDiff):
    """ Print the one-line model selection summary for a candidate model. """
    #chi2diff is a float, so it is formatted with %g (using %d would silently truncate it)
    print("%s dim %d: chi^2 = %g (%+g w.r.t. expected mean of %d strings - %d params = %d) (dChi^2=%g, 2*dParams=%d)" % \
        (msResult, candDim, chiSq, chiSq-(nStrings-nParams), nStrings, nParams, nStrings-nParams, chi2diff, 2*paramDiff))

  minErr, gs = fit(startGateset)
  chiSqBest = sum([x**2 for x in minErr]) #using only gateStringsToUse
  nParamsBest = count_params(startGateset)
  origGS = bestGS = gs
  bestMinErr = minErr

  print("Dim %d: chi^2 = %g, nGateStrings=%d, nParams=%d (so expected mean = %d)" % \
      (dim, chiSqBest, nStrings, nParamsBest, nStrings-nParamsBest))

  #Notes on Model selection test:
  # compare chi2 - 2*(nStrings-nParams) for each model -- select lower one
  # So compare:  chi2_A - 2*nStrings + 2*nParams_A <> chi2_B - 2*nStrings + 2*nParamsB
  #              chi2_A + 2*nParams_A <> chi2_B + 2*nParams_B
  #              chi2_A - chi2_B <> 2*(nParams_B - nParams_A)

  #try decreasing the dimension
  curDim = dim
  curStartGateset = origGS
  while True:
    curDim -= dimDelta
    if curDim <= 0: break # a gateset cannot be reduced to a non-positive dimension
    curStartGateset = curStartGateset.decrease_dimension(curDim)
    nParams = count_params(curStartGateset)

    minErr, gs = fit(curStartGateset)
    chiSq = sum([x**2 for x in minErr]) #using only gateStringsToUse

    #Model selection test (differences are relative to the best model *before* any update)
    chi2diff = chiSq - chiSqBest
    paramDiff = nParams - nParamsBest
    selected = (chiSqBest - chiSq) > 2*(nParams - nParamsBest) # equivalently: -chi2diff > 2*paramDiff
    if selected:
      bestGS, bestMinErr, chiSqBest, nParamsBest = gs, minErr, chiSq, nParams

    report("Selected" if selected else "Rejected", curDim, chiSq, nParams, chi2diff, paramDiff)
    if not selected: break

  #try increasing the dimension, but only if we didn't decrease the dimension.
  # (Previously this tested "curDim == dim" immediately after resetting curDim to dim,
  #  which was always true; checking whether the best model is still the original
  #  fit implements the intended condition.)
  curDim = dim
  curStartGateset = origGS
  tryIncreasedDim = (bestGS is origGS)

  while tryIncreasedDim:
    curDim += dimDelta
    curStartGateset = curStartGateset.increase_dimension(curDim)
    curStartGateset = curStartGateset.kick(0.01) #give random kick here??
    nParams = count_params(curStartGateset)
    if nParams > nStrings:
      #Future: do "LSGST" for underconstrained nonlinear problems -- or just double up?
      break

    minErr, gs = fit(curStartGateset)
    chiSq = sum([x**2 for x in minErr]) #using only gateStringsToUse

    #Model selection test (differences are relative to the best model *before* any update)
    chi2diff = chiSq - chiSqBest
    paramDiff = nParams - nParamsBest
    selected = (chiSqBest - chiSq) > 2*(nParams - nParamsBest) # equivalently: -chi2diff > 2*paramDiff
    if selected:
      bestGS, bestMinErr, chiSqBest, nParamsBest = gs, minErr, chiSq, nParams

    report("Selected" if selected else "Rejected", curDim, chiSq, nParams, chi2diff, paramDiff)
    tryIncreasedDim = selected

  return bestMinErr, bestGS


def do_iterative_mc2gst(dataset, startGateset, gateStringSetsToUseInEstimation,
                     maxiter=100000, maxfev=None, tol=1e-6,
                     opt_gates=True, opt_G0=True, opt_SPAM=True, opt_SP0=True,
                     cptp_penalty_factor=0, minProbClipForWeighting=1e-4, probClipInterval=None,
                     useFreqWeightedChiSq=False, regularizeFactor=0, returnErrorVec=False, returnAll=False,
                     gateStringSetLabels=None, verbosity=0, check=False, check_jacobian=False,
                     gatestringWeightsDict=None, memLimit=None):
  """
  Performs Iterative Least Squares Gate Set Tomography on the dataset.

  The gateset estimated in one iteration is used as the starting point
  of the next iteration.

  Parameters
  ----------
  dataset : DataSet
      The data used to generate LSGST gate estimates

  startGateset : GateSet
      The GateSet used as a starting point for the least-squares
      optimization.  It is copied, not modified.

  gateStringSetsToUseInEstimation : list of lists of (tuples or GateStrings)
      The i-th element is a list of the gate strings to be used in the i-th iteration
      of LSGST.  Each element of these lists is a gate string, specified as
      either a GateString object or as a tuple of gate labels (but all must be specified
      using the same type).  Elements which are None or empty are skipped.
      e.g. [ [ (), ('Gx',) ], [ (), ('Gx',), ('Gy',) ], [ (), ('Gx',), ('Gy',), ('Gx','Gy') ]  ]

  maxiter : int, optional
      Maximum number of iterations for the least squares optimization.

  maxfev : int, optional
      Maximum number of function evaluations for the least squares optimization.

  tol : float, optional
      The tolerance for the least squares optimization.

  opt_gates : bool, optional
      Whether the gate matrices should be optimized

  opt_G0 : bool, optional
      Whether the first row of gate matrices should be optimized.  If False, then
      when the startGateset has TP gates this will not be changed during the
      optimization and the resulting gates are guaranteed to be TP.

  opt_SPAM : bool, optional
      Whether the rhoVecs and EVecs should be optimized

  opt_SP0 : bool, optional
      Whether the first element of the state preparation vectors
      (i.e. the rhoVecs) should be optimized.  If False, then rhoVecs
      in startGateset that are trace == 1 will remain trace == 1
      after the optimization.

  cptp_penalty_factor : float, optional
      If greater than zero, the least squares optimization also contains CPTP penalty
      terms which penalize non-CPTP-ness of the gateset being optimized.  This factor
      multiplies these CPTP penalty terms.

  minProbClipForWeighting : float, optional
      Sets the minimum and maximum probability p allowed in the chi^2 weights: N/(p*(1-p))
      by clipping probability p values to lie within the interval
      [ minProbClipForWeighting, 1-minProbClipForWeighting ].

  probClipInterval : 2-tuple or None, optional
     (min,max) values used to clip the probabilities predicted by gatesets during LSGST's
     least squares search for an optimal gateset (if not None).  Defaults to no clipping.

  useFreqWeightedChiSq : bool, optional
      If True, objective function uses only an approximate chi^2 weighting:  N/(f*(1-f))
      where f is the frequency obtained from the dataset, instead of the true chi^2: N/(p*(1-p))
      where p is a predicted probability.  Defaults to False, and only should use
      True for backward compatibility.

  regularizeFactor : float, optional
      Multiplicative prefactor of L2-like regularization term that penalizes gateset entries
      which have absolute value greater than 1.  When set to 0, no regularization is applied.

  returnErrorVec : bool, optional
      If True, return (errorVec, gateset), or (errorVecs, gatesets) if
      returnAll == True, instead of just the gateset or gatesets.

  returnAll : bool, optional
      If True return a list of gatesets (and errorVecs if returnErrorVec == True),
      one per iteration that was actually run, instead of the results from just
      the final iteration.

  gateStringSetLabels : list of strings, optional
      An identification label for each of the gate string sets (used for displaying
      progress).  Must be the same length as gateStringSetsToUseInEstimation.

  verbosity : int, optional
      How much detail to send to stdout.

  check : boolean, optional
      If True, perform extra checks within code to verify correctness.  Used
      for testing, and runs much slower when True.

  check_jacobian : boolean, optional
      If True, compare the analytic Jacobian with a forward finite difference Jacobian
      and print warning messages if there is disagreement.

  gatestringWeightsDict : dictionary, optional
      A dictionary with keys == gate strings and values == multiplicative scaling
      factor for the corresponding gate string. The default is no weight scaling at all.

  memLimit : int, optional
      A rough memory limit in bytes which restricts the amount of intermediate
      values that are computed and stored.


  Returns
  -------
  gateset               if returnAll == False and returnErrorVec == False
  gatesets              if returnAll == True  and returnErrorVec == False
  (errorVec, gateset)   if returnAll == False and returnErrorVec == True
  (errorVecs, gatesets) if returnAll == True  and returnErrorVec == True
      where errorVec is a numpy array of minimum error values v = f(x_min), where f(x)**2 is
      the function being minimized, gateset is the GateSet containing the final estimated gates.
      In cases when returnAll == True, gatesets and errorVecs are lists holding the gateset
      and errorVec of each iteration that was run (None or empty gate string sets are skipped
      and contribute no entry).  If no iteration was run at all, errorVec is None and gateset
      is a copy of startGateset.
  """

  #convert lists of GateStrings to lists of raw tuples since that's all we'll need
  # (the type is detected from the first gate string of the first set, which may be None or empty)
  firstSet = gateStringSetsToUseInEstimation[0] if len(gateStringSetsToUseInEstimation) > 0 else None
  if firstSet is not None and len(firstSet) > 0 and isinstance(firstSet[0],_objs.GateString):
    gateStringLists = [ (None if gsList is None else [gstr.tup for gstr in gsList])
                        for gsList in gateStringSetsToUseInEstimation ]
  else:
    gateStringLists = gateStringSetsToUseInEstimation

  #Run extended LSGST iteratively on given sets of estimatable strings
  lsgstGatesets = [ ]; minErrs = [ ] #for returnAll == True case
  lsgstGateset = startGateset.copy(); nIters = len(gateStringLists)
  minErr = None #remains None when every gate string set is skipped
  for (i,stringsToEstimate) in enumerate(gateStringLists):
    #compute the count up front so that a None set can be reported (as 0 strings) and skipped
    nStringsToEstimate = 0 if stringsToEstimate is None else len(stringsToEstimate)

    if verbosity > 1: print("")
    if verbosity > 0:
      print("--- Iterative LSGST: Beginning iter %d of %d %s: %d gate strings ---" \
                  % (i+1,nIters,("(%s) " % gateStringSetLabels[i]) if gateStringSetLabels else "", nStringsToEstimate))
      _sys.stdout.flush()

    if nStringsToEstimate == 0: continue

    if gatestringWeightsDict is not None:
      #start from unit weights and override those of strings present in this iteration
      gatestringWeights = _np.ones( nStringsToEstimate, 'd')
      for gatestr,weight in gatestringWeightsDict.items():
        if gatestr in stringsToEstimate:
          gatestringWeights[ stringsToEstimate.index(gatestr) ] = weight
    else: gatestringWeights = None

    # NOTE(review): the literal None fills a positional do_mc2gst parameter that sits between
    #  gatestringWeights and memLimit; its meaning is not visible from here.
    minErr, lsgstGateset = do_mc2gst( dataset, lsgstGateset, stringsToEstimate,
                                    maxiter, maxfev, tol, opt_gates, opt_G0, opt_SPAM, opt_SP0,
                                    cptp_penalty_factor, minProbClipForWeighting, probClipInterval,
                                    useFreqWeightedChiSq, regularizeFactor, verbosity, check, check_jacobian,
                                    gatestringWeights, None, memLimit)
    if returnAll:
      lsgstGatesets.append(lsgstGateset)
      minErrs.append(minErr)

  if returnErrorVec:
    return (minErrs, lsgstGatesets) if returnAll else (minErr, lsgstGateset)
  else:
    return lsgstGatesets if returnAll else lsgstGateset


def do_iterative_mc2gst_with_model_selection(dataset, startGateset, dimDelta, gateStringSetsToUseInEstimation,
                                       maxiter=100000, maxfev=None, tol=1e-6,
                                       opt_gates=True, opt_G0=True, opt_SPAM=True, opt_SP0=True,
                                       cptp_penalty_factor=0, minProbClipForWeighting=1e-4, probClipInterval=None,
                                       useFreqWeightedChiSq=False, regularizeFactor=0, returnErrorVec=False, returnAll=False,
                                       gateStringSetLabels=None, verbosity=0, check=False, check_jacobian=False,
                                       gatestringWeightsDict=None, memLimit=None):
  """
  Performs Iterative Least Squares Gate Set Tomography on the dataset, and at
  each iteration tests the current gateset model against gateset models with
  an increased and/or decreased dimension (model selection).

  The gateset selected in one iteration is used as the starting point
  of the next iteration.

  Parameters
  ----------
  dataset : DataSet
      The data used to generate LSGST gate estimates

  startGateset : GateSet
      The GateSet used as a starting point for the least-squares
      optimization.  It is copied, not modified.

  dimDelta : integer
      Amount by which to increment or decrement the dimension of the
      current gateset when performing model selection

  gateStringSetsToUseInEstimation : list of lists of (tuples or GateStrings)
      The i-th element lists the gate strings to be used in the i-th iteration of LSGST.
      Each element is a list of gate label tuples or of GateString objects (but all
      must be specified using the same type).  Elements which are None or empty are skipped.
      e.g. [ [ (), ('Gx',) ], [ (), ('Gx',), ('Gy',) ], [ (), ('Gx',), ('Gy',), ('Gx','Gy') ]  ]

  maxiter : int, optional
      Maximum number of iterations for the least squares optimization.

  maxfev : int, optional
      Maximum number of function evaluations for the least squares optimization.
      Defaults to maxiter.

  tol : float, optional
      The tolerance for the least squares optimization.

  opt_gates : bool, optional
      Whether the gate matrices should be optimized

  opt_G0 : bool, optional
      Whether the first row of gate matrices should be optimized.  If False, then
      when the startGateset has TP gates this will not be changed during the
      optimization and the resulting gates are guaranteed to be TP.

  opt_SPAM : bool, optional
      Whether the rhoVecs and EVecs should be optimized

  opt_SP0 : bool, optional
      Whether the first element of the state preparation vectors
      (i.e. the rhoVecs) should be optimized.  If False, then rhoVecs
      in startGateset that are trace == 1 will remain trace == 1
      after the optimization.  Defaults to True.

  cptp_penalty_factor : float, optional
      If greater than zero, the least squares optimization also contains CPTP penalty
      terms which penalize non-CPTP-ness of the gateset being optimized.  This factor
      multiplies these CPTP penalty terms.

  minProbClipForWeighting : float, optional
      Sets the minimum and maximum probability p allowed in the chi^2 weights: N/(p*(1-p))
      by clipping probability p values to lie within the interval
      [ minProbClipForWeighting, 1-minProbClipForWeighting ].

  probClipInterval : 2-tuple or None, optional
     (min,max) values used to clip the probabilities predicted by gatesets during LSGST's
     least squares search for an optimal gateset (if not None).  Defaults to no clipping.

  useFreqWeightedChiSq : bool, optional
      If True, objective function uses only an approximate chi^2 weighting:  N/(f*(1-f))
      where f is the frequency obtained from the dataset, instead of the true chi^2: N/(p*(1-p))
      where p is a predicted probability.  Defaults to False, and only should use
      True for backward compatibility.

  regularizeFactor : float, optional
      Multiplicative prefactor of L2-like regularization term that penalizes gateset entries
      which have absolute value greater than 1.  When set to 0, no regularization is applied.
      Defaults to 0.

  returnErrorVec : bool, optional
      If True, return (errorVec, gateset), or (errorVecs, gatesets) if
      returnAll == True, instead of just the gateset or gatesets.

  returnAll : bool, optional
      If True return a list of gatesets (and errorVecs if returnErrorVec == True),
      one per iteration that was actually run, instead of the results from just
      the final iteration.

  gateStringSetLabels : list of strings, optional
      An identification label for each of the gate string sets (used for displaying
      progress).  Must be the same length as gateStringSetsToUseInEstimation.

  verbosity : int, optional
      How much detail to send to stdout.

  check : boolean, optional
      If True, perform extra checks within code to verify correctness.  Used
      for testing, and runs much slower when True.

  check_jacobian : boolean, optional
      If True, compare the analytic Jacobian with a forward finite difference Jacobian
      and print warning messages if there is disagreement.

  gatestringWeightsDict : dictionary, optional
      A dictionary with keys == gate strings and values == multiplicative scaling
      factor for the corresponding gate string. The default is no weight scaling at all.

  memLimit : int, optional
      A rough memory limit in bytes which restricts the amount of intermediate
      values that are computed and stored.


  Returns
  -------
  gateset               if returnAll == False and returnErrorVec == False
  gatesets              if returnAll == True  and returnErrorVec == False
  (errorVec, gateset)   if returnAll == False and returnErrorVec == True
  (errorVecs, gatesets) if returnAll == True  and returnErrorVec == True
      where errorVec is a numpy array of minimum error values v = f(x_min), where f(x)**2 is
      the function being minimized, gateset is the GateSet containing the final estimated gates.
      In cases when returnAll == True, gatesets and errorVecs are lists holding the gateset
      and errorVec of each iteration that was run (None or empty gate string sets are skipped
      and contribute no entry).  If no iteration was run at all, errorVec is None and gateset
      is a copy of startGateset.
  """

  #convert lists of GateStrings to lists of raw tuples since that's all we'll need
  # (the type is detected from the first gate string of the first set, which may be None or empty)
  firstSet = gateStringSetsToUseInEstimation[0] if len(gateStringSetsToUseInEstimation) > 0 else None
  if firstSet is not None and len(firstSet) > 0 and isinstance(firstSet[0],_objs.GateString):
    gateStringLists = [ (None if gsList is None else [gstr.tup for gstr in gsList])
                        for gsList in gateStringSetsToUseInEstimation ]
  else:
    gateStringLists = gateStringSetsToUseInEstimation

  #Run extended LSGST iteratively on given sets of estimatable strings
  lsgstGatesets = [ ]; minErrs = [ ] #for returnAll == True case
  lsgstGateset = startGateset.copy(); nIters = len(gateStringLists)
  minErr = None #remains None when every gate string set is skipped
  for (i,stringsToEstimate) in enumerate(gateStringLists):
    #compute the count up front so that a None set can be reported (as 0 strings) and skipped
    nStringsToEstimate = 0 if stringsToEstimate is None else len(stringsToEstimate)

    if verbosity > 1: print("")
    if verbosity > 0:
      print("--- Iterative LSGST: Beginning iter %d of %d %s: %d gate strings ---" \
                  % (i+1,nIters,("(%s) " % gateStringSetLabels[i]) if gateStringSetLabels else "", nStringsToEstimate))
      _sys.stdout.flush()

    if nStringsToEstimate == 0: continue

    if gatestringWeightsDict is not None:
      #start from unit weights and override those of strings present in this iteration
      gatestringWeights = _np.ones( nStringsToEstimate, 'd')
      for gatestr,weight in gatestringWeightsDict.items():
        if gatestr in stringsToEstimate:
          gatestringWeights[ stringsToEstimate.index(gatestr) ] = weight
    else: gatestringWeights = None

    minErr, lsgstGateset = do_mc2gst_with_model_selection( dataset, lsgstGateset, dimDelta, stringsToEstimate,
                                    maxiter, maxfev, tol, opt_gates, opt_G0, opt_SPAM, opt_SP0,
                                    cptp_penalty_factor, minProbClipForWeighting, probClipInterval,
                                    useFreqWeightedChiSq, regularizeFactor, verbosity, check, check_jacobian,
                                    gatestringWeights, memLimit)
    if returnAll:
      lsgstGatesets.append(lsgstGateset)
      minErrs.append(minErr)

  if returnErrorVec:
    return (minErrs, lsgstGatesets) if returnAll else (minErr, lsgstGateset)
  else:
    return lsgstGatesets if returnAll else lsgstGateset


###################################################################################
#                 Maximum Likelihood Estimation GST (MLEGST)
##################################################################################

#OLD args: method='leastsq', constrainToCP=False, constrainToValidSpam=False, constrainType='wall',
#    method : string, optional
#        The method used to optimize the log-likelihood function.  Can be any method
#        known by scipy.optimize.minimize such as 'BFGS', 'Nelder-Mead', 'CG', 'L-BFGS-B',
#        or additionally:
#
#        - 'leastsq' -- least squares minimization of logl term by term
#        - 'custom' -- custom CG that often works better than 'CG'
#        - 'supersimplex' -- repeated application of 'Nelder-Mead' to converge it
#        - 'basinhopping' -- scipy.optimize.basinhopping using L-BFGS-B as a local optimizer
#        - 'swarm' -- particle swarm global optimization algorithm
#        - 'evolve' -- evolutionary global optimization algorithm using DEAP
#        - 'brute' -- Experimental: scipy.optimize.brute using 4 points along each dimensions
#  
#    constrainToCP : bool, optional
#        Whether to constrain the optimization over gatesets to CP gatesets only. Note
#        that constraining to TP is instead accomplished by setting opt_G0 to False and
#        using a TP startingGateset.
#  
#    constrainToValidSpam : bool, optional
#        Whether to constrain the optimization to valid state preparations and measurements.
#        This means the prepared density matrix must be positive semidefinite with trace 1, and
#        the effects must have eigenvalues between 0 and 1.
#        
#    constrainType : string, optional
#        How constraint violations are implemented.  Can be:
#
#        - 'wall' -- objective function gets crazy large when a constraint is violated
#        - 'projection' -- objective function projects solution back into valid space 
#           and returns the value there when a constraint is violated
#  
#    verbosity : int, optional
#        How much detail to send to stdout.


def do_mlgst(dataset, startGateset, gateStringsToUse,
             maxiter=100000, maxfev=None, tol=1e-6,
             opt_gates=True, opt_G0=True, opt_SPAM=True, opt_SP0=True,
             minProbClip=1e-4, probClipInterval=None, radius=1e-4, poissonPicture=True,
             verbosity=0, check=False, gateLabelAliases=None, memLimit=None):

    """
    Performs Maximum Likelihood Estimation Gate Set Tomography on the dataset.

    The negative log-likelihood (relative to its theoretical upper bound) is
    written as a sum of squares and minimized with scipy.optimize.leastsq,
    using an analytic Jacobian.

    Parameters
    ----------
    dataset : DataSet
        The data used to generate MLEGST gate estimates

    startGateset : GateSet
        The GateSet used as a starting point for the maximum-likelihood estimation.
        It is copied; the copy is what gets optimized and returned.

    gateStringsToUse : list
        The gate strings whose probabilities enter the log-likelihood.  When
        gateLabelAliases is given, the alias-expanded form of each string is
        used to look up counts in the dataset while the un-expanded form is
        used to compute probabilities from the gateset.

    maxiter : int, optional
        Maximum number of iterations for the optimization.  Within this
        function it only serves as the default for maxfev and is recorded
        in the gateset's log.

    maxfev : int, optional
        Maximum number of function evaluations for the optimization.
        Defaults to maxiter.  The value handed to leastsq is
        maxfev * (number of parameters + 1).

    tol : float, optional
        The tolerance for the least squares optimization (used as both
        xtol and ftol).

    opt_gates : bool, optional
        Whether the gate matrices should be optimized

    opt_G0 : bool, optional
        Whether the first row of gate matrices should be optimized.  If False, then
        when the startGateset has TP gates this will not be changed during the
        optimization and the resulting gates are guaranteed to be TP.

    opt_SPAM : bool, optional
        Whether the rhoVecs and EVecs should be optimized

    opt_SP0 : bool, optional
        Whether the first element of the state preparation vectors
        (i.e. the rhoVecs) should be optimized.  If False, then rhoVecs
        in startingGateset that are trace == 1 will remain trace == 1
        after the optimization.

    minProbClip : float, optional
        The minimum probability treated normally in the evaluation of the log-likelihood.
        A penalty function replaces the true log-likelihood for probabilities that lie
        below this threshold so that the log-likelihood never becomes undefined (which improves
        optimizer performance).

    probClipInterval : 2-tuple or None, optional
        (min,max) values used to clip the probabilities predicted by gatesets during MLEGST's
        search for an optimal gateset (if not None).  Defaults to no clipping.

    radius : float, optional
        Specifies the severity of rounding used to "patch" the zero-frequency
        terms of the log-likelihood.

    poissonPicture : boolean, optional
        Whether the Poisson-picture log-likelihood should be used.

    verbosity : int, optional
        How much detail to send to stdout.

    check : boolean, optional
      If True, perform extra checks within code to verify correctness.  Used
      for testing, and runs much slower when True.

    gateLabelAliases : dictionary, optional
      Dictionary whose keys are gate label "aliases" and whose values are tuples
      corresponding to what that gate label should be expanded into before querying
      the dataset. Defaults to the empty dictionary (no aliases defined)
      e.g. gateLabelAliases['Gx^3'] = ('Gx','Gx','Gx')

    memLimit : int, optional
      A rough memory limit in bytes which restricts the amount of intermediate
      values that are computed and stored.


    Returns
    -------
    maxLogL : float
        The maximum log-likelihood obtained.
    gateset : GateSet
        The gate set that maximized the log-likelihood.

    Raises
    ------
    MemoryError
        If memLimit is smaller than the estimated memory needed to hold the
        final results.
    ValueError
        If an alias that occurs in a gate string expands into a string that
        contains the alias itself (the expansion could never terminate).
    """
    gs = startGateset.copy()
    if maxfev is None: maxfev = maxiter

    # NOTE: single-argument print(...) behaves identically as a Python 2 print
    #  statement and as a Python 3 function call.
    if verbosity > 2: print("")
    if verbosity > 1:
        print("--- MLEGST ---")

    spamLabels = gs.get_spam_labels() #this list fixes the ordering of the spam labels
    vec_gs_len = gs.num_params(opt_gates,opt_G0,opt_SPAM,opt_SP0)

    if gateLabelAliases is not None: #then find & replace aliased gate labels with their expanded form
        dsGateStringsToUse = []
        for s in gateStringsToUse:
            for label,expandedStr in gateLabelAliases.items():
                expansion = tuple(expandedStr)
                while label in tuple(s):
                    if label in expansion: # replacing would re-introduce the label forever
                        raise ValueError("Gate label alias %r expands into a string containing itself" % (label,))
                    i = tuple(s).index(label)
                    s = tuple(s)[:i] + expansion + tuple(s)[i+1:]
            dsGateStringsToUse.append(s)
    else:
        dsGateStringsToUse = gateStringsToUse # no difference in the strings used by the alias


    #Memory estimates - maybe make GateSet methods to get intermediate memory estimates
    #FOR NOW - just taken from LSGST which should be ballpark correct -- TODO: recalc these estimates for MLE...
    ns = len(spamLabels); ng = len(gateStringsToUse); ne = vec_gs_len; gd = len(gs.rhoVecs[0])
    persistentMem = 8* (ng*(ns + ns*ne + 1 + 3*ns)) # Memory needed by final results in bytes
    intermedMem   = 8* (ng*(1 + gd**2 * (1 + ne))) # Memory needed by intermediate results in bytes (now just ~ that of dproduct)
    C = 1.0/1024.0**3 # bytes -> GB conversion factor

    maxEvalSubTreeSize = None
    if memLimit is not None:
        if memLimit < persistentMem:
            raise MemoryError("Memory limit (%g GB) is < memory required to hold final results (%g GB)" % (memLimit*C, persistentMem*C))
        if memLimit < intermedMem:
            # shrink the evaluation (sub)tree size by the factor we exceed the limit
            reductionFactor = float(intermedMem) / float(memLimit)
            maxEvalSubTreeSize = int(ng / reductionFactor)

    if verbosity > 2:
        print("Peristent Memory estimate: %d spam labels, %d gate strings, %d gateset params" % (ns,ng,ne))
        print("    ==> %g GB (p) + %g GB (dp) + %g GB (other) = %g GB (total)" %
              (8*ns*ng*C, 8*ns*ng*ne*C,8*(ng+3*ns*ng)*C, persistentMem*C))
        print("Intermediate Memory estimate: %d gate strings, %d gate dimension, %d gateset params" % (ng,gd,ne))
        print("    ==> %g GB (p) + %g GB (dp) + %g GB (other) = %g GB (total)" %
              (8*ng*gd*gd*C, 8*ng*gd*gd*ne*C,8*ng*C, intermedMem*C))
        if memLimit is not None: print("Memory limit = %g GB" % (memLimit*C))
        if maxEvalSubTreeSize is not None: print("Maximum eval sub-tree size = %d" % maxEvalSubTreeSize)


    evTree = gs.bulk_evaltree(gateStringsToUse)

    if maxEvalSubTreeSize is not None:
        evTree.split(maxEvalSubTreeSize)
        if verbosity > 2:
            print("Memory limit has imposed a division of the evaluation tree:")
            evTree.print_analysis()


    spam_lbl_rows = { sl:i for (i,sl) in enumerate(spamLabels) }

    # Work arrays, filled in place by the gateset's bulk_fill_* methods on every
    #  objective / jacobian evaluation.
    cntVecMx = _np.empty( (ns,ng), 'd' )
    probs = _np.empty( (ns,ng), 'd' )
    dprobs = _np.empty( (ns,ng,vec_gs_len) )

    _tools.fill_count_vecs(cntVecMx, spam_lbl_rows, dataset, dsGateStringsToUse)
    logL_upperbound = _tools.logl_max(dataset, dsGateStringsToUse, cntVecMx, poissonPicture) # The theoretical upper bound on the log(likelihood)

    totalCntVec = _np.sum(cntVecMx, axis=0)
    totCnts = totalCntVec[None,:] # N[i], broadcastable against the K x M arrays below
    minusCntVecMx = -1.0 * cntVecMx
    zeroFreq = (minusCntVecMx == 0) # mask of the f == 0 (zero-count) terms

    freqs = cntVecMx / totCnts
    freqs_nozeros = _np.where(cntVecMx == 0, 1.0, freqs) # set zero freqs to 1.0 so np.log doesn't complain
    if poissonPicture:
        freqTerm = cntVecMx * ( _np.log(freqs_nozeros) - 1.0 )
    else:
        freqTerm = cntVecMx * _np.log(freqs_nozeros)
    freqTerm[ cntVecMx == 0 ] = 0.0 # set 0 * log(0) terms explicitly to zero since numpy doesn't know this limiting behavior

    KM = ns*ng #shorthand for this combined dimension used below
    min_p = minProbClip
    a = radius # parameterizes "roundness" of f == 0 terms

    # Poisson picture:
    #
    #   L = prod_{i,sl} lambda_{i,sl}^N_{i,sl} e^{-lambda_{i,sl}} / N_{i,sl}!
    #
    #   where lambda_{i,sl} := N[i]*p_{i,sl} is a rate, i indexes the gate string and sl
    #   the spam label.  N[i] is the total counts for the i-th gatestring, so
    #   sum_{sl} N_{i,sl} == N[i].  Ignoring the p-independent N_{i,sl}! and log(N[i]) terms:
    #
    #   log L = sum_{i,sl} N_{i,sl} log(p_{i,sl}) - N[i]*p_{i,sl}
    #
    #   so each leastsq term == sqrt( N_{i,sl} * -log(p_{i,sl}) + N[i] * p_{i,sl} + freqTerm )
    #
    # Standard picture:
    #
    #   L = prod_{i,sl} p_{i,sl}^N_{i,sl}   ==>   log L = sum_{i,sl} N_{i,sl} log(p_{i,sl})
    #
    #   so each leastsq term == sqrt( N_{i,sl} * -log(p_{i,sl}) + freqTerm )
    #
    # In both pictures, for p < min_p the term under the sqrt is replaced by its
    #  second-order expansion about min_p:  value(min_p) + S*(p-min_p) + S2*(p-min_p)^2,
    #  where S and S2 are the first derivative and half the second derivative of the
    #  term at min_p.  Neither depends on the gateset parameters, so they are computed once.
    #
    # See LikelihoodFunctions.py for details on patching
    S2 = -0.5 * minusCntVecMx / (min_p**2)
    if poissonPicture:
        S = minusCntVecMx / min_p + totCnts
    else:
        S = minusCntVecMx / min_p

    def _sqrt_terms():
        """ Returns (pos_probs, v): the clipped probabilities and the K x M array of
            leastsq terms computed from the *current* contents of `probs`
            (K = nSpamLabels, M = nGateStrings). """
        pos_probs = _np.where(probs < min_p, min_p, probs)
        v = freqTerm + minusCntVecMx * _np.log(pos_probs)
        if poissonPicture:
            v = v + totCnts*pos_probs
        v = _np.maximum(v,0)  #remove small negative elements due to roundoff error (above expression *cannot* really be negative)
        v = _np.where( probs < min_p, v + S*(probs - min_p) + S2*(probs - min_p)**2, v) #quadratic extrapolation of logl at min_p for probabilities < min_p
        if poissonPicture:
            #special handling for f == 0 terms: N[i]*p for p >= a, and below a the cubic
            # -p^3/(3a^2) + p^2/a + a/3 which matches value and slope at p == a
            v = _np.where( zeroFreq, totCnts * _np.where(probs >= a, probs, (-1.0/(3*a**2))*probs**3 + probs**2/a + a/3.0), v)
        else:
            v = _np.where( zeroFreq, 0.0, v)
        #Note: no test for whether probs is in [0,1] so no guarantee that
        #      sqrt is well defined unless probClipInterval is set within [0,1].
        return pos_probs, _np.sqrt( v )

    def objective_func(vectorGS):
        gs.from_vector(vectorGS,G0=opt_G0, SP0=opt_SP0, SPAM=opt_SPAM, gates=opt_gates)
        gs.bulk_fill_probs(probs, spam_lbl_rows, evTree, probClipInterval, check)
        v = _sqrt_terms()[1]
        return v.reshape([KM])

    #  derivative of a term sqrt(T(p)) is 0.5 / sqrt(T) * dT/dp * dp, where
    #   Poisson picture:   dT/dp == -N_{i,sl} / p_{i,sl} + N[i]
    #   standard picture:  dT/dp == -N_{i,sl} / p_{i,sl}
    #  and for p < min_p:  dT/dp == S + 2*S2*(p - min_p)
    def jacobian(vectorGS):
        gs.from_vector(vectorGS,G0=opt_G0, SP0=opt_SP0, SPAM=opt_SPAM, gates=opt_gates)
        gs.bulk_fill_dprobs(dprobs, spam_lbl_rows, evTree,
                            G0=opt_G0, SP0=opt_SP0, SPAM=opt_SPAM, gates=opt_gates,
                            prMxToFill=probs, clipTo=probClipInterval, check=check)

        pos_probs, v = _sqrt_terms()
        v = _np.maximum(v,1e-100) #derivative diverges as v->0, but v always >= 0 so clip v to a small positive value to avoid divide by zero below
        if poissonPicture:
            dprobs_factor_pos = (0.5 / v) * (minusCntVecMx / pos_probs + totCnts)
        else:
            dprobs_factor_pos = (0.5 / v) * (minusCntVecMx / pos_probs)
        dprobs_factor_neg = (0.5 / v) * (S + 2*S2*(probs - min_p))
        dprobs_factor = _np.where( probs < min_p, dprobs_factor_neg, dprobs_factor_pos)
        if poissonPicture:
            dprobs_factor_zerofreq = (0.5 / v) * totCnts * _np.where( probs >= a, 1.0, (-1.0/a**2)*probs**2 + 2*probs/a )
            dprobs_factor = _np.where( zeroFreq, dprobs_factor_zerofreq, dprobs_factor )
        else:
            dprobs_factor = _np.where( zeroFreq, 0.0, dprobs_factor )
        jac = dprobs * dprobs_factor[:,:,None] # (K,M,N) * (K,M,1)   (N = dim of vectorized gateset)

        #if check_jacobian: _opt.check_jac(objective_func, vectorGS, jac, tol=1e-3, eps=1e-6, errType='abs')
        return jac.reshape( [KM,vec_gs_len] )


    #Run optimization (use leastsq)
    x0 = gs.to_vector(G0=opt_G0, SP0=opt_SP0, SPAM=opt_SPAM, gates=opt_gates)
    opt_x, _cov_x, _info, msg, flag = \
        _spo.leastsq( objective_func, x0, xtol=tol, ftol=tol, gtol=0,
                      maxfev=maxfev*(len(x0)+1), full_output=True, Dfun=jacobian )
    if verbosity > 2: print("Least squares msg =  %s ; flag = %s" % (msg, flag))

    gs.from_vector(opt_x,G0=opt_G0, SP0=opt_SP0, SPAM=opt_SPAM, gates=opt_gates)
    gs.log("MLEGST", { 'tol': tol,  'maxiter': maxiter,
                       'opt_G0': opt_G0, 'opt_SP0': opt_SP0 } )

    minErrVec = objective_func(opt_x)  #note: calls gs.from_vector(opt_x,...) so don't need to call this again
    deltaLogL = _np.sum(minErrVec**2) # upperBoundLogL - logl (a positive number)

    #if constrainType == 'projection':
    #    if cpPenalty != 0: d,gs = _contractToCP_direct(gs,verbosity=0,TPalso=not opt_G0,maxiter=100)
    #    if spamPenalty != 0: gs = _contractToValidSPAM(gs, verbosity=0)

    if verbosity > 1:
        if _np.isfinite(deltaLogL):
            nGateStrings = len(gateStringsToUse)
            nDataParams  = nGateStrings*(len(dataset.get_spam_labels())-1) #number of independent parameters
                                                                           # in dataset (max. model # of params)
            try:
                nModelParams = gs.num_nongauge_params(opt_gates, opt_G0, opt_SPAM, opt_SP0) #len(x0)
            except Exception: #numpy can throw a LinAlgError; don't swallow KeyboardInterrupt/SystemExit
                print("Warning: could not obtain number of *non-gauge* parameters - using total params instead")
                nModelParams = gs.num_params(opt_gates, opt_G0, opt_SPAM, opt_SP0) #just use total number of params

            pvalue = 1.0 - _stats.chi2.cdf(2*deltaLogL,nDataParams-nModelParams) # reject GST if p-value < threshold (~0.05?)

            print("  Maximum log(L) = %g below upper bound of %g" % (deltaLogL, logL_upperbound))
            print("    2*Delta(log(L)) = %g (%d data params - %d model params = expected mean of %g; p-value = %g)" %
                  (2*deltaLogL, nDataParams,  nModelParams, nDataParams-nModelParams, pvalue))

            #print " DEBUG LOGL = ", _tools.logl(gs, dataset, gateStringsToUse),
            #  " DELTA = ",(logL_upperbound-_tools.logl(gs, dataset, gateStringsToUse))
        else:
            print("  **Warning** upper_bound_logL - logl = " + str(deltaLogL))

    return (logL_upperbound - deltaLogL), gs

#SCRATCH - used for debugging logl objective function
        #if len( (v < 0).nonzero()[0] ) > 0:
        #  raise ValueError("B: v < 0!! min/max of v = %g,%g" % (_np.min(v), _np.max(v)) )
      
        #v = _np.sqrt( _np.where( probs < min_p, v + totalCntVec[None,:]*(min_p - probs), v) ) #OLD
      
        # Let p_min be a small number > 0
        #if p >= p_min then term == sqrt( N_{i,sl} * -log(p_{i,sl}) + N[i] * p_{i,sl} )
        #if p <  p_min then term == sqrt( N_{i,sl} * -log(p_min) + N[i] * p_min + S*(p-p_min) )
        #  where S := deriv (slope) of -logl at p_min = N_{i,sl} * -1/p_min + N[i]
        
        #v = _np.sqrt(cntVecMx * _np.log(freqs/probs) + totalCntVec[None,:]*(probs-freqs) ) # dims K x M (K = nSpamLabels, M = nGateStrings)
      
        #DEBUG
        #beforeSqrt = freqTerm + minusCntVecMx * _np.log(probs) + totalCntVec[None,:]*probs
        #oob = (beforeSqrt < 0.0)
        #nbelowZero = len(oob.nonzero()[0])
        #nan = _np.isnan(v)
        #nNan = len(nan.nonzero()[0])
        #print "%d terms < 0, %d terms NAN" % (nbelowZero,nNan)
        #k,l= nan.nonzero()[0][0],nan.nonzero()[1][0]
        #print k,l,v[k,l],beforeSqrt[k,l]
      
        #DEBUG - comment out line above, set probs to something larger than the [0,1] range, and uncomment lines below
        #oob = _np.logical_or(probs < 0, probs > 1)  #boolean array masking out-of-bound probabilities
        #clipped_probs =  _np.clip(probs,1e-6,1.0-1e-6)
        #v = _np.sqrt(minusCntVecMx * _np.log(clipped_probs) + totalCntVec[None,:]*clipped_probs) # dims K x M (K = nSpamLabels, M = nGateStrings)
        #v[ oob ] = 1e3 #penalty terms for out of bounds probabilities
        #nClipped = len(oob.nonzero()[0])
        #nClipped0 = len((probs < 0).nonzero()[0])
        #nClipped1 = len((probs > 1).nonzero()[0])
        #nz = (probs < 0).nonzero()
        #firstProbBelow = probs[ nz[0][0], nz[1][0] ] if nClipped0 > 0 else None
        #nz = (probs > 1).nonzero()
        #firstProbAbove = probs[ nz[0][0], nz[1][0] ] if nClipped1 > 0 else None
        #print "DB: logl = ", -1.0*sum([x**2 for x in v.reshape([KM])]), " outside [0,1] = ",(nClipped0,nClipped1), " below = ", firstProbBelow, " above = ", firstProbAbove


def do_iterative_mlgst(dataset, startGateset, gateStringSetsToUseInEstimation,
                      maxiter=100000, maxfev=None, tol=1e-6,
                      opt_gates=True, opt_G0=True, opt_SPAM=True, opt_SP0=True,
                      minProbClip=1e-4, probClipInterval=None, radius=1e-4, poissonPicture=True,
                      returnMaxLogL=False, returnAll=False,
                      gateStringSetLabels=None, useFreqWeightedChiSq=False, verbosity=0,
                      check=False, memLimit=None):
    """
    Performs Iterative Maximum Likelihood Estimation Gate Set Tomography on the dataset.

    Every iteration minimizes the chi^2 objective (via do_mc2gst), seeding each
    iteration with the previous iteration's estimate.  On the final iteration
    the chi^2 estimate is additionally refined with do_mlgst, and the refined
    gateset is kept only if it has a larger log-likelihood.

    Parameters
    ----------
    dataset : DataSet
        The data used to generate the gate estimates

    startGateset : GateSet
        The GateSet used as a starting point for the first iteration.  It is
        copied, not modified.

    gateStringSetsToUseInEstimation : list of lists of (tuples or GateStrings)
        The i-th element is a list of the gate strings to be used in the i-th iteration.
        Each element of these lists is a gate string, specifed as
        either a GateString object or as a tuple of gate labels (but all must be specified
        using the same type).  An element that is None or empty causes that
        iteration to be skipped.
        e.g. [ [ (), ('Gx',) ], [ (), ('Gx',), ('Gy',) ], [ (), ('Gx',), ('Gy',), ('Gx','Gy') ]  ]

    maxiter : int, optional
        Maximum number of iterations for each optimization.

    maxfev : int, optional
        Maximum number of function evaluations for each optimization.

    tol : float, optional
        The tolerance for each optimization.

    opt_gates : bool, optional
        Whether the gate matrices should be optimized

    opt_G0 : bool, optional
        Whether the first row of gate matrices should be optimized.  If False, then
        when the startGateset has TP gates this will not be changed during the
        optimization and the resulting gates are guaranteed to be TP.

    opt_SPAM : bool, optional
        Whether the rhoVecs and EVecs should be optimized

    opt_SP0 : bool, optional
        Whether the first element of the state preparation vectors
        (i.e. the rhoVecs) should be optimized.  If False, then rhoVecs
        in startingGateset that are trace == 1 will remain trace == 1
        after the optimization.

    minProbClip : float, optional
        The minimum probability treated normally in the evaluation of the log-likelihood.
        A penalty function replaces the true log-likelihood for probabilities that lie
        below this threshold so that the log-likelihood never becomes undefined (which improves
        optimizer performance).

    probClipInterval : 2-tuple or None, optional
        (min,max) values used to clip the probabilities predicted by gatesets during MLEGST's
        search for an optimal gateset (if not None).  Defaults to no clipping.

    radius : float, optional
        Specifies the severity of rounding used to "patch" the zero-frequency
        terms of the log-likelihood.

    poissonPicture : boolean, optional
        Whether the Poisson-picture log-likelihood should be used.

    returnMaxLogL : bool, optional
        If True, also return the maximum log-likelihood (see Returns).

    returnAll : bool, optional
        If True return a list of gatesets, one per (non-skipped) iteration,
        instead of the gateset from just the final iteration.

    gateStringSetLabels : list of strings, optional
        An identification label for each of the gate string sets (used for displaying
        progress).  Must be the same length as gateStringSetsToUseInEstimation.

    useFreqWeightedChiSq : bool, optional
        If True, chi-square objective function uses the approximate chi^2 weighting:  N/(f*(1-f))
        where f is the frequency obtained from the dataset, instead of the true chi^2: N/(p*(1-p))
        where p is a predicted probability.  Defaults to False, and only should use
        True for backward compatibility.

    verbosity : int, optional
        How much detail to send to stdout.

    check : boolean, optional
      If True, perform extra checks within code to verify correctness.  Used
      for testing, and runs much slower when True.

    memLimit : int, optional
        A rough memory limit in bytes which restricts the amount of intermediate
        values that are computed and stored.


    Returns
    -------
    gateset               if returnAll == False and returnMaxLogL == False
    gatesets              if returnAll == True  and returnMaxLogL == False
    (maxLogL, gateset)    if returnAll == False and returnMaxLogL == True
    (maxLogL, gatesets)   if returnAll == True  and returnMaxLogL == True
        where maxLogL is the maximum log-likelihood obtained in the last
        non-skipped iteration (None if every iteration was skipped), gateset is
        the GateSet containing the final estimated gates, and gatesets is a
        list holding the estimate of each non-skipped iteration.
        NOTE(review): even when returnAll == True a single maxLogL (not a
        per-iteration list) is returned; this is kept for compatibility with
        existing callers -- confirm whether a list was intended.
    """

    #convert lists of GateStrings to lists of raw tuples since that's all we'll need
    # (the type is detected from the first string of the first list, as all
    #  strings are required to share one type)
    firstList = gateStringSetsToUseInEstimation[0] \
        if len(gateStringSetsToUseInEstimation) > 0 else None
    if firstList is not None and len(firstList) > 0 and \
       isinstance(firstList[0],_objs.GateString):
        gateStringLists = [ ([gstr.tup for gstr in gsList] if gsList is not None else None)
                            for gsList in gateStringSetsToUseInEstimation ]
    else:
        gateStringLists = gateStringSetsToUseInEstimation


    #Run extended MLEGST iteratively on given sets of estimatable strings
    mleGatesets = [ ] #for returnAll == True case
    mleGateset = startGateset.copy(); nIters = len(gateStringLists)
    maxLogL = None # stays None when no iteration actually runs
    for (i,stringsToEstimate) in enumerate(gateStringLists):
        # None entries are allowed (and skipped), so don't call len() on them
        nStrings = 0 if stringsToEstimate is None else len(stringsToEstimate)

        if verbosity > 1: print("")
        if verbosity > 0:
            setLabel = ("(%s) " % gateStringSetLabels[i]) if gateStringSetLabels else ""
            print("--- Iterative MLEGST: Beginning iter %d of %d %s: %d gate strings ---"
                  % (i+1,nIters,setLabel,nStrings))
            _sys.stdout.flush()

        if nStrings == 0: continue

        # chi^2 minimization seeded with the previous estimate (the returned
        #  chi^2 value itself is not needed here)
        _chi2, mleGateset = do_mc2gst( dataset, mleGateset, stringsToEstimate,
                                       maxiter, maxfev, tol, opt_gates, opt_G0, opt_SPAM, opt_SP0,
                                       0, minProbClip, probClipInterval, useFreqWeightedChiSq, 0, verbosity, check,
                                       False, None, None, memLimit)

        logL_ub = _tools.logl_max(dataset, stringsToEstimate, None, poissonPicture, check)
        maxLogL = _tools.logl(mleGateset, dataset, stringsToEstimate, minProbClip, probClipInterval,
                              radius, None, None, poissonPicture, check)  #get maxLogL from chi2 estimate
        if verbosity > 0:
            print("    2*Delta(log(L)) = %g" % (2*(logL_ub - maxLogL)))

        if i == nIters-1: #on the last iteration, do MLE
            if verbosity > 0: print("--- Last Iteration: switching to MLE objective ---")
            maxLogL_p, mleGateset_p = do_mlgst( dataset, mleGateset, stringsToEstimate,
                                                maxiter, maxfev, tol, opt_gates, opt_G0, opt_SPAM, opt_SP0,
                                                minProbClip, probClipInterval, radius, poissonPicture,
                                                verbosity, check, None, memLimit)
            if verbosity > 0:
                print("    2*Delta(log(L)) = %g" % (2*(logL_ub - maxLogL_p)))

            if maxLogL_p > maxLogL: #if do_mlgst improved the maximum log-likelihood
                maxLogL = maxLogL_p
                mleGateset = mleGateset_p
            elif verbosity > 0:
                print("   !!! Warning: MLEGST failed to improve logl: retaining chi2-objective estimate !!!")

        if returnAll:
            mleGatesets.append(mleGateset)

    result = mleGatesets if returnAll else mleGateset
    if returnMaxLogL:
        return (maxLogL, result)
    return result


###################################################################################
#                 Other Tools
##################################################################################

#Note: this code overlaps do_mlgst a lot -- consolidate in FUTURE?
def optimize_gauge(gateset, toGetTo, maxiter=100000, maxfev=None, tol=1e-8,
                  method='L-BFGS-B', targetGateset=None, targetFactor=0.0001, 
                  constrainToTP=False, constrainToCP=False, constrainToValidSpam=False,
                  returnAll=False, gateWeight=1.0, spamWeight=0.0, 
                  targetGatesMetric="frobenius", targetSpamMetric="frobenius", verbosity=0):
    """
    Optimize the gauge of a GateSet using some 'goodness' function.

    The gauge matrix is parameterized by its (flattened) elements, starting
    from the identity, and the goodness function selected by toGetTo is
    minimized over those elements.
  
    Parameters
    ----------
    gateset : GateSet
        The gateset to gauge-optimize
  
    toGetTo : string
        Specifies which goodness function is used.  Allowed values are:

        - 'target' -- minimize the distance (see targetGatesMetric and
          targetSpamMetric) between gateset and targetGateset, which must
          be specified.
        - 'CPTP'   -- minimize the non-CPTP-ness of the gateset.
        - 'TP'     -- minimize the non-TP-ness of the gateset.
        - 'TP and target' -- minimize the non-TP-ness of the gateset and
          the frobenius norm distance to targetGateset using targetFactor
          to multiply this distance.
        - 'CPTP and target' -- minimize the non-CPTP-ness of the gateset and
          the frobenius norm distance to targetGateset using targetFactor to
          multiply this distance.
        - 'Completely Depolarized' -- minimize the frobenius norm distance 
          between gateset and the completely-depolarized gateset.
  
    maxiter : int, optional
        Maximum number of iterations for the gauge optimization.
  
    maxfev : int, optional
        Maximum number of function evaluations for the gauge optimization.
        Defaults to maxiter.

    tol : float, optional
        The tolerance for the gauge optimization.
  
    method : string, optional
        The method used to optimize the objective function.  Can be any method
        known by scipy.optimize.minimize such as 'BFGS', 'Nelder-Mead', 'CG', 'L-BFGS-B',
        or additionally:

        - 'custom' -- custom CG that often works better than 'CG'
          (not allowed when toGetTo == 'target')
        - 'supersimplex' -- repeated application of 'Nelder-Mead' to converge it
        - 'basinhopping' -- scipy.optimize.basinhopping using L-BFGS-B as a local optimizer
        - 'swarm' -- particle swarm global optimization algorithm
        - 'evolve' -- evolutionary global optimization algorithm using DEAP
        - 'brute' -- Experimental: scipy.optimize.brute using 4 points along each dimensions
  
    targetGateset : GateSet, optional
        The target gateset used by the 'target', 'TP and target' and 'CPTP and target'
        values of the toGetTo parameter (above).
  
    targetFactor : float, optional
        A weighting factor that multiplies the frobenius norm difference between gateset
        and targetGateset when toGetTo is either "TP and target" or "CPTP and target".
        
    constrainToTP : bool, optional
       When toGetTo is 'target' or 'Completely Depolarized', whether the first row
       of the gauge matrix is held fixed at [1,0,...,0].  When toGetTo is 'CPTP' or
       'CPTP and target', True means gateset is assumed to already be TP so the
       preliminary TP gauge optimization is skipped.  Must be False when toGetTo
       is 'TP' or 'TP and target'.

    constrainToCP : bool, optional
       When toGetTo == 'target', whether gauge optimization is constrained to CP gatesets.
  
    constrainToValidSpam : bool, optional
       When toGetTo == 'target', whether gauge optimization is constrained to gatesets with
       valid state preparation and measurements.

    returnAll : bool, optional
       When True, return best "goodness" value and gauge matrix in addition to the
       gauge optimized gateset.

    gateWeight : float, optional
       Weighting factor that multiplies each single-gate norm before summing it
       into the total frobenius norm between two gatesets.

    spamWeight : float, optional
       Weighting factor that multiplies the norms between state-preparation
       and measurement vectors (or gates, depending on the metric used) before
       summing them into the total norm between two gatesets.

    targetGatesMetric : string, optional
       When toGetTo == "target", this specifies the metric used to evaluate what 
       "close to the target" means for the gate matrices.  Allowed values are 
       "frobenius", "fidelity", and "tracedist". Contributions for the individual
       gates are summed, and in the case of frobenius, ultimately normalized by
       the number of elements.

    targetSpamMetric : string, optional
       When toGetTo == "target", this specifies the metric used to evaluate what 
       "close to the target" means for the spam vectors.  Allowed values are 
       "frobenius", "fidelity", and "tracedist". Contributions for the individual
       vectors are summed, and in the case of frobenius, ultimately normalized 
       by the number of elements.

    verbosity : int, optional
        How much detail to send to stdout.
  
    Returns
    -------
    gateset                            if returnAll == False

    (goodnessMin, gaugeMx, gateset)    if returnAll == True

      where goodnessMin is the minimum value of the goodness function (the best 'goodness')
      found, gaugeMx is the gauge matrix used to transform the gateset, and gateset is the
      final gauge-transformed gateset.

    Raises
    ------
    ValueError
        If toGetTo is not one of the allowed values, if a target-based mode is
        requested without a targetGateset, if constrainToTP is True with
        'TP' or 'TP and target', or (from within the objective function) if
        targetGatesMetric / targetSpamMetric is not an allowed value.
    """

    if maxfev is None: maxfev = maxiter
    gateDim = gateset.get_dimension()
    firstRowForTP = _np.zeros(gateDim); firstRowForTP[0] = 1.0

    # ---- helpers shared by the objective functions below ----

    def gauge_matrix_from(vectorM, tpConstrained):
      """ Reshape optimizer parameters into a gauge matrix, prepending the fixed TP first row if requested """
      if tpConstrained: vectorM = _np.concatenate( (firstRowForTP,vectorM) )
      return vectorM.reshape( (gateDim,gateDim) )

    def transformed_copy(baseGateset, matM):
      """ Return a copy of baseGateset transformed by the gauge matrix matM """
      gs = baseGateset.copy(); gs.transform(matM)
      return gs

    def spam_penalty_of(gs):
      """ Sum of the rhoVec and EVec penalties of gs """
      penalty =  sum( [ _tools.rhovec_penalty(rhoVec) for rhoVec in gs.rhoVecs ] )
      penalty += sum( [ _tools.evec_penalty(EVec)     for EVec   in gs.EVecs ] )
      return penalty

    def tp_penalty_of(gs, rhoVecFirstEl):
      """ Squared deviation of each gate's first row from [1,0,...,0] and of each rhoVec's first element from rhoVecFirstEl """
      tpPenalty = 0
      for gate in gs.values():
        tpPenalty += (1.0-gate[0,0])**2
        for k in range(1,gate.shape[1]): 
          tpPenalty += gate[0,k]**2
      for rhoVec in gs.rhoVecs:
        tpPenalty += (rhoVecFirstEl - rhoVec[0])**2
      return tpPenalty

    def log10_with_floor(penalty):
      """ log10 of penalty, floored at -100 so a (numerically) zero penalty is still finite """
      if penalty > 1e-100: return _np.log10(penalty)
      else: return -100

    
    if toGetTo == "target":
      if targetGateset is None: raise ValueError("Must specify a targetGateset != None")

      CLIFF = 1e10  # value returned whenever a hard constraint is violated
      cpPenalty = CLIFF if constrainToCP else 0
      spamPenalty = CLIFF if constrainToValidSpam else 0
      assert(method != "custom") #do not allow use of custom method yet (since it required a different obj func)

      if verbosity > 2: print("")
      if verbosity > 1: print("--- Gauge Optimization to a target (%s) ---" % method)

      def objective_func(vectorM):
        gs = transformed_copy(gateset, gauge_matrix_from(vectorM, constrainToTP))

        if cpPenalty != 0:
          s = _tools.sum_of_negative_choi_evals(gs)
          if s > 1e-8: return cpPenalty #1e-8 should match TOL in contract to CP routines

        if spamPenalty != 0:
          sp = spam_penalty_of(gs)
          if sp > 1e-8: return spamPenalty #1e-8 should match TOL in contract to CP routines

        #Special case of full frobenius norm
          # TODO: remove?  but note this is different from the separate cases summed b/c of normalization
        if targetGatesMetric == "frobenius" and targetSpamMetric == "frobenius":
          return gs.frobeniusdist(targetGateset, None, gateWeight, spamWeight)

        diff = 0
        if targetGatesMetric == "frobenius":
          diff += gs.frobeniusdist(targetGateset, None, gateWeight, 0.0)
        elif targetGatesMetric == "fidelity":
          for gateLbl in gs:
            diff += gateWeight * (1.0 - _tools.process_fidelity(targetGateset[gateLbl], gs[gateLbl]))
        elif targetGatesMetric == "tracedist":
          for gateLbl in gs:
            diff += gateWeight * _tools.jtracedist(targetGateset[gateLbl], gs[gateLbl])
        else: raise ValueError("Invalid targetGatesMetric: %s" % targetGatesMetric)
            
        if targetSpamMetric == "frobenius":
          diff += gs.frobeniusdist(targetGateset, None, 0.0, spamWeight)
        elif targetSpamMetric == "fidelity":
          for spamlabel in gs.SPAMs.keys(): 
            diff += spamWeight * (1.0 - _tools.process_fidelity(targetGateset.SPAMs[spamlabel], gs.SPAMs[spamlabel]))
        elif targetSpamMetric == "tracedist":
          for spamlabel in gs.SPAMs.keys(): 
            diff += spamWeight * _tools.jtracedist(targetGateset.SPAMs[spamlabel], gs.SPAMs[spamlabel])
        else: raise ValueError("Invalid targetSpamMetric: %s" % targetSpamMetric) # report the offending SPAM metric

        return diff

      
    elif toGetTo == "CPTP":

      if constrainToTP: #assume gateset is already in TP so no TP optimization needed
        tpGateset = gateset
        tpGaugeMx = _np.identity( gateDim, 'd' )
      else:
        # verbosity must be given by keyword: positionally it would land in the
        #  targetGatesMetric slot and the preliminary optimization would run silently.
        _, tpGaugeMx, tpGateset = optimize_gauge(gateset, "TP", maxiter, maxfev, tol,
                                                'L-BFGS-B', targetGateset, targetFactor, 
                                                constrainToTP, constrainToCP, constrainToValidSpam, True,
                                                gateWeight, spamWeight, verbosity=verbosity)

      if verbosity > 2: print("")
      if verbosity > 1: print("--- Gauge Optimization to CPTP w/valid SPAM (%s) ---" % method)
      constrainToTP = True #always constrain next optimization to TP

      def objective_func(vectorM):
        gs = transformed_copy(tpGateset, gauge_matrix_from(vectorM, True))

        cpPenalty = sum( _tools.sums_of_negative_choi_evals(gs) )
        spamPenalty = spam_penalty_of(gs)

        # work with log10 of the penalty (floored at -100)
        return log10_with_floor(cpPenalty + spamPenalty)


    elif toGetTo == "TP":
      if verbosity > 2: print("")
      if verbosity > 1: print("--- Gauge Optimization to TP (%s) ---" % method)
      if constrainToTP: raise ValueError("Cannot gauge optimize to TP and constrain to TP")
      rhoVecFirstEl = 1.0 / gateDim**0.25  # note: sqrt(gateDim) gives linear dim of density mx
      
      def objective_func(vectorM):
        gs = transformed_copy(gateset, gauge_matrix_from(vectorM, False))
        return tp_penalty_of(gs, rhoVecFirstEl)

    elif toGetTo == "TP and target":
      if verbosity > 2: print("")
      if verbosity > 1: print("--- Gauge Optimization to TP and target (%s) ---" % method)
      if targetGateset is None: raise ValueError("Must specify a targetGateset != None")
      if constrainToTP: raise ValueError("Cannot gauge optimize to TP and constrain to TP")
      rhoVecFirstEl = 1.0 / gateDim**0.25  # note: sqrt(gateDim) gives linear dim of density mx
      
      def objective_func(vectorM):
        gs = transformed_copy(gateset, gauge_matrix_from(vectorM, False))
        tpPenalty = tp_penalty_of(gs, rhoVecFirstEl)
        return tpPenalty + gs.frobeniusdist(targetGateset, None, gateWeight, spamWeight) * targetFactor


    elif toGetTo == "CPTP and target":
      if targetGateset is None: raise ValueError("Must specify a targetGateset != None")

      if constrainToTP: #assume gateset is already in TP so no TP optimization needed
        tpGateset = gateset
        tpGaugeMx = _np.identity( gateDim, 'd' )
      else:
        # verbosity passed by keyword (see the note in the "CPTP" branch)
        _, tpGaugeMx, tpGateset = optimize_gauge(gateset, "TP and target", maxiter, maxfev, tol,
                                                'L-BFGS-B', targetGateset, targetFactor, 
                                                constrainToTP, constrainToCP, constrainToValidSpam, True,
                                                gateWeight, spamWeight, verbosity=verbosity)

      if verbosity > 2: print("")
      if verbosity > 1: print("--- Gauge Optimization to CPTP and target w/valid SPAM (%s) ---" % method)
      constrainToTP = True # always constrain next optimization to TP

      def objective_func(vectorM):
        gs = transformed_copy(tpGateset, gauge_matrix_from(vectorM, True)) #constraining to TP

        cpPenalty = sum( _tools.sums_of_negative_choi_evals(gs) )
        spamPenalty = spam_penalty_of(gs)
        targetPenalty = gs.frobeniusdist(targetGateset, None, gateWeight, spamWeight) * targetFactor

        return log10_with_floor(cpPenalty + spamPenalty + targetPenalty)


    elif toGetTo == "Completely Depolarized":
      if verbosity > 2: print("")
      if verbosity > 1: print("--- Gauge Optimization to Completely Depolarized w/valid SPAM (%s) ---" % method)
      complDepolGate = _np.zeros( (gateDim,gateDim) )
      complDepolGate[0,0] = 1.0
      
      def objective_func(vectorM):
        gs = transformed_copy(gateset, gauge_matrix_from(vectorM, constrainToTP))
        d = 0
        for gateLabel in gs:
          d += _tools.frobeniusdist(gs[gateLabel],complDepolGate)
        return d + spam_penalty_of(gs)

    else: raise ValueError("Invalid toGetTo passed to optimize_gauge: %s" % toGetTo)

    #Run Minimization Algorithm
    startM = _np.identity(gateDim)  #take identity as initial gauge matrix   
    
    # when TP-constrained the first row is fixed, so only rows 1.. are free parameters
    x0 = startM.flatten() if not constrainToTP else startM[1:,:].flatten()
    print_obj_func = _opt.create_obj_func_printer(objective_func)
    minSol = _opt.minimize(objective_func, x0,
                          method=method, maxiter=maxiter, maxfev=maxfev, tol=tol, 
                          stopval= -20 if toGetTo == "CPTP" else None,
                          callback = print_obj_func if verbosity > 2 else None) #stopval=1e-7 -- (before I added log10)

    gaugeMat = gauge_matrix_from(minSol.x, constrainToTP)

    if toGetTo in ("CPTP","CPTP and target"):
      gaugeMat = _np.dot(tpGaugeMx, gaugeMat) #include initial TP gauge tranform

    newGateset = gateset.copy()
    newGateset.transform(gaugeMat)
    newGateset.log("Optimize Gauge", { 'method': method, 'tol': tol, 'toGetTo': toGetTo } )

    if verbosity > 1:
      if toGetTo == "target":
        print('The resulting Frobenius-norm distance is: %g' % minSol.fun)
        for gateLabel in newGateset:
          print("  frobenius norm diff of %s = %g" % (gateLabel, _tools.frobeniusdist(newGateset[gateLabel],targetGateset[gateLabel])))
        for (i,rhoV) in enumerate(newGateset.rhoVecs): 
          print("  frobenius norm diff of rho[%d] = %g" % (i, _tools.frobeniusdist(rhoV,targetGateset.rhoVecs[i])))
        for (i,Evec) in enumerate(newGateset.EVecs): 
          print("  frobenius norm diff of EVec[%d] = %g" % (i, _tools.frobeniusdist(Evec,targetGateset.EVecs[i])))
      else:
        print('The resulting %s penalty is: %g' % (toGetTo, minSol.fun))

    if verbosity > 2:
        print('The gauge matrix found (B^-1) is:\n' + str(gaugeMat) + '\n')
        print('The gauge-corrected gates are:\n' + str(newGateset))

    if returnAll:
      return minSol.fun, gaugeMat, newGateset
    else:
      return newGateset


#def optimize_unitary_gauge(gateset,dataset,verbosity,**kwargs):
#    """ Experimental -- works only for single qubit case:
#        Try to find a unitary that maximizes the norm of the logl gradient """
#
#    tol = kwargs.get('tol',1e-8)
#    method = kwargs.get('method','BFGS')
#    gateDim = len(gateset.rhoVecs[0]) # The dimension of the space: TODO = cleaner way
#    if verbosity > 1: print "\n--- Unitary Gauge Optimization ---"
#
#    def objective_func(v):
#      matM = single_qubit_gate( v[0], v[1], v[2] )
#      gs = gateset.copy()
#      gs.transform(matM)
#      ret = -_np.linalg.norm(_tools.dlogL(gs, dataset, **kwargs))
#      print "DEBUG: ",ret
#      return ret
#
#    startV = _np.array( [0,0,0] )
#    print_obj_func = _opt.create_obj_func_printer(objective_func)
#    minSol = _opt.minimize(objective_func,startV,method=method, tol=tol,
#                          callback = print_obj_func if verbosity > 2 else None)
#
#    gaugeMat = single_qubit_gate( minSol.x[0], minSol.x[1], minSol.x[2] )
#    newGateset = gateset.copy()
#    newGateset.transform(gaugeMat)
#    newGateset.log("Optimize Unitary Gauge To Max dLogL", { 'method': method, 'tol': tol } )
#
#    if verbosity > 1:
#        print 'The resulting norm(dLog) is: %g' % -minSol.fun
#        #print 'The gauge matrix found (B^-1) is:\n' + str(gaugeMat) + '\n'
#        #print 'The gauge-corrected gates are:\n' + str(newGateset)
#
#    return -minSol.fun, gaugeMat, newGateset


def contract(gateset, toWhat, dataset=None, maxiter=1000000, tol=0.01, useDirectCP=True, method="Nelder-Mead", verbosity=0):
  """
  Contract a GateSet to a specified space.

  Dispatches on toWhat to one of the private contraction routines and
  returns only the resulting gateset (any distance the routine reports
  is discarded).

  Parameters
  ----------
  gateset : GateSet
      The gateset to contract

  toWhat : string
      Specifies the space the gateset is contracted to.
      Allowed values are:

      - 'TP'     -- All gates are manifestly trace-preserving maps.
      - 'CP'     -- All gates are manifestly completely-positive maps.
      - 'CPTP'   -- All gates are manifestly completely-positive and trace-preserving maps.
      - 'XP'     -- All gates are manifestly "experimentally-positive" maps.
      - 'XPTP'   -- All gates are manifestly "experimentally-positive" and trace-preserving maps.
      - 'vSPAM'  -- state preparation and measurement operations are valid.
      - 'nothing' -- no contraction is performed (a copy is returned).

  dataset : DataSet, optional
      Dataset to use to determine whether a gateset is in the
      "experimentally-positive" (XP) space.  Required only when
      contracting to XP or XPTP.
  
  maxiter : int, optional
      Maximum number of iterations for iterative contraction routines.

  tol : float, optional
      Tolerance for iterative contraction routines.

  useDirectCP : bool, optional
      Whether to use the faster direct-contraction method (Choi matrix
      eigenvalue truncation).  In this function it is only consulted
      when toWhat == 'CPTP'; 'CP' always uses the iterative routine.

  method : string, optional
      The optimization method used by the iterative XP and CP
      contraction routines.

  verbosity : int, optional
      How much detail to send to stdout.

  Returns
  -------
  GateSet
      The contracted gateset

  Raises
  ------
  ValueError
      If toWhat is not an allowed value, or if toWhat is 'XP' or 'XPTP'
      and no dataset is given.
  """

  # XP-type contractions need data to decide what is "experimentally positive"
  if toWhat in ('XP', 'XPTP') and dataset is None:
    raise ValueError("dataset must be given to contract to " + toWhat)

  if toWhat == 'nothing':
    return gateset.copy()

  if toWhat == 'vSPAM':
    return _contractToValidSPAM(gateset, verbosity)

  if toWhat == 'TP':
    _dist, result = _contractToTP(gateset, verbosity)
    return result

  if toWhat == 'CP':
    _dist, result = _contractToCP(gateset, verbosity, method, maxiter, tol, opt_G0=True)
    return result

  if toWhat == 'XP':
    _dist, result = _contractToXP(gateset, dataset, verbosity, method, maxiter, tol, opt_G0=True)
    return result

  if toWhat == 'CPTP':
    if useDirectCP:
      _dist, result = _contractToCP_direct(gateset, verbosity, TPalso=True, maxiter=maxiter)
    else:
      # two stages: project onto TP first, then contract to CP with opt_G0=False
      _dist, tpGateset = _contractToTP(gateset, verbosity)
      _dist, result = _contractToCP(tpGateset, verbosity, method, maxiter, tol, opt_G0=False)
    return result

  if toWhat == 'XPTP':
    _dist, tpGateset = _contractToTP(gateset, verbosity)
    _dist, result = _contractToXP(tpGateset, dataset, verbosity, method, maxiter, tol, opt_G0=False)
    return result

  raise ValueError("Invalid contract argument: %s" % toWhat)


#modifies gates only (not rhoVecs or EVecs = SPAM)
def _contractToXP(gateset,dataset,verbosity,method='Nelder-Mead',
                 maxiter=100000, tol=1e-10, opt_G0=True):
    """
    Contract the gates of a GateSet to the "experimentally-positive" space.

    Minimizes the frobenius distance to the original gateset plus a large
    constant ("cliff") penalty whenever _tools.forbidden_prob reports a
    penalty above 1e-10 for the given dataset.  Only the gate parameters
    (SPAM=False) are varied.

    Parameters
    ----------
    gateset : GateSet
        The gateset to contract (not modified; a copy is worked on).

    dataset : DataSet
        The data passed to _tools.forbidden_prob.

    verbosity : int
        How much detail to send to stdout.

    method, maxiter, tol
        Passed through to _opt.minimize.

    opt_G0 : bool, optional
        Passed as the G0 argument of the gateset's to_vector/from_vector.

    Returns
    -------
    (distance, GateSet)
        The final objective value (0.0 if no contraction was needed) and
        the contracted gateset.

    Raises
    ------
    ValueError
        If the optimizer finishes with an objective value at or above the cliff.
    """
    CLIFF = 10000
    
    if verbosity > 2: print("")
    if verbosity > 1: print("--- Contract to XP ---")
    workGS = gateset.copy() # scratch gateset, overwritten on every objective evaluation

    def objective_func(vectorGS):
        workGS.from_vector(vectorGS, SPAM=False, G0=opt_G0)
        forbidden = _tools.forbidden_prob(workGS, dataset)
        if forbidden > 1e-10:
            cliffTerm = CLIFF + forbidden
        else:
            cliffTerm = 0
        return cliffTerm + workGS.frobeniusdist(gateset)

    print_obj_func = _opt.create_obj_func_printer(objective_func)

    # the starting point is the gateset itself, so a ~zero objective means it is already legal
    startValue = objective_func(workGS.to_vector(SPAM=False, G0=opt_G0))
    if startValue < 1e-8:
        if verbosity > 1: print('Already in XP - no contraction necessary')
        return 0.0, workGS

    callback = print_obj_func if verbosity > 2 else None
    optSol = _opt.minimize(objective_func, workGS.to_vector(SPAM=False, G0=opt_G0),
                           method=method, tol=tol, maxiter=maxiter,
                           callback=callback)

    workGS.from_vector(optSol.x, SPAM=False, G0=opt_G0)
    logInfo = { 'method': method, 'tol': tol, 'maxiter': maxiter, 'opt_G0': opt_G0 }
    workGS.log("Contract to XP", logInfo)
    if optSol.fun >= CLIFF:
        raise ValueError("Failed to contract_to_xp")

    if verbosity > 1:
        print('The closest legal point found was distance: ' + str(optSol.fun))

    return optSol.fun, workGS

#modifies gates only (not rhoVecs or EVecs = SPAM)
def _contractToCP(gateset,verbosity,method='Nelder-Mead',
                 maxiter=100000, tol=1e-2, opt_G0=True):
    """
    Contract the gates of a GateSet to the completely-positive space by
    numerical optimization.

    Minimizes the frobenius distance to the original gateset plus a large
    constant ("cliff") penalty whenever 1000 times the sum of negative
    Choi eigenvalues exceeds 1e-10.  Only the gate parameters (SPAM=False)
    are varied.

    Parameters
    ----------
    gateset : GateSet
        The gateset to contract (not modified; a copy is worked on).

    verbosity : int
        How much detail to send to stdout.

    method, maxiter, tol
        Passed through to _opt.minimize.

    opt_G0 : bool, optional
        Passed as the G0 argument of the gateset's to_vector/from_vector.

    Returns
    -------
    (distance, GateSet)
        The final objective value (0.0 if no contraction was needed) and
        the contracted gateset.

    Raises
    ------
    ValueError
        If the optimizer finishes with an objective value at or above the cliff.
    """
    CLIFF = 10000

    if verbosity > 2: print("")
    if verbosity > 1: print("--- Contract to CP ---")
    workGS = gateset.copy() # scratch gateset, overwritten on every objective evaluation

    def objective_func(vectorGS):
        workGS.from_vector(vectorGS, SPAM=False, G0=opt_G0)
        scaledNegEvals = _tools.sum_of_negative_choi_evals(workGS) * 1000
        if scaledNegEvals > 1e-10:
            cliffTerm = CLIFF + scaledNegEvals
        else:
            cliffTerm = 0
        return cliffTerm + workGS.frobeniusdist(gateset)

    print_obj_func = _opt.create_obj_func_printer(objective_func)

    # the starting point is the gateset itself, so a ~zero objective means it is already legal
    startValue = objective_func(workGS.to_vector(SPAM=False, G0=opt_G0))
    if startValue < 1e-8:
        if verbosity > 1: print('Already in CP - no contraction necessary')
        return 0.0, workGS

    callback = print_obj_func if verbosity > 2 else None
    optSol = _opt.minimize(objective_func, workGS.to_vector(SPAM=False, G0=opt_G0),
                           method=method, tol=tol, maxiter=maxiter,
                           callback=callback)

    workGS.from_vector(optSol.x, SPAM=False, G0=opt_G0)
    logInfo = { 'method': method, 'tol': tol, 'maxiter': maxiter, 'opt_G0': opt_G0 }
    workGS.log("Contract to CP", logInfo)
    if optSol.fun >= CLIFF:
        raise ValueError("Failed to contract_to_cp")

    if verbosity > 1:
        print('The closest legal point found was distance: ' + str(optSol.fun))

    return optSol.fun, workGS


#modifies gates only (not rhoVecs or EVecs = SPAM)
def _contractToCP_direct(gateset,verbosity,TPalso=False,maxiter=100000,tol=1e-8):
    """
    Contract the gates of a GateSet by directly truncating the negative
    eigenvalues of each gate's Jamiolkowski (Choi) matrix.

    For every gate: optionally force the first row to [1,0,...,0], map the
    gate to its Jamiolkowski matrix, and then repeat until all eigenvalues
    are >= -tol and sum to 1 within tol (or maxiter is exceeded):
    clip negative eigenvalues to zero, shift the remaining eigenvalues so
    they sum to 1, rebuild the matrix, and mask some of its elements (see
    the comments in the loop).  The result is mapped back to a gate matrix
    and stored in a copy of the gateset as a FullyParameterizedGate.

    NOTE(review): the trace == 1 assertions and the "make trace preserving"
    masking inside the loop run regardless of TPalso -- confirm this is
    intended before calling with TPalso=False on non-TP gates.

    Parameters
    ----------
    gateset : GateSet
        The gateset to contract (not modified; a copy is returned).

    verbosity : int
        How much detail to send to stdout.

    TPalso : bool, optional
        Whether to overwrite the first row of each gate with [1,0,...,0]
        before contracting.  Also selects "CPTP" vs. "CP" in messages and
        in the log entry.

    maxiter : int, optional
        Iteration limit of the per-gate projection loop.  A warning is
        printed (no exception is raised) if it is exceeded.

    tol : float, optional
        Tolerance on eigenvalue negativity and on the eigenvalue sum
        differing from 1 that terminates the projection loop.

    Returns
    -------
    (distance, GateSet)
        The frobenius distance between the contracted and original
        gatesets, and the contracted gateset.
    """

    gs = gateset.copy() #working copy that receives the contracted gates
    if verbosity > 2: print ""
    if verbosity > 1:  print "--- Contract to %s (direct) ---" % ("CPTP" if TPalso else "CP")

    for (gateLabel,gate) in gateset.iteritems():
      new_gate = gate.copy()
      if(TPalso): 
        # overwrite the first row with [1,0,...,0]
        for k in range(new_gate.shape[1]): new_gate[0,k] = 1.0 if k == 0 else 0.0

      Jmx = _tools.jamiolkowski_iso(new_gate)
      evals,evecs = _np.linalg.eig(Jmx)

      assert( abs( sum(evals) - 1.0 ) < 1e-8 ) #check that Jmx always has trace == 1
      #if abs( sum(evals) - 1.0 ) >= 1e-8: #DEBUG
      #  print "WARNING: JMx given with evals = %s (sum = %s != 1)" % (evals,sum(evals))
      #  print "WARNING: JMx from: "; _tools.print_mx(new_gate)
      

      it = 0
      # iterate until the eigenvalues are non-negative and sum to one (both within tol)
      while min(evals) < -tol or abs( sum(evals) - 1.0 ) >= tol:

        #Project eigenvalues to being all positive
        new_evals = evals[:]

        #New projection code
        new_evals = [ max(ev.real,0) for ev in new_evals ]  #don't need .real in theory, but small im parts can snowball in practice
        total_shift = 1.0 - sum(new_evals)  #amount (usually/always < 0) needed to add to eigenvalues to make sum == 1
        sorted_evals_with_inds = sorted( enumerate(new_evals), key=lambda x: x[1] ) # (index,eval) tuples sorted by eval

        shift_left = total_shift                    # shift still to be distributed
        evals_left = len(sorted_evals_with_inds)    # eigenvalues not yet processed
        ideal_shift = shift_left / evals_left       # even share of the remaining shift
          
        for (i,eval) in sorted_evals_with_inds: #loop over new_evals from smallest to largest (note all >= 0 after clipping)
          evals_left -= 1  #number of eigenvalue beyond current eval (in sorted order)
          if eval+ideal_shift >= 0:
            # this eigenvalue can absorb its even share of the shift
            new_evals[i] = eval + ideal_shift
            shift_left -= ideal_shift
          elif evals_left > 0:
            # shifting would make it negative: zero it and spread what remains over the larger eigenvalues
            new_evals[i] = 0
            shift_left += eval
            ideal_shift = shift_left / evals_left #divide remaining shift evenly among remaining eigenvalues
          else: #last eigenvalue would be < 0 with ideal shift and can't set == 0 b/c all others must be zero too
            new_evals[i] = 1.0 # so set what was the largest eigenvalue == 1.0
          
        #if abs( sum(new_evals) - 1.0 ) >= 1e-8:              #DEBUG
        #  print "DEBUG: sum(new_evals) == ",sum(new_evals)   #DEBUG
        #  print "DEBUG: new_evals == ",new_evals             #DEBUG
        #  print "DEBUG: orig evals == ",evals                #DEBUG
        assert( abs( sum(new_evals) - 1.0 ) < 1e-8 )

        
#  OLD projection code -- can runaway, and take many iters
#        while min(new_evals) < 0:
#          new_evals = [ max(ev.real,0) for ev in new_evals ]  #don't need .real in theory, but small im parts can snowball in practice
#          inds_to_shift = [i for (i,ev) in enumerate(new_evals) if ev > 0]
#          assert(len(inds_to_shift) > 0)
#
#          shift = (1.0 - sum(new_evals))/float(len(inds_to_shift))          
#          for i in inds_to_shift: new_evals[i] += shift
#          
#          if abs( sum(new_evals) - 1.0 ) >= 1e-8:              #DEBUG
#            print "DEBUG: sum(new_evals) == ",sum(new_evals)   #DEBUG
#            print "DEBUG: new_evals == ",new_evals             #DEBUG
#            
#          assert( abs( sum(new_evals) - 1.0 ) < 1e-8 )

        # rebuild the Jamiolkowski matrix from the projected eigenvalues and the original eigenvectors
        new_Jmx = _np.dot(evecs, _np.dot( _np.diag(new_evals), _np.linalg.inv(evecs) ) )

        #Make trace preserving by zeroing out real parts of off diagonal blocks and imaginary parts
        #  within diagonal 1x1 and 3x3 block (so really just the 3x3 block's off diag elements)
        #assert(new_Jmx.shape == (4,4)) #NOTE: only works for 1-qubit case so far
        kmax = new_Jmx.shape[0]
        for k in range(1,kmax):
          # first row / first column (beyond [0,0]): keep only the imaginary part
          new_Jmx[0,k] = 1j*new_Jmx[0,k].imag
          new_Jmx[k,0] = 1j*new_Jmx[k,0].imag
        for i in range(1,kmax):
          for j in range(1,kmax):
            # lower-right block: keep only the real part
            new_Jmx[i,j] = new_Jmx[i,j].real
          
        evals,evecs = _np.linalg.eig(new_Jmx)

        #DEBUG
        #EVAL_TOL = 1e-10
        #if abs( sum(evals) - 1.0 ) >= 1e-8:
        #  print "DEBUG2: sum(evals) == ",sum(evals)
        #  print "DEBUG2: evals == ",evals
        #if min(evals) < -EVAL_TOL:
        #  print "DEBUG3: evals = ",evals
          
        assert( min(evals) >= -1e-10 and abs( sum(evals) - 1.0 ) < 1e-8) #Check that trace-trunc above didn't mess up positivity

        new_gate = _tools.jamiolkowski_iso_inv(new_Jmx)

        #Old way of enforcing TP -- new way should be better since it's not iterative, but keep this around just in case.
        #  new_gate = _tools.jamiolkowski_iso_inv(new_Jmx)
        #
        #  if(TPalso):
        #    for k in range(new_gate.shape[1]): 
        #      #if k == 0: assert( abs(new_gate[0,k] - 1.0) < 1e-8 )
        #      #else: assert( abs(new_gate[0,k]) < 1e-8 )
        #      new_gate[0,k] = 1.0 if k == 0 else 0.0
        #
        #  new_Jmx = _tools.jamiolkowski_iso(new_gate)
        #  evals,evecs = _np.linalg.eig(new_Jmx)

        it += 1
        if it > maxiter: break

      #gs[gateLabel] = new_gate
      gs.set_gate(gateLabel, _objs.FullyParameterizedGate( new_gate ) )

      # the loop above only exits with it > maxiter via its break statement
      if it > maxiter:
        print "Warning: Max iterations exceeded in contract_to_cp_direct"
      #else: print "contract_to_cp_direct success in %d iterations" % it  #DEBUG

      if verbosity > 2:
        print "Direct CP contraction of %s gate gives frobenius diff of %g" % \
            (gateLabel, _tools.frobeniusdist(gs[gateLabel],gate))
        
    gs.log("Choi-Truncate to %s" % ("CPTP" if TPalso else "CP"), { 'maxiter': maxiter } )
    distance = gs.frobeniusdist(gateset)
    if verbosity > 1:
        print 'The closest legal point found was distance: %s' % str(distance)
    
    return distance, gs


#modifies gates and the first element of each rhoVec (EVecs are left alone)
def _contractToTP(gateset,verbosity):
    """
    Project a GateSet onto the trace-preserving space.

    Works on a copy of gateset: the first row of every gate is set to
    [1,0,...,0] and the first element of every rhoVec is set to
    1 / dim**0.25, where dim is the gateset's dimension.

    Parameters
    ----------
    gateset : GateSet
        The gateset to contract (not modified).

    verbosity : int
        How much detail to send to stdout.

    Returns
    -------
    (distance, GateSet)
        The frobenius distance between the projected and original
        gatesets, and the projected gateset.
    """
    if verbosity > 2: print("")
    if verbosity > 1: print("--- Contract to TP ---")

    tpGateset = gateset.copy()
    for gateMx in tpGateset.values():
        # first row becomes [1, 0, ..., 0]
        gateMx[0,0] = 1.0
        for col in range(1, gateMx.shape[1]):
            gateMx[0,col] = 0.0

    dim = tpGateset.get_dimension()
    for prepVec in tpGateset.rhoVecs:
        prepVec[0] = 1.0 / dim**0.25
    
    distance = tpGateset.frobeniusdist(gateset)
    if verbosity > 1:
        print('Projected TP gateset was at distance: %g' % distance)

    tpGateset.log("Contract to TP")
    return distance, tpGateset


#modifies rhoVecs and EVecs (SPAM) only (not gates)
def _contractToValidSPAM(gateset, verbosity=0):
    """
    Contract the state preparation and measurement (SPAM) operations of
    a GateSet to the space of valid quantum operations.

    Each rhoVec is rescaled so that its first element takes the value
    that gives unit trace (every EVec is divided by the same factor),
    and its remaining elements are then shrunk until the corresponding
    matrix has no eigenvalue below -1e-9.  Each EVec whose matrix has
    an eigenvalue outside [0,1] has its eigenvalues clipped to that
    interval.  Only rhoVecs and EVecs are written by this function.

    Parameters
    --------------------
    gateset : GateSet
        The gateset to contract.  It is copied; the copy is modified.

    verbosity : int
        How much detail to send to stdout.

    Returns
    -------
    GateSet
        The contracted gateset
    """

    TOL = 1e-9
    gs = gateset.copy()

    def lowest_real_eigenvalue(ppvec):
        # smallest real part among the eigenvalues of the matrix form of ppvec
        return min( [ev.real for ev in _np.linalg.eigvals( _tools.ppvec_to_stdmx(ppvec) ) ])

    # ** assumption: only the first vector element of pauli vectors has nonzero trace
    dummyVec = gateset.rhoVecs[0].copy(); dummyVec[0] = 1.0
    firstElTrace = _np.real( _tools.trace(_tools.ppvec_to_stdmx(dummyVec)))  # == sqrt(2)**nQubits

    #Value of the first element that gives trace == 1.0 (maybe later have this be optional)
    firstElTarget = 1.0 / firstElTrace    #TODO: make this function more robust
                                          # to multiple rhovecs -- can only use on ratio,
                                          # so maybe take average of ideal ratios for each rhoVec
                                          # and apply that?  The function works fine now for just one rhovec.
    diff = 0

    # rhoVec must be positive semidefinite and trace = 1
    for i in range(len(gs.rhoVecs)):
        vec = gs.rhoVecs[i].copy()

        if abs(firstElTarget - vec[0,0]) > TOL:
            r = firstElTarget / vec[0,0]
            vec *= r  #multiply rhovec by factor
            # Divide *every* EVec by the same factor (index j, not i) --
            #  presumably so that E-rho products are preserved; confirm.
            for j in range(len(gs.EVecs)):
                gs.EVecs[j] /= r
            gs.make_spams()

        #Ensure positive semidefinite
        lowEval = lowest_real_eigenvalue(vec)
        while lowEval < -TOL:
            idEl = vec[0,0] #only element with trace (even for multiple qubits) -- keep this constant and decrease others
            vec /= 1.00001; vec[0,0] = idEl
            lowEval = lowest_real_eigenvalue(vec)

        diff += _np.linalg.norm( gateset.rhoVecs[i] - vec )
        gs.set_rhovec(vec,i)

    # EVec must have eigenvals between 0 and 1 <==> positive semidefinite and trace <= 1
    for i in range(len(gs.EVecs)):
        evals,evecs = _np.linalg.eig( _tools.ppvec_to_stdmx(gs.EVecs[i]) )
        if min(evals) < 0.0 or max(evals) > 1.0:
            if all([ev > 1.0 for ev in evals]):
                evals[ evals.argmin() ] = 0.0 #at least one eigenvalue must be != 1.0
            if all([ev < 0.0 for ev in evals]):
                evals[ evals.argmax() ] = 1.0 #at least one eigenvalue must be != 0.0
            for (k,ev) in enumerate(evals):
                if ev < 0.0: evals[k] = 0.0
                if ev > 1.0: evals[k] = 1.0
            # rebuild the matrix from the clipped spectrum and the original eigenvectors
            mx = _np.dot(evecs, _np.dot( _np.diag(evals), _np.linalg.inv(evecs) ) )
            vec = _tools.stdmx_to_ppvec(mx)
            # NOTE: only EVecs that needed clipping contribute to diff
            diff += _np.linalg.norm( gateset.EVecs[i] - vec )
            gs.set_evec( vec, i )

    gs.log("Contract to valid SPAM")
    if verbosity > 2: print("")
    if verbosity > 1:
        print("--- Contract to valid SPAM ---")
        print("Sum of norm(deltaE) and norm(deltaRho) = %g" % diff)
    if verbosity > 2:
        for (i,rhoVec) in enumerate(gateset.rhoVecs):
            print("  rhoVec[%d]: %s ==> %s " % (i, str(_np.transpose(rhoVec)), str(_np.transpose(gs.rhoVecs[i]))))
        for (i,EVec) in enumerate(gateset.EVecs):
            print("  EVec[%d]: %s ==> %s " % (i, str(_np.transpose(EVec)), str(_np.transpose(gs.EVecs[i]))))

    return gs #return contracted gateset


def find_closest_unitary_gatemx(gateMx):
  """
  Get the closest gate matrix (by maximizing fidelity)
    to gateMx that describes a unitary quantum gate.

  The search is a Nelder-Mead minimization of the negative fidelity
  between the "std"-basis Jamiolkowski matrix of gateMx and that of a
  single-qubit gate built from three parameters, starting from (0,0,0).

  Parameters
  ----------
  gateMx : numpy array
      The gate matrix to act on.  Only matrices with shape[0] == 4
      (a single qubit) are supported.

  Returns
  -------
  numpy array
      The resulting closest unitary gate matrix.

  Raises
  ------
  ValueError
      If gateMx.shape[0] is not 4.
  """
  gate_JMx = _tools.jamiolkowski_iso( gateMx, choiMxBasis="std" )

  def get_gate_mx_1q(basisVec):  # 1 qubit version
      return _tools.single_qubit_gate(basisVec[0],
                                      basisVec[1],
                                      basisVec[2])

  if gateMx.shape[0] == 4:
    initialBasisVec = [ 0, 0, 0]  #start with I until we figure out how to extract target unitary
    getGateMx = get_gate_mx_1q
  #Note: seems like for 2 qubits bell = [1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 ]/sqrt(4) (4 zeros between 1's since state dimension is 4 ( == sqrt(gate dimension))
  else:
    raise ValueError("Can't get closest unitary for > 1 qubits yet -- need to generalize.")

  def objective_func(basisVec):
    # Compare in the "std" Choi basis; an older version built JU in a
    #  different basis, which was the wrong matrix to check fidelity against.
    JU = _tools.jamiolkowski_iso( getGateMx(basisVec), choiMxBasis="std" )
    return -_tools.fidelity(gate_JMx, JU)

  # NOTE(review): the printer is not passed to minimize (callback=None);
  #  the call is kept so the callback can be switched back on for debugging.
  print_obj_func = _opt.create_obj_func_printer(objective_func)
  solution = _spo.minimize(objective_func, initialBasisVec,  options={'maxiter': 10000},
                           method='Nelder-Mead',callback=None, tol=1e-8)

  # Rebuild the gate matrix from the best parameter vector found.
  return getGateMx(solution.x)