In [4]:
import torch

class EasyAcc:
    def __init__(self):
        self.n = 0
        self.sum = 0
        self.sumsq = 0

    def __iadd__(self, other):
        self.n += 1
        self.sum += other
        self.sumsq += other*other
        return self

    def __isub__(self, other):
        self.n += 1
        self.sum -= other
        self.sumsq += other*other
        return self

    def mean(self):
        return self.sum / max(self.n, 1)

    def var(self):
        from math import sqrt
        return sqrt(self.sumsq / max(self.n, 1) - self.mean()**2)

    def semean(self):
        from math import sqrt
        return self.var() / sqrt(max(self.n, 1))
    
class EasyPoissonBootstrapAcc:
    def __init__(self, batch_size, confidence=0.95, seed=2112):
        from math import ceil
        from numpy.random import default_rng
        
        self.n = 0
        self.batch_size = batch_size
        self.confidence = confidence
        self.samples = [ EasyAcc() for _ in range(int(ceil(3 / (1 - self.confidence)))) ]
        self.rng = default_rng(seed)
        
    def __iadd__(self, other):
        self.n += 1
        
        poissons = self.rng.poisson(lam=self.batch_size, size=len(self.samples)) / self.batch_size
        
        for chirp, acc in zip(poissons, self.samples):
            acc += chirp * other
            
        return self
         
    def __isub__(self, other):
        return self.__iadd__(-other)
    
    def ci(self):
        import numpy
        quantiles = numpy.quantile(a=[ x.mean() for x in self.samples ],
                                   q=[1 - self.confidence, 0.5, self.confidence])
        return list(quantiles)
    
    def formatci(self):
        z = self.ci()
        return '[{:<.5f}, {:<.5f}, {:<.5f}]'.format(z[0], z[1], z[2])
        
def isfloat(element):
    try:
        float(element)
        return True
    except ValueError:
        return False
    
def floatorzero(element):
    try:
        return float(element)
    except ValueError:
        return 0.0
    
# match vw's flexible parsing
def makeCategoricalData(filename):
    from collections import defaultdict
    import numpy
    
    print('using categorical parsing')
        
    isNumeric = None
    extras = defaultdict(set)

    with open(filename, 'r') as f:
        for line in f:
            targetstr, rest = line.strip().split('|')
            target = float(targetstr)
            stringfeatures = rest.split()
            for col, (isnum, v) in enumerate((isfloat(x), x) for x in stringfeatures):
                if not isnum:
                    extras[col].add(v)
                    
    onehotmap = {}
    for col, values in extras.items():
        for v in values:
            if (col, v) not in onehotmap:
                onehotmap[col, v] = len(onehotmap)
             
    print(f'creating {len(onehotmap)} additional one-hot columns')        
    
    Y = []
    X = []
    with open(filename, 'r') as f:
        for line in f:
            targetstr, rest = line.strip().split('|')
            target = float(targetstr)
            stringfeatures = rest.split()
            features = [0]*len(onehotmap) + [ floatorzero(x) for x in stringfeatures ]
            for col, v in enumerate(stringfeatures):
                if (col, v) in onehotmap:
                    features[onehotmap[col, v]] = 1
            
            Y.append(target)
            X.append(features)
 
    Y = numpy.array(Y)
    Ymin, Ymax = numpy.min(Y), numpy.max(Y)
    Y = (Y - Ymin) / (Ymax - Ymin)
    X = numpy.array(X)
    Xmin, Xmax = numpy.min(X, axis=0, keepdims=True), numpy.max(X, axis=0, keepdims=True)
    X = (X - Xmin) / numpy.maximum(Xmax - Xmin, 1e-9)
    
    return X, Y
    
def makeData(filename):
    import numpy
    
    Y = []
    X = []
    with open(filename, 'r') as f:
        for line in f:
            targetstr, rest = line.strip().split('|')
            target = float(targetstr)
            try:
                features = [ float(x) for x in rest.split() ]
            except ValueError:
                return makeCategoricalData(filename)
            
            Y.append(target)
            X.append(features)
 
    Y = numpy.array(Y)
    Ymin, Ymax = numpy.min(Y), numpy.max(Y)
    Y = (Y - Ymin) / (Ymax - Ymin)
    X = numpy.array(X)
    Xmin, Xmax = numpy.min(X, axis=0, keepdims=True), numpy.max(X, axis=0, keepdims=True)
    X = (X - Xmin) / (Xmax - Xmin)
    
    return X, Y

class MyDataset(torch.utils.data.Dataset):
    def __init__(self, filename):
        X, Y = makeData(filename)
        self.Xs = torch.Tensor(X)
        self.Ys = torch.Tensor(Y).unsqueeze(1)
            
    def __len__(self):
        return self.Xs.shape[0]

    def __getitem__(self, index):
        # Select sample
        return self.Xs[index], self.Ys[index]

In [5]:
# best constant predictor
# if you don't beat this, you have a problem

def bestconstant(filename):
    import numpy
    
    dataset = MyDataset(filename)
    
    ymed = torch.median(dataset.Ys).item()
    ymedabsloss = torch.mean(torch.abs(dataset.Ys - ymed)).item()

    l1_loss = torch.nn.L1Loss(reduction='none')
    log_loss = torch.nn.BCELoss()
    sumabsloss, sumlogloss = EasyAcc(), EasyAcc()
    
    generator = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)
    
    for _, Ys in generator:
        with torch.no_grad():
            yhat = torch.Tensor([ymed]).expand(*Ys.shape)
            loss = l1_loss(input=yhat, target=Ys)
            losspredict = torch.Tensor([ymedabsloss]).expand(*Ys.shape)
            loglosspredict = log_loss(input=losspredict, target=loss)
            
            sumabsloss += torch.mean(losspredict).item()
            sumlogloss += torch.mean(loglosspredict).item()
    
    return { 'best_constant_predict': ymed,
             'best_constant_loss_predict': ymedabsloss,
             'best_constant_average_reward': 1 - sumabsloss.mean(), 
             'best_constant_average_logloss': sumlogloss.mean()
           }

In [47]:
bestconstant('black_friday.dat')

using categorical parsing
creating 13 additional one-hot columns


{'best_constant_predict': 0.4867081642150879,
 'best_constant_loss_predict': 0.17742004990577698,
 'best_constant_average_reward': 0.8225799649925263,
 'best_constant_average_logloss': 0.46747197577744826}

In [4]:
bestconstant('zurich.dat')

using categorical parsing
creating 4 additional one-hot columns


{'best_constant_predict': 0.9444969892501831,
 'best_constant_loss_predict': 0.0004755092377308756,
 'best_constant_average_reward': 0.9995244907331653,
 'best_constant_average_logloss': 0.00411359055235489}

In [3]:
bestconstant('BNG_auto_price.dat')

{'best_constant_predict': 0.22539003193378448,
 'best_constant_loss_predict': 0.09392284601926804,
 'best_constant_average_reward': 0.906077153980732,
 'best_constant_average_logloss': 0.31152110839271546}

In [6]:
bestconstant('BNG_cpu_act.dat')

{'best_constant_predict': 0.6102600693702698,
 'best_constant_loss_predict': 0.04165152832865715,
 'best_constant_average_reward': 0.9583484716713428,
 'best_constant_average_logloss': 0.17315774370002746}

In [57]:
bestconstant('BNG_wisconsin.dat')

{'best_constant_predict': 0.29374879598617554,
 'best_constant_loss_predict': 0.16278983652591705,
 'best_constant_average_reward': -0.16278983652591705,
 'best_constant_average_logloss': 0.44426723398303986}

How to achieve $O(1)$ in argmax?

Parameterize our regressor class $f(x, a; \theta) = g(\hat{a}(x; \theta) - a; \theta)$ where $g(0; \theta)$ is a global minimum for all $\theta$.  So $\hat{a}(x; \theta)$ is the argmax we want. 

For the cats datasets, we use $g(z; \theta) = \sigma\left(|w_{\theta}| |z| + b_{\theta}\right)$, and we have a choice of $\hat{a}$:
* "linear": $\hat{a}(x; \theta) = v_{\theta}^\top x$.
* "nonlinear": $\hat{a}(x; \theta)$ is a deep neural net or kernel machine.

In [13]:
class ArgmaxPlusDispersion(torch.nn.Module):
    def __init__(self, argmaxblock):
        super(ArgmaxPlusDispersion, self).__init__()
        
        self.argmaxblock = argmaxblock
        self.logitsigma = torch.nn.Parameter(torch.ones(1))
        self.scale = torch.nn.Parameter(torch.ones(1))
        self.sigmoid = torch.nn.Sigmoid()
        
    def argmax(self, Xs):
        return 1 - self.sigmoid(self.logitsigma).unsqueeze(0).expand(Xs.shape[0], -1), self.argmaxblock(Xs)

    def forward(self, Xs, As):
        _, Yhat = self.argmax(Xs)
        return 1 - self.sigmoid(torch.abs(self.scale * (Yhat - As)) + self.logitsigma)
    
class LinearArgmax(torch.nn.Module):
    def __init__(self, dobs):
        super(LinearArgmax, self).__init__()
        
        self.linear = torch.nn.Linear(in_features=dobs, out_features=1)
        self.sigmoid = torch.nn.Sigmoid()
        
    def forward(self, Xs):
        return self.sigmoid(self.linear(Xs))
    
class ResidualBlock(torch.nn.Module):
    def __init__(self, d, device):
        super(ResidualBlock, self).__init__()
        
        self.W = torch.nn.Parameter(torch.zeros(d, d, device=device))
        self.afunc = torch.nn.LeakyReLU(negative_slope=0.01, inplace=True)
        
    def forward(self, X):
        return X + self.afunc(torch.matmul(X, self.W))
    
class NonlinearArgmax(torch.nn.Module):
    def __init__(self, dobs, depth, device):
        super(NonlinearArgmax, self).__init__()
        
        self.block = torch.nn.Sequential(*[ResidualBlock(dobs, device) for _ in range(depth) ])
        self.linear = torch.nn.Linear(in_features=dobs, out_features=1, device=device)
        self.sigmoid = torch.nn.Sigmoid()
        
    def forward(self, Xs):
        return self.sigmoid(self.linear(self.block(Xs)))
    
class CauchyRFF(torch.nn.Module):
    def __init__(self, dobs, numrff, sigma, device):
        from math import pi, sqrt
        
        super(CauchyRFF, self).__init__()
        
        self.rffW = torch.nn.Parameter(torch.empty(dobs, numrff).cauchy_(sigma = sigma).to(device), 
                                       requires_grad=False)
        self.rffb = torch.nn.Parameter((2 * pi * torch.rand(numrff)).to(device),
                                       requires_grad=False)
        self.sqrtrff = torch.nn.Parameter(torch.Tensor([sqrt(numrff)]).to(device), 
                                          requires_grad=False)
        self.linear = torch.nn.Linear(in_features=numrff, out_features=1, device=device)
        self.sigmoid = torch.nn.Sigmoid()
        
    def forward(self, Xs):
        with torch.no_grad():
            rff = (torch.matmul(Xs, self.rffW) + self.rffb).cos() / self.sqrtrff
            
        return self.sigmoid(self.linear(rff))
    
class CorralFastIGW(object):
    def __init__(self, *, eta, gammamin, gammamax, nalgos, device):
        import numpy
        
        super(CorralFastIGW, self).__init__()
        
        self.eta = eta / nalgos
        self.gammas = torch.Tensor(numpy.geomspace(gammamin, gammamax, nalgos)).to(device)
        self.invpalgo = torch.Tensor([ self.gammas.shape[0] ] * self.gammas.shape[0]).to(device)
        
    def update(self, algo, invprop, reward):
        import numpy
        from scipy import optimize
        
        assert torch.all(reward >= 0) and torch.all(reward <= 1), reward
        
        weightedlosses = self.eta * (-reward.squeeze(1)) * invprop.squeeze(1)
        newinvpalgo = torch.scatter(input=self.invpalgo,
                                    dim=0,
                                    index=algo,
                                    src=weightedlosses,
                                    reduce='add')
                                    
        # just do this calc on the cpu
        invp = newinvpalgo.cpu().numpy() 
        invp += 1 - numpy.min(invp)
        Zlb = 0
        Zub = 1
        while (numpy.sum(1 / (invp + Zub)) > 1):
            Zlb = Zub
            Zub *= 2 
        root, res = optimize.brentq(lambda z: 1 - numpy.sum(1 / (invp + z)), Zlb, Zub, full_output=True)
        assert res.converged, res
        
        self.invpalgo = torch.Tensor(invp + root, device=self.invpalgo.device)
 
    def sample(self, fhatstar, ahatstar, fhat, X):
        N, _ = fhatstar.shape

        algosampler = torch.distributions.categorical.Categorical(probs=1.0/self.invpalgo, validate_args=False)
        algo = algosampler.sample((N,))
        invpalgo = torch.gather(input=self.invpalgo.unsqueeze(0).expand(N, -1),
                                dim=1,
                                index=algo.unsqueeze(1))
        gamma = torch.gather(input=self.gammas.unsqueeze(0).expand(N, -1),
                             dim=1,
                             index=algo.unsqueeze(1))
        
        rando = torch.rand(size=(N, 1), device=X.device)
        fhatrando = fhat(X, rando)
        probs = 1 / (1 + gamma * (1 - fhatrando / fhatstar))
        unif = torch.rand(size=(N, 1), device=X.device)
        shouldexplore = (unif <= probs).long()
        return (ahatstar + shouldexplore * (rando - ahatstar)), algo, invpalgo        
    
def learnOnline(dataset, *, seed, batch_size, modelfactory, initlr, tzero, eta, gammamin, gammamax, nalgos):
    import time
    
    torch.manual_seed(seed)
        
    generator = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
    model = None
    l1_loss = torch.nn.L1Loss(reduction='none')
    log_loss = torch.nn.BCELoss()
    
    print('{:<5s}\t{:<20s}\t{:<20s}\t{:<20s}\t{:<20s}\t{:<20s}\t{:<20s}\t{:<10s}'.format(
            'n', 'loss', 'since last', 'acc', 'since last', 'realized l1', 'since last', 'dt (sec)'),
          flush=True)
    avloss, sincelast, acc, accsincelast, avreward, rewardsincelast = [ EasyPoissonBootstrapAcc(batch_size=batch_size) for _ in range(6) ]
    
    for bno, (Xs, ys) in enumerate(generator):
        if model is None:
            from math import sqrt
            model = modelfactory(Xs)
            opt = torch.optim.Adam(( p for p in model.parameters() if p.requires_grad ), lr=initlr)
            scheduler = torch.optim.lr_scheduler.LambdaLR(opt, lr_lambda = lambda t: sqrt(tzero) / sqrt(tzero + t))
            sampler = CorralFastIGW(eta=eta, gammamin=gammamin, gammamax=gammamax, nalgos=nalgos, device=Xs.device)
            start = time.time()
            
        opt.zero_grad()
        
        with torch.no_grad():
            fhatstar, ahatstar = model.argmax(Xs)
            sample, algo, invpalgo = sampler.sample(fhatstar, ahatstar, model, Xs)
            reward = 1 - l1_loss(sample, ys)
        
        score = model(Xs, sample)
        loss = log_loss(score, reward)
        loss.backward()
        opt.step()
        scheduler.step()
        
        with torch.no_grad():
            acc += 1 - torch.mean(l1_loss(ahatstar, ys)).item()
            accsincelast += 1 - torch.mean(l1_loss(ahatstar, ys)).item()
            avreward += 1 - torch.mean(reward).item()
            rewardsincelast += 1 - torch.mean(reward).item()
            avloss += loss
            sincelast += loss
            sampler.update(algo, invpalgo, reward)

        if bno & (bno - 1) == 0:
            now = time.time()
            print('{:<5d}\t{:<20s}\t{:<20s}\t{:<20s}\t{:<20s}\t{:<20s}\t{:<20s}\t{:<10.5f}'.format(
                    avloss.n, avloss.formatci(), sincelast.formatci(), acc.formatci(),
                    accsincelast.formatci(), avreward.formatci(), rewardsincelast.formatci(),
                    now - start),
                  flush=True)
            sincelast, accsincelast, rewardsincelast = [ EasyPoissonBootstrapAcc(batch_size=batch_size) for _ in range(3) ]
            print(f'sampler.palgo = { 1/sampler.invpalgo }')

    now = time.time()
    print('{:<5d}\t{:<20s}\t{:<20s}\t{:<20s}\t{:<20s}\t{:<20s}\t{:<20s}\t{:<10.5f}'.format(
            avloss.n, avloss.formatci(), sincelast.formatci(), acc.formatci(),
            accsincelast.formatci(), avreward.formatci(), rewardsincelast.formatci(),
            now - start),
         flush=True)
    print(f'sampler.palgo = { 1/sampler.invpalgo }')

## Black Friday

In [9]:
mydata = MyDataset('black_friday.dat')

using categorical parsing
creating 13 additional one-hot columns


### Linear Argmax Result

In [48]:
def doit():
    # see tune-fastcbcorral.black_friday.res
    initlr, tzero, eta, gammamin, gammamax, nalgos = 0.024639917166813248, 54.11341668228498, 0.18242747240949592, 16, 4096, 12

    learnOnline(mydata, seed=4545, initlr=initlr, tzero=tzero, batch_size=8, 
                eta=eta, gammamin=gammamin, gammamax=gammamax, nalgos=nalgos,
                modelfactory=lambda x: ArgmaxPlusDispersion(argmaxblock=LinearArgmax(dobs=x.shape[1])))
    
doit()

n    	loss                	since last          	acc                 	since last          	realized l1         	since last          	dt (sec)  
1    	[0.44451, 1.03719, 1.77804]	[0.44451, 1.03719, 1.77804]	[0.32704, 0.76309, 1.30815]	[0.32704, 0.76309, 1.30815]	[0.04796, 0.11191, 0.19185]	[0.04796, 0.11191, 0.19185]	0.03898   
sampler.palgo = tensor([0.0826, 0.0836, 0.0826, 0.0849, 0.0838, 0.0826, 0.0838, 0.0826, 0.0836,
        0.0837, 0.0836, 0.0826])
2    	[0.63034, 1.05718, 1.62497]	[0.40123, 0.93621, 1.60492]	[0.45650, 0.76702, 1.17980]	[0.28658, 0.66868, 1.14631]	[0.09785, 0.17223, 0.25595]	[0.08721, 0.20348, 0.34882]	0.05835   
sampler.palgo = tensor([0.0820, 0.0838, 0.0820, 0.0864, 0.0840, 0.0820, 0.0840, 0.0830, 0.0841,
        0.0831, 0.0829, 0.0828])
3    	[0.72623, 1.07729, 1.50143]	[0.39741, 0.92730, 1.58965]	[0.52821, 0.78167, 1.09336]	[0.28904, 0.67442, 1.15615]	[0.12594, 0.19784, 0.27013]	[0.08596, 0.20058, 0.34385]	0.09517   
sampler.palgo = tensor([0.0825, 0.0840, 0.08

### Deep Neural Network Argmax Result

In [60]:
def doit():
    # see tune-fastcbcorraldeep.black_friday.res
    initlr, tzero, eta, gammamin, gammamax, nalgos = 0.027040928375108752, 53.93189408955668, 0.28222864777632817, 4, 4096, 15

    learnOnline(mydata, seed=4545, initlr=initlr, tzero=tzero, batch_size=8, 
                eta=eta, gammamin=gammamin, gammamax=gammamax, nalgos=nalgos,
                modelfactory=lambda x: ArgmaxPlusDispersion(argmaxblock=NonlinearArgmax(dobs=x.shape[1], depth=2, device=x.device)))    
doit()

n    	loss                	since last          	acc                 	since last          	realized l1         	since last          	dt (sec)  
1    	[0.44451, 1.03719, 1.77804]	[0.44451, 1.03719, 1.77804]	[0.32704, 0.76309, 1.30815]	[0.32704, 0.76309, 1.30815]	[0.04796, 0.11191, 0.19185]	[0.04796, 0.11191, 0.19185]	0.23831   
sampler.palgo = tensor([0.0661, 0.0671, 0.0661, 0.0673, 0.0671, 0.0673, 0.0661, 0.0661, 0.0673,
        0.0661, 0.0671, 0.0672, 0.0671, 0.0661, 0.0661])
2    	[0.65036, 1.08691, 1.66838]	[0.42501, 0.99170, 1.70006]	[0.45650, 0.76702, 1.17980]	[0.28658, 0.66868, 1.14631]	[0.09059, 0.15733, 0.23554]	[0.07603, 0.17740, 0.30411]	0.30430   
sampler.palgo = tensor([0.0656, 0.0677, 0.0656, 0.0668, 0.0687, 0.0676, 0.0656, 0.0656, 0.0676,
        0.0666, 0.0676, 0.0667, 0.0665, 0.0656, 0.0664])
3    	[0.71956, 1.08725, 1.51830]	[0.38932, 0.90842, 1.55729]	[0.53597, 0.78948, 1.10163]	[0.29909, 0.69787, 1.19635]	[0.12596, 0.19883, 0.27668]	[0.09706, 0.22647, 0.38823]	0.31657

### Laplace Kernel Argmax Result

In [61]:
def doit():
    # see tune-fastcbcorralkernel.black_friday.res
    initlr, tzero, eta, gammamin, gammamax, nalgos, sigma = 0.02416239035126675, 103.29574104500331, 0.2058418084208306, 1, 1024, 8, 0.16806449335165635 

    learnOnline(mydata, seed=4545, initlr=initlr, tzero=tzero, batch_size=8, 
                eta=eta, gammamin=gammamin, gammamax=gammamax, nalgos=nalgos,
                    modelfactory=lambda x: ArgmaxPlusDispersion(argmaxblock=CauchyRFF(dobs=x.shape[1], 
                                                                                      numrff=1024, 
                                                                                      sigma=sigma,
                                                                                      device=x.device)))
    
doit()

n    	loss                	since last          	acc                 	since last          	realized l1         	since last          	dt (sec)  
1    	[0.43171, 1.00733, 1.72684]	[0.43171, 1.00733, 1.72684]	[0.31135, 0.72649, 1.24541]	[0.31135, 0.72649, 1.24541]	[0.06189, 0.14441, 0.24756]	[0.06189, 0.14441, 0.24756]	0.01672   
sampler.palgo = tensor([0.1223, 0.1223, 0.1283, 0.1279, 0.1223, 0.1254, 0.1249, 0.1265])
2    	[0.57448, 0.97074, 1.49642]	[0.34488, 0.80471, 1.37951]	[0.44182, 0.74095, 1.13886]	[0.28140, 0.65661, 1.12562]	[0.15440, 0.27941, 0.41422]	[0.15540, 0.36261, 0.62162]	0.03083   
sampler.palgo = tensor([0.1214, 0.1298, 0.1263, 0.1286, 0.1205, 0.1235, 0.1253, 0.1245])
3    	[0.69283, 1.01894, 1.42490]	[0.39936, 0.93184, 1.59744]	[0.52410, 0.77165, 1.06703]	[0.29662, 0.69212, 1.18648]	[0.17489, 0.28042, 0.37922]	[0.09844, 0.22969, 0.39376]	0.04316   
sampler.palgo = tensor([0.1213, 0.1287, 0.1261, 0.1292, 0.1211, 0.1236, 0.1230, 0.1271])
5    	[0.77356, 1.04730, 1.34789]	[

## BNG auto price

In [62]:
mydata = MyDataset('BNG_auto_price.dat')

### Linear Argmax Result

In [55]:
def doit():
    # see tune-fastcbcorral.BNG_auto_price.res
    initlr, tzero, eta, gammamin, gammamax, nalgos = 0.04358188422013057, 46.67619815431553, 0.14095041286150903, 8, 4096, 11

    learnOnline(mydata, seed=4545, initlr=initlr, tzero=tzero, batch_size=8, 
                eta=eta, gammamin=gammamin, gammamax=gammamax, nalgos=nalgos,
                modelfactory=lambda x: ArgmaxPlusDispersion(argmaxblock=LinearArgmax(dobs=x.shape[1])))
    
doit()

n    	loss                	since last          	acc                 	since last          	realized l1         	since last          	dt (sec)  
1    	[0.41952, 0.97887, 1.67806]	[0.41952, 0.97887, 1.67806]	[0.30312, 0.70728, 1.21249]	[0.30312, 0.70728, 1.21249]	[0.08301, 0.19369, 0.33203]	[0.08301, 0.19369, 0.33203]	0.06301   
sampler.palgo = tensor([0.0937, 0.0912, 0.0902, 0.0902, 0.0911, 0.0902, 0.0902, 0.0914, 0.0902,
        0.0902, 0.0911])
2    	[0.62602, 1.04396, 1.60109]	[0.41565, 0.96985, 1.66261]	[0.45850, 0.76698, 1.17264]	[0.30891, 0.72080, 1.23566]	[0.11296, 0.19036, 0.29314]	[0.06928, 0.16165, 0.27712]	0.09326   
sampler.palgo = tensor([0.0939, 0.0905, 0.0904, 0.0925, 0.0904, 0.0896, 0.0906, 0.0916, 0.0896,
        0.0905, 0.0904])
3    	[0.74380, 1.09562, 1.49341]	[0.42316, 0.98737, 1.69264]	[0.54610, 0.80705, 1.10297]	[0.31449, 0.73381, 1.25796]	[0.11835, 0.18196, 0.25560]	[0.05991, 0.13978, 0.23963]	0.12802   
sampler.palgo = tensor([0.0931, 0.0916, 0.0897, 0.0929, 0.09

### Laplace Kernel Argmax Result

In [63]:
def doit():
    # see tune-fastcbcorralkernel.BNG_auto_price.res
    initlr, tzero, eta, gammamin, gammamax, nalgos, sigma = 0.010252315029742986, 90.19228797599096, 0.33095653977189465, 32, 4096, 16, 0.2515667011825146

    learnOnline(mydata, seed=4545, initlr=initlr, tzero=tzero, batch_size=8, 
                eta=eta, gammamin=gammamin, gammamax=gammamax, nalgos=nalgos,
                    modelfactory=lambda x: ArgmaxPlusDispersion(argmaxblock=CauchyRFF(dobs=x.shape[1], 
                                                                                      numrff=1024, 
                                                                                      sigma=sigma,
                                                                                      device=x.device)))
    
doit()

n    	loss                	since last          	acc                 	since last          	realized l1         	since last          	dt (sec)  
1    	[0.40650, 0.94851, 1.62601]	[0.40650, 0.94851, 1.62601]	[0.28903, 0.67440, 1.15612]	[0.28903, 0.67440, 1.15612]	[0.08597, 0.20060, 0.34388]	[0.08597, 0.20060, 0.34388]	0.01745   
sampler.palgo = tensor([0.0620, 0.0620, 0.0620, 0.0620, 0.0620, 0.0620, 0.0620, 0.0620, 0.0629,
        0.0629, 0.0642, 0.0620, 0.0620, 0.0639, 0.0620, 0.0642])
2    	[0.58171, 0.97460, 1.49744]	[0.37318, 0.87075, 1.49272]	[0.40955, 0.68694, 1.05592]	[0.26052, 0.60789, 1.04210]	[0.15083, 0.26231, 0.39043]	[0.11903, 0.27773, 0.47611]	0.02923   
sampler.palgo = tensor([0.0632, 0.0616, 0.0625, 0.0616, 0.0624, 0.0616, 0.0616, 0.0616, 0.0624,
        0.0634, 0.0637, 0.0633, 0.0616, 0.0645, 0.0616, 0.0637])
3    	[0.68795, 1.01305, 1.39940]	[0.38796, 0.90525, 1.55185]	[0.48510, 0.71423, 0.98872]	[0.27400, 0.63933, 1.09599]	[0.17019, 0.27040, 0.36636]	[0.10100, 0.23567, 

## BNG cpu act

In [65]:
mydata = MyDataset('BNG_cpu_act.dat')

### Linear Argmax Result

In [57]:
def doit():
    # see tune-fastcbcorral.BNG_cpu_act.res
    initlr, tzero, eta, gammamin, gammamax, nalgos = 0.05155739705298048, 54.01734580625215, 0.10294568157425851, 32, 4096, 9
    
    learnOnline(mydata, seed=4545, initlr=initlr, tzero=tzero, batch_size=8, 
                eta=eta, gammamin=gammamin, gammamax=gammamax, nalgos=nalgos,
                modelfactory=lambda x: ArgmaxPlusDispersion(argmaxblock=LinearArgmax(dobs=x.shape[1])))
    
doit()

n    	loss                	since last          	acc                 	since last          	realized l1         	since last          	dt (sec)  
1    	[0.42861, 1.00010, 1.71446]	[0.42861, 1.00010, 1.71446]	[0.32671, 0.76232, 1.30683]	[0.32671, 0.76232, 1.30683]	[0.06803, 0.15873, 0.27212]	[0.06803, 0.15873, 0.27212]	0.04840   
sampler.palgo = tensor([0.1129, 0.1102, 0.1102, 0.1113, 0.1102, 0.1124, 0.1113, 0.1102, 0.1114])
2    	[0.61247, 1.02631, 1.57700]	[0.39244, 0.91568, 1.56974]	[0.45439, 0.76379, 1.17503]	[0.28432, 0.66342, 1.13729]	[0.11695, 0.20405, 0.30249]	[0.09068, 0.21158, 0.36271]	0.08422   
sampler.palgo = tensor([0.1131, 0.1112, 0.1093, 0.1114, 0.1103, 0.1124, 0.1114, 0.1103, 0.1105])
3    	[0.67497, 1.02014, 1.43173]	[0.36146, 0.84342, 1.44585]	[0.49398, 0.74955, 1.05424]	[0.25986, 0.60635, 1.03945]	[0.15713, 0.24447, 0.33781]	[0.11514, 0.26865, 0.46055]	0.10420   
sampler.palgo = tensor([0.1123, 0.1113, 0.1086, 0.1106, 0.1095, 0.1134, 0.1142, 0.1095, 0.1106])
5    	[0.71

### Laplace Kernel Argmax Result

In [67]:
def doit():
    # see tune-fastcbcorralkernel.BNG_cpu_act.res
    initlr, tzero, eta, gammamin, gammamax, nalgos, sigma = 0.011477880564119584, 97.35078068249528, 0.24115949048600216, 32, 2048, 13, 0.409523341439077

    learnOnline(mydata, seed=4545, initlr=initlr, tzero=tzero, batch_size=8, 
                eta=eta, gammamin=gammamin, gammamax=gammamax, nalgos=nalgos,
                    modelfactory=lambda x: ArgmaxPlusDispersion(argmaxblock=CauchyRFF(dobs=x.shape[1], 
                                                                                      numrff=1024, 
                                                                                      sigma=sigma,
                                                                                      device=x.device)))
    
doit()

n    	loss                	since last          	acc                 	since last          	realized l1         	since last          	dt (sec)  
1    	[0.45471, 1.06099, 1.81884]	[0.45471, 1.06099, 1.81884]	[0.33724, 0.78689, 1.34895]	[0.33724, 0.78689, 1.34895]	[0.03776, 0.08811, 0.15105]	[0.03776, 0.08811, 0.15105]	0.00658   
sampler.palgo = tensor([0.0761, 0.0761, 0.0761, 0.0761, 0.0774, 0.0761, 0.0761, 0.0800, 0.0761,
        0.0774, 0.0761, 0.0789, 0.0774])
2    	[0.67845, 1.13140, 1.73521]	[0.45041, 1.05096, 1.80165]	[0.50450, 0.84108, 1.28981]	[0.33563, 0.78314, 1.34252]	[0.05770, 0.09695, 0.14772]	[0.03937, 0.09186, 0.15748]	0.03029   
sampler.palgo = tensor([0.0766, 0.0753, 0.0753, 0.0753, 0.0792, 0.0753, 0.0753, 0.0818, 0.0766,
        0.0779, 0.0766, 0.0780, 0.0766])
3    	[0.80106, 1.18029, 1.61241]	[0.45154, 1.05358, 1.80614]	[0.59410, 0.87634, 1.19703]	[0.33442, 0.78031, 1.33768]	[0.07061, 0.10465, 0.14347]	[0.04274, 0.09973, 0.17097]	0.04395   
sampler.palgo = tensor([0.07

## BNG wisconsin

In [68]:
mydata = MyDataset('BNG_wisconsin.dat')

### Linear Argmax Result

In [51]:
def doit():
    # see tune-fastcbcorral.BNG_wisconsin.res
    initlr, tzero, eta, gammamin, gammamax, nalgos = 0.02189132200644531, 101.61209134336646, 0.3329151288501932, 1, 2048, 10

    learnOnline(mydata, seed=4545, initlr=initlr, tzero=tzero, batch_size=8, 
                eta=eta, gammamin=gammamin, gammamax=gammamax, nalgos=nalgos,
                modelfactory=lambda x: ArgmaxPlusDispersion(argmaxblock=LinearArgmax(dobs=x.shape[1])))
    
doit()

n    	loss                	since last          	acc                 	since last          	realized l1         	since last          	dt (sec)  
1    	[0.41627, 0.97129, 1.66507]	[0.41627, 0.97129, 1.66507]	[0.29879, 0.69719, 1.19518]	[0.29879, 0.69719, 1.19518]	[0.07621, 0.17781, 0.30482]	[0.07621, 0.17781, 0.30482]	0.03317   
sampler.palgo = tensor([0.0979, 0.0979, 0.1007, 0.1003, 0.1031, 0.0979, 0.1012, 0.1001, 0.0979,
        0.1031])
2    	[0.59156, 0.99190, 1.52449]	[0.37725, 0.88025, 1.50900]	[0.44025, 0.73519, 1.12816]	[0.28936, 0.67518, 1.15744]	[0.14385, 0.24981, 0.37403]	[0.12068, 0.28158, 0.48272]	0.07493   
sampler.palgo = tensor([0.0961, 0.0973, 0.1026, 0.1010, 0.1038, 0.0961, 0.1053, 0.0983, 0.0961,
        0.1034])
3    	[0.68108, 1.01073, 1.40808]	[0.37284, 0.86996, 1.49135]	[0.51484, 0.76088, 1.04539]	[0.28746, 0.67074, 1.14984]	[0.17454, 0.27875, 0.38040]	[0.11580, 0.27020, 0.46320]	0.10708   
sampler.palgo = tensor([0.0975, 0.0955, 0.1007, 0.1015, 0.1045, 0.0944, 0.10

### Laplace Kernel Argmax Result

In [69]:
def doit():
    # see tune-fastcbcorralkernel.BNG_wisconsin.res
    initlr, tzero, eta, gammamin, gammamax, nalgos, sigma = 0.015267278594117419, 49.29292960751217, 0.4982367162934428, 16, 4096, 15, 0.227192842252423

    learnOnline(mydata, seed=4545, initlr=initlr, tzero=tzero, batch_size=8, 
                eta=eta, gammamin=gammamin, gammamax=gammamax, nalgos=nalgos,
                    modelfactory=lambda x: ArgmaxPlusDispersion(argmaxblock=CauchyRFF(dobs=x.shape[1], 
                                                                                      numrff=1024, 
                                                                                      sigma=sigma,
                                                                                      device=x.device)))
    
doit()

n    	loss                	since last          	acc                 	since last          	realized l1         	since last          	dt (sec)  
1    	[0.41351, 0.96485, 1.65403]	[0.41351, 0.96485, 1.65403]	[0.29603, 0.69075, 1.18414]	[0.29603, 0.69075, 1.18414]	[0.07897, 0.18425, 0.31586]	[0.07897, 0.18425, 0.31586]	0.03155   
sampler.palgo = tensor([0.0657, 0.0657, 0.0675, 0.0657, 0.0657, 0.0677, 0.0657, 0.0694, 0.0657,
        0.0657, 0.0674, 0.0678, 0.0670, 0.0657, 0.0673])
2    	[0.58882, 0.98708, 1.51695]	[0.37616, 0.87770, 1.50463]	[0.42405, 0.71038, 1.09143]	[0.27227, 0.63530, 1.08908]	[0.14427, 0.24932, 0.37406]	[0.11771, 0.27467, 0.47085]	0.06540   
sampler.palgo = tensor([0.0669, 0.0649, 0.0667, 0.0655, 0.0649, 0.0669, 0.0662, 0.0704, 0.0649,
        0.0662, 0.0721, 0.0670, 0.0662, 0.0649, 0.0664])
3    	[0.69865, 1.02867, 1.42087]	[0.39558, 0.92302, 1.58233]	[0.50210, 0.73936, 1.02049]	[0.28360, 0.66173, 1.13440]	[0.15976, 0.25368, 0.34105]	[0.09140, 0.21327, 0.36560]	0.07892

## Zurich

In [70]:
mydata = MyDataset('zurich.dat')

using categorical parsing
creating 4 additional one-hot columns


### Linear Argmax Result

In [53]:
def doit():
    # see tune-fastcbcorral.zurich.res
    initlr, tzero, eta, gammamin, gammamax, nalgos = 0.0301241109725143, 92.84769369162983, 0.8175612001975752, 32, 4096, 13

    learnOnline(mydata, seed=4545, initlr=initlr, tzero=tzero, batch_size=64, 
                eta=eta, gammamin=gammamin, gammamax=gammamax, nalgos=nalgos,
                modelfactory=lambda x: ArgmaxPlusDispersion(argmaxblock=LinearArgmax(dobs=x.shape[1])))
    
doit()

n    	loss                	since last          	acc                 	since last          	realized l1         	since last          	dt (sec)  
1    	[0.59803, 0.72959, 0.88688]	[0.59803, 0.72959, 0.88688]	[0.34538, 0.42136, 0.51220]	[0.34538, 0.42136, 0.51220]	[0.43158, 0.52653, 0.64003]	[0.43158, 0.52653, 0.64003]	0.10139   
sampler.palgo = tensor([0.0783, 0.0720, 0.0810, 0.0749, 0.0776, 0.0795, 0.0708, 0.0860, 0.0801,
        0.0774, 0.0706, 0.0750, 0.0769])
2    	[0.65123, 0.76060, 0.85238]	[0.61845, 0.75451, 0.91716]	[0.38770, 0.45262, 0.50653]	[0.37804, 0.46120, 0.56063]	[0.44837, 0.52137, 0.58629]	[0.40329, 0.49201, 0.59808]	0.13352   
sampler.palgo = tensor([0.0775, 0.0707, 0.0914, 0.0733, 0.0753, 0.0817, 0.0819, 0.0788, 0.0770,
        0.0790, 0.0652, 0.0732, 0.0748])
3    	[0.67103, 0.76983, 0.86572]	[0.62680, 0.76470, 0.92955]	[0.40784, 0.46919, 0.52717]	[0.39867, 0.48637, 0.59122]	[0.44661, 0.51152, 0.57194]	[0.38852, 0.47399, 0.57617]	0.16169   
sampler.palgo = tensor([0.07

### Laplace Kernel Argmax Result

In [71]:
def doit():
    # see tune-fastcbcorralkernel.zurich.res
    initlr, tzero, eta, gammamin, gammamax, nalgos, sigma = 0.03499568975690761, 98.99637157325944, 0.36153161625407354, 8, 4096, 10, 0.14724977605624373

    learnOnline(mydata, seed=4545, initlr=initlr, tzero=tzero, batch_size=64, 
                eta=eta, gammamin=gammamin, gammamax=gammamax, nalgos=nalgos,
                    modelfactory=lambda x: ArgmaxPlusDispersion(argmaxblock=CauchyRFF(dobs=x.shape[1], 
                                                                                      numrff=1024, 
                                                                                      sigma=sigma,
                                                                                      device=x.device)))
    
doit()

n    	loss                	since last          	acc                 	since last          	realized l1         	since last          	dt (sec)  
1    	[0.67555, 0.82417, 1.00184]	[0.67555, 0.82417, 1.00184]	[0.43738, 0.53361, 0.64864]	[0.43738, 0.53361, 0.64864]	[0.35341, 0.43116, 0.52411]	[0.35341, 0.43116, 0.52411]	0.01644   
sampler.palgo = tensor([0.0972, 0.1076, 0.0949, 0.0975, 0.0994, 0.0936, 0.0993, 0.1057, 0.1015,
        0.1035])
2    	[0.77189, 0.90088, 1.00736]	[0.76374, 0.93176, 1.13263]	[0.51785, 0.60571, 0.67615]	[0.52954, 0.64604, 0.78530]	[0.33236, 0.38097, 0.43141]	[0.25692, 0.31344, 0.38101]	0.02959   
sampler.palgo = tensor([0.0917, 0.1286, 0.1045, 0.0934, 0.0931, 0.0930, 0.0934, 0.1035, 0.0973,
        0.1015])
3    	[0.82258, 0.94677, 1.05974]	[0.82196, 1.00280, 1.21897]	[0.57448, 0.66104, 0.74082]	[0.60840, 0.74225, 0.90226]	[0.29107, 0.33067, 0.36710]	[0.17900, 0.21838, 0.26545]	0.04100   
sampler.palgo = tensor([0.0909, 0.1211, 0.1027, 0.0937, 0.0911, 0.0962, 0.10