# Evaluation Strategy

We use rejection sampling on data logged under $\mu$ to sample from the distribution of histories that the evaluated policy $\pi$ would have generated.

In [1]:
def innerloop(pi, rand, log):
    """Rejection-sample logged interactions so the kept ones look drawn from pi.

    pi   -- callable pi(x, a) giving the evaluated policy's action probability.
    rand -- object exposing uniform(); one draw is consumed per logged record.
    log  -- iterable of (x, a, r, p) records, p being the logged probability.

    Yields (x, a, r) for each accepted record.

    `mu` and `maxoverxa` are not parameters; they are looked up in the
    surrounding scope when the generator runs.
    """
    for context, action, reward, logged_prob in log:
        # Recomputed on every record: the consumer may have changed pi since
        # the previous yield, which moves the bound.
        ratio_bound = maxoverxa(lambda x, a: pi(x, a) / mu(x, a))
        threshold = rand.uniform() * ratio_bound * logged_prob
        accepted = threshold < pi(context, action)
        if accepted:
            # note: yielded-to can change pi by reference here
            yield context, action, reward

def effectiveinnerloop(pi, mu, rand, actions, D):
    """Equivalent view of `innerloop` with the logging step made explicit.

    pi      -- callable pi(x, a): probability under the evaluated policy.
    mu      -- callable mu(x, a): probability under the logging policy.
    rand    -- object exposing choice(seq, p=...) and uniform().
    actions -- sequence of candidate actions.
    D       -- iterable of (x, r) pairs.

    For each pair an action is drawn from mu, then the same accept/reject
    test as `innerloop` is applied. Yields (x, a, r) for accepted draws.

    `maxoverxa` is not a parameter; it is looked up in the surrounding scope.
    """
    for context, reward in D:
        propensities = [mu(context, candidate) for candidate in actions]
        action = rand.choice(actions, p=propensities)  # non-uniform
        logged_prob = mu(context, action)

        # Recomputed per sample because the consumer may mutate pi between yields.
        ratio_bound = maxoverxa(lambda x, a: pi(x, a) / mu(x, a))
        threshold = rand.uniform() * ratio_bound * logged_prob
        accepted = threshold < pi(context, action)
        if accepted:
            # note: yielded-to can change pi by reference here
            yield context, action, reward

From `effectiveinnerloop`, the probability of yielding $(x, a, r)$ given the history $h_{t-1}$ is
$$
\begin{aligned}
\mathrm{Pr}\left(\text{yield}(x, a, r) \left|\right. h_{t-1} \right) &\propto \mathrm{Pr}(x, r | h_{t-1}) \mathrm{Pr}(a | x, r, h_{t-1}) \mathrm{Pr}(\text{yield}(x, a, r) | x, a, r, h_{t-1}) \\
&\propto \mathrm{Pr}_D(x, r) \mathrm{Pr}_{\mu}(a | x) \frac{1}{\mathrm{Pr}_{\mu}(a | x) \max_{x'',a''} \frac{\pi(x'', a'')}{\mu(x'', a'')} } \mathrm{Pr}_{\pi(h_{t-1})}\left(a \left|\right. x\right) \\
&\propto \left(\frac{1}{\max_{x'',a''} \frac{\pi(x'', a'')}{\mu(x'', a'')} } \right) \mathrm{Pr}_D(x, r) \mathrm{Pr}_{\pi(h_{t-1})}\left(a \left|\right. x\right)
\end{aligned}
$$

The normalization constant is exactly $\max_{x'',a''} \frac{\pi(x'', a'')}{\mu(x'', a'')}$ since the other terms integrate to 1.  So $\left(\frac{1}{\max_{x'',a''} \frac{\pi(x'', a'')}{\mu(x'', a'')} } \right)$ is the acceptance rate, e.g. when $\pi = \mu$ the acceptance rate is 1.  

In practice, if $\pi$ can be arbitrary, then the worst case is a $\pi$ that puts all of its mass on the least likely logged action, so `maxratio` is $1/\min_{x, a} \mu(x, a)$ and the acceptance rate is $\min_{x, a} \mu(x, a)$.

# Code and Results

In [2]:
class EasyAcc:
    """Tiny running accumulator: an update count plus a signed running sum.

    `acc += v` and `acc -= v` each count as one update; `mean()` divides the
    running sum by the number of updates.
    """

    def __init__(self):
        # Start empty: no updates seen, nothing summed.
        self.n, self.sum = 0, 0

    def __iadd__(self, other):
        """Record one update that adds `other` to the running sum."""
        self.n += 1
        self.sum += other
        return self

    def __isub__(self, other):
        """Record one update that subtracts `other` from the running sum."""
        self.n += 1
        self.sum -= other
        return self

    def mean(self):
        """Return sum / n, using a denominator of 1 while nothing was recorded."""
        denominator = max(self.n, 1)
        return self.sum / denominator

def evalit(path, seed, maxratio, veedub):
    """Replay logged examples through 16 VW learners with rejection sampling.

    Every `*.json.gz` file found under `path` is read line by line. Each line
    that parses as JSON and is not flagged `_skipLearn` is shown to each of the
    16 learners in turn; a learner learns from the example only when
    `random() * maxratio * p < pi[a]`, where `p` is the logged probability,
    `a` the logged action index and `pi` that learner's own prediction.

    Parameters
    ----------
    path : str
        Directory walked recursively for `*.json.gz` files.
    seed : int
        Seed for the single random stream shared by all 16 learners.
    maxratio : float
        Bound on pi/mu used in the acceptance test.
    veedub : str
        Command-line argument string passed to every `pyvw.vw` instance.

    Output
    ------
    Prints a header, then one progress row each time learner 15's accepted
    count reaches a power of two, then a final row. A row shows learner 15's
    accepted count, raw count and acceptance rate; the [2nd lowest, median,
    2nd highest] of the 16 learners' mean costs, overall and since the last
    row; the most recent pi[a]; and learner 15's mean pi[a] since the last
    row. Returns None.

    Changes relative to the previous version: only JSON decode failures are
    skipped (a bare `except:` also swallowed KeyboardInterrupt), and the final
    row prints `nan` for pi[a] instead of raising when no example was
    processed.
    """
    from vowpalwabbit import pyvw
    import fnmatch
    import gzip
    import json
    import os
    import random

    randState = random.Random(seed)
    vw = [ pyvw.vw(veedub) for _ in range(16) ]

    # NOTE(review): files are visited in os.walk order, which is not sorted;
    # confirm that run-to-run file order is stable on the target filesystem.
    dsfiles = (os.path.join(dirpath, f)
               for dirpath, dirnames, files in os.walk(path)
               for f in fnmatch.filter(files, '*.json.gz'))

    makeacc = lambda: [ EasyAcc() for _ in range(16)]
    rawcount, acceptrate, cost, sincelastcost, sincelastpia = makeacc(), makeacc(), makeacc(), makeacc(), makeacc()

    rowfmt = '{:<6d}\t{:<8d}\t{:<5.5f}\t[{:<5.5f},{:<5.5f},{:<5.5f}]\t[{:<5.5f},{:<5.5f},{:<5.5f}]\t{:<9.5f}\t{:<9.5f}'

    def report(pia):
        """Print one row from the current accumulators; `pia` is the latest pi[a]."""
        # The accumulators are read through the closure, so this always sees
        # the current `sincelast*` lists even after they are reset below.
        sortcost = sorted(cost, key=lambda x: x.mean())
        sortsincelastcost = sorted(sincelastcost, key=lambda x: x.mean())
        print(rowfmt.format(
                    cost[15].n,
                    rawcount[15].n,
                    acceptrate[15].mean(),
                    # indices 1 / (7, 8) / 14 of the 16 sorted means
                    sortcost[1].mean(),
                    0.5 * (sortcost[7].mean() + sortcost[8].mean()),
                    sortcost[14].mean(),
                    sortsincelastcost[1].mean(),
                    0.5 * (sortsincelastcost[7].mean() + sortsincelastcost[8].mean()),
                    sortsincelastcost[14].mean(),
                    pia,
                    sincelastpia[15].mean()
              ),
              flush=True)

    print('{:<6s}\t{:<8s}\t{:<6s}\t{:<25s}\t{:<25s}\t{:<9s}\t{:<9s}'.format(
                     'n', 'n raw', 'accept', 'cost', 'since last', 'pi[a]', 'since last pi[a]',
                ),
               flush=True)

    # Latest pi[a] seen; stays nan when no example is ever processed so the
    # final report can still be printed.
    lastpia = float('nan')

    for fn in dsfiles:
        with gzip.open(fn, 'rt') as f:
            for line in f:
                try:
                    o = json.loads(line)
                except ValueError:
                    # json.JSONDecodeError is a ValueError: skip lines that
                    # are not valid JSON, but let everything else propagate.
                    continue

                if o.get('_skipLearn', False):
                    continue

                p = float(o['_label_probability'])
                a = int(o['_labelIndex'])

                for ind in range(16):
                    rawcount[ind] += 1
                    # Predict on the raw logged line, before any relabelling.
                    pi = vw[ind].predict(line)
                    pia = pi[a]
                    lastpia = pia

                    if randState.random() * maxratio * p < pia:
                        acceptrate[ind] += 1
                        cost[ind] += float(o['_label_cost'])
                        sincelastcost[ind] += float(o['_label_cost'])
                        sincelastpia[ind] += pia

                        # Learn with this learner's own probability as the label
                        # probability; `p` was captured above so later learners
                        # still test against the logged value.
                        o['_label_probability'] = pia
                        vw[ind].learn(json.dumps(o))

                        # Report when learner 15's accepted count is a power of two.
                        if ind == 15 and cost[ind].n & (cost[ind].n - 1) == 0:
                            report(pia)
                            sincelastcost, sincelastpia = makeacc(), makeacc()
                    else:
                        acceptrate[ind] += 0

    report(lastpia)

def forkit(func, args, kwds):
    """Run `func(*args, **kwds)` in a one-worker process pool and return its result.

    Parameters
    ----------
    func : callable
        Function to execute in the worker process.
    args : tuple
        Positional arguments for `func`.
    kwds : dict
        Keyword arguments for `func`.

    Returns whatever `func` returns. `Pool.apply` blocks until the call
    finishes.

    The pool is used as a context manager so the worker process is shut down
    when the call completes; previously the pool was never closed and the
    worker was left running after every invocation.
    """
    from multiprocessing import Pool

    # apply() has already returned the result by the time the `with` block
    # exits, so terminating the pool on exit cannot cut the call short.
    with Pool(processes=1) as pool:
        return pool.apply(func, args, kwds)

## Baseline

`--cb_explore_adf --epsilon 0.2 --dsjson --cb_type ips -q RA -q PA -l 0.0001 --l1 1e-06 --power_t 0`

### Details

In [82]:
# Baseline run: epsilon-greedy exploration (--epsilon 0.2) with the ips cb_type.
# NOTE(review): maxratio = 1/(0.2/5) presumably corresponds to the logging
# policy's epsilon 0.2 spread over 5 actions -- confirm against the logged data.
forkit(
    evalit,
    (),
    kwds=dict(
        path='/mnt/c/Users/pmineiro/Downloads/iris/',
        seed=4545,
        maxratio=1/(0.2/5),
        veedub='--cb_explore_adf --epsilon 0.2 --dsjson --cb_type ips -q RA -q PA -l 0.0001 --l1 1e-06 --power_t 0',
    ),
)

n     	n raw   	accept	cost                     	since last               	pi[a]    	since last pi[a]
1     	10      	0.10000	[0.00000,0.00000,0.00000]	[0.00000,0.00000,0.00000]	0.16667  	0.16667  
2     	40      	0.05000	[0.00000,0.00000,0.00000]	[0.00000,0.00000,0.00000]	0.16667  	0.16667  
4     	129     	0.03101	[0.00000,0.00000,0.00000]	[0.00000,0.00000,0.00000]	0.20000  	0.20000  
8     	307     	0.02606	[-0.10000,0.00000,0.00000]	[-0.11111,0.00000,0.00000]	0.16667  	0.26667  
16    	469     	0.03412	[-0.06667,0.00000,0.00000]	[0.00000,0.00000,0.00000]	0.20000  	0.17619  
32    	1180    	0.02712	[-0.04545,0.00000,0.00000]	[-0.03448,0.00000,0.00000]	0.20000  	0.18542  
64    	2229    	0.02871	[-0.03409,-0.01212,0.00000]	[-0.03030,0.00000,0.00000]	0.20000  	0.19427  
128   	3971    	0.03223	[-0.02740,-0.01246,0.00000]	[-0.04286,-0.01366,0.00000]	0.83333  	0.20016  
256   	7522    	0.03403	[-0.02508,-0.01770,-0.00800]	[-0.03906,-0.02037,0.00000]	0.83333  	0.70547  
512   	14334   	0

## ips $\rightarrow$ mtr

`--cb_explore_adf --epsilon 0.2 --dsjson --cb_type mtr -q RA -q PA -l 0.0001 --l1 1e-06 --power_t 0`

No effect.

### Details

In [5]:
# Same data, seed and maxratio as the baseline cell; only --cb_type changes (ips -> mtr).
forkit(
    evalit,
    (),
    kwds=dict(
        path='/mnt/c/Users/pmineiro/Downloads/iris/',
        seed=4545,
        maxratio=1/(0.2/5),
        veedub='--cb_explore_adf --epsilon 0.2 --dsjson --cb_type mtr -q RA -q PA -l 0.0001 --l1 1e-06 --power_t 0',
    ),
)

n     	n raw   	accept	cost                     	since last               	pi[a]    	since last pi[a]
1     	10      	0.10000	[0.00000,0.00000,0.00000]	[0.00000,0.00000,0.00000]	0.16667  	0.16667  
2     	40      	0.05000	[0.00000,0.00000,0.00000]	[0.00000,0.00000,0.00000]	0.16667  	0.16667  
4     	129     	0.03101	[0.00000,0.00000,0.00000]	[0.00000,0.00000,0.00000]	0.20000  	0.20000  
8     	307     	0.02606	[-0.10000,0.00000,0.00000]	[-0.11111,0.00000,0.00000]	0.16667  	0.26667  
16    	469     	0.03412	[-0.07143,0.00000,0.00000]	[0.00000,0.00000,0.00000]	0.20000  	0.17619  
32    	1180    	0.02712	[-0.04255,0.00000,0.00000]	[-0.05556,0.00000,0.00000]	0.20000  	0.18542  
64    	2229    	0.02871	[-0.03448,-0.01198,0.00000]	[-0.03125,0.00000,0.00000]	0.20000  	0.19427  
128   	3973    	0.03222	[-0.02000,-0.01175,0.00000]	[-0.01887,-0.01300,0.00000]	0.23333  	0.19078  
256   	7185    	0.03563	[-0.02349,-0.01549,-0.00340]	[-0.03401,-0.01806,0.00000]	0.23333  	0.48737  
512   	14148   	0

## softmax

`--cb_explore_adf --softmax --epsilon 0.025 --lambda 160 --dsjson -q RA -q PA --cb_type mtr -l 0.1 --l1 1e-06 --power_t 0`

Exploits more, but policy value about the same.

### Details

In [3]:
# ips is much worse (?)

# Softmax exploration (--softmax --lambda 160) with cb_type mtr, on the same
# data, seed and maxratio as the other runs.
forkit(
    evalit,
    (),
    kwds=dict(
        path='/mnt/c/Users/pmineiro/Downloads/iris/',
        seed=4545,
        maxratio=1/(0.2/5),
        veedub='--cb_explore_adf --softmax --epsilon 0.025 --lambda 160 --dsjson -q RA -q PA --cb_type mtr -l 0.1 --l1 1e-06 --power_t 0',
    ),
)

n     	n raw   	accept	cost                     	since last               	pi[a]    	since last pi[a]
1     	10      	0.10000	[0.00000,0.00000,0.00000]	[0.00000,0.00000,0.00000]	0.16667  	0.16667  
2     	40      	0.05000	[0.00000,0.00000,0.00000]	[0.00000,0.00000,0.00000]	0.16667  	0.16667  
4     	129     	0.03101	[0.00000,0.00000,0.00000]	[0.00000,0.00000,0.00000]	0.20000  	0.20000  
8     	307     	0.02606	[-0.10000,0.00000,0.00000]	[-0.10000,0.00000,0.00000]	0.16667  	0.26667  
16    	469     	0.03412	[-0.06667,0.00000,0.00000]	[0.00000,0.00000,0.00000]	0.20000  	0.17619  
32    	1180    	0.02712	[-0.04444,0.00000,0.00000]	[-0.03846,0.00000,0.00000]	0.20000  	0.18542  
64    	2229    	0.02871	[-0.03488,-0.01205,0.00000]	[-0.06383,0.00000,0.00000]	0.20000  	0.19427  
128   	3973    	0.03222	[-0.03165,-0.00752,0.00000]	[-0.03226,0.00000,0.00000]	0.21018  	0.19070  
256   	7185    	0.03563	[-0.02347,-0.01597,-0.00382]	[-0.03571,-0.02225,0.00000]	0.32008  	0.49247  
512   	18434   	0.

## rnd

`-b 24 --cb_explore_adf --epsilon 0.01 --rnd 1 --rnd_alpha 1e-12 --rnd_invlambda 1e-5 --dsjson --cb_type mtr -q RA -q PA -l 5e-1 --power_t 0`

# Dragoonz

## regcb

I played around but consistently got poor results (?).

### Details

In [4]:
# RegCB exploration (--regcb --mellowness 1e-3) with cb_type mtr, on the same
# data, seed and maxratio as the other runs.
forkit(
    evalit,
    (),
    kwds=dict(
        path='/mnt/c/Users/pmineiro/Downloads/iris/',
        seed=4545,
        maxratio=1/(0.2/5),
        veedub='-b 24 --cb_explore_adf --regcb --mellowness 1e-3 --dsjson -q RA -q PA --cb_type mtr -l 1 --l1 1e-06 --power_t 0',
    ),
)

n        	n raw    	accept   	cost     	since last	pi[a]    	since last pi[a]
1        	60       	0.01667  	0.00000  	0.00000  	0.20000  	0.20000  
2        	72       	0.02778  	0.00000  	0.00000  	0.20000  	0.20000  
4        	87       	0.04598  	0.00000  	0.00000  	0.16667  	0.18333  
8        	185      	0.04324  	0.00000  	0.00000  	0.20000  	0.18333  
16       	269      	0.05948  	-0.06250 	-0.06250 	0.20000  	0.22917  
32       	714      	0.04482  	-0.03125 	-0.03125 	0.16667  	0.18750  
64       	1760     	0.03636  	-0.03125 	-0.03125 	0.20000  	0.17917  
128      	4003     	0.03198  	-0.03906 	-0.03906 	0.16667  	0.18828  
256      	8029     	0.03188  	-0.03906 	-0.03906 	0.20000  	0.19754  
512      	15626    	0.03277  	-0.02734 	-0.02734 	0.16667  	0.19255  
1024     	30625    	0.03344  	-0.02637 	-0.02637 	0.50000  	0.19400  
2048     	61523    	0.03329  	-0.02441 	-0.02441 	0.20000  	0.20601  
4096     	128402   	0.03190  	-0.02417 	-0.02417 	0.50000  	0.23526  
8192     	26

## rnd

### Details

In [3]:
# RND exploration (--rnd 3) with cb_type mtr, on the same data, seed and
# maxratio as the other runs.
forkit(
    evalit,
    (),
    kwds=dict(
        path='/mnt/c/Users/pmineiro/Downloads/iris/',
        seed=4545,
        maxratio=1/(0.2/5),
        veedub='-b 24 --cb_explore_adf --rnd 3 --epsilon 0.025 --rnd_alpha 1e-12 --rnd_invlambda 1e-5 --dsjson --cb_type mtr -q RA -q PA -l 1e-2 --l1 1e-06 --power_t 0',
    ),
)

n        	n raw    	accept   	cost     	since last	pi[a]    	since last pi[a]
1        	60       	0.01667  	0.00000  	0.00000  	0.18838  	0.18838  
2        	72       	0.02778  	0.00000  	0.00000  	0.22478  	0.22478  
4        	87       	0.04598  	0.00000  	0.00000  	0.15170  	0.18315  
8        	185      	0.04324  	0.00000  	0.00000  	0.24072  	0.17813  
16       	269      	0.05948  	-0.06250 	-0.06250 	0.24236  	0.23219  
32       	672      	0.04762  	-0.03125 	-0.03125 	0.11053  	0.42559  
64       	1521     	0.04208  	-0.03125 	-0.03125 	0.23614  	0.24205  
128      	3373     	0.03795  	-0.02344 	-0.02344 	0.25018  	0.39891  
256      	6700     	0.03821  	-0.03516 	-0.03516 	0.56028  	0.48760  
512      	15895    	0.03221  	-0.02539 	-0.02539 	0.94873  	0.42831  
1024     	31151    	0.03287  	-0.02637 	-0.02637 	0.98000  	0.52850  
2048     	64186    	0.03191  	-0.02344 	-0.02344 	0.52553  	0.61042  
4096     	138417   	0.02959  	-0.02808 	-0.02808 	0.95989  	0.68995  
8192     	36