Most basic MAB functionality with epsilon-greedy.
roycoding committed Sep 25, 2014
0 parents commit fbd0cc0
Showing 5 changed files with 335 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
*.pyc
19 changes: 19 additions & 0 deletions LICENSE.txt
@@ -0,0 +1,19 @@
Copyright (C) 2014 Roy Keyes

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
13 changes: 13 additions & 0 deletions README.md
@@ -0,0 +1,13 @@
# slots
### *A multi-armed bandit library for Python*

Slots is intended to be a basic, very easy-to-use multi-armed bandit library for Python.

See [slots-notes.md](https://github.com/roycoding/slots/blob/master/slots-notes.md) for design ideas.

#### Author
[Roy Keyes](https://roycoding.github.io) -- roy.coding@gmail

#### License: BSD
See [LICENSE.txt](https://github.com/roycoding/slots/blob/master/LICENSE.txt)

105 changes: 105 additions & 0 deletions slots-notes.md
@@ -0,0 +1,105 @@
# Multi-armed bandit library notes

### What does the library need to do?
1. Set up N bandits with probabilities, p_i, and payouts, pay_i.
2. Implement several MAB strategies, with kwargs as parameters, and consistent API.
3. Allow for T trials.
4. Continue with more trials (i.e. save state after trials).
5. Values to save:
1. Current choice
2. number of trials completed for each arm
3. scores for each arm
4. average payout per arm (payout*wins/trials?)
6. Use sane defaults.
7. Be obvious and clean.
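
The saved values in requirement 5 can be sketched as a small state record. This is a minimal illustration of the bookkeeping, not the final API; the class and method names here are placeholders:

```python
class TestState:
    """Sketch of the per-trial state listed above (hypothetical names)."""

    def __init__(self, num_bandits):
        self.choice = None                  # 1. current choice
        self.pulls = [0] * num_bandits      # 2. trials completed per arm
        self.wins = [0] * num_bandits       # 3. scores (wins) per arm
        self.payouts = [0.0] * num_bandits  # running payout totals per arm

    def update(self, arm, payout):
        """Record the result of one pull of `arm`."""
        self.choice = arm
        self.pulls[arm] += 1
        if payout > 0:
            self.wins[arm] += 1
        self.payouts[arm] += payout

    def avg_payout(self, arm):
        """4. Average payout per arm, guarding against zero pulls."""
        return self.payouts[arm] / self.pulls[arm] if self.pulls[arm] else 0.0
```

Saving this state after each batch of trials is what makes requirement 4 (continuing with more trials) straightforward.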

### Library API ideas:
Creating a MAB test instance:

```Python
# Default: 3 bandits with random p_i and pay_i = 1
mab = slots.MAB()

# Set up 4 bandits with random p_i and pay_i
mab = slots.MAB(4)

# 4 bandits with specified p_i
mab = slots.MAB(probs = [0.2,0.1,0.4,0.1])

# 3 bandits with specified pay_i
mab = slots.MAB(payouts = [1,10,15])

# Bandits with payouts specified by arrays (i.e. payout data with unknown probabilities)
# payouts is an N * T array, with N bandits and T trials
mab = slots.MAB(live = True, payouts = [[0,0,0,0,1.2,0,0],[0,0.1,0,0,0.1,0.1,0]])
```

Running tests with a chosen strategy, S:

```Python
# Default: epsilon-greedy, epsilon = 0.1, num_trials = 1000
mab.run()

# Run a chosen strategy with specified parameters and trials
mab.eps_greedy(eps = 0.2, trials = 10000)
mab.run(strategy = 'eps_greedy', params = {'eps':0.2}, trials = 10000)

# Run strategy again, updating old trial data
# ('continue' is a reserved word in Python, so a name like 'resume' is needed)
mab.run(resume = True)
```

Displaying / retrieving bandit properties

```Python
# Default: display number of bandits, probabilities and payouts
mab.bandits.info()

# Display info for bandit i
mab.bandits[i]

# Retrieve bandits' payouts, probabilities, etc
mab.bandits.payouts
mab.bandits.probs

# Retrieve count of bandits
mab.bandits.count
```

Setting bandit properties

```Python
# Reset bandits to defaults
mab.bandits.reset()

# Set probabilities or payouts
mab.bandits.probs_set([0.1,0.05,0.2,0.15])
mab.bandits.payouts_set([1,1.5,0.5,0.8])
```

Displaying / retrieving test info

```Python
# Retrieve current "best" bandit
mab.best()

# Retrieve bandit probability estimates
mab.prob_est()

# Retrieve probability estimate of bandit i
mab.prob_est(i)

# Retrieve bandit payout estimates (p * payout)
mab.payout_est()

# Retrieve current bandit choice
mab.current()

# Retrieve sequence of choices
mab.choices

# Retrieve probability estimate history
mab.prob_est_sequence

# Retrieve test strategy info (current strategy) -- a dict
mab.strategy_info()
```
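
The epsilon-greedy rule behind the default strategy can be sketched as a self-contained function (an illustration of the decision rule, not the library API; the function name is a placeholder):

```python
import random

def eps_greedy_choice(wins, pulls, eps=0.1, rng=random):
    """With probability eps, explore a non-best arm; otherwise exploit."""
    # Estimated win rate per arm (0 for arms never pulled)
    means = [w / p if p else 0.0 for w, p in zip(wins, pulls)]
    best = means.index(max(means))
    if rng.random() < eps and len(wins) > 1:
        # Explore: pick uniformly among the other arms
        return rng.choice([i for i in range(len(wins)) if i != best])
    return best

# Quick simulation with known probabilities: over many trials,
# the highest-probability arm should accumulate the most pulls.
random.seed(0)
probs = [0.2, 0.1, 0.4]
wins, pulls = [0.0] * 3, [0] * 3
for _ in range(5000):
    arm = eps_greedy_choice(wins, pulls, eps=0.1)
    pulls[arm] += 1
    wins[arm] += 1.0 if random.random() < probs[arm] else 0.0
```

The constant exploration rate is what lets an initially underestimated arm recover: each non-best arm still receives roughly eps/(N-1) of the pulls.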
197 changes: 197 additions & 0 deletions slots.py
@@ -0,0 +1,197 @@
'''
slots
A Python library to perform simple multi-armed bandit analyses.
Scenarios:
- Run MAB test on simulated data (N bandits), default epsilon-greedy test.
    mab = slots.MAB(probs = [0.1,0.15,0.05])
    mab.run(trials = 10000)
    mab.best() # Bandit with highest probability after T trials
- Run MAB test on "real" payout data (probabilities unknown).
    mab = slots.MAB(live = True, payouts = [[0,0,0,1,0,0,0,0,0,....]])
    mab.run(trials = 10000) # Max is the length of the payout data
'''


import numpy as np

class MAB():
    '''
    Multi-armed bandit test class.
    '''

    def __init__(self, num_bandits=None, probs=False, payouts=False, live=False):
        '''
        Instantiate MAB class, determining
            - Number of bandits
            - Probabilities of bandit payouts
            - Bandit payouts
        Parameters (optional):
            num_bandits (int) - number of bandits (used alone).
            probs (array of floats) - probabilities of bandit payouts.
            payouts (array of floats) - amounts of bandit payouts.
                If live = True, an N*T array of floats indicating the payout
                amount per pull for N bandits and T trials.
            live (bool) - whether the payout data is live (recorded) data.
        '''

        default_num_bandits = 3

        self.choices = []

        if not probs:
            if not payouts:
                if not num_bandits:
                    num_bandits = default_num_bandits
                self.bandits = Bandits(probs=[np.random.rand() for x in
                                              range(num_bandits)],
                                       payouts=np.ones(num_bandits))
            else:
                if live:
                    self.bandits = Bandits(live=True, payouts=payouts)
                else:
                    # Payouts given without probabilities: assign random probs.
                    self.bandits = Bandits(probs=[np.random.rand() for x in
                                                  range(len(payouts))],
                                           payouts=payouts)
                num_bandits = len(payouts)
        else:
            if payouts:
                self.bandits = Bandits(probs=probs, payouts=payouts)
            else:
                # Default to a unit payout for each bandit.
                self.bandits = Bandits(probs=probs,
                                       payouts=np.ones(len(probs)))
            num_bandits = len(probs)

        self.wins = np.zeros(num_bandits)
        self.pulls = np.zeros(num_bandits)

    def run(self, trials=100, strategy=None, parameters=None):
        '''
        Run the MAB test with T trials.
        Parameters:
            trials (integer) - number of trials to run.
            strategy (string) - name of selected strategy.
            parameters (dict) - parameters for the selected strategy.
        Currently only epsilon-greedy is implemented.
        '''

        strategies = {'eps_greedy': self.eps_greedy}

        if trials < 1:
            raise Exception('MAB.run: Number of trials cannot be less than 1!')
        if not strategy:
            strategy = 'eps_greedy'
        elif strategy not in strategies:
            raise Exception('MAB.run: Strategy name invalid. Choose from: '
                            + ', '.join(strategies))

        # Run strategy
        for n in xrange(trials):
            choice = strategies[strategy](params=parameters)
            payout = self.bandits.pull(choice)
            if payout is None:
                # Live payout data exhausted; stop early.
                break
            self.choices.append(choice)
            self.pulls[choice] += 1
            self.wins[choice] += payout

    def max_mean(self):
        '''
        Pick the bandit with the current best observed proportion of winning.
        Input: self
        Output: integer (index of the best bandit)
        '''
        # Add 1 to the pull counts to avoid division by zero.
        return np.argmax(self.wins / (self.pulls + 1))

    def eps_greedy(self, params):
        '''
        Run the epsilon-greedy MAB algorithm.
        Input: dict of parameters (epsilon)
        Output: integer (index of the chosen bandit)
        '''

        if params and type(params) == dict:
            eps = params.get('eps', 0.1)
        else:
            eps = 0.1

        r = np.random.rand()
        if r < eps:
            # Explore: choose randomly among the non-best arms.
            return np.random.choice(list(set(range(len(self.wins))) -
                                         {self.max_mean()}))
        else:
            # Exploit the current best arm.
            return self.max_mean()

    def best(self):
        '''
        Return the current 'best' choice of bandit (the most recent choice).
        Input: self
        Output: integer
        '''

        if len(self.choices) < 1:
            print 'slots: No trials run so far.'
            return None
        else:
            return self.choices[-1]


class Bandits():
    '''
    Bandit class.
    '''

    def __init__(self, probs, payouts, live=False):
        '''
        Instantiate Bandit class, determining
            - Probabilities of bandit payouts
            - Bandit payouts
        Parameters:
            probs (array of floats) - probabilities of bandit payouts.
            payouts (array of floats) - amounts of bandit payouts.
                If live = True, an N*T array of floats indicating the payout
                amount per pull for N bandits and T trials.
            live (bool) - whether the payout data is live (recorded) data.
        '''

        if not live:
            # Only use arrays of equal length
            if len(probs) != len(payouts):
                raise Exception('Bandits.__init__: Probability and payouts '
                                'arrays are of different lengths!')
            self.probs = probs
            self.payouts = payouts
            self.live = False
        else:
            self.live = True
            self.probs = None
            self.payouts = payouts

    def pull(self, i):
        '''
        Return the payout from a single pull of bandit i's arm.
        '''

        if self.live:
            if len(self.payouts[i]) > 0:
                # Note: pop() consumes the recorded payouts from the end.
                return self.payouts[i].pop()
            else:
                return None
        else:
            if np.random.rand() < self.probs[i]:
                return self.payouts[i]
            else:
                return 0.0

    def info(self):
        pass