In [1]:
import numpy as np
import tensorflow as tf
import random


In [2]:
class Arm:
    """A single bandit arm whose payout is drawn from a normal distribution.

    Despite its name, ``probability`` is not used as a probability: ``pull``
    passes it to ``np.random.normal`` as the mean of the payout distribution,
    with ``deviation`` as the standard deviation.
    """

    def __init__(self, probability: float, deviation: float):
        """Store the payout distribution parameters.

        Args:
            probability: Mean of the payout distribution.
            deviation: Standard deviation of the payout distribution.
        """
        self.probability = probability
        self.deviation = deviation
        # Multiplier applied to every sampled payout in ``pull``.
        self.positive_reward = 1
        # Stored but not read by any method of this class.
        self.negative_reward = 0

    def pull(self) -> float:
        """Draw one payout from the arm.

        Returns:
            A float sampled from N(probability, deviation), scaled by
            ``positive_reward``. The value is continuous, not a 0/1 outcome.
        """
        sample = np.random.normal(self.probability, self.deviation)
        return self.positive_reward * sample

In [3]:
class Fair:
    """An ordered collection of arms ("machines") addressed by integer index."""

    def __init__(self):
        """Create an empty fair with no arms."""
        self.arms = []
        # Number of arms added so far; incremented by ``add_arm`` so that it
        # tracks ``len(self.arms)``.
        self.length = 0

    def add_arm(self, arm: "Arm") -> None:
        """Append ``arm`` to the fair; its index is the previous ``length``."""
        self.arms.append(arm)
        self.length += 1

    def pull_at_machine(self, index: int) -> float:
        """Pull the arm stored at ``index`` and return its payout.

        Args:
            index: Position of the arm in ``self.arms``. Ordinary list
                indexing applies, so negative values count from the end.

        Returns:
            Whatever the selected arm's ``pull()`` returns.

        Raises:
            IndexError: If ``index`` is out of range. Any other error raised
                while pulling is also propagated unchanged after a diagnostic
                line is printed.
        """
        try:
            return self.arms[index].pull()
        except Exception:
            # Only genuine errors are reported here; KeyboardInterrupt and
            # SystemExit pass straight through without the misleading message.
            print("Error at Fair.pull_at_machine")
            raise

In [4]:
# (mean, deviation) of every arm, listed in machine-index order.
# Index 16 (mean 0.91) has the highest mean payout of the set.
ARM_PARAMETERS = [
    (0.7, 0.01),
    (0.8, 0.01),
    (0.6, 0.01),
    (0.5, 0.03),
    (0.7, 0.02),
    (0.7, 0.01),
    (0.8, 0.01),
    (0.6, 0.03),
    (0.5, 0.05),
    (0.7, 0.02),
    (0.7, 0.03),
    (0.8, 0.06),
    (0.6, 0.01),
    (0.5, 0.01),
    (0.9, 0.01),
    (0.7, 0.01),
    (0.91, 0.01),
    (0.6, 0.02),
    (0.5, 0.04),
    (0.7, 0.05),
]

# Build the fair from the table instead of repeating one add_arm call per arm.
f = Fair()
for mean, deviation in ARM_PARAMETERS:
    f.add_arm(Arm(mean, deviation))

In [5]:
class Agent:
    """Epsilon-greedy agent for a multi-armed bandit.

    With probability ``epsilon`` the agent pulls a uniformly random arm;
    otherwise it pulls the arm with the highest running-average payout
    (the lowest index wins ties, so arm 0 is chosen first when all
    estimates are still 0).
    """

    def __init__(self, fair: "Fair", epsilon: float = 0.01):
        """Set up per-arm statistics for ``fair``.

        Args:
            fair: Object exposing ``length`` (number of arms) and
                ``pull_at_machine(index)``.
            epsilon: Exploration probability; the default 0.01 matches the
                previously hard-coded value.
        """
        # Running average payout observed for each arm.
        self.reward_at_machine = [0] * fair.length
        # Number of times each arm has been pulled.
        self.trial_times = [0] * fair.length
        self.fair = fair
        self.epsilon = epsilon
        self.action_length = fair.length

    def choose_action(self):
        """Select an arm, pull it, update its running average, return the payout.

        Raises:
            ValueError: If the fair has no arms (nothing to choose from).
        """
        if random.random() < self.epsilon:
            # Explore: randint is inclusive on both ends.
            machine = random.randint(0, self.action_length - 1)
        else:
            # Exploit: first index holding the best estimate so far.
            machine = self.reward_at_machine.index(max(self.reward_at_machine))

        self.trial_times[machine] += 1
        reward = self.fair.pull_at_machine(machine)

        # Fold the new payout into the arm's mean: (old_mean * (n - 1) + reward) / n.
        pulls = self.trial_times[machine]
        previous_total = self.reward_at_machine[machine] * (pulls - 1)
        self.reward_at_machine[machine] = (previous_total + reward) / pulls

        return reward

    def drive(self, num_episodes: int, num_timesteps_per_episode: int):
        """Run the agent and return the mean payout of each episode.

        Arm statistics are NOT reset between episodes, so later episodes
        benefit from what earlier ones learned.

        Args:
            num_episodes: Number of episodes to run.
            num_timesteps_per_episode: Pulls per episode.

        Returns:
            A list with one average payout per episode.

        Raises:
            ZeroDivisionError: If ``num_timesteps_per_episode`` is 0 while
                ``num_episodes`` is positive.
        """
        rewards = []
        for _ in range(num_episodes):
            reward = 0
            for _ in range(num_timesteps_per_episode):
                reward += self.choose_action()
            rewards.append(reward / num_timesteps_per_episode)

        return rewards

In [6]:
# Seed both random sources the simulation draws from (the agent explores via
# `random`, the arms pay out via `np.random`) so a re-run gives the same numbers.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

# 10 episodes of 10,000 pulls each; the cell displays the mean payout per episode.
c = Agent(f)
c.drive(10, 10000)

[0.900691784696403,
 0.9075826986632534,
 0.9068959390075035,
 0.907493941051995,
 0.9078743378475996,
 0.9077173280987223,
 0.9077032715178746,
 0.9078410912474856,
 0.9070818908031315,
 0.9073584031710289]