In [7]:
import tensorflow as tf
import tensorflow.contrib.layers as layers
import numpy as np
#import pyMCCM as ccm
from collections import deque

In [8]:
class Bandit():
    """Contextual bandit with three contexts (rows) and four arms (columns).

    Each entry of ``bandits`` is a threshold: pulling an arm pays +1 when a
    standard-normal draw exceeds the threshold and -1 otherwise, so lower
    thresholds pay out more often.
    """

    def __init__(self):
        # One row per context. Arms 4, 2 and 1 (1-indexed, respectively) carry
        # the lowest threshold of their row and are therefore the best choices.
        thresholds = [
            [0.2, 0, -0.0, -5],
            [0.1, -5, 1, 0.25],
            [-5, 5, 5, 5],
        ]
        self.state = 0
        self.bandits = np.array(thresholds)
        self.num_bandits, self.num_actions = self.bandits.shape

    def getBandit(self):
        """Pick a random context and return it as a one-hot vector.

        Updates ``self.state`` (the context index) and ``self.state_onehot``
        (the returned array).
        """
        context_count = self.bandits.shape[0]
        encoding = np.zeros(context_count)
        self.state_onehot = encoding
        # A fresh context is drawn uniformly on every call.
        self.state = np.random.randint(0, context_count)
        encoding[self.state] = 1
        return encoding

    def pullArm(self, action):
        """Pull arm ``action`` in the current context and return +1 or -1."""
        threshold = self.bandits[self.state, action]
        draw = np.random.randn(1)
        # Reward is positive only when the normal draw beats the threshold.
        return 1 if draw > threshold else -1

In [35]:
class Memory():
    """Episode-level replay buffer holding at most ``max_size`` episodes.

    Transitions are collected in ``current``. A call to ``add`` with
    ``t == 0`` marks the start of a new episode: the episode collected so far
    is moved into ``buffer`` (evicting the oldest one when full) and a new
    ``current`` list is started. The episode in progress is therefore not
    visible to ``recall`` or ``len()`` until the next episode begins.
    """
    def __init__(self,max_size):
        self.buffer = deque([])
        self.current = []
        self.max_size = max_size
    
    def __len__(self):
        """Number of completed episodes currently stored."""
        return(len(self.buffer))
    
    def add(self,state,t):
        """Append one transition; ``t == 0`` starts a new episode.

        state: the transition record to store (kept as-is).
        t: step index within the episode.
        """
        # Only archive an episode that actually has transitions; otherwise the
        # very first call would store an empty episode in the buffer.
        if t == 0 and self.current:
            # Guard on a non-empty buffer so a zero capacity cannot pop from
            # an empty deque.
            if self.buffer and len(self.buffer) >= self.max_size:
                self.buffer.popleft()
            self.buffer.append(self.current)
            self.current = []
        self.current.append(state)
    
    def recall(self,batch):
        """Return up to ``batch`` distinct stored episodes in random order."""
        order = np.random.permutation(len(self.buffer))
        return [self.buffer[i] for i in order[:batch]]

In [63]:
class Agent():
    """Feed-forward agent built on the TensorFlow 1.x graph API.

    Constructing an agent adds a placeholder, a three-layer fully connected
    network and an argmax op to the default graph, and creates an episode
    replay memory and an Adam optimizer. Training is not implemented yet:
    ``learn`` only samples from memory and ``act`` is a stub.

    ID: used to name the variable scope ("Agent<ID>").
    observation_space: width of the observation vector fed to ``inputs``.
    action_space: number of network outputs (one per action).
    observation_type: dtype of the ``inputs`` placeholder.
    action_type: accepted for interface compatibility; currently unused.
    """
    def __init__(self,ID,observation_space,action_space,observation_type=tf.float32,action_type=tf.float32):
        # NOTE(review): 0.95 is unusually large for an Adam learning rate --
        # confirm this is intended and not e.g. a discount factor.
        self.lr = 0.95 # learning rate
        self.memory_size = int(1e4)  # a count of episodes, so keep it integral
        self.batch_size = 20
        self.ID = ID
        
        self._obs_space = observation_space
        self._act_space = action_space
        self.inputs = tf.placeholder(shape=[None, self._obs_space], dtype=observation_type)
        self.outputs = self._make_network(scope="Agent{}".format(self.ID))
        # outputs is [batch, actions]; reduce over axis 1 to get one action
        # index per observation (the default axis 0 would reduce over the batch).
        self.action = tf.argmax(self.outputs, axis=1)
        
        self.memory = Memory(self.memory_size)
        # Use the instance attribute; a bare `lr` only resolves if a global of
        # that name happens to exist in the kernel.
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
        
    def _make_network(self,scope,layer_nodes=16,activation=tf.nn.relu,reuse=False):
        """Build the network on ``self.inputs`` and return its output tensor.

        Two hidden layers of ``layer_nodes`` units with ``activation``,
        followed by a sigmoid layer with one unit per action, all created
        inside variable scope ``scope``.
        """
        with tf.variable_scope(scope, reuse=reuse):
            net = self.inputs
            net = layers.fully_connected(net, num_outputs=layer_nodes, activation_fn=activation)
            net = layers.fully_connected(net, num_outputs=layer_nodes, activation_fn=activation)
            net = layers.fully_connected(net, num_outputs=self._act_space, activation_fn=tf.nn.sigmoid)
            return(net)
    
    def experience(self,t,obs,act,rew,new_obs,done=None,terminal=None):
        """Store one transition in memory; ``t == 0`` starts a new episode."""
        self.memory.add([obs,act,rew,new_obs,done,terminal],t)
    
    def learn(self):
        """Sample stored episodes and return their transitions, shuffled.

        Draws up to ``batch_size`` episodes from memory, flattens them into a
        single list of transitions and shuffles that list in place. No
        gradient step is performed yet.
        """
        #if len(self.memory) < self.memory_size:
            #return
        
        replay = self.memory.recall(self.batch_size)
        replay = [y for x in replay for y in x]
        np.random.shuffle(replay)
        return(replay)
        
        
    def act(self):
        """Stub: action selection is not implemented; returns None."""
        pass

In [64]:
# Smoke test: build a single agent and feed it 100 synthetic 25-step episodes.
tf.reset_default_graph()
with tf.Session() as sess:
    agent = Agent(0, 3, 2)

    for _episode in range(100):
        for step in range(25):
            # Random stand-ins for a real transition, drawn in the order
            # observation, action, reward, next observation.
            fake_obs = np.random.randint(3)
            fake_act = np.random.randint(4)
            fake_rew = np.random.random(1)[0]
            fake_next_obs = np.random.randint(3)
            agent.experience(step, fake_obs, fake_act, fake_rew, fake_next_obs)

    print(agent.inputs)

Tensor("Placeholder:0", shape=(?, 3), dtype=float32)


In [None]:
# Epsilon-greedy interaction loop between one agent and the contextual bandit.
tf.reset_default_graph()

total_episodes = 10000
episode_length = 25
episode_rewards = []
explore = 0.99  # probability of taking a random action
num_agents = 1
agents = []

with tf.Session() as sess:
    cBandit = Bandit()
    
    for a in range(num_agents):
        agents.append(Agent(a, cBandit.num_bandits, cBandit.num_actions))
    
    # The initializer must be created after the agents so that it covers
    # their variables, and it has to be run before the network is evaluated.
    init = tf.global_variables_initializer()
    sess.run(init)
    
    agent = agents[0]
    for episode in range(total_episodes):
        for t in range(episode_length):
            # Never explore less than 1% of the time.
            explore = max(explore, 0.01)
                
            obs = cBandit.getBandit()
            if np.random.rand(1) < explore:
                action = np.random.randint(cBandit.num_actions)
            else:
                # sess.run returns an array; the bandit needs a scalar arm index.
                greedy = sess.run(agent.action, feed_dict={agent.inputs: [obs]})
                action = int(np.ravel(greedy)[0])
                
            reward = cBandit.pullArm(action)
        # NOTE(review): stops after the first episode, presumably a leftover
        # while the learning step is unwritten -- remove once training is wired in.
        break