In [1]:
# https://gym.openai.com/evaluations/eval_lEi8I8v2QLqEgzBxcvRIaA

""" Quick script for an "Episodic Controller" Agent, i.e. nearest neighbor """

import logging
import os
import tempfile
import numpy as np

import gym, sys

class EpisodicAgent(object):
    """
    Episodic agent is a simple nearest-neighbor based agent:
    - At training time it remembers all tuples of (state, action, reward).
    - After each episode it computes the empirical (discounted) value function
        based on the recorded rewards in the episode.
    - At test time it looks up the k nearest neighbors in the state space
        and takes the action with the highest average value among them.

    Calling convention: ``act(observation, reward, done)`` receives the reward
    and done flag produced by the *previous* environment step, so the reward
    passed in is stored against the previously recorded state.
    """
    def __init__(self, action_space):
        self.action_space = action_space
        assert isinstance(action_space, gym.spaces.discrete.Discrete), 'unsupported action space for now.'

        # options
        self.epsilon = 1.0 # probability of choosing a random action
        self.epsilon_decay = 0.98 # decay of epsilon per episode
        self.epsilon_min = 0
        # how many nearest neighbors to consider in the policy?
        # (1000 is the value the policy previously had hard-coded; this option is now honoured)
        self.nnfind = 1000
        self.mem_needed = 500 # amount of data to have before we can start exploiting
        self.mem_size = 50000 # maximum size of memory
        self.gamma = 0.95 # discount factor

        # internal vars
        self.iter = 0
        self.mem_pointer = 0 # memory pointer (keeps counting even once memory is full)
        self.max_pointer = 0 # number of stored states whose values have been computed
        self.db = None # large array of states seen
        self.dba = {} # actions taken
        self.dbr = {} # rewards obtained at all steps
        self.dbv = {} # value function at all steps, computed retrospectively
        self.ep_start_pointer = 0

    def act(self, observation, reward, done):
        """
        Choose an action for `observation` and record the transition.

        Args:
            observation: 1-D numpy array describing the current state.
            reward: reward obtained from the previous step; stored against
                the previously recorded state.
            done: whether the previous step ended an episode; when true the
                values of the finished rollout are computed and epsilon decays.

        Returns:
            The chosen action: the best-valued action among the nearest
            stored states when exploiting, otherwise `action_space.sample()`.
        """
        assert isinstance(observation, np.ndarray) and observation.ndim == 1, 'unsupported observation type for now.'

        if self.db is None:
            # lazy initialization of memory
            self.db = np.zeros((self.mem_size, observation.size))
            self.mem_pointer = 0
            self.ep_start_pointer = 0

        # Exploit only if we have enough data, the epsilon draw says so, and at
        # least one episode has been completed (max_pointer > 0), i.e. there are
        # stored states with computed values to vote with. Without the last
        # check the neighbor set could be empty and the vote would fail.
        if self.iter > self.mem_needed and np.random.rand() > self.epsilon and self.max_pointer > 0:
            a = self._best_known_action(observation)
        else:
            # explore: do something random
            a = self.action_space.sample()

        # record move to database
        if self.mem_pointer < self.mem_size:
            self.db[self.mem_pointer] = observation # save the state
            self.dba[self.mem_pointer] = a # and the action we took
            if self.mem_pointer > 0:
                # the incoming reward belongs to the previous step; on the very
                # first call there is no previous step to attribute it to
                self.dbr[self.mem_pointer-1] = reward
                self.dbv[self.mem_pointer-1] = 0
        self.mem_pointer += 1
        self.iter += 1

        if done: # episode ended
            self._finish_episode()

        return a

    def _best_known_action(self, observation):
        """Vote among the `nnfind` nearest stored states; return the action with the highest mean value."""
        # squared L2 distance (same ordering as the L2 distance)
        ds = np.sum((self.db[:self.max_pointer] - observation)**2, axis=1)
        ix = np.argsort(ds)[:self.nnfind] # nearest first, cropped to k neighbors

        value_sum = {} # action -> summed value of the neighbors that took it
        count = {}     # action -> number of neighbors that took it
        for i in ix:
            aa = self.dba[i]
            value_sum[aa] = value_sum.get(aa, 0) + self.dbv[i]
            count[aa] = count.get(aa, 0) + 1

        # highest mean value wins; ties are broken towards the larger action
        return max((value_sum[aa] / count[aa], aa) for aa in value_sum)[1]

    def _finish_episode(self):
        """Back up discounted returns over the finished rollout and decay epsilon."""
        v = 0
        for t in reversed(range(self.ep_start_pointer, self.mem_pointer)):
            v = self.gamma * v + self.dbr.get(t,0)
            self.dbv[t] = v

        self.ep_start_pointer = self.mem_pointer
        self.max_pointer = min(max(self.max_pointer, self.mem_pointer), self.mem_size)

        # decay exploration probability
        self.epsilon *= self.epsilon_decay
        self.epsilon = max(self.epsilon, self.epsilon_min) # cap at epsilon_min

In [3]:
if __name__ == '__main__':
    # Train the episodic agent on CartPole-v0 and print the per-episode reward
    # together with an exponentially smoothed running reward.

    directory = 'training_dir'  # only needed if the Monitor wrapper below is enabled
    env = gym.make('CartPole-v0')
    # Optional: wrap the env to record episodes into `directory`.
    # env = gym.wrappers.Monitor(env, directory)
    agent = EpisodicAgent(env.action_space)

    episode_count = 500
    max_steps = 200
    # `reward` and `done` intentionally persist across episodes: the inner loop
    # breaks as soon as an episode ends, so the terminal reward/done pair is
    # delivered to the agent on the first `act` call of the following episode.
    reward = 0
    done = False
    sum_reward_running = 0

    try:
        for i in range(episode_count):
            ob = env.reset()
            sum_reward = 0

            for j in range(max_steps):
                action = agent.act(ob, reward, done)
                ob, reward, done, _ = env.step(action)
                sum_reward += reward
                if done:
                    break

            sum_reward_running = sum_reward_running * 0.95 + sum_reward * 0.05
            print ('%d running reward: %f actual reward: %f' % (i, sum_reward_running,sum_reward))
    finally:
        # Release the environment even when training is interrupted. This replaces
        # the old `env.monitor.close()` call, which fails because no monitor is
        # started. NOTE(review): closing a Monitor-wrapped env is expected to also
        # flush its recordings to disk -- confirm for the installed gym version.
        env.close()

    # uncomment this line to also upload to OpenAI gym
    #gym.upload('training_dir', algorithm_id='episodic_controller')

[2017-02-25 13:57:15,821] Making new env: CartPole-v0


0 running reward: 1.900000 actual reward: 38.000000
1 running reward: 3.955000 actual reward: 43.000000
2 running reward: 5.357250 actual reward: 32.000000
3 running reward: 6.139387 actual reward: 21.000000
4 running reward: 6.732418 actual reward: 18.000000
5 running reward: 7.195797 actual reward: 16.000000
6 running reward: 8.486007 actual reward: 33.000000
7 running reward: 8.561707 actual reward: 10.000000
8 running reward: 9.583622 actual reward: 29.000000
9 running reward: 9.654441 actual reward: 11.000000
10 running reward: 12.271719 actual reward: 62.000000
11 running reward: 12.408133 actual reward: 15.000000
12 running reward: 12.787726 actual reward: 20.000000
13 running reward: 13.698340 actual reward: 31.000000
14 running reward: 13.963423 actual reward: 19.000000
15 running reward: 14.265252 actual reward: 20.000000
16 running reward: 15.151989 actual reward: 32.000000
17 running reward: 15.244390 actual reward: 17.000000
18 running reward: 15.482170 actual reward: 20.0

KeyboardInterrupt: 

In [5]:
# Sanity check: import tensorflow and print the module object, whose repr
# shows which installation the kernel picked up.
import tensorflow as tf 
print (tf)

<module 'tensorflow' from 'c:\\users\\prerak - nonwork\\.conda\\envs\\py3.5\\lib\\site-packages\\tensorflow\\__init__.py'>
