In [1]:
import gym
import time
import numpy as np
import subprocess
import os
import random

import matplotlib
import matplotlib.pyplot as plt

from gym import utils, spaces
from gym.envs.vmware_env.esxtest_env import ESXTestEnv
from gym.envs.vmware_env.vmware_env import VMwareEnv


from gym.utils import seeding

In [2]:
class QLearn:
    """Tabular Q-learning agent with a noise-based exploration rule.

    Q-values live in a dict keyed by ``(state, action)``; pairs that have
    never been updated read as 0.0.
    """

    def __init__(self, actions, epsilon, alpha, gamma):
        """Store the hyper-parameters and start with an empty Q-table.

        Args:
            actions: indexable, sized collection of the available actions.
            epsilon: probability that ``chooseAction`` perturbs the Q-values
                with random noise before picking the best one.
            alpha: learning rate applied to each Q-value update.
            gamma: discount factor applied to the best next-state Q-value.
        """
        self.actions = actions
        self.gamma = gamma      # discount factor
        self.alpha = alpha      # learning rate
        self.epsilon = epsilon  # exploration constant
        self.q = {}

    def getQ(self, state, action):
        """Return the stored Q-value for ``(state, action)``, or 0.0 if unseen."""
        return self.q.get((state, action), 0.0)

    def learnQ(self, state, action, reward, value):
        '''
        Q-learning update towards the target ``value``:
            Q(s, a) += alpha * (value - Q(s, a))
        where the caller passes value = reward + gamma * max_a' Q(s', a').
        The first time a (state, action) pair is seen, its entry is simply
        initialised to ``reward``.
        '''
        key = (state, action)
        previous = self.q.get(key)
        if previous is not None:
            self.q[key] = previous + self.alpha * (value - previous)
        else:
            self.q[key] = reward

    def chooseAction(self, state, return_q=False):
        """Pick an action for ``state``.

        With probability ``epsilon`` every Q-value is shifted by uniform noise
        scaled to the largest absolute Q-value, and the best noisy value wins.
        Otherwise the action with the highest Q-value is chosen. Ties are
        broken uniformly at random.

        Returns:
            The chosen action, or ``(action, q_values)`` when ``return_q`` is
            true, where ``q_values`` are the (possibly noisy) values used.
        """
        scores = [self.getQ(state, candidate) for candidate in self.actions]
        top = max(scores)

        if random.random() < self.epsilon:
            # Noise amplitude follows the largest magnitude among the Q-values.
            spread = max(abs(min(scores)), abs(top))
            scores = [s + random.random() * spread - .5 * spread for s in scores]
            top = max(scores)

        if scores.count(top) > 1:
            # Several actions share the maximum: choose one of them at random.
            tied = [idx for idx, s in enumerate(scores) if s == top]
            pick = random.choice(tied)
        else:
            pick = scores.index(top)

        chosen = self.actions[pick]
        return (chosen, scores) if return_q else chosen

    def learn(self, state1, action1, reward, state2):
        """Update Q(state1, action1) from the observed reward and next state."""
        best_next = max(self.getQ(state2, a) for a in self.actions)
        target = reward + self.gamma * best_next
        self.learnQ(state1, action1, reward, target)

In [3]:
import logging
from gym.envs.registration import register

logger = logging.getLogger(__name__)

# Virtualization
# ----------------------------------------

# Make registration idempotent: the gym release this notebook targets raises
# gym.error.Error ("Cannot re-register id") when an id is registered twice,
# which would break re-running this cell in a live kernel.
try:
    register(
        id='VirtESX-v0',
        entry_point='gym.envs.vmware_env.esxtest_env:ESXTestEnv',
        # More arguments here
    )
except gym.error.Error as exc:
    logger.debug("VirtESX-v0 not registered again: %s", exc)

In [5]:


class LivePlot(object):
    """Redraws one metric from a gym monitor output directory on demand."""

    def __init__(self, outdir, data_key='episode_rewards', line_color='blue'):
        """
        Liveplot renders a graph of either episode_rewards or episode_lengths
        Args:
            outdir (str): Monitor output file location used to populate the graph
            data_key (Optional[str]): The key in the json to graph (episode_rewards or episode_lengths).
            line_color (Optional[str]): Color of the plot, passed to matplotlib as `color`.
        """
        self.outdir = outdir
        self._last_data = None
        self.data_key = data_key
        self.line_color = line_color

        #styling options
        matplotlib.rcParams['toolbar'] = 'None'
        plt.style.use('ggplot')
        plt.xlabel("")
        plt.ylabel(data_key)
        # Set the title through the figure manager: the canvas-level
        # set_window_title() is deprecated/removed in recent Matplotlib, and
        # some canvases have no manager (then there is no window to title).
        manager = getattr(plt.gcf().canvas, 'manager', None)
        if manager is not None:
            manager.set_window_title('simulation_graph')

    def plot(self):
        """Reload the monitor results and redraw if the tracked series changed."""
        results = gym.monitoring.monitor.load_results(self.outdir)
        if results is None:
            # Nothing could be loaded (e.g. the monitor has not written any
            # results yet); skip this refresh instead of failing on None[...].
            return
        data = results[self.data_key]

        #only update plot if data is different (plot calls are expensive)
        if data != self._last_data:
            self._last_data = data
            plt.plot(data, color=self.line_color)

            # pause so matplotlib will display
            # may want to figure out matplotlib animation or use a different library in the future
            plt.pause(0.000001)

def render():
    """Open or close the environment viewer depending on the episode index.

    Reads the module-level episode counter ``x`` and environment ``env``.
    Rendering is switched on at every multiple of the interval and switched
    off again a fixed number of episodes later.
    """
    skip_first = 0      # Skip first X episodes.
    every = 50          # Show render every Y episodes.
    shown_for = 10      # Show Z episodes every rendering.

    # Episode 0 and the skipped warm-up episodes never touch the viewer.
    if x == 0 or x <= skip_first:
        return

    if x % every == 0:
        env.render()
    elif (x - shown_for) % every == 0 and shown_for < x:
        env.render(close=True)

if __name__ == '__main__':
    # Train a tabular Q-learning agent on the registered VirtESX-v0
    # environment and print per-episode progress plus a final summary.

    env = gym.make('VirtESX-v0')

    try:
        outdir = '/tmp/vmware_gym_experiments'
        env.monitor.start(outdir, force=True)

        #plotter = LivePlot(outdir)

        # Step counts of the episodes that ended with done=True; episodes cut
        # off by the 500-step cap below are not recorded here.
        last_time_steps = np.ndarray(0)

        qlearn = QLearn(actions=range(env.action_space.n),
                        alpha=0.1, gamma=0.8, epsilon=0.9)

        initial_epsilon = qlearn.epsilon

        # Multiplicative decay applied once per episode while epsilon > 0.05;
        # from 0.9 that takes roughly 2,900 episodes to reach the floor.
        epsilon_discount = 0.999

        start_time = time.time()
        total_episodes = 10000
        highest_reward = 0

        print("starting...")

        for x in range(total_episodes):
            done = False

            cumulated_reward = 0 #Should going forward give more reward then L/R ?
            print("resetting...")
            observation = env.reset()
            print("reset...")
            if qlearn.epsilon > 0.05:
                qlearn.epsilon *= epsilon_discount

            #render() #defined above, not env.render()

            # The observation is flattened into a string so it can be used as
            # a hashable Q-table key.
            state = ''.join(map(str, observation))

            for i in range(500):
                print ("Step: "+str(i+1)+"  .")
                # Pick an action based on the current state
                action = qlearn.chooseAction(state)

                # Execute the action and get feedback
                observation, reward, done, info = env.step(action)
                cumulated_reward += reward

                if highest_reward < cumulated_reward:
                    highest_reward = cumulated_reward

                nextState = ''.join(map(str, observation))

                qlearn.learn(state, action, reward, nextState)

                env.monitor.flush(force=True)

                if not(done):
                    state = nextState
                else:
                    last_time_steps = np.append(last_time_steps, [int(i + 1)])
                    break

            m, s = divmod(int(time.time() - start_time), 60)
            h, m = divmod(m, 60)
            print ("EP: "+str(x+1)+" - [alpha: "+str(round(qlearn.alpha,2))+" - gamma: "+str(round(qlearn.gamma,2))+" - epsilon: "+str(round(qlearn.epsilon,2))+"] - Reward: "+str(cumulated_reward)+"     Time: %d:%02d:%02d" % (h, m, s))

        #Github table content
        print ("\n|"+str(total_episodes)+"|"+str(qlearn.alpha)+"|"+str(qlearn.gamma)+"|"+str(initial_epsilon)+"*"+str(epsilon_discount)+"|"+str(highest_reward)+"| PICTURE |")

        l = last_time_steps.tolist()
        l.sort()

        #print("Parameters: a="+str)
        if l:
            best = l[-100:]
            print("Overall score: {:0.2f}".format(last_time_steps.mean()))
            # sum() replaces reduce(), which is not a builtin on Python 3.
            print("Best 100 score: {:0.2f}".format(sum(best) / len(best)))
        else:
            # No episode reached done=True, so there is nothing to average.
            print("No finished episodes to score.")
    finally:
        # Always release the environment, including on KeyboardInterrupt.
        #env.monitor.close()
        env.close()

[2017-05-08 00:04:57,365] Making new env: VirtESX-v0


VMWare Env launched!
starting...
resetting...
Resetting 1...


[2017-05-08 00:05:08,915] Disabling video recorder because <ESXTestEnv instance> neither supports video mode "rgb_array" nor "ansi".


Resetting 2...
Resetting 3...
reset...
Step: 1  .
Step: 2  .
Step: 3  .
Step: 4  .
Step: 5  .
Step: 6  .
Step: 7  .
Step: 8  .
Step: 9  .
Step: 10  .
Step: 11  .
Step: 12  .
Step: 13  .
Step: 14  .
Step: 15  .
Step: 16  .
Step: 17  .
Step: 18  .
Step: 19  .
Step: 20  .
Step: 21  .
Step: 22  .
Step: 23  .
Step: 24  .
Step: 25  .
Step: 26  .
Step: 27  .
Step: 28  .
Step: 29  .
Step: 30  .
Step: 31  .
Step: 32  .
Step: 33  .
Step: 34  .


KeyboardInterrupt: 