In [1]:
from qlearningAgents import QLearningAgent
import numpy as np
from IPython.display import clear_output
import time
import gym

In [2]:
def get_actions(s):
    """Return the legal action ids for state ``s``.

    The state is ignored: every state offers the same four actions, 0..3.
    A new list is built on each call.
    """
    return list(range(4))
def get_taxi_actions(s):
    """Return the legal action ids for state ``s``.

    The state is ignored: every state offers the same six actions, 0..5.
    A new list is built on each call.
    """
    return list(range(6))
# Stand-alone agent for the quick sanity checks in the next cells.
# Positional order follows GymEnv's own construction call:
# (alpha, epsilon, discount, get_actions).
agent = QLearningAgent(0.1,0.2,0.9,get_actions)

In [3]:
# Query the agent's action choice for state 3.
agent.getAction(3)

0

In [4]:
# One Q-update; the argument order (state, action, next_state, reward)
# is the same one GymEnv.playGame uses when it calls agent.update.
agent.update((1,1),2,(1,2),-1)

In [5]:
# Read back the Q-value of the (state, action) pair updated in the previous cell.
agent.getQValue((1,1),2)

-0.1

In [3]:
class GameHistory:
    """Record of one episode produced by ``GymEnv.playGame``.

    Attributes are plain data set once in ``__init__``:
      isEnd   -- the last ``done`` flag returned by ``env.step``.
      history -- list of ``(state, action, next_state)`` tuples, one per step.
      seed    -- the seed handed to ``env.seed`` before the episode, which
                 ``GymEnv.renderHistory`` re-applies to replay it.
    """
    def __init__(self, isEnd, history, seed):
        self.isEnd = isEnd
        self.history = history
        self.seed = seed


class GymEnv:
    """Couples a Gym-style environment with a tabular Q-learning agent.

    The environment must provide ``seed``, ``reset``, ``step`` (returning a
    4-tuple ``(next_state, reward, done, info)``), ``render`` and ``close``.
    The agent must provide an ``epsilon`` attribute plus ``getAction(state)``
    and ``update(state, action, next_state, reward)``.

    Parameters
    ----------
    env : environment object (see above).
    get_actions : callable ``state -> list of legal actions``; used to
        validate the agent's choice and to build the default agent.
    max_steps : default cap on the number of steps per episode.
    agent : optional pre-built agent; when ``None`` a
        ``QLearningAgent(alpha, epsilon, discount, get_actions)`` is created.
    step_cost : value added to every environment reward.
    alpha, epsilon, discount : only used to build the default agent.
    special_render : optional callable that replaces ``env.render``.
    death_cost : reward used for the extra terminal update in ``playGame``.
    epsilon_decrease : divisor applied to the agent's epsilon after every
        reporting interval in ``trainAgent``.
    discretize : optional callable applied to every state before it reaches
        the agent.
    """
    def __init__(self, env, get_actions, max_steps, agent=None, step_cost=-0.1, alpha=0.1, epsilon=0.99, discount=0.9, special_render=None, death_cost=-1, epsilon_decrease=1.0/0.99, discretize=None):
        self.env = env
        if special_render is not None:
            self.env.render = special_render
        self.get_actions = get_actions
        self.step_cost = step_cost
        self.death_cost = death_cost
        self.agent = agent if agent is not None else QLearningAgent(alpha, epsilon, discount, get_actions)
        self.max_steps = max_steps
        # Remember the starting exploration rate: playGame() zeroes the agent's
        # epsilon for evaluation and trainAgent() restores it from here.
        self.epsilon = float(self.agent.epsilon)
        self.epsilon_decrease = epsilon_decrease
        self.discretize = discretize

    def _prepare_state(self, state):
        """Convert a raw observation into the form handed to the agent.

        NumPy arrays become tuples (so they can be used as dictionary keys),
        then the optional ``discretize`` callable is applied.
        """
        if isinstance(state, np.ndarray):
            state = tuple(state.tolist())
        if self.discretize is not None:
            state = self.discretize(state)
        return state

    def playGame(self, max_steps=None, isTraining=False):
        """Play one episode and return ``(GameHistory, total_reward)``.

        ``max_steps`` falls back to ``self.max_steps`` when ``None``.
        With ``isTraining`` false the agent's epsilon is set to 0 (and left
        there) and the agent is never updated; with ``isTraining`` true the
        agent is updated after every step. ``total_reward`` is the sum of
        environment rewards with ``step_cost`` added to each.
        """
        if max_steps is None:
            max_steps = self.max_steps
        if not isTraining:
            self.agent.epsilon = 0.0

        # Fresh random seed per episode, stored so the episode can be replayed.
        seed = np.random.randint(0, 132132132)
        self.env.seed(seed)
        state = self._prepare_state(self.env.reset())

        isEnd = False
        history = []
        total_reward = 0.0

        for _ in range(max_steps):
            action = self.agent.getAction(state)
            assert action in self.get_actions(state)
            next_state, reward, isEnd, _info = self.env.step(action)
            next_state = self._prepare_state(next_state)

            reward += self.step_cost
            history.append((state, action, next_state))
            total_reward += reward

            if isTraining:
                self.agent.update(state, action, next_state, reward)
                # Extra penalty for a terminal step with zero reward. It is
                # applied to the transition that actually ended the episode
                # (previously `state` had already been overwritten with
                # `next_state`, so the terminal state itself was penalised),
                # and only while training so evaluation never alters the agent.
                # NOTE(review): `reward` already includes step_cost here, so
                # with a non-zero step_cost this test only fires when the env
                # reward equals -step_cost -- confirm whether the raw env
                # reward was meant.
                if isEnd and reward == 0:
                    self.agent.update(state, action, next_state, self.death_cost)

            state = next_state
            if isEnd:
                break
        return GameHistory(isEnd, history, seed), total_reward

    def trainAgent(self, n_games=100, verbosity=None, max_steps=None):
        """Play ``n_games`` training episodes.

        Every ``verbosity`` games the mean score of that interval is printed
        and the agent's epsilon is divided by ``epsilon_decrease``.
        ``verbosity`` defaults to one twentieth of ``n_games`` (at least 1).
        The agent's epsilon is reset to ``self.epsilon`` before and after.
        """
        if verbosity is None:
            # At least 1, so that fewer than 20 games no longer yields a
            # zero interval (modulo by zero below).
            verbosity = max(1, n_games // 20)
        res = []
        self.agent.epsilon = self.epsilon
        for i in range(n_games):
            res.append(self.playGame(max_steps, True)[1])
            if i % verbosity == verbosity - 1:
                # Single pre-formatted argument: prints the same text under the
                # Python 2 print statement and the Python 3 print function.
                print("score %s epsilon %s" % (np.mean(res), self.agent.epsilon))
                res = []
                self.agent.epsilon /= self.epsilon_decrease
        self.agent.epsilon = self.epsilon

    def playSeries(self, n_games):
        """Play ``n_games`` evaluation episodes and return their total rewards.

        Each episode is capped at ``self.max_steps`` steps.
        """
        # The original referenced an undefined local `max_steps` here.
        return [self.playGame(self.max_steps, False)[1] for _ in range(n_games)]

    def renderHistory(self, history, delay=0.1, text=False):
        """Replay a recorded episode, rendering after every step.

        The environment is re-seeded with ``history.seed`` and reset, then the
        recorded actions are stepped through with ``delay`` seconds between
        frames. With ``text`` true the notebook output is cleared before each
        frame. The environment is closed at the end.
        """
        self.env.seed(history.seed)
        self.env.reset()
        if text:
            clear_output(wait=True)
        self.env.render()
        time.sleep(delay)
        for _, action, _ in history.history:
            self.env.step(action)
            if text:
                clear_output(wait=True)
            self.env.render()
            # Honour the caller's delay for every frame, not just the first.
            time.sleep(delay)
        self.env.close()

In [22]:

# Taxi environment with episodes capped at 100 steps. No agent is passed, so
# GymEnv builds a QLearningAgent from its default alpha/epsilon/discount.
e = GymEnv(gym.make("Taxi-v1"), get_taxi_actions, 100)

[2017-01-05 10:58:47,416] Making new env: Taxi-v1


In [471]:
# 1,000,000 training games; the mean score is printed and epsilon is decayed
# once every 2000 games.
e.trainAgent(1000000, 2000)

score -390.41125 epsilon 0.99
score -378.799 epsilon 0.9801
score -370.1531 epsilon 0.970299
score -365.99315 epsilon 0.96059601
score -358.00375 epsilon 0.9509900499
score -349.4307 epsilon 0.941480149401
score -341.6344 epsilon 0.932065347907
score -335.51765 epsilon 0.922744694428
score -328.1477 epsilon 0.913517247484
score -313.7378 epsilon 0.904382075009
score -306.45425 epsilon 0.895338254259
score -298.6585 epsilon 0.886384871716
score -287.3054 epsilon 0.877521022999
score -274.937 epsilon 0.868745812769
score -271.7011 epsilon 0.860058354641
score -257.7751 epsilon 0.851457771095
score -246.18395 epsilon 0.842943193384
score -241.2133 epsilon 0.83451376145
score -225.00495 epsilon 0.826168623836
score -217.50085 epsilon 0.817906937597
score -210.2881 epsilon 0.809727868221
score -200.6506 epsilon 0.801630589539
score -192.847 epsilon 0.793614283644
score -187.3742 epsilon 0.785678140807
score -176.61025 epsilon 0.777821359399
score -167.98815 epsilon 0.770043145805
score -162

In [23]:
# Single evaluation episode: isTraining defaults to False, which sets the
# agent's epsilon to 0 for this game.
hist,reward = e.playGame()

In [24]:
# Number of steps played: one (state, action, next_state) entry is recorded per step.
len(hist.history)

100

In [25]:
# Episode total of the environment reward plus step_cost for every step.
reward

-109.99999999999982

In [26]:
# Replay the recorded actions on the environment re-seeded with the episode's seed.
e.renderHistory(hist)

+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m:[43m [0m|
+---------+
  (South)


KeyboardInterrupt: 

In [143]:
# Inspect a single Q-table entry of the wrapped agent.
e.agent.getQValue(state=0, action=3)

0.0

In [418]:
from ipywidgets import widgets
from IPython.display import display


In [419]:
# Manual-control panel: each button steps the environment once and feeds the
# resulting transition to the agent as a Q-update.
up = widgets.Button(description="Up")
down = widgets.Button(description="Down")
left = widgets.Button(description="Left")
right = widgets.Button(description="Right")
reset_btn = widgets.Button(description="Reset")

# Current environment state, shared by the callbacks below.
s = e.env.reset()
e.env.render()

def reset(b):
    """Button callback: restart the episode and show the initial frame."""
    # Without `global`, the assignment created a throw-away local and the
    # shared state kept pointing at the previous episode.
    global s
    s = e.env.reset()
    e.env.render()

def make_learn_action(action):
    """Step the environment with `action` and learn from the transition."""
    global s
    s1, reward, _, _ = e.env.step(action)
    e.agent.update(s, action, s1, reward)
    # Advance the shared state; previously it was never updated, so every
    # Q-update was credited to the very first state.
    s = s1
    clear_output(wait=True)
    e.env.render()

# NOTE(review): these ids look like a left/down/right/up = 0/1/2/3 layout;
# confirm they match the action encoding of the environment wrapped by `e`.
down.on_click(lambda b : make_learn_action(1))
left.on_click(lambda b : make_learn_action(0))
right.on_click(lambda b : make_learn_action(2))
up.on_click(lambda b : make_learn_action(3))

reset_btn.on_click(reset)

display(reset_btn)
display(up)
display(down)
display(left)
display(right)

+---------+
|[35m[34;1mR[0m[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[43mY[0m| : |B: |
+---------+
  (South)


In [5]:
# Number of bins per "unit" of each scaled state component.
n_desc = 20

def discretize_lland(s):
    """Discretise an 8-component state into a hashable tuple.

    Components 0-6 are divided by a per-component scale, multiplied by
    ``n_desc`` and rounded; component 7 is passed through unchanged.
    (Presumably the LunarLander observation, going by the name -- confirm.)

    Returns a tuple of the same length as ``s``.
    """
    # Scale for components 0..6, in order.
    scales = (2, 2, 5, 5, 5, 10, 10)
    s = list(s)
    for i, scale in enumerate(scales):
        # Each component is derived from its own raw value. The original
        # computed s[6] from the already-discretised s[4] (copy-paste slip).
        s[i] = round( s[i]/scale * n_desc)
    return tuple(s)
    
def discretize_car(s):
    """Discretise the first two components of ``s`` and return a tuple.

    Component 0 is divided by 0.5 and component 1 by 0.1; both are then
    multiplied by ``n_desc`` and rounded. Any further components are kept
    as they are.
    """
    bins = list(s)
    for idx, width in enumerate((0.5, 0.1)):
        bins[idx] = round(bins[idx] / width * n_desc)
    return tuple(bins)


In [6]:
# MountainCar environment instance handed to the GymEnv wrappers below.
car = gym.make("MountainCar-v0")

def get_car_actions(s):
    """Return the legal action ids for state ``s``.

    The state is ignored: every state offers the same three actions, 0..2.
    A new list is built on each call.
    """
    return list(range(3))

# Rebinds `e` to the MountainCar wrapper: up to 3000 steps per episode, with
# every state passed through discretize_car before it reaches the agent.
e = GymEnv(car, get_car_actions, 3000,discretize=discretize_car)

[2017-01-05 16:20:30,614] Making new env: MountainCar-v0


In [26]:
# This is the value trainAgent copies into agent.epsilon at the start and end
# of training. It does not affect playGame() evaluation runs, which force the
# agent's epsilon to 0.
e.epsilon = 0.01

In [None]:
# Play one evaluation episode (epsilon forced to 0), print its total reward,
# then replay it with no delay before the first frame.
hist,reward = e.playGame()
print reward
e.renderHistory(hist,0.00)

-204.6


In [13]:
# Total reward returned by the most recently executed playGame() call.
reward

-183.69999999999945

In [18]:
# NOTE(review): `e1` is only created in a later cell, so running the notebook
# top to bottom raises the NameError captured in this cell's output. Move the
# `e1 = GymEnv(...)` cell above this one, or confirm whether `e` was intended.
e1.renderHistory(hist,0.01)

NameError: name 'e1' is not defined

In [66]:
# Second wrapper around the same `car` environment. No agent is passed, so it
# gets its own newly created QLearningAgent.
e1 = GymEnv(car, get_car_actions, 3000,discretize=discretize_car)

In [8]:
import pickle as pkl

In [29]:

# Persist the agent's Q-table. Inner mappings are copied into plain dicts
# before pickling. The `with` block guarantees the file is flushed and closed.
with open("car_q.pkl",'wb') as q_file:
    pkl.dump({k: dict(v) for k,v in e.agent._qValues.items()}, q_file)

In [9]:
# Restore a previously saved Q-table into the current agent.
# SECURITY: unpickling can execute arbitrary code -- only load files that
# were written by this notebook.
with open("car_q.pkl",'rb') as q_file:
    d = pkl.load(q_file)

# Copy entry by entry so the agent keeps its own container objects.
for k1 in d:
    for k2 in d[k1]:
        e.agent._qValues[k1][k2] = d[k1][k2]