In [0]:
pwd()

"/content"

In [0]:
cd("/content/drive/My Drive/julia")
pwd()

"/content/drive/My Drive/julia"

In [0]:
versioninfo()

Julia Version 1.1.0
Commit 80516ca202 (2019-01-21 21:24 UTC)
Platform Info:
  OS: Linux (x86_64-pc-linux-gnu)
  CPU: Intel(R) Xeon(R) CPU @ 2.20GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-6.0.1 (ORCJIT, broadwell)


In [0]:
# import Pkg
# Pkg.add("DataStructures")
# Pkg.add(["Reinforce", "StatsBase", "Plots"])
# Pkg.add("BSON") # see https://github.com/FluxML/Flux.jl/blob/master/docs/src/saving.md

In [0]:
using DataStructures
using BSON: @save, @load
import Reinforce
using Reinforce: CartPoleV0, actions, reset!, finished, step!
using Flux, CuArrays, StatsBase, Plots

In [0]:
gr() # gr is faster than pyplot
ENV["GKSwstype"] = "100" # plotting in headless environment

"100"

In [0]:
#---------------Initialize game environment----------------#
env = CartPoleV0()


#-------------------------Parameters-----------------------#
EPISODES = 500
STATE_SIZE = length(env.state)
ACTION_SIZE = length(actions(env, env.state))
REPLAY_MEMORY = 10000 # buffer size
MAX_STEPS = 300 # maximum timesteps per episode

BATCH_SIZE = 32

γ = 0.99                # discount rate
η = 0.0001              # learning rate

ϵ = 0.9                 # exploration rate
ϵ_min = 0.01            # exploration minimum
ϵ_decay = 0.995         # exploration decay

memory = CircularBuffer{Any}(REPLAY_MEMORY)


#-----------------------Model Architecture------------------------#
model = Chain(Dense(STATE_SIZE, 24, relu),
              Dense(24, 48, relu), 
              Dense(48, ACTION_SIZE)) |> gpu

loss(x, y) = Flux.mse(model(x), y)
opt = ADAM(η)

fit_model(dataset) = Flux.train!(loss, params(model), dataset, opt)

fit_model (generic function with 1 method)

In [0]:
"""Save sample (s, a, r, s′) to replay memory"""
function remember(state, action, reward, next_state, done)
    push!(memory, (state, action, reward, next_state, done))
end


"""Get action from model using epsilon-greedy policy"""
function act(state, ϵ)
    rand() <= ϵ && return rand(1:ACTION_SIZE)
    q_values = model(state |> gpu).data # act values
    return argmax(q_values)  # returns action (idx of q value)
end


"""Sample from replay memory, train model, update exploration"""
function replay()
    length(memory) < BATCH_SIZE && return nothing
    
    batch_size = min(BATCH_SIZE, length(memory))
    minibatch = sample(memory, batch_size, replace=false)
    
    sb, ab, rb, s′b, db = collect.(zip(minibatch...))
    sb = hcat(sb...) |> gpu
    s′b = hcat(s′b...) |> gpu
    
    qb_target = model(sb).data
    qb_learned = maximum(model(s′b).data, dims=1)
    qb_learned = ifelse.(db, rb, rb .+ γ .* cpu(qb_learned))
    setindex!.(Ref(qb_target), qb_learned, ab) # (1, batch_size)
    
    dataset = [(sb, qb_target)] # [(input, target)]
    fit_model(dataset)
    
    global ϵ
    ϵ > ϵ_min && (ϵ *= ϵ_decay)
    
    GC.gc(); # CuArrays.clearpool()
end

replay

In [0]:
#----------------------------Training & Testing---------------------------#
best_score = 0.0
test_every, TEST = Integer(EPISODES/10), 10

for e=1:EPISODES
    reset!(env)
    state = env.state
    score = 0
    
    envs = []
    for step=1:MAX_STEPS
        push!(envs, deepcopy(env))
        
        action = act(state, ϵ) # predict action
        reward, next_state = step!(env, state, action)
        done = finished(env, next_state) # check if game is finished
        reward = !done ? reward : -1 # penalty of -1 if game is over
        score += reward
        
        remember(state, action, reward, next_state, done)
        
        state = next_state
        done && break
    end
    
    stats = "Episode: $e/$EPISODES | Score: $score | ϵ: $ϵ"
    # Episode X finished after Y timesteps with Z total reward
    
    if best_score < score
        best_score = score
        println(stats); flush(stdout)
        @save "models/notebook3/model-$e-$score.bson" model
        anim = @animate for env in envs
            plot(env)
        end
        mp4(anim, "models/notebook3/env-$e-$score.mp4", fps=20, show_msg=false)
    else
        print(stats); flush(stdout); print("\r")
    end
    
    replay() # replay and learn from the episode
    
    if e % test_every == 0
        score = 0
        for i=1:TEST
            reset!(env)    
            state = env.state
            
            for step=1:MAX_STEPS
                action = act(state, ϵ_min)
                reward, state = step!(env, state, action)
                done = finished(env, state) # check if game is finished
                reward = !done ? reward : -1 # penalty of -1 if game is over
                score += reward

                done && break
            end
        end
        
        score /= TEST
        println("#-- Avg Test Score $(Integer(e/test_every)) : $score --#")
        score >= 200 && break
    end
end

println("Done!")

Episode: 1/500 | Score: 12.0 | ϵ: 0.9
Episode: 3/500 | Score: 15.0 | ϵ: 0.9
Episode: 4/500 | Score: 17.0 | ϵ: 0.8955
Episode: 6/500 | Score: 21.0 | ϵ: 0.8865673875
Episode: 15/500 | Score: 32.0 | ϵ: 0.8474605262229382
Episode: 16/500 | Score: 38.0 | ϵ: 0.8432232235918236
#-- Avg Test Score 1 : 17.7 --#
Episode: 52/500 | Score: 65.0 | ϵ: 0.7040013079012841
Episode: 63/500 | Score: 120.0 | ϵ: 0.6662348619270341
Episode: 96/500 | Score: 129.0 | ϵ: 0.564662593848428
#-- Avg Test Score 2 : 51.4 --#
Episode: 110/500 | Score: 172.0 | ϵ: 0.5263954772927323
#-- Avg Test Score 3 : 48.7 --#
#-- Avg Test Score 4 : 30.9 --#
#-- Avg Test Score 5 : 23.0 --#
#-- Avg Test Score 6 : 14.6 --#
#-- Avg Test Score 7 : 10.1 --#
#-- Avg Test Score 8 : 8.0 --#
#-- Avg Test Score 9 : 7.9 --#
#-- Avg Test Score 10 : 7.6 --#
Done!


In [0]:
# takes about ~ 30 mins to train & test for 500 episodes and 32 batch size: giving best score of 172

In [0]:
#=
env = CartPoleV0()

for i=1:5
  plot(env)
  anim = Animation()
  for j=1:100
    plot!(env, title="env-$i-$j")
    frame(anim)
  end
  mp4(anim, "assets/notebook3/env-$i.mp4", fps=15)
end
=#

┌ Info: Saved animation to 
│   fn = /content/drive/My Drive/julia/assets/notebook3/env-1.mp4
└ @ Plots /root/.julia/packages/Plots/qh1wV/src/animation.jl:90
┌ Info: Saved animation to 
│   fn = /content/drive/My Drive/julia/assets/notebook3/env-2.mp4
└ @ Plots /root/.julia/packages/Plots/qh1wV/src/animation.jl:90
┌ Info: Saved animation to 
│   fn = /content/drive/My Drive/julia/assets/notebook3/env-3.mp4
└ @ Plots /root/.julia/packages/Plots/qh1wV/src/animation.jl:90
┌ Info: Saved animation to 
│   fn = /content/drive/My Drive/julia/assets/notebook3/env-4.mp4
└ @ Plots /root/.julia/packages/Plots/qh1wV/src/animation.jl:90
┌ Info: Saved animation to 
│   fn = /content/drive/My Drive/julia/assets/notebook3/env-5.mp4
└ @ Plots /root/.julia/packages/Plots/qh1wV/src/animation.jl:90


In [0]:
# CPU-compatible function
# note that gpu(x) and x |> gpu do the same thing (similarly for cpu)

"""Pick samples randomly from replay memory and train the model"""
function replay()
    length(memory) < TRAIN_START && return nothing
    
    batch_size = min(BATCH_SIZE, length(memory))
    minibatch = sample(memory, batch_size, replace=false)
    
    sb, ab, rb, s′b, db = collect.(zip(minibatch...))
    sb = hcat(sb...)
    s′b = hcat(s′b...)
    ab, rb, db = transpose.([ab, rb, db])
    # println(size(sb), size(ab), size(rb), size(s′b), size(db))
    # sb, ab, rb, s′b, db = transpose.(collect.(zip(minibatch...)))
    # state, action, reward, next_state, done batch (with batch dimension second)
    # collect(zip(ls...)) is equivalent to list(zip(*ls)) in python
    
    #=
    Q is a NN used as a function approximator of Q* (Bellman optimality equation)
    Q-learning: use any policy (model-free) to maximize future reward (just keep updating (s, a))
    Q[s,a] is a measure of how good an action a is for state s
    γ discount factor is used to value rewards received earlier ("good start")
    s -> a -> r -> s′
    Q[s,a] = (1 - α)Q[s,a] + α(r + γ max_{a′} Q[s′,a′])
    α is the learning rate
    r + γ max_{a′} Q[s′,a′] is the learned value (target)
    max_{a′} Q[s′,a′] is the estimate of future optimal value
    
    Q-learning Algorithm:
    initialize Q[s,a] arbitarily
    observe initial state s
    repeat
      select and carry out action a
      observe reward r and state s
      Q[s,a] = Q[s,a] + α(r + γ max_{a′} Q[s′,a′]  -  Q[s,a])
    until terminated
    =#
    
    qb_target = model(sb).data
    qb_learned = ifelse.(db, rb, rb .+ γ .* maximum(model(s′b).data, dims=1))
    setindex!.(Ref(qb_target), qb_learned, ab) # (1, batch_size)
    
    #=
    for (i, (s, a, r, s′, done)) in enumerate(minibatch)
        target = r + (done ? γ * maximum(model(s′ |> gpu).data) : 0)        
        target_f = model(s).data
        target_f[a] = target
        
        dataset = [(state, target_f)]
    end
    =#
    
    dataset = [(sb, qb_target)] # [(input, target)]
    fit_model(dataset)
end

replay

In [0]:
# tejank - batching for fitting dataset but not for model prediction

"""Save sample (s, a, r, s′) to replay memory and update exploration"""
function remember(state, action, reward, next_state, done)
    push!(memory, (state, action, reward, next_state, done))
    
    global ϵ
    ϵ > ϵ_min && (ϵ *= ϵ_decay)
end


"""Get action from model using epsilon-greedy policy"""
function act(state)
    rand() <= ϵ && return rand(1:ACTION_SIZE)
    q_values = model(state |> gpu).data # act values
    return argmax(q_values)  # returns action (idx of q value)
end


function replay()
    length(memory) < TRAIN_START && return nothing

    batch_size = min(BATCH_SIZE, length(memory))
    minibatch = sample(memory, batch_size, replace=false)

    x = zeros(Float32, STATE_SIZE, batch_size)
    y = zeros(Float32, ACTION_SIZE, batch_size)
    for (i, (s, a, r, s′, done)) in enumerate(minibatch)
        q_learned = r
        !done && (q_learned += γ * maximum(model(s′ |> gpu).data))

        q_target = model(s |> gpu).data
        q_target[a] = q_learned

        x[:, i] .= s
        y[:, i] .= q_target
    end

    dataset = [(gpu(x), gpu(y))]
    fit_model(dataset)
end

replay (generic function with 1 method)