In [0]:
using DataStructures
using BSON: @save, @load
import Reinforce
using Reinforce: CartPoleV0, actions, reset!, finished, step!
using Flux, CuArrays, StatsBase, Plots

In [11]:
gr() # gr is faster than pyplot
ENV["GKSwstype"] = "100" # headless

"100"

In [12]:
#---------------Initialize game environment----------------#
env = CartPoleV0()


#-------------------------Parameters-----------------------#
EPISODES = 400
STATE_SIZE = length(env.state)
ACTION_SIZE = length(actions(env, env.state))
REPLAY_MEMORY = 10000 # buffer size
MAX_STEPS = 300 # maximum timesteps per episode

BATCH_SIZE = 32

γ = 0.99                # discount rate
η = 0.0001              # learning rate

ϵ = 1.0                 # exploration rate
ϵ_min = 0.01            # exploration minimum
ϵ_decay = 0.99          # exploration decay

memory = CircularBuffer{Any}(REPLAY_MEMORY)


#-----------------------Model Architecture------------------------#
model = Chain(Dense(STATE_SIZE, 24, relu),
              Dense(24, 24, relu), 
              Dense(24, ACTION_SIZE)) |> gpu

loss(x, y) = Flux.mse(model(x), y)
opt = ADAM(η)

fit_model(dataset) = Flux.train!(loss, params(model), dataset, opt)

model_target = deepcopy(model)

Chain(Dense(4, 24, NNlib.relu), Dense(24, 24, NNlib.relu), Dense(24, 2))

In [13]:
function remember(state, action, reward, next_state, done)
    push!(memory, (state, action, reward, next_state, done))
end


"""Decaying Epsilon-Greedy Policy for selecting action (accept random actions with ϵ probability)"""
function act(state, ϵ)
    rand() <= ϵ && return rand(1:ACTION_SIZE) # explore
    q_values = model(state |> gpu).data # act values
    return argmax(q_values)  # returns action (idx of q value)
end


function replay()
    length(memory) < BATCH_SIZE && return nothing
    
    batch_size = min(BATCH_SIZE, length(memory))
    minibatch = sample(memory, batch_size, replace=false)
    
    sb, ab, rb, s′b, db = collect.(zip(minibatch...))
    sb = hcat(sb...) |> gpu
    s′b = hcat(s′b...) |> gpu
    
    qb_target = model(sb).data
    
    actions = argmax(model(s′b).data) # 1 addition in Double DQN: action using model
    qb_learn = model_target(s′b).data # 1 change in Double DQN: update using target model
    qb_learned = getindex.(Ref(qb_learn), actions)
    
    qb_learned = ifelse.(db, rb, rb .+ γ .* cpu(qb_learned))
    setindex!.(Ref(qb_target), qb_learned, ab) # (1, batch_size)
    
    dataset = [(sb, qb_target)] # [(input, target)]
    fit_model(dataset)
    
    GC.gc(); # CuArrays.clearpool()
    
    global ϵ
    ϵ = ϵ_min + (ϵ - ϵ_min)*ϵ_decay
end

replay (generic function with 1 method)

In [14]:
#----------------------------Training & Testing---------------------------#
best_score = 0.0
test_every, TEST = Integer(EPISODES/10), 10

for e=1:EPISODES
    reset!(env)
    state = env.state
    score = 0
    
    envs = []
    for step=1:MAX_STEPS
        push!(envs, deepcopy(env))
        
        action = act(state, ϵ) # predict action
        reward, next_state = step!(env, state, action)
        done = finished(env, next_state) # check if game is finished
        reward = !done ? reward : -1 # penalty of -1 if game is over
        score += reward
        
        remember(state, action, reward, next_state, done)
        
        state = next_state
        done && break
    end
    
    stats = "Episode: $e/$EPISODES | Score: $score | ϵ: $ϵ"
    # Episode X finished after Y timesteps with Z total reward
    
    if best_score <= score
        best_score = score
        println(stats); flush(stdout)
        @save "models/notebook4/model-$e-$score.bson" model
        anim = @animate for env in envs
            plot(env)
        end
        mp4(anim, "models/notebook4/env-$e-$score.mp4", fps=20, show_msg=false)
    else
        print(stats); flush(stdout); print("\r")
    end
    
    replay() # replay and learn from the episode
    model_target = deepcopy(model) # after each episode make target model same as model
    
    if e % test_every == 0
        score = 0
        for i=1:TEST
            reset!(env)    
            state = env.state
            
            for step=1:MAX_STEPS
                action = act(state, ϵ_min)
                reward, state = step!(env, state, action)
                done = finished(env, state) # check if game is finished
                reward = !done ? reward : -1 # penalty of -1 if game is over
                score += reward

                done && break
            end
        end
        
        score /= TEST
        println("#-- Avg Test Score $(Integer(e/test_every)) : $score --#")
        score >= 200 && break
    end
end

println("Done!")

Episode: 1/400 | Score: 22.0 | ϵ: 1.0
Episode: 5/400 | Score: 39.0 | ϵ: 0.97059601
Episode: 23/400 | Score: 48.0 | ϵ: 0.8116305895390458
Episode: 35/400 | Score: 81.0 | ϵ: 0.7205532272722921
#-- Avg Test Score 1 : 12.4 --#
#-- Avg Test Score 2 : 21.7 --#
#-- Avg Test Score 3 : 26.8 --#
#-- Avg Test Score 4 : 27.4 --#
#-- Avg Test Score 5 : 29.7 --#
#-- Avg Test Score 6 : 31.2 --#
#-- Avg Test Score 7 : 39.4 --#
#-- Avg Test Score 8 : 38.5 --#
Episode: 337/400 | Score: 81.0 | ϵ: 0.0441527268562123
Episode: 338/400 | Score: 96.0 | ϵ: 0.043811199587650174
#-- Avg Test Score 9 : 38.8 --#
Episode: 398/400 | Score: 125.0 | ϵ: 0.02850002244158248
#-- Avg Test Score 10 : 42.2 --#
Done!
