In [1]:
using Flux
using ReinforcementLearning
using Flux: InvDecay
using Flux.Losses: huber_loss
using BSON
using StatsBase:mean
using Plots
using IntervalSets

In [2]:
# Default side lengths of the world: `h` is the default height and `l` the
# default length used by the environment definition below.
h = l = 4

"""
    dist(p1, p2)

Return the Euclidean distance between the 2-D points `p1` and `p2`.

Only the first two components of each argument are read, so any indexable
container (vector, tuple, ...) with at least two numeric entries works.
`hypot` is used instead of squaring and taking a root by hand so that very
large or very small coordinate differences do not overflow or underflow in
the intermediate sum of squares.
"""
function dist(p1, p2)
    return hypot(p1[1] - p2[1], p1[2] - p2[2])
end

# Global per-episode logs filled in by the environment when an episode ends:
# `actions[i]` is the action sequence of the i-th logged episode and
# `successful_episode[i]` tells whether that episode reached the goal.
# Concrete element types avoid `Vector{Any}` and catch accidental pushes of
# the wrong kind of value.
actions = Vector{Int}[]
successful_episode = Bool[]

# Return `true` when `cur_state` lies strictly inside the circular puddle with
# centre `puddle_center` and radius `puddle_radius` (points on the rim are outside).
function puddle(cur_state, puddle_center, puddle_radius)
    gap = dist(cur_state, puddle_center)
    return puddle_radius > gap
end

# Return `true` when `cur_state` lies strictly inside the axis-aligned rectangle
# whose lower-left corner is `(x, y)` and whose extent is `width` by `height`.
# Points on the border count as outside.
function rectangle(cur_state, x, y, width, height)
    inside_x = x < cur_state[1] < x + width
    inside_x || return false
    return y < cur_state[2] < y + height
end

# Return `true` when `cur_state` lies strictly inside the circular jackpot zone
# with centre `jackpot_center` and radius `jackpot_radius`.
function jackpot(cur_state, jackpot_center, jackpot_radius)
    gap = dist(cur_state, jackpot_center)
    return jackpot_radius > gap
end

jackpot (generic function with 1 method)

In [3]:
"""
    PuddleWorld(; kwargs...)

Continuous 2-D navigation environment. The agent starts at the origin and
moves a fixed distance per step in the direction given by the chosen action,
which the step function interprets as an angle in degrees. An episode ends
when the agent comes within `goal_radius` of `goal_state` or when the step
budget checked in `is_terminated` is exhausted.
"""
Base.@kwdef mutable struct PuddleWorld <: AbstractEnv
    # Upper bounds of the world along y (`height`) and x (`length`); the lower
    # bound on both axes is 0.
    height::Int = h
    length::Int = l

    # Agent position as [x, y].
    current_state::Vector{Float64} = [0.0, 0.0]
    # Centre and radius of the circular goal region.
    goal_state::Vector{Float64} = [l - 0.5, h - 0.5]
    goal_radius::Float64 = 0.25

    # Steps taken so far in the current episode.
    time::Int = 0
    # Reward produced by the most recent step.
    reward::Float64 = 0.0

    # Size of the discrete action set (actions are 1:n_actions).
    n_actions::Int = 360
    # Number of episodes started so far.
    episode_num::Int = 0

    # Two circular jackpot zones sharing one radius.
    j_center1::Vector{Float64} = [1.0, 3.0]
    j_center2::Vector{Float64} = [3.0, 1.0]
    j_radius::Float64 = 0.25

    # Whether the corresponding jackpot has already paid out.
    visited1::Bool = false
    visited2::Bool = false

    # Distance travelled per step.
    move_dist::Float64 = 0.5

    # Actions taken during the current episode.
    temp::Vector{Int} = Int[]
end

# Discrete action set: the integers 1 through `env.n_actions`.
function RLBase.action_space(env::PuddleWorld)
    return Base.OneTo(env.n_actions)
end

#=
RLBase.legal_action_space(env::GridWorld) = begin
	continuous ? 0..(2.0 * π) : Base.OneTo(n_actions)
end

RLBase.legal_action_space_mask(env::GridWorld) = begin
    map(x -> x∈[1, 2, 3, 4], 1:4)
end
=#

# Observation is the agent's position vector. The environment's own vector is
# returned (not a copy), so it changes as the environment steps.
function RLBase.state(env::PuddleWorld)
    return env.current_state
end

# State space: one closed interval per coordinate, [0, length] for x and
# [0, height] for y.
function RLBase.state_space(env::PuddleWorld)
    x_range = 0.0..env.length
    y_range = 0.0..env.height
    return Space([x_range, y_range])
end

# Reward stored by the most recent step.
function RLBase.reward(env::PuddleWorld)
    return env.reward
end

# Report whether the current episode is over and log its outcome.
#
# An episode ends when more than 100 steps have been taken (logged as a
# failure) or when the agent is within `goal_radius` of `goal_state` (logged as
# a success). The step-budget check takes precedence when both hold. These are
# the only two terminating conditions.
#
# On termination the episode's action list and outcome are appended to the
# global `actions` / `successful_episode` logs. This predicate may be queried
# several times for the same terminal state (e.g. by the run loop, the stop
# condition and the agent -- verify against the installed
# ReinforcementLearning.jl version), so the episode is logged only on the first
# such query; otherwise the logs would hold several entries per episode.
RLBase.is_terminated(env::PuddleWorld) = begin
    timed_out = env.time > 100
    reached_goal = dist(env.current_state, env.goal_state) < env.goal_radius

    if !(timed_out || reached_goal)
        return false
    end

    # `reset!` binds a fresh `env.temp` vector for every episode, so object
    # identity tells whether this episode is already the last logged one.
    already_logged = !isempty(actions) && actions[end] === env.temp
    if !already_logged
        push!(actions, env.temp)
        push!(successful_episode, !timed_out)
    end

    return true
end

# Put the environment back into its start-of-episode configuration.
#
# Besides position and step counter, this also clears the stored reward and the
# jackpot `visited` flags. Without clearing the flags each jackpot would pay out
# only once for the whole lifetime of the environment instead of once per
# episode. `episode_num` is deliberately left alone; it counts across episodes.
RLBase.reset!(env::PuddleWorld) = begin
    env.height = h
    env.length = l
    env.current_state = [0.0, 0.0]
    env.time = 0
    env.reward = 0.0
    env.visited1 = false
    env.visited2 = false
    # Bind a fresh vector instead of emptying the old one: the global `actions`
    # log keeps references to earlier episodes' vectors.
    env.temp = Int[]
    return nothing
end

# Advance the environment by one step using action `a`, read as a heading in
# degrees (`cosd`/`sind`).
#
# Effects, in order:
#   * the first step of an episode (time == 0) bumps `episode_num`;
#   * nothing else happens if the episode is already over;
#   * otherwise the action is recorded, the clock ticks, and the agent moves
#     `move_dist` along the heading;
#   * leaving the [0, length] x [0, height] box sends the agent back to the
#     origin and costs -100, any other move costs -10;
#   * entering a jackpot zone not yet paid out replaces the reward with 50.
function (env::PuddleWorld)(a)
    env.time == 0 && (env.episode_num += 1)

    is_terminated(env) && return nothing

    push!(env.temp, a)
    env.time += 1

    env.current_state[1] += env.move_dist * cosd(a)
    env.current_state[2] += env.move_dist * sind(a)

    x, y = env.current_state[1], env.current_state[2]
    fell = x > env.length || x < 0 || y > env.height || y < 0
    if fell
        env.current_state = [0, 0]
    end
    env.reward = fell ? -100 : -10

    # At most one jackpot pays per step; the first zone takes precedence.
    if !env.visited1 && jackpot(env.current_state, env.j_center1, env.j_radius)
        env.reward = 50
        env.visited1 = true
    elseif !env.visited2 && jackpot(env.current_state, env.j_center2, env.j_radius)
        env.reward = 50
        env.visited2 = true
    end
end

In [4]:
# Build the environment with all fields at their declared defaults.
env = PuddleWorld()
# hook = TotalRewardPerEpisode()
# action_space(env)
# run(RandomPolicy(action_space(env)), env, StopAfterEpisode(1_000), hook)

# PuddleWorld

## Traits

| Trait Type        |                  Value |
|:----------------- | ----------------------:|
| NumAgentStyle     |          SingleAgent() |
| DynamicStyle      |           Sequential() |
| InformationStyle  | ImperfectInformation() |
| ChanceStyle       |           Stochastic() |
| RewardStyle       |           StepReward() |
| UtilityStyle      |           GeneralSum() |
| ActionStyle       |     MinimalActionSet() |
| StateStyle        |     Observation{Any}() |
| DefaultStateStyle |     Observation{Any}() |

## Is Environment Terminated?

No

## State Space

`Space{Vector{ClosedInterval{Float64}}}(ClosedInterval{Float64}[0.0..4.0, 0.0..4.0])`

## Action Space

`Base.OneTo(360)`

## Current State

```
[0.0, 0.0]
```


In [5]:
# RLBase.test_runnable!(env)

In [6]:
# Sanity check: show the concrete type of `env`.
typeof(env)

PuddleWorld

In [7]:
# DQN agent for the 2-D state / 360-action environment.
agent = let
    # The online and target Q-networks use the same architecture
    # (2 inputs -> 64 -> 64 -> 360 outputs), each with its own freshly
    # initialised weights and its own ADAM optimizer.
    function q_approximator()
        net = Chain(
            Dense(2, 64, relu; init = glorot_uniform()),
            Dense(64, 64, relu; init = glorot_uniform()),
            Dense(64, 360; init = glorot_uniform()),
        )
        return NeuralNetworkApproximator(model = net |> gpu, optimizer = ADAM())
    end

    learner = DQNLearner(
        approximator = q_approximator(),
        target_approximator = q_approximator(),
        loss_func = huber_loss,
        stack_size = nothing,
        batch_size = 32,
        update_horizon = 1,
        min_replay_history = 100,
        update_freq = 1,
        target_update_freq = 100,
    )

    # Epsilon-greedy exploration with exponential decay down to 0.01.
    explorer = EpsilonGreedyExplorer(kind = :exp, ϵ_stable = 0.01, decay_steps = 500)

    # Replay buffer holding the most recent 500 transitions of 2-element states.
    replay = CircularArraySARTTrajectory(capacity = 500, state = Vector{Float32} => (2,))

    Agent(
        policy = QBasedPolicy(learner = learner, explorer = explorer),
        trajectory = replay,
    )
end

┌ Info: The GPU function is being called but the GPU is not accessible. 
│ Defaulting back to the CPU. (No action is required if you want to run on the CPU).
└ @ Flux C:\Users\saiko\.julia\packages\Flux\7nTyc\src\functor.jl:187


typename(Agent)
├─ policy => typename(QBasedPolicy)
│  ├─ learner => typename(DQNLearner)
│  │  ├─ approximator => typename(NeuralNetworkApproximator)
│  │  │  ├─ model => typename(Chain)
│  │  │  │  └─ layers
│  │  │  │     ├─ 1
│  │  │  │     │  └─ typename(Dense)
│  │  │  │     │     ├─ weight => 64×2 Matrix{Float32}
│  │  │  │     │     ├─ bias => 64-element Vector{Float32}
│  │  │  │     │     └─ σ => typename(typeof(relu))
│  │  │  │     ├─ 2
│  │  │  │     │  └─ typename(Dense)
│  │  │  │     │     ├─ weight => 64×64 Matrix{Float32}
│  │  │  │     │     ├─ bias => 64-element Vector{Float32}
│  │  │  │     │     └─ σ => typename(typeof(relu))
│  │  │  │     └─ 3
│  │  │  │        └─ typename(Dense)
│  │  │  │           ├─ weight => 360×64 Matrix{Float32}
│  │  │  │           ├─ bias => 360-element Vector{Float32}
│  │  │  │           └─ σ => typename(typeof(identity))
│  │  │  └─ optimizer => typename(ADAM)
│  │  │     ├─ eta => 0.001
│  │  │     ├─ beta
│  │  │     │  ├─ 1
│  │ 

In [None]:
# Train the agent on `env` for 1000 episodes, collecting rewards in `hookAgent`.
hookAgent = TotalRewardPerEpisode(is_display_on_exit = false)
run(agent, env, StopAfterEpisode(1000),hookAgent)

[32mProgress:  17%|███████                                  |  ETA: 0:06:04[39m

In [None]:
# Plot the rewards collected by `hookAgent` during the training run.
plot(hookAgent.rewards)

In [None]:
# Baseline: run a uniformly random policy over the same action space for 1000
# episodes and plot its rewards for comparison with the trained agent.
hookRand = TotalRewardPerEpisode(is_display_on_exit = false)
run(RandomPolicy(action_space(env)), env, StopAfterEpisode(1000), hookRand)
plot(hookRand.rewards)

In [None]:
# figurated learning
# distributed network
# gains with physics are higher in larger systems
# mixed precision - faster process with no loss of quality
# annealing
# - higher noise initially, to keep you from getting stuck in local minima
# - lower noise at the end, when you are closer to the optimal policy


In [None]:
# Switch the explorer out of training mode before evaluating
# (presumably this disables exploration -- confirm against the
# EpsilonGreedyExplorer documentation).
agent.policy.explorer.is_training = false

# NOTE: this rebinds `hookAgent`, so the rewards from the training run are no
# longer reachable through that name after this cell.
hookAgent = TotalRewardPerEpisode(is_display_on_exit = false)
run(agent, env, StopAfterEpisode(1000),hookAgent)


In [None]:
# Plot the rewards collected by `hookAgent` during the evaluation run.
plot(hookAgent.rewards)

In [None]:
# Print, one per line, the actions of the most recently logged successful
# episode. Prints nothing when no episode has succeeded.
let latest = findlast(identity, successful_episode)
    if latest !== nothing
        foreach(println, actions[latest])
    end
end

In [None]:
# Success count over the most recent logged episodes (at most 1000).
# The window is clamped to the number of entries actually logged, so this no
# longer throws a BoundsError when fewer than 1000 episodes are available.
total = min(1000, length(successful_episode))
counter = count(==(true), successful_episode[(end - total + 1):end])

println("Total: ", total)
println("Successful: ", counter)

In [None]:
# Success count over the earliest logged episodes (at most 1000).
# The window is clamped to the number of entries actually logged, so this no
# longer throws a BoundsError when fewer than 1000 episodes are available.
total = min(1000, length(successful_episode))
counter = count(==(true), successful_episode[1:total])

println("Total: ", total)
println("Successful: ", counter)

In [None]:
# Success count over every logged episode.
total = length(successful_episode)
counter = count(==(true), successful_episode)

println("Total: ", total)
println("Successful: ", counter)

In [None]:
# Number of entries in the `actions` log.
println(length(actions))