In [75]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
import pickle as pkl
import numpy as np
import json
import utils
import envs
import plots
import optim
import gym

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [76]:
# Environment under study plus its run budget (keyword arguments later forwarded to
# grid_search). Alternative setups are kept commented out for quick switching:
# env, env_params = (envs.taxi, dict(repetitions=10, num_episodes=500))
# env, env_params = (envs.blackjack, dict(repetitions=100, num_episodes=3000))
env = envs.frozenlake
env_params = {"repetitions": 10, "num_episodes": 5_000}

# Sanity check of the env's nA / nS attributes (presumably action and state counts).
print(env.nA, env.nS)

4 16


In [77]:
def grid_search(
    env,
    optimizer,
    alpha_range,
    epsilon,
    discount_factor,
    reduction,
    repetitions,
    num_episodes,
    out_file=None,
):
    """Sweep the learning rate ``alpha`` for one optimizer and report the best value.

    For every ``alpha`` in ``alpha_range`` the optimizer is called as
    ``optimizer(env, num_episodes=..., repetitions=..., reduction=...,
    discount_factor=..., alpha=..., epsilon=...)`` and must return
    ``(episode_lengths, episode_returns)``. Each alpha is scored by the mean of
    ``episode_returns`` over the final 5% of episodes (at least one episode);
    the alpha with the highest score wins, ties going to the earliest one.

    Args:
        env: Environment handed to the optimizer; ``env.unwrapped.spec.id`` is
            recorded as the experiment's environment name.
        optimizer: Callable with the calling convention above; its
            ``__name__`` is recorded in the experiment summary.
        alpha_range: Non-empty iterable of learning rates to try.
        epsilon: Forwarded unchanged to the optimizer.
        discount_factor: Forwarded unchanged to the optimizer.
        reduction: Forwarded unchanged to the optimizer.
        repetitions: Forwarded unchanged to the optimizer.
        num_episodes: Forwarded to the optimizer and used to size the scoring
            window.
        out_file: Optional output path. When given, its suffix is replaced to
            write ``<stem>.json`` (experiment summary) and ``<stem>.pkl``
            (summary plus the raw per-alpha results); missing parent
            directories are created.

    Returns:
        None. Progress and the winning alpha are printed.

    Raises:
        ValueError: If ``alpha_range`` is empty.
    """
    # Materialise once: supports one-shot iterators and avoids the ambiguous
    # truth value of a NumPy array in the emptiness check below.
    alphas = list(alpha_range)
    if not alphas:
        raise ValueError("alpha_range must contain at least one value")

    # Score on the last 5% of episodes, but never fewer than one: for
    # num_episodes < 20 the truncated count is 0 and a `-0:` slice would
    # select *every* episode instead of the tail.
    tail_episodes = max(1, int(num_episodes * 0.05))

    experiment = {
        "environment": env.unwrapped.spec.id,
        "optimizer": optimizer.__name__,
        "num_episodes": num_episodes,
        "repetitions": repetitions,
    }
    results = {}

    best_alpha = None
    best_return = -np.inf

    for alpha in alphas:
        params = dict(
            num_episodes=num_episodes,
            repetitions=repetitions,
            reduction=reduction,
            discount_factor=discount_factor,
            alpha=alpha,
            epsilon=epsilon,
        )

        ep_lengths, ep_returns = optimizer(env, **params)
        results[alpha] = {
            "episode_lengths": ep_lengths,
            "episode_returns": ep_returns,
        }

        # Mean return over the scoring window. The result is sliced as a 2-D
        # array along its second axis (presumably (repetitions, episodes));
        # asarray lets a nested-list result work as well.
        return_at_end = np.asarray(ep_returns)[:, -tail_episodes:].mean()
        print(f"[INFO]: Return for alpha {alpha}: {return_at_end}")
        if return_at_end > best_return:
            best_return = return_at_end
            best_alpha = alpha

    if best_alpha is None:
        # Every score was NaN or -inf, so no comparison ever succeeded.
        print("best alpha value: none (no run produced a comparable return)")
    else:
        # Plain float keeps the JSON dump working for NumPy scalar types that
        # the json module cannot serialise (e.g. float32 or integer alphas).
        best_alpha = float(best_alpha)
        print("best alpha value: %.2f (reward %.2f)" % (best_alpha, best_return))
    experiment["best_alpha"] = best_alpha

    if out_file:
        out_file = Path(out_file)
        out_file.parent.mkdir(parents=True, exist_ok=True)
        # Lightweight, human-readable summary first ...
        with open(out_file.with_suffix(".json"), "w") as f:
            json.dump(experiment, f)
        # ... then the summary together with the raw per-alpha results.
        experiment["results"] = results
        with open(out_file.with_suffix(".pkl"), "wb") as f:
            pkl.dump(experiment, f)

In [78]:
# Hyperparameters shared by both sweeps; env_params contributes the run budget
# (and would override any key of the same name listed before it).
params = {
    "alpha_range": np.linspace(0.05, 0.25, 5),
    "epsilon": 0.1,
    "discount_factor": 0.99,
    "reduction": "mean",
    **env_params,
}

# One learning-rate sweep per optimizer, each written to its own output file.
sweeps = [
    (optim.q_learning, f"../gridsearch/{env.unwrapped.spec.id}_q_learning.json"),
    (optim.double_q_learning, f"../gridsearch/{env.unwrapped.spec.id}_double_q_learning.json"),
]
for sweep_optimizer, sweep_out_file in sweeps:
    grid_search(
        env,
        optimizer=sweep_optimizer,
        out_file=sweep_out_file,
        **params,
    )
print("done")

100%|██████████| 10/10 [01:46<00:00, 10.62s/it]                                                                                                       


[INFO]: Return for alpha 0.05: 0.3528


100%|██████████| 10/10 [02:20<00:00, 14.07s/it]                                                                                                       


[INFO]: Return for alpha 0.1: 0.39


100%|██████████| 10/10 [02:24<00:00, 14.47s/it]                                                                                                       


[INFO]: Return for alpha 0.15000000000000002: 0.3672


100%|██████████| 10/10 [02:26<00:00, 14.65s/it]                                                                                                       


[INFO]: Return for alpha 0.2: 0.3764


100%|██████████| 10/10 [02:24<00:00, 14.44s/it]                                                                                                       


[INFO]: Return for alpha 0.25: 0.3516
best alpha value: 0.10 (reward 0.39)


100%|██████████| 10/10 [01:09<00:00,  7.00s/it]                                                                                                       


[INFO]: Return for alpha 0.05: 0.1832


100%|██████████| 10/10 [01:41<00:00, 10.14s/it]                                                                                                       


[INFO]: Return for alpha 0.1: 0.3368


100%|██████████| 10/10 [01:41<00:00, 10.14s/it]                                                                                                       


[INFO]: Return for alpha 0.15000000000000002: 0.3144


100%|██████████| 10/10 [01:39<00:00,  9.99s/it]                                                                                                       


[INFO]: Return for alpha 0.2: 0.2644


100%|██████████| 10/10 [01:25<00:00,  8.54s/it]                                                                                                       

[INFO]: Return for alpha 0.25: 0.1492
best alpha value: 0.10 (reward 0.34)
done



