## Cartpole exercise

## Initialize the environment

In [None]:
import gym
# Create the CartPole-v1 environment; the same `env` object is reused for
# training (dqn.fit) and evaluation (dqn.test) further down.
env = gym.make('CartPole-v1')

## WandB

In [None]:
import wandb
from wandb.keras import WandbCallback

In [None]:
# Authenticate with Weights & Biases WITHOUT embedding the API key in the
# notebook. With no explicit key, wandb.login() picks up the WANDB_API_KEY
# environment variable or previously stored credentials, and otherwise
# prompts interactively.
# NOTE(security): an API key was previously committed in this cell; it should
# be treated as leaked and revoked in the W&B account settings.
wandb.login()

In [None]:
# Hyperparameters recorded with the W&B run.
# NOTE(review): the cells visible below use their own literals rather than
# reading these values back (e.g. dqn.fit uses nb_steps=50000 while
# "nr_steps" here is 100000) -- confirm which values are intended.
hyperparameters = {
    "target_model_update": 1e-2,
    "target_reward": 500,
    "units": 12,
    "learning_rate": 0.01,
    "loss": "mae",
    "nr_steps": 100000,
    "gamma": 0.99,
    "epsilon": 0.3,
}

# Start the run in the "CartPole" project under the fixed run id "run1".
run = wandb.init(project="CartPole", id="run1", config=hyperparameters)

## Reinforcement learning

In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Input
from tensorflow.keras.optimizers import Adam

In [None]:
# Shape of a single observation and the number of available actions, both
# read from the environment's space definitions.
states = env.observation_space.shape
actions = env.action_space.n

In [None]:
def build_model(states, actions, units=12):
    """Build the fully connected Q-network.

    Args:
        states: Input shape without the batch dimension, forwarded to the
            ``Input`` layer (this notebook passes ``(1, 4)``).
        actions: Number of output units; one linear output per action.
        units: Width of each of the two hidden ReLU layers (default 12,
            matching the previously hard-coded value).

    Returns:
        An uncompiled ``Sequential`` model: Input -> Flatten -> Dense(units)
        -> Dense(units) -> Dense(actions).
    """
    model = Sequential()
    model.add(Input(shape=states))
    model.add(Flatten())
    # The Input layer above already fixes the input shape. The former
    # hard-coded ``input_shape=(1, 4)`` on this (non-first) layer was
    # redundant and contradicted the ``states`` parameter, so it is dropped.
    model.add(Dense(units, activation='relu'))
    model.add(Dense(units, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model

In [None]:
# Derive the network input shape from the environment instead of hard-coding
# (1, 4). The leading 1 is the observation-window axis; it is assumed to match
# window_length=1 used for SequentialMemory in build_agent -- keep them in sync.
input_shape = (1,) + tuple(states)
model = build_model(input_shape, actions)
# build() expects a shape that includes the batch dimension; the previous
# call passed (1, 4), which does not match the Input layer's (None, 1, 4).
model.build((None,) + input_shape)

In [None]:
model.summary()

In [None]:
from rl.agents import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

In [None]:
def build_agent(model, actions):
    """Assemble a DQN agent around ``model``.

    The agent uses an epsilon-greedy policy (eps=0.3), a sequential replay
    memory holding up to 50000 entries with a window length of 1, 100 warm-up
    steps, and a target-model update setting of 1e-2.

    Args:
        model: The Q-network passed to ``DQNAgent``.
        actions: Number of actions, passed as ``nb_actions``.

    Returns:
        The constructed (not yet compiled) ``DQNAgent``.
    """
    return DQNAgent(
        model=model,
        nb_actions=actions,
        policy=EpsGreedyQPolicy(eps=0.3),
        memory=SequentialMemory(limit=50000, window_length=1),
        nb_steps_warmup=100,
        target_model_update=1e-2,
    )

In [None]:
# Create the agent, then compile it with an Adam optimizer (lr 0.01) and
# track mean absolute error as a metric.
dqn = build_agent(model, actions)
optimizer = Adam(learning_rate=0.01)
dqn.compile(optimizer, metrics=['mae'])


In [None]:
# Train the agent on `env` for 50000 steps with the W&B Keras callback attached.
dqn.fit(env, nb_steps=50000, callbacks=[WandbCallback()])

In [None]:
# Evaluate the trained agent over 20 rendered episodes and print the mean
# of the per-episode rewards.
scores = dqn.test(env, visualize=True, nb_episodes=20)
episode_rewards = scores.history['episode_reward']
print(np.mean(episode_rewards))

In [None]:
# Persist the agent's weights to 'cartpole.h5f'; overwrite=True replaces any
# existing file of that name.
dqn.save_weights('cartpole.h5f', overwrite=True)