## Cartpole exercise

#### Initiate environment

In [12]:
import gym
import numpy as np
from gym import utils
from gym.envs.mujoco import mujoco_env
from gym.spaces import Discrete


In [13]:
class InvertedPendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle):
    def __init__(self):
        utils.EzPickle.__init__(self)
        mujoco_env.MujocoEnv.__init__(self, "inverted_pendulum.xml",1) # was set to 2 frames
    def step(self, a):
        reward = 1.0
        self.do_simulation(a/(5/3), self.frame_skip)
        ob = self._get_obs()
        notdone = np.isfinite(ob).all() and (np.abs(ob[1]) <= 0.2)
        done = not notdone
        return ob, reward, done, {}

    def reset_model(self):
        qpos = self.init_qpos + self.np_random.uniform(
            size=self.model.nq, low=-0.01, high=0.01
        )
        qvel = self.init_qvel + self.np_random.uniform(
            size=self.model.nv, low=-0.01, high=0.01
        )
        self.set_state(qpos, qvel)
        return self._get_obs()

    def _get_obs(self):
        return np.concatenate([self.sim.data.qpos, self.sim.data.qvel]).ravel()

    def viewer_setup(self):
        v = self.viewer
        v.cam.trackbodyid = 0
        v.cam.distance = self.model.stat.extent

In [14]:
env = InvertedPendulumEnv()
env.action_space = Discrete(5, start=-2)

### Reinforcement learning

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Input
from tensorflow.keras.optimizers import Adam 

In [16]:
states = env.observation_space.shape
actions = env.action_space.n

In [17]:
def build_model(states, actions):
    model = Sequential() 
    model.add(Input(states))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(actions, activation='relu'))
    return model

In [18]:
model = build_model((1,4), actions)
model.build((1,4))

In [19]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_1 (Flatten)         (None, 4)                 0         
                                                                 
 dense_3 (Dense)             (None, 128)               640       
                                                                 
 dense_4 (Dense)             (None, 128)               16512     
                                                                 
 dense_5 (Dense)             (None, 5)                 645       
                                                                 
Total params: 17,797
Trainable params: 17,797
Non-trainable params: 0
_________________________________________________________________


In [20]:
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
from rl.callbacks import WandbLogger

In [21]:
def build_agent(model, actions):
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=100000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy, 
                  nb_actions=actions, nb_steps_warmup=100, target_model_update=1e-2)
    return dqn

In [22]:
dqn = build_agent(model, actions)
dqn.compile(Adam(learning_rate=0.01) ,metrics=['mae','acc'])

2022-02-04 15:57:16.214790: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-02-04 15:57:16.214812: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2022-02-04 15:57:16.218836: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-02-04 15:57:16.218906: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-04 15:57:16.228925: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-02-04 15:57:16.273913: I tensorflow/core/grappler/o

In [29]:
#dqn.fit(env, nb_steps=50000, callbacks=[WandbLogger()])
dqn.fit(env, nb_steps=100000, visualize=False)

Training for 100000 steps ...
Interval 1 (0 steps performed)
93 episodes - episode_reward: 105.613 [10.000, 256.000] - loss: 2.359 - mae: 24.651 - acc: 0.871 - mean_q: 57.495

Interval 2 (10000 steps performed)
76 episodes - episode_reward: 132.605 [13.000, 285.000] - loss: 2.351 - mae: 23.218 - acc: 0.878 - mean_q: 58.144

Interval 3 (20000 steps performed)
93 episodes - episode_reward: 107.301 [8.000, 241.000] - loss: 2.319 - mae: 25.612 - acc: 0.885 - mean_q: 57.031

Interval 4 (30000 steps performed)
81 episodes - episode_reward: 124.901 [9.000, 269.000] - loss: 1.919 - mae: 25.364 - acc: 0.878 - mean_q: 57.706

Interval 5 (40000 steps performed)
107 episodes - episode_reward: 93.084 [11.000, 237.000] - loss: 1.805 - mae: 23.940 - acc: 0.874 - mean_q: 58.160

Interval 6 (50000 steps performed)
159 episodes - episode_reward: 63.151 [7.000, 163.000] - loss: 2.931 - mae: 29.482 - acc: 0.882 - mean_q: 59.157

Interval 7 (60000 steps performed)
119 episodes - episode_reward: 83.731 [7.0

<keras.callbacks.History at 0x2a0fc85b0>

In [27]:
dqn.test(env, nb_episodes=20, visualize=False)

Testing for 20 episodes ...
Episode 1: reward: 180.000, steps: 180
Episode 2: reward: 91.000, steps: 91
Episode 3: reward: 261.000, steps: 261
Episode 4: reward: 63.000, steps: 63
Episode 5: reward: 198.000, steps: 198
Episode 6: reward: 188.000, steps: 188
Episode 7: reward: 188.000, steps: 188
Episode 8: reward: 228.000, steps: 228
Episode 9: reward: 188.000, steps: 188
Episode 10: reward: 179.000, steps: 179
Episode 11: reward: 55.000, steps: 55
Episode 12: reward: 51.000, steps: 51
Episode 13: reward: 56.000, steps: 56
Episode 14: reward: 232.000, steps: 232
Episode 15: reward: 224.000, steps: 224
Episode 16: reward: 71.000, steps: 71
Episode 17: reward: 96.000, steps: 96
Episode 18: reward: 130.000, steps: 130
Episode 19: reward: 248.000, steps: 248
Episode 20: reward: 164.000, steps: 164


<keras.callbacks.History at 0x2a0fc8a90>

In [28]:
scores = dqn.test(env, nb_episodes=20)
print(np.mean(scores.history['episode_reward']))

Testing for 20 episodes ...
Episode 1: reward: 52.000, steps: 52
Episode 2: reward: 48.000, steps: 48
Episode 3: reward: 217.000, steps: 217
Episode 4: reward: 71.000, steps: 71


KeyboardInterrupt: 

In [26]:
#dqn.save_weights('cartpole_1.h5f', overwrite=True)