In [1]:
import sys
import os
from pathlib import Path

# Resolve the directory four levels above the current working directory
# (parents[3]) and add it to the import search path so that packages located
# there can be imported by the cells below.
root_path = str(Path.cwd().parents[3])
sys.path.append(root_path)

# Import libraries

In [2]:
import gymnasium as gym
import numpy as np
import pandas as pd

import tensorflow as tf
from mercury.rl.agents import ConservativeDQLAgent
from mercury.rl.environment import ENV

Matplotlib is building the font cache; this may take a moment.


# Download the Dataset

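Download `MountainCar.csv` from Kaggle and save it one directory above this notebook (the code below reads it from `../MountainCar.csv`): https://www.kaggle.com/datasets/gibrano/offline-mountaincar?select=MountainCar.csv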

# Creating the offline environment.

In [3]:
# Location of the offline dataset, relative to the notebook's working directory.
data_path = "../MountainCar.csv"

# Names of the CSV columns that are handed to the offline environment.
states_cols = ["x", "vel"]
action_col, reward_col = "action", "reward"
episode_col_id, order_col = "episode_id", "seq"

In [4]:
# Build the offline environment from the CSV file. The column arguments are
# passed positionally, in the same order as before; batches of 64 with
# shuffling enabled.
offline_env = ENV(
    data_path,
    states_cols,
    action_col,
    reward_col,
    episode_col_id,
    order_col,
    batch_size=64,
    shuffle=True,
)

# Training

In [5]:
# Agent configured for 2 state features and 3 actions.
agent = ConservativeDQLAgent(learning_rate=0.01, gamma=0.99, num_states=2, n_actions=3)

# agent.MEMORY_SIZE = 50000  (override left disabled)
history_reward = []

offline_env.reset()

# Never run more than 5000 iterations, even if the dataset offers more.
episodes = min(offline_env.env.episodes, 5000)

for batch_id in range(episodes):
    episode_ids, sequence, states, actions, rewards = offline_env.get_replay(batch_id)

    # The successor of each row is the row after it; the final row is paired
    # with itself and is the only one flagged as done.
    next_states = np.concatenate([states[1:], states[-1:]])
    dones = np.zeros(len(states), dtype=bool)
    dones[-1] = True

    # Feed every transition of this replay into the agent's memory.
    for step in range(len(episode_ids)):
        agent.store_transition(
            states[step],
            actions[step],
            next_states[step],
            rewards[step],
            dones[step],
        )

    # One learning update per replay.
    agent.learn()

    total_R = sum(rewards)
    history_reward.append(total_R)

    print("Epoch:", batch_id, "episode:", episode_ids[0], "Loss:", agent.loss.numpy())

Epoch: 0 episode: 1 Loss: 1.5416927
Epoch: 1 episode: 21 Loss: 1.4430265
Epoch: 2 episode: 3 Loss: 1.4085786
Epoch: 3 episode: 2 Loss: 1.4133666
Epoch: 4 episode: 108 Loss: 1.4004185
Epoch: 5 episode: 12 Loss: 1.3809547
Epoch: 6 episode: 9 Loss: 1.3770461
Epoch: 7 episode: 8 Loss: 1.3723487
Epoch: 8 episode: 2 Loss: 1.3714432
Epoch: 9 episode: 43 Loss: 1.3703233
Epoch: 10 episode: 17 Loss: 1.3680146
Epoch: 11 episode: 22 Loss: 1.3690372
Epoch: 12 episode: 50 Loss: 1.3692772
Epoch: 13 episode: 38 Loss: 1.3690034
Epoch: 14 episode: 1 Loss: 1.3677046
Epoch: 15 episode: 30 Loss: 1.3650346
Epoch: 16 episode: 12 Loss: 1.3648835
Epoch: 17 episode: 10 Loss: 1.3655647
Epoch: 18 episode: 8 Loss: 1.364216
Epoch: 19 episode: 57 Loss: 1.361446
Epoch: 20 episode: 16 Loss: 1.3597205
Epoch: 21 episode: 20 Loss: 1.3593284
Epoch: 22 episode: 84 Loss: 1.3597124
Epoch: 23 episode: 16 Loss: 1.3588685
Epoch: 24 episode: 14 Loss: 1.3585343
Epoch: 25 episode: 5 Loss: 1.3581294
Epoch: 26 episode: 11 Loss: 1.35

KeyboardInterrupt: 

In [6]:
# Write the trained Q-network to <root_path>/models as an .h5 file.
agent.q_network.save(f"{root_path}/models/mountain_car_cql_model_env.h5")



In [7]:
# Live MountainCar environment used to evaluate the trained agent;
# render_mode="human" asks Gymnasium to render the episode on screen.
env = gym.make(
    "MountainCar-v0",
    render_mode="human",
)

In [8]:
# Create a second agent in deterministic mode and replace its Q-network with
# the model file saved earlier under <root_path>/models.
model_file = f"{root_path}/models/mountain_car_cql_model_env.h5"
agent2 = ConservativeDQLAgent(num_states=2, n_actions=3, deterministic=True)
agent2.q_network = tf.keras.models.load_model(model_file)



In [9]:
# Roll out a single episode with the loaded agent and report the summed reward.
curr_state, info = env.reset()

total_R = 0

while True:

    action = agent2.choose_action(curr_state)

    # Gymnasium's step() returns (observation, reward, terminated, truncated, info).
    next_state, reward, terminated, truncated, _ = env.step(action)

    curr_state = next_state.copy()
    total_R += reward

    # Stop on either end-of-episode signal. MountainCar-v0 is time-limited, so
    # an agent that never reaches the goal only ever gets `truncated`; checking
    # `terminated` alone would leave this loop running forever.
    if terminated or truncated:
        break

print("Total reward:", total_R)

Total reward: -160.0


In [10]:
# Shut the evaluation environment down once the rollout is finished.
env.close()