<a href="https://colab.research.google.com/github/sibyjackgrove/gym-SolarPVDER-environment/blob/master/examples/gym_PVDER_environment_tf_agents_DQN_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Clone gym-PVDER repository and install it

In [0]:
# Clone over plain HTTPS. No username is embedded in the URL, so the command
# is not tied to one GitHub account and sends no (empty) credentials.
!git clone https://github.com/sibyjackgrove/gym-SolarPVDER-environment.git

## Change into the repository directory and install the package with pip

In [0]:
# Use the explicit %cd magic: a bare `cd` only works while IPython's automagic
# is enabled and no Python variable named `cd` exists.
%cd gym-SolarPVDER-environment

In [0]:
# Update the checkout in the current directory from its remote.
!git pull

In [0]:
# Install the package found in the current directory in editable (development) mode.
!pip install -e .

## Install tf-nightly and tf-agents

In [0]:
# NOTE: these are unpinned nightly builds, so the installed versions depend on
# the day the cell is run. The saved output of the import cell below shows
# TensorFlow 1.14.1-dev20190327.
!pip install tf-nightly-gpu
!pip install tfp-nightly
!pip install tf-agents-nightly

## Import the necessary modules

In [14]:
import gym
import gym_PVDER
import matplotlib.pyplot as plt
import tensorflow as tf

from tf_agents.agents.dqn import dqn_agent
from tf_agents.agents.dqn import q_network
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.environments import trajectory
from tf_agents.metrics import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.drivers import dynamic_episode_driver,dynamic_step_driver
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.utils import common
# Enable TensorFlow 2.x behavior; later cells call .numpy() on tensors and variables directly.
tf.compat.v1.enable_v2_behavior()
# Show which TensorFlow build is installed (the nightly packages are not pinned).
print(tf.__version__)

1.14.1-dev20190327


## Hyperparameters


In [0]:
# Gym id of the environment; passed to suite_gym.load().
env_name = 'PVDER-v0'  # @param
# Number of passes through the training loop.
num_iterations = 20000  # @param

# Steps collected into the replay buffer before training starts.
initial_collect_steps = 1000  # @param
# Steps collected at the start of every training-loop iteration.
collect_steps_per_iteration = 1  # @param
# max_length given to the replay buffer.
replay_buffer_capacity = 100000  # @param

# Forwarded to QNetwork as fc_layer_params
# (presumably the widths of the fully connected layers -- confirm in the tf-agents docs).
fc_layer_params = (100,)

# sample_batch_size used when drawing training batches from the replay buffer.
batch_size = 64  # @param
# Learning rate of the Adam optimizer.
learning_rate = 1e-3  # @param
# The training loss is printed whenever the train step is a multiple of this value.
log_interval = 200  # @param

# Number of episodes run for each policy evaluation.
num_eval_episodes = 10  # @param
# The policy is evaluated whenever the train step is a multiple of this value.
eval_interval = 1000  # @param

## Environment

In [0]:
# Load the environment through the tf-agents Gym loader, render it once,
# then print the observation and action specs it exposes.
env = suite_gym.load(env_name)
env.render()
print('Observation Spec:', env.time_step_spec().observation, sep='\n')
print('Action Spec:', env.action_spec(), sep='\n')

In [0]:
# Training environment: the Python (Gym) environment plus its TF wrapper.
train_py_env = suite_gym.load(env_name)
train_env = tf_py_environment.TFPyEnvironment(train_py_env)

# Evaluation environment: a separate instance, so evaluation runs do not
# share an environment object with data collection.
eval_py_env = suite_gym.load(env_name)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

## Create a DQN agent

In [0]:
# Counter handed to the agent as its train_step_counter; the training loop
# reads it to decide when to log and evaluate.
train_step_counter = tf.compat.v2.Variable(0)

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

# Q-network built from the training environment's observation and action specs.
q_net = q_network.QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=fc_layer_params,
)

# DQN agent using element-wise squared TD error as its loss.
tf_agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=dqn_agent.element_wise_squared_loss,
    train_step_counter=train_step_counter,
)
tf_agent.initialize()

## Create policies from the agent

In [0]:
# Baseline policy built only from the environment specs (it does not use the agent).
random_policy = random_tf_policy.RandomTFPolicy(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
)

# The agent's two policies: one used for data collection, one used for evaluation.
collect_policy = tf_agent.collect_policy
eval_policy = tf_agent.policy

## Create tf-agents driver for computing average return using tf-agents metrics module

In [0]:
# Metric objects that act as driver observers. `env_episodes` and `env_steps`
# are also attached to the data-collection driver in a later cell, so their
# counts cover both evaluation and collection.
average_return, env_episodes, env_steps = (
    tf_metrics.AverageReturnMetric(),
    tf_metrics.NumberOfEpisodes(),
    tf_metrics.EnvironmentSteps(),
)
average_return_observer = [average_return, env_episodes, env_steps]

def compute_average_return(num_episodes = num_eval_episodes):
    """Run `eval_policy` on `eval_env` and return the average episode return.

    Args:
        num_episodes: Number of evaluation episodes to run. Defaults to the
            value `num_eval_episodes` had when this cell was executed.

    Returns:
        The value of the `average_return` metric after the run, as a NumPy scalar.

    Notes:
        The step and episode counts printed here come from `env_steps` and
        `env_episodes`. Those metrics are also observers of the data-collection
        driver, so the printed numbers are cumulative over collection and
        evaluation, not per-evaluation counts.
    """
    # Forget returns recorded by earlier evaluations; otherwise they can leak
    # into this result (e.g. when fewer episodes are run than the metric buffers).
    average_return.reset()

    average_return_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        eval_env, eval_policy, average_return_observer, num_episodes=num_episodes)
    # run() is called without a time step or policy state, so the driver
    # chooses the starting point itself.
    final_time_step, policy_state = average_return_driver.run()

    # Read the metric once and reuse the value for both the report and the return.
    mean_return = average_return.result().numpy()

    print('final_time_step', final_time_step)
    print('Number of Steps: ', env_steps.result().numpy())
    print('Number of Episodes: ', env_episodes.result().numpy())
    print('Average Return: ', mean_return)

    return mean_return

## Create replay buffer

In [0]:
# Uniform replay buffer. Its data spec comes from the agent, its batch size
# from the training environment and its capacity from the hyperparameters.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=tf_agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_capacity,
)

## Create tf-agents driver for Data Collection

In [0]:
# Observers of the collection driver: store each trajectory in the replay
# buffer and update the episode / step counters.
data_collect_observer = [replay_buffer.add_batch, env_episodes, env_steps]

# One driver per distinct `num_steps`, reused across calls so that a new driver
# is not built on every training iteration. The cached drivers hold references
# to `train_env`, `collect_policy` and `data_collect_observer`: re-run this cell
# after recreating any of them.
_collect_drivers = {}

def collect_data(num_steps, VERBOSE=False):
    """Collect `num_steps` environment steps with `collect_policy`.

    The steps are taken in `train_env` and passed to every observer in
    `data_collect_observer`, which includes `replay_buffer.add_batch`.

    Args:
        num_steps: Number of steps the driver takes in this call.
        VERBOSE: If True, print the final time step and the running step and
            episode counters.
    """
    driver = _collect_drivers.get(num_steps)
    if driver is None:
        driver = dynamic_step_driver.DynamicStepDriver(
            train_env, collect_policy, data_collect_observer, num_steps=num_steps)
        _collect_drivers[num_steps] = driver

    # run() is called without a time step or policy state, exactly as before;
    # the driver decides where to start.
    final_time_step, policy_state = driver.run()
    if VERBOSE:
        print('final_time_step', final_time_step)
        print('Number of Steps: ', env_steps.result().numpy())
        print('Number of Episodes: ', env_episodes.result().numpy())

# Seed the replay buffer before training starts.
collect_data(initial_collect_steps, VERBOSE=True)

In [0]:
# Dataset generates trajectories with shape [Bx2x...]
dataset = replay_buffer.as_dataset(num_parallel_calls=3, sample_batch_size=batch_size, num_steps=2).prefetch(3)

iterator = iter(dataset)

## Training the agent

In [0]:
# Start the agent's train-step counter from zero.
tf_agent.train_step_counter.assign(0)

# Score the policy before any training; this is the first point of the curve.
avg_return = compute_average_return(num_episodes=num_eval_episodes)
returns = [avg_return]

for _ in range(num_iterations):
    # Add fresh experience from the collect policy to the replay buffer.
    collect_data(collect_steps_per_iteration)

    # Draw one batch from the buffer and apply one training update.
    experience, unused_info = next(iterator)
    train_loss = tf_agent.train(experience)

    step = tf_agent.train_step_counter.numpy()

    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss.loss))

    # Everything below runs only on evaluation steps.
    if step % eval_interval != 0:
        continue

    print('Evaluating agent at step = {}'.format(step))
    avg_return = compute_average_return(num_episodes=num_eval_episodes)
    print('step = {0}: Average Return = {1}'.format(step, avg_return))
    returns.append(avg_return)

## Plot average return

In [0]:
# x positions of the evaluations: step 0 (before training), then every `eval_interval` steps.
steps = range(0, num_iterations + 1, eval_interval)

fig, ax = plt.subplots()
ax.plot(steps, returns)
ax.set_xlabel('Step')
ax.set_ylabel('Average Return')
ax.set_ylim(top=250)

## Test the trained agent's policy for a few episodes

In [0]:
# Roll out the agent's policy for a few episodes, printing each action and
# rendering the underlying Python environment after every step.
num_episodes = 3
for _ in range(num_episodes):
    time_step = eval_env.reset()
    while True:
        if time_step.is_last():
            break
        action_step = tf_agent.policy.action(time_step)
        time_step = eval_env.step(action_step.action)
        print('Action:{}'.format(action_step.action))
        eval_py_env.render()