<a href="https://colab.research.google.com/github/prevelat/Machine_Learning/blob/master/DRIVE_ONLY_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [0]:
# Note: If you haven't installed the following dependencies, run:
# Lines starting with `!` are shell commands run by the notebook runtime.
!apt-get install -y xvfb
!pip install 'gym==0.10.11'
!pip install 'imageio==2.4.0'
!pip install PILLOW
!pip install 'pyglet==1.3.2'
!pip install pyvirtualdisplay
# NOTE(review): tf-agents is installed without a version pin while TensorFlow
# and tensorflow-probability are pinned below -- confirm that the tf-agents
# release pip resolves is compatible with TensorFlow 2.0.0.
!pip install tf-agents
!pip install tensorflow==2.0.0
!pip install tensorflow-probability==0.8
# Presumably asks the Colab runtime for TensorFlow 2.x; any error raised here
# is deliberately ignored by the bare `except`.
# NOTE(review): `%%` is the cell-magic prefix and this line is not the first
# line of the cell -- confirm it behaves like the `%tensorflow_version` line
# magic in the target runtime.
try:
  %%tensorflow_version 2.x
except:
  pass

In [0]:
from __future__ import absolute_import, division, print_function

import base64
import imageio
import IPython
import matplotlib
import matplotlib.pyplot as plt
import PIL.Image
import pyvirtualdisplay
import numpy as np
import datetime
import pandas as pd
import time

import tensorflow as tf

from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import q_network
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
from tf_agents.policies import policy_saver

tf.compat.v1.enable_v2_behavior()

In [0]:
tf.version.VERSION

In [0]:
import os
from google.colab import drive

# Mount Google Drive and work from the notebooks folder.
# The target is given as an absolute path below the mount point so that this
# cell can be re-run safely: a relative 'drive/My Drive/...' path only
# resolves while the working directory is still the directory that contains
# `drive` (assumed to be /content, where the drive is mounted), and fails
# once this cell has already changed into the notebooks folder.
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/Colab Notebooks')

In [0]:
# Folder names, relative to the working directory selected above.
# NOTE(review): `policy_drive_folder` is not referenced by any other code
# visible in this notebook (policies are saved to the working directory) --
# confirm whether saved policies are meant to go here.
policy_drive_folder = 'Policies/'
# Destination to which `save_older_data` moves every data file except the
# newest one.
data_drive_folder = 'Data/'

### Hyperparameters

In [0]:
# Number of `agent.train` calls performed for each data file that is loaded.
num_iterations = 1000 # @param {type:"integer"}

# `max_length` passed to the replay buffer constructor.
replay_buffer_max_length = 100000  # @param {type:"integer"}

# Batch size used when sampling the replay buffer dataset for training.
batch_size = 64  # @param {type:"integer"}
# Learning rate of the Adam optimizer.
learning_rate = 1e-3  # @param {type:"number"}

# Episodes run per call to `compute_avg_return` in the training loop.
num_eval_episodes = 100  # @param {type:"integer"}
# The policy is evaluated whenever the train step is a multiple of this value.
eval_interval =   250# @param {type:"integer"}

# `trajectory_trimmer` scans step indices 0 .. collect_steps - 2 of each data
# file. Presumably the number of steps the data producer writes per file --
# confirm against the notebook that generates the data.
collect_steps = 500  # @param {type:"integer"}

### Environment

Load the CartPole environment from the OpenAI Gym suite. <br/>
Usually two environments are instantiated: one for training and one for evaluation.

In [0]:
# Gym environment id handed to `suite_gym.load`.
env_name = 'CartPole-v0'
# NOTE(review): `env` is not used by any other code visible in this notebook.
env = suite_gym.load(env_name)
# Independent Python environment instances for training and for evaluation.
train_py_env = suite_gym.load(env_name)
eval_py_env = suite_gym.load(env_name)
# Wrap each Python environment in a `TFPyEnvironment`. `train_env` supplies
# the specs for the network, agent and replay buffer; `eval_env` is the one
# stepped by `compute_avg_return` in the training loop.
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

### Agent

In [0]:
# Passed as `fc_layer_params` to the Q-network. Per the TF-Agents `QNetwork`
# API each entry is the size of one fully connected layer, so this requests a
# single layer of 100 units.
fc_layer_params = (100,)

# Q-network built from the training environment's observation and action
# specs; it is handed to the DQN agent as `q_network`.
q_net = q_network.QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=fc_layer_params)

Now use `tf_agents.agents.dqn.dqn_agent` to instantiate a `DqnAgent`. In addition to the `time_step_spec`, `action_spec` and the QNetwork, the agent constructor also requires an optimizer (in this case, `AdamOptimizer`), a loss function, and an integer step counter.

In [0]:
# Adam optimizer (TF1-compat API) configured with the `learning_rate`
# hyperparameter.
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

# Step counter handed to the agent. The training loop reads it through
# `agent.train_step_counter` and resets it to 0 for every new data file.
train_step_counter = tf.Variable(0)

# DQN agent built on `q_net`, using an element-wise squared loss on the
# TD errors.
agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)

agent.initialize()

### Replay Buffer

The replay buffer keeps track of data collected from the environment. This tutorial uses `tf_agents.replay_buffers.tf_uniform_replay_buffer.TFUniformReplayBuffer`, as it is the most common. 

The constructor requires the specs for the data it will be collecting. These are available from the agent through its `collect_data_spec` attribute. The batch size and maximum buffer length are also required.

In [0]:
# Replay buffer that stores the trajectories read from the data files.
# Note that `batch_size` here is the training environment's batch size, not
# the `batch_size` hyperparameter (that one is only used when sampling the
# buffer for training).
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_max_length)

For most agents, collect_data_spec is a named tuple called Trajectory, containing the specs for observations, actions, rewards, and other items.

In [0]:
agent.collect_data_spec

In [0]:
agent.collect_data_spec._fields

### Read data from file

In [0]:
def save_older_data(last_data, data_list):
  for name in data_list:
    if last_data != int(name.split('.')[0]):
      !mv $name $data_drive_folder

def get_data_drive(data_file_number):
  print('----------FETCHING DATA')
  last_data = 0
  # all_data = !ls $data_drive_folder
  all_data = !ls
  data_list = list()
  for name in all_data:
    if '.data.' in name:
      name = name[1:]
      data_list.append(name)
      s = name.split('.')
      if int(s[0]) > int(data_file_number):
        data_file_number = int(s[0])
        last_data = int(s[0])
        file_name = name
  if last_data == 0:
    print('----------NO NEW DATA, TRYING AGAIN...')
    time.sleep(10)
    return get_data_drive(data_file_number)
  else:
    save_older_data(last_data, data_list)
    print('----------DATA READY')
    # file_name = data_drive_folder + file_name
    return file_name, data_file_number

### Convert data back to Trajectory and add to buffer

In [0]:
def add_data_to_replay_buffer(buffer, data):
  """Rebuild one Trajectory per entry of `data` and add each to `buffer`.

  Args:
    buffer: Object exposing `add_batch`; in this notebook the replay buffer.
    data: Mapping-like object. Iterating it yields keys, and `data[key]` is a
      record indexable by 'step_type', 'observation', 'action',
      'policy_info', 'next_step_type', 'reward' and 'discount'.
      NOTE(review): presumably every field already carries the batch
      dimension `add_batch` expects -- confirm against the data producer.
  """
  def field_tensor(record, field, dtype):
    # Convert one stored field back to a tensor of the given dtype.
    return tf.convert_to_tensor(record[field], dtype)

  for key in data:
    record = data[key]
    buffer.add_batch(trajectory.Trajectory(
        field_tensor(record, 'step_type', tf.int32),
        field_tensor(record, 'observation', tf.float32),
        field_tensor(record, 'action', tf.int64),
        tuple(record['policy_info']),
        field_tensor(record, 'next_step_type', tf.int32),
        field_tensor(record, 'reward', tf.float32),
        field_tensor(record, 'discount', tf.float32)))
  print('Replay buffer ready')

The replay buffer is now a collection of Trajectories.

The agent needs access to the replay buffer. This is provided by creating an iterable `tf.data.Dataset` pipeline which will feed data to the agent.

Each row of the replay buffer only stores a single observation step. But since the DQN Agent needs both the current and next observation to compute the loss, the dataset pipeline will sample two adjacent rows for each item in the batch (`num_steps=2`).

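This dataset is also optimized by running parallel calls and prefetching data. In this notebook the pipeline is built inside the training loop (see "Training Loop" below), after each new data file has been loaded into the replay buffer.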

In [0]:
# dataset

In [0]:
# iterator = iter(dataset)
# print(iterator)
# # For the curious:
# # Uncomment to see what the dataset iterator is feeding to the agent.
# # Compare this representation of replay data 
# # to the collection of individual trajectories shown earlier.

# # iterator.next()

### Training Agent

#### Metrics and Evaluation

The most common metric used to evaluate a policy is the average return. The return is the sum of rewards obtained while running a policy in an environment for an episode. Several episodes are run, creating an average return.

The following function computes the average return of a policy, given the policy, environment, and a number of episodes.

In [0]:
#@test {"skip": true}
def compute_avg_return(environment, policy, num_episodes=10):
  """Run `policy` for `num_episodes` episodes and return the mean return.

  Args:
    environment: Object with `reset()` and `step(action)`, both returning a
      time step that exposes `is_last()` and `reward`.
    policy: Object whose `action(time_step)` result has an `action` attribute.
    num_episodes: Number of episodes to average over.

  Returns:
    Element 0 of `.numpy()` called on the averaged return.
  """
  def run_episode():
    # Play one episode from reset to the last step and sum its rewards.
    step = environment.reset()
    collected = 0.0
    while not step.is_last():
      chosen = policy.action(step)
      step = environment.step(chosen.action)
      collected += step.reward
    return collected

  return_sum = 0.0
  for _episode in range(num_episodes):
    return_sum += run_episode()

  mean_return = return_sum / num_episodes
  return mean_return.numpy()[0]


# See also the metrics module for standard implementations of different metrics.
# https://github.com/tensorflow/agents/tree/master/tf_agents/metrics

#### Training

-   use data to train the agent's neural network(s)

This example also periodically evaluates the policy and prints the current score.

### Visualization


#### Plots

Use `matplotlib.pyplot` to chart how the policy improved during training.

One episode of `CartPole-v0` lasts at most 200 time steps. The environment gives a reward of `+1` for each step the pole stays up, so the maximum return for one episode is 200. The chart shows the return increasing towards that maximum each time it is evaluated during training. (It may be a little unstable and not increase monotonically each time.)

In [0]:
def visualization(num_iterations, eval_interval, returns):
  """Plot average return against training iteration with pyplot.

  Args:
    num_iterations: Last iteration shown on the x axis.
    eval_interval: Spacing between the x positions, starting at 0.
    returns: One average return per x position.
  """
  eval_points = range(0, num_iterations + 1, eval_interval)
  plt.plot(eval_points, returns)
  plt.xlabel('Iterations')
  plt.ylabel('Average Return')
  # Only the upper y limit is fixed.
  plt.ylim(top=250)

### Policy Saver

In [0]:
def generate_policy_folder_name(avg, ID):
  """Build a folder name of the form '<avg>.policy.<seconds>'.

  Args:
    avg: Score placed at the front of the name (converted with `str`).
    ID: Accepted but not used.

  Returns:
    The folder name; `<seconds>` is the time elapsed between 1970-01-01 and
    the current naive local time, as a float rendered with `str`.
  """
  epoch = datetime.datetime(1970, 1, 1)
  elapsed_seconds = (datetime.datetime.now() - epoch).total_seconds()
  return '.policy.'.join([str(avg), str(elapsed_seconds)])

## Training Loop

In [0]:
def trajectory_trimmer(data, avg):
  """Delete, in place, finished episodes whose total reward is too low.

  Step indices 0 .. collect_steps - 2 are scanned in order (`collect_steps`
  is the notebook-level hyperparameter). Rewards are summed until a step
  whose 'step_type' equals 2 is reached (presumably the TF-Agents LAST step
  type -- confirm). If the sum is below the threshold, every step of that
  episode is removed from `data`. Steps after the last such marker are never
  removed.

  Args:
    data: Container indexable by integer step; each `data[step]` is indexable
      by 'reward' and 'step_type', and `del data[step]` must be supported.
    avg: Reward threshold; values above 100 are treated as 100.
  """
  threshold = min(avg, 100)
  episode_steps = []
  episode_reward = 0
  for step in range(collect_steps - 1):
    entry = data[step]
    episode_reward += entry['reward']
    episode_steps.append(step)
    if entry['step_type'] != 2:
      continue
    # Episode finished: drop it when it scored below the threshold.
    if episode_reward < threshold:
      for index in episode_steps:
        del data[index]
    episode_steps = []
    episode_reward = 0

In [0]:
# Highest data-file number consumed so far; `get_data_drive` only returns a
# file with a larger number.
data_file_number = 0
# Best average return observed so far over all rounds. It is also the
# threshold handed to `trajectory_trimmer` for the next data file.
best_avg = 0
# NOTE(review): `ID` is passed to `generate_policy_folder_name`, which does
# not use it, and it is never changed.
ID = 0

# (Optional) Optimize by wrapping some of the code in a graph using TF
# function. This is done once, before the loop: wrapping inside the loop
# would wrap the already wrapped function again on every round.
agent.train = common.function(agent.train)

# Each pass of this endless loop is one round: wait for a new data file, load
# it into the replay buffer, train on it and export improved policies.
while True:

  # Fetch new data
  data_name, data_file_number = get_data_drive(data_file_number)
  data_read = pd.read_json(data_name)
  trajectory_trimmer(data_read, best_avg)

  # Feed replay buffer and ready dataset for training. `num_steps=2` makes
  # every sampled item a pair of adjacent buffer rows.
  replay_buffer.clear()
  add_data_to_replay_buffer(replay_buffer, data_read)
  dataset = replay_buffer.as_dataset(
      num_parallel_calls=3,
      sample_batch_size=batch_size,
      num_steps=2).prefetch(3)
  iterator = iter(dataset)

  # Train and Eval

  ## Reset the train step so the print/eval intervals restart every round.
  agent.train_step_counter.assign(0)

  ## Evaluate the agent's policy once before training.
  avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
  print("Evaluate the agent's policy once before training - ", avg_return)
  # Evaluation results of this round, in order; the layout matches what
  # `visualization(num_iterations, eval_interval, returns)` expects.
  returns = [avg_return]
  print('Last best average return = ', best_avg)

  print('--------------------------------------')
  for _ in range(num_iterations):

    # Sample a batch of data from the buffer and update the agent's network.
    experience, unused_info = next(iterator)
    train_loss = agent.train(experience).loss

    step = agent.train_step_counter.numpy()

    if step % 50 == 0:
      print('step = {0}: loss = {1}'.format(step, train_loss))

    if step % eval_interval == 0:
      avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
      print('step = {0}: Average Return = {1}'.format(step, avg_return))
      returns.append(avg_return)

      # Save policy to drive as soon as it beats the best score so far.
      # The export has to happen now: a PolicySaver writes the policy's
      # variables as they are when `save` is called, so postponing the save
      # until the round is over would store the final weights under the
      # best score's name instead of the weights that achieved it.
      if avg_return > best_avg:
        best_avg = avg_return
        policy_folder_name = generate_policy_folder_name(best_avg, ID)
        saver = policy_saver.PolicySaver(agent.policy, batch_size=None)
        saver.save(policy_folder_name)
  print('--------------------------------------')