<a href="https://colab.research.google.com/github/prevelat/Machine_Learning/blob/master/CartPoleDistributedDataCollection(Bucket).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setup

In [0]:
# Note: If you haven't installed the following dependencies, run:
# One system package (xvfb) followed by the Python packages; several of them
# are pinned to exact versions.
!apt-get install -y xvfb
!pip install 'gym==0.10.11'
!pip install 'imageio==2.4.0'
!pip install PILLOW
!pip install 'pyglet==1.3.2'
!pip install pyvirtualdisplay
!pip install tf-agents
!pip install tensorflow==2.0.0
!pip install tensorflow-probability==0.8
# NOTE(review): `%%` is the cell-magic prefix but is used here in the middle
# of a cell; the line-magic spelling is `%tensorflow_version 2.x`. Confirm
# this line has the intended effect. The bare `except` keeps the cell from
# failing if the statement raises.
try:
  %%tensorflow_version 2.x
except:
  pass


In [0]:
from __future__ import absolute_import, division, print_function

import base64
import imageio
import IPython
import matplotlib
import matplotlib.pyplot as plt
import PIL.Image
import pyvirtualdisplay
import numpy as np
import datetime
import pandas as pd
import tensorflow as tf
import time

from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import q_network
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
from tf_agents.policies import policy_saver

# Switch TensorFlow to its 2.x behavior through the compat API before any
# TF objects are created in the cells below.
tf.compat.v1.enable_v2_behavior()

In [0]:
# Display the TensorFlow version that was actually imported.
tf.version.VERSION

In [0]:
import os
from google.colab import drive

# Mount Google Drive under /content/drive.
drive.mount('/content/drive')
# Use the absolute path below the mount point: a relative chdir only works
# from the initial working directory and fails when this cell is re-run
# after the working directory has already been changed.
os.chdir('/content/drive/My Drive/Colab Notebooks')

In [0]:
from google.colab import auth
# Authenticate the Colab user; the gcloud/gsutil commands in later cells
# presumably rely on these credentials (confirm).
auth.authenticate_user()

In [0]:
# https://cloud.google.com/resource-manager/docs/creating-managing-projects
# Google Cloud project that the gcloud CLI is configured to use.
project_id = 'ml-piscine-262622'
!gcloud config set project {project_id}

# Bucket locations: policies are listed and downloaded from the first,
# collected data files are uploaded to the second.
policy_bucket_folder = 'gs://ml-piscine-bucket/policy/'
data_bucket_folder = 'gs://ml-piscine-bucket/data/'
# Local folder names; not referenced by the cells visible in this notebook
# section.
policy_drive_folder = 'policies/'
data_drive_folder = 'data/'

#### Hyperparameters

In [0]:
# Passed as `max_length` when the replay buffer is constructed.
replay_buffer_max_length = 100000  # @param {type:"integer"}

# NOTE(review): `batch_size` is not referenced by the visible cells (the
# replay buffer uses `train_env.batch_size` instead) -- confirm it is needed.
batch_size = 64  # @param {type:"integer"}
# Learning rate handed to the Adam optimizer.
learning_rate = 1e-3  # @param {type:"number"}

# Number of environment steps recorded into each uploaded data file.
traj_to_collect_per_file = 2000  # @param {type:"integer"}

#### Environment

Load the CartPole environment from the OpenAI Gym suite. <br/>
Usually two environments are instantiated: one for training and one for evaluation.

In [0]:
env_name = 'CartPole-v0'
# NOTE(review): `env` is not used by the visible cells; it may be needed
# elsewhere in the notebook.
env = suite_gym.load(env_name)
# Separate Python environments for training and evaluation.
train_py_env = suite_gym.load(env_name)
eval_py_env = suite_gym.load(env_name)
# Wrap the Python environments as TF environments.
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

#### Agent

In [0]:
# Sizes of the fully connected layers of the Q-network (a single entry here).
fc_layer_params = (100,)

# Q-network built from the training environment's observation/action specs.
q_net = q_network.QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=fc_layer_params)

Now use `tf_agents.agents.dqn.dqn_agent` to instantiate a `DqnAgent`. In addition to the `time_step_spec`, `action_spec` and the QNetwork, the agent constructor also requires an optimizer (in this case, `AdamOptimizer`), a loss function, and an integer step counter.

In [0]:
# Adam optimizer configured with the `learning_rate` hyperparameter.
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

# Integer step counter handed to the agent.
train_step_counter = tf.Variable(0)

# DQN agent wired to the Q-network, the optimizer and an element-wise
# squared TD-error loss.
agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)

agent.initialize()

#### Policies

Download the best existing policy from the bucket if there is one; otherwise fall back to a random policy.

In [0]:
def get_policy_bucket():
  print('----------FETCHING POLICY')
  best_avg_ret = 0
  all_policies = !gsutil -q ls $policy_bucket_folder
  for name in all_policies:
    if '.policy.' in name:
      s = name[len(policy_bucket_folder):]
      split = s.split('.')
      if int(split[0]) > best_avg_ret:
        best_avg_ret = int(split[0])
        folder_name = s
  try:
    !mkdir $folder_name
  except:
    pass
  if best_avg_ret != 0:
    path = policy_bucket_folder + folder_name
    print(path)
    print('----------DOWNLOADING POLICY FROM BUCKET')
    print('--------------------------------------')
    print('--------------------------------------')
    !gsutil cp -r $path $folder_name
    print('--------------------------------------')
    print('--------------------------------------')
    sub_folder = folder_name + '/' + folder_name
    policy = tf.compat.v2.saved_model.load(sub_folder)
    !rm -r $folder_name
  else:
    policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(), train_env.action_spec())
    print('----------RANDOM POLICY')
  return policy, best_avg_ret

#### Replay Buffer

The replay buffer keeps track of data collected from the environment. This tutorial uses `tf_agents.replay_buffers.tf_uniform_replay_buffer.TFUniformReplayBuffer`, as it is the most common. 

The constructor requires the specs for the data it will be collecting. These are available from the agent through its `collect_data_spec` attribute. The batch size and maximum buffer length are also required.

In [0]:
# Uniform replay buffer matching the agent's collect data spec, sized by
# `replay_buffer_max_length` and batched like the training environment.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_max_length)

For most agents, collect_data_spec is a named tuple called Trajectory, containing the specs for observations, actions, rewards, and other items.

In [0]:
# Display the full spec of the data the agent collects.
agent.collect_data_spec

In [0]:
# Display only the field names of the collect data spec.
agent.collect_data_spec._fields

#### Data Collection and Uploading

Now execute the chosen policy in the environment for a fixed number of steps and record every transition as a plain dictionary.

In [0]:
def dict_from_traj(traj):
  """Flatten one trajectory into a plain dict of NumPy values.

  Args:
    traj: A sequence read positionally as (step_type, observation, action,
      policy_info, next_step_type, reward, discount). Every element except
      `policy_info` must expose `.numpy()`; element 0 of that result is
      stored.

  Returns:
    A dict with the keys 'step_type', 'observation', 'action',
    'next_step_type', 'reward', 'discount' and 'policy_info', in that order.
    'policy_info' holds `traj[3].numpy()` when available, otherwise
    `traj[3]` unchanged.
  """
  # (key, position in `traj`); position 3 (policy_info) is handled below.
  indexed_fields = (
      ('step_type', 0),
      ('observation', 1),
      ('action', 2),
      ('next_step_type', 4),
      ('reward', 5),
      ('discount', 6),
  )
  d = {key: traj[position].numpy()[0] for key, position in indexed_fields}

  policy_info = traj[3]
  try:
    d['policy_info'] = policy_info.numpy()
  except AttributeError:
    # policy_info is not tensor-like (e.g. an empty tuple): keep it as is.
    # Only AttributeError is caught so genuine failures and interrupts are
    # no longer silently swallowed.
    d['policy_info'] = policy_info
  return d

def collect_step(environment, policy, data_dict, i):
  """Take one policy action in `environment` and record the transition.

  The transition (current time step, action step, resulting time step) is
  converted with `dict_from_traj` and stored in `data_dict` under key `i`.
  `data_dict` is modified in place; nothing is returned.
  """
  current = environment.current_time_step()
  chosen = policy.action(current)
  following = environment.step(chosen.action)
  data_dict[i] = dict_from_traj(
      trajectory.from_transition(current, chosen, following))

def collect_data(env, policy, steps):
  """Run `policy` in `env` for `steps` steps and return the recorded data.

  Returns:
    A dict mapping each step index (0 .. steps - 1) to the dict recorded by
    `collect_step` for that step.
  """
  recorded = {}
  for step_index in range(steps):
    collect_step(env, policy, recorded, step_index)
  return recorded

Upload: each data file gets a unique name derived from the current timestamp.

In [0]:
def generate_data_file_name():
  """Return a file name of the form '<seconds>.data.json'.

  `<seconds>` is the number of seconds (as a float) between 1970-01-01 and
  the current naive local `datetime.now()`.
  """
  epoch = datetime.datetime(1970, 1, 1)
  elapsed = datetime.datetime.now() - epoch
  return '{}.data.json'.format(elapsed.total_seconds())

## Data Collection loop

In [0]:
 while True:

  # Get Policy
  policy, avg = get_policy_bucket()

  # Data Collection
  print('----------COLLECTING DATA WITH AVG = ', avg)
  data = collect_data(train_env, policy, steps=traj_to_collect_per_file)
  print('----------DATA COLLECTED')

  # Data to Drive
  data_file_name = generate_data_file_name()
  pd.DataFrame.to_json(pd.DataFrame(data=data),path_or_buf=data_file_name)
  print('----------FILE ' + data_file_name + ' CREATED')

  # Data to Bucket
  path = data_bucket_folder + data_file_name
  print('----------UPLOADING NEW DATA')
  print('--------------------------------------')
  print('--------------------------------------')
  !gsutil cp $data_file_name $path
  print('--------------------------------------')
  print('--------------------------------------')
  time.sleep(10)
  # !rm $data_file_name
  print('----------WAITING TO GENERATE NEW DATA')
  time.sleep(60)