<a href="https://colab.research.google.com/github/prevelat/Machine_Learning/blob/master/BQ_DataCollection_documented.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

### Installation

In [0]:
# Note: If you haven't installed the following dependencies, run:
# System package: virtual X framebuffer (xvfb).
!apt-get install -y xvfb
# Python packages; gym, imageio, pyglet, tensorflow and tensorflow-probability
# are pinned, the others install whatever version is current.
!pip install 'gym==0.10.11'
!pip install 'imageio==2.4.0'
!pip install PILLOW
!pip install 'pyglet==1.3.2'
!pip install pyvirtualdisplay
# NOTE(review): tf-agents is unpinned while tensorflow is pinned to 2.0.0 just
# below — confirm the two versions are compatible.
!pip install tf-agents
!pip install tensorflow==2.0.0
!pip install tensorflow-probability==0.8
try:
  # NOTE(review): `%%` normally denotes a cell magic, which is only recognized
  # on the first line of a cell; this was presumably meant to be the line
  # magic `%tensorflow_version 2.x` — confirm.
  %%tensorflow_version 2.x
except:
  # Any failure of the magic is deliberately ignored.
  pass

In [0]:
from __future__ import absolute_import, division, print_function

import base64
import imageio
import IPython
import matplotlib
import matplotlib.pyplot as plt
import PIL.Image
import pyvirtualdisplay
import numpy as np
import datetime
import pandas as pd
import tensorflow as tf
import time

from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import q_network
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
from tf_agents.policies import policy_saver

# Switch TensorFlow to its v2 behavior before any TF objects are built.
tf.compat.v1.enable_v2_behavior()

In [0]:
# Display the TensorFlow version that was actually imported.
tf.version.VERSION

### Cloud Management

In [0]:
# Google Cloud Setup

import os
from google.colab import auth
from google.colab import drive
from google.cloud import bigquery

# Mount Google Drive and try to work from the notebooks folder.
drive.mount('/content/drive')
try:
  os.chdir("drive/My Drive/Colab Notebooks")
except OSError as err:
  # Best effort: the path is relative, so it no longer resolves once the
  # working directory has changed (e.g. when this cell is run a second time).
  print('Working directory left unchanged:', err)

# Authenticate this Colab session so the Google Cloud clients can be used.
auth.authenticate_user()
print('Authenticated')

In [0]:
#Google Cloud Variables

project_id = 'ml-piscine-262622'
dataset_id = 'ml_piscine_bq'

environment_table_id = dataset_id + '.environment'
agent_policy_table_id = dataset_id + '.agent_policy_v2'
observation_table_id = dataset_id + '.observation'
episodes_table_id = dataset_id + '.episodes'
steps_table_id = dataset_id + '.steps'
traj_raw_table_id = dataset_id + '.traj_raw'

client = bigquery.Client(project=project_id)
try:
  dataset = bigquery.Dataset(dataset_id)
  dataset.location = "US"
  dataset = client.create_dataset(dataset)
except:
  pass

!gcloud config set project {project_id}
policy_bucket_folder = 'gs://ml-piscine-bucket/policy/'

### Hyperparameters

In [0]:
# Maximum length passed to the replay buffer constructor.
replay_buffer_max_length = 100000  # @param {type:"integer"}

# NOTE(review): batch_size is not referenced anywhere else in the visible part
# of this notebook — confirm whether it is still needed.
batch_size = 64  # @param {type:"integer"}
# Learning rate handed to the Adam optimizer of the DQN agent.
learning_rate = 1e-3  # @param {type:"number"}

# Number of environment steps collected per iteration of the collection loop;
# also stored in the environment table.
collect_steps = 1000  # @param {type:"integer"}

# Hidden-layer sizes for the Q-network (passed as fc_layer_params).
dqn_layer_params = (100, )  # @param

### Environment

Load the CartPole environment from the OpenAI Gym suite. <br/>
Usually two environments are instantiated: one for training and one for evaluation.

In [0]:
# Name of the Gym environment; also written to the environment table.
env_name = 'CartPole-v0'
# NOTE(review): `env` is not referenced elsewhere in the visible notebook.
env = suite_gym.load(env_name)
# Separate Python environments for training (data collection) and evaluation.
train_py_env = suite_gym.load(env_name)
eval_py_env = suite_gym.load(env_name)
# Wrap the Python environments as TF environments.
# NOTE(review): only train_env is used by the visible cells; eval_env appears
# unused here — confirm.
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

### Agent

The DQN agent can be used in any environment which has a discrete action space.

At the heart of a DQN Agent is a QNetwork, a neural network model that can learn to predict QValues (expected returns) for all actions, given an observation from the environment.

Use tf_agents.networks.q_network to create a QNetwork, passing in the observation_spec, action_spec, and a tuple describing the number and size of the model's hidden layers.

In [0]:
# Hidden-layer sizes come from the hyperparameter cell.
fc_layer_params = dqn_layer_params

# Q-network built from the training environment's observation and action specs.
q_net = q_network.QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=fc_layer_params)

Now use `tf_agents.agents.dqn.dqn_agent` to instantiate a `DqnAgent`. In addition to the `time_step_spec`, `action_spec` and the QNetwork, the agent constructor also requires an optimizer (in this case, `AdamOptimizer`), a loss function, and an integer step counter.

In [0]:
# Adam optimizer configured with the learning rate from the hyperparameter cell.
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

# Integer step counter handed to the agent.
train_step_counter = tf.Variable(0)

# DQN agent built on the Q-network, using an element-wise squared TD-error loss.
agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)

agent.initialize()

### Policy

Download the most recently saved policy.

In [0]:
def get_policy(ID):

  """
  Reach for the Bucket and update policy based on it's ID
  Arguments:
      ID : ID of the policy to be downloaded
  Returns:
      Agent policy and it's average return for reference
  """

  print('-----------------------FETCHING POLICY')
  query = "SELECT `" + agent_policy_table_id + "`.source \
            FROM `" + agent_policy_table_id + "` \
            ORDER BY ID DESC LIMIT 1"
  path = pd.read_gbq(query=query, project_id=project_id).transpose().to_numpy()[0][0]
  folder_name = path.split('/')[-1]
  avg_ret = float(folder_name.split('.')[1] + '.' + folder_name.split('.')[2])
  !mkdir $folder_name
  path = policy_bucket_folder + folder_name
  print('--------DOWNLOADING POLICY FROM BUCKET')
  print('--------------------------------------')
  print('--------------------------------------')
  !gsutil cp -r $path $folder_name
  print('--------------------------------------')
  print('--------------------------------------')
  sub_folder = folder_name + '/' + folder_name
  policy = tf.compat.v2.saved_model.load(sub_folder)
  !rm -r $folder_name

  return policy, avg_ret

### Replay Buffer

The replay buffer keeps track of data collected from the environment. This tutorial uses `tf_agents.replay_buffers.tf_uniform_replay_buffer.TFUniformReplayBuffer`, as it is the most common. 

The constructor requires the specs for the data it will be collecting. This is available from the agent using the `collect_data_spec` method. The batch size and maximum buffer length are also required.

In [0]:
# Uniform replay buffer sized from the agent's collect data spec, the training
# environment's batch size and the configured maximum length.
# NOTE(review): the visible collection code never writes to this buffer —
# confirm whether it is still needed in this notebook.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_max_length)

For most agents, collect_data_spec is a named tuple called Trajectory, containing the specs for observations, actions, rewards, and other items.

In [0]:
# Display the full spec of the data the agent collects.
agent.collect_data_spec

In [0]:
# Display the field names of the spec; dict_from_traj indexes trajectories by
# position, so this order matters.
agent.collect_data_spec._fields

### Data Collection fn

In [0]:
def dict_from_traj(traj):

  """
  Serialize trajectory tensor to dict
  Fields are read by position (0: step_type, 1: observation, 2: action,
  3: policy_info, 4: next_step_type, 5: reward, 6: discount) and, except for
  policy_info, the first element of each field's numpy value is stored.
  Arguments:
      traj : trajectory tensor
  Returns:
      d : dict containing the serialized tensor plus a 'datetime' string
          holding the local time of serialization
  """

  d = {
    'step_type': traj[0].numpy()[0],
    'observation': traj[1].numpy()[0],
    'action': traj[2].numpy()[0],
    'next_step_type': traj[4].numpy()[0],
    'reward': traj[5].numpy()[0],
    'discount': traj[6].numpy()[0],
    'datetime': datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
  }
  policy_info = traj[3]
  try:
    d['policy_info'] = policy_info.numpy()
  except AttributeError:
    # policy_info is not a tensor (no .numpy()): store it unchanged. Only this
    # case is handled, so interrupts and real errors are no longer swallowed.
    d['policy_info'] = policy_info
  return d

def collect_step(environment, policy, data_dict, i):

  """
  Advance the environment by one step under the given policy and record it
  Arguments:
      environment : environment to step
      policy : agent policy that chooses the action
      data_dict : dict that receives the serialized step under key i
      i : step counter, used as the key in data_dict
  """

  before = environment.current_time_step()
  chosen = policy.action(before)
  after = environment.step(chosen.action)
  # Bundle the transition into a trajectory and store its dict form.
  data_dict[i] = dict_from_traj(
      trajectory.from_transition(before, chosen, after))

def collect_data(env, policy, steps):

  """
  Collect a fixed number of steps and return them keyed by step index
  Arguments:
      env : environment to collect from
      policy : agent policy used to act in the environment
      steps : number of steps to collect
  Returns:
      dict mapping 0..steps-1 to the serialized step collected at that index
  """

  collected = {}
  for step_index in range(steps):
    collect_step(env, policy, collected, step_index)
  return collected

### GBQ fn

In [0]:
# BigQuery table schemas, passed as `table_schema` when uploading DataFrames.
# Every column is REQUIRED, so uploaded rows must supply every field.

# One row per collected step; obs_ID and epi_ID hold the IDs of the matching
# observation row and of the episode the step belongs to.
steps_schema = [
  {'name': 'ID', 'type': 'INTEGER', 'mode': 'REQUIRED'},
  {'name': 'obs_ID', 'type': 'INTEGER', 'mode': 'REQUIRED'},
  {'name': 'epi_ID', 'type': 'INTEGER', 'mode': 'REQUIRED'},
  {'name': 'order_in_epi', 'type': 'INTEGER', 'mode': 'REQUIRED'},
  {'name': 'action', 'type': 'INTEGER', 'mode': 'REQUIRED'},
  {'name': 'reward', 'type': 'FLOAT', 'mode': 'REQUIRED'},
  {'name': 'discount', 'type': 'FLOAT', 'mode': 'REQUIRED'},
  {'name': 'step_type', 'type': 'INTEGER', 'mode': 'REQUIRED'},
  {'name': 'next_step_type', 'type': 'INTEGER', 'mode': 'REQUIRED'},
  {'name': 'policy_info', 'type': 'STRING', 'mode': 'REQUIRED'}
]

# One row per observation, stored as four float components.
observation_schema = [
  {'name': 'ID', 'type': 'INTEGER', 'mode': 'REQUIRED'},
  {'name': 'obs0', 'type': 'FLOAT', 'mode': 'REQUIRED'},
  {'name': 'obs1', 'type': 'FLOAT', 'mode': 'REQUIRED'},
  {'name': 'obs2', 'type': 'FLOAT', 'mode': 'REQUIRED'},
  {'name': 'obs3', 'type': 'FLOAT', 'mode': 'REQUIRED'},
]

# Schema of the agent policy table. This notebook only reads that table
# (`ID` and `source`); nothing visible here uploads with this schema.
agent_policy_schema = [
  {'name': 'ID', 'type': 'INTEGER', 'mode': 'REQUIRED'},
  {'name': 'source', 'type': 'STRING', 'mode': 'REQUIRED'},
  {'name': 'avg_return', 'type': 'FLOAT', 'mode': 'REQUIRED'},
  {'name': 'eval_episodes', 'type': 'INTEGER', 'mode': 'REQUIRED'},
  {'name': 'training_steps', 'type': 'INTEGER', 'mode': 'REQUIRED'},
  {'name': 'optimizer', 'type': 'STRING', 'mode': 'REQUIRED'},
  {'name': 'loss_fn', 'type': 'STRING', 'mode': 'REQUIRED'},
  {'name': 'learning_rate', 'type': 'FLOAT', 'mode': 'REQUIRED'},
  {'name': 'dqn_layer_params', 'type': 'STRING', 'mode': 'REQUIRED'}
]

# One row per finished episode, linked to the policy and environment rows used.
# The datetime is stored as a formatted string, not as a timestamp type.
episodes_schema = [
  {'name': 'ID', 'type': 'INTEGER', 'mode': 'REQUIRED'},
  {'name': 'ag_pol_ID', 'type': 'INTEGER', 'mode': 'REQUIRED'},
  {'name': 'env_ID', 'type': 'INTEGER', 'mode': 'REQUIRED'},
  {'name': 'score', 'type': 'FLOAT', 'mode': 'REQUIRED'},
  {'name': 'datetime', 'type': 'STRING', 'mode': 'REQUIRED'}
]

# One row per run of the environment upload cell.
environment_schema = [
  {'name': 'ID', 'type': 'INTEGER', 'mode': 'REQUIRED'},
  {'name': 'env_name', 'type': 'STRING', 'mode': 'REQUIRED'},
  {'name': 'source', 'type': 'STRING', 'mode': 'REQUIRED'},
  {'name': 'collect_steps', 'type': 'INTEGER', 'mode': 'REQUIRED'}
]

In [0]:
def get_last_ID(table_id):

  """
  Query for the last ID in table
  Arguments:
      table_id : BigQuery table id (`dataset.table`) used as reference
  Returns:
      The last (highest) integer ID found in the table
  Raises:
      Whatever the query raises (e.g. when the table does not exist), and a
      lookup error when the table has no rows
  """

  path = project_id + "." + table_id
  # Only the ID column is selected: `SELECT *` would read every column of the
  # table just to return a single number.
  query = "SELECT ID FROM `" + path + "` ORDER BY ID DESC LIMIT 1"
  row = pd.read_gbq(query=query, project_id=project_id)
  return int(row['ID'][0])

def _last_ID_plus(table_id, offset):

  """
  Return the last ID of a table plus an offset, or 0 when the lookup fails
  Arguments:
      table_id : BigQuery table id (`dataset.table`) to look at
      offset : value added to the last ID found
  Returns:
      offset + last ID, or 0 if the last ID could not be read
  """

  try:
    return offset + get_last_ID(table_id)
  except Exception as err:
    # Expected when the table is missing or empty. Reported rather than
    # silently ignored, because restarting at 0 on any other failure would
    # produce duplicate IDs.
    print('No last ID for ' + table_id + ' (' + str(err) + '); using 0')
    return 0

def get_IDs():

  """
  Query for the last agent_policy, steps, observation and episode table IDs so no data is overlapped
  Returns:
      ag_pol_ID : last policy ID found (0 when none could be read)
      steps_ID, obs_ID, epi_ID : next free ID for each table, i.e. last ID + 1
          (0 when none could be read)
  """

  # The policy ID refers to an existing row, so nothing is added to it.
  ag_pol_ID = _last_ID_plus(agent_policy_table_id, 0)
  steps_ID = _last_ID_plus(steps_table_id, 1)
  obs_ID = _last_ID_plus(observation_table_id, 1)
  epi_ID = _last_ID_plus(episodes_table_id, 1)
  return ag_pol_ID, steps_ID, obs_ID, epi_ID

Upload environment data to BigQuery

In [0]:
env_d = dict()

# Next free environment ID: last ID + 1, or 0 when it cannot be read.
env_ID = 0
try:
  env_ID = 1 + get_last_ID(environment_table_id)
except Exception as err:
  # Expected on the very first run (table missing or empty); reported so that
  # other failures do not silently restart the numbering at 0.
  print('No last environment ID (' + str(err) + '); using 0')

env_d[0] = {
    'ID': env_ID,
    'env_name': env_name,
    'source': str("gym"),
    'collect_steps': int(collect_steps)
}

# env_d maps row index -> record, so transpose to get one row per record.
df_env = pd.DataFrame(env_d).transpose()
df_env.to_gbq(environment_table_id, if_exists='append', project_id=project_id, table_schema=environment_schema)

## Data Collection loop

In [0]:
# Flag read by the collection loop: while it is True and no policy ID exists,
# the loop uses a random policy once and then sets the flag to False.
first_run = True

In [0]:
# Seconds to wait before polling for a policy again or retrying a download.
retry_delay_s = 15

# Per-episode accumulators live outside the batch loop: train_env keeps its
# state between batches, so an episode still running when one batch ends
# continues in the next and its step order and score must carry over.
order_in_epi = 0
score = 0

while True:

  ag_pol_ID, steps_ID, obs_ID, epi_ID = get_IDs()

  # Get Policy
  if ag_pol_ID == 0 and first_run:
    # No policy has been published yet: collect the first batch at random.
    first_run = False
    policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(), train_env.action_spec())
    avg = 0
  else:
    while True:
      if ag_pol_ID == 0:
        print("Awaiting new Policy")
        time.sleep(retry_delay_s)
        try:
          ag_pol_ID = get_last_ID(agent_policy_table_id)
        except Exception:
          # Still no policy row; keep polling. Catching Exception rather than
          # everything keeps the loop interruptible from the notebook.
          pass
      else:
        try:
          policy, avg = get_policy(ag_pol_ID)
          break
        except Exception as err:
          # Report the failure and back off instead of retrying in a tight loop.
          print("Policy download failed, retrying:", err)
          time.sleep(retry_delay_s)

  print("ag_pol = " + str(ag_pol_ID))
  print("steps = " + str(steps_ID))
  print("obs = " + str(obs_ID))
  print("epi = " + str(epi_ID))
  print("env = " + str(env_ID))

  # Data Collection
  print('----------COLLECTING DATA WITH EVALUATION AVG = ', avg)
  data = collect_data(train_env, policy, steps=collect_steps)
  print('----------DATA COLLECTED')

  # update tables
  step_d = dict()
  obs_d = dict()
  epi_d = dict()

  for i in data:
    obs_d[i] = {
        'ID': obs_ID,
        'obs0': float(data[i]['observation'][0]),
        'obs1': float(data[i]['observation'][1]),
        'obs2': float(data[i]['observation'][2]),
        'obs3': float(data[i]['observation'][3])
    }
    step_d[i] = {
        'ID': steps_ID,
        'obs_ID': obs_ID,
        'epi_ID': epi_ID,
        'order_in_epi': order_in_epi,
        'action': int(data[i]['action']),
        'reward': float(data[i]['reward']),
        'discount': float(data[i]['discount']),
        'step_type': int(data[i]['step_type']),
        'next_step_type': int(data[i]['next_step_type']),
        'policy_info': str(data[i]['policy_info'])
    }
    order_in_epi += 1
    score += data[i]['reward']
    # step_type 2 closes the episode (presumably StepType.LAST — confirm
    # against the TF-Agents version in use).
    if (data[i]['step_type'] == 2):
      epi_d[i] = {
          'ID': epi_ID,
          'ag_pol_ID': ag_pol_ID,
          'env_ID': env_ID,
          'score': float(score),
          'datetime': data[i]['datetime']
      }
      order_in_epi = 0
      epi_ID += 1
      score = 0
    obs_ID += 1
    steps_ID += 1

  # Each dict maps row index -> record, so transpose to get one row per record.
  df_obs = pd.DataFrame(obs_d).transpose()
  df_step = pd.DataFrame(step_d).transpose()
  df_epi = pd.DataFrame(epi_d).transpose()

  df_obs.to_gbq(observation_table_id, if_exists='append', project_id=project_id, table_schema=observation_schema)
  df_step.to_gbq(steps_table_id, if_exists='append', project_id=project_id, table_schema=steps_schema)
  if not df_epi.empty:
    # Nothing to upload when no episode finished within this batch.
    df_epi.to_gbq(episodes_table_id, if_exists='append', project_id=project_id, table_schema=episodes_schema)


  print("\nCOLLECTION COMPLETE - DATA TABLE UPDATED")
