In [1]:
import ray
import gym
import numpy as np
import psutil
import scipy.signal
# psutil.cpu_count(logical=False) can return None when the physical core count
# cannot be determined; fall back to the logical count, then to 1, so that
# later range(num_cpus) calls always receive an integer.
num_cpus = psutil.cpu_count(logical=False) or psutil.cpu_count() or 1

ray.init(num_cpus=num_cpus)

2019-09-04 14:20:54,463	INFO node.py:498 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-09-04_14-20-54_462374_74501/logs.
2019-09-04 14:20:54,572	INFO services.py:409 -- Waiting for redis server at 127.0.0.1:45234 to respond...
2019-09-04 14:20:54,795	INFO services.py:409 -- Waiting for redis server at 127.0.0.1:28325 to respond...
2019-09-04 14:20:54,801	INFO services.py:809 -- Starting Redis shard with 3.44 GB max memory.
2019-09-04 14:20:54,816	INFO node.py:512 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-09-04_14-20-54_462374_74501/logs.
2019-09-04 14:20:54,818	INFO services.py:1475 -- Starting the Plasma object store with 5.15 GB memory using /tmp.


{'node_ip_address': '10.16.51.224',
 'redis_address': '10.16.51.224:45234',
 'object_store_address': '/tmp/ray/session_2019-09-04_14-20-54_462374_74501/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2019-09-04_14-20-54_462374_74501/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2019-09-04_14-20-54_462374_74501'}

In [None]:




@ray.remote
def f(image, random_filter):
    """Convolve `image` with `random_filter` and keep every 5th row and column.

    Args:
        image: 2-D array to process.
        random_filter: 2-D convolution kernel.

    Returns:
        The full 2-D convolution, subsampled with a stride of 5 on both axes.
    """
    convolved = scipy.signal.convolve2d(image, random_filter)
    # Downsample the result so only a small array is shipped back to the caller.
    return convolved[::5, ::5]

# One normally distributed 4x4 filter per worker; the list has num_cpus entries.
filters = [np.random.normal(size=(4, 4)) for _ in range(num_cpus)]

# Time the code below.
def test():
    """Benchmark body for the Ray version: 10 rounds of parallel filtering.

    Each round stores a fresh 3000x3000 zero image in the object store once
    and launches num_cpus remote `f` tasks against it, one per filter,
    blocking until all of them have finished.
    """
    n_rounds = 10
    for _round in range(n_rounds):
        # Put the image once and hand the same object reference to every task.
        shared_image = ray.put(np.zeros((3000, 3000)))
        pending = [f.remote(shared_image, filters[worker]) for worker in range(num_cpus)]
        ray.get(pending)
        
# Time one call of the Ray benchmark with the IPython %time line magic.
%time test()

In [None]:
from multiprocessing import Pool
import numpy as np
import psutil
import scipy.signal

# psutil.cpu_count(logical=False) can return None when the physical core count
# cannot be determined; fall back to the logical count, then to 1, so that
# Pool(num_cpus) and range(num_cpus) always receive an integer.
num_cpus = psutil.cpu_count(logical=False) or psutil.cpu_count() or 1

def f(args):
    """Convolve an image with a filter and keep every 5th row and column.

    Args:
        args: an `(image, random_filter)` pair, packed into a single argument
            so the function can be used with `Pool.map`.

    Returns:
        The full 2-D convolution, subsampled with a stride of 5 on both axes.
    """
    image, kernel = args
    convolved = scipy.signal.convolve2d(image, kernel)
    return convolved[::5, ::5]

# Worker pool with num_cpus processes, shared by every call to test().
pool = Pool(num_cpus)

# One normally distributed 4x4 filter per worker.
filters = [np.random.normal(size=(4, 4)) for _ in range(num_cpus)]

# Time the code below.

def test():
    """Benchmark body for the multiprocessing version: 10 rounds of filtering.

    Each round builds a fresh 3000x3000 zero image, pairs it with every
    filter and maps `f` over those pairs on the module-level pool.
    """
    for _ in range(10):
        image = np.zeros((3000, 3000))
        # Every work item references the same image object; only the filter differs.
        work_items = list(zip([image] * num_cpus, filters))
        pool.map(f, work_items)
%time test()

In [None]:
import numpy as np
import tensorflow as tf
import gym
import time
import pybullet
import reach2D
from SAC import *
from train import *

In [None]:
class mlp_gail_discriminator(Model):
  """GAIL discriminator: scores an (observation, action) pair with a probability.

  The observation and action are concatenated along the last axis, passed
  through an MLP trunk and mapped to a single sigmoid output. The training
  step in this notebook pushes that output towards 1 for expert pairs and
  towards 0 for agent pairs.

  Args:
    hidden_sizes: widths of the hidden layers of the trunk. Any iterable of
      ints is accepted; it is copied into a list before use.
    activation: activation name handed to `mlp` for both of its activation
      arguments.

  NOTE(review): `Model`, `Dense` and `mlp` are not defined in this notebook;
  presumably they come from the star imports (`SAC` / `train`) - confirm the
  meaning of the third `mlp` argument there.
  """

  def __init__(self, hidden_sizes=(32, 32, 32), activation='relu'):
    # An immutable tuple default avoids sharing one mutable list object
    # between every instance created without an explicit hidden_sizes.
    super().__init__()
    self.mlp = mlp(list(hidden_sizes), activation, activation)
    self.prob = Dense(1, activation='sigmoid')

  def call(self, obs, acts):
    """Return the sigmoid output of the network for each (obs, acts) pair.

    Args:
      obs: batch of observations.
      acts: batch of actions; must be concatenable with `obs` on the last axis.

    Returns:
      Tensor whose last dimension is 1 (output of the final Dense(1) layer).
    """
    x = tf.concat([obs, acts], axis=-1)
    x = self.mlp(x)
    return self.prob(x)



In [None]:
def discriminator_train_step(batch, expert_batch, discriminator, discriminator_optimizer, eps=1e-8):
    """Take one gradient step on the GAIL discriminator.

    The discriminator is trained to output a high probability for expert
    (obs, action) pairs and a low probability for agent pairs, by minimising
    -log(D(expert)) - log(1 - D(agent)) summed over the batch.

    Args:
        batch: dict of agent transitions; reads the keys 'obs1' and 'acts'.
        expert_batch: dict of expert transitions; reads the keys 'obs' and 'acts'.
        discriminator: callable model taking (obs, acts) and returning probabilities.
        discriminator_optimizer: optimizer whose `apply_gradients` updates the
            discriminator's trainable variables.
        eps: small constant added inside both logarithms so that a saturated
            output (exactly 0 or 1) gives a large finite loss instead of
            inf / NaN gradients.

    Returns:
        Tuple `(loss, expert_accuracy, agent_accuracy)` as NumPy values, where
        the accuracies are the fraction of expert outputs above 0.5 and the
        fraction of agent outputs below 0.5.
    """
    batch_obs, batch_acts = batch['obs1'], batch['acts']
    batch_expert_obs, batch_expert_acts = expert_batch['obs'], expert_batch['acts']
    with tf.GradientTape() as tape:
        # We'd like to maximise the log probability of the expert pairs and
        # minimise it for generated pairs; since optimizers minimise, use the
        # negative log probabilities as the loss.
        expert_probs = discriminator(batch_expert_obs, batch_expert_acts)
        expert_loss = -tf.math.log(expert_probs + eps)
        agent_probs = discriminator(batch_obs, batch_acts)
        # i.e. minimise -log(1 - prob_generated_is_expert). The same quantity
        # is used as the policy's reward: the policy tries to maximise it
        # while the discriminator tries to minimise it.
        agent_loss = -tf.math.log(1 - agent_probs + eps)
        loss = tf.reduce_sum(expert_loss + agent_loss)

    # Metrics only; they do not contribute to the gradient, so they are
    # computed outside the tape.
    expert_accuracy = tf.reduce_mean(tf.cast(expert_probs > 0.5, tf.float32))
    agent_accuracy = tf.reduce_mean(tf.cast(agent_probs < 0.5, tf.float32))

    gradients = tape.gradient(loss, discriminator.trainable_variables)
    discriminator_optimizer.apply_gradients(zip(gradients, discriminator.trainable_variables))
    return loss.numpy(), expert_accuracy.numpy(), agent_accuracy.numpy()

In [None]:
# Expert demonstrations stored on disk as NumPy arrays: observations and actions.
# NOTE(review): presumably row i of both arrays belongs to the same transition -
# sample_expert_transitions indexes both arrays with the same indices.
expert_obs = np.load('collected_data/expert_obs_Pendulum-v0_Hidden_32l_25000.npy')
expert_acts = np.load('collected_data/expert_actions_Pendulum-v0_Hidden_32l_25000.npy')

def sample_expert_transitions(batch_size):
    """Draw a random batch of expert (observation, action) pairs.

    Indices are drawn uniformly with replacement from the module-level
    `expert_obs` / `expert_acts` arrays.

    Args:
        batch_size: number of transitions to draw.

    Returns:
        Dict with the keys 'obs' and 'acts', each holding `batch_size` rows.
    """
    n_expert = len(expert_obs)
    picks = np.random.randint(0, n_expert, size=batch_size)
    sampled_obs = expert_obs[picks]
    sampled_acts = expert_acts[picks]
    return {'obs': sampled_obs, 'acts': sampled_acts}

In [None]:
def gail_run(discrim_req_acc):
    """Run one GAIL experiment on Pendulum-v0 with a SAC policy.

    The loop alternates between collecting environment steps and updating the
    models. Every update first trains the discriminator until both its expert
    and agent accuracies reach `discrim_req_acc`, then trains SAC on replay
    batches whose rewards are replaced by the discriminator-derived reward
    -log(1 - D(obs, act)).

    Args:
        discrim_req_acc: accuracy the discriminator must reach on both the
            expert batch and the agent batch before the policy is updated.
            Also used as a prefix of the log directory name.

    NOTE(review): `SAC_model`, `ReplayBuffer` and `rollout_trajectories` are
    not defined in this notebook; presumably they come from the star imports.
    """
    # --- Hyperparameters ---
    ENV_NAME = 'Pendulum-v0'
    env_fn = lambda: gym.make(ENV_NAME)
    hid = 128
    l = 2
    gamma = 0.999
    steps_per_epoch = 5000
    seed = 0
    epochs = 7
    max_ep_len = 200  # for reacher, 1000 for not.
    exp_name = ENV_NAME + '_Hidden_' + str(hid) + 'l_' + str(l)
    alpha = 0.2
    batch_size = 100
    lr = 1e-3
    load = False
    polyak = 0.995
    replay_size = int(1e6)
    hidden_sizes = [hid] * l
    # Added inside the reward's log so a discriminator output of exactly 1
    # gives a large finite reward instead of +inf.
    reward_eps = 1e-8

    tf.random.set_seed(seed)
    np.random.seed(seed)
    env, test_env = env_fn(), env_fn()
    # Get Env dimensions
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    SAC = SAC_model(env, obs_dim, act_dim, hidden_sizes, lr, gamma, alpha, polyak, load, exp_name)
    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Logging
    start_time = time.time()
    train_log_dir = 'logs/' + str(discrim_req_acc)+exp_name+':'+str(start_time)
    summary_writer = tf.summary.create_file_writer(train_log_dir)

    discriminator = mlp_gail_discriminator()
    discriminator_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    def update_models(model, replay_buffer, steps, batch_size):
        """Train the discriminator to the required accuracy, then SAC for `steps` steps."""
        # Both accuracies start at 0 so the loop condition never reads an
        # unassigned name, whatever the value of discrim_req_acc.
        expert_accuracy = 0
        agent_accuracy = 0
        # Train until the discriminator sufficiently distinguishes expert from
        # agent transitions.
        # NOTE(review): there is no iteration cap; if the threshold is never
        # reached this loop does not terminate.
        print('Updating Discriminator')
        while agent_accuracy < discrim_req_acc or expert_accuracy < discrim_req_acc:
            batch = replay_buffer.sample_batch(batch_size)
            expert_batch = sample_expert_transitions(batch_size)
            _, expert_accuracy, agent_accuracy = discriminator_train_step(batch, expert_batch, discriminator, discriminator_optimizer)
            print(expert_accuracy, agent_accuracy)

        # now update SAC
        print('Updating Policy')
        for _ in range(steps):
            batch = replay_buffer.sample_batch(batch_size)
            batch_obs, batch_acts = batch['obs1'], batch['acts']
            agent_probs = discriminator(batch_obs, batch_acts)
            agent_reward = -(tf.math.log(1 - agent_probs + reward_eps)).numpy().squeeze().astype('float32')
            # use GAIL reward instead of environment reward
            batch['rews'] = agent_reward

            model.train_step(batch)

    # now collect episodes
    total_steps = steps_per_epoch * epochs
    steps_collected = 0

    # collect some initial random steps to initialise
    random_steps = 5000
    steps_collected += rollout_trajectories(n_steps = random_steps,env = env, max_ep_len = max_ep_len, actor = 'random', replay_buffer = replay_buffer, summary_writer = summary_writer)

    update_models(SAC, replay_buffer, steps = random_steps, batch_size = batch_size)

    # now act with our actor, and alternately collect data, then train.
    while steps_collected < total_steps:
        # collect an episode
        steps_collected += rollout_trajectories(n_steps = max_ep_len,env = env, max_ep_len = max_ep_len, actor = SAC.get_action, replay_buffer = replay_buffer, summary_writer=summary_writer, current_total_steps = steps_collected)
        # take that many training steps
        update_models(SAC, replay_buffer, steps = max_ep_len, batch_size = batch_size)

        # if an epoch has elapsed, save and test.
        # NOTE(review): this only fires when steps_collected lands exactly on a
        # multiple of steps_per_epoch - confirm rollout_trajectories always
        # returns full episodes of max_ep_len steps.
        if steps_collected > 0 and steps_collected % steps_per_epoch == 0:
            #SAC.save_weights()
            # Test the performance of the deterministic version of the agent.
            rollout_trajectories(n_steps = max_ep_len*10,env = test_env, max_ep_len = max_ep_len, actor = SAC.get_deterministic_action, summary_writer=summary_writer, current_total_steps = steps_collected, train = False, render = True)


In [None]:
# Sweep the required discriminator accuracy: one full GAIL run per threshold.
test_accs = [0.6, 0.7, 0.8, 0.9]
for a in test_accs:
    gail_run(a)

In [None]:
# Okay, we need to do two ablations.
# What's the optimal accuracy that we want our discriminator to reach before we train the policy? It'll be different for every problem,
# but let's see if changing it on this simple problem can give us some idea of what will happen.

# How 

In [None]:
batch_size = 100
# Build the discriminator in this cell so it runs on a fresh kernel; no earlier
# cell defines a notebook-level `discriminator`.
discriminator = mlp_gail_discriminator()
expert_batch = sample_expert_transitions(batch_size)
batch_expert_obs, batch_expert_acts = expert_batch['obs'], expert_batch['acts']
expert_probs = discriminator(batch_expert_obs,batch_expert_acts)


# Reward that the policy update formula would assign to these expert pairs.
agent_reward = -(tf.math.log(1-expert_probs)).numpy().squeeze().astype('float32')
agent_reward

In [None]:
# Fresh, untrained discriminator with the default layer sizes, for interactive inspection.
discriminator = mlp_gail_discriminator()


In [None]:
# log(0) is -inf, so the negated value is +inf: the reward formula diverges at this point.
-(tf.math.log(0.0)).numpy()

In [None]:
# Same formula for a discriminator output of 0.01: -log(0.99), a small positive value.
-(tf.math.log(1-0.01)).numpy()

In [None]:
import matplotlib.pyplot as plt
# 100 evenly spaced values covering [0, 1], used as the x-axis of the curves plotted below.
xs = np.linspace(0,1,100)

In [None]:
def f(x):
    """Return -log(1 - x + 1e-8) as a NumPy value.

    The 1e-8 offset keeps the result finite at x == 1.
    """
    log_term = tf.math.log(1 - x + 1e-8)
    return -log_term.numpy()

In [None]:
# Evaluate f at every point of xs.
ys = [f(x) for x in xs]

In [None]:
# Plot the values in ys against xs.
plt.plot(xs,ys)

In [None]:
def f2(x):
    """Return log(x + 1e-8) - log(1 - x + 1e-8) as a NumPy value.

    The 1e-8 offsets keep both logarithms finite at x == 0 and x == 1.
    `.numpy()` is applied to the whole difference so the function returns a
    NumPy value rather than a Tensor.
    """
    return (tf.math.log(x+1e-8) - tf.math.log(1-x+1e-8)).numpy()
ys = [f2(x) for x in xs]
plt.plot(xs,ys)

In [None]:
def f3(x):
    """Return log(x + 1e-8) as a NumPy value; the offset keeps it finite at x == 0."""
    log_value = tf.math.log(x + 1e-8)
    return log_value.numpy()
ys = list(map(f3, xs))
plt.plot(xs, ys)

In [None]:
# 100 evenly spaced values from 1 to 100 (1.0, 2.0, ..., 100.0), used for indexing experiments.
x = np.linspace(1,100,100)


In [None]:
# np.delete returns a new array without the entries at indices 1, 2 and 3; x itself is not modified.
np.delete(x, [1,2,3])

In [None]:
# Build a (100, 1) tensor of ones, then convert it to NumPy and squeeze it to shape (100,).
a = tf.ones((100,1))
a = a.numpy().squeeze()

In [None]:
# Single-argument np.where returns a tuple of index arrays, one per dimension of `a`.
q = np.where(a < 2)
q

In [None]:
# np.where returned a tuple; inspect the shape of its first index array.
q[0].shape

In [None]:
# Small array for an indexing experiment.
a = np.array([11,21,31])

In [None]:
# Integer positions to use as an index array.
b = np.array([1,2])

In [None]:
# Integer-array indexing: selects the elements of `a` at the positions listed in `b`.
a[b]

In [None]:
# Quick modulo check: evaluates to 5.
5 % 1000

In [None]:
# TensorFlow has no `tf.tensor`; tf.constant builds a tensor from array-like data.
tf.constant([np.ones(100)])

In [None]:
# NOTE(review): at notebook level `env` is only assigned by a later cell
# (gym.make('Pendulum-v0')); on a fresh top-to-bottom run this cell raises NameError.
env

In [None]:
# Inspect the environment's action space.
# NOTE(review): at notebook level `env` is only assigned by a later cell
# (gym.make('Pendulum-v0')); on a fresh top-to-bottom run this cell raises NameError.
env.action_space

In [None]:
# Create a notebook-level Pendulum-v0 environment for interactive inspection.
env = gym.make('Pendulum-v0')

In [None]:
# Start a new episode; the cell displays whatever reset() returns.
env.reset()

In [None]:
# Take a single step with the action 1.
# NOTE(review): gail_run reads env.action_space.shape[0], so the action space is
# array-shaped; a bare scalar may not be accepted by step() - confirm, and pass
# an array-shaped action if it fails.
env.step(1)

In [None]:
# Display the current value of `a`.
a