This uses the OpenAI Gym baselines with some slight modifications. The code is mostly copied from
https://github.com/openai/baselines/tree/master/baselines/ppo1. It looks like the version I got from pip and the version currently on master don't quite sync up (for example, there is no tf_util.save_state function, so we save manually).

In [1]:
from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser
from baselines.common import tf_util as U
import tensorflow as tf
from baselines import logger
import os
import sys
from baselines.ppo1 import mlp_policy, pposgd_simple

import gym
import seagul.envs
import policies.mlp_relu_policy as mlp_relu_policy

import numpy as np
from mpl_toolkits import mplot3d
%matplotlib ipympl
import matplotlib.pyplot as plt
import itertools

#Needed for saving 
import errno, datetime, time, inspect

def train(env_id, num_timesteps, seed=0):
    """Train a PPO policy on a gym environment and return it.

    Enters a TensorFlow session, builds the environment with ``gym.make``
    and runs ``pposgd_simple.learn`` with a 4-hidden-layer ReLU MLP policy.

    Args:
        env_id: environment id passed to ``gym.make``.
        num_timesteps: forwarded to ``pposgd_simple.learn`` as
            ``max_timesteps``.
        seed: seed applied to NumPy, TensorFlow and the environment so that
            repeated runs with the same seed are comparable.

    Returns:
        The policy object returned by ``pposgd_simple.learn``.
    """
    # The session is entered and never exited here, so it remains the default
    # session after this function returns; later cells rely on that when they
    # call ``.eval()`` and ``tf.get_default_session()``.
    U.make_session(num_cpu=16).__enter__()

    # ``seed`` used to be accepted but silently ignored. Seed the random
    # sources before the policy graph and the environment are created.
    np.random.seed(seed)
    tf.set_random_seed(seed)

    def policy_fn(name, ob_space, ac_space):
        # Stock alternative from baselines: mlp_policy.MlpPolicy(...)
        return mlp_relu_policy.ReluMlpPolicy(
            name=name,
            ob_space=ob_space,
            ac_space=ac_space,
            hid_size=64,
            num_hid_layers=4,
        )

    env = gym.make(env_id)
    env.seed(seed)
    try:
        pi = pposgd_simple.learn(
            env,
            policy_fn,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2,
            entcoeff=0.0,
            optim_epochs=10,
            optim_stepsize=3e-4,
            optim_batchsize=64,
            gamma=0.99,
            lam=0.95,
            schedule='linear',
        )
    finally:
        # Release the environment even if training raises.
        env.close()

    return pi



def save_results(filename, description = None):
    """Save the results of a training run under ``data/<filename>/``.

    Writes a ``README.txt`` with the run metadata, saves the variables of
    the default TensorFlow session with ``tf.train.Saver`` and moves
    ``./tmp_logs/`` to ``data/<filename>/tensorboard``.

    The run metadata is read from the notebook globals ``env_name``,
    ``num_timesteps``, ``seed`` and ``runtime``, which are set by the cell
    that calls ``train`` in training mode.

    Args:
        filename: name of the run; used both as the directory name under
            ``data/`` and as the checkpoint prefix inside it.
        description: free-text description stored in the README. When
            ``None`` the user is prompted for one.

    Raises:
        NameError: if any of the required notebook globals is undefined.
            This is raised before anything is written to disk.
        FileExistsError: if ``data/<filename>/`` already exists.
    """
    # Read every notebook-level global first. Previously these were read
    # after the directory had been created, so a missing global left a
    # half-written run directory that blocked a retry under the same name.
    run_env_name = env_name
    run_num_timesteps = num_timesteps
    run_seed = seed
    runtime_str = str(datetime.timedelta(seconds = runtime))

    save_dir = "data/" + filename + "/"
    os.makedirs(save_dir)

    if description is None:
        description  = input("please enter a description of the run")

    datetime_str = str(datetime.datetime.today()).replace(" ", "_")

    # The context manager closes the file even if a write fails.
    with open(save_dir + "README.txt", "w+") as readme:
        readme.write("datetime: " + datetime_str + "\n\n")
        readme.write("enviroment: " + run_env_name + "\n\n")
        readme.write("description: " + description + "\n\n")
        readme.write("time_elapsed: " + runtime_str + "\n\n")
        readme.write("num_timesteps: " + str(run_num_timesteps) + "\n\n")
        readme.write("seed: " + str(run_seed) + "\n\n")

    # TODO add code snippets that correspond to the run
    # TODO somehow store the tensorboard logs here after the fact

    saver = tf.train.Saver()
    saver.save(tf.get_default_session(), save_dir + filename)

    os.rename("./tmp_logs/", save_dir + "tensorboard")
   
   
# Gym environment id used by the cells below (training, playback) and
# written to the README by save_results. Leave exactly one line uncommented.
#env_name = "Acrobot-v1"
env_name = "InvertedPendulum-v2"
#env_name = 'InvertedPendulumPyBulletEnv-v0'
#env_name = "su_cartpole_et-v0"
#env_name = "InvertedDoublePendulum-v2"


Logging to /var/folders/qq/gpxz4l6s1tndfdhysbz8bdym0000gn/T/openai-2018-12-06-18-02-10-667489


  from ._conv import register_converters as _register_converters


In [2]:
# Toggle between loading saved weights (True) and training from scratch
# (False) by commenting out one of the two lines below.
load_pretrained_network = True
#load_pretrained_network = False


if not load_pretrained_network:
    # Run the RL algorithm and time it; these globals are what
    # save_results later writes into the README.
    num_timesteps = 2e6
    seed = 0

    print("training")

    start_time = time.time()

    logger.configure(dir = "./tmp_logs", format_strs=["tensorboard"] )
    with tf.device("/cpu:0"):
        pi = train(env_name, num_timesteps=num_timesteps, seed=seed)

    runtime = time.time() - start_time

else:
    # Load the weights: run train for a single timestep to obtain a policy
    # object, then overwrite its variables with the saved ones.
    save_name = 'invertedpendulum_3layer'

    pi = train(env_name, num_timesteps=1, seed=0)

    # TODO eventually need to switch to .load_variables() instead of U.load_state() but this didn't work by default for me
    weights_path = os.getcwd() + '/data/' + save_name + '/' + save_name
    U.load_state(weights_path)



********** Iteration 0 ************
Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     -0.00218 |       0.00000 |      24.90593 |       0.00011 |       1.41553
     -0.01169 |       0.00000 |      21.33840 |       0.00210 |       1.40883
     -0.01960 |       0.00000 |      13.73937 |       0.01204 |       1.40406
     -0.02225 |       0.00000 |       6.67395 |       0.01329 |       1.40038
     -0.02368 |       0.00000 |       4.95351 |       0.01354 |       1.39781
     -0.02464 |       0.00000 |       4.55235 |       0.01399 |       1.39412
     -0.02576 |       0.00000 |       4.32681 |       0.01549 |       1.39097
     -0.02665 |       0.00000 |       4.23610 |       0.01673 |       1.38777
     -0.02737 |       0.00000 |       4.19393 |       0.01652 |       1.38482
     -0.02766 |       0.00000 |       4.23136 |       0.01757 |       1.38171
Evaluating losses...
     -0.02956 |       0.00000 |       4.17589 |       0.01711 |       1

In [None]:
# Plays out a trained policy

#env = make_mujoco_env(env_name,seed=0)
env = gym.make(env_name)
ob = env.reset()

# The loop runs until the kernel is interrupted. The try/finally makes sure
# the environment is closed when that happens instead of being left open.
try:
    while True:
        # Deterministic action; pi.act returns a sequence whose first item
        # is the action.
        action = pi.act(stochastic=False, ob=ob)[0]
        ob, _, done, _ = env.step(action)
        #if reward == 1:
        #    print("balanced")
        env.render()
        if done:
            ob = env.reset()
finally:
    env.close()

#U.save_state("./saved/5mil_flat")

In [7]:
# Inspect one of the policy's variables. The index is a hard-coded position
# in the list returned by pi.get_variables().
# NOTE(review): the displayed value is a 1-D array of 64 entries, so index 4
# looks like a bias vector rather than a weight matrix - confirm.
all_weights = pi.get_variables()
fc1 = all_weights[4]
fc1_weights = fc1.value()
# Last expression of the cell: evaluates the tensor in the default session
# so the notebook displays its values.
fc1_weights.eval() # only positive weights contribute anything


array([-0.24696875, -0.23682539, -0.16115585, -0.2555738 , -0.21993473,
       -0.21259777, -0.19329132, -0.2441551 , -0.19577181,  0.13270849,
        0.32211903, -0.33642697,  0.19466048,  0.16979927,  0.28151765,
       -0.19948047,  0.18644999,  0.22629759,  0.24678789, -0.17854922,
        0.1880423 ,  0.21452071, -0.0797731 , -0.26142293,  0.25687432,
       -0.22584982, -0.10202789, -0.19495583, -0.18868645,  0.22214413,
       -0.26131016, -0.28009197,  0.2441879 , -0.11879358,  0.20997407,
       -0.30982113,  0.3346755 , -0.27933925,  0.30527967,  0.16208059,
       -0.14038529,  0.32242414,  0.22511746,  0.30295438,  0.24446884,
       -0.28062847,  0.248032  , -0.2782329 , -0.1831958 ,  0.21735889,
        0.32385787, -0.23780961, -0.22269765, -0.22899884, -0.17545885,
       -0.29416275,  0.27586707, -0.3017498 ,  0.27145833,  0.24073283,
        0.20933826,  0.28276113,  0.33492553,  0.23656006], dtype=float32)

In [None]:
# Disabled alternative: sweep all four observation entries at once.
#input_iter = itertools.combinations_with_replacement(range(-10,11),4)
#input_data = np.array([np.array(x) for x in input_iter],dtype='float32')
#output_data = np.array([pi.act(0, x)[0] for x in input_data],dtype='float32')

# Hold the first two observation entries at zero and sweep the last two over
# integer pairs from [-10, 10]. combinations_with_replacement yields each
# unordered pair once (a <= b), not the full square grid.
input_iter = itertools.combinations_with_replacement(range(-10, 11), 2)
input_data = np.array([[0, 0, a, b] for a, b in input_iter], dtype='float32')

# Query the policy (first argument 0, i.e. stochastic off) for every row and
# keep the first element of what pi.act returns.
output_data = np.array([pi.act(0, row)[0] for row in input_data], dtype='float32')

In [None]:
# Fit an ordinary least-squares linear model to the policy's input/output
# samples (input_data / output_data from the sampling cell) and report how
# well it reproduces them.
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
regr = linear_model.LinearRegression()

# output_data is flattened to a 1-D target vector before fitting.
regr.fit(input_data,output_data.T.flatten())
#regr.fit(input_data,output_data)
#regr.fit(index, output_data.T.flatten())

# Predictions of the linear model on the same inputs it was fitted on.
lin_predict = regr.predict(input_data)
#lin_predict = regr.predict(index)

print("coefs are", regr.coef_)
# NOTE(review): sklearn documents the argument order as (y_true, y_pred); the
# arguments are swapped here, which should not change the MSE value - confirm.
print("mean sqared error:", mean_squared_error(lin_predict, output_data))

In [None]:
# Plot the policy outputs and the linear-fit predictions in two separate
# figures, both against sample index.
plt.plot(output_data)
plt.figure()
plt.plot(lin_predict)

In [None]:
# One-dimensional sweeps of the policy: vary a single observation entry over
# the integers -10..10 while the other three stay at zero, and plot the
# policy output against the swept entry. A new figure is opened after each
# of these four plots.
sweep = range(-10, 11)

for axis in range(4):
    input_data = np.array([[v if k == axis else 0 for k in range(4)] for v in sweep])
    output_data = np.array([pi.act(0, row)[0] for row in input_data])
    plt.plot(input_data[:, axis], output_data)
    plt.figure()

# Final sweep: all four entries vary together; plotted in the figure opened
# by the last loop iteration.
input_data = np.array([[v, v, v, v] for v in sweep])
output_data = np.array([pi.act(0, row)[0] for row in input_data])
plt.plot(input_data[:, 0], output_data)

In [None]:
# Coordinates for the 3-D surface plots: the third and fourth observation
# entries as x/y, the policy output as z and the linear-fit prediction as z2.
# NOTE(review): input_data and output_data are reassigned by several cells,
# while lin_predict has one entry per row of the input_data used in the
# regression cell. Run this right after the sampling/regression cells so all
# four arrays have the same length - confirm execution order.
x = input_data[:,2]
y = input_data[:,3]
z= output_data.flatten()
z2 = lin_predict.flatten()

In [None]:
# Surface of the policy output over (x, y). The trailing semicolons suppress
# the notebook's echo of the returned object.
ax = plt.axes(projection='3d')
ax.plot_trisurf(x, y, z, cmap='viridis', edgecolor='none');

# Surface of the linear-fit prediction over the same (x, y).
# NOTE(review): no plt.figure() is called between the two plt.axes calls, so
# both surfaces target the current figure; whether they end up overlaid on
# one axes or on stacked axes depends on the matplotlib version - confirm
# this is the intended comparison.
ax2 = plt.axes(projection='3d')
ax2.plot_trisurf(x, y, z2, cmap='viridis', edgecolor='none');

In [None]:
import scipy.io

# Export three kernel/bias pairs of the policy to a MATLAB .mat file.
all_weights = pi.get_variables()

# I'll fix this later..
# Positions 13..18 of pi.get_variables() are read in order as
# kernel0, bias0, kernel1, bias1, kernel2, bias2.
# NOTE(review): these indices are hard-coded and depend on the policy's
# variable layout - confirm they still match the network being exported.
kernel0, bias0, kernel1, bias1, kernel2, bias2 = (
    all_weights[idx].value().eval() for idx in range(13, 19)
)

name_dict = dict(
    kernel0=kernel0,
    kernel1=kernel1,
    kernel2=kernel2,
    bias0=bias0,
    bias1=bias1,
    bias2=bias2,
)
scipy.io.savemat("savemat_test", name_dict)