In [226]:
import numpy as np
import gym
from seagul.nn import LinearNet

In [174]:
# Environment and rollout used as the objective we'll try to optimize
env = gym.make('InvertedPendulum-v2')
def pend_rollout(policy, max_ep_length = 100):
    """Run one episode on the module-level ``env`` and record per-step rewards.

    Args:
        policy: Either a callable mapping an observation to an action, or an
            array of linear-policy weights with one row per observation
            entry (e.g. shape ``(obs_dim,)`` or ``(obs_dim, act_dim)``). For
            weights the action is computed as ``weights.T @ obs``.
        max_ep_length: Maximum number of environment steps to take.

    Returns:
        An ``(max_ep_length, 1)`` array of rewards. Rows after an early
        termination (``done``) are left at zero.
    """
    if callable(policy):
        act = policy
    else:
        # Callers in this notebook pass raw weight arrays; treat them as a
        # linear policy instead of failing with "ndarray is not callable".
        weights = np.asarray(policy)

        def act(obs):
            flat_obs = np.ravel(obs)
            return weights.reshape(flat_obs.size, -1).T @ flat_obs

    reward_hist = np.zeros((max_ep_length, 1))
    obs = env.reset()
    for i in range(max_ep_length):
        obs, reward, done, _ = env.step(act(obs))
        reward_hist[i, :] = reward
        if done:
            break

    return reward_hist
    
    
def quad(x):
    """Concave test objective: the negated square of ``x - 10``, elementwise."""
    shifted = x - 10
    squared = np.power(shifted, 2)
    return -squared

In [207]:
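# Reference: Mania, Guy & Recht, "Simple random search provides a competitive approach to reinforcement learning" (2018), https://arxiv.org/pdf/1803.07055.pdf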

# Basic Random Search
def brs(func, step_size = 1, n_delta = 10, exp_noise = .03, num_trials = 1000, n_param = 4):
    """Maximize ``func`` with Basic Random Search.

    Each trial draws ``n_delta`` Gaussian perturbations, evaluates ``func`` at
    ``th + delta`` and ``th - delta``, and moves ``th`` along the
    perturbations weighted by the difference of the two evaluations.

    Args:
        func: Callable taking an ``(n_param, n_delta)`` array whose columns
            are candidate parameter vectors. Its result must broadcast
            against that array, e.g. one return per column with shape
            ``(n_delta,)``.
        step_size: Scale applied to the averaged update.
        n_delta: Number of perturbation directions per trial.
        exp_noise: Standard deviation of the Gaussian perturbations.
        num_trials: Number of update iterations.
        n_param: Dimension of the parameter vector. Defaults to 4, the value
            that used to be hard-coded.

    Returns:
        The final parameters as an ``(n_param, 1)`` array.
    """
    th = np.zeros((n_param, 1))

    for _ in range(num_trials):
        delta = np.random.normal(0.0, exp_noise, (n_param, n_delta))
        returns_p = np.asarray(func(th + delta))
        returns_n = np.asarray(func(th - delta))

        # Directions that scored better on the "+" side pull th toward them,
        # directions that scored better on the "-" side push it away.
        update = np.sum((returns_p - returns_n) * delta, axis=1)
        th = th + (step_size / n_delta * update).reshape(n_param, -1)

    return th



#Augmented Random Search V1 (divide update step by std of rewards)
def ars(func, step_size = 1, n_delta = 10, exp_noise = .03,num_trials = 1000, n_param = 4):
    """Maximize ``func`` with random search, scaling each step by the return std.

    Works like ``brs`` except that the update is divided by the standard
    deviation of all returns collected in the trial (plus a small constant to
    avoid division by zero).

    Args:
        func: Callable taking an ``(n_param, n_delta)`` array of candidate
            parameter columns and returning a 3-tuple. Only the third element
            (the returns) is used here; it must broadcast against the
            perturbation array, e.g. shape ``(n_delta,)``.
        step_size: Scale applied to the normalized update.
        n_delta: Number of perturbation directions per trial.
        exp_noise: Standard deviation of the Gaussian perturbations.
        num_trials: Number of update iterations.
        n_param: Dimension of the parameter vector. Defaults to 4, the value
            that used to be hard-coded.

    Returns:
        The final parameters as an ``(n_param, 1)`` array.
    """
    th = np.zeros((n_param, 1))

    for _ in range(num_trials):
        delta = np.random.normal(0.0, exp_noise, (n_param, n_delta))
        _, _, returns_p = func(th + delta)
        _, _, returns_n = func(th - delta)
        returns_p = np.asarray(returns_p)
        returns_n = np.asarray(returns_n)
        returns = np.concatenate((returns_p, returns_n))

        # The 1e-6 keeps the scale finite when every return is identical.
        scale = step_size / (n_delta * returns.std() + 1e-6)
        update = np.sum((returns_p - returns_n) * delta, axis=1)
        th = th + (scale * update).reshape(n_param, -1)

    return th
        
    
#Augmented Random Search V2 (divide by std of rewards, normalize states)
def ars_v2(func, step_size = 1, n_delta = 10, exp_noise = .03, num_trials = 1000, n_param = 4):
    """Random search with return-std scaling and running state statistics.

    The parameter update is the same as in ``ars``. In addition, a running
    mean and standard deviation of the visited states is maintained across
    trials, weighted by the number of state rows seen so far.

    NOTE(review): the running statistics are only tracked here. ``func`` is
    called with the parameters alone, so nothing feeds the statistics back
    into the rollouts yet -- confirm how the policy is meant to receive them.

    Args:
        func: Callable taking an ``(n_param, n_delta)`` array of candidate
            parameter columns and returning a 3-tuple. The first element is
            treated as an array of states stacked along axis 0, the second is
            ignored, and the third holds the returns (broadcastable against
            the perturbation array, e.g. shape ``(n_delta,)``).
        step_size: Scale applied to the normalized update.
        n_delta: Number of perturbation directions per trial.
        exp_noise: Standard deviation of the Gaussian perturbations.
        num_trials: Number of update iterations.
        n_param: Dimension of the parameter vector. Defaults to 4, the value
            that used to be hard-coded.

    Returns:
        The final parameters as an ``(n_param, 1)`` array.
    """
    th = np.zeros((n_param, 1))
    # Running statistics take the shape of states.mean(0) on the first batch,
    # so no state dimension has to be hard-coded.
    s_mean = None
    s_std = None
    total_steps = 0

    for _ in range(num_trials):
        delta = np.random.normal(0.0, exp_noise, (n_param, n_delta))

        states_p, _, returns_p = func(th + delta)
        states_n, _, returns_n = func(th - delta)
        returns_p = np.asarray(returns_p)
        returns_n = np.asarray(returns_n)

        returns = np.concatenate((returns_p, returns_n))
        states = np.concatenate((states_p, states_n))

        ep_steps = states.shape[0]
        if ep_steps > 0:
            batch_mean = states.mean(0)
            batch_std = states.std(0)
            if total_steps == 0:
                s_mean, s_std = batch_mean, batch_std
            else:
                seen = total_steps + ep_steps
                s_mean = (batch_mean * ep_steps + s_mean * total_steps) / seen
                # Step-weighted average of per-batch stds: an approximation,
                # not the exact pooled standard deviation.
                s_std = (batch_std * ep_steps + s_std * total_steps) / seen
            total_steps += ep_steps

        # The 1e-6 keeps the scale finite when every return is identical.
        scale = step_size / (n_delta * returns.std() + 1e-6)
        update = np.sum((returns_p - returns_n) * delta, axis=1)
        th = th + (scale * update).reshape(n_param, -1)

    return th
        
        
        
        
    
    
    

In [208]:
def linear_policy(weights):
    """Wrap linear weights as a callable policy: action = weights . obs.

    pend_rollout invokes its argument as ``policy(obs)``, so raw weight
    arrays have to be wrapped before being handed to it.
    Assumes ``obs`` is a 1-D array with as many entries as ``weights``.
    """
    flat = np.ravel(weights)
    return lambda obs: np.atleast_1d(flat @ obs)


def vec_env(vec):
    """Score every column of ``vec`` with one rollout.

    Returns one total episode reward per column, which is the per-candidate
    value that brs multiplies against its perturbations.
    """
    return np.array([pend_rollout(linear_policy(w)).sum() for w in vec.T])


policy = brs(vec_env)
print(policy)
print(pend_rollout(linear_policy(policy)).sum())

[[-0.30318754]
 [ 0.33341854]
 [ 0.48657796]
 [ 1.00478189]]
100.0


In [None]:
# Smoke-test a rollout with random linear weights. pend_rollout invokes its
# argument as policy(obs), so the weights are wrapped in a callable here.
rand_weights = np.random.rand(4, 1)
pend_rollout(lambda obs: rand_weights.T @ obs)