In [83]:
import numpy as np
import gym
from seagul.nn import LinearNet
import torch
torch.set_default_dtype(torch.float64)

In [122]:
# The function we'll try to optimize 
env = gym.make('InvertedPendulum-v2')
def pend_rollout(policy):
    max_ep_length = 100
    reward_hist = torch.zeros((max_ep_length,1))
    action_hist = torch.zeros((max_ep_length,1))
    state_hist = torch.zeros((max_ep_length,4))
    obs = env.reset()
    for i in range(max_ep_length):
        action = policy(torch.as_tensor(obs))
        obs, reward, done, _ = env.step(action.detach())
        action_hist[i,:] = action.clone()
        state_hist[i,:] = torch.tensor(obs).clone()
        reward_hist[i,:] = reward
        #env.render()
        if done:
            break
            
    return reward_hist, action_hist, state_hist
    
    
def quad(x):
    return -np.power((x-10),2)

In [129]:
#https://arxiv.org/pdf/1803.07055.pdf

# Basic Random Search
def brs(func, step_size = 1, n_delta = 10, exp_noise = .03, num_trials = 1000):
    n_param = 4
    th = np.zeros((n_param,1))
    
    for _ in range(num_trials):
        delta = np.random.normal(0.0, exp_noise, (n_param, n_delta))
        returns_p = func(th + delta)
        returns_n = func(th - delta)
                
        #print(returns.std())
        th = th + np.array(step_size/(n_delta)*np.sum((returns_p-returns_n)*delta,1)).reshape(n_param,-1)
        
    return th



#Augmented Random Search V1 (divide update step by std of rewards)
def ars(func, step_size = 1, n_delta = 10, exp_noise = .03,num_trials = 1000):
    n_param = 4
    th = np.zeros((n_param,1))
    
    for _ in range(num_trials):
        delta = np.random.normal(0.0, exp_noise, (n_param, n_delta))
        _,_,returns_p = func(th + delta)
        _,_,returns_n = func(th - delta)
        returns = np.concatenate((returns_p, returns_n))
        
        
        #print(returns.std())
        th = th + np.array(step_size/(n_delta*returns.std()+1e-6)*np.sum((returns_p-returns_n)*delta,1)).reshape(n_param,-1)
        
    return th
        
    
#Augmented Random Search V2 (divide by std of rewards std of rewards, normalize states)
def ars_v2(func, step_size = 1, n_delta = 10, exp_noise = .03, num_trials = 1000):
    n_param = 4
    th = torch.zeros((n_param,1))
    s_mean = torch.zeros((n_param,1))
    s_stdv  = torch.ones((n_param,1))
    policy = LinearNet(n_param,1) 
    total_steps = 0 
    
    exp_dist = torch.distributions.Normal(torch.zeros(n_param),torch.ones(n_param)*exp_noise)
    
    for _ in range(num_trials):
        delta = exp_dist.sample().reshape(n_param,1)
        
        policy.layer.weight[0,:] = (th + delta).reshape(-1); 
        states_p,_,returns_p = func(policy)
        
        policy.layer.weight[0,:] = (th + delta).reshape(-1); 
        states_n,_,returns_n = func(policy)
        
        returns = torch.cat((returns_p, returns_n))
        states = torch.cat((states_p, states_n))
        
        ep_steps = states.shape[0]
        s_mean = (states.mean(0)*ep_steps + s_mean*total_steps)/(total_steps + ep_steps)
        s_stdv = (states.std(0)*ep_steps + s_stdv*total_steps)/(total_steps + ep_steps)
        total_steps += ep_steps
        
        policy.state_means = s_mean
        policy.state_var = s_stdv
        
        #print(returns.std())
        th = th + np.array(step_size/(n_delta*returns.std()+1e-6)*np.sum((returns_p-returns_n)*delta,1)).reshape(n_param,-1)
        
    return th
        

In [130]:
vec_env = lambda vec: np.array([pend_rollout(x) for x in vec])
policy  = ars_v2(pend_rollout)
print(policy)
print(pend_rollout(policy))

RuntimeError: The size of tensor a (100) must match the size of tensor b (4) at non-singleton dimension 0

In [61]:
policy = LinearNet(4,1)
policy.layer.weight[0,:]

tensor([-0.1275,  0.3000, -0.0990, -0.4775], grad_fn=<SliceBackward>)