# Models
- Trained for 2,000,000 time steps (took 6601 seconds) using environment and procedure documented in `ad6f5817258303e2e092b4fbdb4fd0dc9356373b`
- Trained for 100,000 time steps (took 251 seconds) using environment 3 and procedure documented in `c4b74d2e110af467a0e0af745b4d31d6675bcc44`
- Small network (8-8) trained for 100,000 time steps (took 330 seconds) using environment 3 and procedure documented in `830eb4bf42dfbfc5b809ad59ce94335557a352f3`
- Small network (8-8) further trained for 400,000 time steps using procedure documented in `b6631ec96d9f604dc2f69fc393a47a25bfd68505`
- Small network (8-8) trained for 1,400,000 time steps (took 4,885 seconds) using environment 3 with modified rewards and procedure documented in `be827b0d3deb94789fd8adcc8d294afbfb015876` (evaluation over 30,000 steps reports perfect results, i.e. reward 1.0 ± 0)

In [1]:
from stable_baselines3 import PPO
import torch

In [2]:
class OnnxableActionPolicy(torch.nn.Module):
    """Action head of a policy, wrapped for ONNX export.

    The forward pass runs ``extractor``, feeds its first result through
    ``action_net`` and clamps each of the two action components to
    ``[-1, 1]``. The clamp is expressed as ``Linear -> ReLU -> Linear`` with
    fixed weights instead of a clip call, using the identity

        clamp(x, -1, 1) = relu(x) - relu(-x) - relu(x - 1) + relu(-x - 1)

    Args:
        extractor: Callable returning ``(action_hidden, value_hidden)`` for
            an observation.
        action_net: Maps ``action_hidden`` to the raw action; it must produce
            2 features, because the clamp layers are built for 2 inputs.
        value_net: Stored on the module but not used by ``forward``.
    """

    def __init__(self, extractor, action_net, value_net):
        super().__init__()
        self.extractor = extractor
        self.action_net = action_net
        self.value_net = value_net

        # First layer: four ReLU pre-activations per action component,
        # x, -x, x - 1 and -x - 1 (rows 0-3 for component 0, 4-7 for 1).
        expand = torch.nn.Linear(2, 8)
        # Second layer: recombine the four terms of each component.
        combine = torch.nn.Linear(8, 2)
        scale = 1.0  # output scale of the clamp (1 keeps the range [-1, 1])

        # copy_ checks shapes, so a wrongly sized constant fails loudly
        # instead of silently relying on broadcasting.
        with torch.no_grad():
            expand.weight.copy_(torch.tensor([
                [1.0, 0.0], [-1.0, 0.0], [1.0, 0.0], [-1.0, 0.0],
                [0.0, 1.0], [0.0, -1.0], [0.0, 1.0], [0.0, -1.0],
            ]))
            expand.bias.copy_(
                torch.tensor([0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -1.0, -1.0])
            )
            combine.weight.copy_(torch.tensor([
                [scale, -scale, -scale, scale, 0.0, 0.0, 0.0, 0.0],
                [0.0, 0.0, 0.0, 0.0, scale, -scale, -scale, scale],
            ]))
            # One zero per output feature (the layer has 2 outputs).
            combine.bias.zero_()

        self.normalizer = torch.nn.Sequential(
            expand,
            torch.nn.ReLU(),
            combine,
        )

    def forward(self, observation):
        """Return the clamped action for ``observation``.

        NOTE: You may have to process (normalize) the observation in the
        correct way before using this. See
        ``common.preprocessing.preprocess_obs``.
        """
        # Only the action branch is returned; the value branch of the
        # extractor output is ignored and ``value_net`` is not evaluated.
        action_hidden, _value_hidden = self.extractor(observation)
        raw_action = self.action_net(action_hidden)
        return self.normalizer(raw_action)

In [3]:
# Restore the trained PPO agent from its backup archive and keep the policy on
# the CPU. (For a quick experiment an untrained model works as well, e.g.
# model = PPO("MlpPolicy", "Pendulum-v0").)
model = PPO.load("model_backup/zeppelin-avoidance-windsystem-small2-1400000-1000000-0.5")
model.policy.to("cpu")

# Wrap the policy's sub-networks in the ONNX-exportable module.
onnxable_model = OnnxableActionPolicy(
    extractor=model.policy.mlp_extractor,
    action_net=model.policy.action_net,
    value_net=model.policy.value_net,
)

In [4]:
# Trace the wrapper with a random dummy observation (batch of 1, 4 features)
# and write the ONNX graph to disk.
dummy_input = torch.randn(1, 4)
torch.onnx.export(
    onnxable_model,
    dummy_input,
    "zeppelin-avoidance-small2-1400000-retrain-1000000-0.5.onnx",
    opset_version=9,
    # Name the graph output at export time so it does not end up with an
    # auto-generated, purely numeric name. `input_names` is deliberately left
    # unset so the input name is not changed by this call.
    output_names=["out1"],
)

In [5]:
##### Load and test with onnx

import onnx
import onnxruntime as ort
import numpy as np

In [6]:
# Reload the exported graph and let ONNX validate it.
onnx_model = onnx.load("zeppelin-avoidance-small2-1400000-retrain-1000000-0.5.onnx")
onnx.checker.check_model(onnx_model)

# Placeholder observation: one row of 4 float32 features, the same shape as
# the dummy input the model was exported with.
observation = np.zeros((1, 4), dtype=np.float32)
# Inference session used to run the exported model.
ort_sess = ort.InferenceSession("zeppelin-avoidance-small2-1400000-retrain-1000000-0.5.onnx")

In [7]:
# Feed one hand-picked observation (4 features) to the graph input 'input.1'
# and print every output the session returns.
sample_observation = [[-20.24999277597358, 130.0, 20., 30.]]
session_outputs = ort_sess.run(None, {'input.1': sample_observation})
print(session_outputs)

[array([[ 1., -1.]], dtype=float32)]


# Rename the output nodes so that their names are not purely numeric!

In [8]:
# Load the exported model again so its graph can be edited in memory.
exported_model_path = 'zeppelin-avoidance-small2-1400000-retrain-1000000-0.5.onnx'
onnx_model = onnx.load(exported_model_path)

In [9]:
# Give the first (declared) graph output the name "out1".
first_graph_output = onnx_model.graph.output[0]
first_graph_output.name = "out1"

In [10]:
# Point the first output of the last node in the graph at the same name, so
# the node's output and the declared graph output agree.
last_node = onnx_model.graph.node[-1]
last_node.output[0] = "out1"

In [11]:
# Same assignment as the `node[-1]` cell before it, addressing the last node
# again: sets its first output name to "out1" (repeating it changes nothing).
graph_nodes = onnx_model.graph.node
graph_nodes[-1].output[0] = "out1"

In [12]:
# Display the name of the first graph input; this is the key that has to be
# used in the feed dict passed to the inference session.
onnx_model.graph.input[0].name

'input.1'

In [13]:
# Write the edited model back to the same file it was loaded from,
# overwriting the original export.
renamed_model_path = 'zeppelin-avoidance-small2-1400000-retrain-1000000-0.5.onnx'
onnx.save(onnx_model, renamed_model_path)