In [2]:
import pickle as pcl
import numpy as np
import gym
# Create the environment registered under the id "Marvin-v0".
# NOTE(review): nothing visible in this notebook registers "Marvin-v0";
# presumably a custom environment package must be installed/registered
# before this call succeeds - confirm.
env = gym.make("Marvin-v0")

In [3]:
# Read the pre-trained weights from disk; the result is later passed to
# NN.set_weights.
# NOTE(review): pickle.load can execute arbitrary code while deserialising,
# so only open "trained_marvin.pcl" when it comes from a trusted source.
with open("trained_marvin.pcl", 'rb') as f:
    nn_weights = pcl.load(f)

In [4]:
class NN:
    """Bias-free feed-forward network that applies ``tanh`` after every layer.

    ``weights`` is a list with one matrix per layer transition; matrix ``k``
    has shape ``(layer_sizes[k + 1], layer_sizes[k])``.
    """

    def __init__(self, layer_sizes, seed=None):
        """Create the network with all-zero weight matrices.

        Args:
            layer_sizes: sequence of layer widths, input layer first.
            seed: if not None, seeds NumPy's *global* random generator, which
                is the generator ``sample_like`` draws from.

        NOTE(review): every weight starts at exactly zero, so ``predict``
        returns zeros until ``set_weights`` is called - confirm that a small
        random initialisation was not intended here.
        """
        if seed is not None:
            np.random.seed(seed)
        transitions = zip(layer_sizes[:-1], layer_sizes[1:])
        self.weights = [np.zeros((n_out, n_in)) for n_in, n_out in transitions]

    def predict(self, X):
        """Propagate ``X`` through every layer and return the final activation.

        Returns a plain Python scalar when the result is a one-dimensional
        array holding a single element; otherwise returns the array itself.
        """
        activation = X
        for layer in self.weights:
            activation = np.tanh(activation @ layer.T)
        if activation.shape[0] == 1 and activation.ndim == 1:
            return activation.item()
        return activation

    def set_weights(self, weights, copy=False):
        """Replace the weight list.

        With ``copy=False`` the given list object is stored as-is (shared with
        the caller); with ``copy=True`` each matrix is copied into a new list.
        """
        self.weights = [np.copy(w) for w in weights] if copy else weights

    def get_weights(self, copy=False):
        """Return the weight list itself, or a list of copies if ``copy``."""
        if not copy:
            return self.weights
        return [np.copy(w) for w in self.weights]

    def sample_like(self, sigma=1):
        """Draw Gaussian noise shaped like each weight matrix, scaled by ``sigma``.

        Samples come from NumPy's global generator, one matrix at a time in
        layer order.
        """
        noise = []
        for w in self.weights:
            noise.append(np.random.randn(*w.shape) * sigma)
        return noise

In [5]:
def render_env(model, env, max_iter=None, verbose=True):
    """Run one rendered episode of ``env`` with actions chosen by ``model``.

    Args:
        model: object exposing ``predict(observation)`` that returns the
            action to pass to ``env.step``.
        env: environment exposing ``reset()``, ``render()`` and ``step(action)``
            where ``step`` returns a 4-tuple
            ``(observation, reward, done, info)``.
        max_iter: optional cap on the number of steps; ``None`` means run
            until the environment reports ``done``.
        verbose: when true, print a one-line summary after the episode.

    Returns:
        None. The episode summary is only printed.
    """
    observation = env.reset()
    done = False
    i = 0
    r_sum = 0
    while not done and (max_iter is None or i < max_iter):
        env.render()
        # Query the model that was passed in rather than a module-level
        # global, so the function works with any model object.
        action = model.predict(observation)
        observation, reward, done, _ = env.step(action)
        i += 1
        r_sum += reward
    if verbose:
        print(f"Episode end after {i} iterations with reward = {r_sum} and done status {done}")

In [6]:
# Build a 24 -> 24 -> 4 network and install the weights loaded from the pickle.
# NOTE(review): the layer sizes presumably match the environment's observation
# and action dimensions and the shapes stored in nn_weights - confirm.
nn = NN([24, 24, 4])
# copy defaults to False, so the network holds the same list object as nn_weights.
nn.set_weights(nn_weights)

In [7]:
# Play one rendered episode with the trained network, capped at 2000 steps.
render_env(nn, env, max_iter=2000)

Episode end after 1724 iterations with reward = 218.31647545943642 and done status True
