In [1]:
import gym3
from gym3 import types_np
import numpy as np
from procgen import ProcgenGym3Env

import matplotlib.pyplot as plt

In [2]:
# Train several agents in parallel and save weights at intervals
#
# This part of the cell holds the run configuration and (unless `continuing`)
# allocates all training state: the environment, the weight arrays, the
# per-episode feature/action buffers and the episode counters.

run_name = "test"

alpha = 1 # Learning rate. This does not influence training because training is under a random policy, so set it to one

agent_healths = np.array([1,2,3,4]) # Training agent healths to use
Nhealths = agent_healths.shape[0]
Nagents = 10 # Number of agents to train at each health
max_episodes = 500000

continuing = False # Set to True to continue training for longer (increase max_episodes before running)

if not continuing:

    # Log-spaced episode counts at which weights are snapshotted. The upper end
    # follows max_episodes so the two cannot drift apart; np.unique removes the
    # duplicates that rounding creates at the low end.
    save_points = np.unique(np.round(np.logspace(0, np.log10(max_episodes), 110)))
    Nints = save_points.shape[0]
    save_ind = np.ones(Nagents) # Next slot of ws to write, per agent; slot 0 is left as the all-zero initial weights

    T=100 # Maximum episode length. N.B. this is currently hard-coded in the C++ code and cannot be changed by changing this constant
    Nfeats = 6720 # Input feature dimension (35 rows x 64 columns x 3 channels of the observation)

    env = ProcgenGym3Env(num=Nagents, env_name="bossfight2", agent_health=5, use_backgrounds=False, restrict_themes=True) # N.B. the agent_health argument is irrelevant--we do not use the returns computed by the environment/cpp code

    w = np.zeros((Nagents, Nhealths, Nfeats)) # Current weights, one vector per (agent, training health)
    ws = np.zeros((Nagents, Nints+1, Nhealths, Nfeats)) # Weight snapshots; index 0 is the untrained (zero) state
    y = np.zeros((Nagents, T+2)) # Per-step action sign (+1 left, -1 right) of the current episode
    X = np.zeros((Nagents, Nfeats, T+2)) # Per-step feature vectors of the current episode
    acts = np.zeros(Nagents) # Action codes handed to the environment
    a = np.zeros(Nagents) # Raw random draw behind each action
    step = np.zeros(Nagents) # Step index within the current episode, per agent

    total_episodes = np.zeros(Nagents)
    successful_episodes = np.zeros((Nagents, Nhealths))
    cumulative_rew = np.zeros(Nagents)

elif "ws" not in globals():
    # Continuing needs the state left behind by an earlier run of this cell;
    # fail here with a clear message instead of a NameError further down.
    raise RuntimeError("continuing=True requires training state from a previous run of this cell; run once with continuing=False first")


# Training loop. Every agent follows a pure random left/right policy. When an
# agent's episode ends, the weights of each training health for which the
# episode counts as successful receive a REINFORCE-style update built from the
# action signs and features recorded during that episode.
while np.any(total_episodes <= max_episodes):
    rew, obs, first = env.observe()
    cumulative_rew += rew

    for i in range(Nagents):
        if step[i] > 0 and first[i]: # First step of new episode
            n_steps = int(step[i]) # Number of steps recorded for the episode that just ended
            step[i] = 0

            total_episodes[i] += 1

            successful_episode = cumulative_rew[i] > -agent_healths # Vectorized for all agent healths

            successful_episodes[i,:] += successful_episode

            # REINFORCE update
            # Only the first n_steps columns belong to the finished episode;
            # later columns may still hold data from an earlier, longer one.
            # Dividing by the buffer length keeps the scale of the original
            # mean over all T+2 slots.
            u = X[i,:,:n_steps] @ y[i,:n_steps] / y.shape[1]
            w[i,:,:] += alpha * np.outer(successful_episode, u) # Vectorized for all agent healths

            cumulative_rew[i] = 0

            if np.any(save_points == total_episodes[i]):
                ws[i,int(save_ind[i]),:,:] = w[i,:,:]
                save_ind[i] += 1
                if i==0:
                    print(f"Saved episode {total_episodes[i]}")

            if i==0 and total_episodes[i] % 1000 == 0:
                print(f"Iteration {total_episodes[i]}")

        t = int(step[i]) # This agent's own step index (previously agent 0's index was used for every agent)

        x = obs['rgb'][i,0:35,:,:].flatten()
        X[i,:,t] = x
        a[i] = np.random.rand()-1/2 # Pure random policy

        if a[i] > 0:
            acts[i] = 0 # Left
            y[i,t] = 1
        else:
            acts[i] = 7 # Right
            y[i,t] = -1

        step[i] += 1

    env.act(acts) # Take actions in all envs

building procgen...done
Saved episode 1.0
Saved episode 2.0
Saved episode 3.0
Saved episode 4.0
Saved episode 5.0
Saved episode 6.0
Saved episode 7.0
Saved episode 8.0
Saved episode 9.0
Saved episode 10.0
Saved episode 11.0
Saved episode 13.0
Saved episode 14.0
Saved episode 16.0
Saved episode 18.0
Saved episode 20.0
Saved episode 23.0
Saved episode 26.0
Saved episode 29.0
Saved episode 33.0
Saved episode 37.0
Saved episode 42.0
Saved episode 47.0
Saved episode 53.0
Saved episode 60.0
Saved episode 68.0
Saved episode 76.0
Saved episode 86.0
Saved episode 97.0
Saved episode 109.0
Saved episode 123.0
Saved episode 139.0
Saved episode 157.0
Saved episode 177.0
Saved episode 200.0
Saved episode 225.0
Saved episode 254.0
Saved episode 287.0
Saved episode 323.0
Saved episode 365.0
Saved episode 411.0
Saved episode 464.0


In [3]:
# Write the weight snapshots and the metadata needed to interpret them
# to a single .npz archive named after the run.
np.savez(
    f"rand_agents_parallel_{run_name}.npz",
    ws=ws,
    Nagents=Nagents,
    Nints=Nints,
    Nfeats=Nfeats,
    agent_healths=agent_healths,
    save_points=save_points,
)

In [5]:
# Evaluate agents using softmax policy
#
# For every training health j, each agent plays eval_episodes episodes with
# each of the Nints+1 weight snapshots in ws (snapshot 0 is the untrained,
# all-zero state). An episode is credited to the snapshot that was actually
# used to play it, and is scored against every test health at once.

eval_episodes = 1000 # Episodes per time point to use for evaluation
alpha = 1e-6 # Learning rate to use. NOTE: this overwrites the training cell's alpha; the training cell resets it when re-run

env = ProcgenGym3Env(num=Nagents, env_name="bossfight2", agent_health=5, use_backgrounds=False, restrict_themes=True)

successful_episodes = np.zeros((Nagents, Nints+1, Nhealths, Nhealths)) #first Nhealth is train, second is test

# One block of eval_episodes per snapshot, including the final one at index Nints.
max_episodes = (Nints+1)*eval_episodes

for j,train_ah in enumerate(agent_healths):
    acts = np.zeros(Nagents)
    a = np.zeros(Nagents)
    step = np.zeros(Nagents)
    started = np.zeros(Nagents, dtype=bool) # True once an agent has begun a full episode under this train health

    total_episodes = np.zeros(Nagents) # Completed, counted episodes per agent

    cumulative_rew = np.zeros(Nagents)

    while np.any(total_episodes < max_episodes):
        rew, obs, first = env.observe()
        cumulative_rew += rew

        for i in range(Nagents):
            if first[i]: # First step of new episode
                # Count the episode that just ended only if it was played in
                # full under this train health (the environment is shared
                # across train healths, so the first boundary seen may close a
                # leftover episode) and this agent still has episodes to play.
                if started[i] and step[i] > 0 and total_episodes[i] < max_episodes:
                    # Snapshot index used while the episode was played,
                    # i.e. computed before the episode counter is advanced.
                    interval = int(total_episodes[i] // eval_episodes)

                    successful_episode = cumulative_rew[i] > -agent_healths # Vectorized for all agent healths

                    successful_episodes[i, interval, j, :] += successful_episode

                    total_episodes[i] += 1

                    if i==0 and total_episodes[i] % 1000 == 0:
                        print(f"Iteration {total_episodes[i]}")

                started[i] = True
                step[i] = 0
                cumulative_rew[i] = 0

            # Keep the top 35 rows (there is nothing in the bottom half) and
            # blank out the spacecraft region.
            z = obs['rgb'][i,0:35,:,:].copy()
            z[28:35,28:35,:] = 0 # Remove spacecraft (turns out to be essential, otherwise get strong side bias)
            x = z.flatten()

            # Agents that have already finished keep acting with the last
            # snapshot until the slowest agent is done; clamp to stay in range.
            interval = min(int(total_episodes[i] // eval_episodes), Nints)
            s = 1./(1+np.exp(-alpha*ws[i,interval, j,:] @ x))
            a[i] = 2*((np.random.rand()<s) - 1/2)

            if a[i] > 0:
                acts[i] = 0 # Left
            else:
                acts[i] = 7 # Right

            step[i] += 1

        env.act(acts) # Take actions in all envs

Iteration 1000.0


KeyboardInterrupt: 

In [6]:
# Write the evaluation success counts, plus the settings needed to
# normalise and label them, to a .npz archive named after the run.
np.savez(
    f"successful_episodes_{run_name}.npz",
    successful_episodes=successful_episodes,
    eval_episodes=eval_episodes,
    agent_healths=agent_healths,
    save_points=save_points,
)