In [1]:
import sys
sys.path.insert(0, "python/")

from unityagents import UnityEnvironment
import numpy as np
import torch
from dqn.dqn_agent_v2 import Agent
from collections import deque

from model import ActorModel, CriticModel
from agent import Agent as PpoAgent

In [2]:
# Start the Unity Soccer environment from the executable at this relative path.
# NOTE(review): the ".exe" suffix suggests a Windows build, and the path is presumably
# resolved against the notebook's working directory -- confirm before running elsewhere.
env = UnityEnvironment(file_name="Soccer_Env/Soccer.exe")

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 2
        Number of External Brains : 2
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: GoalieBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 112
        Number of stacked Vector Observation: 3
        Vector Action space type: discrete
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 
Unity brain name: StrikerBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 112
        Number of stacked Vector Observation: 3
        Vector Action space type: discrete
        Vector Action space size (per agent): 6
        Vector Action descriptions: , , , , , 


In [3]:
def ball_reward(state):
    """Compute a shaping reward from the ball-related entries of one observation.

    The vector is read in groups of 8 values: the entry at offset 0 of each group
    is treated as a "ball seen" flag and the entry at offset 7 of the same group
    as the matching distance. (This layout is taken from the indexing used here;
    TODO confirm against the environment's observation specification.)

    Args:
        state: 1-D array-like observation (NumPy array, list or tuple).

    Returns:
        float: -0.03 when no group flags the ball, 0.3 when at least one flagged
        group reports a distance <= 0.03, and 0.0 otherwise.
    """
    # Coerce first: the fancy indexing further down only works on an ndarray,
    # so plain lists/tuples would otherwise fail as soon as a ball flag is set.
    state = np.asarray(state)
    ball_flags = state[0::8]

    # Ball not flagged anywhere: small penalty.
    if not np.any(ball_flags):
        return -0.03

    # Reward for kicking the ball: a flagged group with a very small distance.
    flagged_groups = np.flatnonzero(ball_flags)
    distances = state[flagged_groups * 8 + 7]
    if np.amin(distances) <= 0.03:
        return 0.3

    return 0.0

In [4]:
def dqn(n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Train the goalie and striker agents (index 0) against a random opposing team.

    Relies on notebook-level globals that must exist before this is called:
    ``env``, ``g_brain_name``, ``s_brain_name``, ``num_g_agents``, ``num_s_agents``,
    ``g_action_size``, ``s_action_size``, ``g_agent``, ``s_agent`` and ``ball_reward``.

    Every step both agents store their transition via ``step``; the striker's
    reward is the environment reward plus ``ball_reward`` of its current state.
    After each episode ``learn`` is called once per agent on a batch sampled from
    its memory, and every 100 episodes both local Q-networks are checkpointed to
    'checkpoint_goalie.pth' / 'checkpoint_striker.pth'.

    Args:
        n_episodes (int): number of training episodes.
        max_t (int): maximum number of environment steps per episode.
        eps_start (float): initial epsilon passed to ``act``.
        eps_end (float): lower bound for epsilon.
        eps_decay (float): multiplicative epsilon decay applied after each episode.

    Returns:
        list: per-episode score, i.e. the summed environment reward of goalie 0
        and striker 0 (the ball shaping reward is not included).
    """
    scores = []
    scores_window = deque(maxlen=100)      # last 100 episode scores
    g_losses_window = deque(maxlen=100)    # last 100 goalie losses
    s_losses_window = deque(maxlen=100)    # last 100 striker losses
    eps = eps_start

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)
        ball_reward_val = 0.0

        g_states = env_info[g_brain_name].vector_observations
        s_states = env_info[s_brain_name].vector_observations

        g_scores = np.zeros(num_g_agents)
        s_scores = np.zeros(num_s_agents)

        for _ in range(max_t):
            action_g_0 = g_agent.act(g_states[0], eps)
            action_s_0 = s_agent.act(s_states[0], eps)

            # Set other team to random
            action_g_1 = np.asarray([np.random.choice(g_action_size)])
            action_s_1 = np.asarray([np.random.choice(s_action_size)])
            # Combine actions
            actions_g = np.array((action_g_0, action_g_1))
            actions_s = np.array((action_s_0, action_s_1))
            actions = dict(zip([g_brain_name, s_brain_name], [actions_g, actions_s]))

            env_info = env.step(actions)
            g_next_states = env_info[g_brain_name].vector_observations
            s_next_states = env_info[s_brain_name].vector_observations

            g_rewards = env_info[g_brain_name].rewards
            s_rewards = env_info[s_brain_name].rewards
            g_scores += g_rewards
            s_scores += s_rewards

            # Evaluate the shaping reward once and reuse it for both the running
            # total and the striker's stored transition.
            kick_bonus = ball_reward(s_states[0])
            ball_reward_val += kick_bonus

            done = np.any(env_info[g_brain_name].local_done)

            g_agent.step(g_states[0], action_g_0, g_rewards[0],
                         g_next_states[0], done)
            s_agent.step(s_states[0], action_s_0, s_rewards[0] + kick_bonus,  # adding ball reward
                         s_next_states[0], done)

            if done:
                break

            g_states = g_next_states
            s_states = s_next_states

        # One learning update per agent per episode, discount = 0.99.
        goalie_loss = g_agent.learn(g_agent.memory.sample(), 0.99).item()
        striker_loss = s_agent.learn(s_agent.memory.sample(), 0.99).item()
        g_losses_window.append(goalie_loss)
        s_losses_window.append(striker_loss)

        score = g_scores[0] + s_scores[0]
        scores_window.append(score)       # save most recent score
        scores.append(score)              # save most recent score

        eps = max(eps_end, eps_decay * eps)  # decrease epsilon

        stats = (i_episode,
                 np.mean(scores_window),
                 np.mean(g_losses_window),
                 np.mean(s_losses_window),
                 ball_reward_val)
        # Progress line is rewritten in place thanks to the leading '\r'.
        print('\rEpisode {}\tAverage Score: {:.2f}\t Goalie Loss:'
              '{:.5f}\t Striker Loss: {:.5f}'
              '\t Ball Reward: {:.2f}'.format(*stats), end="")
        if i_episode % 100 == 0:
            # Every 100 episodes: keep a permanent line and checkpoint both networks.
            print('\rEpisode {}\tAverage Score: {:.2f}\t Goalie Loss:'
                  '{:.5f}\t Striker Loss: {:.5f}\n'
                  '\t Ball Reward: {:.2f}'.format(*stats))
            torch.save(g_agent.qnetwork_local.state_dict(), 'checkpoint_goalie.pth')
            torch.save(s_agent.qnetwork_local.state_dict(), 'checkpoint_striker.pth')
    return scores


In [5]:
# Brains are picked by position in the environment's brain list:
# index 0 is used as the goalie brain, index 1 as the striker brain.
g_brain_name, s_brain_name = env.brain_names[0], env.brain_names[1]
g_brain, s_brain = env.brains[g_brain_name], env.brains[s_brain_name]

In [6]:
# Start a fresh episode so agent counts and observation shapes can be inspected.
env_info = env.reset(train_mode=True)

# Goalie side: number of agents, number of actions, initial observations and
# the length of one observation vector.
num_g_agents = len(env_info[g_brain_name].agents)
g_action_size = g_brain.vector_action_space_size
g_states = env_info[g_brain_name].vector_observations
g_state_size = g_states.shape[1]

# Striker side: same quantities.
num_s_agents = len(env_info[s_brain_name].agents)
s_action_size = s_brain.vector_action_space_size
s_states = env_info[s_brain_name].vector_observations
s_state_size = s_states.shape[1]

# One learning agent per role, both created with seed 0.
g_agent = Agent(state_size=g_state_size, action_size=g_action_size, seed=0)
s_agent = Agent(state_size=s_state_size, action_size=s_action_size, seed=0)

In [7]:
# Run configuration. Each setting is assigned exactly once so the values that
# reach dqn() are the ones visible here.
n_episodes = 5        # short run; increase (e.g. to 5000) for a long training session
max_t = 100000        # per-episode step cap handed to dqn()

# Exploration is held constant for this run: with eps_start == eps_end, the
# update eps = max(eps_end, eps_decay * eps) inside dqn() keeps epsilon at 0.1,
# so eps_decay has no effect unless eps_start is raised above eps_end.
eps_start = 0.1
eps_end = 0.1
eps_decay = 0.9995

# Weight files for both local Q-networks.
# NOTE(review): `load` is a project-defined method on the network; it presumably
# restores weights from the given file -- confirm against the model class.
GOALIE = 'goalie_dqn.pth'
STRIKER = 'striker_dqn.pth'
g_agent.qnetwork_local.load(GOALIE)
s_agent.qnetwork_local.load(STRIKER)

# Train
scores = dqn(n_episodes, max_t, eps_start, eps_end, eps_decay)

In [8]:
# Load trained agents and run
# Blue side: two agents built with the same Agent class and sizes used for training.
# NOTE(review): the evaluation loop in this notebook drives index 1 with random
# actions and never calls g_agent_blue / s_agent_blue -- confirm whether they
# were meant to play.
g_agent_blue = Agent(state_size=g_state_size, action_size=g_action_size, seed=0)
s_agent_blue = Agent(state_size=s_state_size, action_size=s_action_size, seed=0)
# Use the first CUDA device when one is available, otherwise the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Actor networks for the red side, moved to DEVICE.
goalie_actor_model = ActorModel( g_state_size, g_action_size ).to(DEVICE)
striker_actor_model = ActorModel( s_state_size, s_action_size ).to(DEVICE)
# Passed as the last PpoAgent argument below; its meaning is not visible in this notebook.
N_STEP = 8

# NOTE(review): GOALIE_red / STRIKER_red are assigned but never read in the visible
# cells, so no checkpoint is loaded into the actor models in this cell. The red
# agents therefore appear to run with whatever weights ActorModel() starts with --
# confirm whether these files should be loaded before evaluation.
GOALIE_red = 'goalie_dqn_run.pth'
STRIKER_red = 'striker_dqn_run.pth'
g_agent_red = PpoAgent( DEVICE, 0, goalie_actor_model, N_STEP )
s_agent_red = PpoAgent( DEVICE, 0, striker_actor_model, N_STEP )

# Weight files for the blue side's local Q-networks, loaded through the
# project-defined `load` method (presumably restores weights -- confirm).
GOALIE_blue = 'goalie_dqn_mod.pth'
STRIKER_blue = 'striker_dqn_mod.pth'
g_agent_blue.qnetwork_local.load (GOALIE_blue )
s_agent_blue.qnetwork_local.load( STRIKER_blue )

In [9]:
# Evaluation: the red side (agent index 0 of each brain) is driven by the PPO
# agents, the blue side (index 1) by uniformly random actions.
# NOTE(review): g_agent_blue / s_agent_blue are loaded earlier but not used here --
# confirm that a random blue opponent is intended.
n_eval_episodes = 500

team_red_window_score = []         # per-episode red score (goalie 0 + striker 0)
team_red_window_score_wins = []    # 1 when red's episode score is positive, else 0

team_blue_window_score = []        # per-episode blue score (goalie 1 + striker 1)
team_blue_window_score_wins = []   # 1 when blue's episode score is positive, else 0

draws = []                         # True when both episode scores are equal

for i in range(n_eval_episodes):
    env_info = env.reset(train_mode=True)                  # reset the environment
    g_states = env_info[g_brain_name].vector_observations  # get initial state (goalies)
    s_states = env_info[s_brain_name].vector_observations  # get initial state (strikers)
    g_scores = np.zeros(num_g_agents)                      # initialize the score (goalies)
    s_scores = np.zeros(num_s_agents)                      # initialize the score (strikers)
    while True:
        # Red actions come from the PPO agents; the second value returned by
        # act() is not needed for evaluation.
        action_g_0, _ = g_agent_red.act(g_states[0])
        action_s_0, _ = s_agent_red.act(s_states[0])
        # Blue actions are sampled uniformly at random.
        action_g_1 = np.asarray([np.random.choice(g_action_size)])
        action_s_1 = np.asarray([np.random.choice(s_action_size)])

        # Combine actions
        actions_g = np.array((action_g_0, action_g_1))
        actions_s = np.array((action_s_0, action_s_1))
        actions = dict(zip([g_brain_name, s_brain_name], [actions_g, actions_s]))

        env_info = env.step(actions)

        # get next states
        g_next_states = env_info[g_brain_name].vector_observations
        s_next_states = env_info[s_brain_name].vector_observations

        # get reward and update scores
        g_rewards = env_info[g_brain_name].rewards
        s_rewards = env_info[s_brain_name].rewards
        g_scores += g_rewards
        s_scores += s_rewards

        # check if episode finished
        done = np.any(env_info[g_brain_name].local_done)

        # roll over states to next time step
        g_states = g_next_states
        s_states = s_next_states

        # exit loop if episode finished
        if done:
            break

    team_red_score = g_scores[0] + s_scores[0]
    team_red_window_score.append(team_red_score)
    team_red_window_score_wins.append(1 if team_red_score > 0 else 0)

    team_blue_score = g_scores[1] + s_scores[1]
    team_blue_window_score.append(team_blue_score)
    team_blue_window_score_wins.append(1 if team_blue_score > 0 else 0)

    draws.append(team_red_score == team_blue_score)
    print('Scores from episode {}: {} (goalies), {} (strikers)'.format(i+1, g_scores, s_scores))

# Summary over all episodes: "Score" is the total and "Avg" the per-episode mean,
# so the numbers match their labels (previously the last episode's score and the
# total were printed under these labels).
print('Red Wins: \t{} \tScore: \t{:.5f} \tAvg: \t{:.2f} \tDraws: \t{}'.format(
    np.count_nonzero(team_red_window_score_wins), np.sum(team_red_window_score),
    np.mean(team_red_window_score), np.count_nonzero(draws)))
print('Blue Wins: \t{} \tScore: \t{:.5f} \tAvg: \t{:.2f} \tDraws: \t{}'.format(
    np.count_nonzero(team_blue_window_score_wins), np.sum(team_blue_window_score),
    np.mean(team_blue_window_score), np.count_nonzero(draws)))

env.close()



Scores from episode 1: [1.00166669 1.00166669] (goalies), [-1.00166669 -1.00166669] (strikers)
Scores from episode 2: [-0.61166672  0.48833334] (goalies), [-0.48833334  0.61166672] (strikers)
Scores from episode 3: [1.00166669 1.00166669] (goalies), [-1.00166669 -1.00166669] (strikers)
Scores from episode 4: [ 0.255 -0.845] (goalies), [ 0.845 -0.255] (strikers)
Scores from episode 5: [ 0.65166667 -0.44833338] (goalies), [ 0.44833338 -0.65166667] (strikers)
Scores from episode 6: [ 0.65333335 -0.44666665] (goalies), [ 0.44666665 -0.65333335] (strikers)
Scores from episode 7: [1.00166669 1.00166669] (goalies), [-1.00166669 -1.00166669] (strikers)
Scores from episode 8: [1.00166669 1.00166669] (goalies), [-1.00166669 -1.00166669] (strikers)
Scores from episode 9: [1.00166669 1.00166669] (goalies), [-1.00166669 -1.00166669] (strikers)
Scores from episode 10: [-0.955  0.145] (goalies), [-0.145  0.955] (strikers)
Scores from episode 11: [ 0.59000001 -0.51000005] (goalies), [ 0.51000005 -0.59