In [1]:
# ! pip install unityagents

In [2]:
import torch
import numpy as np
from collections import deque
from unityagents import UnityEnvironment
from network import Actor, Critic
from agent import DDPGAgent, OUNoise, ReplayBuffer
from multiagent import MultiAgent

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Start the Unity Tennis environment from the local macOS build.
# Alternative build path (swap this line in to use it instead):
# env = UnityEnvironment(file_name="/data/Tennis_Linux_NoVis/Tennis")
env = UnityEnvironment(file_name='./Tennis.app')
# Use the first brain the environment exposes; every later reset/step result
# is indexed by this brain name.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


In [4]:
# Reset once in training mode and keep the info for the selected brain so the
# following cells can read the observation shape and the agent count from it.
env_info = env.reset(train_mode=True)[brain_name]

In [5]:
# Derive the problem dimensions from the environment itself.
action_size = brain.vector_action_space_size
states = env_info.vector_observations
# Second axis of the observation array; presumably one row per agent, so this
# is the per-agent observation length — TODO confirm against unityagents.
state_size = states.shape[1]
num_agents = len(env_info.agents)
# Passed straight to MultiAgent; presumably makes all agents share one replay
# buffer — verify in multiagent.py.
shared_replay_buffer = True

# Controller used by both the training function and the evaluation cell.
ma = MultiAgent(action_size, state_size, shared_replay_buffer, num_agents)


In [6]:
def ddpg(n_episodes=500, max_t=1000):
    """Train the multi-agent controller ``ma`` on the Unity environment.

    Uses the notebook-level globals ``env``, ``brain_name``, ``ma`` and
    ``num_agents``. The weights of ``ma.ddpg_agents[0]`` are written to
    ``checkpoint_actor.pth`` / ``checkpoint_critic.pth`` every 5 episodes and
    once more when the environment is solved.

    An episode ends when any agent reports ``done`` or after ``max_t`` steps,
    whichever comes first. The score of an episode is the maximum over agents
    of their summed rewards. The environment counts as solved once the mean
    score over a full window of 100 consecutive episodes reaches 0.5;
    training then stops early.

    Args:
        n_episodes: Maximum number of training episodes.
        max_t: Maximum number of environment steps per episode.

    Returns:
        List with one score per episode that was actually played.
    """
    def _save_checkpoint():
        # Only agent 0 is persisted; the evaluation cell restores every agent
        # from this single actor/critic pair.
        torch.save(ma.ddpg_agents[0].actor_local.state_dict(), 'checkpoint_actor.pth')
        torch.save(ma.ddpg_agents[0].critic_local.state_dict(), 'checkpoint_critic.pth')

    all_scores = []
    scores_window = deque(maxlen=100)

    for i_episode in range(1, n_episodes + 1):
        ma.reset()
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)

        for _ in range(max_t):
            actions = ma.act(states)
            env_info = env.step(actions)[brain_name]
            rewards = env_info.rewards
            next_states = env_info.vector_observations
            dones = env_info.local_done

            # Store the transition (including a terminal one) before deciding
            # whether the episode is over.
            ma.step(states, actions, rewards, next_states, dones)

            scores += rewards
            states = next_states
            # Stop at the terminal flag, as the evaluation loop does, so the
            # score covers exactly one episode.
            if np.any(dones):
                break

        episode_score = np.max(scores)
        scores_window.append(episode_score)
        all_scores.append(episode_score)
        window_mean = np.mean(scores_window)

        # Overwrite the progress line in place; keep every 5th one on screen.
        at_checkpoint = i_episode % 5 == 0
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, window_mean),
              end="\n" if at_checkpoint else "")
        if at_checkpoint:
            _save_checkpoint()

        # Require a full window so a few lucky early episodes cannot count as
        # "solved"; report the episode at which that window started.
        window_full = len(scores_window) == scores_window.maxlen
        if window_full and window_mean >= 0.5:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - scores_window.maxlen, window_mean))
            _save_checkpoint()
            break

    return all_scores

In [7]:
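# Training is disabled in this run; uncomment to retrain. ddpg() writes
# checkpoint_actor.pth / checkpoint_critic.pth, which the evaluation cell loads.
# scores = ddpg()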

In [8]:
 # import pandas as pd 

# outdf = pd.DataFrame()
# outdf['episode'] = np.arange(len(scores))
# outdf['score'] = scores
# outdf.plot(x='episode', y='score')

In [9]:
# Evaluation: restore the saved weights and let the agents play a few episodes
# with the environment in non-training mode.
# brain_name, brain, num_agents, action_size and state_size are already
# defined by the set-up cells above and are reused here unchanged.
N_EVAL_EPISODES = 11

# Both agents are restored from the same actor/critic checkpoint pair. Each
# file is read once; load_state_dict copies the values into the module's own
# parameters, so the loaded dict can be reused for the second agent.
actor_weights = torch.load('checkpoint_actor.pth', map_location='cpu')
critic_weights = torch.load('checkpoint_critic.pth', map_location='cpu')
for agent_idx in (0, 1):
    ma.ddpg_agents[agent_idx].actor_local.load_state_dict(actor_weights)
    ma.ddpg_agents[agent_idx].critic_local.load_state_dict(critic_weights)

for episode in range(1, N_EVAL_EPISODES + 1):
    ma.reset()
    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations
    scores = np.zeros(num_agents)
    while True:
        actions = ma.act(states)
        env_info = env.step(actions)[brain_name]
        scores += env_info.rewards
        states = env_info.vector_observations
        # The episode is over as soon as any agent reports done.
        if np.any(env_info.local_done):
            break
    # Report the result instead of silently discarding it; the score is the
    # max over agents, the same measure ddpg() records per episode.
    print('Episode {}\tScore: {:.2f}'.format(episode, np.max(scores)))