# Collaboration and Competition

---

You are welcome to use this coding environment to train your agent for the project.  Follow the instructions below to get started!

### 1. Start the Environment

Run the next code cell to install a few packages.  This line will take a few minutes to run!

In [2]:
from unityagents import UnityEnvironment
import numpy as np
from collections import namedtuple, deque
import torch

env = UnityEnvironment(file_name="/data/Tennis_Linux_NoVis/Tennis")

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


Environments contain **_brains_** which are responsible for deciding the actions of their associated agents. Here we check for the first brain available, and set it as the default brain we will be controlling from Python.

In [3]:
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [5]:
import numpy as np
import copy
from collections import namedtuple, deque
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [37]:
torch.manual_seed(8)
np.random.seed(8)

STATE_SIZE = 24
ACTION_SIZE = 2 
NUM_AGENTS = 2
BUFFER = 100000
BATCH_SIZE = 96
TAU = 0.01
LR_ACTOR = 1e-4
LR_CRITIC = 1e-4
W_DECAY = 0
UPDATE_EVERY = 1
GAMMA = 0.99

In [38]:
number_of_episodes = 3000
random_seed = 8

In [39]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from collections import namedtuple, deque


In [40]:
def hidden_init(layer):
    fan_in = layer.weight.data.size()[0]
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)

In [41]:
class Actor(nn.Module):
    def __init__(self,s_size,a_size,seed,hidden_dim1=400,hidden_dim2=200):
        super(Actor, self).__init__()
               
        self.fc1 = nn.Linear(s_size,hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1,hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2,a_size)
        self.reset_parameters(seed)
        
    def reset_parameters(self,seed):                
        torch.manual_seed(seed)
        self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
        self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)    
        
    def forward(self,state):
        x=F.relu(self.fc1(state))
        x=F.relu(self.fc2(x))
        return torch.tanh(self.fc3(x))

In [42]:
class Critic(nn.Module):

    def __init__(self,s_size,a_size,seed,hidden_dim1=400,hidden_dim2=200):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(s_size,hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1+a_size,hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2,1)
        self.reset_parameters(seed)
        
    def reset_parameters(self,seed):
        torch.manual_seed(seed)
        self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
        self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)    
        
    def forward(self,state,action):
        
        x = F.relu(self.fc1(state))
        x = torch.cat((x, action),dim=1)      
        x = F.relu(self.fc2(x))
        return self.fc3(x)   
    

In [43]:
class OUNoise:

    def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        self.state = copy.copy(self.mu)

    def sample(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.array([random.random() for i in range(len(x))])
        self.state = x + dx
        return self.state

In [44]:
class ReplayBuffer:

    def __init__(self, buffer_size, action_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)  # internal memory (deque)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        np.random.seed(seed)
    
    def add(self, state, action, reward, next_state, done):
        e = self.experience(state, action, reward, next_state, done)
        self.memory.append(e)
    
    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)

        states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)

In [51]:
class Agent():
    def __init__(self, state_size, action_size, gamma, random_seed):
        
        self.actor_local = Actor(state_size,action_size,random_seed).to(device)
        self.actor_local = Actor(state_size,action_size,random_seed).to(device)
        self.actor_target = Actor(state_size,action_size,random_seed).to(device)
        self.actor_optim = optim.Adam(self.actor_local.parameters(),lr=LR_ACTOR)
        
        self.critic_local=Critic(state_size,action_size,random_seed).to(device)
        self.critic_target=Critic(state_size,action_size,random_seed).to(device)
        self.critic_optim=optim.Adam(self.critic_local.parameters(),lr=LR_CRITIC,weight_decay=W_DECAY)
        
        self.noise = OUNoise(action_size,random_seed)
        self.gamma=gamma   
        
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)    
        
    def reset(self):
        self.noise.reset()        
        
    def get_local_action(self,state,noise=0.0):
        
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state.view(1,-1)).cpu().data.numpy()        
        self.actor_local.train()
        action = np.squeeze(action)
        action += (self.noise.sample()*noise)
        return np.clip(action,-1,1)

    def get_target_action(self,states):
        
        self.actor_target.eval()
        action = self.actor_target(states.to(device)).cpu().data.numpy()
        self.actor_target.train()
        return np.clip(action,-1,1)
          
    
     
    def learn(self,experiences):

        states, actions,rewards,next_states,dones = experiences
        next_actions = self.get_target_action(next_states)
        
        next_actions = torch.from_numpy(next_actions).float().to(device)
        
        pred = self.critic_target(next_states,next_actions)
        Q_targets = rewards + self.gamma * pred * (1 - dones)
        Q_expected = self.critic_local(states,actions)  
        
        
        critic_loss = F.mse_loss(Q_expected,Q_targets)
        self.critic_optim.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optim.step() 
        
        actions_actor = self.actor_local(states)
        loss = -self.critic_local(states,actions_actor).mean()
        
        self.actor_optim.zero_grad()
        loss.backward()
        self.actor_optim.step()
        
        self.update_networks(TAU)
        
    def update_networks(self,Tau):    
        self.soft_update(self.critic_local,self.critic_target,Tau)
        self.soft_update(self.actor_local,self.actor_target,Tau)
        
        
    def soft_update(self,local_model,target_model,Tau):
        for local_params, target_params in zip(local_model.parameters(),target_model.parameters()):
            target_params.data.copy_(local_params*Tau + (1-Tau)*target_params)
    

    def hard_update(self, target, source):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)    
    

In [62]:
class MADDPG():
    def __init__(self,state_size,action_size,num_agents,Gamma,random_seed):
        
        self.agents = [Agent(state_size,action_size,Gamma,random_seed) for val in range(num_agents)]
        self.num_agents=NUM_AGENTS
        self.replaybuff = ReplayBuffer(BUFFER,state_size,BATCH_SIZE,random_seed)
        self.batch_size = BATCH_SIZE
        self.gamma=GAMMA
   
                
        
    def get_action(self,states,noise=0.0):
        states  = torch.from_numpy(states).float().to(device)     
        actions=[agent.get_local_action(state,noise) for agent,state in zip(self.agents,states)]
        return np.array(actions)
       
      
    def step(self,states,actions,rewards,next_states,dones,ts):
        for state,action,reward,next_state,done in zip(states,actions,rewards,next_states,dones):
            self.replaybuff.add(state, action, reward, next_state, done)
        if len(self.replaybuff) > BATCH_SIZE  and ts % UPDATE_EVERY ==0:
            self.learn()        
            
        
    def learn(self):
        for _ in range(5):
            experiences = self.replaybuff.sample()
            for agent in self.agents:
                agent.learn(experiences)    
                   
        

In [63]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [70]:
multi_agent= MADDPG(STATE_SIZE,ACTION_SIZE,NUM_AGENTS,GAMMA,random_seed) 

In [71]:
scores=[]
rolling_window=deque(maxlen=100)
timesteps=0
noise_factor=1
weight_decay=0.998
running_avg=0.0
maxsc=0.17
save=False
num_episodes = 3000


for episode in range(num_episodes):
    env_info = env.reset(train_mode=True)[brain_name]  
    states = env_info.vector_observations 
    score = np.zeros(num_agents) 
    [agent.reset() for agent in multi_agent.agents]
    
    while True:
        timesteps+=1
        actions=multi_agent.get_action(states,noise_factor) 
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations 

        rewards = env_info.rewards
        dones = env_info.local_done
        score += rewards 

        multi_agent.step(states,actions,rewards,next_states,dones,timesteps)

        states = next_states

        if np.any(dones):
            break

    noise_factor= noise_factor*weight_decay
    scores.append(np.max(score))
    rolling_window.append(np.max(score))
      
    if episode %50 ==0 :
        print("Episode {}/{} Avg scores:{} last score :{} Timestep : {}".format(episode,num_episodes,np.mean(rolling_window),score,timesteps))

    if np.mean(rolling_window) >= 0.5:
        print ("solved in {} episodes".format(episode))
        print ("Saving Model...")
        for j,agent in enumerate(multi_agent.agents):
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor'+str(j)+'.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic'+str(j)+'.pth')
        break

Episode 0/3000 Avg scores:0.0 last score :[-0.01  0.  ] Timestep : 5
Episode 50/3000 Avg scores:0.003725490258897052 last score :[-0.01  0.  ] Timestep : 757
Episode 100/3000 Avg scores:0.01550000024959445 last score :[ 0.    0.09] Timestep : 1769
Episode 150/3000 Avg scores:0.0777000012435019 last score :[ 0.2   0.29] Timestep : 4192
Episode 200/3000 Avg scores:0.12250000193715095 last score :[ 0.2   0.19] Timestep : 6666
Episode 250/3000 Avg scores:0.15820000240579246 last score :[ 0.09  0.1 ] Timestep : 10903
Episode 300/3000 Avg scores:0.38450000574812293 last score :[ 0.1   0.19] Timestep : 22234
solved in 315 episodes
Saving Model...
