# In [None]: (notebook cell) network definitions, imported further down as module `model`
import torch
import torch.nn as nn
import torch.nn.functional as F

class Critic(nn.Module):
    """Q-network that maps a (state, action) batch to one scalar value per row.

    The state is batch-normalised and passed through the first hidden layer;
    the action is concatenated to that hidden representation before the
    second hidden layer, and a final linear layer produces the value.
    """

    def __init__(self, state_size, action_size, fc1, fc2, seed):
        """Create the layers and seed the global torch RNG.

        Params
        ======
            state_size (int): width of the state input
            action_size (int): width of the action input
            fc1 (int): number of units in the first hidden layer
            fc2 (int): number of units in the second hidden layer
            seed (int): value passed to ``torch.manual_seed``
        """
        super().__init__()
        self.seed = torch.manual_seed(seed)

        # The Linear layers draw their initial weights from the RNG seeded
        # above, so their creation order decides which values each one gets.
        self.fc1 = nn.Linear(state_size, fc1)
        self.fc2 = nn.Linear(fc1 + action_size, fc2)

        self.bn = nn.BatchNorm1d(state_size)
        self.bn2 = nn.BatchNorm1d(fc1)

        self.fc5 = nn.Linear(fc2, 1)

        # The output layer starts inside a narrow uniform band around zero.
        for tensor in (self.fc5.weight, self.fc5.bias):
            nn.init.uniform_(tensor, a=-3e-4, b=3e-4)

    def forward(self, state, action):
        """Return the value estimate for each (state, action) row."""
        normed_state = self.bn(state)
        hidden = F.relu(self.bn2(self.fc1(normed_state)))

        # The action joins the network after the first hidden layer.
        joined = torch.cat((hidden, action), dim=1)
        hidden = F.relu(self.fc2(joined))

        return self.fc5(hidden)

    
class Actor(nn.Module):
    """Policy network that maps a state batch to actions squashed by tanh."""

    def __init__(self, state_size, action_size, fc1, fc2, seed):
        """Create the layers and seed the global torch RNG.

        Params
        ======
            state_size (int): width of the state input
            action_size (int): width of the action output
            fc1 (int): number of units in the first hidden layer
            fc2 (int): number of units in the second hidden layer
            seed (int): value passed to ``torch.manual_seed``
        """
        super().__init__()
        self.seed = torch.manual_seed(seed)

        # One batch-norm for the raw state, one after each hidden layer.
        self.bn = nn.BatchNorm1d(state_size)
        self.bn2 = nn.BatchNorm1d(fc1)
        self.bn3 = nn.BatchNorm1d(fc2)

        self.fc1 = nn.Linear(state_size, fc1)
        self.fc2 = nn.Linear(fc1, fc2)
        self.fc4 = nn.Linear(fc2, action_size)

        # The output layer starts inside a narrow uniform band around zero.
        for tensor in (self.fc4.weight, self.fc4.bias):
            nn.init.uniform_(tensor, a=-3e-3, b=3e-3)

        self.tan = nn.Tanh()

    def forward(self, x):
        """Return the action for each state row, each component in [-1, 1]."""
        hidden = self.bn(x)
        for linear, norm in ((self.fc1, self.bn2), (self.fc2, self.bn3)):
            hidden = F.relu(norm(linear(hidden)))

        return self.tan(self.fc4(hidden))

    

# In [None]: (notebook cell) agent and replay buffer, imported further down as module `ddpg_agent`
import numpy as np
import random
from collections import namedtuple, deque

from model import Critic, Actor
import torch
from torch import autograd
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import random_p as rm
from schedule import LinearSchedule

# Hyperparameters read by Agent.
BUFFER_SIZE = int(1e6)   # replay buffer capacity (maximum number of stored experiences)
BATCH_SIZE = 512         # experiences drawn per learning update
GAMMA = 0.99             # discount factor applied to the bootstrapped target Q-value
TAU = 1e-3               # interpolation weight for the soft target-network update
ACTOR_LR = 1e-3          # Adam learning rate for the actor
CRITIC_LR = 1e-4         # Adam learning rate for the critic
UPDATE_EVERY = 20        # calls to Agent.step between learning phases
# The number of updates per learning phase is Agent's `update_times` argument.

# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class Agent():
    """DDPG learner that serves every agent with one shared actor, critic and replay buffer.

    Each call to ``step`` stores one transition per agent. Every
    ``UPDATE_EVERY`` calls, ``update_times`` minibatches are sampled and used
    to update the local networks, followed by a soft update of the targets.
    """

    def __init__(self, state_size, action_size, num_agents, seed, fc1=400, fc2=300,
                 update_times=10, weight_decay=1.e-5):
        """Build the networks, optimizers, noise processes and replay buffer.

        Params
        ======
            state_size (int): width of one agent's observation
            action_size (int): width of one agent's action
            num_agents (int): number of agents acted for on every step
            seed (int): seed for ``random``, ``numpy.random`` and the networks
            fc1 (int): units in the first hidden layer of actor and critic
            fc2 (int): units in the second hidden layer of actor and critic
            update_times (int): minibatch updates per learning phase
            weight_decay (float): L2 weight decay of the critic's Adam optimizer
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.update_times = update_times
        self.n_step = 0
        self.agents = []

        # random.seed / np.random.seed return None, so seed first and keep the
        # seed value itself on the instance.
        random.seed(seed)
        np.random.seed(seed)
        self.seed = seed
        self.n_seed = seed

        # One Ornstein-Uhlenbeck process per agent. They are reset by
        # reset_random(), but act() currently draws uniform noise instead.
        self.noise = [
            rm.OrnsteinUhlenbeckProcess(size=(action_size, ), std=LinearSchedule(0.4))
            for _ in range(num_agents)
        ]

        # Local networks are trained; target networks start as exact copies.
        self.critic_local = Critic(state_size, action_size, fc1, fc2, seed).to(device)
        self.critic_target = Critic(state_size, action_size, fc1, fc2, seed).to(device)
        self.critic_target.load_state_dict(self.critic_local.state_dict())

        self.actor_local = Actor(state_size, action_size, fc1, fc2, seed).to(device)
        self.actor_target = Actor(state_size, action_size, fc1, fc2, seed).to(device)
        self.actor_target.load_state_dict(self.actor_local.state_dict())

        self.optimizer_critic = optim.Adam(self.critic_local.parameters(), lr=CRITIC_LR,
                                           weight_decay=weight_decay)
        self.optimizer_actor = optim.Adam(self.actor_local.parameters(), lr=ACTOR_LR)

        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        self.t_step = 0
        self.a_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition per agent and learn every UPDATE_EVERY calls.

        Every argument is indexed by agent: entry ``i`` belongs to agent ``i``.
        """
        for i in range(self.num_agents):
            self.memory.add(state[i], action[i], reward[i], next_state[i], done[i])

        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step != 0:
            return

        # Learn only once the buffer holds more than one full batch.
        if len(self.memory) > BATCH_SIZE:
            for _ in range(self.update_times):
                self.learn(self.memory.sample(), GAMMA)

    def act(self, state, training=True):
        """Return the actions for a batch of states, clipped to [-1, 1].

        Params
        ======
            state (numpy.ndarray): one observation row per agent
            training (bool): when True, uniform exploration noise is added and
                the noise schedule advances; when False the policy output is
                returned without noise

        The noise scale is ``1.08 * max((500 - n_step) / 500, 0.05)``: it
        shrinks linearly with the number of training calls and then stays at
        its floor.
        """
        state = torch.from_numpy(state).float().to(device)

        # Evaluation mode makes batch-norm use its running statistics.
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(state).cpu().numpy()
        self.actor_local.train()

        if training:
            self.n_step += 1
            scale = 1.08 * max((500 - self.n_step) / 500, .05)
            actions = actions + np.random.uniform(
                -1, 1, (self.num_agents, self.action_size)) * scale

        return np.clip(actions, -1, 1)

    def learn(self, experiences, gamma):
        """Run one critic update, one actor update and a soft target update.

        Params
        ======
            experiences (tuple): (states, actions, rewards, next_states, dones)
                tensors as returned by ``ReplayBuffer.sample``
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # The bootstrapped target needs no gradients, so the whole
        # computation runs without building an autograd graph.
        with torch.no_grad():
            next_actions = self.actor_target(next_states)
            q_target_next = self.critic_target(next_states, next_actions)
            q_targets = rewards + (gamma * q_target_next * (1 - dones))

        # Critic: regress Q(s, a) onto the target, with gradient-norm clipping.
        q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(q_expected, q_targets)

        self.optimizer_critic.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.optimizer_critic.step()

        # Actor: ascend the critic's value of the actor's own actions. The
        # gradients this leaves on the critic are cleared by the next
        # optimizer_critic.zero_grad().
        predicted_actions = self.actor_local(states)
        actor_loss = -self.critic_local(states, predicted_actions).mean()

        self.optimizer_actor.zero_grad()
        actor_loss.backward()
        self.optimizer_actor.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def reset(self):
        """Copy the local networks into the targets and restart the step counters at 1."""
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.critic_target.load_state_dict(self.critic_local.state_dict())
        self.t_step = 1
        self.a_step = 1

    def soft_update(self, local_model, target_model, tau):
        """Blend parameters: target = tau * local + (1 - tau) * target.

        Only entries of ``parameters()`` are blended; buffers are not touched.
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

    def reset_random(self):
        """Reset the internal state of every per-agent noise process."""
        for process in self.noise:
            process.reset_states()
        

class ReplayBuffer:
    """Fixed-size buffer of experience tuples with uniform random sampling."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Create an empty buffer and seed the global ``random`` module.

        Params
        ======
            action_size (int): width of one action (stored for reference)
            buffer_size (int): maximum number of experiences kept; the oldest
                entry is dropped once the buffer is full
            batch_size (int): number of experiences returned by ``sample``
            seed (int): value passed to ``random.seed``
        """
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        # random.seed returns None, so seed first and keep the seed value.
        random.seed(seed)
        self.seed = seed

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch of experiences from memory.

        Returns a tuple ``(states, actions, rewards, next_states, dones)`` of
        float tensors on the module-level ``device``, each with one row per
        sampled experience. ``random.sample`` raises ``ValueError`` when the
        buffer holds fewer than ``batch_size`` experiences.
        """
        batch = random.sample(self.memory, k=self.batch_size)

        states = self._to_tensor([e.state for e in batch])
        actions = self._to_tensor([e.action for e in batch])
        rewards = self._to_tensor([e.reward for e in batch])
        next_states = self._to_tensor([e.next_state for e in batch])
        # Done flags become 0/1 integers before the float conversion.
        dones = self._to_tensor([e.done for e in batch], as_uint8=True)

        return (states, actions, rewards, next_states, dones)

    def _to_tensor(self, rows, as_uint8=False):
        """Stack one field of a batch row-wise and return it as a float tensor on ``device``."""
        stacked = np.vstack(rows)
        if as_uint8:
            stacked = stacked.astype(np.uint8)
        return torch.from_numpy(stacked).float().to(device)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)

# In [None]: (notebook cell) environment setup, random-action rollout and DDPG training script
from unityagents import UnityEnvironment
import numpy as np
# NOTE(review): absolute, machine-specific path to the Tennis build; it has to
# be adjusted on any other machine.
env = UnityEnvironment(file_name="C:/Users/itsra/google-football-pytorch/Tennis-MultiAgent/Tennis_Windows_x86_64/Tennis.exe",no_graphics=True) 
# Work with the first brain the environment exposes.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Reset once to read the problem dimensions from the first observation.
env_info = env.reset(train_mode=True)[brain_name]


num_agents = len(env_info.agents)
print('Number of agents:', num_agents)


action_size = brain.vector_action_space_size
print('Size of each action:', action_size)


# One observation row per agent; the row length is the state size.
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])


# Five episodes with random actions, run before any training.
for i in range(1, 6):                                      
    env_info = env.reset(train_mode=False)[brain_name]     
    states = env_info.vector_observations                  
    scores = np.zeros(num_agents)                          # per-agent return of this episode
    while True:
        # Standard-normal samples clipped into the [-1, 1] action range.
        actions = np.random.randn(num_agents, action_size) 
        actions = np.clip(actions, -1, 1)                 
        env_info = env.step(actions)[brain_name]           
        next_states = env_info.vector_observations         
        rewards = env_info.rewards                        
        dones = env_info.local_done                        
        scores += env_info.rewards                         
        states = next_states                               
        # The episode ends as soon as any agent reports done.
        if np.any(dones):                                 
            break
    print('Score (max over agents) from episode {}: {}'.format(i, np.max(scores)))


import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt

from ddpg_agent import Agent
# One agent object acts and learns for all agents, using the dimensions read
# from the environment.
agent = Agent(state_size, action_size, num_agents, seed=0, fc1=400, fc2=300)

torch.backends.cudnn.benchmark = True

# Choose the compute device and report which one was picked.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("working on gpu" if device.type == "cuda" else "working on cpu")
def ddpg(n_episodes=5000, target_score=1.0, checkpoint_dir='trained_weights'):
    """Train the module-level ``agent`` on the module-level ``env`` with DDPG.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        target_score (float): training stops once the mean score over the
            last (at most 100) episodes reaches this value
        checkpoint_dir (str): directory that receives the actor and critic
            checkpoints; created if it does not exist

    Returns
    =======
        list: the score of every episode that was played
    """
    import os  # local import so the function does not rely on a file-level one

    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores

    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        agent.reset_random()                               # reset noise object
        state = env_info.vector_observations
        score = 0

        while True:
            action = agent.act(state)
            env_info = env.step(np.array(action))[brain_name]
            next_state = env_info.vector_observations   # get the next state
            reward = env_info.rewards                   # get the reward
            done = env_info.local_done

            agent.step(state, action, reward, next_state, done)
            state = next_state
            # The episode score accumulates the per-step maximum over agents.
            score += max(reward)
            if np.any(done):
                break

        scores_window.append(score)       # save most recent score
        scores.append(score)              # save most recent score
        mean_score = np.mean(scores_window)

        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, mean_score), end="")
        if i_episode % 300 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, mean_score))

        if mean_score >= target_score:
            # Clamp at zero: the threshold can be reached before 100 episodes
            # have been played.
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(max(i_episode-100, 0), mean_score))

            # Make sure the directory exists so a finished run is not lost to
            # a FileNotFoundError while saving.
            os.makedirs(checkpoint_dir, exist_ok=True)

            torch.save({
                        'model_state_dict': agent.critic_local.state_dict(),
                        'optimizer_state_dict': agent.optimizer_critic.state_dict(),
                        }, os.path.join(checkpoint_dir, 'checkpoint_critic.pth'))

            torch.save({
                        'model_state_dict': agent.actor_local.state_dict(),
                        'optimizer_state_dict': agent.optimizer_actor.state_dict(),
                        }, os.path.join(checkpoint_dir, 'checkpoint_actor.pth'))
            break
    return scores
# Run training with the default settings; `scores` holds one entry per episode.
scores = ddpg()

# plot the scores
#fig = plt.figure()
#ax = fig.add_subplot(111)
#plt.plot(np.arange(len(scores)), scores)
#plt.ylabel('Score')
#plt.xlabel('Episode #')
#plt.show()

# Disabled evaluation run, kept as a bare string literal so it never executes.
# NOTE(review): ddpg() saves dicts with 'model_state_dict' and
# 'optimizer_state_dict' keys, so the load_state_dict calls below would need
# torch.load(...)['model_state_dict'] before this snippet can be re-enabled.
'''
agent.critic_local.load_state_dict(torch.load('trained_weights/checkpoint_critic.pth'))
agent.actor_local.load_state_dict(torch.load('trained_weights/checkpoint_actor.pth'))

env_info = env.reset(train_mode=False)[brain_name] # reset the environment
state = env_info.vector_observations            # get the current state
#print(state.shape)
score = 0    
reward_i=[]# initialize the score
past_a=deque(maxlen=5)
while True:
    action=[]

    action=agent.act(state)        # select an action
    env_info = env.step(np.array(action))[brain_name]        # send the action to the environment
    next_state = env_info.vector_observations   # get the next state
    reward = env_info.rewards                  # get the reward
    done = env_info.local_done                 # see if episode has finished
    score += np.mean(reward)                   # update the score
    state = next_state                         # roll over the state to next time step

    if np.any(done):                           # exit loop if episode finished
        break
    
print("Score: {}".format(score))'''

# Close the Unity environment once training has finished.
env.close()