In [1]:
import gfootball.env as football_env
import time
import pprint
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

from FeatureEncoder import *
from ppo import *

import torch.multiprocessing as mp 
import time
        
def actor(actor_num, center_model, data_queue, signal_queue, rollout_len):
    """Run an endless rollout-collection loop in a worker process.

    The actor plays episodes of the ``academy_empty_goal`` scenario with a
    local copy of the policy, and pushes fixed-length lists of transitions
    to ``data_queue``.

    Args:
        actor_num: Index of this actor; used in the start-up message, and only
            actor 0 prints episode statistics.
        center_model: Model whose ``state_dict()`` is copied into the local
            policy before every environment step.
        data_queue: Queue that receives each completed rollout (a list of
            ``rollout_len`` transitions).
        signal_queue: Queue used as a pause flag; while it is non-empty the
            actor sleeps instead of stepping the environment.
        rollout_len: Number of transitions collected before a rollout is
            pushed to ``data_queue``.

    This function never returns.
    """
    print("actor {} started".format(actor_num))
    # Scenario notes kept from the original source:
    #   11_vs_11_easy_stochastic
    #   academy_empty_goal_close 300 epi done
    #   academy_empty_goal 450 epi done
    model = PPO()
    env = football_env.create_environment(env_name="academy_empty_goal", representation="raw", stacked=False, logdir='/tmp/football', write_goal_dumps=False, write_full_episode_dumps=False, render=False)
    fe = FeatureEncoder()

    n_epi = 0
    # NOTE: score is NOT reset per episode; it is only cleared when actor 0
    # prints, so the logged reward is the sum since the previous log line.
    score = 0
    rollout = []

    while True:
        env.reset()
        done = False
        steps = 0
        n_epi += 1

        while not done:
            # Wait while the pause flag is raised (signal_queue non-empty).
            while signal_queue.qsize() > 0:
                time.sleep(0.02)
            # Refresh the local policy from the shared model before acting.
            model.load_state_dict(center_model.state_dict())

            obs = env.observation()
            state_dict = fe.encode(obs[0])
            state_dict_tensor = {
                "player": torch.from_numpy(state_dict["player"]).float().unsqueeze(0),
                "ball": torch.from_numpy(state_dict["ball"]).float().unsqueeze(0),
                "left_team": torch.from_numpy(state_dict["left_team"]).float().unsqueeze(0),
                "right_team": torch.from_numpy(state_dict["right_team"]).float().unsqueeze(0),
            }

            # Pure inference: no gradients are needed while collecting data,
            # so skip building an autograd graph on every step.
            with torch.no_grad():
                prob, _ = model(state_dict_tensor)
            m = Categorical(prob)
            a = m.sample().item()

            obs, rew, done, info = env.step(a)
            state_prime_dict = fe.encode(obs[0])

            # (state, action, reward, next_state, prob of chosen action, done)
            transition = (state_dict, a, rew, state_prime_dict, prob[0][a].item(), done)
            rollout.append(transition)

            # Rollouts may span episode boundaries; they are cut purely by length.
            if len(rollout) == rollout_len:
                data_queue.put(rollout)
                rollout = []

            steps += 1
            score += rew

            if done:
                if n_epi % 10 == 0 and actor_num == 0:
                    print("%d, Done, Step %d Reward: %f" % (n_epi, steps, score))
                    score = 0


def learner(center_model, queue, signal_queue, batch_size, buffer_size):
    """Consume rollouts from ``queue`` and train the policy in an endless loop.

    Args:
        center_model: Model that provides the initial weights and receives the
            trained weights after every update.
        queue: Queue from which rollouts are read.
        signal_queue: Queue used as a flag; an item is put on it before the
            rollouts are drained and removed again after the weights have
            been copied back to ``center_model``.
        batch_size: Number of rollouts combined into one mini-batch.
        buffer_size: Number of mini-batches passed to one ``train_net`` call.

    This function never returns.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = PPO(device)
    model.load_state_dict(center_model.state_dict())
    model.to(device)

    print("learner")

    # An update starts only once the queue holds strictly more than this many rollouts.
    threshold = batch_size * buffer_size

    while True:
        if not queue.qsize() > threshold:
            # Not enough data yet; poll again shortly.
            time.sleep(0.1)
            continue

        # Raise the flag for the duration of the update.
        signal_queue.put(1)

        batches = []
        for _ in range(buffer_size):
            rollouts = [queue.get() for _ in range(batch_size)]
            batches.append(model.make_batch(rollouts))

        model.train_net(batches)
        center_model.load_state_dict(model.state_dict())

        # Lower the flag again.
        signal_queue.get()
    

if __name__ == '__main__':
    # Entry point: start one learner process and `num_processes` actor
    # processes that communicate through two queues, then wait on them.

    # hyperparameters
    num_processes = 4      # train
    batch_size = 32        # learner (previously 16)
    buffer_size = 5        # learner
    rollout_len = 10       # actor

    np.set_printoptions(precision=3, suppress=True)
    pp = pprint.PrettyPrinter(indent=4)
    torch.set_num_threads(1)

    # The model is placed in shared memory before being handed to the workers.
    center_model = PPO()
    center_model.share_memory()
    data_queue = mp.Queue()
    signal_queue = mp.Queue()

    # The learner is started first, followed by the actors in rank order.
    learner_args = (center_model, data_queue, signal_queue, batch_size, buffer_size)
    p = mp.Process(target=learner, args=learner_args)
    p.start()
    processes = [p]

    for rank in range(num_processes):
        actor_args = (rank, center_model, data_queue, signal_queue, rollout_len)
        p = mp.Process(target=actor, args=actor_args)
        p.start()
        processes.append(p)

    # Block until every worker exits.
    for p in processes:
        p.join()
        
        


actor 0 started
actor 1 started
actor 2 started
actor 3 started
learner
10, Done, Step 253 Reward: 0.000000
20, Done, Step 38 Reward: 1.000000
30, Done, Step 292 Reward: 0.000000


Process Process-4:
Process Process-1:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-1-f6be280ec56d>", line 105, in learner
    time.sleep(0.1)
  File "<ipython-input-1-f6be280ec56d>", line 60, in actor
    obs, rew, done, info = env.step(a)
KeyboardInterrupt
  File "/usr/local/lib/python3.6/dist-packages/gym/core.py", line 229, in step
    return self.env.step(action)
  File "/usr/local/lib/python3.6/dist-packages/gym/core.py", line 275, in step
    observation, reward, done, info = self.env.step(action)
  File "/usr/loca

KeyboardInterrupt: 

In [4]:
# Scratch check: add 1 to a scalar tensor and print the result.
x = torch.add(torch.tensor(1.0), 1)
print(x)

tensor(2.)
