In [None]:
%%capture

!apt-get update
!apt-get install libsdl2-gfx-dev libsdl2-ttf-dev

# Make sure that the Branch in git clone and in wget call matches !!
!git clone -b v2.9 https://github.com/google-research/football.git
!mkdir -p football/third_party/gfootball_engine/lib

!wget https://storage.googleapis.com/gfootball/prebuilt_gameplayfootball_v2.8.so -O football/third_party/gfootball_engine/lib/prebuilt_gameplayfootball.so
!cd football && GFOOTBALL_USE_PREBUILT_SO=1 python3 -m pip install .

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by later cells
# (the evaluation cell loads a saved actor checkpoint from /content/drive/MyDrive/...).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import gfootball.env as football_env
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from tensorflow.python.framework.ops import disable_eager_execution
# Switch TensorFlow to graph (non-eager) execution for the whole notebook.
# NOTE(review): the actor's loss (get_ppo_loss) closes over symbolic Input tensors,
# which presumably only works in graph mode — confirm before removing this call.
disable_eager_execution()

In [None]:
def get_actor(dims_in, dims_out):
  """Build and compile the PPO actor (policy) network.

  Besides the observation, the model declares four auxiliary inputs (old
  action probabilities, advantages, critic values and rewards). They are not
  connected to any layer; they are handed to get_ppo_loss so the loss can
  read them while training.

  Args:
    dims_in: shape of a single observation, passed to Input(shape=...).
    dims_out: number of discrete actions. Sizes both the softmax output and
      the old-probabilities input (previously the output used the global
      n_actions and ignored this argument).

  Returns:
    A compiled Keras Model mapping
    [observation, old_probs, advantages, q_values, rewards] to a vector of
    action probabilities. The learning rate is read from the module-level
    `learning_rate`.
  """
  # Named obs_in rather than `input` to avoid shadowing the Python builtin.
  obs_in = Input(shape = dims_in, name = 'Input')
  old_probs = Input(shape = (1, dims_out, ), name = 'Old_probabilities')
  advantages = Input(shape = (1, 1, ), name = 'Advantages')
  q_values = Input(shape = (1, 1, ), name = 'Q-values')
  rewards = Input(shape = (1, 1, ), name = 'Rewards')

  x = Dense(256, activation = 'tanh')(obs_in)
  x = Dense(128, activation = 'tanh')(x)
  # Output width follows the dims_out argument so it always matches old_probs.
  output = Dense(dims_out, activation = 'softmax', name = 'Predictions')(x)

  model = Model(inputs = [obs_in, old_probs, advantages, q_values, rewards], outputs = [output])
  # `learning_rate` is the current keyword; `lr` is a deprecated alias.
  model.compile(optimizer=Adam(learning_rate = learning_rate),
                loss = get_ppo_loss(old_probs, advantages, q_values, rewards))

  return model


def get_critic(dims_in):
  """Build and compile the critic (state-value) network.

  Args:
    dims_in: shape of a single observation, passed to Input(shape=...).

  Returns:
    A compiled Keras Model mapping an observation to one scalar value,
    trained with mean-squared error. The learning rate is read from the
    module-level `learning_rate`.
  """
  # Named obs_in rather than `input` to avoid shadowing the Python builtin.
  obs_in = Input(shape = dims_in, name = 'Input')

  x = Dense(256, activation = 'tanh')(obs_in)
  x = Dense(128, activation = 'tanh')(x)
  # Linear output: the value estimate is unbounded.
  output = Dense(1, name = 'Predictions')(x)

  model = Model(inputs = obs_in, outputs = output)
  # `learning_rate` is the current keyword; `lr` is a deprecated alias.
  model.compile(optimizer=Adam(learning_rate = learning_rate), loss = 'mse')

  return model

def get_ppo_loss(old_probs, advantages, q_values, rewards):
  """Create the PPO clipped-surrogate loss used to train the actor.

  Args:
    old_probs: tensor of action probabilities recorded when the action was
      sampled. May carry an extra singleton axis, e.g. (batch, 1, n_actions).
    advantages: tensor of advantage estimates, one per sample.
    q_values: tensor of critic value estimates, one per sample.
    rewards: tensor of rewards, one per sample.

  Returns:
    A Keras loss function get_loss(y_true, y_pred). y_true is expected to be
    the one-hot encoding of the action that was taken and y_pred the actor's
    current action probabilities. The clipping range, critic weight and
    entropy weight are read from the module-level `loss_clipping`,
    `critic_discount` and `entropy_loss`.
  """

  def get_loss(y_true, y_pred):

    new_probs = y_pred

    # Flatten the auxiliary tensors so they line up row-by-row with y_pred.
    # Without this, (batch, n_actions) against (batch, 1, n_actions) broadcasts
    # to (batch, batch, n_actions) and pairs every sample with every other one.
    old = K.reshape(old_probs, K.shape(new_probs))
    adv = K.reshape(advantages, (-1, 1))
    values = K.reshape(q_values, (-1, 1))
    rews = K.reshape(rewards, (-1, 1))

    # PPO's probability ratio is defined for the action actually taken, so
    # select that action's probability with the one-hot y_true.
    new_taken = K.sum(y_true * new_probs, axis = -1, keepdims = True)
    old_taken = K.sum(y_true * old, axis = -1, keepdims = True)

    # exp(log a - log b) == a / b; the epsilon guards against log(0).
    r = K.exp(K.log(new_taken + 1e-10) - K.log(old_taken + 1e-10))

    actor_loss = -K.mean(K.minimum(r * adv,
        K.clip(r, min_value=1 - loss_clipping, max_value=1 + loss_clipping) * adv))

    # NOTE(review): when rewards and q_values are plain model inputs (as in
    # get_actor) this term does not depend on the actor's weights — confirm
    # whether it is meant to contribute anything beyond the reported loss value.
    critic_loss = K.mean(K.square(rews - values))

    # Entropy bonus: subtracting it rewards less peaked action distributions.
    entropy = K.mean(-(new_probs * K.log(new_probs + 1e-10)))

    total_loss = critic_discount * critic_loss + actor_loss - entropy_loss * entropy

    return total_loss

  return get_loss

def get_normalize(x):
  """Standardize x to zero mean and unit standard deviation.

  The mean and standard deviation are taken over all elements of x. A small
  epsilon (1e-10) is added to the standard deviation so constant input does
  not divide by zero.
  """
  centered = x - np.mean(x)
  return centered / (np.std(x) + 1e-10)

def get_advantages(q_values, statuses, rewards):
  """Compute GAE-style returns and normalized advantages for one rollout.

  Args:
    q_values: value estimates, one per step plus one extra trailing entry
      used to bootstrap the last step (len(rewards) + 1 entries are indexed).
    statuses: per-step multipliers applied to the next-step value and the
      running estimate; the training loop passes `not done`, so a finished
      episode contributes 0.
    rewards: per-step rewards.

  Returns:
    (returns, advantages): `returns` is a list with one entry per step
    (running estimate plus that step's value); `advantages` is
    returns - q_values[:-1], shifted to zero mean and scaled by its standard
    deviation. The discount factors are read from the module-level `gamma`
    and `lambda_`.
  """
  newest_first = []
  running = 0

  # Walk the rollout backwards so each step can reuse the running estimate
  # already accumulated for the steps that follow it.
  for t in range(len(rewards) - 1, -1, -1):
    td_error = rewards[t] + gamma * q_values[t + 1] * statuses[t] - q_values[t]
    running = td_error + gamma * lambda_ * statuses[t] * running
    newest_first.append(running + q_values[t])

  # Restore chronological order.
  returns = newest_first[::-1]

  adv = np.array(returns) - q_values[:-1]
  normalized = (adv - np.mean(adv)) / (np.std(adv) + 1e-10)

  return returns, normalized

def get_empty_arrays():
  """Return seven fresh, independent empty lists for one rollout.

  Callers unpack them, in order, as:
  states, actions, q_values (critic outputs), statuses (whether the game is
  over), rewards, actions_ohe (one-hot actions), actions_probs (action
  probabilities).
  """
  # One distinct list per buffer, so appending to one never affects another.
  return tuple([] for _ in range(7))

def get_action(state_in):
  """Sample an action from the actor's current policy.

  Args:
    state_in: the observation with a leading batch axis, as passed to
      model_actor.predict.

  Returns:
    (action, action_ohe, actions_dist): the sampled action index, a length
    n_actions vector with a 1 at that index, and the raw prediction returned
    by the actor (its first row is the sampling distribution).
  """
  # The dummy arrays fill the actor's auxiliary inputs, which only the loss reads.
  actions_dist = model_actor.predict([state_in, dummy_1, dummy_2, dummy_2, dummy_2], steps = 1)

  policy = actions_dist[0, :]
  action = np.random.choice(n_actions, p = policy)

  # One-hot encode the sampled action.
  action_ohe = np.zeros(n_actions)
  action_ohe[action] = 1

  return action, action_ohe, actions_dist


In [None]:
# create the academy_empty_goal environment used for training
env = football_env.create_environment(env_name="academy_empty_goal",
                                      stacked=False, logdir='/tmp/football', 
                                      write_goal_dumps=False, 
                                      write_full_episode_dumps=False, 
                                      render=False, representation = 'simple115v2',
                                      rewards = 'scoring')


# get the dimensions of the observation space in the environment
obs_space = env.observation_space.shape
#print(obs_space)

# get the number of actions in the environment
n_actions = env.action_space.n
#print(n_actions)

#hyperparameters (read as globals by the helper functions)
# loss_clipping: half-width of the ratio clipping range in get_ppo_loss
# entropy_loss: weight of the entropy term in get_ppo_loss
# gamma, lambda_: discount factors used in get_advantages
# critic_discount: weight of the critic term in get_ppo_loss
# learning_rate: Adam learning rate for both actor and critic
loss_clipping = 0.2
entropy_loss = 0.005
gamma = 0.99
lambda_ = 0.95
critic_discount = 0.5
learning_rate = 1e-4


#dummy values: placeholders for the actor's auxiliary (loss-only) inputs at predict time
dummy_1 = np.zeros((1, 1, n_actions))
dummy_2 = np.zeros((1, 1, 1))

#get models
model_actor = get_actor(obs_space, n_actions)
model_critic = get_critic(obs_space)

# training-loop state
# episode / episodes: loop counter and its upper bound (the loop runs while episode <= episodes)
# ppo_steps: environment steps collected before each model update
# epochs: epochs passed to each fit() call
# all_rewards: summed reward of every rollout
# NOTE(review): best_rew is not referenced anywhere else in the visible cells — confirm whether it is still needed.
episode = 0
episodes = 200
ppo_steps = 128
epochs = 5
best_rew = 0
all_rewards = []

In [None]:
# Training loop: collect a fixed-length rollout with the current policy, then
# update the actor and the critic on it. Repeats while episode <= episodes.
state = env.reset()

while episode <= episodes:
  print(episode)

  states, actions, q_values, statuses, rewards, actions_ohe, actions_probs = get_empty_arrays()

  for i in range(ppo_steps):

    # NOTE(review): with eager execution disabled, K.expand_dims presumably adds
    # a new op to the graph on every call; np.expand_dims may be a cheaper
    # alternative — verify predict(..., steps = 1) still accepts it before switching.
    state_in = K.expand_dims(state, 0)

    action, action_ohe, actions_dist = get_action(state_in)

    q_value = model_critic.predict(state_in, steps = 1)

    obs, rew, done, info = env.step(action)

    print("Step", i, "; Action", action, "; Reward", rew, "; Q-value", q_value[0])

    # 1 while the game continues, 0 once it is over; masks the bootstrap in get_advantages.
    status = not done

    states.append(state)
    actions.append(action)
    actions_ohe.append(action_ohe)
    q_values.append(q_value)
    statuses.append(status)
    rewards.append(rew)
    actions_probs.append(actions_dist)

    # Continue from the observation returned by reset() when the game ended.
    # The return value used to be discarded, so the next action was chosen from
    # the finished game's last observation.
    state = env.reset() if done else obs

  all_rewards.append(sum(rewards))

  # Bootstrap with the value of the state reached AFTER the last step.
  # Previously this reused state_in, i.e. the state before the last step.
  q_value = model_critic.predict(K.expand_dims(state, 0), steps = 1)
  q_values.append(q_value)

  returns, advantages = get_advantages(q_values, statuses, rewards)

  actor_loss = model_actor.fit(
          [states, 
          actions_probs, 
          advantages,
          q_values[:-1],
          np.reshape(rewards, newshape = (-1, 1, 1))
          ],
          [(np.reshape(actions_ohe, newshape = (-1, n_actions)))], 
          verbose=1, 
          shuffle=False, 
          epochs=epochs)
  
  critic_loss = model_critic.fit(
          [states], 
          [np.reshape(returns, newshape=(-1, 1))],
          verbose=1,
          shuffle=False, 
          epochs=epochs)

  # Checkpoint both models every fifth iteration.
  if episode % 5 == 0:
    model_actor.save('model_actor2.hdf5')
    model_critic.save('model_critic2.hdf5')
    print(all_rewards)

  if episode == episodes:
    print('Done')

  episode += 1
  # Start the next rollout from a fresh game and keep its initial observation.
  state = env.reset()

# close environment
env.close() 

In [None]:
def get_test_reward_dist(model, lim):
    """Play one game with actions sampled from `model` and return its total reward.

    Args:
        model: actor taking [observation, dummy_1, dummy_2, dummy_2, dummy_2]
            and returning action probabilities.
        lim: step budget; the game is cut short once more than `lim` steps
            have been taken (so at most lim + 1 steps are played).

    Returns:
        The sum of the rewards received during the game.
    """
    observation = env.reset()
    finished = False
    episode_return = 0
    steps_taken = 0

    while not finished:
        batch = K.expand_dims(observation, 0)

        # Dummy arrays stand in for the actor's auxiliary inputs.
        action_probs = model.predict([batch, dummy_1, dummy_2, dummy_2, dummy_2], steps=1)
        chosen = np.random.choice(n_actions, p = action_probs[0, :])

        observation, reward, finished, _ = env.step(chosen)
        episode_return += reward
        steps_taken += 1

        if steps_taken > lim:
            break

    return episode_return


# Recreate the environment for evaluation (the training cell closes its env when it finishes).
# NOTE(review): rewards is 'scoring, checkpoint' here but 'scoring' in the training cell.
# gfootball's shaped reward is presumably spelled 'checkpoints' and listed without a space
# after the comma — confirm this string is accepted and does what is intended.
env = football_env.create_environment(env_name="academy_empty_goal",
                                      stacked=False, logdir='/tmp/football', 
                                      write_goal_dumps=False, 
                                      write_full_episode_dumps=False, 
                                      render=False, representation = 'simple115v2',
                                      rewards = 'scoring, checkpoint')

# Load a saved actor from the mounted Google Drive. compile = False skips restoring the
# training configuration, so the custom PPO loss is not needed to load the file.
# NOTE(review): this path differs from 'model_actor2.hdf5', which the training loop writes —
# confirm which checkpoint should be evaluated.
test_model_actor = load_model('/content/drive/MyDrive/Colab Notebooks/RL project/Actor-Critic/model_actor_11.06.hdf5', compile = False)

# Average the total reward over 20 games, each with a step budget of 128.
avg_rew = np.mean([get_test_reward_dist(test_model_actor, 128) for _ in range(20)])

print('Average test reward:', avg_rew)