In [None]:
# Install gym and pyvirtualdisplay. stdout and stderr are redirected to /dev/null,
# so a failed install produces no visible message.
# NOTE(review): gym is unpinned, while later cells rely on `env.seed(...)`, a 4-tuple
# `env.step(...)` result and `envs.registry.all()` — presumably an older gym release
# is required; confirm and pin the version.
!pip install gym pyvirtualdisplay > /dev/null 2>&1
# System packages (xvfb, python-opengl, ffmpeg); output is likewise discarded.
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
# Provides `colabgymrender.recorder.Recorder`, imported further down; pinned to 1.0.2.
!pip install colabgymrender==1.0.2

In [None]:
! wget http://www.atarimania.com/roms/Roms.rar
! mkdir /content/ROM/
! unrar e /content/Roms.rar /content/ROM/
! python -m atari_py.import_roms /content/ROM/

In [None]:
# !pip3 install Box2D
# !pip3 install box2d-py
# !pip3 install gym[Box_2D]
import gym
from gym import envs
from colabgymrender.recorder import Recorder
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Sequential

In [None]:
# Print every environment spec currently registered with gym, one per line.
d = envs.registry.all()
for registered_spec in d:
  print(registered_spec)

In [None]:
# ImageNet-pretrained ResNet50 without its classification head, built for
# 160x160 RGB inputs.
trained_model = ResNet50(
    weights='imagenet',
    include_top=False,
    input_shape=(160,160,3),
)

# Freeze every layer of the base so its weights are excluded from training.
for base_layer in trained_model.layers:
    base_layer.trainable = False

trained_model.summary()

In [None]:
# One seed shared by the environment, NumPy and TensorFlow.
seed = 42

# Pong, with every observation resized to 160x160 by the wrapper.
env = gym.wrappers.ResizeObservation(gym.make('Pong-v0'), (160,160))

env.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

# Machine epsilon of float32, stored as a plain Python float.
eps = np.finfo(np.float32).eps.item()

def create_model(mod):
  """Build a two-headed actor-critic network on top of a feature extractor.

  Args:
    mod: Keras model applied to a (160, 160, 3) input; it is always called
      with `training=False`.

  Returns:
    A `keras.Model` whose outputs are `[actor, critic]`: a linear layer with
    `env.action_space.n` units (one logit per action of the global `env`)
    and a linear layer with a single unit (the state-value estimate).

  NOTE(review): frames are fed to `mod` without any ImageNet-style
  preprocessing — confirm this is intended for the pretrained base.
  """
  frame_in = layers.Input(shape=(160,160,3))

  # Spatially pooled features are shared by both heads.
  features = layers.GlobalAveragePooling2D()(mod(frame_in,training=False))

  policy_logits = layers.Dense(env.action_space.n,activation='linear')(features)
  state_value = layers.Dense(1,activation='linear')(features)

  return keras.Model(inputs=frame_in, outputs=[policy_logits,state_value])

# Actor-critic network built on the frozen ResNet50 base.
model = create_model(trained_model)

# Huber loss summed over all elements (used by compute_loss for the critic
# term) and the Adam optimizer that updates the model's trainable variables.
loss_fn = keras.losses.Huber(reduction=keras.losses.Reduction.SUM)
optimizer = Adam(learning_rate=0.001)

model.summary()

In [None]:
def discount_rewards(r, gamma=0.99):
  """Return the standardized discounted return for every step of an episode.

  For each step t the return is G_t = r_t + gamma * G_{t+1}, accumulated from
  the last step backwards. The final reward is included (the accumulation
  starts at the last index), so an episode whose only non-zero reward is its
  last one still yields a non-trivial signal. The returns are then shifted to
  zero mean and scaled to (approximately) unit standard deviation.

  Args:
    r: 1-D sequence of per-step rewards (ints or floats).
    gamma: discount factor applied per step; defaults to 0.99.

  Returns:
    A float32 NumPy array with one standardized return per input reward.
    An empty input produces an empty float32 array.
  """
  # Force a float buffer: an integer reward list would otherwise create an
  # integer array and make the in-place normalization below fail.
  r = np.asarray(r, dtype=np.float64)
  if r.size == 0:
    return np.zeros(0, dtype=np.float32)

  discounted_r = np.zeros_like(r)
  running_add = 0.0
  for t in range(r.size - 1, -1, -1):
    running_add = r[t] + gamma * running_add
    discounted_r[t] = running_add

  # Standardize; the float32 machine epsilon keeps the division finite when
  # every return is identical (standard deviation of zero).
  discounted_r -= np.mean(discounted_r)
  discounted_r /= (np.std(discounted_r) + np.finfo(np.float32).eps.item())
  return discounted_r.astype(np.float32)

def compute_loss(action_probs,values,returns):
  """Combined actor-critic loss for one episode.

  Args:
    action_probs: probabilities of the actions that were taken.
    values: critic estimates for the visited states.
    returns: discounted returns for the same steps.
    (The training loop passes all three as column tensors of shape (T, 1).)

  Returns:
    Scalar tensor: the negated sum of log-probability times advantage
    (actor term) plus `loss_fn(values, returns)` (critic term).
  """
  log_probs = tf.math.log(action_probs)

  # Advantage = how much better the observed return was than the critic's estimate.
  policy_term = -tf.math.reduce_sum(log_probs * (returns - values))
  value_term = loss_fn(values, returns)

  return policy_term + value_term

In [None]:

# Statistics carried across episodes.
total_rewards = 0            # mean reward over the most recent episodes (at most 100)
frame_count = 0              # environment steps taken so far
episode_count = 0
gamma = 0.99
episode_reward_history = []  # rewards of the most recent episodes (at most 100)

# Train for at most 300 episodes, performing one gradient update per episode.
for i in range(300):
  # Per-episode buffers.
  rewards, action_probs, values = [], [], []
  state = np.array(env.reset())
  done = False

  # The whole rollout runs under the tape so the episode loss can be
  # differentiated with respect to the model's trainable variables.
  with tf.GradientTape() as tape:
    while not done:
      frame_count += 1

      # Batch of one frame -> action logits and state value.
      state_tensor = tf.expand_dims(tf.convert_to_tensor(state), 0)
      action_prob, value = model(state_tensor)

      # Sample an action from the logits, then turn the logits into probabilities.
      action = tf.random.categorical(action_prob, 1)[0, 0].numpy()
      action_probs_t = tf.nn.softmax(action_prob)

      state_next, reward, done, info = env.step(action)

      rewards.append(reward)
      action_probs.append(action_probs_t[0][action])
      values.append(tf.squeeze(value))

      state = np.array(state_next)

    # Turn the episode into column tensors and evaluate the loss.
    returns = discount_rewards(rewards)
    action_probs, values, returns = (
        tf.expand_dims(x, 1) for x in (action_probs, values, returns)
    )
    loss = tf.convert_to_tensor([compute_loss(action_probs, values, returns)])

  grads = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(grads, model.trainable_variables))

  # Keep a rolling window of the last 100 episode rewards.
  episode_reward = sum(rewards)
  episode_reward_history.append(episode_reward)
  if len(episode_reward_history) > 100:
    episode_reward_history.pop(0)
  total_rewards = np.mean(episode_reward_history)

  episode_count += 1
  print("episode reward",episode_reward,"reward",total_rewards,"episode:",episode_count,"frame:",frame_count)

  # Stop once the rolling mean reward exceeds 19.
  if total_rewards > 19:
    print("The environment was solved at episode:",episode_count)
    break