In [1]:
import gym
from gym import envs
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import ResNet50,MobileNetV2,Xception
from tensorflow.keras.models import Sequential
import pickle

In [2]:
trained_model=MobileNetV2(input_shape=(160,160,3),           
                    include_top=False,
                    weights='imagenet') 

for layer in trained_model.layers:
    layer.trainable=False
#trained_model.summary()

In [3]:
env = gym.make('Pong-v0')
seed = 42
env.seed(seed)
tf.random.set_seed(seed)
np.random.seed(seed)
#env._max_episode_steps = 1000
eps = np.finfo(np.float32).eps.item()

def create_model(mod):
  inputs = layers.Input(shape=(160,160,3))
  x = mod(inputs,training=False)
  x1 = layers.GlobalAveragePooling2D()(x)
  actor = layers.Dense(env.action_space.n,activation='linear')(x1)
  #critic = layers.Dense(1,activation = 'linear')(x1)

  return keras.Model(inputs=inputs, outputs=actor)

model = create_model(trained_model)
model.compile(optimizer=Adam(learning_rate=0.001),loss="mean_squared_error")
target_model = create_model(trained_model)
target_model.compile(optimizer=Adam(learning_rate=0.001),loss="mean_squared_error")
#model.summary()

A.L.E: Arcade Learning Environment (version +978d2ce)
[Powered by Stella]


In [4]:
def discount_rewards(r):

  r = np.array(r)
  discounted_r = np.zeros_like(r)
  running_add = 0
  gamma = 0.99

  for t in range(r.size-2,-1,-1):
    discounted_r[t] = r[t]+discounted_r[t+1]*gamma
  discounted_r -= np.mean(discounted_r) #normalizing the result
  discounted_r /= (np.std(discounted_r)+eps) #idem using standar deviation
  return discounted_r.astype(np.float32)

def prepro(k):
    k=k[:160,:,:]
    return k

In [None]:

total_rewards = 0
frame_count = 0
episode_count = 0
max_memory_length = 100000
update_after = 4
update_target_after = 1000
epsilon = 1
gamma = 0.99

action_history = []
state_history = []
state_next_history = []
rewards_history = []
done_history = []
episode_reward_history = []
action_taken=[]
prev = None
flag=0
for i in range(100):
  state = np.array(env.reset())
  rewards=0

  for step in range(1,10000):
    frame_count+=1

    if frame_count < 80000 or epsilon > np.random.rand(1)[0]:
      action = np.random.choice(6)
      state = prepro(state)
    else:
      state = prepro(state)
      state_tensor = tf.convert_to_tensor(state)
      state_tensor = tf.expand_dims(state_tensor,0)
      action_prob = model.predict(state_tensor)
      action = tf.random.categorical(action_prob, 1)[0, 0]
      action = action.numpy()

    epsilon -= 0.99/100000
    epsilon = max(epsilon,0.1)

    state_next,reward,done,info = env.step(action)
    state_next = np.array(state_next)
    state_next1 = prepro(state_next)

    rewards += reward

    action_history.append(action)
    state_history.append(state)
    state_next_history.append(state_next1)
    done_history.append(done)
    rewards_history.append(reward)

    state = state_next

    if frame_count % 4 == 0 and len(done_history) > 32:
      indices = np.random.choice(range(len(done_history)),size=32)

      state_sample = np.array([state_history[i] for i in indices])
      state_next_sample = np.array([state_next_history[i] for i in indices])
      rewards_sample = [rewards_history[i] for i in indices]
      action_sample = [action_history[i] for i in indices]
      done_sample = tf.convert_to_tensor([float(done_history[i]) for i in indices])

      future_rewards = target_model.predict(state_next_sample)
      current_rewards = model.predict(state_sample)
      updated_q = current_rewards[:]

      a = [i for i in range(6)]
      for idx,terminal in enumerate(done_sample):
        if terminal:
          future_rewards[idx]=0.0
        updated_q[idx,action_sample[idx]] = rewards_sample[idx] + gamma*tf.reduce_max(future_rewards,axis=1)[idx]

      model.train_on_batch(state_sample,updated_q)

    if frame_count % update_target_after ==0:
      target_model.set_weights(model.get_weights())
      #print("reward",total_rewards,"episode:",episode_count,"frame:",frame_count)

    if len(rewards_history)>max_memory_length:
      del rewards_history[:1]
      del state_history[:1]
      del state_next_history[:1]
      del action_history[:1]
      del done_history[:1]

    if done:
      break

  print("episode reward:",rewards,"mean reward",total_rewards,"episode:",episode_count,"frame:",frame_count)
  episode_reward_history.append(rewards)
  if len(episode_reward_history)>100:
    del episode_reward_history[:1]
  total_rewards = np.mean(episode_reward_history)

  episode_count+=1
    
  if episode_count%20==0:
    !rm -rf ./saved_models/
    !mkdir saved_models
    model.save(f'./saved_models/model_{episode_count}.h5')
    target_model.save(f'./saved_models/target_{episode_count}.h5')
    d = {'ac':action_history,'st':state_history,'stn':state_next_history,
         'r':reward_history,'d':done_history,'epr':episode_reward_history,
         'at':action_taken,'ep':epsilon}
    file = open('history','wb')
    pickle.dump(d,file)
    file.close()

  if total_rewards > -5:
    print("solved at episode: ",episode_count)
    break
#env.play()

episode reward: -21.0 mean reward 0 episode: 0 frame: 1319
episode reward: -20.0 mean reward -21.0 episode: 1 frame: 2821
episode reward: -20.0 mean reward -20.5 episode: 2 frame: 4188
