In [0]:
import tensorflow as tf
from keras import Sequential
from keras.layers import *
from keras.optimizers import Adam
import numpy as np
import random
import gym
class FrameProcessor(object):
    """Graph-mode pipeline that shrinks a raw RGB frame to a grayscale image.

    The TensorFlow ops are created once in the constructor; calling the
    instance evaluates them inside a session supplied by the caller.
    """

    def __init__(self, frame_height=84, frame_width=84):
        """Build the preprocessing ops.

        Args:
            frame_height: Height of the resized output frame.
            frame_width: Width of the resized output frame.
        """
        self.frame_height = frame_height
        self.frame_width = frame_width
        # The input placeholder is fixed to a 210x160x3 uint8 frame.
        self.frame = tf.placeholder(shape=[210, 160, 3], dtype=tf.uint8)

        grayscale = tf.image.rgb_to_grayscale(self.frame)
        # Keep the 160x160 window whose top-left corner is at row 34, column 0.
        cropped = tf.image.crop_to_bounding_box(grayscale, 34, 0, 160, 160)
        output_size = [self.frame_height, self.frame_width]
        self.processed = tf.image.resize_images(
            cropped,
            output_size,
            method=tf.image.ResizeMethod.NEAREST_NEIGHBOR,
        )

    def __call__(self, session, frame):
        """Evaluate the pipeline on ``frame`` in ``session`` and return the result."""
        feeds = {self.frame: frame}
        return session.run(self.processed, feed_dict=feeds)

    
class Atari:
    """Pong environment wrapper that keeps a rolling stack of processed frames.

    After ``reset``/``step``:
        pre_state: frame stack of shape (frame_height, frame_width, no_of_frames).
        state:     the same stack with a leading batch axis of size 1, i.e.
                   (1, frame_height, frame_width, no_of_frames).

    NOTE: ``reset`` and ``step`` read a module-level ``session`` (a
    ``tf.Session``); it must be defined before either method is called.
    """

    def __init__(self, frame_height=84, frame_width=84, no_of_frames=4):
        """Create the gym environment and the frame preprocessor.

        Args:
            frame_height: Height of each processed frame.
            frame_width: Width of each processed frame.
            no_of_frames: Number of most recent frames kept in the stack.
        """
        self.frame_height = frame_height
        self.frame_width = frame_width
        self.no_of_frames = no_of_frames
        self.pre_state = None
        self.state = None
        self.env = gym.make("Pong-v4")
        # Forward the requested size; previously the preprocessor was always
        # built with its own defaults and these arguments were ignored.
        self.process_frame = FrameProcessor(frame_height, frame_width)

    def _refresh_state(self):
        """Expose ``pre_state`` with a leading batch axis as ``state``."""
        self.state = self.pre_state[np.newaxis, ...]

    def reset(self):
        """Restart the episode and fill the stack with copies of the first frame."""
        frame = self.process_frame(session, self.env.reset())
        self.pre_state = np.repeat(frame, self.no_of_frames, axis=2)
        self._refresh_state()

    def step(self, action):
        """Advance the environment by one action and roll the frame stack.

        Args:
            action: Action index passed to ``env.step``.

        Returns:
            Tuple ``(frame, reward, done, other_info)`` where ``frame`` is the
            newly processed frame that was just pushed onto the stack and
            the remaining items come straight from ``env.step``.
        """
        raw_frame, reward, done, other_info = self.env.step(action)
        frame = self.process_frame(session, raw_frame)
        # Drop the oldest frame and append the newest along the channel axis.
        self.pre_state = np.append(self.pre_state[:, :, 1:], frame, axis=2)
        self._refresh_state()
        # Return the frame itself rather than a slice of ``pre_state``: the
        # values are identical, but a slice would keep the whole stack alive
        # for as long as the caller holds on to it (e.g. in a replay memory).
        return frame, reward, done, other_info


    
from collections import deque
class Agent:
    """Double-DQN agent: online network, target network and replay memory.

    States passed to ``remember``/``act`` are expected to carry a leading
    batch axis of size 1 (as produced by ``Atari.state`` in this notebook),
    while ``next_state`` is only the newest single frame; the full next stack
    is rebuilt inside ``train``.
    """

    def __init__(self, state_size, action_size):
        """Set hyper-parameters and build both networks.

        Args:
            state_size: Input shape of one state without the batch axis.
            action_size: Number of discrete actions.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=1000000)
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_decay = 0.99995
        self.epsilon_min = 0.01
        self.learning_rate = 0.00001
        self.model = self._create_model()
        self.target_model = self._create_model()
        # Both networks must start from the same weights; otherwise the first
        # updates bootstrap from an unrelated random target network.
        self.target_model.set_weights(self.model.get_weights())

    def _create_model(self):
        """Build and compile the convolutional Q-network."""
        # (filters, kernel size, stride) for each convolutional layer.
        conv_specs = (
            (32, (8, 8), 4),
            (64, (4, 4), 2),
            (64, (3, 3), 1),
            (64, (7, 7), 1),
        )
        model = Sequential()
        for index, (filters, kernel, stride) in enumerate(conv_specs):
            # Only the first layer declares the input shape.
            extra = {"input_shape": self.state_size} if index == 0 else {}
            model.add(Convolution2D(
                filters, kernel, strides=stride, padding="valid",
                activation='relu',
                kernel_initializer=tf.variance_scaling_initializer(scale=2),
                use_bias=False, **extra))
        model.add(Flatten())
        # Q-values are unbounded regression targets, so the head must be
        # linear; a softmax would force them onto a probability simplex.
        model.add(Dense(self.action_size, activation='linear'))
        # Use the configured learning rate instead of the optimizer default.
        # `lr` is the argument name in the TF1-era Keras this notebook targets.
        model.compile(loss=tf.keras.losses.Huber(),
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Pick an action epsilon-greedily.

        Args:
            state: State with a leading batch axis of size 1.

        Returns:
            Integer action index.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        return int(np.argmax(self.model.predict(state)[0]))

    def train(self, batch_size=32):
        """Run one Double-DQN minibatch update and decay epsilon.

        Does nothing (and leaves epsilon untouched) while the memory holds
        fewer than ``batch_size`` transitions.

        Args:
            batch_size: Number of transitions sampled for the update.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        states = np.concatenate([exp[0] for exp in minibatch], axis=0)
        actions = np.array([exp[1] for exp in minibatch], dtype=np.int64)
        rewards = np.array([exp[2] for exp in minibatch], dtype=np.float32)
        terminal = np.array([exp[4] for exp in minibatch], dtype=bool)
        # Rebuild each next stack: drop the oldest frame of the stored state
        # and append the stored newest frame along the channel axis.
        next_states = np.stack([
            np.append(exp[0][0][:, :, 1:], exp[3], axis=2) for exp in minibatch
        ])

        rows = np.arange(batch_size)
        # Double DQN: the online network selects the next action, the target
        # network evaluates it.
        best_next = np.argmax(self.model.predict(next_states), axis=1)
        next_q = self.target_model.predict(next_states)[rows, best_next]
        targets = np.where(terminal, rewards, rewards + self.gamma * next_q)

        # Only the taken action's Q-value receives a new target; the other
        # outputs keep their current predictions and contribute zero loss.
        q_values = self.model.predict(states)
        q_values[rows, actions] = targets
        self.model.fit(states, q_values, batch_size=batch_size,
                       epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        """Load online-network weights from ``name`` and sync the target network."""
        self.model.load_weights(name)
        self.target_model.set_weights(self.model.get_weights())

    def save(self, name):
        """Save the online-network weights to ``name``."""
        self.model.save_weights(name)

# --- Configuration -----------------------------------------------------------
n_episodes = 200000
output_dir = "/content/drive/My Drive/Final_exec_Pong_modelv2/"
batch_size = 32
max_steps_per_episode = 18000
warmup_steps = 50000        # environment steps collected before learning starts
train_every = 4             # run one training update every N environment steps
target_sync_every = 10000   # copy online weights into the target net every N steps
checkpoint_every = 10       # save weights every N episodes
report_every = 10           # record the average reward every N episodes

# --- Setup -------------------------------------------------------------------
atari = Atari()
# `Atari.reset`/`Atari.step` look up this module-level name, so it has to
# exist before the first reset.
session = tf.Session()
atari.reset()
agent = Agent(state_size=atari.pre_state.shape, action_size=atari.env.action_space.n)
done = False
total_sum = 0
episodes_in_window = 0
games = []
avg_reward = []
state_number = 0

# --- Training loop -----------------------------------------------------------
for e in range(n_episodes):
    atari.reset()
    total_reward = 0
    for step in range(max_steps_per_episode):
        # Copy so the stored transition is not tied to the live frame stack.
        state = atari.state.copy()  # (1, height, width, frames)
        action = agent.act(state)
        state_number += 1
        # next_state is only the newest processed frame: (height, width, 1).
        next_state, reward, done, other_info = atari.step(action)
        total_reward += reward
        agent.remember(state, action, reward, next_state, done)

        if state_number > warmup_steps and (state_number % train_every == 0):
            agent.train(batch_size)

        if state_number > warmup_steps and (state_number % target_sync_every == 0):
            print("Updating target_model weights")
            agent.target_model.set_weights(agent.model.get_weights())
            print("Done updating target model weights")

        if done:
            print("Episode : %s/%s, time : %s, reward:%s, done:%s, info:%s, Epsilon : %s" % (e, n_episodes, step, total_reward, done, other_info, agent.epsilon))
            break

    if e % checkpoint_every == 0:
        print("Dumping : %s" % ("weights_%s.hdf5" % e))
        agent.save(output_dir + ("weights_%s.hdf5" % e))

    total_sum += total_reward
    episodes_in_window += 1
    if e != 0 and (e % report_every == 0):
        games.append(e)
        # Divide by the number of episodes actually accumulated: the first
        # window covers episodes 0..10 (11 episodes), every later one covers 10.
        avg_reward.append(int(total_sum / episodes_in_window))
        total_sum = 0
        episodes_in_window = 0
        print("games : %s" % games)
        print("avg_reward : %s" % avg_reward)

atari.env.close()

Episode : 0/200000, time : 1453, reward:-19.0, done:True, info:{'ale.lives': 0}, Epsilon : 1.0
Dumping : weights_0.hdf5
Episode : 1/200000, time : 1422, reward:-18.0, done:True, info:{'ale.lives': 0}, Epsilon : 1.0
Episode : 2/200000, time : 1304, reward:-19.0, done:True, info:{'ale.lives': 0}, Epsilon : 1.0
Episode : 3/200000, time : 1229, reward:-19.0, done:True, info:{'ale.lives': 0}, Epsilon : 1.0
Episode : 4/200000, time : 1052, reward:-21.0, done:True, info:{'ale.lives': 0}, Epsilon : 1.0
Episode : 5/200000, time : 1394, reward:-20.0, done:True, info:{'ale.lives': 0}, Epsilon : 1.0
Episode : 6/200000, time : 1256, reward:-21.0, done:True, info:{'ale.lives': 0}, Epsilon : 1.0
Episode : 7/200000, time : 1320, reward:-20.0, done:True, info:{'ale.lives': 0}, Epsilon : 1.0
Episode : 8/200000, time : 1273, reward:-20.0, done:True, info:{'ale.lives': 0}, Epsilon : 1.0
Episode : 9/200000, time : 1224, reward:-20.0, done:True, info:{'ale.lives': 0}, Epsilon : 1.0
Episode : 10/200000, time

In [1]:
# Mount Google Drive at /content/drive (prompts for an authorization code).
# The training cell writes its weight checkpoints under this mount point
# (see `output_dir`), so this cell has to be run before training starts.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive
