In [6]:
!pip install -q keras
!pip install gym
!pip install “gym[atari]"

/bin/bash: -c: line 0: unexpected EOF while looking for matching `"'
/bin/bash: -c: line 1: syntax error: unexpected end of file


In [7]:
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [8]:
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

In [0]:
import random
import numpy as np
from keras import backend as K
from keras.layers import Dense, Conv2D, Flatten, BatchNormalization, Activation, MaxPooling2D
from keras.models import Sequential
from keras.optimizers import Adam

class Agent:

    def __init__(self):
        self.memory = []
        self.epsilon = 1.0  # exploration rate
        self.model = self.__model()

    def __model(self, lr=0.001):
        model = Sequential()
        model.add(Conv2D(16, kernel_size=8, strides=4, input_shape=(80, 80, 1)))
        #model.add(BatchNormalization())
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2,2)))
        
        model.add(Flatten())
        model.add(Dense(128))
        #model.add(BatchNormalization())
        model.add(Activation('sigmoid'))
        
        model.add(Dense(3, activation='softmax'))
        model.compile(loss='mean_squared_error', optimizer=Adam(lr=lr), metrics=['accuracy'])
        return model

    def preprocess(self, I):
        # prepro 210x160x3 uint8 frame into 6400 (80x80x1) 2D float vector
        I = I[35:195]  # crop
        I = I[::2, ::2, 0]  # downsample by factor of 2
        I[I == 144] = 0  # erase background (background type 1)
        I[I == 109] = 0  # erase background (background type 2)
        I[I != 0] = 1  # everything else (paddles, ball) just set to 1
        I= np.reshape(I, (80, 80, 1))
        return I

    def discount_rewards(self, r, gamma=0.99):
        """ take 1D float array of rewards and compute discounted reward """
        discounted_r = np.zeros_like(r)
        running_add = 0
        for t in reversed(range(0, len(r))):
            if r[t] != 0: running_add = 0 # reset the sum
            running_add = running_add * gamma + r[t]
            discounted_r[t] = running_add
        #normalize
        discounted_r -= np.mean(discounted_r)
        discounted_r /= np.std(discounted_r)
        return discounted_r

    def remember(self, state, action, actions, reward):
        # assure states are preprocessed before keep in memory
        if (state.shape[0] != 80 and state.shape[1] != 80):
            state = self.preprocess(state)

        # store in memory the different states, actions, rewards...
        self.memory.append((state, action, actions, reward))

    def replay(self):
        # fit model from memory
        gamma = 0.99 # importance of the next reward

        # initialize
        list_x_batch, list_y_batch = [], []
        
        # get the list of rewards
        _, _, _, list_r_batch = zip(*self.memory)
        r_batch = self.discount_rewards(list_r_batch, gamma) #process rewards
                
        for i, (state, action, actions, _) in enumerate(self.memory):

            state = np.expand_dims(state, axis=0)
            r = r_batch[i]  #reward of ith step
            #print("in) a:{} as:{} r:{}".format(action, actions, r))
            actions[action] += actions[action]*r
            #print("out) a:{} as:{} r:{}".format(action, actions, r))

            # append
            list_x_batch.append(state)
            list_y_batch.append(actions)
                
        # clean
        self.memory = []

        # train the model
        x_batch = np.vstack(list_x_batch)
        y_batch = np.vstack(list_y_batch)

        # fitting
        self.model.fit(x_batch, y_batch, verbose=0)

    def act(self, state):
        if self.epsilon > np.random.rand():
            idx_a = random.randint(0, 2)
            a = np.zeros([3])
            a[idx_a] = 1
            return idx_a, a

        # preprocess the sample
        state = self.preprocess(state)
        state = np.expand_dims(state, axis=0)

        # predict the action to do
        action_values = self.model.predict(state)
        #print("Prediction({}) from {}".format(np.argmax(action_values), action_values))

        return np.argmax(action_values), action_values[0]

In [0]:
import os
import gym
import random
import numpy as np
#from Agent import Agent
from time import sleep

!rm weights.h5

# code for the two only actions in Pong
UP_ACTION = 2
DOWN_ACTION = 3
NO_ACTION = 0

# mapping actions: model output -> environment
action2move = {0:NO_ACTION, 1:UP_ACTION, 2:DOWN_ACTION}

# initializing our environment
env = gym.make("Pong-v0")

# beginning of an episode
observation = env.reset()
previousObs = np.zeros_like(observation)

# model weights
h5file = "weights.h5"

# agent
agent = Agent()

# try to load previous model
if os.path.exists(h5file):
    agent.model.load_weights(h5file)

# training conf
training = True

# main loop
episode = 0
wins = 0
win_performance = 0

while episode < 10000: 
    # predict action
    diffObs = observation - previousObs
    
    action, actions = agent.act(diffObs)
    move = action2move[action]     

    # do one step
    next_observation, reward, done, _ = env.step(move)

    # save the current observation
    agent.remember(diffObs, action, actions, reward)

    # update state
    previousObs = observation
    observation = next_observation

    
    if reward != 0:
        if reward == 1:
            wins += 1

        if training:
            agent.replay()
            agent.model.save_weights(h5file)
    
    if done:
        print("******* episode:{} wins:{} perf:{:.3f} (epsilon:{:.3f}) ********".format(episode, wins, win_performance/(episode+1), agent.epsilon))
        s = agent.preprocess(diffObs)
        s = np.expand_dims(s, axis=0)
        print("\t\t\t\t\t{}".format(agent.model.predict(s)))
        # decrease exploration rate
        if agent.epsilon > 0.01:
            agent.epsilon *= 0.97
        
        observation = env.reset()
        episode += 1
        win_performance += wins
        wins = 0


******* episode:0 wins:0 perf:0.000 (epsilon:1.000) ********
					[[0.37393495 0.32183784 0.30422717]]
******* episode:1 wins:0 perf:0.000 (epsilon:0.970) ********
					[[0.37240514 0.31079662 0.31679827]]
******* episode:2 wins:1 perf:0.000 (epsilon:0.941) ********
					[[0.33556342 0.3657936  0.298643  ]]
******* episode:3 wins:1 perf:0.250 (epsilon:0.913) ********
					[[0.37810448 0.3006338  0.32126173]]
******* episode:4 wins:0 perf:0.400 (epsilon:0.885) ********
					[[0.36240795 0.33983213 0.29775992]]
******* episode:5 wins:1 perf:0.333 (epsilon:0.859) ********
					[[0.41715792 0.30442497 0.2784171 ]]
******* episode:6 wins:0 perf:0.429 (epsilon:0.833) ********
					[[0.4141489  0.32086238 0.2649887 ]]
******* episode:7 wins:0 perf:0.375 (epsilon:0.808) ********
					[[0.34250528 0.3183351  0.33915964]]
******* episode:8 wins:1 perf:0.333 (epsilon:0.784) ********
					[[0.34031585 0.36279565 0.29688847]]
******* episode:9 wins:1 perf:0.400 (epsilon:0.760) ********
					[[0.458