In [2]:
!pip install -q keras
!pip install gym
!pip install “gym[atari]"

/bin/bash: -c: line 0: unexpected EOF while looking for matching `"'
/bin/bash: -c: line 1: syntax error: unexpected end of file


In [3]:
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [4]:
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

Using TensorFlow backend.


['/job:localhost/replica:0/task:0/device:GPU:0']

In [0]:
import random
import numpy as np
from keras import backend as K
from keras.layers import Dense, Conv2D, Flatten
from keras.models import Sequential
from keras.optimizers import Adam

class Agent:

    def __init__(self):
        self.memory = []
        self.epsilon = 1.0  # exploration rate
        self.model = self.__model()

    def __model(self):
        model = Sequential()
        model.add(Conv2D(16, kernel_size=8, strides=4, activation='relu', input_shape=(80, 80, 3)))
        model.add(Conv2D(32, kernel_size=4, strides=2, activation='relu'))
        model.add(Flatten())
        model.add(Dense(256, activation='relu'))
        model.add(Dense(2, activation='sigmoid'))
        model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
        return model

    def preprocess(self, I):
        # prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector
        I = I[35:195]  # crop
        I = I[::2, ::2, :]  # downsample by factor of 2
        I[I == 144] = 0  # erase background (background type 1)
        I[I == 109] = 0  # erase background (background type 2)
        I[I != 0] = 1  # everything else (paddles, ball) just set to 1
        return I  # shape:(80, 80, 3)

    def discount_rewards(self, r, gamma=0.99):
        """ take 1D float array of rewards and compute discounted reward """
        discounted_r = np.zeros_like(r)
        running_add = 0
        for t in reversed(range(0, len(r))):
            if r[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (pong specific!)
            running_add = running_add * gamma + r[t]
            discounted_r[t] = running_add
        #normalize
        discounted_r -= np.mean(discounted_r)
        discounted_r /= np.std(discounted_r)
        return discounted_r

    def remember(self, state, action, reward, next_state, done):
        # states must be preprocessed
        if (state.shape[0] != 80 and state.shape[1] != 80):
            state = self.preprocess(state)
        if (next_state.shape[0] != 80 and next_state.shape[1] != 80):
            next_state = self.preprocess(next_state)

        # store in memory the different states, actions, rewards...
        self.memory.append((state, action, reward, next_state, done))

    def replay(self):
        # fit model from memory
        gamma = 0.5  # importance of the next reward
        # max_batch_size = 512

        # take care the memory could be big, so using minibatch
        # minibatch = random.sample(self.memory, min(max_batch_size, len(self.memory)))

        list_x_batch, list_y_batch, list_r_batch = [], [], []
        _,_,list_r_batch,_,_ = zip(*self.memory)

        print("steps:{}".format(len(self.memory)))
        for state, action, reward, next_state, done in self.memory:

            state = np.expand_dims(state, axis=0)
            #target = self.model.predict(state)[0]
            
            target = np.zeros([2])  #????
            target[action] = 1

            #if not done:
            #    next_state = np.expand_dims(next_state, axis=0)
            #    future_action_values = self.model.predict(next_state)[0]
            #    #following the formula of action-value expectation
            #    target[action] = reward + gamma * np.amax(future_action_values)

            ##customize the obtained reward with the calculated
            #print("action {} ({}) from {} to {}".format(action, r_batch[step], t, target))

            # append
            list_x_batch.append(state)
            list_y_batch.append(target)
                
        # clean
        self.memory = []

        # train the model
        x_batch = np.vstack(list_x_batch)
        y_batch = np.vstack(list_y_batch)
        r_batch = self.discount_rewards(list_r_batch)
        #yr_batch = r_batch.reshape(-1,1) * y_batch

        self.model.fit(x_batch, y_batch, sample_weight=r_batch, verbose=1)
        #self.model.fit(x_batch, yr_batch, verbose=1)

    def act(self, state):
        # preprocess the sample
        state = self.preprocess(state)

        if self.epsilon > np.random.rand():
            return random.randint(0, 1)

        # predict the action to do
        state = np.expand_dims(state, axis=0)
        action_values = self.model.predict(state)
        print("Predictions:{}".format(action_values))

        return np.argmax(action_values)

In [38]:
import os
import gym
import random
import numpy as np
#from Agent import Agent
from time import sleep


# code for the two only actions in Pong
UP_ACTION = 2
DOWN_ACTION = 3

# initializing our environment
env = gym.make("Pong-v0")

# beginning of an episode
observation = env.reset()

# model weights
h5file = "weights.h5"

# agent
agent = Agent()

# get model
if os.path.exists(h5file):
    agent.model.load_weights(h5file)

# training conf
training = True
# x_train, y_train, rewards = [], [], []
# reward_sum = 0

episode = 0
# main loop
while episode < 1000: 
    # predict action
    action = agent.act(observation)
    movement = UP_ACTION if action == 1 else DOWN_ACTION

    # do one step
    next_observation, reward, done, info = env.step(movement)

    # save the current observation
    agent.remember(observation, action, reward, next_observation, done)

    # update state
    observation = next_observation

    if reward != 0:
        if reward == 1:
            print("Win!!")
        else:
            print("Lose..")

        if training:
            agent.replay()
    
    if done:
        print("epsilon:{}".format(agent.epsilon))
        # decrease exploration rate
        if agent.epsilon > 0.01:
            agent.epsilon *= 0.97
            
        observation = env.reset()
        episode += 1
        agent.model.save_weights(h5file)


Lose..
steps:170
Epoch 1/1
Lose..
steps:45
Epoch 1/1
Lose..
steps:126
Epoch 1/1
Lose..
steps:47
Epoch 1/1
Lose..
steps:46
Epoch 1/1
Lose..
steps:46
Epoch 1/1
Lose..
steps:48
Epoch 1/1
Lose..
steps:46
Epoch 1/1
Lose..
steps:45
Epoch 1/1
Lose..
steps:44
Epoch 1/1
Lose..
steps:126
Epoch 1/1
Lose..
steps:49
Epoch 1/1
Lose..
steps:47
Epoch 1/1
Lose..
steps:44
Epoch 1/1
Lose..
steps:131
Epoch 1/1
Lose..
steps:130
Epoch 1/1
Lose..
steps:48
Epoch 1/1
Lose..
steps:47
Epoch 1/1
Lose..
steps:120
Epoch 1/1
Lose..
steps:48
Epoch 1/1
Lose..
steps:136
Epoch 1/1
epsilon:1.0


KeyboardInterrupt: ignored

In [0]:
np.random.choice(2, 1, p=(0.6, 0.4))[0]
a = np.array([[1, 2], [0, 1], [1,0]])
b = np.array([[3], [2], [1]])
a*b

array([[3, 6],
       [0, 2],
       [1, 0]])

In [0]:
!rm weights.h5