In [1]:
!pip install -q keras
!pip install gym
!pip install “gym[atari]"

/bin/bash: -c: line 0: unexpected EOF while looking for matching `"'
/bin/bash: -c: line 1: syntax error: unexpected end of file


In [2]:
import tensorflow as tf

# Ask TensorFlow for the name of the GPU it can see and stop the notebook
# early when it is not the first GPU device.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [3]:
# Import the Keras backend module under its conventional alias.
from keras import backend as K
# Display the GPU devices visible to the Keras TensorFlow backend.
# NOTE(review): `_get_available_gpus` has a leading underscore, i.e. it is a
# private helper; presumably it may be missing in other Keras versions - confirm.
K.tensorflow_backend._get_available_gpus()

Using TensorFlow backend.


['/job:localhost/replica:0/task:0/device:GPU:0']

In [0]:
import random
import numpy as np
from keras import backend as K
from keras.layers import Dense, Conv2D, Flatten, BatchNormalization
from keras.models import Sequential
from keras.optimizers import Adam

class Agent:
    """Policy-network agent for Pong with three discrete outputs.

    The agent collects transitions with `remember`, picks actions with `act`
    and, when `replay` is called, fits its network on everything collected
    since the previous `replay`, weighting each sample by its normalised
    discounted reward.
    """

    # Spatial size (height, width) of a preprocessed frame.
    FRAME_SIZE = (80, 80)
    # Number of network outputs / selectable action indices.
    NUM_ACTIONS = 3

    def __init__(self):
        self.memory = []  # transitions stored since the last replay()
        self.epsilon = 0.009  # exploration rate
        self.model = self.__model()

    def __model(self):
        """Build and compile the convolutional policy network.

        Input is a single-channel 80x80 frame; output is a softmax over the
        three action indices.
        """
        model = Sequential()
        model.add(Conv2D(16, kernel_size=8, strides=4, activation='relu', input_shape=(80, 80, 1)))
        model.add(BatchNormalization())
        model.add(Conv2D(32, kernel_size=4, strides=2, activation='relu'))
        model.add(BatchNormalization())
        model.add(Flatten())
        model.add(Dense(128, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dense(self.NUM_ACTIONS, activation='softmax'))
        # replay() trains on one-hot "action taken" targets weighted by the
        # discounted reward. With a softmax output, categorical cross-entropy
        # makes that weighted loss -reward * log(pi(action | state)), i.e. the
        # policy-gradient objective; mean squared error does not.
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    def preprocess(self, I):
        """Turn a 210x160x3 frame into a binary array of shape (80, 80, 1).

        Rows 35..194 are kept, every second pixel of channel 0 is sampled,
        the two background values (144 and 109) are zeroed and every other
        non-zero pixel becomes 1. The input array is left untouched.
        """
        # Slicing only creates a view; copy it so the masking below does not
        # write through into the caller's frame.
        frame = np.array(I[35:195:2, ::2, 0])
        frame[frame == 144] = 0  # erase background (background type 1)
        frame[frame == 109] = 0  # erase background (background type 2)
        frame[frame != 0] = 1  # everything else (paddles, ball) just set to 1
        return np.reshape(frame, (80, 80, 1))

    def discount_rewards(self, r, gamma=0.99):
        """Compute normalised discounted rewards for a 1D reward sequence.

        Args:
            r: sequence of per-step rewards (any numeric type).
            gamma: discount factor applied per step.

        Returns:
            Float array of the same length as `r`, mean-centred and, when the
            values are not all identical, scaled to unit standard deviation.
        """
        rewards = np.asarray(r)
        if not np.issubdtype(rewards.dtype, np.floating):
            # Integer rewards would truncate the discounted sums and make the
            # in-place float normalisation below fail.
            rewards = rewards.astype(np.float64)

        discounted_r = np.zeros_like(rewards)
        running_add = 0.0
        for t in reversed(range(len(rewards))):
            if rewards[t] != 0:
                # reset the sum, since this was a game boundary (pong specific!)
                running_add = 0.0
            running_add = running_add * gamma + rewards[t]
            discounted_r[t] = running_add

        if discounted_r.size == 0:
            return discounted_r

        # normalize
        discounted_r -= np.mean(discounted_r)
        std = np.std(discounted_r)
        if std > 0:
            # Skip the division when all values are equal: 0/0 would yield
            # NaN sample weights and poison the network during fit().
            discounted_r /= std
        return discounted_r

    def remember(self, state, action, reward, next_state, done):
        """Store one transition, preprocessing raw frames on the way in.

        A frame counts as already preprocessed only when both its height and
        its width are 80; anything else goes through `preprocess`.
        """
        if state.shape[:2] != self.FRAME_SIZE:
            state = self.preprocess(state)
        if next_state.shape[:2] != self.FRAME_SIZE:
            next_state = self.preprocess(next_state)

        # store in memory the different states, actions, rewards...
        self.memory.append((state, action, reward, next_state, done))

    def replay(self):
        """Fit the model on all stored transitions, then forget them.

        Each stored state is paired with a one-hot vector of the action that
        was taken and weighted by its normalised discounted reward. Does
        nothing when no transition has been stored.
        """
        if not self.memory:
            return

        # take ownership of the batch and clear the memory for the next round
        transitions, self.memory = self.memory, []

        states, actions, rewards, _, _ = zip(*transitions)

        x_batch = np.stack(states)  # shape: (N, 80, 80, 1)
        y_batch = np.zeros((len(transitions), self.NUM_ACTIONS))
        y_batch[np.arange(len(transitions)), list(actions)] = 1  # performed action is set to 1
        r_batch = self.discount_rewards(rewards)

        # train the model
        self.model.fit(x_batch, y_batch, sample_weight=r_batch, verbose=0)

    def act(self, state):
        """Choose an action index (0, 1 or 2) for a raw frame.

        With probability `epsilon` a uniformly random index is returned;
        otherwise the index with the highest model output.
        """
        # preprocess the sample
        state = self.preprocess(state)

        if self.epsilon > np.random.rand():
            return random.randint(0, self.NUM_ACTIONS - 1)

        # predict the action to do
        action_values = self.model.predict(np.expand_dims(state, axis=0))
        return np.argmax(action_values)

In [0]:
import os
import gym
import random
import numpy as np
#from Agent import Agent
from time import sleep

#!rm weights.h5

# code for the two only actions in Pong
UP_ACTION = 2
DOWN_ACTION = 3
NO_ACTION = 0

# agent output index -> environment action code; any other index means "stay"
ACTION_TO_MOVEMENT = {1: UP_ACTION, 2: DOWN_ACTION}

# number of episodes to play before the loop stops
NUM_EPISODES = 10000

# initializing our environment
env = gym.make("Pong-v0")

# beginning of an episode
observation = env.reset()

# model weights
h5file = "weights.h5"

# agent
agent = Agent()

# resume from previously saved weights when they exist
if os.path.exists(h5file):
    agent.model.load_weights(h5file)

# training conf
training = True

episode = 0
previousObs = np.zeros_like(observation)
wins = 0

# main loop
while episode < NUM_EPISODES:
    # the agent sees the change between the two most recent frames
    diffObs = observation - previousObs

    # predict action
    action = agent.act(diffObs)
    movement = ACTION_TO_MOVEMENT.get(int(action), NO_ACTION)

    # do one step
    next_observation, reward, done, info = env.step(movement)

    # save the current observation
    agent.remember(diffObs, action, reward, next_observation, done)

    # update state
    previousObs = observation
    observation = next_observation

    # a non-zero reward marks the end of a point: count it and train on the rally
    if reward != 0:
        if reward == 1:
            wins += 1

        if training:
            agent.replay()
            agent.model.save_weights(h5file)

    if done:
        print("******* episode:{} wins:{} (epsilon:{}) ********".format(episode, wins, agent.epsilon))

        # decrease exploration rate
        # NOTE(review): Agent starts with epsilon = 0.009, which is already below
        # this 0.01 threshold, so the decay never fires - confirm the intended values.
        if agent.epsilon > 0.01:
            agent.epsilon *= 0.997

        observation = env.reset()
        # start the new episode from a blank previous frame; otherwise its first
        # difference frame would be taken against the last frame of the old episode
        previousObs = np.zeros_like(observation)
        episode += 1
        wins = 0


******* episode:0 wins:0 (epsilon:0.009) ********
******* episode:1 wins:0 (epsilon:0.009) ********
******* episode:2 wins:2 (epsilon:0.009) ********
******* episode:3 wins:2 (epsilon:0.009) ********
******* episode:4 wins:1 (epsilon:0.009) ********
******* episode:5 wins:1 (epsilon:0.009) ********
******* episode:6 wins:0 (epsilon:0.009) ********
******* episode:7 wins:0 (epsilon:0.009) ********
******* episode:8 wins:0 (epsilon:0.009) ********
******* episode:9 wins:0 (epsilon:0.009) ********
******* episode:10 wins:0 (epsilon:0.009) ********
******* episode:11 wins:0 (epsilon:0.009) ********
******* episode:12 wins:0 (epsilon:0.009) ********
******* episode:13 wins:0 (epsilon:0.009) ********
******* episode:14 wins:0 (epsilon:0.009) ********
******* episode:15 wins:0 (epsilon:0.009) ********
******* episode:16 wins:1 (epsilon:0.009) ********
******* episode:17 wins:1 (epsilon:0.009) ********
******* episode:18 wins:2 (epsilon:0.009) ********
******* episode:19 wins:0 (epsilon:0.009)

In [0]:
# Scratch cell: draw one value from {0, 1} with probabilities 0.6 / 0.4.
# The result is not assigned, and only the last expression of the cell is displayed.
np.random.choice(2, 1, p=(0.6, 0.4))[0]
# Broadcasting check: `a` is 3x2 and `b` is 3x1, so each row of `a` is
# multiplied by the single value in the matching row of `b`.
a = np.array([[1, 2], [0, 1], [1,0]])
b = np.array([[3], [2], [1]])
a*b

array([[3, 6],
       [0, 2],
       [1, 0]])

In [5]:
# Check whether the weights file written by the training cell exists in the working directory.
!ls weights.h5

ls: cannot access 'weights.h5': No such file or directory
