In [1]:
import os

import gym
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Reshape, Flatten
from keras.optimizers import Adam
from keras.layers.convolutional import Convolution2D


class PGAgent:
    """Policy-gradient agent backed by a small Keras convolutional network.

    Per-step data is buffered by `act` (the predicted action probabilities)
    and `remember` (state, reward and the one-hot-minus-probability term).
    `train` consumes those buffers in a single batch update and clears them.
    """

    def __init__(self, state_size, action_size):
        """Store hyper-parameters, create empty buffers and build the network.

        Args:
            state_size: Length of the flat state vector fed to the network.
            action_size: Number of discrete actions (width of the softmax).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.99            # discount factor used by discount_rewards
        self.learning_rate = 0.001   # Adam step size; also scales the target shift in train
        self.states = []
        self.gradients = []
        self.rewards = []
        self.probs = []
        self.model = self._build_model()
        # Expose the Keras summary printer directly on the agent.
        self.summary = self.model.summary

    def _build_model(self):
        """Build and compile the policy network.

        Layout: reshape -> one strided conv layer -> two 20-unit dense layers
        -> softmax over `action_size`, compiled with categorical
        cross-entropy and Adam.

        Returns:
            The compiled Keras `Sequential` model.
        """
        model = Sequential()
        # The reshape target is hard-coded to an 80x80 single-channel image,
        # so state_size must equal 80 * 80.
        model.add(Reshape((80, 80, 1), input_shape=(self.state_size,)))
        # Keras 2 keyword names (strides / padding / kernel_initializer)
        # replace the deprecated Keras 1 aliases subsample / border_mode / init.
        model.add(Convolution2D(5, (6, 6), strides=(3, 3), padding='same',
                                activation='relu',
                                kernel_initializer='he_uniform'))
        model.add(Flatten())
        model.add(Dense(20, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(20, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='softmax'))
        opt = Adam(lr=self.learning_rate)
        model.compile(loss='categorical_crossentropy', optimizer=opt)
        return model

    def remember(self, state, action, prob, reward):
        """Buffer one environment step for the next call to `train`.

        Args:
            state: Network input that produced `prob`.
            action: Index of the action that was taken.
            prob: Action probabilities returned by `act` for this step.
            reward: Reward received after taking `action`.
        """
        one_hot = np.zeros([self.action_size])
        one_hot[action] = 1
        # (one-hot of the taken action) - (predicted probabilities)
        self.gradients.append(one_hot.astype('float32') - prob)
        self.states.append(state)
        self.rewards.append(reward)

    def act(self, state):
        """Sample an action from the network's current policy.

        Args:
            state: Passed unchanged to `model.predict`, so it must already
                carry a leading batch axis of size 1.

        Returns:
            Tuple `(action, prob)`: the sampled action index and the
            renormalised probability vector it was drawn from.
        """
        aprob = self.model.predict(state, batch_size=1).flatten()
        # The raw network output is what `train` later shifts into a target.
        self.probs.append(aprob)
        # Renormalise so the vector sums to 1 before sampling from it.
        prob = aprob / np.sum(aprob)
        action = np.random.choice(self.action_size, 1, p=prob)[0]
        return action, prob

    def discount_rewards(self, rewards):
        """Compute discounted returns, restarting at every non-zero reward.

        Walking backwards, the running return is reset to zero each time a
        non-zero reward is met (presumably because in Pong a non-zero reward
        ends a rally - confirm for other environments), then accumulated
        with factor `self.gamma`.

        Args:
            rewards: Array of per-step rewards, shape `(N,)` or `(N, 1)`.

        Returns:
            Floating-point array with the same shape as `rewards`.
        """
        rewards = np.asarray(rewards)
        # Integer inputs used to produce an integer output buffer, silently
        # truncating every fractional return; always accumulate in float.
        if np.issubdtype(rewards.dtype, np.floating):
            out_dtype = rewards.dtype
        else:
            out_dtype = np.float64
        flat_rewards = rewards.reshape(-1)
        flat_returns = np.zeros(flat_rewards.shape, dtype=out_dtype)
        running_add = 0.0
        for t in reversed(range(flat_rewards.size)):
            if flat_rewards[t] != 0:
                running_add = 0.0
            running_add = running_add * self.gamma + flat_rewards[t]
            flat_returns[t] = running_add
        return flat_returns.reshape(rewards.shape)

    def train(self, rewards):
        """Run one batch update from the buffered episode, then clear buffers.

        The returns are standardised, used to weight the buffered
        (one-hot - probability) terms, and the network is fitted towards
        `probs + learning_rate * weighted_terms` with `train_on_batch`.

        Args:
            rewards: Per-step (already discounted) returns, one per buffered
                step; shape `(N,)` or `(N, 1)`.
        """
        gradients = np.vstack(self.gradients)
        rewards = np.asarray(rewards, dtype=np.float64).reshape(-1, 1)
        spread = np.std(rewards)
        rewards = rewards - np.mean(rewards)
        # If every return is identical the spread is zero; dividing would turn
        # the whole batch into NaN, so leave the (all-zero) centred values.
        if spread > 0:
            rewards = rewards / spread
        gradients *= rewards
        # Keep an explicit batch axis even for a one-step episode.
        X = np.asarray(self.states).reshape(len(self.states), -1)
        Y = np.vstack(self.probs) + self.learning_rate * gradients
        self.model.train_on_batch(X, Y)
        # self.rewards is not read here (returns arrive as the argument) but
        # is cleared together with the other per-episode buffers.
        self.states, self.probs, self.gradients, self.rewards = [], [], [], []

    def load(self, name):
        """Load network weights from the file `name`."""
        self.model.load_weights(name)

    def save(self, name):
        """Save network weights to the file `name`."""
        self.model.save_weights(name)

def preprocess(I):
    """Turn a raw game frame into a flat binary float vector.

    Rows 35..194 are kept, rows and columns are down-sampled by 2 and only
    channel 0 is used. Pixels equal to 0, 144 or 109 become 0.0 (presumably
    the background shades - confirm against the emulator palette); every
    other pixel becomes 1.0.

    Args:
        I: Frame array indexed as `[row, column, channel]`. It is not
            modified (the previous implementation wrote the mask back into
            the caller's array through slice views).

    Returns:
        1-D `float64` array; 6400 elements for a 210x160 frame.
    """
    cropped = np.asarray(I)[35:195:2, ::2, 0]
    # Build the mask as a new array instead of assigning into the view.
    foreground = (cropped != 0) & (cropped != 144) & (cropped != 109)
    # np.float was removed in NumPy 1.24; np.float64 is the same dtype.
    return foreground.astype(np.float64).ravel()
if __name__ == "__main__":
    # Create the Pong environment and the bookkeeping variables that the
    # training cell also uses.
    env = gym.make("Pong-v0")
    state = env.reset()
    prev_x = None
    score = 0
    episode = 0

    # preprocess() flattens each frame to an 80x80 vector.
    state_size = 80 * 80
    action_size = env.action_space.n
    agent = PGAgent(state_size, action_size)

    # Kept inside the guard: `agent` only exists when the guard body ran, so
    # calling this at module level raised NameError whenever it did not.
    agent.summary()

Using TensorFlow backend.
[2017-06-12 12:05:47,361] Making new env: Pong-v0


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_1 (Reshape)          (None, 80, 80, 1)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 27, 27, 5)         185       
_________________________________________________________________
flatten_1 (Flatten)          (None, 3645)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 20)                72920     
_________________________________________________________________
dense_2 (Dense)              (None, 20)                420       
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 126       
Total params: 73,651
Trainable params: 73,651
Non-trainable params: 0
_________________________________________________________________




In [5]:
if __name__ == "__main__":
    # Rebuild the environment and a fresh (untrained) agent.
    env = gym.make("Pong-v0")
    state = env.reset()
    prev_x = None
    score = 0
    episode = 0

    # preprocess() flattens each frame to an 80x80 vector.
    state_size = 80 * 80
    action_size = env.action_space.n
    agent = PGAgent(state_size, action_size)

    # Kept inside the guard: `agent` only exists when the guard body ran, so
    # calling this at module level raised NameError whenever it did not.
    agent.summary()

26

In [2]:
# Run configuration. Weights and the text log both live under Models/.
AgentName = 'pong_minimal-s1'
AgentFile = 'Models/%s.h5'%AgentName
LogName = 'Models/%s.log'%AgentName
resume = 0   # truthy: load weights from AgentFile before training
render = 0   # truthy: call env.render() on every step

if __name__ == "__main__":
    # Build the environment and the agent once. The cell used to run this
    # setup twice in a row, creating and discarding an extra env and network.
    env = gym.make("Pong-v0")
    state = env.reset()
    prev_x = None
    score = 0
    episode = 0

    state_size = 80 * 80
    action_size = env.action_space.n
    agent = PGAgent(state_size, action_size)

    # Make sure the output directory exists; otherwise the first log write
    # (or the first checkpoint) fails after a whole episode has been played.
    for out_path in (AgentFile, LogName):
        out_dir = os.path.dirname(out_path)
        if out_dir and not os.path.isdir(out_dir):
            os.makedirs(out_dir)

    if resume:
        agent.load(AgentFile)

    # Runs until the kernel is interrupted.
    while True:
        if render:
            env.render()

        # The network input is the difference between consecutive
        # preprocessed frames (all zeros on the first frame of an episode).
        cur_x = preprocess(state)
        x = cur_x - prev_x if prev_x is not None else np.zeros(state_size)
        prev_x = cur_x

        # expand_dims(...).T turns the flat vector into a (1, state_size) batch.
        action, prob = agent.act(np.expand_dims(x,1).T)
        state, reward, done, info = env.step(action)
        score += reward
        agent.remember(x, action, prob, reward)

        if done:
            episode += 1
            # One policy update per finished episode.
            rewards = np.vstack(agent.rewards)
            rewards = agent.discount_rewards(rewards)
            agent.train(rewards)
            msg = '%s\t%d\t%s\t%f' % ('Episode',episode,'Score', score)
            print(msg)
            with open(LogName,'a+') as LogFile:
                LogFile.write(msg+'\n')

            score = 0
            state = env.reset()
            prev_x = None
            # Checkpoint the weights every 10 episodes.
            if episode > 1 and episode % 10 == 0:
                agent.save(AgentFile)

[2017-06-12 12:05:54,616] Making new env: Pong-v0
[2017-06-12 12:05:55,116] Making new env: Pong-v0


Episode	1	Score	-21.000000
Episode	2	Score	-19.000000
Episode	3	Score	-20.000000
Episode	4	Score	-20.000000
Episode	5	Score	-21.000000
Episode	6	Score	-20.000000
Episode	7	Score	-21.000000
Episode	8	Score	-20.000000
Episode	9	Score	-21.000000
Episode	10	Score	-20.000000
Episode	11	Score	-21.000000
Episode	12	Score	-21.000000
Episode	13	Score	-21.000000
Episode	14	Score	-20.000000
Episode	15	Score	-20.000000
Episode	16	Score	-21.000000
Episode	17	Score	-21.000000
Episode	18	Score	-21.000000
Episode	19	Score	-19.000000
Episode	20	Score	-21.000000
Episode	21	Score	-20.000000
Episode	22	Score	-21.000000
Episode	23	Score	-21.000000
Episode	24	Score	-19.000000
Episode	25	Score	-21.000000
Episode	26	Score	-21.000000
Episode	27	Score	-21.000000
Episode	28	Score	-18.000000
Episode	29	Score	-20.000000
Episode	30	Score	-20.000000
Episode	31	Score	-20.000000
Episode	32	Score	-21.000000
Episode	33	Score	-21.000000
Episode	34	Score	-20.000000
Episode	35	Score	-21.000000
Episode	36	Score	-21.000000
E

Episode	288	Score	-20.000000
Episode	289	Score	-18.000000
Episode	290	Score	-21.000000
Episode	291	Score	-20.000000
Episode	292	Score	-20.000000
Episode	293	Score	-21.000000
Episode	294	Score	-20.000000
Episode	295	Score	-21.000000
Episode	296	Score	-21.000000
Episode	297	Score	-20.000000
Episode	298	Score	-21.000000
Episode	299	Score	-19.000000
Episode	300	Score	-20.000000
Episode	301	Score	-19.000000
Episode	302	Score	-20.000000
Episode	303	Score	-21.000000
Episode	304	Score	-19.000000
Episode	305	Score	-20.000000
Episode	306	Score	-20.000000
Episode	307	Score	-20.000000
Episode	308	Score	-21.000000
Episode	309	Score	-20.000000
Episode	310	Score	-21.000000
Episode	311	Score	-20.000000
Episode	312	Score	-20.000000
Episode	313	Score	-20.000000
Episode	314	Score	-20.000000
Episode	315	Score	-21.000000
Episode	316	Score	-21.000000
Episode	317	Score	-20.000000
Episode	318	Score	-21.000000
Episode	319	Score	-20.000000
Episode	320	Score	-20.000000
Episode	321	Score	-21.000000
Episode	322	Sc

Episode	571	Score	-20.000000
Episode	572	Score	-19.000000
Episode	573	Score	-21.000000
Episode	574	Score	-21.000000
Episode	575	Score	-20.000000
Episode	576	Score	-20.000000
Episode	577	Score	-20.000000
Episode	578	Score	-21.000000
Episode	579	Score	-21.000000
Episode	580	Score	-21.000000
Episode	581	Score	-21.000000
Episode	582	Score	-20.000000
Episode	583	Score	-20.000000
Episode	584	Score	-20.000000
Episode	585	Score	-21.000000
Episode	586	Score	-19.000000
Episode	587	Score	-21.000000
Episode	588	Score	-21.000000
Episode	589	Score	-21.000000
Episode	590	Score	-21.000000
Episode	591	Score	-20.000000
Episode	592	Score	-21.000000
Episode	593	Score	-21.000000
Episode	594	Score	-21.000000
Episode	595	Score	-20.000000
Episode	596	Score	-20.000000
Episode	597	Score	-21.000000
Episode	598	Score	-21.000000
Episode	599	Score	-21.000000
Episode	600	Score	-20.000000
Episode	601	Score	-21.000000
Episode	602	Score	-21.000000
Episode	603	Score	-20.000000
Episode	604	Score	-19.000000
Episode	605	Sc

KeyboardInterrupt: 

In [7]:
# Append a test line to the training log. The chevron form of print is
# Python-2-only and relied on a LogFile handle left over from an earlier
# cell; open the log explicitly so the cell works on a fresh kernel.
with open(LogName, 'a') as LogFile:
    LogFile.write('Hi\n')

In [15]:
# Append a test string to the training log. The context manager closes (and
# therefore flushes) the handle, which the bare open() call never did.
with open('Models/%s.log'%AgentName,'a') as LogFile:
    LogFile.write('Hi')