In [1]:
# Scratch check: list.index returns the zero-based position of the first match.
lst = [1, 2, 3]
lst.index(1)

0

In [2]:
import gym
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Reshape, Flatten
from keras.optimizers import Adam
from keras.layers.convolutional import Convolution2D


class PGAgent:
    """Policy-gradient agent whose policy is a small convolutional Keras network.

    The agent buffers one entry per environment step (state, one-hot-minus-
    probability "gradient", reward and the raw network output) and consumes
    the whole buffer in a single ``train`` call.

    Attributes:
        state_size: length of the flattened observation vector fed to the net.
        action_size: number of discrete actions (width of the softmax layer).
        gamma: discount factor used by ``discount_rewards``.
        learning_rate: used both for the Adam optimizer and as the step size
            when building the training targets in ``train``.
        states, gradients, rewards, probs: per-step buffers, cleared by
            ``train``.
        model: the compiled Keras model.
        summary: bound ``model.summary`` method, so ``agent.summary()`` prints
            the layer table.
        episode: only set after ``readlog`` has been called.
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.99
        self.learning_rate = 0.001
        self.states = []
        self.gradients = []
        self.rewards = []
        self.probs = []
        self.model = self._build_model()
        self.summary = self.model.summary

    def _build_model(self):
        """Build and compile the policy network.

        The first layer reshapes the flat input to (80, 80, 1), so
        ``state_size`` has to be 80 * 80 for the model to be usable.
        """
        model = Sequential()
        model.add(Reshape((80, 80, 1), input_shape=(self.state_size,)))
        # Keras 2 keyword names (strides / padding / kernel_initializer)
        # replace the deprecated Keras 1 spellings (subsample / border_mode /
        # init); layer configuration is unchanged.
        model.add(Convolution2D(25, (6, 6), strides=(3, 3), padding='same',
                                activation='relu',
                                kernel_initializer='he_uniform'))
        model.add(Convolution2D(5, (6, 6), strides=(1, 1), padding='same',
                                activation='relu',
                                kernel_initializer='he_uniform'))
        model.add(Flatten())
        model.add(Dense(20, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(20, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='softmax'))
        # NOTE(review): newer Keras releases spell this argument
        # `learning_rate`; `lr` is kept to match the installed version.
        opt = Adam(lr=self.learning_rate)
        model.compile(loss='categorical_crossentropy', optimizer=opt)
        return model

    def remember(self, state, action, prob, reward):
        """Buffer one step.

        Stores ``one_hot(action) - prob`` in ``gradients``, ``state`` in
        ``states`` and ``reward`` in ``rewards``. ``probs`` is filled by
        ``act``, not here.
        """
        one_hot = np.zeros(self.action_size, dtype='float32')
        one_hot[action] = 1
        self.gradients.append(one_hot - prob)
        self.states.append(state)
        self.rewards.append(reward)

    def act(self, state):
        """Sample an action from the policy for ``state``.

        ``state`` is passed straight to ``model.predict`` with batch size 1
        (the notebook's caller passes an array of shape (1, state_size)).

        Returns:
            (action, prob): the sampled action index and the re-normalized
            probability vector it was drawn from. The raw network output is
            also appended to ``self.probs``.
        """
        aprob = self.model.predict(state, batch_size=1).flatten()
        self.probs.append(aprob)
        # Re-normalize so the vector sums to one before it is handed to
        # np.random.choice.
        prob = aprob / np.sum(aprob)
        action = np.random.choice(self.action_size, 1, p=prob)[0]
        return action, prob

    def discount_rewards(self, rewards):
        """Return the discounted running sum of ``rewards``, computed backwards.

        A non-zero reward resets the running sum before it is added, so the
        discounting never carries across a step that handed out a reward.

        Args:
            rewards: array-like with one entry per step, either 1-D or a
                column of shape (N, 1).

        Returns:
            Float array with the same shape as ``rewards``. The result is
            always floating point, so integer input is no longer truncated.
        """
        rewards = np.asarray(rewards, dtype=float)
        discounted_rewards = np.zeros_like(rewards)
        running_add = 0
        for t in reversed(range(0, rewards.size)):
            if rewards[t] != 0:
                running_add = 0
            running_add = running_add * self.gamma + rewards[t]
            discounted_rewards[t] = running_add
        return discounted_rewards

    def train(self, rewards):
        """Run one training batch on everything buffered, then clear the buffers.

        The targets are ``probs + learning_rate * gradients * advantage`` where
        ``advantage`` is ``rewards`` centered on its mean and, when the
        standard deviation is non-zero, divided by it.

        Args:
            rewards: one (already discounted) value per buffered step, either
                1-D or a column of shape (N, 1).
        """
        gradients = np.vstack(self.gradients)
        rewards = np.asarray(rewards)
        advantage = rewards - np.mean(rewards)
        spread = np.std(rewards)
        # Guard against 0/0: with identical rewards the centered values are
        # already all zero and dividing would only produce NaN targets.
        if spread > 0:
            advantage = advantage / spread
        # One advantage per row, broadcast across the action axis.
        gradients *= advantage.reshape(-1, 1)
        # vstack keeps the batch axis even for a single buffered step.
        X = np.vstack(self.states)
        Y = np.vstack(self.probs) + self.learning_rate * gradients
        self.model.train_on_batch(X, Y)
        self.states, self.probs, self.gradients, self.rewards = [], [], [], []

    def load(self, name):
        """Load model weights from the file ``name``."""
        self.model.load_weights(name)

    def readlog(self, LogName):
        """Set ``self.episode`` from the last non-empty line of a log file.

        The line is split on tabs and the field following the literal
        ``'Episode'`` is parsed as an integer.

        Raises:
            ValueError: if the file has no non-empty line, or the last line
                has no ``Episode`` field followed by an integer.
        """
        with open(LogName, 'rb') as f:
            # Scan backwards from the end so only the tail of the file is read.
            f.seek(0, 2)
            end = f.tell()
            # Skip any trailing line terminators.
            while end > 0:
                f.seek(end - 1)
                if f.read(1) not in (b'\n', b'\r'):
                    break
                end -= 1
            # Walk back to the newline that precedes the last line, stopping
            # at the start of the file for single-line logs.
            start = end
            while start > 0:
                f.seek(start - 1)
                if f.read(1) == b'\n':
                    break
                start -= 1
            f.seek(start)
            # The file is read as bytes; decode before doing text operations.
            last = f.read(end - start).decode('utf-8')
        if not last:
            raise ValueError('log file %r contains no entries' % (LogName,))
        fields = last.split('\t')
        try:
            self.episode = int(fields[fields.index('Episode') + 1])
        except (ValueError, IndexError):
            raise ValueError('no episode number in last log line: %r' % (last,))

    def save(self, name):
        """Write model weights to the file ``name``."""
        self.model.save_weights(name)

def preprocess(I):
    """Turn a raw frame into a flat binary float vector.

    Rows 35..194 of ``I`` are kept, rows and columns are subsampled by two,
    and only channel 0 is used. Pixels equal to 0, 144 or 109 become 0.0
    (144 and 109 are presumably the two background shades - TODO confirm);
    every other pixel becomes 1.0.

    The input array is not modified.

    Args:
        I: array indexable as ``I[row, col, channel]``; a 210x160x3 frame
            yields a vector of length 80 * 80.

    Returns:
        1-D ``float64`` array of zeros and ones.
    """
    # Crop + downsample in one slice; the comparisons below build new arrays,
    # so nothing is written through this view into the caller's frame.
    plane = np.asarray(I)[35:195:2, ::2, 0]
    foreground = (plane != 0) & (plane != 144) & (plane != 109)
    # np.float was removed from NumPy; float64 is the type it aliased.
    return foreground.astype(np.float64).ravel()
if __name__ == "__main__":
    # Environment and first observation.
    env = gym.make("Pong-v0")
    state = env.reset()

    # Per-run bookkeeping: previous preprocessed frame, running score,
    # episode counter.
    prev_x, score, episode = None, 0, 0

    # The network input is the flattened 80x80 frame; one output per action.
    state_size = 80 * 80
    action_size = env.action_space.n
    agent = PGAgent(state_size, action_size)

# Print the Keras layer table for the freshly built model.
agent.summary()

Using TensorFlow backend.
[2017-06-14 20:15:10,997] Making new env: Pong-v0


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_1 (Reshape)          (None, 80, 80, 1)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 27, 27, 25)        925       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 27, 27, 5)         4505      
_________________________________________________________________
flatten_1 (Flatten)          (None, 3645)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 20)                72920     
_________________________________________________________________
dense_2 (Dense)              (None, 20)                420       
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 126       
Total para



In [3]:
if __name__ == "__main__":
    # Environment and first observation.
    env = gym.make("Pong-v0")
    state = env.reset()

    # Per-run bookkeeping: previous preprocessed frame, running score,
    # episode counter.
    prev_x, score, episode = None, 0, 0

    # The network input is the flattened 80x80 frame; one output per action.
    state_size = 80 * 80
    action_size = env.action_space.n
    agent = PGAgent(state_size, action_size)

# Print the Keras layer table for the freshly built model.
agent.summary()

[2017-06-14 20:15:11,515] Making new env: Pong-v0


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_2 (Reshape)          (None, 80, 80, 1)         0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 27, 27, 25)        925       
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 27, 27, 5)         4505      
_________________________________________________________________
flatten_2 (Flatten)          (None, 3645)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 20)                72920     
_________________________________________________________________
dense_5 (Dense)              (None, 20)                420       
_________________________________________________________________
dense_6 (Dense)              (None, 6)                 126       
Total para



In [11]:
if __name__ == "__main__":
    # Environment and first observation.
    env = gym.make("Pong-v0")
    state = env.reset()

    # Per-run bookkeeping: previous preprocessed frame, running score,
    # episode counter.
    prev_x, score, episode = None, 0, 0

    # The network input is the flattened 80x80 frame; one output per action.
    state_size = 80 * 80
    action_size = env.action_space.n
    agent = PGAgent(state_size, action_size)

# Run configuration: weights and log share the agent's name under Models/.
AgentName = 'pong_minimal-s5L1'
AgentFile = 'Models/%s.h5' % AgentName
LogName = 'Models/%s.log' % AgentName

# Flags read by the training cell: reload weights/log first, draw each frame.
resume, render = 1, 1

[2017-06-14 21:28:32,037] Making new env: Pong-v0


In [5]:
from sklearn.decomposition import PCA
# Single-component PCA; the training cell fits it on the stacked gradients.
pca = PCA(n_components=1)

In [1]:
if __name__ == "__main__":
    # Needs `resume`, `render`, `AgentFile`, `LogName` and `pca` from the
    # earlier configuration cells, plus the imports / PGAgent / preprocess
    # cell - run those first on a fresh kernel.
    env = gym.make("Pong-v0")
    state = env.reset()
    prev_x = None
    score = 0
    episode = 0

    state_size = 80 * 80
    action_size = env.action_space.n
    agent = PGAgent(state_size, action_size)

    if resume:
        # Continue from the saved weights and the episode number in the log.
        agent.load(AgentFile)
        agent.readlog(LogName)
        episode = agent.episode

    # Play exactly one episode. The starting value is snapshotted locally
    # because `agent.episode` only exists after readlog(), i.e. when resuming;
    # comparing against it directly raised AttributeError otherwise.
    start_episode = episode
    while episode == start_episode:
        if render:
            env.render()

        # The network sees the difference between consecutive preprocessed
        # frames (all zeros on the first frame of an episode).
        cur_x = preprocess(state)
        x = cur_x - prev_x if prev_x is not None else np.zeros(state_size)
        prev_x = cur_x

        # act() expects a batch of one: shape (1, state_size).
        action, prob = agent.act(x.reshape(1, -1))
        state, reward, done, info = env.step(action)
        score += reward
        agent.remember(x, action, prob, reward)

        if done:
            episode += 1
            rewards = np.vstack(agent.rewards)
            rewards = agent.discount_rewards(rewards)

            # Share of the gradient variance captured by the first principal
            # component; must be computed before train() clears the buffers.
            grads = np.vstack(agent.gradients)
            pca.fit(grads)
            lvar = pca.explained_variance_ratio_[0]

            agent.train(rewards)
            msg = '%s\t%d\t%s\t%f\t%s\t%f' % ('Episode', episode, 'Score', score, 'largest_variance', lvar)
            print(msg)
            with open(LogName, 'a+') as LogFile:
                LogFile.write(msg + '\n')

            score = 0
            state = env.reset()
            prev_x = None
            # Weights are only checkpointed every 10th episode and only when
            # rendering is off.
            if episode > 1 and episode % 10 == 0 and not render:
                agent.save(AgentFile)

NameError: name 'gym' is not defined

In [7]:
# Fraction of the total variance explained by the single fitted principal
# component (requires `pca` to have been fitted by the training cell).
pca.explained_variance_ratio_[0]

0.99948996594645101

In [8]:
# # len(agent.probs)
# from sklearn.decomposition import PCA
# X = np.vstack(agent.gradients)
# pca = PCA(n_components = 1)
# pca.fit(X)
# print(pca.explained_variance_ratio_)