# Solving the CartPole Problem Using Policy Gradient (PG)
> PG 알고리즘을 이용하여 CartPole 문제를 풀어보자

<video src="completed/pg-500-episodes.mp4" controls></video>

In [1]:
import os, sys, gym, copy
import numpy as np
from numpy.random import seed
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras import backend as K
from gym import envs
%matplotlib inline

Using TensorFlow backend.


In [2]:
# Seeding does not make every call return the same number; it makes the sequence of
# random numbers drawn after the seed call identical on every run.
# <a href="https://machinelearningmastery.com/reproducible-results-neural-networks-keras/">Reproducible_Keras_Vars</a>
random_seed = 4523
seed(random_seed)
# Values that the next three np.random.rand() draws are checked against.
expected=[0.32639907465349627, 0.2682238894755389, 0.7840440067765225]

class RandomNumber:
    """Compare freshly drawn NumPy random numbers with a recorded sequence."""

    def __init__(self, expected):
        # Recorded values, looked up by position in assert_equals().
        self.expected = expected

    def assert_equals(self, i):
        """Draw one value with ``np.random.rand()`` and assert it equals ``expected[i]``.

        The draw happens before the lookup, so the global NumPy RNG advances
        by one value on every call.
        """
        drawn = np.random.rand()
        assert self.expected[i] == drawn

# Verify that the seeded NumPy generator reproduces the recorded values, in order.
randn = RandomNumber(expected)
for x in range(len(expected)):
    randn.assert_equals(x)
    
# TensorFlow works the same way: providing a seed gives the same reproducibility effect.
from tensorflow import set_random_seed
import tensorflow as tf

set_random_seed(random_seed)

# Seed for the test runs; used as the default of RandomSeeder.set_seed.
seednum = 1047104523

class RandomSeeder:
    """Seeds the NumPy and TensorFlow random generators together.

    Constructing an instance immediately applies the default seed.
    """

    def __init__(self):
        self.set_seed()

    def set_seed(self, seed_number=seednum):
        """Seed NumPy and TensorFlow with ``seed_number`` and print the value used.

        The default is the value ``seednum`` had when this class was defined.
        """
        seed(seed_number)
        set_random_seed(seed_number)
        print("set_random_seed({})".format(seed_number))

    def reset(self):
        """Undo the fixed seeding.

        NumPy is re-seeded without an explicit value; TensorFlow's seed is set to 0.
        """
        seed()
        set_random_seed(0)

    def debug(self):
        """Print one value from ``np.random.rand()``; this advances the NumPy RNG."""
        x = np.random.rand()
        print(x)

In [3]:
# REINFORCE algorithm agent
class REINFORCEAgent:
    """REINFORCE (Monte-Carlo policy gradient) agent with a Keras softmax policy.

    Samples collected with append_sample() are buffered for one episode and
    consumed (then cleared) by train_model().
    """

    def __init__(self, env):
        """Build the policy network and its training function for ``env``.

        ``env`` must expose ``action_space.n`` and ``observation_space.shape``.
        """
        self.env = env
        self.action_size = env.action_space.n
        self.observation_space = env.observation_space.shape[0]
        self.discount_factor = 0.999
        self.learning_rate = 0.001
        self.states, self.actions, self.rewards = [], [], []
        self.model = self.build_model()
        self.optimizer = self.build_optimizer()
        print("action:", self.action_size, ", state: ", self.observation_space)

    def __del__(self):
        # Intentionally a no-op.
        pass

    def build_model(self):
        """Return the policy network: two 24-unit ReLU layers and a softmax over actions."""
        model = Sequential()
        model.add(Dense(24, input_dim=self.observation_space, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='softmax'))
        model.summary()
        return model

    def build_optimizer(self):
        """Return a backend function that applies one policy-gradient update.

        The function takes ``[states, one_hot_actions, discounted_rewards]``.
        The loss is ``-sum(log(pi(a|s)) * discounted_reward)``.
        """
        action = K.placeholder(shape=[None, self.action_size])
        discounted_rewards = K.placeholder(shape=[None,])
        # cross-entropy: the one-hot mask picks the probability of the taken action
        action_prob = K.sum(action * self.model.output, axis=1)
        cross_entropy = K.log(action_prob) * discounted_rewards
        loss = -K.sum(cross_entropy)
        # training-function
        optimizer = Adam(lr=self.learning_rate)
        updates = optimizer.get_updates(self.model.trainable_weights, [], loss)
        train = K.function([self.model.input, action, discounted_rewards], [], updates=updates)
        return train

    def append_sample(self, state, action, reward):
        """Buffer one transition of the current episode.

        ``state`` may be a flat observation or a ``(1, observation_space)``
        batch; it is stored as a flat vector. (An earlier bug stored the whole
        batch instead of its single row.) ``action`` is stored one-hot encoded.
        """
        self.states.append(np.reshape(state, [self.observation_space]))
        one_hot = np.zeros(self.action_size)
        one_hot[action] = 1
        self.actions.append(one_hot)
        self.rewards.append(reward)

    def get_action(self, state):
        """Sample an action index from the policy's output distribution.

        Like get_greedy_action(), accepts a flat observation or a
        ``(1, observation_space)`` batch.
        """
        _state = np.reshape(state, [1, self.observation_space])
        policy = self.model.predict(_state)[0]
        return np.random.choice(self.action_size, 1, p=policy)[0]

    def get_greedy_action(self, state):
        """Return the index of the most probable action for ``state``."""
        _state = np.reshape(state, [1, self.observation_space])
        policy = self.model.predict(_state)[0]
        return np.argmax(policy)

    def discount_rewards(self, rewards):
        """Return the discounted return at every time step as a float64 array.

        The result is always floating point; taking the dtype from integer
        rewards would truncate the discounted values.
        """
        rewards = np.asarray(rewards, dtype=np.float64)
        discounted_rewards = np.zeros(rewards.shape, dtype=np.float64)
        running_add = 0.0
        # Walk backwards so each step adds its reward to the discounted future return.
        for t in reversed(range(len(rewards))):
            running_add = running_add * self.discount_factor + rewards[t]
            discounted_rewards[t] = running_add
        return discounted_rewards

    def train_model(self):
        """Run one update on the buffered episode, then clear the buffers.

        Returns are standardised to zero mean and unit variance. Does nothing
        when no samples are buffered.
        """
        if not self.rewards:
            return

        discounted_rewards = self.discount_rewards(self.rewards).astype(np.float32)
        discounted_rewards -= np.mean(discounted_rewards)
        spread = np.std(discounted_rewards)
        # A one-step episode has zero spread; dividing would feed NaN into the update.
        if spread > 0:
            discounted_rewards /= spread

        self.optimizer([self.states, self.actions, discounted_rewards])
        self.states, self.actions, self.rewards = [], [], []

    def has_model(self, filename):
        """Return True if ``filename`` exists on disk."""
        return os.path.exists(filename)

    def save_weights(self, filename):
        """Save the policy network weights to ``filename``."""
        self.model.save_weights(filename)

    def load_weights(self, filename):
        """Load policy network weights from ``filename``."""
        self.model.load_weights(filename)

    def reset_weights(self, filename):
        """Delete the saved weights file ``filename`` if it is a regular file."""
        if os.path.isfile(filename):
            os.remove(filename)

    def clear_cache_dir(self, dirname):
        """Recursively delete ``dirname`` if it is a directory."""
        if os.path.isdir(dirname):
            import shutil
            shutil.rmtree(dirname)

In [4]:
# File the policy weights are saved to and loaded from.
model_name='./models/pg.h5'
# Not referenced by the code in this section.
image_name='./graphs/pg.png'
# Directory that test() deletes before recording a new run.
cache_dir='./cartpole'
# Gym environment id passed to gym.make().
envs_name='CartPole-v1'

def test():
    """Play one greedy episode under a recording Monitor and return its length.

    Saved weights are loaded when ``model_name`` exists. ``cache_dir`` is
    deleted first and then used as the Monitor output directory, so the
    directory that is cleared is the one that is written to.

    Returns:
        Number of steps taken before the episode reported ``done``.
    """
    env = gym.make(envs_name)
    try:
        agent = REINFORCEAgent(env)
        if agent.has_model(model_name):
            agent.load_weights(model_name)
        agent.clear_cache_dir(cache_dir)

        # Rebind to the wrapper so the single close() below also closes the monitor.
        env = gym.wrappers.Monitor(env, cache_dir)
        steps = 0
        done = False
        current_state = env.reset()
        while not done:
            env.render()
            steps += 1
            action = agent.get_greedy_action(current_state)
            current_state, _, done, _ = env.step(action)
        return steps
    finally:
        # Close even on failure so the environment and monitor are released.
        env.close()

def learn(max_episodes, render, reset):
    """Train a REINFORCE agent and save its weights to ``model_name``.

    Args:
        max_episodes: Number of episodes to run (exactly this many; the loop
            previously ran one extra).
        render: When true, render the environment on every step.
        reset: When true, delete any saved weights and train from scratch;
            otherwise continue from the saved weights if they exist.
    """
    env = gym.make(envs_name)
    try:
        agent = REINFORCEAgent(env)
        if agent.has_model(model_name):
            if reset:
                agent.reset_weights(model_name)
            else:
                agent.load_weights(model_name)

        # Batch-of-one shape taken from the agent instead of a hard-coded 4.
        state_shape = [1, agent.observation_space]
        global_step = 0

        for e in range(max_episodes):
            state = np.reshape(env.reset(), state_shape)
            score = 0
            done = False

            while not done:
                # Render after reset and once per step, so every frame is shown.
                if render:
                    env.render()
                global_step += 1

                action = agent.get_action(state)
                next_state, reward, done, _ = env.step(action)
                agent.append_sample(state, action, reward)

                score += reward
                state = np.reshape(next_state, state_shape)

            # REINFORCE updates once per finished episode.
            agent.train_model()
            print("episodes:", e, " score:", score, " time_step:" , global_step)

        # Create the target directory so a missing folder cannot lose the trained weights.
        model_dir = os.path.dirname(model_name)
        if model_dir:
            os.makedirs(model_dir, exist_ok=True)
        agent.save_weights(model_name)
    finally:
        env.close()

In [5]:
# Constructing the seeder applies its default seed to NumPy and TensorFlow.
seeder = RandomSeeder()
# Arguments are (max_episodes, render, reset): 500 episodes, no rendering,
# any saved weights file is deleted first.
learn(500, False, True)
# Play one greedy episode; test() returns the number of steps taken.
test()

set_random_seed(1047104523)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 24)                120       
_________________________________________________________________
dense_2 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 50        
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________
action: 2 , state:  4
episodes: 0  score: 11.0  time_step: 11
episodes: 1  score: 26.0  time_step: 37
episodes: 2  score: 13.0  time_step: 50
episodes: 3  score: 41.0  time_step: 91
episodes: 4  score: 28.0  time_step: 119
episodes: 5  score: 10.0  time_step: 129
episodes: 6  score: 23.0  time_step: 152
episodes: 7  score: 22.0  time_step: 174
episodes: 8  score: 25.0  time_ste

episodes: 175  score: 182.0  time_step: 7221
episodes: 176  score: 78.0  time_step: 7299
episodes: 177  score: 94.0  time_step: 7393
episodes: 178  score: 49.0  time_step: 7442
episodes: 179  score: 15.0  time_step: 7457
episodes: 180  score: 15.0  time_step: 7472
episodes: 181  score: 230.0  time_step: 7702
episodes: 182  score: 58.0  time_step: 7760
episodes: 183  score: 26.0  time_step: 7786
episodes: 184  score: 52.0  time_step: 7838
episodes: 185  score: 145.0  time_step: 7983
episodes: 186  score: 39.0  time_step: 8022
episodes: 187  score: 31.0  time_step: 8053
episodes: 188  score: 47.0  time_step: 8100
episodes: 189  score: 67.0  time_step: 8167
episodes: 190  score: 95.0  time_step: 8262
episodes: 191  score: 95.0  time_step: 8357
episodes: 192  score: 122.0  time_step: 8479
episodes: 193  score: 50.0  time_step: 8529
episodes: 194  score: 59.0  time_step: 8588
episodes: 195  score: 84.0  time_step: 8672
episodes: 196  score: 201.0  time_step: 8873
episodes: 197  score: 65.0 

episodes: 357  score: 369.0  time_step: 39650
episodes: 358  score: 207.0  time_step: 39857
episodes: 359  score: 231.0  time_step: 40088
episodes: 360  score: 465.0  time_step: 40553
episodes: 361  score: 350.0  time_step: 40903
episodes: 362  score: 196.0  time_step: 41099
episodes: 363  score: 378.0  time_step: 41477
episodes: 364  score: 179.0  time_step: 41656
episodes: 365  score: 266.0  time_step: 41922
episodes: 366  score: 372.0  time_step: 42294
episodes: 367  score: 318.0  time_step: 42612
episodes: 368  score: 172.0  time_step: 42784
episodes: 369  score: 339.0  time_step: 43123
episodes: 370  score: 500.0  time_step: 43623
episodes: 371  score: 332.0  time_step: 43955
episodes: 372  score: 207.0  time_step: 44162
episodes: 373  score: 241.0  time_step: 44403
episodes: 374  score: 111.0  time_step: 44514
episodes: 375  score: 491.0  time_step: 45005
episodes: 376  score: 500.0  time_step: 45505
episodes: 377  score: 346.0  time_step: 45851
episodes: 378  score: 260.0  time_

500