# Solving CartPole problem using PG
> PG 알고리즘을 이용하여 CartPole 문제를 풀어보자

In [32]:
import os, sys, gym, copy
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras import backend as K
from gym import envs
%matplotlib inline

In [58]:

class REINFORCEAgent:
    def __init__(self, env):
        self.env = env
        self.action_size = env.action_space.n
        self.observation_space = env.observation_space.shape[0]
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.states, self.actions, self.rewards = [], [], []
        self.model = self.build_model()
        self.optimizer = self.build_optimizer()
        print("action:", self.action_size, ", state: ", self.observation_space)
        
    def __del__(self):
        pass
    
    def build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.observation_space, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='softmax'))
        model.summary()
        return model
    
    def build_optimizer(self):
        action = K.placeholder(shape=[None, self.action_size])
        discounted_rewards = K.placeholder(shape=[None,])
        # cross-entropy
        action_prob = K.sum(action * self.model.output, axis=1)
        cross_entropy = K.log(action_prob) * discounted_rewards
        loss = -K.sum(cross_entropy)
        # training-function
        optimizer = Adam(lr=self.learning_rate)
        updates = optimizer.get_updates(self.model.trainable_weights, [], loss)
        train = K.function([self.model.input, action, discounted_rewards], [], updates=updates)
        return train
    
    def append_sample(self, state, action, reward):
        self.states.append(state[0]) # bugfix, state[0] 대신 state 를 넣었음
        act = np.zeros(self.action_size)
        act[action] = 1
        self.actions.append(act)
        self.rewards.append(reward)
    
    def get_action(self, state):
        policy = self.model.predict(state)[0]
        return np.random.choice(self.action_size, 1, p=policy)[0]
    
    def discount_rewards(self, rewards):
        discounted_rewards = np.zeros_like(rewards)
        running_add = 0
        for t in reversed(range(0, len(rewards))):
            running_add = running_add * self.discount_factor + rewards[t]
            discounted_rewards[t] = running_add
        return discounted_rewards
    
    def train_model(self):
        print('rewards.shape', type(self.rewards))
        print('discount_rewards', type(self.discount_rewards(self.rewards)))
        discounted_rewards = np.float32(self.discount_rewards(self.rewards))
        discounted_rewards -= np.mean(discounted_rewards)
        discounted_rewards /= np.std(discounted_rewards)
        print('discounted_rewards', type(discounted_rewards), discounted_rewards.shape)
        print('states', type(self.states), len(self.states))
        print('actions', type(self.actions))
        print('rewards', type(self.rewards))
        
        self.optimizer([self.states, self.actions, discounted_rewards])
        self.states, self.actions, self.rewards = [], [], []

In [59]:

def learn(max_episodes, render, reset):
    env = gym.make('CartPole-v1')
    agent = REINFORCEAgent(env)
    global_step = 0
    episodes, scores = [], []
    
    for e in range(max_episodes+1):
        if render:
            env.render()
            
        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1,4])
        print(state)
        break
        
        while not done:
            global_step += 1
            
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            agent.append_sample(state, action, reward)
            
            score += reward
            next_state = np.reshape(next_state, [1,4])
            state = copy.deepcopy(next_state)
            
            if done:
                agent.train_model()
                scores.append(score)
                episodes.append(e)
                print("episodes:", e, " score:", score, " time_step:" , global_step)
            

In [62]:
learn(1000, True, False)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_58 (Dense)             (None, 24)                120       
_________________________________________________________________
dense_59 (Dense)             (None, 24)                600       
_________________________________________________________________
dense_60 (Dense)             (None, 2)                 50        
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________
action: 2 , state:  4
[[ 0.03144073 -0.0032607   0.00020903 -0.04581364]]
