<a href="https://colab.research.google.com/github/sjbaek12/sjbaek12.github.io/blob/master/policy_gradient_module.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import numpy as np

from tensorflow import keras
import keras.layers as layers
from keras.layers import Dense
from keras.layers import Input
from keras.models import Model
from keras.optimizers import Adam, RMSprop, SGD
import keras.backend as K  # Access the Keras backend (TensorFlow here) through the `K.` prefix.

# NOTE: this is a bare attribute reference (the function is not called), so it
# does not change the execution mode. It has to come after `import tensorflow`,
# otherwise `tf` is undefined when this cell runs on a fresh kernel.
tf.config.run_functions_eagerly

In [None]:
class Agent(object):
  """REINFORCE-style policy-gradient agent built on a small Keras MLP.

  The agent collects (observation, action, reward) transitions for one
  episode via `store_transition`, then `learn` performs a single gradient
  step on the whole episode and clears the buffers.

  Args:
    ALPHA: learning rate passed to the Adam optimizer.
    GAMMA: discount factor used when computing returns.
    n_actions: number of discrete actions (size of the softmax output).
    layer1_size: width of the first hidden layer.
    layer2_size: width of the second hidden layer.
    input_dims: length of the observation vector.
    fname: file name stored in `model_file` (not used inside this class).
  """

  def __init__(self, ALPHA=0.01, GAMMA=0.99, n_actions=4, layer1_size=8, layer2_size=10, input_dims=4, fname='reinforce.h5'):
    self.gamma = GAMMA
    self.lr = ALPHA
    self.G = 0  # normalized returns of the most recent `learn` call
    self.input_dims = input_dims
    self.fc1_dims = layer1_size
    self.fc2_dims = layer2_size
    self.n_actions = n_actions
    self.state_memory = []
    self.action_memory = []
    self.reward_memory = []

    self.policy, self.predict = self.build_policy_network()
    self.action_space = list(range(n_actions))
    self.model_file = fname

  def build_policy_network(self):
    """Build the training model and the inference model.

    Both models share the same layers. The training model takes the
    advantages as a second input so that the loss can weight each sample's
    log-likelihood by its advantage.

    Returns:
      (policy, predict): `policy` is compiled for training with inputs
      [states, advantages]; `predict` maps states to action probabilities.
    """
    state_input = Input(shape=(self.input_dims,))
    advantages = Input(shape=[1])
    hidden1 = Dense(self.fc1_dims, activation='relu')(state_input)
    hidden2 = Dense(self.fc2_dims, activation='relu')(hidden1)
    probs = Dense(self.n_actions, activation='softmax')(hidden2)

    def custom_loss(y_true, y_pred):
      # Clip to keep log() finite when a probability saturates at 0 or 1.
      out = K.clip(y_pred, 1e-8, 1-1e-8)
      # keepdims=True keeps the log-likelihood at (batch, 1) so it lines up
      # element-wise with `advantages` (batch, 1). Without it, (batch,) times
      # (batch, 1) broadcasts to (batch, batch) and every sample is weighted
      # by every advantage.
      log_lik = K.sum(y_true*K.log(out), axis=1, keepdims=True)*advantages
      return -log_lik

    policy = Model(inputs=[state_input, advantages], outputs=probs)
    # `learning_rate` is the current keyword; `lr` is a deprecated alias.
    policy.compile(optimizer=Adam(learning_rate=self.lr), loss=custom_loss)

    predict = Model(inputs=state_input, outputs=probs)

    return policy, predict

  def choose_action(self, observation):
    """Sample an action index from the policy's distribution for `observation`."""
    state = observation[np.newaxis, :]  # add a leading batch axis: [1.0, 1.0] -> [[1.0, 1.0]]
    probabilities = self.predict.predict(state)[0]
    action = np.random.choice(self.action_space, p=probabilities)

    return action

  def store_transition(self, observation, action, reward):
    """Append one step of the current episode to the replay buffers."""
    self.action_memory.append(action)
    self.state_memory.append(observation)
    self.reward_memory.append(reward)

  def learn(self):
    """Run one policy-gradient update on the stored episode, then clear it.

    Computes the discounted return for every step, normalizes the returns
    to zero mean and unit standard deviation (the division is skipped when
    the standard deviation is 0), and trains the policy on the one-hot
    encoded actions weighted by those returns. Does nothing when no
    transitions are stored.
    """
    if not self.reward_memory:
      return

    state_memory = np.array(self.state_memory)
    action_memory = np.array(self.action_memory)
    # Force float so integer rewards do not truncate the discounted returns.
    reward_memory = np.array(self.reward_memory, dtype=np.float64)
    n_steps = len(action_memory)

    # One-hot encode the chosen action of every step.
    y = np.zeros([n_steps, self.n_actions])
    y[np.arange(n_steps), action_memory] = 1

    # Discounted returns via a single backward pass: G[t] = r[t] + gamma * G[t+1].
    G = np.zeros_like(reward_memory)
    running_return = 0.0
    for t in reversed(range(n_steps)):
      running_return = reward_memory[t] + self.gamma * running_return
      G[t] = running_return

    mean = np.mean(G)
    std = np.std(G)
    if std <= 0:
      std = 1
    # Column vector sized from the episode itself rather than a fixed length.
    self.G = np.reshape((G - mean) / std, (-1, 1))

    self.policy.train_on_batch([state_memory, self.G], y)

    self.state_memory = []
    self.action_memory = []
    self.reward_memory = []
    


In [None]:
# Per-arm thresholds: pullBandit pays 1.0 when a standard-normal draw exceeds
# the chosen arm's threshold, so lower values pay out more often.
bandits = [20,10,5,-10]

n_episodes = 4000

# Total reward of every episode, in order.
score_history = []

agent = Agent(
  ALPHA=0.01,
  GAMMA=0.99,
  n_actions=4,
  layer1_size=8,
  layer2_size=10,
  input_dims=4,
  fname='reinforce.h5',
)

def pullBandit(chosen_action):
  """Pull one arm and return its reward.

  Draws one standard-normal sample and returns 1.0 when it exceeds the
  threshold stored in `bandits` for `chosen_action`, otherwise 0.0.
  """
  draw = np.random.randn(1)
  return 1.0 if draw > bandits[chosen_action] else 0.0

# Constant observation fed to the policy on every step; pullBandit does not
# depend on it, so the same vector is reused throughout.
weights = np.array([1.0, 1.0, 1.0, 1.0])

# Number of arm pulls that make up one episode.
PULLS_PER_EPISODE = 10

for i in range(n_episodes):
  score = 0
  observation = weights

  for _ in range(PULLS_PER_EPISODE):
    action = agent.choose_action(observation)
    reward = pullBandit(action)
    agent.store_transition(observation, action, reward)
    score += reward
  score_history.append(score)

  # One policy update per episode; this also clears the agent's buffers.
  agent.learn()

#  print('episode', i, 'score:', score_history[i])


In [None]:
import matplotlib.pyplot as plt

In [None]:
# Tally how often each score occurred, skipping the first 3000 episodes.
late_scores = score_history[3000:]

zero = late_scores.count(0)
one = late_scores.count(1)
two = late_scores.count(2)
three = late_scores.count(3)
four = late_scores.count(4)
five = late_scores.count(5)

# Scores of six and seven are not tallied; the names are only initialized.
six = 0
seven = 0

In [None]:
# Show how many of the tallied episodes scored 1 through 5 (the count for 0 is not printed).
print(one, two, three, four, five)

312 288 186 74 16


In [None]:
# Demonstrate that indexing with np.newaxis adds a leading axis: shape (4,) -> (1, 4).
w = np.array(weights)
w[np.newaxis, :]

array([[1., 1., 1., 1.]])