<a href="https://colab.research.google.com/github/sjbaek12/sjbaek12.github.io/blob/master/bandit_policy_gradient.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import numpy as np

높은 보상을 일으키는 행동의 weights를 증가시킨다.

이것은 보상에 대한 그라디언트가 아니라 Policy 함수를 직접 편미분하는 방식이다. 아래 예제는 정확한 답을 구해내고는 있지만 사실 이것은 policy gradient가 아니다.

In [None]:
bandits = [0.2,0,-0.2,-5]

def pullBandit(chosen_action):
  result = np.random.randn(1)
  rewards = [0.0, 0.0, 0.0, 0.0]
  if result > bandits[chosen_action]:
    rewards[chosen_action] = 1.0
    return rewards
  else:
    rewards[chosen_action] = -1.0
    return rewards

num_episode = 4000

episode = 0 

e = 0.5

weights = [0.1, 0.1, 0.1, 0.1]

small = 0.000000000001

def deriva():       # policy 함수를 구성하는 웨이트의 그래디언트를 직접 구하는 함수이다
  dv = []
  for i in range(len(weights)):
    prob_chosen_small = (weights[i]+small)/(np.sum(weights)+small)
    prob_chosen = (weights[i])/np.sum(weights)
    d = (prob_chosen_small - prob_chosen)/small
    dv.append(d)
  return dv


while episode < num_episode:
  
  if np.random.rand(1) < e:
    chosen_action = np.random.randint(4)    
  else:
    probs = [weights[0]/np.sum(weights), weights[1]/np.sum(weights), weights[2]/np.sum(weights), weights[3]/np.sum(weights)]
    chosen_action = np.argmax(probs)
    
  rewards = pullBandit(chosen_action)
  dv = deriva()
  weights[chosen_action] = max((weights[chosen_action] + 0.01*np.matmul(dv,rewards), 0))
  
  
  if episode % 100 == 0:
    print(chosen_action, rewards, weights)

  e = e * (1- episode/num_episode)


  episode = episode + 1


policy gradient는 목적함수인 E[r]을 최대화 하는 것이다.

이론적 도출과정을 통해서 ▽E[r] = E[▽log p(s,a) * r]과 같고,▽logp(s,a)는 score 함수라고 부른다.

뉴럴넷에서 loss 함수는 신경망의 mininmize 목적함수이고 cross-entropy함수로 접근하면 loss = - y*ln(a)이다.
여기서 y = 1일때 r이 발생되므로 이것은 최대화의 경우를 생각하면 ln(a)*r을 목적함수를 생각하면 된다. 

In [2]:
from tensorflow import keras
import keras.layers as layers
from keras.layers import Dense
from keras.models import Model
from keras.optimizers import Adam, RMSprop, SGD
import keras.backend as K  # 케라스의 backend를 K. 형식으로 호출하는 것이고 여기서는 tensorflow이다.
import tensorflow as tf

In [16]:
def custom_loss_test1(y_true, y_pred):
  log_lik = K.sum(K.log(y_pred + 1e-20)*y_true, axis=1)*adv
#  log_lik = K.log(y_pred + 1e-10)*y_true*adv
  return -log_lik # 모든 샘플의 크로스엔트로피의 평균인데, n의 배수로 나누어준 값이 된다.


In [17]:
def get_policy_model(lr):
  
  adv = layers.Input(shape=[1])
  inp_L = layers.Input(shape=(4,)) 
  dense1 = Dense(8,activation="relu")(inp_L) 
  dense1 = Dense(10, activation="relu")(dense1)
  output_s = layers.Dense(4, activation="softmax")(dense1)

  model_train = Model(inputs=[inp_L, adv], outputs = output_s)
#  model_train = Model(inputs=inp_L, outputs = output_s)
  model_train.compile(loss=custom_loss_test1, optimizer=SGD(lr), metrics = ['mae'])
  model_predict = Model(inputs=inp_L, outputs=output_s)   # model_train.predict와 model_predict.predict의 결과는 완전히 일치한다. 다만 입력의 편의를 위해서 분리한다.
  return model_train, model_predict

In [18]:
model_train, model_predict = get_policy_model(0.1)

In [19]:
model_train.summary()

Model: "functional_13"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            [(None, 4)]          0                                            
__________________________________________________________________________________________________
dense_9 (Dense)                 (None, 8)            40          input_8[0][0]                    
__________________________________________________________________________________________________
dense_10 (Dense)                (None, 10)           90          dense_9[0][0]                    
__________________________________________________________________________________________________
input_7 (InputLayer)            [(None, 1)]          0                                            
______________________________________________________________________________________

In [20]:
bandits = [20,10,5,-10]

def pullBandit(chosen_action):
  result = np.random.randn(1)
  rewards = [0.0, 0.0, 0.0, 0.0]
  if result > bandits[chosen_action]:
    rewards[chosen_action] = 1.0
    return rewards
  else:
    rewards[chosen_action] = 0.0
    return rewards

num_episode = 4000

episode = 0 


weights = [1.0, 1.0, 1.0, 1.0]
score = [0.0, 0.0, 0.0, 0.0]

reward_memory = []
action_memory =[]

while episode < num_episode:
  
  for i in range(10):
    
    probs = model_predict.predict([weights])
    chosen_action = np.random.choice(4, p=probs[0])
    
    rewards = np.sum(pullBandit(chosen_action))
    reward_memory.append(rewards)

    b = [0.0, 0.0, 0.0,0.0]
    b[chosen_action] = 1.0
    action_memory.append(b)
  
  y = np.array(action_memory) 

  G = np.zeros_like(reward_memory)

  for t in range(len(reward_memory)):
    G_sum = 0.0
    discount = 1
    gamma = 1

    for k in range(t, len(reward_memory)):
      G_sum += reward_memory[k]*discount
      discount *= gamma

    G[t]= G_sum

  mean = np.mean(G)
  std = np.std(G) if np.std(G) > 0 else 1
  G = (G-mean)/std 

  G = np.array(G)
  adv = np.reshape(G, (10,1))

  
#  adv = np.array([[np.sum(pullBandit(chosen_action))]])
  
  a = []
  for i in range(len(reward_memory)):
    a.append(weights)
  a = np.array(a)

  
  loss=model_train.train_on_batch([a, adv], y)

  reward_memory = []
  action_memory =[]

  if episode % 500 == 0:
    print(episode, chosen_action, "p_act1:{:0.2f}, p_act2:{:0.2f}, p_act3:{:0.2f}, p_act4:{:0.2f}  ".format(probs[0][0],probs[0][1],probs[0][2],probs[0][3]))

  episode = episode + 1


0 3 p_act1:0.19, p_act2:0.21, p_act3:0.29, p_act4:0.31  
500 1 p_act1:0.19, p_act2:0.21, p_act3:0.29, p_act4:0.31  
1000 3 p_act1:0.19, p_act2:0.21, p_act3:0.29, p_act4:0.31  
1500 2 p_act1:0.19, p_act2:0.21, p_act3:0.29, p_act4:0.31  
2000 0 p_act1:0.19, p_act2:0.21, p_act3:0.29, p_act4:0.31  
2500 0 p_act1:0.19, p_act2:0.21, p_act3:0.29, p_act4:0.31  
3000 0 p_act1:0.19, p_act2:0.21, p_act3:0.29, p_act4:0.31  
3500 0 p_act1:0.19, p_act2:0.21, p_act3:0.29, p_act4:0.31  


In [None]:
print("averge: {:0.2f}".format(0.009001))

averge: 0.01


In [21]:
 model_predict.predict([weights])

array([[0.19022828, 0.20891608, 0.29173118, 0.3091244 ]], dtype=float32)

In [None]:
  a = []
  for i in range(len(reward_memory)):
    a.append(weights)
  a = np.array(a)

In [None]:
a = [0.0, 0.0, 0.0, 0.0]
b = [1.0, 1.0, 1.0, 1.0]

[sum(x) for x in zip(a, b)]

[1.0, 1.0, 1.0, 1.0]

In [None]:
def custom_loss_test2(y_true, y_pred):
  log_lik = K.sum(K.log(y_true * (y_true - y_pred) + (1-y_true) * (y_true+y_pred)))*adv
#
  return log_lik # 모든 샘플의 크로스엔트로피의 평균인데, n의 배수로 나누어준 값이 된다.


y_pred = np.array([[0.99, 0.5, 0.1, 0.1]])
y_true = np.array([[1.0,0.0,0.0, 0.0]])

adv = np.array([1.0]) 

custom_loss_test2(y_true, y_pred)

<tf.Tensor: shape=(1,), dtype=float64, numpy=array([-9.90348755])>

In [None]:
a = [0.1, 0.1, 0.1, 0.1]

a = np.array([a])

 

b = np.array([[0.0, 1.0, 0.0, 0.0]])

adv = np.array([[-1.0]])

history = model_train.train_on_batch([a, adv],b)



In [None]:
class Agendt(object):
  def __init__(self, ALPHA, )


y_pred = np.array([[0.9, 0.9, 0.1, 0.0]])
y_true = np.array([[1.0,0.0,0.0, 0.0]])

adv = np.array([1.0]) 

print(custom_loss(y_true, y_pred))

def choose_action(observation):
  probabilites = model_predict.predict()
  action = np.random.choice(4, p=probabilities)
  return action

def store_transtion(observation, action, reward):
  action_memory.append

tf.Tensor(0.10536051565782628, shape=(), dtype=float64)


In [None]:
def custom_loss(y_true, y_pred):
  out = K.clip(y_pred, 1e-8, 1-1e-8)
  log_lik = y_true*K.log(out)*adv

  return K.sum(-log_lik)