In [6]:
# TODO: also add a deque (replay memory) and make this work like DQN


In [174]:
from collections import deque
import sys
import gym
import pylab
import numpy as np
from keras.layers import Dense,merge
from keras.models import Sequential ,Input, Model
from keras.optimizers import Adam
from keras import backend as K
import random
import matplotlib.pyplot as plt

In [175]:
from numpy.random import randint

In [205]:
#target만들어서 해줘보자
class ActorCriticModel:
    def __init__(self,state_size,action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.value_size = 1
        
        self.discount_factor = 0.99
        self.actor_lr = 0.001
        self.critic_lr = 0.005
        self.epsilon = 1
        self.decaying_epsilon = 0.999
        self.tau = 0.001
        self.memory = deque(maxlen =2000)
        
        self.actor ,_ = self.build_actor()
        self.target_actor,_ = self.build_actor()
        #
        self.actor_updater = self.actor_optimizer()
        self.target_actor_updater = self.actor_optimizer()
        
        self.critic = self.build_critic()
        self.target_critic = self.build_critic()
        self.critic_updater = self.critic_optimizer()
        self.target_critic_updater = self.critic_optimizer()
        
        
        
    def build_actor(self):
        input_layer = Input(shape =(self.state_size,))
        h1 = Dense(24, activation ='relu',kernel_initializer='he_uniform')(input_layer)
        h2 = Dense(24, activation ='relu',kernel_initializer='he_uniform')(h1)
        h3 = Dense(self.action_size, activation = 'relu',kernel_initializer='he_uniform')(h2)
        output = Dense(self.action_size, activation='softmax',kernel_initializer='he_uniform')(h3)
        model = Model(inputs =input_layer, outputs = output)
        return model,input_layer
        
    def build_critic(self):
        critic = Sequential()
        critic.add(Dense(24, input_dim=self.state_size, activation='relu',
                         kernel_initializer='he_uniform'))
        critic.add(Dense(24, input_dim=self.state_size, activation='relu',
                         kernel_initializer='he_uniform'))
        critic.add(Dense(self.value_size, activation='linear',
                         kernel_initializer='he_uniform'))
        return critic
        
        
        #return model
        
    def get_action(self,state):
        if(random.random()< self.epsilon):
            if(self.epsilon > 0.01):
                self.epsilon = self.epsilon * self.decaying_epsilon
            
            return randint((self.action_size))
        
        policy= self.target_actor.predict(state) #.flatten()
        #print(policy)
        #print(np.random.choice(self.action_size,1,p=policy[0])[0])
        return np.random.choice(self.action_size,1,p=policy[0])[0]
        
    def actor_optimizer(self):
        action = K.placeholder(shape =[None, self.action_size])
        advantage = K.placeholder(shape=[None,])
        action_prob = K.sum(action * self.actor.output,axis = 1)
        cross_entropy = K.log(action_prob) * advantage
        loss = -K.sum(cross_entropy) #최대화하려면 마이너스?
        
        optimizer = Adam(lr=self.actor_lr)
        updates = optimizer.get_updates(self.actor.trainable_weights,[],loss)
        train = K.function([self.actor.input,action,advantage],[],updates=updates)
        return train
            
    def critic_optimizer(self):
        target = K.placeholder(shape=[None,])
        loss = K.mean(K.square(target - self.critic.output))
        
        optimizer = Adam(lr=self.critic_lr)
        updates = optimizer.get_updates(self.critic.trainable_weights,[],loss)
        train = K.function([self.critic.input,target],[],updates = updates)
        return train
    
    def target_actor_train(self):
        actor_weights = self.actor.get_weights()
        actor_target_weights = self.target_actor.get_weights()
        for i in range(len(actor_weights)):
            actor_target_weights[i] = self.tau * actor_weights[i] + (1-self.tau) \
            * actor_target_weights[i]
        self.target_actor.set_weights(actor_target_weights)
    
    def target_critic_train(self):
        critic_weights = self.critic.get_weights()
        critic_target_weights = self.target_critic.get_weights()
        for i in range(len(critic_weights)):
            critic_target_weights[i] = self.tau * critic_weights[i] + (1-self.tau)\
            * critic_target_weights[i]
        self.target_critic.set_weights(critic_target_weights)
    
    def train_model(self,batch_size):
        minibatch = random.sample(self.memory, batch_size)
        
        for state, action, reward, next_state, done in minibatch:
            value = self.target_critic.predict(state)[0]
            next_value = self.target_critic.predict(next_state)[0]
            
            #print(value)
            act = np.zeros([1,self.action_size])
            act[0][action] = 1


            if done:
                advantage = reward - value
                target = [reward]

            else :
                advantage = (reward + self.discount_factor * next_value) - value
                target = reward + self.discount_factor * next_value
            self.actor_updater([state,act,advantage])
            self.critic_updater([state,target])
        self.target_actor_train()
        self.target_critic_train()
        self.memory.clear()
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state,action,reward,next_state,done))

In [206]:

# CartPole-v1 environment; the maximum number of time steps is 500
EPISODES = 100
env = gym.make('CartPole-v1')
    # 환경으로부터 상태와 행동의 크기를 받아옴
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

    # 액터-크리틱(A2C) 에이전트 생성
agent = ActorCriticModel(state_size, action_size)

scores, episodes = [], []



In [207]:
batch_size = 128
for e in range(1000):
    done = False
    score = 0
    rendering = False
    state = env.reset()
    state = np.reshape(state, [1, state_size])

    while not done:
        if rendering:
            env.render()

        action = agent.get_action(state)
        #print(action)
        next_state, reward, done, info = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
            # 에피소드가 중간에 끝나면 -100 보상
        reward = reward if not done or score == 499 else -100
        agent.remember(state, action, reward, next_state, done)  
        #agent.train_model(state, action, reward, next_state, done)

        score += reward
        state = next_state
        if done:
                # 에피소드마다 학습 결과 출력
            score = score if score == 500.0 else score + 100
            scores.append(score)
            episodes.append(e)
            print("episode:", e, "  score:", score)

                # 이전 10개 에피소드의 점수 평균이 490보다 크면 학습 중단
            if np.mean(scores[-min(10, len(scores)):]) > 490:
                    #agent.actor.save_weights("./save_model/cartpole_actor.h5")
                    #agent.critic.save_weights(
                    #    "./save_model/cartpole_critic.h5")
                sys.exit()
        if len(agent.memory) > batch_size:
            agent.train_model(batch_size)

episode: 0   score: 21.0
episode: 1   score: 19.0
episode: 2   score: 13.0
episode: 3   score: 19.0
episode: 4   score: 13.0
episode: 5   score: 17.0
episode: 6   score: 13.0
episode: 7   score: 17.0
episode: 8   score: 12.0
episode: 9   score: 12.0
episode: 10   score: 12.0
episode: 11   score: 17.0
episode: 12   score: 18.0
episode: 13   score: 20.0
episode: 14   score: 22.0
episode: 15   score: 16.0
episode: 16   score: 27.0
episode: 17   score: 43.0
episode: 18   score: 22.0
episode: 19   score: 18.0
episode: 20   score: 39.0
episode: 21   score: 24.0
episode: 22   score: 15.0
episode: 23   score: 19.0
episode: 24   score: 11.0
episode: 25   score: 28.0
episode: 26   score: 31.0
episode: 27   score: 12.0
episode: 28   score: 12.0
episode: 29   score: 46.0
episode: 30   score: 15.0
episode: 31   score: 30.0
episode: 32   score: 26.0
episode: 33   score: 38.0
episode: 34   score: 13.0
episode: 35   score: 26.0
episode: 36   score: 16.0
episode: 37   score: 11.0
episode: 38   score: 1

episode: 310   score: 22.0
episode: 311   score: 27.0
episode: 312   score: 14.0
episode: 313   score: 17.0
episode: 314   score: 54.0
episode: 315   score: 11.0
episode: 316   score: 12.0
episode: 317   score: 23.0
episode: 318   score: 10.0
episode: 319   score: 23.0
episode: 320   score: 14.0
episode: 321   score: 21.0
episode: 322   score: 13.0
episode: 323   score: 11.0
episode: 324   score: 14.0
episode: 325   score: 21.0
episode: 326   score: 10.0
episode: 327   score: 24.0
episode: 328   score: 31.0
episode: 329   score: 13.0
episode: 330   score: 15.0
episode: 331   score: 10.0
episode: 332   score: 22.0
episode: 333   score: 26.0
episode: 334   score: 51.0
episode: 335   score: 31.0
episode: 336   score: 26.0
episode: 337   score: 13.0
episode: 338   score: 10.0
episode: 339   score: 17.0
episode: 340   score: 17.0
episode: 341   score: 20.0
episode: 342   score: 24.0
episode: 343   score: 21.0
episode: 344   score: 11.0
episode: 345   score: 33.0
episode: 346   score: 13.0
e

episode: 616   score: 16.0
episode: 617   score: 13.0
episode: 618   score: 22.0
episode: 619   score: 37.0
episode: 620   score: 53.0
episode: 621   score: 17.0
episode: 622   score: 12.0
episode: 623   score: 56.0
episode: 624   score: 31.0
episode: 625   score: 26.0
episode: 626   score: 27.0
episode: 627   score: 13.0
episode: 628   score: 22.0
episode: 629   score: 34.0
episode: 630   score: 12.0
episode: 631   score: 13.0
episode: 632   score: 16.0
episode: 633   score: 22.0
episode: 634   score: 46.0
episode: 635   score: 23.0
episode: 636   score: 15.0
episode: 637   score: 13.0
episode: 638   score: 18.0
episode: 639   score: 9.0
episode: 640   score: 10.0
episode: 641   score: 20.0
episode: 642   score: 21.0
episode: 643   score: 20.0
episode: 644   score: 8.0
episode: 645   score: 11.0
episode: 646   score: 20.0
episode: 647   score: 15.0
episode: 648   score: 13.0
episode: 649   score: 14.0
episode: 650   score: 14.0
episode: 651   score: 25.0
episode: 652   score: 8.0
epis

KeyboardInterrupt: 

In [None]:
plt.plot(scores)
plt.show()

In [195]:
test = deque(maxlen = 3)

In [197]:
test.append("a")

1