In [2]:
from keras import backend as K
from keras.layers import Dense,Input,Conv2D,MaxPool2D,Flatten
from keras.optimizers import Adam
from keras.models import Model
import numpy as np
import pydot
import tensorflow as tf

In [3]:
def softmax(x, axis=None):
    """Compute the softmax of the scores in ``x``.

    Args:
        x: Array-like of scores (anything ``np.asarray`` accepts).
        axis: Axis along which each set of scores is normalised. ``None``
            (the default) normalises over the whole array, which is what
            this function has always done; pass e.g. ``axis=-1`` to treat
            every row of a 2-D input as its own set of scores.

    Returns:
        ``numpy.ndarray`` with the shape of ``x`` whose entries sum to 1
        along ``axis`` (or over the whole array when ``axis`` is ``None``).
    """
    scores = np.asarray(x)
    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps np.exp from overflowing on large scores.
    shifted = scores - np.max(scores, axis=axis, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=axis, keepdims=True)
# Four-element list of floats; no other visible cell reads `a`.
# NOTE(review): presumably a leftover sample input for trying out softmax()
# by hand — confirm, and delete if unused.
a=[0.9,0.03,0.03,0.03]

In [4]:
def create_model_actor(output,shape):
    """Build and compile the actor network.

    The model is a Sequential CNN: two Conv2D + MaxPool2D stages, a Flatten,
    a 256-unit ReLU dense layer and a softmax head.

    Args:
        output: Number of units in the final softmax layer.
        shape: Value passed as ``input_shape`` to the first Conv2D layer.

    Returns:
        The compiled ``tf.keras`` Sequential model (Adam with learning rate
        5e-5, categorical cross-entropy loss, accuracy metric).
    """
    layers = tf.keras.layers

    stack = [
        # 3x3 convolution over the raw input.
        layers.Conv2D(
            filters=64,
            kernel_size=(3, 3),
            strides=(1, 1),
            activation="relu",
            input_shape=shape,
        ),
        # Stride-1 pooling: each pooling stage trims one row and one column.
        layers.MaxPool2D(pool_size=(2, 2), strides=(1, 1), padding='valid'),
        # 1x1 convolution mixes channels without touching spatial extent.
        layers.Conv2D(
            filters=64,
            kernel_size=(1, 1),
            strides=(1, 1),
            activation="relu",
        ),
        layers.MaxPool2D(pool_size=(2, 2), strides=(1, 1), padding='valid'),
        layers.Flatten(),
        layers.Dense(256, activation='relu'),
        # One probability per output unit.
        layers.Dense(output, activation="softmax"),
    ]
    network = tf.keras.models.Sequential(stack)

    optimizer = tf.keras.optimizers.Adam(
        learning_rate=0.00005,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-07,
        amsgrad=False,
        name='Adam',
    )
    network.compile(
        optimizer=optimizer,
        loss="categorical_crossentropy",
        metrics=['accuracy'],
    )
    return network


In [5]:
def create_model_critic(shape):
    """Build and compile the critic network.

    The model is a Sequential CNN: two Conv2D + MaxPool2D stages, a Flatten,
    a 256-unit ReLU dense layer and a single linear output unit.

    Args:
        shape: Value passed as ``input_shape`` to the first Conv2D layer.

    Returns:
        The compiled ``tf.keras`` Sequential model (Adam with learning rate
        1e-5, mean-squared-error loss, accuracy metric).
    """
    layers = tf.keras.layers

    stack = [
        # 3x3 convolution over the raw input.
        layers.Conv2D(
            filters=64,
            kernel_size=(3, 3),
            strides=(1, 1),
            activation="relu",
            input_shape=shape,
        ),
        # Stride-1 pooling: each pooling stage trims one row and one column.
        layers.MaxPool2D(pool_size=(2, 2), strides=(1, 1), padding='valid'),
        # 1x1 convolution mixes channels without touching spatial extent.
        layers.Conv2D(
            filters=64,
            kernel_size=(1, 1),
            strides=(1, 1),
            activation="relu",
        ),
        layers.MaxPool2D(pool_size=(2, 2), strides=(1, 1), padding='valid'),
        layers.Flatten(),
        layers.Dense(256, activation='relu'),
        # Single output with no activation.
        layers.Dense(1),
    ]
    network = tf.keras.models.Sequential(stack)

    optimizer = tf.keras.optimizers.Adam(
        learning_rate=0.00001,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-07,
        amsgrad=False,
        name='Adam',
    )
    mse = tf.keras.losses.MeanSquaredError()
    # NOTE(review): 'accuracy' is tracked although the head is one linear
    # unit trained with MSE, so the metric is unlikely to be informative.
    # Kept as-is in case a callback monitors it — confirm before removing.
    network.compile(optimizer=optimizer, loss=mse, metrics=['accuracy'])
    return network


In [19]:
class a2c():
    """One-step actor-critic agent backed by two Keras models.

    The agent is driven through ``start`` (first state of an episode),
    ``step`` (every following state, with the reward just received) and
    ``end`` (final reward of the episode). Each ``step``/``end`` call runs
    one update of both networks on the previous state.

    Configuration is read from the ``info`` dict:
        gamma:       discount factor (default 0.99)
        n_actions:   number of discrete actions (default 4)
        input_shape: shape of one state as fed to the networks
                     (default ``(10, 10, 1)``)
    """

    # Maps the last action to the single action that may not follow it.
    # NOTE(review): 0<->1 and 2<->3 look like opposite movement directions
    # (e.g. no immediate reversal) — confirm against the environment.
    _BLOCKED_AFTER = {0: 1, 1: 0, 2: 3, 3: 2}

    def __init__(self,info=None):
        """Create the actor and critic networks from ``info`` (see class doc)."""
        # A fresh dict per call instead of a shared mutable default.
        info = {} if info is None else info

        self.prev_state=None
        self.last_action=None
        self.gamma=info.get("gamma",0.99)
        self.n_actions=info.get("n_actions",4)
        self.input_shape=info.get("input_shape",(10,10,1))
        self.actor=create_model_actor(self.n_actions,self.input_shape)
        self.critic=create_model_critic(self.input_shape)
        self.action_space=list(range(self.n_actions))
        self.calback=None

    def save(self,calback):
        """Register a Keras callback that is passed to every ``fit`` call.

        NOTE(review): the same callback list is handed to both the actor and
        the critic fit; if it writes to a fixed path the two models may
        overwrite each other — confirm.
        """
        self.calback=[calback]

    def choose_action(self,state):
        """Sample an action for ``state`` from the actor's policy.

        The actor's last layer is already a softmax, so its output is used
        as the sampling distribution directly. (Pushing it through a second
        softmax squeezes every probability towards ``1 / n_actions`` and
        makes the sampled behaviour differ from the policy being trained.)

        When a previous action exists, the action returned by ``Cant_Go`` is
        excluded and the remaining probabilities are renormalised. This is
        the same distribution as re-drawing until an allowed action comes
        up, but it always finishes in one draw.

        Returns:
            One element of ``self.action_space``.
        """
        probs=self._policy(state)
        if self.last_action is not None:
            probs=self._mask_blocked(probs,self.Cant_Go())
        return np.random.choice(self.action_space,p=probs)

    def _policy(self,state):
        """Return the actor's action probabilities for one state as float64."""
        raw=self.actor(np.array([state]))[0]
        probs=np.asarray(raw,dtype=np.float64).ravel()
        total=probs.sum()
        if not np.isfinite(total) or total<=0.0:
            raise ValueError("actor returned an invalid probability vector")
        # Renormalise in float64: a float32 softmax can miss summing to 1 by
        # more than np.random.choice tolerates.
        return probs/total

    @staticmethod
    def _mask_blocked(probs,blocked):
        """Zero out ``blocked`` in ``probs`` and renormalise the rest."""
        if blocked is None or probs.size<2 or not 0<=blocked<probs.size:
            return probs
        masked=probs.copy()
        masked[blocked]=0.0
        remaining=masked.sum()
        if remaining>0.0:
            return masked/remaining
        # All probability mass sat on the blocked action: fall back to a
        # uniform choice among the allowed ones.
        masked[:]=1.0/(probs.size-1)
        masked[blocked]=0.0
        return masked

    def learn(self,reward,state,terminal):
        """Run one TD(0) update of the critic and one update of the actor.

        Args:
            reward: Reward received after taking ``self.last_action`` in
                ``self.prev_state``.
            state: State reached; ignored for bootstrapping when
                ``terminal`` is true.
            terminal: Whether the episode ended with this transition.
        """
        # Add the batch axis the models expect.
        prev_state=np.asarray(self.prev_state)[np.newaxis,...]
        state=np.asarray(state)[np.newaxis,...]

        # verbose=0 keeps predict from printing a progress bar every step.
        critic_value=self.critic.predict(prev_state,verbose=0)
        if terminal:
            # No future return after a terminal transition.
            critic_value_=np.array([[0.0]])
        else:
            critic_value_=self.critic.predict(state,verbose=0)

        target=reward+self.gamma*critic_value_
        delta=target-critic_value

        # Actor target: one-hot of the taken action scaled by the TD error,
        # so the cross-entropy loss becomes -delta * log(pi(action)).
        actions=np.zeros([1,self.n_actions])
        actions[0,self.last_action]=1.0
        actions=actions*delta

        self.actor.fit(prev_state,actions,verbose=0,callbacks=self.calback)
        self.critic.fit(prev_state,target,verbose=0,callbacks=self.calback)

    def start(self,state):
        """Begin an episode: pick and remember the first action."""
        current_action=self.choose_action(state)
        self.prev_state=state
        self.last_action=current_action
        return self.last_action

    def step(self,reward,state):
        """Learn from the last transition and return the next action."""
        # The next action is drawn before the update, while last_action
        # still refers to the action that led to ``state``.
        current_action=self.choose_action(state)
        self.learn(reward,state,False)
        # NOTE(review): presumably a workaround for memory growth caused by
        # calling fit/predict every step — confirm it is still needed.
        tf.keras.backend.clear_session()
        self.prev_state=state
        self.last_action=current_action
        return self.last_action

    def end(self,reward):
        """Learn from the final transition and reset the last action."""
        self.learn(reward,self.prev_state,True)
        self.last_action=None

    def Cant_Go(self):
        """Return the action forbidden after ``self.last_action``.

        Returns ``None`` when the last action has no entry in the table
        (including when there is no last action).
        """
        return self._BLOCKED_AFTER.get(self.last_action)
    

0

0