In [1]:
import numpy as np

class CGridWorld(object):
    """Agent-centred, scrolling grid world populated with random trees.

    The state is a ``size x size`` integer array with the agent in the centre
    cell.  Tile codes: 0 = tree (moving onto it ends the episode), 1 = free
    tile, 2 = agent, 3 = temporary goal marker that is only used while
    validating freshly generated tiles and never appears in a returned state.

    Every successful move scrolls the view by one tile: the line behind the
    agent is dropped, the agent is swapped onto the tile it moved to and a
    fresh random line is added ahead of it, so the agent ends up in the centre
    again.
    """

    # Tile codes used in the grid arrays.
    TREE = 0
    FREE = 1
    AGENT = 2
    GOAL = 3

    # Action -> (row delta, column delta) of the agent's move.
    _MOVES = {'U': (-1, 0), 'D': (1, 0), 'L': (0, -1), 'R': (0, 1)}

    # Upper bound on re-sampling attempts while looking for a layout in which
    # the agent can reach the goal marker; the last sample is kept afterwards.
    MAX_ATTEMPTS = 1000

    def __init__(self, size, p):
        """Create the environment.

        :param size: side length of the observed grid (at least 3 so that the
                     centre tile has a neighbour in every direction)
        :param p: probability that a generated tile is free (clamped to [0, 1])
        :raises ValueError: if ``size`` is smaller than 3
        """
        if size < 3:
            raise ValueError("size must be at least 3, got %r" % (size,))

        self.actionSpace = {'U': 1, 'D': 2, 'L': 3, 'R': 4}
        self.possibleActions = ['U', 'D', 'L', 'R']

        self.size = size
        self.p = self._clamp_probability(p)

        # setPrefAction() both stores and returns the chosen action.
        self.prefAction = self.setPrefAction()
        # The generator works on a grid padded by a one-tile goal border.
        self.firstState = self.generate_random_map(size=self.size + 2, p=self.p)
        # Current state of the environment; advanced by step()/reset().
        self.state = self.firstState

    @staticmethod
    def _clamp_probability(p):
        """Return ``p`` limited to the closed interval [0, 1]."""
        return min(1.0, max(0.0, p))

    @staticmethod
    def _agent_position(grid):
        """Return ``(row, col)`` of the first agent tile (value 2) in ``grid``.

        :raises ValueError: if the grid contains no agent tile
        """
        hits = np.argwhere(np.asarray(grid) == 2)
        if len(hits) == 0:
            raise ValueError("state does not contain an agent tile (2)")
        return int(hits[0][0]), int(hits[0][1])

    def _attempt_budget(self, p):
        """Number of layouts to sample before accepting one unvalidated.

        With ``p <= 0`` every generated tile is a tree, so re-sampling cannot
        help and a single attempt is used.
        """
        return 1 if p <= 0 else self.MAX_ATTEMPTS

    def onTree(self, state, action, row=None, col=None):
        """Tell whether the tile the agent would move onto is a tree.

        For a known action the target tile is located relative to the agent's
        position in ``state``, so this works for the full state as well as for
        the trimmed array used inside ``stateUpdate`` and for any grid size.

        :param state: 2-D grid containing the agent tile
        :param action: one of 'U', 'D', 'L', 'R'
        :param row: explicit row to inspect; only used for an unknown action
        :param col: explicit column to inspect; only used for an unknown action
        :return: True if the inspected tile is a tree (0), otherwise False
        :raises ValueError: if the action is unknown and no explicit tile is
                            given, or if the move would leave the grid
        """
        grid = np.asarray(state)

        if action in self._MOVES:
            agent_row, agent_col = self._agent_position(grid)
            d_row, d_col = self._MOVES[action]
            row, col = agent_row + d_row, agent_col + d_col
            if not (0 <= row < grid.shape[0] and 0 <= col < grid.shape[1]):
                raise ValueError("action %r moves the agent off the grid" % (action,))
        elif row is None or col is None:
            raise ValueError("unknown action: %r" % (action,))

        return bool(grid[row][col] == self.TREE)

    def setPrefAction(self, prefAction=None):
        """Store the preferred action, drawing a random one when none is given.

        :return: the action that was stored
        """
        if prefAction is None:
            prefAction = np.random.choice(['U', 'D', 'L', 'R'])
        self.prefAction = prefAction
        return prefAction

    def getPrefAction(self):
        """Return the currently preferred action."""
        return self.prefAction

    def is_valid(self, res, rsize, csize, start=(0, 0)):
        """Depth-first search for a goal tile (3) reachable from ``start``.

        Tree tiles (0) block the search; every other tile can be walked.  The
        start tile itself is not inspected, only its neighbours are.

        :param res: 2-D grid to search
        :param rsize: number of rows to consider (capped at the grid's rows)
        :param csize: number of columns to consider (capped at the grid's columns)
        :param start: ``(row, col)`` the search starts from
        :return: True as soon as a goal tile is found next to a visited tile
        """
        # Never trust the caller's sizes beyond what the grid really holds.
        rsize = min(rsize, len(res))
        csize = min(csize, len(res[0])) if rsize else 0

        frontier, discovered = [tuple(start)], set()
        while frontier:
            r, c = frontier.pop()
            if (r, c) in discovered:
                continue
            discovered.add((r, c))

            for d_r, d_c in ((1, 0), (0, 1), (-1, 0), (0, -1)):
                r_new, c_new = r + d_r, c + d_c
                if not (0 <= r_new < rsize and 0 <= c_new < csize):
                    continue
                tile = res[r_new][c_new]
                if tile == 3:
                    return True
                if tile != 0:
                    frontier.append((r_new, c_new))
        return False

    def generate_random_map(self, size, p):
        """Generate a random start map with the agent in the centre.

        A ``size x size`` grid is sampled, its outer border is marked as goal
        and the layout is re-sampled until the agent can reach that border
        (at most ``MAX_ATTEMPTS`` times; the last sample is kept after that).
        The border is then stripped again.

        :param size: side length of the padded grid, i.e. observed size + 2
        :param p: probability that a tile is free (clamped to [0, 1])
        :return: ``(size - 2) x (size - 2)`` array without the goal border
        """
        p = self._clamp_probability(p)
        # Kept from the original: generating a map also updates self.p.
        self.p = p
        center = size // 2

        for _ in range(self._attempt_budget(p)):
            res = np.random.choice([self.FREE, self.TREE], (size, size), p=[p, 1 - p])
            res[center][center] = self.AGENT

            # The whole border is the goal for the reachability check.
            res[0, :] = self.GOAL
            res[-1, :] = self.GOAL
            res[:, 0] = self.GOAL
            res[:, -1] = self.GOAL

            if self.is_valid(res, size, size, start=(center, center)):
                break

        # Remove the goal border.
        return res[1:-1, 1:-1].copy()

    def setState(self, state=None):
        """Store ``state`` as the current state and return the current state.

        Called without an argument it leaves the state untouched and simply
        returns it.
        """
        if state is not None:
            self.state = state
        return self.state

    def stateUpdate(self, state, size, action, p=None):
        """Return the state that follows from taking ``action`` in ``state``.

        :param state: current 2-D grid containing the agent tile
        :param size: unused; kept for backward compatibility because all
                     dimensions are taken from ``state`` itself
        :param action: one of 'U', 'D', 'L', 'R'
        :param p: probability that a new tile is free; defaults to ``self.p``
        :return: the scrolled grid, or None (after printing a notice) when the
                 move would put the agent on a tree
        :raises ValueError: if ``action`` is unknown
        """
        if action not in self._MOVES:
            raise ValueError("unknown action: %r" % (action,))

        p = self._clamp_probability(self.p if p is None else p)
        state = np.asarray(state)

        if self.onTree(state, action):
            print("onTree = true")
            return None

        d_row, d_col = self._MOVES[action]
        axis = 0 if d_col == 0 else 1
        # 'U' and 'L' move towards index 0, so the new line is prepended and
        # the line at the far end (index -1) scrolls out of view.
        toward_origin = (d_row + d_col) < 0
        shifted = np.delete(state, -1 if toward_origin else 0, axis)

        # Swap the agent with the tile it moves onto.
        a_row, a_col = self._agent_position(shifted)
        t_row, t_col = a_row + d_row, a_col + d_col
        shifted[a_row, a_col], shifted[t_row, t_col] = \
            shifted[t_row, t_col], shifted[a_row, a_col]

        if axis == 0:
            line_shape = (1, shifted.shape[1])
        else:
            line_shape = (shifted.shape[0], 1)
        goal_line = np.full(line_shape, self.GOAL, dtype=shifted.dtype)

        for _ in range(self._attempt_budget(p)):
            fresh = np.random.choice([self.FREE, self.TREE], line_shape, p=[p, 1 - p])
            fresh = fresh.astype(shifted.dtype)

            if toward_origin:
                newState = np.concatenate((fresh, shifted), axis=axis)
                probe = np.concatenate((goal_line, newState), axis=axis)
            else:
                newState = np.concatenate((shifted, fresh), axis=axis)
                probe = np.concatenate((newState, goal_line), axis=axis)

            # The new line is accepted once the agent can walk through it to
            # the goal line placed just beyond it.
            if self.is_valid(probe, probe.shape[0], probe.shape[1],
                             start=self._agent_position(probe)):
                break

        return newState

    def step(self, action):
        """Apply ``action`` to the current state.

        :return: ``(next_state, reward, done, info)``; when the agent hits a
                 tree the result is ``(None, -50, True, None)`` and the stored
                 state is left unchanged
        """
        current = self.state
        done = self.onTree(current, action)

        if done:
            return None, -50, done, None

        resultingState = self.stateUpdate(current, size=self.size,
                                          action=action, p=self.p)

        reward = 15
        # NOTE(review): the original added +1 both when the action matched the
        # preferred action and when it did not, so the preference has no
        # effect on the reward.  Kept as is; confirm the intended values.
        reward += 1

        self.setState(resultingState)

        return resultingState, reward, done, None

    def reset(self):
        """Draw a new preferred action and a new start map; return the map."""
        self.setPrefAction()
        return self.setState(self.generate_random_map(self.size + 2, self.p))

    def render(self):
        """Print the current state: '-' free tile, 'X' agent, 'o' tree."""
        print('------------------------------------------')
        for row in self.state:
            for col in row:
                if col == 1:
                    print('-', end='\t')
                elif col == 2:
                    print('X', end='\t')
                elif col == 0:
                    print('o', end='\t')
            print('\n')
        print('------------------------------------------')
        
        

In [2]:
from keras.layers import Dense,Activation, Input, concatenate, Flatten
from keras.models import Model, load_model
from keras.optimizers import Adam
import keras.backend as K
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

class Agent(object):
    """REINFORCE-style policy-gradient agent backed by a small Keras network.

    Transitions of one episode are collected with ``store_transition`` and
    consumed by ``learn``, which trains the policy on the standardised
    discounted returns of that episode.
    """

    def __init__(self, ALPHA, GAMMA=0.99,n_actions=4,
                 layer1_size=16, layer2_size=16, input_dims=128,
                 fname='reinforce.h5'):
        """
        :param ALPHA: learning rate passed to the Adam optimizer
        :param GAMMA: discount factor used for the returns
        :param n_actions: number of discrete actions (size of the softmax)
        :param layer1_size: units in the first hidden layer
        :param layer2_size: units in the second hidden layer
        :param input_dims: side length of the square 2-D observation
        :param fname: file used by ``save_model`` / ``load_model``
        """
        self.gamma = GAMMA
        self.lr = ALPHA
        self.G = 0
        self.input_dims = input_dims
        self.fc1_dims = layer1_size
        self.fc2_dims = layer2_size
        self.n_actions = n_actions

        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []

        self.policy, self.predict = self.build_policy_network()
        # Action labels 1..n_actions; label k belongs to softmax output k-1.
        self.action_space = list(range(1, n_actions + 1))
        self.model_file = fname

    def build_policy_network(self):
        """Build the trainable policy model and the inference-only model.

        Both models share the same layers.  The trainable one takes the
        advantages as a second input so that the custom loss can weight the
        log-likelihood of the taken actions with them.

        :return: ``(policy, predict)``
        """
        env2d = Input(shape=(self.input_dims,self.input_dims))
        env = Flatten()(env2d)

        advantages = Input(shape=[1])

        dense1 = Dense(self.fc1_dims, activation='relu')(env)
        dense2 = Dense(self.fc2_dims, activation='relu')(dense1)
        probs = Dense(self.n_actions, activation='softmax')(dense2)

        def custom_loss(y_true, y_pred):
            # Clip to keep log() away from 0 and 1.
            out = K.clip(y_pred, 1e-8,  1-1e-8)
            log_lik = y_true*K.log(out)

            return K.sum(-log_lik*advantages)

        # Keras 2 functional API keywords (the Keras 1 spellings
        # ``input=``/``output=`` are only accepted as deprecated aliases).
        policy = Model(inputs=[env2d, advantages], outputs=[probs])
        policy.compile(optimizer=Adam(lr=self.lr), loss=custom_loss)

        predict = Model(inputs=[env2d], outputs=[probs])

        return policy, predict

    def choose_action(self, observation):
        """Sample an action label from the policy's output for ``observation``."""
        state = observation[np.newaxis, :]
        probabilities = self.predict.predict(state)[0]
        action = np.random.choice(self.action_space, p=probabilities)

        return action

    def store_transition(self, observation, action, reward):
        """Remember one ``(observation, action, reward)`` triple of the episode."""
        self.action_memory.append(action)
        self.state_memory.append(observation)
        self.reward_memory.append(reward)

    def learn(self):
        """Train the policy on the stored episode and clear the memory.

        :return: the value returned by ``train_on_batch``, or None when no
                 transition has been stored
        """
        if not self.reward_memory:
            return None

        state_memory = np.array(self.state_memory)
        # Float dtype so that the discounted returns are not truncated.
        reward_memory = np.array(self.reward_memory, dtype=np.float64)

        # One-hot targets with a fixed width of n_actions: the column of an
        # action is its position in action_space, which is also the softmax
        # output choose_action() sampled it from.  This stays correct when an
        # episode does not contain every action.
        column_of = {label: idx for idx, label in enumerate(self.action_space)}
        actions = np.zeros((len(self.action_memory), self.n_actions))
        for row, label in enumerate(self.action_memory):
            actions[row, column_of[label]] = 1.0

        # Discounted return G[t] = r[t] + gamma * G[t + 1], computed backwards.
        G = np.zeros_like(reward_memory)
        running = 0.0
        for t in reversed(range(len(reward_memory))):
            running = reward_memory[t] + self.gamma * running
            G[t] = running

        mean = np.mean(G)
        std = np.std(G) if np.std(G) > 0 else 1
        self.G = (G-mean)/std

        cost = self.policy.train_on_batch([state_memory, self.G], actions)

        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []

        return cost

    def save_model(self):
        """Save the trainable policy model to ``self.model_file``."""
        self.policy.save(self.model_file)

    def load_model(self):
        """Replace the trainable policy model with the one stored on disk.

        NOTE(review): only ``self.policy`` is rebound here; ``self.predict``
        (used by ``choose_action``) still refers to the network built in
        ``__init__``.  Confirm whether it should be rebuilt from the loaded
        model, and whether loading needs the custom loss to be supplied.
        """
        self.policy = load_model(self.model_file)

Using TensorFlow backend.


In [None]:

import matplotlib.pyplot as plt
import numpy as np


if __name__ == '__main__':
    # Side length of the observed grid; the policy network's input has to
    # match it, so both are driven by the same constant.
    GRID_SIZE = 3
    # Action labels produced by the agent -> action names the environment uses.
    ACTION_NAMES = {1: 'U', 2: 'D', 3: 'L', 4: 'R'}

    env = CGridWorld(size=GRID_SIZE, p=0.5)

    agent = Agent(ALPHA=0.0005, input_dims=GRID_SIZE, GAMMA=0.99, n_actions=4,
                  layer1_size=64, layer2_size=64)

    score_history = []
    prefActionEp = []

    n_episodes = 20

    for i in range(n_episodes):
        done = False
        score = 0
        steps = 0
        observation = env.reset()
        # Record the preferred action itself (call the getter), not the
        # bound method object.
        prefActionEp.append(env.getPrefAction())

        while not done:
            action = agent.choose_action(observation)
            strAction = ACTION_NAMES[action]

            observation_, reward, done, info = env.step(strAction)
            # The transition is stored with the observation the action was
            # chosen from; the terminal step returns no next observation.
            agent.store_transition(observation, action, reward)
            observation = observation_
            score += reward
            steps += 1

        score_history.append(score)

        agent.learn()

        # steps - 1 excludes the final, terminating step from the count.
        print('episode: ', i, 'prefered Action:', env.getPrefAction(),'score %.1f:' % score,'steps %.1f:' % (steps-1),
              'average_score %.1f:' % np.mean(score_history[-100:]))

    plt.plot(score_history)
    plt.xlabel('episode')
    plt.ylabel('score')
    plt.show()