## Prediction using neural networks.

In this demo, let's learn how to estimate value functions using function approximation, specifically neural networks.

In [1]:
import numpy as np

class MLP():
    '''Multi-layer perceptron with a single hidden layer.

    Both layers carry a bias, implemented by appending a constant 1.0 to the
    layer input, and both use the same activation ("sigmoid" or "tanh").
    Gradients are those of the squared error 0.5 * (target - y)**2.
    '''

    def __init__(self, tot_inputs, tot_hidden, tot_outputs, activation="sigmoid"):
        '''Create the network and draw its initial weights.

        @param tot_inputs number of input features (bias excluded)
        @param tot_hidden number of hidden units (bias excluded)
        @param tot_outputs number of output units
        @param activation "sigmoid" or "tanh", used for both layers
        @raise ValueError if the activation name is not supported
        '''
        if activation not in ("sigmoid", "tanh"):
            raise ValueError("[ERROR] The activation function " 
                             + str(activation) + " does not exist!")
        self.activation = activation
        self.tot_inputs = tot_inputs
        self.tot_outputs = tot_outputs
        # The extra row of each matrix holds the bias weights.
        # W1 is drawn before W2: the order matters for seeded runs.
        self.W1 = np.random.normal(0.0, 0.1, (tot_inputs+1, tot_hidden))
        self.W2 = np.random.normal(0.0, 0.1, (tot_hidden+1, tot_outputs))

    def _sigmoid(self, z):
        '''Logistic function 1 / (1 + exp(-z)), element-wise.'''
        return 1.0 / (1.0 + np.exp(-z))

    def _tanh(self, z):
        '''Hyperbolic tangent, element-wise.'''
        return np.tanh(z)

    def _activate(self, z):
        '''Apply the activation selected at construction time.'''
        if self.activation == "sigmoid":
            return self._sigmoid(z)
        if self.activation == "tanh":
            return self._tanh(z)

    def forward(self, x, verbose=False):
        '''Run a forward pass and cache the intermediate values.

        The values x, z1, h, z2 and y are stored on the instance because
        backward() reads them.

        @param x 1-D array with tot_inputs elements
        @param verbose if True print the intermediate values and the weights
        @return the network output, an array with tot_outputs elements
        @raise ValueError if x does not have tot_inputs elements
        '''
        if x.shape[0] != self.tot_inputs:
            raise ValueError("[ERROR] The size of x is wrong!")
        bias = np.array([1.0])
        self.x = np.hstack([x, bias])
        self.z1 = np.dot(self.x, self.W1)
        self.h = np.hstack([self._activate(self.z1), bias])
        self.z2 = np.dot(self.h, self.W2)
        self.y = self._activate(self.z2)
        if verbose:
            dump = (("z1", self.z1), ("h", self.h), ("z2", self.z2),
                    ("y", self.y), ("W1", self.W1), ("W2", self.W2))
            for label, value in dump:
                print(label + ": " + str(value))
        return self.y

    def _sigmoid_derivative(self, z):
        '''Derivative of the logistic function evaluated at z.'''
        s = self._sigmoid(z)
        return s * (1.0 - s)

    def _tanh_derivative(self, z):
        '''Derivative of tanh evaluated at z.'''
        return 1 - np.square(np.tanh(z))

    def _activate_derivative(self, z):
        '''Derivative of the selected activation evaluated at z.'''
        if self.activation == "sigmoid":
            return self._sigmoid_derivative(z)
        if self.activation == "tanh":
            return self._tanh_derivative(z)

    def backward(self, y, target, verbose=False):
        '''Compute the gradients of the squared error w.r.t. W1 and W2.

        Uses the values cached by the most recent forward() call.

        @param y the output returned by forward()
        @param target the desired output, same shape as y
        @param verbose if True print both gradients
        @return (dE_dW1, dE_dW2), shaped like W1 and W2 respectively
        @raise ValueError if y and target have different shapes
        '''
        if y.shape != target.shape:
            raise ValueError("[ERROR] The size of target is wrong!")

        # Error signal at the output pre-activation, shape [tot_outputs].
        delta_out = -(target - y) * self._activate_derivative(self.z2)

        # Outer product delta_out x h, transposed to the layout of W2.
        dE_dW2 = np.dot(delta_out[:, np.newaxis], self.h[np.newaxis, :]).T

        # Propagate through W2; the last entry belongs to the hidden bias,
        # which has no incoming weights, so it is dropped.
        delta_hidden = np.dot(delta_out, self.W2.T)[0:-1]
        delta_hidden = delta_hidden * self._activate_derivative(self.z1)
        dE_dW1 = np.dot(delta_hidden[:, np.newaxis], self.x[np.newaxis, :]).T

        if verbose:
            print("dE_dW1: " + str(dE_dW1))
            print("dE_dW2: " + str(dE_dW2))
        return dE_dW1, dE_dW2

    def train(self, x, target, learning_rate=0.1):
        '''Take one gradient-descent step on a single (x, target) pair.

        @param x input array with tot_inputs elements
        @param target desired output, same shape as the network output
        @param learning_rate step size of the update
        @return the squared error 0.5 * (target - y)**2 measured before
                the weights were changed
        '''
        y = self.forward(x)
        grad_W1, grad_W2 = self.backward(y, target)
        self.W2 = self.W2 - (learning_rate * grad_W2)
        self.W1 = self.W1 - (learning_rate * grad_W1)
        return 0.5 * (target - y)**2


In [2]:
import numpy as np


class GridWorld:
    '''Rectangular grid world with a stochastic action model.

    A robot occupies one cell of a tot_row x tot_col grid. Each call to
    step() takes a commanded action (0=UP, 1=RIGHT, 2=DOWN, 3=LEFT), samples
    the action actually executed from the row of the transition matrix that
    belongs to the command, and moves the robot unless the target cell is
    outside the grid or is an obstacle.
    '''

    # (row, col) displacement for each action: UP, RIGHT, DOWN, LEFT.
    _MOVES = ((-1, 0), (0, 1), (1, 0), (0, -1))

    def __init__(self, tot_row, tot_col):
        '''Create an empty world with the robot in a random cell.

        The default transition matrix is uniform, the reward and state
        matrices are all zeros and no index matrix is set.

        @param tot_row number of rows of the grid
        @param tot_col number of columns of the grid
        '''
        self.action_space_size = 4
        self.world_row = tot_row
        self.world_col = tot_col
        self.transition_matrix = np.ones((self.action_space_size, self.action_space_size)) / self.action_space_size
        self.reward_matrix = np.zeros((tot_row, tot_col))
        self.state_matrix = np.zeros((tot_row, tot_col))
        self.index_matrix = None
        self.position = [np.random.randint(tot_row), np.random.randint(tot_col)]

    def setTransitionMatrix(self, transition_matrix):
        '''Set the transition matrix.

        The matrix has one row per commanded action; the elements of a row
        are the probabilities of actually executing each action when that
        command is given.

        @raise ValueError if the shape differs from the current matrix
        '''
        if(transition_matrix.shape != self.transition_matrix.shape):
            raise ValueError('The shape of the two matrices must be the same.')
        self.transition_matrix = transition_matrix

    def setRewardMatrix(self, reward_matrix):
        '''Set the reward returned when the robot ends a step in each cell.

        @raise ValueError if the shape differs from the shape of the world
        '''
        if(reward_matrix.shape != self.reward_matrix.shape):
            raise ValueError('The shape of the matrix does not match with the shape of the world.')
        self.reward_matrix = reward_matrix

    def setStateMatrix(self, state_matrix):
        '''Set the obstacles and the terminal states of the world.

        The input is a matrix with the same size of the world:
        -1 for states which are not walkable,
        +1 for terminal states,
         0 for ordinary states.
        step() reports done=True for every non-zero cell the robot stands on.

        @raise ValueError if the shape differs from the shape of the world
        '''
        if(state_matrix.shape != self.state_matrix.shape):
            raise ValueError('The shape of the matrix does not match with the shape of the world.')
        self.state_matrix = state_matrix

    def setIndexMatrix(self, index_matrix):
        '''Set the indices reported as observation for each cell.

        reset() and step() read index_matrix[row, col, 0] and
        index_matrix[row, col, 1], so the matrix needs a third axis with at
        least two entries. The shape is not validated here.
        '''
        self.index_matrix = index_matrix

    def setPosition(self, index_row=None, index_col=None):
        '''Set the position of the robot in a specific state.

        If either coordinate is missing the robot is placed in a uniformly
        random cell of the grid.
        '''
        if(index_row is None or index_col is None):
            # Use the stored world size: the constructor arguments are not
            # in scope inside this method.
            self.position = [np.random.randint(self.world_row), np.random.randint(self.world_col)]
        else:
            self.position = [index_row, index_col]

    def render(self):
        '''Print the current world in the terminal.

        O represents the robot position
        - represents empty states
        # represents obstacles
        * represents terminal states
        '''
        lines = []
        for row in range(self.world_row):
            cells = []
            for col in range(self.world_col):
                if(self.position == [row, col]):
                    cells.append(u" \u25CB ")
                    continue
                value = self.state_matrix[row, col]
                if(value == 0): cells.append(' - ')
                elif(value == -1): cells.append(' # ')
                elif(value == +1): cells.append(' * ')
            lines.append("".join(cells) + '\n')
        print("".join(lines))

    def _observation(self):
        '''Return the observation for the current position.

        This is the pair stored in the index matrix when one is set,
        otherwise the [row, col] position itself.
        '''
        if self.index_matrix is None:
            return self.position
        row, col = self.position
        return [self.index_matrix[row, col, 0], self.index_matrix[row, col, 1]]

    def reset(self, exploring_starts=False):
        '''Start a new episode and return the first observation.

        @param exploring_starts if True the robot is placed in a random cell
               whose state is 0 (neither obstacle nor terminal); otherwise it
               is placed in the bottom left corner
        @return the observation of the starting cell
        '''
        if exploring_starts:
            # Rejection sampling; this loops forever if no cell has state 0.
            while(True):
                row = np.random.randint(0, self.world_row)
                col = np.random.randint(0, self.world_col)
                if(self.state_matrix[row, col] == 0): break
            self.position = [row, col]
        else:
            self.position = [self.world_row-1, 0]
        return self._observation()

    def step(self, action):
        '''One step in the world.

        [observation, reward, done = env.step(action)]
        The robot moves one step in the world based on the action given.
        The action can be 0=UP, 1=RIGHT, 2=DOWN, 3=LEFT
        @return observation the position of the robot after the step
        @return reward the reward associated with the next state
        @return done True if the state is terminal
        @raise ValueError if the action is outside the action space
        '''
        # Negative values are rejected too: they would otherwise index the
        # transition matrix from the end without any error.
        if(action < 0 or action >= self.action_space_size):
            raise ValueError('The action is not included in the action space.')

        # Sample the action actually executed from the transition model.
        executed = int(np.random.choice(self.action_space_size, 1,
                                        p=self.transition_matrix[int(action), :])[0])

        d_row, d_col = self._MOVES[executed]
        new_row = self.position[0] + d_row
        new_col = self.position[1] + d_col

        # The move is applied only if it stays inside the grid and does not
        # enter an obstacle; otherwise the robot stays where it is.
        inside = (0 <= new_row < self.world_row) and (0 <= new_col < self.world_col)
        if inside and self.state_matrix[new_row, new_col] != -1:
            self.position = [new_row, new_col]

        reward = self.reward_matrix[self.position[0], self.position[1]]
        # Done is True if the state is a terminal state
        done = bool(self.state_matrix[self.position[0], self.position[1]])
        return self._observation(), reward, done



In [3]:
def init_env():
    '''Build the 5x5 grid world used in this demo.

    The four corners are terminal states: the top-left and bottom-right
    corners give reward +1, the other two give -1; every other cell gives 0.
    A commanded action is executed with probability 0.8 and replaced by one
    of the two adjacent actions with probability 0.1 each.

    @return the configured GridWorld
    '''
    size = 5
    last = size - 1
    corner_rewards = {(0, 0): 1.0, (0, last): -1.0,
                      (last, 0): -1.0, (last, last): 1.0}
    cells = [[(r, c) for c in range(size)] for r in range(size)]

    # 1.0 marks a terminal state, 0.0 an ordinary one.
    terminal_mask = np.array([[1.0 if cell in corner_rewards else 0.0
                               for cell in line] for line in cells])
    rewards = np.array([[corner_rewards.get(cell, 0.0)
                         for cell in line] for line in cells])
    # Observation reported for each cell: the row index is flipped, so the
    # bottom row of the grid is reported as row 0.
    flipped_indices = np.array([[(last - r, c) for (r, c) in line]
                                for line in cells])
    # Rows are the commanded action, columns the executed one
    # (order: UP, RIGHT, DOWN, LEFT).
    slip_model = np.array([
        [0.8, 0.1, 0.0, 0.1],
        [0.1, 0.8, 0.1, 0.0],
        [0.0, 0.1, 0.8, 0.1],
        [0.1, 0.0, 0.1, 0.8],
    ])

    world = GridWorld(size, size)
    world.setStateMatrix(terminal_mask)
    world.setIndexMatrix(flipped_indices)
    world.setRewardMatrix(rewards)
    world.setTransitionMatrix(slip_model)
    return world

In [4]:

def update(my_mlp, new_observation, reward, learning_rate, gamma, done):
    '''Train the network for one step and return it with the training error.

    The input of the training step is new_observation. The target is the
    reward alone when the episode is over, otherwise the reward plus gamma
    times the network's current output for new_observation.

    @param my_mlp network exposing forward(x) and train(x, target, learning_rate)
    @param new_observation the observation returned by the environment step
    @param reward the reward returned by the environment step
    @param learning_rate step size forwarded to my_mlp.train
    @param gamma discount factor applied to the bootstrapped value
    @param done True if the episode ended with this step
    @return (my_mlp, error) where error is the value returned by my_mlp.train
    '''
    # NOTE(review): the state being regressed and the state used to bootstrap
    # are the same observation; textbook TD(0) would regress the value of the
    # previous observation instead. Confirm this is intended.
    features = np.array(new_observation, dtype=np.float32)
    if done:
        # No successor state: the target is the reward alone.
        td_target = np.array([reward], dtype=np.float32)
    else:
        bootstrap = my_mlp.forward(features)
        td_target = np.array((reward + (gamma * bootstrap)), dtype=np.float32)
    td_error = my_mlp.train(features, td_target, learning_rate)
    return my_mlp, td_error

In [5]:
def print_utility(my_mlp, tot_rows, tot_cols, decimal=2, flip=True):
    '''Print on terminal the utility matrix of a discrete state space
       having states defined by tuples: (0,0); (0,1); (0,2) ...

    The network is evaluated on every (row, col) pair and the single value
    it returns is stored in a tot_rows x tot_cols matrix.

    @param my_mlp network exposing forward(x); it must return one value
    @param tot_rows number of rows of the state space
    @param tot_cols number of columns of the state space
    @param decimal number of decimals used when printing
    @param flip if True the matrix is printed upside down, so row 0 is
           the bottom line of the output
    @raise ValueError if the network returns more than one value
    '''
    value_fn = np.zeros((tot_rows, tot_cols))
    for row in range(tot_rows):
        for col in range(tot_cols):
            x = np.array([row, col], dtype=np.float32)
            # forward() returns a one-element array; extract the scalar
            # explicitly instead of relying on the implicit array-to-scalar
            # conversion, which recent NumPy versions deprecate.
            value_fn[row, col] = np.asarray(my_mlp.forward(x)).item()
    # The precision is changed only while printing, so the print options
    # the caller had configured are left untouched.
    with np.printoptions(precision=decimal):
        if flip:
            print(np.flipud(value_fn))
        else:
            print(value_fn)

In [6]:
# Environment first, network second: both draw from NumPy's global random
# generator, so the order matters for seeded runs.
env = init_env()
my_mlp = MLP(tot_inputs=2, tot_hidden=2, tot_outputs=1, activation="tanh")
learning_rate, gamma = 0.1, 0.9
tot_epoch, print_epoch = 10001, 100

for epoch in range(tot_epoch):
    # Every episode starts from a random non-terminal cell.
    observation = env.reset(exploring_starts=True)
    # Random policy, at most 1000 steps per episode.
    for step in range(1000):
        action = np.random.randint(0, 4)
        # Move in the world, then train the network on the outcome.
        new_observation, reward, done = env.step(action)
        my_mlp, error = update(my_mlp, new_observation, reward,
                               learning_rate, gamma, done)
        observation = new_observation
        if done:
            break
    # Report every print_epoch episodes, skipping the very first one.
    if epoch == 0 or epoch % print_epoch != 0:
        continue
    # The label is epoch+1; "Tot steps" is the zero-based index of the last
    # step and "Error" is the error of the last training step of the episode.
    print("")
    print("Epoch: " + str(epoch+1))
    print("Tot steps: " + str(step))
    print("Error: " + str(error))

    print_utility(my_mlp, tot_rows=5, tot_cols=5)

print(".....Finished all episodes......\n \n")
print("Final value function")
print_utility(my_mlp, tot_rows=5, tot_cols=5)



Epoch: 101
Tot steps: 3
Error: [0.48333517]
[[ 0.24  0.22  0.19  0.14  0.11]
 [ 0.19  0.19  0.16  0.12  0.09]
 [ 0.07  0.12  0.12  0.1   0.07]
 [-0.16 -0.01  0.05  0.06  0.05]
 [-0.43 -0.24 -0.08 -0.    0.02]]

Epoch: 201
Tot steps: 4
Error: [0.97673731]
[[ 0.62  0.35  0.15  0.08  0.07]
 [ 0.52  0.29  0.13  0.08  0.07]
 [ 0.25  0.2   0.1   0.07  0.07]
 [-0.31 -0.02  0.05  0.06  0.07]
 [-0.76 -0.44 -0.07  0.04  0.06]]

Epoch: 301
Tot steps: 3
Error: [0.24097727]
[[ 0.79  0.62  0.5   0.47  0.47]
 [ 0.68  0.57  0.49  0.47  0.47]
 [ 0.35  0.49  0.47  0.47  0.47]
 [-0.34  0.29  0.44  0.46  0.47]
 [-0.8  -0.14  0.37  0.45  0.47]]

Epoch: 401
Tot steps: 2
Error: [1.10351996]
[[ 0.75  0.29 -0.11 -0.18 -0.19]
 [ 0.58  0.14 -0.14 -0.18 -0.19]
 [ 0.17 -0.02 -0.16 -0.18 -0.19]
 [-0.49 -0.22 -0.19 -0.19 -0.19]
 [-0.85 -0.5  -0.23 -0.19 -0.19]]

Epoch: 501
Tot steps: 8
Error: [0.18568809]
[[ 0.68 -0.09 -0.47 -0.51 -0.51]
 [ 0.48 -0.24 -0.49 -0.51 -0.51]
 [ 0.01 -0.38 -0.5  -0.51 -0.51]
 [-0.61 -0.5


Epoch: 4101
Tot steps: 0
Error: [0.00483003]
[[ 0.71 -0.09 -0.15 -0.15 -0.15]
 [ 0.34 -0.13 -0.15 -0.15 -0.15]
 [-0.25 -0.17 -0.15 -0.15 -0.15]
 [-0.71 -0.21 -0.16 -0.15 -0.15]
 [-0.9  -0.26 -0.16 -0.15 -0.15]]

Epoch: 4201
Tot steps: 18
Error: [0.68644234]
[[ 0.82 -0.07 -0.16 -0.17 -0.17]
 [ 0.54 -0.13 -0.17 -0.17 -0.17]
 [-0.05 -0.17 -0.17 -0.17 -0.17]
 [-0.65 -0.21 -0.17 -0.17 -0.17]
 [-0.9  -0.27 -0.17 -0.17 -0.17]]

Epoch: 4301
Tot steps: 6
Error: [0.00839616]
[[ 0.89  0.22  0.11  0.11  0.11]
 [ 0.69  0.16  0.11  0.11  0.11]
 [ 0.15  0.11  0.11  0.11  0.11]
 [-0.55  0.06  0.11  0.11  0.11]
 [-0.87 -0.01  0.11  0.11  0.11]]

Epoch: 4401
Tot steps: 23
Error: [0.00648147]
[[ 0.84 -0.01 -0.11 -0.11 -0.11]
 [ 0.59 -0.07 -0.11 -0.11 -0.11]
 [ 0.02 -0.11 -0.11 -0.11 -0.11]
 [-0.61 -0.15 -0.11 -0.11 -0.11]
 [-0.89 -0.21 -0.11 -0.11 -0.11]]

Epoch: 4501
Tot steps: 24
Error: [0.36779417]
[[ 0.88  0.43  0.38  0.38  0.38]
 [ 0.69  0.39  0.38  0.38  0.38]
 [ 0.19  0.36  0.38  0.38  0.38]
 [-0


Epoch: 8101
Tot steps: 19
Error: [0.92593948]
[[ 0.87  0.06 -0.02 -0.02 -0.02]
 [ 0.66  0.01 -0.02 -0.02 -0.02]
 [ 0.17 -0.02 -0.02 -0.02 -0.02]
 [-0.48 -0.04 -0.02 -0.02 -0.02]
 [-0.88 -0.08 -0.02 -0.02 -0.02]]

Epoch: 8201
Tot steps: 31
Error: [0.0060533]
[[ 0.9   0.08 -0.01 -0.01 -0.01]
 [ 0.69  0.02 -0.01 -0.01 -0.01]
 [ 0.16 -0.01 -0.01 -0.01 -0.01]
 [-0.53 -0.03 -0.01 -0.01 -0.01]
 [-0.89 -0.07 -0.01 -0.01 -0.01]]

Epoch: 8301
Tot steps: 1
Error: [0.00741559]
[[ 0.89  0.14  0.07  0.07  0.07]
 [ 0.68  0.1   0.07  0.07  0.07]
 [ 0.18  0.07  0.07  0.07  0.07]
 [-0.49  0.05  0.07  0.07  0.07]
 [-0.88  0.    0.06  0.07  0.07]]

Epoch: 8401
Tot steps: 10
Error: [0.32732996]
[[ 0.69 -0.35 -0.4  -0.4  -0.4 ]
 [ 0.25 -0.38 -0.4  -0.4  -0.4 ]
 [-0.32 -0.4  -0.4  -0.4  -0.4 ]
 [-0.76 -0.42 -0.4  -0.4  -0.4 ]
 [-0.94 -0.45 -0.4  -0.4  -0.4 ]]

Epoch: 8501
Tot steps: 12
Error: [0.52067424]
[[ 0.8  -0.21 -0.28 -0.28 -0.28]
 [ 0.45 -0.25 -0.28 -0.28 -0.28]
 [-0.12 -0.27 -0.28 -0.28 -0.28]
 [-0