# Environment Set Up

In [83]:
from collections import defaultdict
import numpy as np
import random

In [84]:
maze = np.zeros((4,4)) # represents a gridworld in which each state, apart from the terminal state, has a value of 0. 
terminal_state = [(0,0), (3,3)]

## Concept Check

What is value? The sum of all future rewards that can be recieved from a given state s. These values are "long lasting. Tells us the long term effects of being in a certain state



## Iterative Policy Evaluation for Gridworld 

What is iterative policy evaluation? Finding the true value function for each state with full knowledge of the environment

In [85]:
# calculate the value of each state given the policy
def policy_eval(maze_values): 
  
    
    new_values = maze_values.copy()
    
    for row in range(len(maze_values[0])): 
        for col in range(len(maze_values[1])):
            reward = -1 # -1 for up, down, left, right
            successor_val = 0
            initial_state_val = maze_values[row][col]
            
            if ((row, col) in terminal_state): 
                new_values[row][col] = 0
            else:
                # check left
                if (col - 1 < 0): 
                    successor_val += reward + maze_values[row][col]
                else: 
                    successor_val += (reward + maze_values[row][col - 1])
                # check right
                if (col + 1 >= len(maze_values[1])): 
                    successor_val += reward + maze_values[row][col]
                else: 
                    successor_val += (reward + maze_values[row][col + 1])
                # check up
                if (row - 1 < 0): 
                    successor_val += reward + maze_values[row][col]
                else: 
                    successor_val += (reward + maze_values[row - 1][col])
                # check down
                if (row + 1 >= len(maze_values[0])): 
                    successor_val += reward + maze_values[row][col]
                else: 
                    successor_val += (reward + maze_values[row+1][col])
                
                new_values[row][col] = 0.25 * successor_val
                
            
        
    return np.array(new_values) 
                
            
        
    

                

In [86]:
maze_values = np.zeros((4,4))

print(np.array(maze_values))

current_values = maze_values

for i in range(1000): 
    
    current_values = policy_eval(current_values)
    
    
print(current_values)



[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
[[  0. -14. -20. -22.]
 [-14. -18. -20. -20.]
 [-20. -20. -18. -14.]
 [-22. -20. -14.   0.]]


# Step Function

In [126]:
def step(pos, Q):
    
    eps = 0.5
    action = e_greedy(eps, pos, Q)
    successor_val = 0
    successor_state = pos
    reward = -1
    
    if action == 0: # check up
        if not (pos[0] - 1 < 0): 
            successor_state = (pos[0] - 1, pos[1])
    if action == 1: # check down
        if not (pos[0] + 1 > 3): 
            successor_state = (pos[0] + 1, pos[1])
    if action == 2: # check left
        if not (pos[1] - 1 < 0): 
            successor_state = (pos[0], pos[1] - 1)
    if action == 3: # check right
        if not (pos[1] + 1 > 3): 
            successor_state = (pos[0], pos[1] + 1)
    
    #if successor_state in terminal_state: 
        #reward = 0      
        
    if pos in terminal_state:
        reward = 0
                               
    
    return pos, action, reward, successor_state

# Monte Carlo Control

In [109]:
def e_greedy(eps, pos, Q): 
    
    
    
    #episilon = the probability of randomly choosing an action
    # probablility of choosing the optimal action is 1 - eps.
    
   
    probability = round(random.uniform(0,1), 2)
    
    if probability <= eps: 
        action = random.randint(0,3)
    else: 
       
        rewards = list(Q[pos])
        action = rewards.index(max(rewards))
        
      
    return action

In [122]:
def first_visit_mc_control(num_iterations, discount): 
    
    Q = defaultdict(lambda: np.zeros(4))
    episode = []
    sa_visited = []
    sa_occurences = defaultdict(int)
    
    
    for it in range(1, num_iterations + 1): 
        
        is_terminal = False
        initial_state = (random.randint(0,3), random.randint(0,3))
        
        while not is_terminal:
            
            # the action must be chosen with the epsilon greedy policy. 
            initial_state, action, reward, successor_state = step_control(initial_state, Q)
            sa = (initial_state, action)
            episode.append((sa, reward))
            sa_occurences[sa] += 1
            initial_state = successor_state
            if initial_state in terminal_state: 
                is_terminal = True
            
        current_return = 0
        power = len(episode)
        for sa, reward in reversed(episode): 
                 
            if sa not in sa_visited: 
                    
                current_return += (discount ** power) * reward
                
                Q[sa[0]][sa[1]] += round((1 / sa_occurences[sa]) * (reward - Q[sa[0]][sa[1]]), 2)
                    
                sa_visited.append(sa)
                
                power -= 1    
    return Q
   
                
                
        
        

In [111]:
Q = first_visit_mc_control(2000, 0.7)

In [112]:
# construct policy map

grid = np.zeros((4,4))

for state in Q.keys(): 
    
    grid[state[0], state[1]] = max(Q[state])

grid

array([[ 0.  , -0.25, -0.5 , -0.06],
       [-0.2 , -0.25, -0.25, -0.11],
       [-1.  , -1.  , -1.  , -0.25],
       [-0.08, -0.2 , -0.5 ,  0.  ]])

# Temporal Difference Learning

In [113]:
def td_learning(num_iterations, alpha, discount): 
    
    Q = defaultdict(lambda: np.zeros(4))

    for it in range(1, num_iterations + 1): 
        
        is_terminal = False
        initial_state = (random.randint(0,3), random.randint(0,3))
    
        while not is_terminal:

            # the action must be chosen with the epsilon greedy policy. 
            initial_state, action, reward, successor_state = step(initial_state, Q)
    
            Q[initial_state][action] += alpha * (reward + discount * max(Q[successor_state] - Q[initial_state][action]))
        
            if initial_state in terminal_state: 
                is_terminal = True
            
            initial_state = successor_state
                                             
    return Q
                                        
                

In [114]:
Q = td_learning(2000, 0.5, 0.5)

In [115]:
grid = np.zeros((4,4))

for state in Q.keys(): 
    
    grid[state[0], state[1]] = max(Q[state])

grid

array([[ 0., -2., -4., -6.],
       [-2., -4., -6., -4.],
       [-4., -6., -4., -2.],
       [-6., -4., -2.,  0.]])

In [116]:
Q = td_learning(1000, 0.5, 0.5)


grid = np.zeros((4,4))

for state in Q.keys(): 
    
    grid[state[0], state[1]] = max(Q[state])

grid


array([[ 0.        , -2.        , -4.        , -5.99999969],
       [-2.        , -4.        , -5.99999319, -4.        ],
       [-4.        , -5.99999263, -4.        , -2.        ],
       [-5.99999993, -4.        , -2.        ,  0.        ]])

In [117]:
Q = td_learning(500, 0.5, 0.5)

grid = np.zeros((4,4))

for state in Q.keys(): 
    
    grid[state[0], state[1]] = max(Q[state])

grid


array([[ 0.        , -2.        , -4.        , -5.99982938],
       [-2.        , -3.99999782, -5.98060658, -4.        ],
       [-4.        , -5.98655542, -3.99997676, -2.        ],
       [-5.99642811, -3.99999999, -2.        ,  0.        ]])

In [118]:
Q = td_learning(650, 0.5, 0.5)

grid = np.zeros((4,4))

for state in Q.keys(): 
    
    grid[state[0], state[1]] = max(Q[state])

grid

array([[ 0.        , -2.        , -4.        , -5.99922538],
       [-2.        , -3.99999985, -5.99832279, -4.        ],
       [-4.        , -5.99796553, -3.9999983 , -2.        ],
       [-5.99998019, -4.        , -2.        ,  0.        ]])