# Project 1: Frozen Lake Problem

Png Qun Shen

A0199519J

png.qunshen@u.nus.edu

## Import statements

In [1]:
# import statements
import numpy as np
import pandas as pd
from Monte_Carlo_without_es import Monte_carlo_without_es
from Sarsa import Sarsa
from Q_learning import Q_learning

## Task 1: 4 x 4 Grid

Create the default environment for the 4 x 4 Frozen Lake problem for Task 1.

In [2]:
# Hole (obstacle) coordinates for the 4 x 4 map; passed as `obstacle_pos`
# to every Task 1 solver below.
# NOTE(review): tuples are presumably (row, col) -- confirm against the solver classes.
four_by_four_obs = [
    (1, 1),
    (1, 3),
    (2, 3),
    (3, 0),
]

In [None]:
# Monte Carlo (without exploring starts) on the fixed 4 x 4 map.
# No epsilon / discount_rate is passed, so the class defaults apply.
mont1 = Monte_carlo_without_es(4, 4, obstacle_pos=four_by_four_obs)

# Show the environment before any training.
print("Original map:")
mont1_map = mont1.get_map()
print(mont1_map)

# Train and report the full return value of generate_path.
# NOTE(review): 10000 is presumably the iteration cap -- confirm in Monte_Carlo_without_es.
mont1_result = mont1.generate_path(10000)
print("Number of iteration: {}".format(mont1_result))

# Show the map annotated with the learned path.
print("Solution:")
mont1_path_map = mont1.get_path_map()
print(mont1_path_map)

In [None]:
# Sarsa on the fixed 4 x 4 map.
# No epsilon / discount_rate is passed, so the class defaults apply.
sarsa1 = Sarsa(4, 4, obstacle_pos=four_by_four_obs)

# Show the environment before any training.
print("Original map:")
sarsa1_map = sarsa1.get_map()
print(sarsa1_map)

# Train and report the full return value of generate_path.
# NOTE(review): 1000 is presumably the iteration cap -- confirm in Sarsa.
sarsa1_result = sarsa1.generate_path(1000)
print("Number of iteration: {}".format(sarsa1_result))

# Show the map annotated with the learned path.
print("Solution:")
sarsa1_path_map = sarsa1.get_path_map()
print(sarsa1_path_map)

In [None]:
# Q-learning on the fixed 4 x 4 map.
# No epsilon / discount_rate is passed, so the class defaults apply.
q_learn1 = Q_learning(4, 4, obstacle_pos=four_by_four_obs)

# Show the environment before any training.
print("Original map:")
q_learn1_map = q_learn1.get_map()
print(q_learn1_map)

# Train and report the full return value of generate_path.
# NOTE(review): 1000 is presumably the iteration cap -- confirm in Q_learning.
q_learn1_result = q_learn1.generate_path(1000)
print("Number of iteration: {}".format(q_learn1_result))

# Show the map annotated with the learned path.
print("Solution:")
q_learn1_path_map = q_learn1.get_path_map()
print(q_learn1_path_map)

### Parameter Tuning

To find the optimal parameters (discount rate and epsilon), each parameter is swept over the values 0.1, 0.2, ..., 0.9.

For each combination of parameters, the problem is solved 30 times, the number of iterations in each run is recorded, and the mean over the 30 runs is reported.

#### Monte Carlo without Exploring Start

In [3]:
# Candidate values 0.1, 0.2, ..., 0.9 for both hyper-parameters.
epsilon_lst = [step * 0.1 for step in range(1, 10)]
discount_rate_lst = [step * 0.1 for step in range(1, 10)]

# mont_mean[r][c] is the mean of generate_path(10000)[0] over 30 fresh solvers
# trained with epsilon_lst[r] and discount_rate_lst[c].
mont_mean = []
for epsilon in epsilon_lst:
    mean_lst = []
    for discount_rate in discount_rate_lst:
        # A brand-new solver per run so that runs do not share learned state.
        run_counts = [
            Monte_carlo_without_es(
                4, 4,
                epsilon=epsilon,
                discount_rate=discount_rate,
                obstacle_pos=four_by_four_obs,
            ).generate_path(10000)[0]
            for _ in range(30)
        ]
        mean_lst.append(np.array(run_counts).mean())
    mont_mean.append(mean_lst)

# Rows are epsilon values, columns are discount rates.
mont_table = pd.DataFrame(mont_mean, index=epsilon_lst, columns=discount_rate_lst)
print("Mean number of iterations for Monte Carlo without Exploring Start:")
print(mont_table)

#### Sarsa

In [None]:
# Candidate values 0.1, 0.2, ..., 0.9 for both hyper-parameters.
epsilon_lst = [step * 0.1 for step in range(1, 10)]
discount_rate_lst = [step * 0.1 for step in range(1, 10)]

# sarsa_mean[r][c] is the mean of generate_path(10000)[0] over 30 fresh solvers
# trained with epsilon_lst[r] and discount_rate_lst[c].
sarsa_mean = []
for epsilon in epsilon_lst:
    mean_lst = []
    for discount_rate in discount_rate_lst:
        # A brand-new solver per run so that runs do not share learned state.
        run_counts = [
            Sarsa(
                4, 4,
                epsilon=epsilon,
                discount_rate=discount_rate,
                obstacle_pos=four_by_four_obs,
            ).generate_path(10000)[0]
            for _ in range(30)
        ]
        mean_lst.append(np.array(run_counts).mean())
    sarsa_mean.append(mean_lst)

# Rows are epsilon values, columns are discount rates.
sarsa_table = pd.DataFrame(sarsa_mean, index=epsilon_lst, columns=discount_rate_lst)
print("Mean number of iterations for Sarsa:")
print(sarsa_table)

#### Q-Learning

In [None]:
# Candidate values 0.1, 0.2, ..., 0.9 for both hyper-parameters.
# The scaling must happen per element: multiplying a Python list by a float
# (`[...] * 0.1`) raises TypeError, because list repetition needs an integer.
epsilon_lst = [i * 0.1 for i in range(1,10)]
discount_rate_lst = [i * 0.1 for i in range(1,10)]

# q_learn_mean[r][c] is the mean of generate_path(1000)[0] over 30 fresh solvers
# trained with epsilon_lst[r] and discount_rate_lst[c].
q_learn_mean = []
for epsilon in epsilon_lst: # loop through epsilon
    mean_lst = []
    for discount_rate in discount_rate_lst: # loop through discount_rate
        q_learn_lst = np.empty((0), int)

        # loop 30 times, building a brand-new solver for every run
        for i in range(30):
            q_learn = Q_learning(4, 4, epsilon=epsilon, \
                                 discount_rate=discount_rate, \
                                    obstacle_pos=four_by_four_obs)
            # NOTE(review): the Monte Carlo and Sarsa tuning cells pass 10000 here;
            # confirm that 1000 is intentional for Q-learning.
            q_learn_lst = np.append(q_learn_lst, np.array([q_learn.generate_path(1000)[0]]), axis = 0)
        mean_lst.append(q_learn_lst.mean())
    q_learn_mean.append(mean_lst)

# Rows are epsilon values, columns are discount rates.
print("Mean number of iterations for Q-Learning:")
print(pd.DataFrame(q_learn_mean, index=epsilon_lst, columns=discount_rate_lst))

### Decayed $\epsilon$-greedy policy

Using the optimal discount rate, decayed $\epsilon$-greedy policy is attempted by setting epsilon value to None (default value).

The decayed $\epsilon$-greedy policy decreases the epsilon value linearly from 1 to 0.1 as the number of iterations increases, encouraging more exploration at the start and more exploitation towards the end.

The mean number of iterations over 30 runs is compared between using the optimal epsilon value and using the decayed $\epsilon$-greedy policy.

#### Monte Carlo without Exploring Start

In [None]:
# Compare decayed epsilon-greedy (epsilon=None) against a fixed epsilon of 0.1,
# both at discount_rate=0.9, over 30 independent runs.
mont_decayed_runs = []
mont_epsilon_runs = []
for _ in range(30):
    # Build both solvers first, then train them in the same order.
    mont_decayed = Monte_carlo_without_es(
        4, 4, discount_rate=0.9, epsilon=None, obstacle_pos=four_by_four_obs
    )
    mont_epsilon = Monte_carlo_without_es(
        4, 4, discount_rate=0.9, epsilon=0.1, obstacle_pos=four_by_four_obs
    )
    mont_decayed_runs.append(mont_decayed.generate_path(10000)[0])
    mont_epsilon_runs.append(mont_epsilon.generate_path(10000)[0])

# Keep the per-run values as arrays under the original names.
mont_decayed_lst = np.array(mont_decayed_runs)
mont_epsilon_lst = np.array(mont_epsilon_runs)

print("Monte Carlo without Exploring Start")
print("Mean number of iterations, decayed epsilon-greedy: {}".format(mont_decayed_lst.mean()))
print("Mean number of iterations, optimal epsilon: {}".format(mont_epsilon_lst.mean()))

#### Sarsa

In [None]:
# Compare decayed epsilon-greedy (epsilon=None) against a fixed epsilon of 0.1,
# both at discount_rate=0.9, over 30 independent runs.
sarsa_decayed_runs = []
sarsa_epsilon_runs = []
for _ in range(30):
    # Build both solvers first, then train them in the same order.
    sarsa_decayed = Sarsa(
        4, 4, discount_rate=0.9, epsilon=None, obstacle_pos=four_by_four_obs
    )
    sarsa_epsilon = Sarsa(
        4, 4, discount_rate=0.9, epsilon=0.1, obstacle_pos=four_by_four_obs
    )
    sarsa_decayed_runs.append(sarsa_decayed.generate_path(10000)[0])
    sarsa_epsilon_runs.append(sarsa_epsilon.generate_path(10000)[0])

# Keep the per-run values as arrays under the original names.
sarsa_decayed_lst = np.array(sarsa_decayed_runs)
sarsa_epsilon_lst = np.array(sarsa_epsilon_runs)

print("Sarsa")
print("Mean number of iterations, decayed epsilon-greedy: {}".format(sarsa_decayed_lst.mean()))
print("Mean number of iterations, optimal epsilon: {}".format(sarsa_epsilon_lst.mean()))

#### Q-Learning

In [None]:
# Compare decayed epsilon-greedy (epsilon=None) against a fixed epsilon of 0.1,
# both at discount_rate=0.9, over 30 independent runs.
q_learn_decayed_runs = []
q_learn_epsilon_runs = []
for _ in range(30):
    # Build both solvers first, then train them in the same order.
    q_learn_decayed = Q_learning(
        4, 4, discount_rate=0.9, epsilon=None, obstacle_pos=four_by_four_obs
    )
    q_learn_epsilon = Q_learning(
        4, 4, discount_rate=0.9, epsilon=0.1, obstacle_pos=four_by_four_obs
    )
    q_learn_decayed_runs.append(q_learn_decayed.generate_path(10000)[0])
    q_learn_epsilon_runs.append(q_learn_epsilon.generate_path(10000)[0])

# Keep the per-run values as arrays under the original names.
q_learn_decayed_lst = np.array(q_learn_decayed_runs)
q_learn_epsilon_lst = np.array(q_learn_epsilon_runs)

print("Q-Learning")
print("Mean number of iterations, decayed epsilon-greedy: {}".format(q_learn_decayed_lst.mean()))
print("Mean number of iterations, optimal epsilon: {}".format(q_learn_epsilon_lst.mean()))

### Reward Shaping

Using the optimal discount rate and the decayed $\epsilon$-greedy policy, reward shaping is attempted. Two reward shaping techniques are used: Manhattan distance and Artificial Potential Field.

Manhattan distance: the Manhattan distance from each cell to the goal is calculated and scaled by dividing by the maximum possible Manhattan distance (number of rows + number of columns), so that the value lies between 0 and 1. The scaled value is then subtracted from 1 to give the reward for that cell. This produces a higher positive reward the closer the cell is to the goal in terms of Manhattan distance. ($reward = 1 - man\_dist(cell, goal)/(num\_row + num\_col)$)

Artificial Potential Field: a potential field is generated such that each hole generates repulsion and the goal generates attraction.

$$att = \begin{cases}
    \frac{\alpha}{dist_{man}\left(cell, goal\right)}, 
    & \text{if}\ dist_{man}\left(cell, goal\right)\leq max_{cell, goal} \\
    0, & \text{if}\ dist_{man}\left(cell, goal\right)> max_{cell, goal}
\end{cases}$$
$$rep_{i} = \begin{cases}
    -\frac{\beta}{dist_{man}\left(hole_{i}, cell\right)^{2}}\left(\frac{1}{dist_{man}\left(hole_{i}, cell\right)}
    -\frac{1}{dist_{man}\left(cell, goal\right)}\right), & \text{if}\ dist_{man}\left(hole_{i}, cell\right)\leq max_{hole_{i}, cell} \\
    0, & \text{if}\ dist_{man}\left(hole_{i}, cell\right)> max_{hole_{i}, cell}
\end{cases}$$
$$potential = att + \sum_{i}{rep_{i}}$$

The potential at each cell is the reward at the cell. This is done to discourage the robot from going towards the holes, and encourage the robot to go towards the goal (just like using only manhattan distance)

The mean number of iterations over 30 runs with each reward shaping technique is compared with the mean when no reward shaping is used.

#### Monte Carlo without Exploring Start

In [None]:
# Per-run results keyed by the reward_shape argument; insertion order
# (None, "manhattan", "apf") fixes the construction and training order.
mont_runs = {None: [], "manhattan": [], "apf": []}
for _ in range(30):
    # Build all three solvers first, then train them in the same order.
    mont_solvers = {
        shape: Monte_carlo_without_es(
            4, 4,
            discount_rate=0.9,
            epsilon=None,
            reward_shape=shape,
            obstacle_pos=four_by_four_obs,
        )
        for shape in mont_runs
    }
    for shape, solver in mont_solvers.items():
        mont_runs[shape].append(solver.generate_path(10000)[0])

# Keep the per-run values as arrays under the original names.
mont_no_rew_lst = np.array(mont_runs[None])
mont_man_lst = np.array(mont_runs["manhattan"])
mont_apf_lst = np.array(mont_runs["apf"])

print("Monte Carlo without Exploring Start")
print("Mean number of iterations, No reward shaping: {}".format(mont_no_rew_lst.mean()))
print("Mean number of iterations, Manhattan distance: {}".format(mont_man_lst.mean()))
print("Mean number of iterations, Artificial Potential Field: {}".format(mont_apf_lst.mean()))

#### Sarsa

In [None]:
# Per-run results keyed by the reward_shape argument; insertion order
# (None, "manhattan", "apf") fixes the construction and training order.
sarsa_runs = {None: [], "manhattan": [], "apf": []}
for _ in range(30):
    # Build all three solvers first, then train them in the same order.
    sarsa_solvers = {
        shape: Sarsa(
            4, 4,
            discount_rate=0.9,
            epsilon=None,
            reward_shape=shape,
            obstacle_pos=four_by_four_obs,
        )
        for shape in sarsa_runs
    }
    for shape, solver in sarsa_solvers.items():
        sarsa_runs[shape].append(solver.generate_path(10000)[0])

# Keep the per-run values as arrays under the original names.
sarsa_no_rew_lst = np.array(sarsa_runs[None])
sarsa_man_lst = np.array(sarsa_runs["manhattan"])
sarsa_apf_lst = np.array(sarsa_runs["apf"])

print("Sarsa")
print("Mean number of iterations, No reward shaping: {}".format(sarsa_no_rew_lst.mean()))
print("Mean number of iterations, Manhattan distance: {}".format(sarsa_man_lst.mean()))
print("Mean number of iterations, Artificial Potential Field: {}".format(sarsa_apf_lst.mean()))

#### Q-Learning

In [None]:
# Per-run results keyed by the reward_shape argument; insertion order
# (None, "manhattan", "apf") fixes the construction and training order.
q_learn_runs = {None: [], "manhattan": [], "apf": []}
for _ in range(30):
    # Build all three solvers first, then train them in the same order.
    q_learn_solvers = {
        shape: Q_learning(
            4, 4,
            discount_rate=0.9,
            epsilon=None,
            reward_shape=shape,
            obstacle_pos=four_by_four_obs,
        )
        for shape in q_learn_runs
    }
    for shape, solver in q_learn_solvers.items():
        q_learn_runs[shape].append(solver.generate_path(10000)[0])

# Keep the per-run values as arrays under the original names.
q_learn_no_rew_lst = np.array(q_learn_runs[None])
q_learn_man_lst = np.array(q_learn_runs["manhattan"])
q_learn_apf_lst = np.array(q_learn_runs["apf"])

print("Q Learning")
print("Mean number of iterations, No reward shaping: {}".format(q_learn_no_rew_lst.mean()))
print("Mean number of iterations, Manhattan distance: {}".format(q_learn_man_lst.mean()))
print("Mean number of iterations, Artificial Potential Field: {}".format(q_learn_apf_lst.mean()))

## Task 2: 10 x 10 Grid

Randomly generate a 10 x 10 environment with 25% holes.

Solve the problem using the parameters found in task 1, printing the original map and the solution

X - holes

G - goal

O - path

In [None]:
# Monte Carlo (without exploring starts) on a 10 x 10 map.
# No obstacle_pos is given here, unlike the Task 1 cells; epsilon is left at its default.
mont2 = Monte_carlo_without_es(10, 10, discount_rate=0.9)

# Show the environment before any training.
print("Original map:")
mont2_map = mont2.get_map()
print(mont2_map)

# Train and report the full return value of generate_path.
# NOTE(review): 100000 is presumably the iteration cap -- confirm in Monte_Carlo_without_es.
mont2_result = mont2.generate_path(100000)
print("Number of iteration: {}".format(mont2_result))

# Show the map annotated with the learned path.
print("Solution:")
mont2_path_map = mont2.get_path_map()
print(mont2_path_map)

In [None]:
# Sarsa on a 10 x 10 map.
# No obstacle_pos is given here, unlike the Task 1 cells; epsilon is left at its default.
sarsa2 = Sarsa(10, 10, discount_rate=0.8)

# Show the environment before any training.
print("Original map:")
sarsa2_map = sarsa2.get_map()
print(sarsa2_map)

# Train and report the full return value of generate_path.
# NOTE(review): 1000000 is presumably the iteration cap -- confirm in Sarsa.
sarsa2_result = sarsa2.generate_path(1000000)
print("Number of iteration: {}".format(sarsa2_result))

# Show the map annotated with the learned path.
print("Solution:")
sarsa2_path_map = sarsa2.get_path_map()
print(sarsa2_path_map)

In [None]:
# Q-learning on a 10 x 10 map with a fixed epsilon of 0.5.
# No obstacle_pos is given here, unlike the Task 1 cells.
q_learn2 = Q_learning(10, 10, epsilon=0.5, discount_rate=0.8)

# Show the environment before any training.
print("Original map:")
q_learn2_map = q_learn2.get_map()
print(q_learn2_map)

# Train and report the full return value of generate_path.
# NOTE(review): 100000 is presumably the iteration cap -- confirm in Q_learning.
q_learn2_result = q_learn2.generate_path(100000)
print("Number of iteration: {}".format(q_learn2_result))

# Show the map annotated with the learned path.
print("Solution:")
q_learn2_path_map = q_learn2.get_path_map()
print(q_learn2_path_map)