# Project 1 Froze Lake Problem

Png Qun Shen

A0199519J

png.qunshen@u.nus.edu

## Import statements

In [1]:
# import statements
import numpy as np
import pandas as pd
from Monte_Carlo_without_es import Monte_carlo_without_es
from Sarsa import Sarsa
from Q_learning import Q_learning

Create the default environment for the 4x4 Froze Lake problem

In [2]:
# four by four obstacle
four_by_four_obs = [(1,1), (1,3), (2,3), (3,0)]

## Monte Carlo without Exploring Start

### Task 1: 4 x 4 Grid

Solve the problem once, printing the original map and the solution

X - holes

G - goal

O - path

In [3]:
mont1 = Monte_carlo_without_es(4, 4, obstacle_pos=four_by_four_obs)
print("Original map:")
print(mont1.get_map())
print("Number of iteration: {}".format(mont1.generate_path(10000)))
print("Solution:")
print(mont1.get_path_map())

Original map:
|            |
|    X     X |
|          X |
| X        G |
Number of iteration: 92
Solution:
| O          |
| O  X     X |
| O  O  O  X |
| X     O  G |


To find the optimal parameters (discount rate, epsilon), the parameters are iterated over a few values.

For each set of parameters, the problem is solved 30 times, and the number of iterations in each run is recorded

In [4]:
epsilon_lst = [0.1, 0.2, 0.3, 0.4, 0.5]
discount_rate_lst = [0.6, 0.7, 0.8, 0.9, 1.0]
mont_std_dev = []
mont_mean = []
for epsilon in epsilon_lst: # loop through epsilon
    std_dev_lst = []
    mean_lst = []
    for discount_rate in discount_rate_lst: # loop through discount_rate
        mont_lst = np.empty((0), int)

        # loop 30 times
        for i in range(30):
            mont = Monte_carlo_without_es(4, 4, epsilon=epsilon, \
                                          discount_rate=discount_rate, \
                                            obstacle_pos=four_by_four_obs)
            mont_lst = np.append(mont_lst, np.array([mont.generate_path(1000)]), axis = 0)
        std_dev_lst.append(np.std(mont_lst))
        mean_lst.append(mont_lst.mean())
    mont_mean.append(mean_lst)
    mont_std_dev.append(std_dev_lst)
print("Standard deviation:")
print(pd.DataFrame(mont_std_dev, index=epsilon_lst, columns=discount_rate_lst))
print()
print("Mean:")
print(pd.DataFrame(mont_mean, index=epsilon_lst, columns=discount_rate_lst))

Standard deviation:
            0.6         0.7         0.8         0.9         1.0
0.1  366.774113  311.089107  394.635224  413.301624  426.849181
0.2  324.240686  283.836737  325.769387  275.942879  396.344244
0.3  352.770698  245.111641  215.936987  150.471190  319.948926
0.4  383.989473  286.232386   98.752041  108.937133  344.713304
0.5  418.473289  370.900526  212.296428  144.236318  289.078182

Mean:
            0.6         0.7         0.8         0.9         1.0
0.1  569.500000  351.633333  378.800000  354.366667  696.100000
0.2  300.333333  281.200000  295.200000  222.166667  558.200000
0.3  403.966667  248.500000  200.466667  111.566667  335.533333
0.4  524.866667  246.766667  133.966667   96.966667  356.733333
0.5  543.200000  355.000000  142.600000  115.533333  279.066667


### Task 2: 10 x 10 Grid

Solve the problem once, printing the original map and the solution

X - holes

G - goal

O - path

In [5]:
mont2 = Monte_carlo_without_es(10, 10, epsilon=0.4, discount_rate=0.9)
print("Original map:")
print(mont2.get_map())
print("Number of iteration: {}".format(mont2.generate_path(100000)))
print("Solution:")
print(mont2.get_path_map())

Original map:
|             X     X  X       |
|                X           X |
| X  X                         |
| X     X                      |
|    X           X  X     X    |
|                         X    |
|                   X          |
| X  X     X                   |
|    X     X     X     X       |
|    X              X     X  G |
Number of iteration: 99999
Solution:
| O  O  O     X     X  X       |
|                X           X |
| X  X                         |
| X     X                      |
|    X           X  X     X    |
|                         X    |
|                   X          |
| X  X     X                   |
|    X     X     X     X       |
|    X              X     X  G |


## Sarsa

### Task 1: 4 x 4 Grid

Solve the problem once, printing the original map and the solution

X - holes

G - goal

O - path

In [6]:
sarsa1 = Sarsa(4, 4, obstacle_pos=four_by_four_obs)
print("Original map:")
print(sarsa1.get_map())
print("Number of iteration: {}".format(sarsa1.generate_path(1000)))
print("Solution:")
print(sarsa1.get_path_map())

Original map:
|            |
|    X     X |
|          X |
| X        G |
Number of iteration: 21
Solution:
| O          |
| O  X     X |
| O  O  O  X |
| X     O  G |


Solve the problem 30 times, and store the number of iterations in each run, printing the statistics

In [7]:
epsilon_lst = [0.1, 0.2, 0.3, 0.4, 0.5]
discount_rate_lst = [0.6, 0.7, 0.8, 0.9, 1.0]
sarsa_std_dev = []
sarsa_mean = []
for epsilon in epsilon_lst: # loop through epsilon
    std_dev_lst = []
    mean_lst = []
    for discount_rate in discount_rate_lst: # loop through discount_rate
        sarsa_lst = np.empty((0), int)

        # loop 30 times
        for i in range(30):
            sarsa = Sarsa(4, 4, epsilon=epsilon, \
                          discount_rate=discount_rate, \
                            obstacle_pos=four_by_four_obs)
            sarsa_lst = np.append(sarsa_lst, np.array([sarsa.generate_path(1000)]), axis = 0)
        std_dev_lst.append(np.std(sarsa_lst))
        mean_lst.append(sarsa_lst.mean())
    sarsa_mean.append(mean_lst)
    sarsa_std_dev.append(std_dev_lst)
print("Standard deviation:")
print(pd.DataFrame(sarsa_std_dev, index=epsilon_lst, columns=discount_rate_lst))
print()
print("Mean:")
print(pd.DataFrame(sarsa_mean, index=epsilon_lst, columns=discount_rate_lst))

Standard deviation:
           0.6        0.7        0.8        0.9        1.0
0.1   6.635008   5.136038   8.015333   7.298782   6.821209
0.2   8.923378   8.154140   6.988960   7.141117   7.910682
0.3   8.062602   7.404278   7.495258   6.902174   6.132337
0.4   7.651071  11.401559   7.472988   6.641954   7.854510
0.5  12.088792   8.753476  11.670285  12.237465  10.064238

Mean:
           0.6        0.7        0.8        0.9        1.0
0.1  18.100000  16.433333  18.566667  17.166667  18.266667
0.2  19.200000  18.100000  18.433333  18.933333  18.566667
0.3  24.833333  21.100000  21.233333  19.400000  19.833333
0.4  23.833333  25.066667  23.766667  22.866667  25.800000
0.5  29.833333  26.100000  30.933333  28.333333  27.333333


### Task 2: 10 x 10 Grid

Solve the problem once, printing the original map and the solution

X - holes

G - goal

O - path

In [9]:
sarsa2 = Sarsa(10, 10, epsilon=0.1, discount_rate=0.7)
print("Original map:")
print(sarsa2.get_map())
print("Number of iteration: {}".format(sarsa2.generate_path(100000)))
print("Solution:")
print(sarsa2.get_path_map())

Original map:
|                         X    |
|          X  X                |
| X     X     X     X     X    |
|                              |
|                X  X        X |
|       X  X  X  X        X  X |
|                              |
| X  X           X  X          |
|                      X       |
|    X  X                    G |
Number of iteration: 99999
Solution:
| O  O  O                 X    |
|          X  X                |
| X     X     X     X     X    |
|                              |
|                X  X        X |
|       X  X  X  X        X  X |
|                              |
| X  X           X  X          |
|                      X       |
|    X  X                    G |


## Q-Learning

### Task 1: 4 x 4 Grid

Solve the problem once, printing the original map and the solution

X - holes

G - goal

O - path

In [10]:
q_learn1 = Q_learning(4, 4, obstacle_pos=four_by_four_obs)
print("Original map:")
print(q_learn1.get_map())
print("Number of iteration: {}".format(q_learn1.generate_path(1000)))
print("Solution:")
print(q_learn1.get_path_map())

Original map:
|            |
|    X     X |
|          X |
| X        G |
Number of iteration: 999
Solution:
| O  O  O  O |
|    X     X |
|          X |
| X        G |


Solve the problem 30 times, and store the number of iterations in each run, printing the statistics

In [11]:
epsilon_lst = [0.1, 0.2, 0.3, 0.4, 0.5]
discount_rate_lst = [0.6, 0.7, 0.8, 0.9, 1.0]
q_learn_std_dev = []
q_learn_mean = []
for epsilon in epsilon_lst: # loop through epsilon
    std_dev_lst = []
    mean_lst = []
    for discount_rate in discount_rate_lst: # loop through discount_rate
        q_learn_lst = np.empty((0), int)

        # loop 30 times
        for i in range(30):
            q_learn = Q_learning(4, 4, epsilon=epsilon, \
                                 discount_rate=discount_rate, \
                                    obstacle_pos=four_by_four_obs)
            q_learn_lst = np.append(q_learn_lst, np.array([q_learn.generate_path(1000)]), axis = 0)
        std_dev_lst.append(np.std(q_learn_lst))
        mean_lst.append(q_learn_lst.mean())
    q_learn_mean.append(mean_lst)
    q_learn_std_dev.append(std_dev_lst)
print("Standard deviation:")
print(pd.DataFrame(q_learn_std_dev, index=epsilon_lst, columns=discount_rate_lst))
print()
print("Mean:")
print(pd.DataFrame(q_learn_mean, index=epsilon_lst, columns=discount_rate_lst))

Standard deviation:
            0.6         0.7         0.8         0.9         1.0
0.1    0.000000  167.731204  155.143532    0.000000    0.000000
0.2  201.886957  287.448191  254.890131  305.234673  273.893134
0.3  328.674307  314.550234  329.057045  314.186668  295.465501
0.4  314.952159  347.198957  379.708053  342.738767  322.814849
0.5  332.908890  322.713855  257.130276  311.781626  358.966083

Mean:
            0.6         0.7         0.8         0.9         1.0
0.1  999.000000  955.100000  959.533333  999.000000  999.000000
0.2  900.300000  865.933333  850.233333  836.166667  858.466667
0.3  785.000000  762.500000  772.833333  727.066667  803.066667
0.4  545.933333  563.866667  561.833333  548.933333  574.800000
0.5  447.733333  423.033333  376.433333  420.466667  489.133333


### Task 2: 10 x 10 Grid

Solve the problem once, printing the original map and the solution

X - holes

G - goal

O - path

In [12]:
q_learn2 = Q_learning(10, 10, epsilon=0.5, discount_rate=0.8)
print("Original map:")
print(q_learn2.get_map())
print("Number of iteration: {}".format(q_learn2.generate_path(1000)))
print("Solution:")
print(q_learn2.get_path_map())

Original map:
|                   X          |
|       X     X     X          |
|                X             |
|    X        X        X     X |
| X     X              X       |
| X           X     X        X |
|          X        X          |
|                X  X          |
| X           X              X |
| X           X              G |
Number of iteration: 999
Solution:
| O  O  O  O  O  O  X          |
|       X     X  O  X          |
|                X             |
|    X        X        X     X |
| X     X              X       |
| X           X     X        X |
|          X        X          |
|                X  X          |
| X           X              X |
| X           X              G |
