# Load Env

In [1]:
from Environment.Model import MarkovDecisionProcess, util_2DtoMDP

In [2]:
# Transition table for a small three-state MDP, laid out as
# state -> action -> {next_state: probability}.
env = {
    "A": {
        "X": {"A": 0.3, "B": 0.7},
        "Y": {"A": 1.0},
    },
    "B": {
        "X": {"End": 0.8, "B": 0.2},
        "Y": {"A": 1.0},
    },
    # "End" has no outgoing actions.
    "End": {},
}

# Reward attached to each state.
init_rewards = {"A": 1, "B": 0.6, "End": 100}

In [3]:
# Build the MDP from the hand-written transition table and its reward table.
model = MarkovDecisionProcess(env=env,rewards=init_rewards)

In [4]:
# Outcomes of taking action 'X' in state 'A'; displayed as (probability, next_state) pairs.
model.get_transition('A','X')

[(0.3, 'A'), (0.7, 'B')]

In [5]:
# States known to the model.
model.states

['A', 'B', 'End']

In [6]:
# Reward stored for state 'A'.
model.get_reward('A')

1

In [7]:
# Actions available in 'End'. That state has no actions in the transition table,
# and the model reports this as [None].
model.get_actions('End')

[None]

In [8]:
# States the model reports as terminal.
model.terminals

['End']

In [9]:
# 3 x 4 grid of per-cell values that is later handed to util_2DtoMDP
# (presumably the reward of each cell -- confirm against util_2DtoMDP).
matrix = [
    [-0.1, -0.1, -0.1, -0.1],
    [-0.1, -0.1, -0.1, -0.1],
    [-1, -0.1, -0.1, 1],
]

In [10]:
import numpy as np

In [11]:
# NumPy array built from `matrix`; the following cells use it to check shape and indexing.
mm = np.asarray(matrix)

In [12]:
# Grid dimensions as (rows, columns).
mm.shape

(3, 4)

In [13]:
# Entry at row 2, column 0: the -1 cell in the bottom-left corner of the grid.
mm[2,0]

-1.0

In [14]:
# Convert the 2-D grid into a transition table keyed by (row, col) plus a rewards object.
# NOTE(review): this rebinds `env`, replacing the hand-written three-state table defined
# earlier. Re-running the cell that builds `model` after this one would use the grid
# environment instead; distinct names (e.g. grid_env / grid_rewards) would avoid that.
env, rewards = util_2DtoMDP(matrix)

In [15]:
# Inspect the generated transition table and rewards.
# Each (row, col) state maps action names ('up', 'down', 'left', 'right', 'stop')
# to a {next_state: probability} dict.
env, rewards

(defaultdict(dict,
             {(0, 0): {'down': {(1, 0): 1.0},
               'right': {(0, 1): 1.0},
               'stop': {(0, 0): 1.0}},
              (0, 1): {'down': {(1, 1): 1.0},
               'left': {(0, 0): 1.0},
               'right': {(0, 2): 1.0},
               'stop': {(0, 1): 1.0}},
              (0, 2): {'down': {(1, 2): 1.0},
               'left': {(0, 1): 1.0},
               'right': {(0, 3): 1.0},
               'stop': {(0, 2): 1.0}},
              (0, 3): {'down': {(1, 3): 1.0},
               'left': {(0, 2): 1.0},
               'stop': {(0, 3): 1.0}},
              (1, 0): {'down': {(2, 0): 1.0},
               'right': {(1, 1): 1.0},
               'stop': {(1, 0): 1.0},
               'up': {(0, 0): 1.0}},
              (1, 1): {'down': {(2, 1): 1.0},
               'left': {(1, 0): 1.0},
               'right': {(1, 2): 1.0},
               'stop': {(1, 1): 1.0},
               'up': {(0, 1): 1.0}},
              (1, 2): {'down': {(2, 2): 1.0},
      

In [16]:
# MDP for the grid world, built from the util_2DtoMDP output
# (`env` and `rewards` here are the grid versions, not the three-state ones).
gridmodel = MarkovDecisionProcess(env, rewards)

# Load Agent

In [17]:
from Agent.DecisionMaker import DecisionMaker

In [18]:
# Decision maker operating on the three-state model.
dm = DecisionMaker(model)

In [19]:
# Value iteration with discount 0.9 and a `threshold` of 0.01
# (presumably the convergence tolerance -- confirm in DecisionMaker).
# The displayed result is a (utilities, policy) pair of dicts keyed by state.
dm.ValueIteration(discount=0.9,threshold=0.01)

({'A': 77.77674745942798, 'B': 88.53658222610669, 'End': 100.0},
 {'A': 'X', 'B': 'X', 'End': None})

In [20]:
# Same problem, driven by `epoch=100000` instead of a threshold
# (presumably a fixed iteration count -- confirm in DecisionMaker).
# The policy is identical to the threshold run, and the utilities differ by under 0.002.
dm.ValueIteration(0.9,epoch=100000)

({'A': 77.77814901436685, 'B': 88.53658536585365, 'End': 100.0},
 {'A': 'X', 'B': 'X', 'End': None})

In [21]:
# Q-learning on the three-state model starting from state 'A'.
# Returns the action-value table Q, the per-state utilities U and the resulting policy.
# Ne / Rplus look like exploration-function parameters (minimum tries per action and an
# optimistic reward estimate) -- TODO confirm against DecisionMaker.QLearning.
# NOTE(review): no random seed is set in the visible cells; if QLearning samples
# transitions randomly, the numbers shown below will not reproduce exactly.
Q,U,policy = dm.QLearning(initial_state='A', discount=0.9, Ne=5, Rplus=2, epoch=10000)

In [22]:
# Learned action values, keyed by (state, action).
Q

defaultdict(float,
            {('A', 'X'): 4.586495497999105,
             ('A', 'Y'): 1.3715695833333332,
             ('B', 'X'): 2.1690499244848667,
             ('B', 'Y'): 1.6399837072115568,
             ('End', None): 1.048462333622274})

In [23]:
# Learned per-state utilities; in the recorded output each equals the largest Q value of that state.
# NOTE(review): these values (A ~ 4.6, B ~ 2.2, End ~ 1.0) are far from the ValueIteration
# utilities recorded earlier in this notebook (A ~ 77.8, B ~ 88.5, End = 100), although the
# policy agrees. Worth checking whether QLearning has converged or scales rewards differently.
U

defaultdict(<function Agent.DecisionMaker.DecisionMaker.QLearning.<locals>.<lambda>>,
            {'A': 4.586495497999105,
             'B': 2.1690499244848667,
             'End': 1.048462333622274})

In [24]:
# Policy returned by QLearning (state -> action); it matches the ValueIteration policy shown earlier.
policy

{'A': 'X', 'B': 'X', 'End': None}

In [25]:
# Decision maker operating on the grid-world model.
grid_dm = DecisionMaker(gridmodel)

In [26]:
# Value iteration on the grid with a first argument of 0.9 (the discount, judging by the
# keyword call on the three-state model) and a tight `threshold` of 1e-5.
# The displayed result is a (utilities, policy) pair keyed by (row, col).
grid_dm.ValueIteration(0.9,threshold=0.00001)

({(0, 0): 5.4953066475158261,
  (0, 1): 6.2170166475158251,
  (0, 2): 7.0189166475158231,
  (0, 3): 7.9099166475158231,
  (1, 0): 6.2170166475158251,
  (1, 1): 7.0189166475158231,
  (1, 2): 7.9099166475158231,
  (1, 3): 8.8999166475158233,
  (2, 0): 6.1189166475158228,
  (2, 1): 7.9099166475158231,
  (2, 2): 8.8999166475158233,
  (2, 3): 9.999916647515823},
 {(0, 0): 'down',
  (0, 1): 'down',
  (0, 2): 'down',
  (0, 3): 'down',
  (1, 0): 'right',
  (1, 1): 'down',
  (1, 2): 'down',
  (1, 3): 'down',
  (2, 0): 'right',
  (2, 1): 'right',
  (2, 2): 'right',
  (2, 3): 'stop'})

In [27]:
# Same grid problem, driven by `epoch=100` instead of a threshold.
# The recorded policy matches the threshold run, and the utilities agree to about three decimals.
grid_dm.ValueIteration(0.9,epoch=100)

({(0, 0): 5.4951243860111267,
  (0, 1): 6.2168343860111275,
  (0, 2): 7.0187343860111264,
  (0, 3): 7.9097343860111273,
  (1, 0): 6.2168343860111275,
  (1, 1): 7.0187343860111264,
  (1, 2): 7.9097343860111273,
  (1, 3): 8.8997343860111275,
  (2, 0): 6.1187343860111261,
  (2, 1): 7.9097343860111273,
  (2, 2): 8.8997343860111275,
  (2, 3): 9.9997343860111272},
 {(0, 0): 'down',
  (0, 1): 'down',
  (0, 2): 'down',
  (0, 3): 'down',
  (1, 0): 'right',
  (1, 1): 'down',
  (1, 2): 'down',
  (1, 3): 'down',
  (2, 0): 'right',
  (2, 1): 'right',
  (2, 2): 'right',
  (2, 3): 'stop'})

In [28]:
# Q-learning on the grid starting from cell (0, 0) with epoch=1000000.
# The returned tuple is displayed directly rather than bound to names, so the full
# action-value table is dumped into the cell output.
# NOTE(review): no random seed is set in the visible cells; if QLearning samples
# randomly, this output will not reproduce exactly -- confirm in DecisionMaker.
grid_dm.QLearning((0,0), 0.9, Ne=5, Rplus=2,epoch=1000000)

(defaultdict(float,
             {((0, 0), 'down'): -0.097142857142857156,
              ((0, 0), 'right'): -0.085714285714285729,
              ((0, 0), 'stop'): -0.12808333333333333,
              ((0, 1), 'down'): -0.096428571428571433,
              ((0, 1), 'left'): -0.096734693877551028,
              ((0, 1), 'right'): -0.083333333333333343,
              ((0, 1), 'stop'): -0.12808333333333333,
              ((0, 2), 'down'): -0.083333333333333343,
              ((0, 2), 'left'): -0.083333333333333343,
              ((0, 2), 'right'): 0.0,
              ((0, 2), 'stop'): 0.0,
              ((0, 3), 'down'): 0.47137297916666659,
              ((0, 3), 'left'): 0.0,
              ((0, 3), 'stop'): 0.0,
              ((1, 0), 'down'): -0.49083333333333334,
              ((1, 0), 'right'): -0.096875000000000017,
              ((1, 0), 'stop'): -0.12808333333333333,
              ((1, 0), 'up'): -0.096428571428571433,
              ((1, 1), 'down'): -0.096428571428571433,
           