In [48]:
import numpy as np
import random as rand
# Given a mouse, teach the mouse to collect all the cheese and avoid dying.
# The following abbreviations are used: M - mouse, C - cheese,
# CC- big pile of cheese, E - empty field, P - poison
# M C E
# E P CC

# 0. Read the link.
# 1. Change the table
# 2. Allow the mouse to move diagonally

def convert_2d_index_to_1d(two_dim_index: tuple, num_cols: int):
  """Flatten a (row, col) position into a row-major index.

  Args:
    two_dim_index: Position as (row, col).
    num_cols: Number of columns in the grid.

  Returns:
    row * num_cols + col.
  """
  row = two_dim_index[0]
  col = two_dim_index[1]
  return row * num_cols + col


def convert_1d_index_to_2d(two_dim_index: int, num_cols: int):
  """Expand a row-major flat index into a (row, col) position.

  This is the inverse of the row-major flattening
  ``row * num_cols + col``.

  Args:
    two_dim_index: The flat (1-D) index to convert. The parameter keeps
      its historical name so existing keyword callers still work, but it
      is a single integer, not a tuple.
    num_cols: Number of columns in the grid.

  Returns:
    A (row, col) tuple of integers.
  """
  # divmod yields (index // num_cols, index % num_cols) in one step; floor
  # division keeps the row an integer (true division produced a float).
  return divmod(two_dim_index, num_cols)


def is_pos_valid(cur_pos: tuple, num_rows: int, num_cols: int):
  """Tell whether a (row, col) position lies inside the grid.

  Args:
    cur_pos: Position as (row, col).
    num_rows: Number of rows in the grid.
    num_cols: Number of columns in the grid.

  Returns:
    True when 0 <= row < num_rows and 0 <= col < num_cols, else False.
  """
  # Reject on the row first; the column is only inspected for a valid row.
  if not (0 <= cur_pos[0] < num_rows):
    return False
  return 0 <= cur_pos[1] < num_cols

def compute_max_future_reward(Q, cur_state):
  """Return the largest Q-value stored for ``cur_state``.

  Args:
    Q: 2-D table indexed as Q[state, action].
    cur_state: Row of the table to inspect.

  Returns:
    The maximum entry of row ``cur_state``.
  """
  action_values = Q[cur_state, :]
  return np.max(action_values)

def tup_add(t1: tuple, t2: tuple):
  """Add two 2-component tuples element by element.

  Args:
    t1: First operand; only its first two components are used.
    t2: Second operand; only its first two components are used.

  Returns:
    A 2-tuple (t1[0] + t2[0], t1[1] + t2[1]).
  """
  first = t1[0] + t2[0]
  second = t1[1] + t2[1]
  return (first, second)

def episode_learn(num_rows, num_cols, alpha, gamma, Q, R, max_steps,
                  start_pos=(0, 0), final_pos=(1, 1)):
  """Run one random-walk episode and update the Q-table in place.

  Starting from ``start_pos``, a move is picked uniformly at random on
  every step. A move that stays on the grid triggers the Q-learning update

    Q[s, a] = (1 - alpha) * Q[s, a] + alpha * (R[s'] + gamma * max(Q[s', :]))

  where ``s`` is the state before the move and ``s'`` the state after it.
  The episode stops after ``max_steps`` attempts or as soon as the mouse
  reaches ``final_pos``.

  Args:
    num_rows: Number of rows in the grid.
    num_cols: Number of columns in the grid.
    alpha: Learning rate.
    gamma: Discount applied to the best future Q-value.
    Q: Table indexed as Q[state, action]; modified in place. It needs one
      column per entry of the move list below.
    R: Reward per state, indexed by the row-major state index.
    max_steps: Maximum number of move attempts in the episode.
    start_pos: (row, col) tuple where the mouse starts.
    final_pos: (row, col) tuple that ends the episode when reached.

  Returns:
    None; the result of the episode is the mutation of ``Q``.
  """
  # (row, col) offsets; an entry's position in this list is its column in Q.
  actions = [(-1, 0), (1, 0), (0, -1), (0, 1)]
  cur_pos = start_pos

  for _ in range(max_steps):
    # Sample over the whole move list so that extending ``actions`` does not
    # require touching a separate magic number.
    act_idx = rand.randrange(len(actions))
    next_pos = tup_add(cur_pos, actions[act_idx])
    if not is_pos_valid(next_pos, num_rows, num_cols):
      # An off-grid move is discarded but still uses up one attempt.
      continue

    prev_state = convert_2d_index_to_1d(cur_pos, num_cols)
    cur_pos = next_pos
    cur_state = convert_2d_index_to_1d(cur_pos, num_cols)

    best_future = compute_max_future_reward(Q, cur_state)
    Q[prev_state, act_idx] = (
      (1 - alpha) * Q[prev_state, act_idx]
      + alpha * (R[cur_state] + gamma * best_future)
    )

    if cur_pos == final_pos:
      break

def learn(num_rows: int, num_cols: int, rewards: list, alpha: float, gamma: float,
          num_episodes: int = 100, max_steps: int = 5):
  """Train a Q-table on the grid and print it.

  A zero-initialised table with one row per grid cell and one column per
  move is created, ``episode_learn`` is run ``num_episodes`` times on it,
  and the resulting table is printed.

  Args:
    num_rows: Number of rows in the grid.
    num_cols: Number of columns in the grid.
    rewards: Reward for each state, indexed by the row-major state index.
    alpha: Learning rate passed to every episode.
    gamma: Discount factor passed to every episode.
    num_episodes: How many episodes to run.
    max_steps: Maximum number of move attempts per episode.

  Returns:
    None; the learned table is written to standard output.
  """
  num_states = num_rows * num_cols
  # Number of moves available from a state; must equal the number of
  # actions that episode_learn chooses from.
  num_moves = 4
  # The Q-value evaluation matrix, indexed as Q[state, move].
  Q = np.zeros((num_states, num_moves))

  for _ in range(num_episodes):
    episode_learn(num_rows, num_cols, alpha, gamma, Q, rewards, max_steps)
  print(Q)


# Grid dimensions: the mouse can occupy any of the num_rows * num_cols cells.
num_rows = 2
num_cols = 3
# Reward for each state, in row-major order (same layout as the grid).
rewards = [
  0, 1, 0,     # M  C  E
  0, -10, 10,  # E  P  CC
]
# Learning rate
alpha = 0.1
# Weight given to the best future Q-value in the update
gamma = 0.99


learn(num_rows=num_rows, num_cols=num_cols, rewards=rewards,
      alpha=alpha, gamma=gamma)


[[ 0.          1.42000423  0.          2.11735258]
 [ 0.         -8.14697981  1.32102394  1.65555432]
 [ 0.          4.14556535  0.44267555  0.        ]
 [ 1.59474005  0.          0.         -9.28210201]
 [ 0.          0.          0.          0.        ]
 [ 0.1881      0.         -1.          0.        ]]
