In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
ACTIONS_SPACE = ('U','D','L','R')

class grid:
    
    def __init__(self,rows,cols,start_position):
        self.rows = rows
        self.cols = cols
        self.i = start_position[0]
        self.j = start_position[1]
    
    def set_grid(self,rewards,actions):
        self.actions = actions #allowed actions in each state
        self.rewards = rewards
        
    def set_state(self,s):
        self.i = s[0]
        self.j = s[1]
    
    def current_state(self):
        return (self.i,self.j)
    
    def is_terminal(self,s):
        return s not in self.actions
    
    def reset(self):
        self.i = 2
        self.j = 0
        return (self.i,self.j)
        
    def move(self,a):
        if a in self.actions[(self.i,self.j)]:
            if a=='U':
                self.i -= 1
            elif a=='D':
                self.i += 1
            elif a=='L':
                self.j -= 1
            elif a=='R':
                self.j += 1
        
        return self.rewards.get((self.i,self.j),0)
    
    def get_next_state(self,s,a):
        i = s[0]
        j = s[1]
        if a in self.actions[(i,j)]:
            if a=='U':
                i -= 1
            elif a=='D':
                i += 1
            elif a=='L':
                j -= 1
            elif a=='R':
                j += 1
        return i,j
    
    def game_over(self):
        return (self.i,self.j) not in self.actions
    
    def all_states(self):
        return set(self.actions.keys()) | set(self.rewards.keys())
##################################################################

def standard_grid():
    g = grid(3,4,(2,0))
    rewards = {(0,3): 1 , (1,3): -1}
    actions = {
        (0,0): {'D','R'},
        (0,1): {'L','R'},
        (0,2): {'L','D','R'},
        (1,0): {'U','D'},
        (1,2): {'U','D','R'},
        (2,0): {'U','R'},
        (2,1): {'L','R'},
        (2,2): {'L','R','U'},
        (2,3): {'L','U'}
    }
    
    g.set_grid(rewards,actions)
    return g 


def negative_grid(step_cost = -0.1):
    g = standard_grid()
    g.rewards.update({
        (0,0): step_cost,
        (0,1): step_cost,
        (0,2): step_cost,
        (1,0): step_cost,
        (1,2): step_cost,
        (2,0): step_cost,
        (2,1): step_cost,
        (2,2): step_cost,
        (2,3): step_cost,
        })
    return g
################################################################

class WindyGrid:
    
    def __init__(self,rows,cols,start):
        self.rows = rows
        self.cols = cols
        self.i = start[0]
        self.j = start[1]
    def set_grid(self,rewards,actions,probs):
        self.rewards = rewards
        self.actions = actions
        self.probs = probs
    
    def set_state(self, s):
        self.i = s[0]
        self.j = s[1]
    
    def current_state(self):
        return (self.i,self.j)
    
    def is_terminal(self,s):
        return s not in self.actions
    
    def move(self,action):
        s = (self.i,self.j)
        a = action
        next_states_and_probs = self.probs[(s,a)]
        next_states = list(next_states_and_probs.keys())
        next_probs = list(next_states_and_probs.values())
        next_state_idx = np.random.choice(len(next_probs), p = next_probs)
        s2 = next_states[next_state_idx]
        
        self.i,self.j = s2
        
        return self.rewards.get(s2,0)
    
    def game_over(self):
        return (self.i,self.j) not in self.actions
    
    def all_states(self):
        return set(self.actions.keys()) | set(self.rewards.keys())

    
def windy_grid():
    g = WindyGrid(3,4,(2,0))
    rewards = {(0,3): 1 , (1,3): -1}
    actions = {
        (0, 0): ('D', 'R'),
        (0, 1): ('L', 'R'),
        (0, 2): ('L', 'D', 'R'),
        (1, 0): ('U', 'D'),
        (1, 2): ('U', 'D', 'R'),
        (2, 0): ('U', 'R'),
        (2, 1): ('L', 'R'),
        (2, 2): ('L', 'R', 'U'),
        (2, 3): ('L', 'U'),
      }

    probs = {
        ((2, 0), 'U'): {(1, 0): 1.0},
        ((2, 0), 'D'): {(2, 0): 1.0},
        ((2, 0), 'L'): {(2, 0): 1.0},
        ((2, 0), 'R'): {(2, 1): 1.0},
        ((1, 0), 'U'): {(0, 0): 1.0},
        ((1, 0), 'D'): {(2, 0): 1.0},
        ((1, 0), 'L'): {(1, 0): 1.0},
        ((1, 0), 'R'): {(1, 0): 1.0},
        ((0, 0), 'U'): {(0, 0): 1.0},
        ((0, 0), 'D'): {(1, 0): 1.0},
        ((0, 0), 'L'): {(0, 0): 1.0},
        ((0, 0), 'R'): {(0, 1): 1.0},
        ((0, 1), 'U'): {(0, 1): 1.0},
        ((0, 1), 'D'): {(0, 1): 1.0},
        ((0, 1), 'L'): {(0, 0): 1.0},
        ((0, 1), 'R'): {(0, 2): 1.0},
        ((0, 2), 'U'): {(0, 2): 1.0},
        ((0, 2), 'D'): {(1, 2): 1.0},
        ((0, 2), 'L'): {(0, 1): 1.0},
        ((0, 2), 'R'): {(0, 3): 1.0},
        ((2, 1), 'U'): {(2, 1): 1.0},
        ((2, 1), 'D'): {(2, 1): 1.0},
        ((2, 1), 'L'): {(2, 0): 1.0},
        ((2, 1), 'R'): {(2, 2): 1.0},
        ((2, 2), 'U'): {(1, 2): 1.0},
        ((2, 2), 'D'): {(2, 2): 1.0},
        ((2, 2), 'L'): {(2, 1): 1.0},
        ((2, 2), 'R'): {(2, 3): 1.0},
        ((2, 3), 'U'): {(1, 3): 1.0},
        ((2, 3), 'D'): {(2, 3): 1.0},
        ((2, 3), 'L'): {(2, 2): 1.0},
        ((2, 3), 'R'): {(2, 3): 1.0},
        ((1, 2), 'U'): {(0, 2): 0.5, (1, 3): 0.5},
        ((1, 2), 'D'): {(2, 2): 1.0},
        ((1, 2), 'L'): {(1, 2): 1.0},
        ((1, 2), 'R'): {(1, 3): 1.0},
      }
    g.set(rewards, actions, probs)
    return g


def windy_grid_penalized(step_cost=-0.1):
    g = WindyGrid(3, 4, (2, 0))
    rewards = {
        (0, 0): step_cost,
        (0, 1): step_cost,
        (0, 2): step_cost,
        (1, 0): step_cost,
        (1, 2): step_cost,
        (2, 0): step_cost,
        (2, 1): step_cost,
        (2, 2): step_cost,
        (2, 3): step_cost,
        (0, 3): 1,
        (1, 3): -1
      }
    actions = {
        (0, 0): ('D', 'R'),
        (0, 1): ('L', 'R'),
        (0, 2): ('L', 'D', 'R'),
        (1, 0): ('U', 'D'),
        (1, 2): ('U', 'D', 'R'),
        (2, 0): ('U', 'R'),
        (2, 1): ('L', 'R'),
        (2, 2): ('L', 'R', 'U'),
        (2, 3): ('L', 'U'),
      }
    
    probs = {
        ((2, 0), 'U'): {(1, 0): 1.0},
        ((2, 0), 'D'): {(2, 0): 1.0},
        ((2, 0), 'L'): {(2, 0): 1.0},
        ((2, 0), 'R'): {(2, 1): 1.0},
        ((1, 0), 'U'): {(0, 0): 1.0},
        ((1, 0), 'D'): {(2, 0): 1.0},
        ((1, 0), 'L'): {(1, 0): 1.0},
        ((1, 0), 'R'): {(1, 0): 1.0},
        ((0, 0), 'U'): {(0, 0): 1.0},
        ((0, 0), 'D'): {(1, 0): 1.0},
        ((0, 0), 'L'): {(0, 0): 1.0},
        ((0, 0), 'R'): {(0, 1): 1.0},
        ((0, 1), 'U'): {(0, 1): 1.0},
        ((0, 1), 'D'): {(0, 1): 1.0},
        ((0, 1), 'L'): {(0, 0): 1.0},
        ((0, 1), 'R'): {(0, 2): 1.0},
        ((0, 2), 'U'): {(0, 2): 1.0},
        ((0, 2), 'D'): {(1, 2): 1.0},
        ((0, 2), 'L'): {(0, 1): 1.0},
        ((0, 2), 'R'): {(0, 3): 1.0},
        ((2, 1), 'U'): {(2, 1): 1.0},
        ((2, 1), 'D'): {(2, 1): 1.0},
        ((2, 1), 'L'): {(2, 0): 1.0},
        ((2, 1), 'R'): {(2, 2): 1.0},
        ((2, 2), 'U'): {(1, 2): 1.0},
        ((2, 2), 'D'): {(2, 2): 1.0},
        ((2, 2), 'L'): {(2, 1): 1.0},
        ((2, 2), 'R'): {(2, 3): 1.0},
        ((2, 3), 'U'): {(1, 3): 1.0},
        ((2, 3), 'D'): {(2, 3): 1.0},
        ((2, 3), 'L'): {(2, 2): 1.0},
        ((2, 3), 'R'): {(2, 3): 1.0},
        ((1, 2), 'U'): {(0, 2): 0.5, (1, 3): 0.5},
        ((1, 2), 'D'): {(2, 2): 1.0},
        ((1, 2), 'L'): {(1, 2): 1.0},
        ((1, 2), 'R'): {(1, 3): 1.0},
      }
    g.set(rewards, actions, probs)
    return g


def epsilon_greedy(policy, s , eps = 0.1):
    p = np.random.random()
    if p < (1- eps):
        return policy[s]
    else:
        return np.random.choice(ACTIONS_SPACE)
        
    


def print_policy(P, g):
    for i in range(g.rows):
        print("---------------------------")
        for j in range(g.cols):
            a = P.get((i,j), ' ')
            print("  %s  |" % a, end="")
        print("")

def print_values(V, g):
    for i in range(g.rows):
        print("---------------------------")
        for j in range(g.cols):
            v = V.get((i,j), 0)
            if v >= 0:
                print(" %.2f|" % v, end="")
            else:
                print("%.2f|" % v, end="")
        print("")
        


In [3]:
g = standard_grid()

num = 6
idx = np.random.choice(len(g.actions.keys()),num)
idx = np.unique(idx)
i = 0 
l = []
beta = 0.3
phi = {}
for key in g.all_states():
    if i in idx:
        l.append(list(key))
    i+=1
l = np.array(l)
for key in g.all_states():
    phi[key] = np.array([])
    for i in l:
        phi[key] = np.append(phi[key],np.exp(-beta * np.linalg.norm(np.array(key)-i)**2))
phi

{(0,
  1): array([1.        , 0.54881164, 0.30119421, 0.22313016, 0.09071795,
        0.22313016]),
 (1,
  2): array([0.54881164, 1.        , 0.54881164, 0.22313016, 0.54881164,
        0.74081822]),
 (2,
  1): array([0.30119421, 0.54881164, 0.09071795, 0.74081822, 0.30119421,
        0.74081822]),
 (0,
  0): array([0.74081822, 0.22313016, 0.06720551, 0.30119421, 0.02024191,
        0.09071795]),
 (0,
  3): array([0.30119421, 0.54881164, 1.        , 0.02024191, 0.30119421,
        0.22313016]),
 (2,
  0): array([0.22313016, 0.22313016, 0.02024191, 1.        , 0.06720551,
        0.30119421]),
 (2,
  3): array([0.09071795, 0.54881164, 0.30119421, 0.06720551, 1.        ,
        0.74081822]),
 (0,
  2): array([0.74081822, 0.74081822, 0.74081822, 0.09071795, 0.22313016,
        0.30119421]),
 (2,
  2): array([0.22313016, 0.74081822, 0.22313016, 0.30119421, 0.74081822,
        1.        ]),
 (1,
  0): array([0.54881164, 0.30119421, 0.04978707, 0.74081822, 0.04978707,
        0.22313016]),


In [4]:
# TD(0)
policy = {
        (0, 0): 'R',
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 0): 'U',
        (1, 2): 'U',
        (2, 0): 'U',
        (2, 1): 'R',
        (2, 2): 'U',
        (2, 3): 'L',
        }

policy = {
    (2, 0): 'U',
    (1, 0): 'U',
    (0, 0): 'R',
    (0, 1): 'R',
    (0, 2): 'R',
    (1, 2): 'R',
    (2, 1): 'R',
    (2, 2): 'R',
    (2, 3): 'U',
  }

'''
policy = {
        (0, 0): {'D':0.5, 'R':0.5},
        (0, 1): {'L':0.7, 'R':0.3},
        (0, 2): {'L':0.6, 'D':0.2, 'R':0.2},
        (1, 0): {'U':0.2, 'D':0.8},
        (1, 2): {'U':0.5, 'D':0.4, 'R':0.1},
        (2, 0): {'U':0.5, 'R':0.5},
        (2, 1): {'L':0.2, 'R':0.8},
        (2, 2): {'L':0.5, 'R':0.3, 'U':0.2},
        (2, 3): {'L':0.3, 'U':0.7},
      }
'''
w = np.zeros(np.shape(l)[0])

alpha = 0.1
gamma = 0.9
deltas = []
#while True:
n_episodes = 20000
for _ in range(n_episodes):
    s = g.reset()
    while not g.game_over():
        a = epsilon_greedy(policy,s)
        r = g.move(a)
        s2 = g.current_state()
        if g.is_terminal(s2):
            y = r
        else:
            y = r+gamma*np.matmul(phi[s2],w.T)
        
        w = w + alpha*(y - np.matmul(phi[s],w.T))*phi[s]
        s = s2
        
V = {}
for key,val in phi.items():
    if g.is_terminal(key):
        V[key] = 0
    else:
        V[key] = np.matmul(phi[key],w) 

print_values(V,g)
print_policy(policy,g)

---------------------------
 0.75| 0.82| 0.97| 0.00|
---------------------------
 0.71| 0.00|-0.28| 0.00|
---------------------------
 0.61|-0.31|-1.09|-0.79|
---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  R  |     |
---------------------------
  U  |  R  |  R  |  U  |
