In [1]:
import numpy as np
import gym

# Dynamic Programming

In [18]:
class FoodTruck(gym.Env):
    def __init__(self):
        self.v_demand = [100, 200, 300, 400]
        self.p_demand = [0.3, 0.4, 0.2, 0.1]
        self.capacity = self.v_demand[-1]
        self.days = ['Mon', 'Tue', 'Wed', 
                     'Thu', 'Fri', "Weekend"]
        self.unit_cost = 4
        self.net_revenue = 7
        self.action_space = [0, 100, 200, 300, 400]
        self.state_space = [("Mon", 0)] \
                            + [(d, i) for d in self.days[1:] 
                                for i in [0, 100, 200, 300]]
    
    def get_next_state_reward(self, state, action, demand):
        day, inventory = state
        result = {}
        result['next_day'] = self.days[self.days.index(day) \
                                       + 1]
        result['starting_inventory'] = min(self.capacity, 
                                           inventory 
                                           + action)
        result['cost'] = self.unit_cost * action 
        result['sales'] = min(result['starting_inventory'], 
                              demand)
        result['revenue'] = self.net_revenue * result['sales']
        result['next_inventory'] \
            = result['starting_inventory'] - result['sales']
        result['reward'] = result['revenue'] - result['cost']
        return result
    
    def get_transition_prob(self, state, action):
        next_s_r_prob = {}
        for ix, demand in enumerate(self.v_demand):
            result = self.get_next_state_reward(state, 
                                                action, 
                                                demand)
            next_s = (result['next_day'],
                      result['next_inventory'])
            reward = result['reward']
            prob = self.p_demand[ix]
            if (next_s, reward) not in next_s_r_prob:
                next_s_r_prob[next_s, reward] = prob
            else:
                next_s_r_prob[next_s, reward] += prob
        return next_s_r_prob
    
    def reset(self):
        self.day = "Mon"
        self.inventory = 0
        state = (self.day, self.inventory)
        return state
    
    def is_terminal(self, state):
        day, inventory = state
        if day == "Weekend":
            return True
        else:
            return False
    
    def step(self, action):
        demand = np.random.choice(self.v_demand, 
                                  p=self.p_demand)
        result = self.get_next_state_reward((self.day, 
                                             self.inventory), 
                                       action, 
                                       demand)
        self.day = result['next_day']
        self.inventory = result['next_inventory']
        state = (self.day, self.inventory)
        reward = result['reward']
        done = self.is_terminal(state)
        info = {'demand': demand, 'sales': result['sales']}
        return state, reward, done, info

In [19]:
# Simulating an arbitrary policy
np.random.seed(0)
foodtruck = FoodTruck()
rewards = []
for i_episode in range(10000):
    state = foodtruck.reset()
    done = False
    ep_reward = 0
    while not done:
        day, inventory = state
        action = max(0, 300 - inventory)
        state, reward, done, info = foodtruck.step(action) 
        ep_reward += reward
    rewards.append(ep_reward)
np.mean(rewards)

2590.83

In [None]:
# Single day expected reward
ucost = 4
uprice = 7
v_demand = [100, 200, 300, 400]
p_demand = [0.3, 0.4, 0.2, 0.1]
inv = 400
profit = uprice*np.sum([p_demand[i]*min(v_demand[i], inv) for i in range(4)]) - inv*ucost
print(profit)

## Policy Evaluation

In [21]:
def base_policy(states):
    policy = {}
    for s in states:
        day, inventory = s
        prob_a = {} 
        if inventory >= 300:
            prob_a[0] = 1
        else:
            prob_a[200 - inventory] = 0.5
            prob_a[300 - inventory] = 0.5
        policy[s] = prob_a
    return policy

In [22]:
def expected_update(env, v, s, prob_a, gamma):
    expected_value = 0
    for a in prob_a:
        prob_next_s_r = env.get_transition_prob(s, a)
        for next_s, r in prob_next_s_r:
            expected_value += prob_a[a] \
                            * prob_next_s_r[next_s, r] \
                            * (r + gamma * v[next_s])
    return expected_value

In [26]:
def policy_evaluation(env, policy, max_iter=100, 
                      v = None, eps=0.1, gamma=1):
    if not v:
        v = {s: 0 for s in env.state_space}
    k = 0
    while True:
        max_delta = 0
        for s in v:
            if not env.is_terminal(s):
                v_old = v[s]
                prob_a = policy[s]
                v[s] = expected_update(env, v, 
                                       s, prob_a, 
                                       gamma)
                max_delta = max(max_delta, 
                                abs(v[s] - v_old))
        k += 1
        if max_delta < eps:
            print("Converged in", k, "iterations.")
            break
        elif k == max_iter:
            print("Terminating after", k, "iterations.")
            break
    return v

In [52]:
foodtruck = FoodTruck()
policy = base_policy(foodtruck.state_space)

In [53]:
v = policy_evaluation(foodtruck, policy)
print("Expected weekly profit:", v["Mon", 0])

Converged in 6 iterations.
Expected weekly profit: 2515.0


In [54]:
print("The state values:")
v

The state values:


{('Mon', 0): 2515.0,
 ('Tue', 0): 1960.0,
 ('Tue', 100): 2360.0,
 ('Tue', 200): 2760.0,
 ('Tue', 300): 3205.0,
 ('Wed', 0): 1405.0,
 ('Wed', 100): 1805.0,
 ('Wed', 200): 2205.0,
 ('Wed', 300): 2650.0,
 ('Thu', 0): 850.0000000000001,
 ('Thu', 100): 1250.0,
 ('Thu', 200): 1650.0,
 ('Thu', 300): 2095.0,
 ('Fri', 0): 295.00000000000006,
 ('Fri', 100): 695.0000000000001,
 ('Fri', 200): 1095.0,
 ('Fri', 300): 1400.0,
 ('Weekend', 0): 0,
 ('Weekend', 100): 0,
 ('Weekend', 200): 0,
 ('Weekend', 300): 0}

In [57]:
def choose_action(state, policy):
    prob_a = policy[state]
    action = np.random.choice(a=list(prob_a.keys()), 
                              p=list(prob_a.values()))
    return action

def simulate_policy(policy, n_episodes):
    np.random.seed(0)
    foodtruck = FoodTruck()
    rewards = []
    for i_episode in range(n_episodes):
        state = foodtruck.reset()
        done = False
        ep_reward = 0
        while not done:
            action = choose_action(state, policy)
            state, reward, done, info = foodtruck.step(action) 
            ep_reward += reward
        rewards.append(ep_reward)
    print("Expected weekly profit:", np.mean(rewards))

In [58]:
simulate_policy(policy, 1000)

Expected weekly profit: 2518.1


## Policy Iteration

In [39]:
def policy_improvement(env, v, s, actions, gamma):
    prob_a = {}
    if not env.is_terminal(s):
        max_q = np.NINF
        best_a = None
        for a in actions:
            q_sa = expected_update(env, v, s, {a: 1}, gamma)
            if q_sa >= max_q:
                max_q = q_sa
                best_a = a
        prob_a[best_a] = 1
    else:
        max_q = 0
    return prob_a, max_q

In [42]:
def policy_iteration(env,  eps=0.1, gamma=1):
    np.random.seed(1)
    states = env.state_space
    actions = env.action_space
    policy = {s: {np.random.choice(actions): 1}
             for s in states}
    v = {s: 0 for s in states}
    while True:
        v = policy_evaluation(env, policy, v=v, 
                          eps=eps, gamma=gamma)
        old_policy = policy
        policy = {}
        for s in states:
            policy[s], _ = policy_improvement(env, v, s, 
                                    actions, gamma)
        if old_policy == policy:
            break
    print("Optimal policy found!")
    return policy, v

In [43]:
policy, v = policy_iteration(foodtruck)
print("Expected weekly profit:", v["Mon", 0])

Converged in 6 iterations.
Converged in 6 iterations.
Converged in 5 iterations.
Optimal policy found!
Expected weekly profit: 2880.0


In [44]:
print(policy)

{('Mon', 0): {400: 1}, ('Tue', 0): {400: 1}, ('Tue', 100): {300: 1}, ('Tue', 200): {200: 1}, ('Tue', 300): {100: 1}, ('Wed', 0): {400: 1}, ('Wed', 100): {300: 1}, ('Wed', 200): {200: 1}, ('Wed', 300): {100: 1}, ('Thu', 0): {300: 1}, ('Thu', 100): {200: 1}, ('Thu', 200): {100: 1}, ('Thu', 300): {0: 1}, ('Fri', 0): {200: 1}, ('Fri', 100): {100: 1}, ('Fri', 200): {0: 1}, ('Fri', 300): {0: 1}, ('Weekend', 0): {}, ('Weekend', 100): {}, ('Weekend', 200): {}, ('Weekend', 300): {}}


## Value Iteration

In [49]:
def value_iteration(env, max_iter=100, eps=0.1, gamma=1):
    states = env.state_space
    actions = env.action_space
    v = {s: 0 for s in states}
    policy = {}
    k = 0
    while True:
        max_delta = 0
        for s in states:
            old_v = v[s]
            policy[s], v[s] = policy_improvement(env, 
                                                 v, 
                                                 s, 
                                                 actions, 
                                                 gamma)
            max_delta = max(max_delta, abs(v[s] - old_v))
        k += 1
        if max_delta < eps:
            print("Converged in", k, "iterations.")
            break
        elif k == max_iter:
            print("Terminating after", k, "iterations.")
            break
    return policy, v

In [48]:
policy, v = value_iteration(foodtruck)
print("Expected weekly profit:", v["Mon", 0])

Converged in 6 iterations.
6
Expected weekly profit: 2880.0


In [None]:
print(policy)

In [None]:
def generalized_policy_iteration(env, max_iter=2, eps=0.1, gamma=1):
    np.random.seed(1)
    states =  env.observation_space
    actions = env.action_space
    policy = {s: {np.random.choice(actions): 1}
             for s in states}
    v = {s: 0 for s in states}
    k = 0
    while True:
        v_old = v.copy()
        policy = {}
        for s in states:
            policy[s], v[s] = policy_improvement(env, v, s, 
                                    actions, gamma)
        v = policy_evaluation(env, policy, 
                              max_iter=max_iter, v=v, 
                              eps=eps, gamma=gamma)
        max_delta = np.amax([abs(v[s] - v_old[s]) for s in v])
        k += 1
        if max_delta < eps:
            print("GPI converged in", k, "iterations.")
            print([abs(v[s] - v_old[s]) for s in v])
            break
            
    print("Optimal policy found!")
    return policy, v

In [None]:
policy, v = generalized_policy_iteration(foodtruck, max_iter=2, eps=0.1, gamma=1)

In [None]:
print("Expected weekly profit:", v["Mon", 0])
print(policy)

In [None]:
v

# Monte Carlo Methods

## MC Prediction

In [71]:
def first_visit_return(returns, trajectory, gamma):
    G = 0
    T = len(trajectory) - 1
    for t, sar in enumerate(reversed(trajectory)):
        s, a, r = sar
        G = r + gamma * G
        first_visit = True
        for j in range(T - t):
            if s == trajectory[j][0]:
                first_visit = False
        if first_visit:
            if s in returns:
                returns[s].append(G)
            else:
                returns[s] = [G]
    return returns

In [74]:
def get_trajectory(env, policy):
    trajectory = []
    state = env.reset()
    done = False
    sar = [state]
    while not done:
        action = choose_action(state, policy)
        state, reward, done, info = env.step(action)
        sar.append(action)
        sar.append(reward)
        trajectory.append(sar)
        sar = [state]
    return trajectory

In [75]:
def first_visit_mc(env, policy, gamma, n_trajectories):
    np.random.seed(0)
    returns = {}
    v = {}
    for i in range(n_trajectories):
        trajectory = get_trajectory(env, policy)
        returns = first_visit_return(returns, 
                                     trajectory, 
                                     gamma)
    for s in env.state_space:
        if s in returns:
            v[s] = np.round(np.mean(returns[s]), 1)
    return v

In [76]:
foodtruck = FoodTruck()
policy = base_policy(foodtruck.state_space)

In [77]:
v_est = first_visit_mc(foodtruck, policy, 1, 10000)
v_est

{('Mon', 0): 2515.9,
 ('Tue', 0): 1959.1,
 ('Tue', 100): 2362.2,
 ('Tue', 200): 2765.2,
 ('Wed', 0): 1411.3,
 ('Wed', 100): 1804.2,
 ('Wed', 200): 2198.9,
 ('Thu', 0): 852.9,
 ('Thu', 100): 1265.4,
 ('Thu', 200): 1644.4,
 ('Fri', 0): 301.1,
 ('Fri', 100): 696.5,
 ('Fri', 200): 1097.2}

In [78]:
v_true = policy_evaluation(foodtruck, policy)

Converged in 6 iterations.


In [63]:
v_true

{('Mon', 0): 2515.0,
 ('Tue', 0): 1960.0,
 ('Tue', 100): 2360.0,
 ('Tue', 200): 2760.0,
 ('Tue', 300): 3205.0,
 ('Wed', 0): 1405.0,
 ('Wed', 100): 1805.0,
 ('Wed', 200): 2205.0,
 ('Wed', 300): 2650.0,
 ('Thu', 0): 850.0000000000001,
 ('Thu', 100): 1250.0,
 ('Thu', 200): 1650.0,
 ('Thu', 300): 2095.0,
 ('Fri', 0): 295.00000000000006,
 ('Fri', 100): 695.0000000000001,
 ('Fri', 200): 1095.0,
 ('Fri', 300): 1400.0,
 ('Weekend', 0): 0,
 ('Weekend', 100): 0,
 ('Weekend', 200): 0,
 ('Weekend', 300): 0}

In [None]:
# v_est = first_visit_mc(foodtruck, policy, 1, 5)
# {s: v_est[s] for s in sorted(v_est)}

In [None]:
# v_est = first_visit_mc(foodtruck, policy, 1, 10)
# {s: v_est[s] for s in sorted(v_est)}

In [None]:
# v_est = first_visit_mc(foodtruck, policy, 1, 100)
# {s: v_est[s] for s in sorted(v_est)}

In [None]:
# v_est = first_visit_mc(foodtruck, policy, 1, 1000)
# {s: v_est[s] for s in sorted(v_est)}

In [None]:
# v_est = first_visit_mc(foodtruck, policy, 1, 10000)
# {s: v_est[s] for s in sorted(v_est)}

## On-policy Monte Carlo Control

In [85]:
import operator

In [91]:
def get_eps_greedy(actions, eps, a_best):
    prob_a = {}
    n_a = len(actions)
    for a in actions:
        if a == a_best:
            prob_a[a] = 1 - eps + eps/n_a
        else:
            prob_a[a] = eps/n_a
    return prob_a

In [92]:
def get_random_policy(states, actions):
    policy = {}
    n_a = len(actions)
    for s in states:
        policy[s] = {a: 1/n_a for a in actions}
    return policy

In [93]:
def on_policy_first_visit_mc(env, n_iter, eps, gamma):
    np.random.seed(0)
    states =  env.state_space
    actions = env.action_space
    policy =  get_random_policy(states, actions)
    Q = {s: {a: 0 for a in actions} for s in states}
    Q_n = {s: {a: 0 for a in actions} for s in states}
    for i in range(n_iter):
        if i % 10000 == 0:
            print("Iteration:", i)
        trajectory = get_trajectory(env, policy)
        G = 0
        T = len(trajectory) - 1
        for t, sar in enumerate(reversed(trajectory)):
            s, a, r = sar
            G = r + gamma * G
            first_visit = True
            for j in range(T - t):
                s_j = trajectory[j][0]
                a_j = trajectory[j][1]
                if (s, a) == (s_j, a_j):
                    first_visit = False
            if first_visit:
                Q[s][a] = Q_n[s][a] * Q[s][a] + G
                Q_n[s][a] += 1
                Q[s][a] /= Q_n[s][a]
                a_best = max(Q[s].items(), 
                             key=operator.itemgetter(1))[0]
                policy[s] = get_eps_greedy(actions, 
                                           eps, 
                                           a_best)
    return policy, Q, Q_n

In [94]:
policy, Q, Q_n = on_policy_first_visit_mc(foodtruck, 
                                          300000, 
                                          0.05, 
                                          1)

Iteration: 0
Iteration: 10000
Iteration: 20000
Iteration: 30000
Iteration: 40000
Iteration: 50000
Iteration: 60000
Iteration: 70000
Iteration: 80000
Iteration: 90000
Iteration: 100000
Iteration: 110000
Iteration: 120000
Iteration: 130000
Iteration: 140000
Iteration: 150000
Iteration: 160000
Iteration: 170000
Iteration: 180000
Iteration: 190000
Iteration: 200000
Iteration: 210000
Iteration: 220000
Iteration: 230000
Iteration: 240000
Iteration: 250000
Iteration: 260000
Iteration: 270000
Iteration: 280000
Iteration: 290000


In [90]:
policy

{('Mon', 0): {0: 0.01, 100: 0.01, 200: 0.01, 300: 0.01, 400: 0.96},
 ('Tue', 0): {0: 0.01, 100: 0.01, 200: 0.01, 300: 0.01, 400: 0.96},
 ('Tue', 100): {0: 0.01, 100: 0.01, 200: 0.01, 300: 0.96, 400: 0.01},
 ('Tue', 200): {0: 0.01, 100: 0.01, 200: 0.96, 300: 0.01, 400: 0.01},
 ('Tue', 300): {0: 0.01, 100: 0.96, 200: 0.01, 300: 0.01, 400: 0.01},
 ('Wed', 0): {0: 0.01, 100: 0.01, 200: 0.01, 300: 0.01, 400: 0.96},
 ('Wed', 100): {0: 0.01, 100: 0.01, 200: 0.01, 300: 0.96, 400: 0.01},
 ('Wed', 200): {0: 0.01, 100: 0.01, 200: 0.96, 300: 0.01, 400: 0.01},
 ('Wed', 300): {0: 0.01, 100: 0.96, 200: 0.01, 300: 0.01, 400: 0.01},
 ('Thu', 0): {0: 0.01, 100: 0.01, 200: 0.01, 300: 0.96, 400: 0.01},
 ('Thu', 100): {0: 0.01, 100: 0.01, 200: 0.96, 300: 0.01, 400: 0.01},
 ('Thu', 200): {0: 0.01, 100: 0.96, 200: 0.01, 300: 0.01, 400: 0.01},
 ('Thu', 300): {0: 0.96, 100: 0.01, 200: 0.01, 300: 0.01, 400: 0.01},
 ('Fri', 0): {0: 0.01, 100: 0.01, 200: 0.96, 300: 0.01, 400: 0.01},
 ('Fri', 100): {0: 0.01, 100: 

In [95]:
Q

{('Mon', 0): {0: 2162.733333333329,
  100: 2468.4210526315796,
  200: 2668.7695190505888,
  300: 2739.300098231826,
  400: 2809.1632287569414},
 ('Tue', 0): {0: 1539.1011235955057,
  100: 1857.630979498861,
  200: 2018.3222958057395,
  300: 2101.97486535009,
  400: 2181.249139237035},
 ('Tue', 100): {0: 2243.7967115097176,
  100: 2410.7182940516295,
  200: 2537.853107344635,
  300: 2587.222441722628,
  400: 2170.4049844236765},
 ('Tue', 200): {0: 2828.295819935689,
  100: 2953.6330631123433,
  200: 2996.437255166801,
  300: 2623.82297551789,
  400: 2224.710080285464},
 ('Tue', 300): {0: 3383.880037488284,
  100: 3395.720002238628,
  200: 2939.4218134034168,
  300: 2572.2506393861877,
  400: 2162.3395149786},
 ('Wed', 0): {0: 935.7142857142857,
  100: 1256.8720379146928,
  200: 1400.5025125628129,
  300: 1547.1040492055338,
  400: 1579.8683874265244},
 ('Wed', 100): {0: 1639.7689768976904,
  100: 1868.1431005110733,
  200: 1908.107074569789,
  300: 1989.5285532259934,
  400: 1605.021520

## Off-policy Monte Carlo Control

In [108]:
def off_policy_mc(env, n_iter, eps, gamma):
    np.random.seed(0)
    states =  env.state_space
    actions = env.action_space
    Q = {s: {a: 0 for a in actions} for s in states}
    C = {s: {a: 0 for a in actions} for s in states}
    target_policy = {}
    behavior_policy = get_random_policy(states, 
                                        actions)
    for i in range(n_iter):
        if i % 10000 == 0:
            print("Iteration:", i)
        trajectory = get_trajectory(env, 
                                    behavior_policy)
        G = 0
        W = 1
        T = len(trajectory) - 1
        for t, sar in enumerate(reversed(trajectory)):
            s, a, r = sar
            G = r + gamma * G
            C[s][a] += W
            Q[s][a] += (W/C[s][a]) * (G - Q[s][a])
            a_best = max(Q[s].items(), 
                         key=operator.itemgetter(1))[0]
            target_policy[s] = a_best
            behavior_policy[s] = get_eps_greedy(actions, 
                                                eps, 
                                                a_best)
            if a != target_policy[s]:
                break
            W = W / behavior_policy[s][a]
    target_policy = {s: target_policy[s] for s in states
                                   if s in target_policy}
    return target_policy, Q

In [109]:
policy, Q = off_policy_mc(foodtruck, 300000, 0.05, 1)

Iteration: 0
Iteration: 10000
Iteration: 20000
Iteration: 30000
Iteration: 40000
Iteration: 50000
Iteration: 60000
Iteration: 70000
Iteration: 80000
Iteration: 90000
Iteration: 100000
Iteration: 110000
Iteration: 120000
Iteration: 130000
Iteration: 140000
Iteration: 150000
Iteration: 160000
Iteration: 170000
Iteration: 180000
Iteration: 190000
Iteration: 200000
Iteration: 210000
Iteration: 220000
Iteration: 230000
Iteration: 240000
Iteration: 250000
Iteration: 260000
Iteration: 270000
Iteration: 280000
Iteration: 290000


In [110]:
policy

{('Mon', 0): 400,
 ('Tue', 0): 400,
 ('Tue', 100): 300,
 ('Tue', 200): 200,
 ('Tue', 300): 100,
 ('Wed', 0): 400,
 ('Wed', 100): 300,
 ('Wed', 200): 200,
 ('Wed', 300): 100,
 ('Thu', 0): 300,
 ('Thu', 100): 200,
 ('Thu', 200): 100,
 ('Thu', 300): 0,
 ('Fri', 0): 200,
 ('Fri', 100): 100,
 ('Fri', 200): 0,
 ('Fri', 300): 0}

In [111]:
Q

{('Mon', 0): {0: 2232.674050632915,
  100: 2539.364696421396,
  200: 2725.681570338065,
  300: 2822.8136882129284,
  400: 2878.458190025779},
 ('Tue', 0): {0: 1594.8051948051952,
  100: 1928.976034858388,
  200: 2067.4576271186465,
  300: 2207.8512396694205,
  400: 2239.8886329583893},
 ('Tue', 100): {0: 2318.9435336976317,
  100: 2536.8012422360302,
  200: 2549.486301369862,
  300: 2650.193090274893,
  400: 2256.120527306967},
 ('Tue', 200): {0: 2922.175290390706,
  100: 3012.8990770161868,
  200: 3052.4769607403373,
  300: 2689.515219842163,
  400: 2293.305439330548},
 ('Tue', 300): {0: 3420.032031538755,
  100: 3453.749726573689,
  200: 3014.1210374639763,
  300: 2635.802469135803,
  400: 2233.3333333333344},
 ('Wed', 0): {0: 927.9702970297026,
  100: 1303.1026252983302,
  200: 1428.831168831168,
  300: 1566.1498708010329,
  400: 1616.5133331502423},
 ('Wed', 100): {0: 1683.8652482269495,
  100: 1896.0360360360366,
  200: 1976.8450184501858,
  300: 2024.3386976631361,
  400: 1650.87

# TD Learning

## TD Prediction

In [116]:
def one_step_td_prediction(env, policy, gamma, alpha, n_iter):
    np.random.seed(0)
    states = env.state_space
    v = {s: 0 for s in states}
    s = env.reset()
    for i in range(n_iter):
        a = choose_action(s, policy)
        s_next, reward, done, info = env.step(a)
        v[s] += alpha * (reward + gamma * v[s_next] - v[s])
        if done:
            s = env.reset()
        else:
            s = s_next
    return v

In [117]:
policy = base_policy(foodtruck.state_space)
v = one_step_td_prediction(foodtruck, policy, 1, 0.01, 100000)
v

{('Mon', 0): 2506.576417395407,
 ('Tue', 0): 1956.077876400167,
 ('Tue', 100): 2368.7400039407535,
 ('Tue', 200): 2767.5069659225423,
 ('Tue', 300): 0,
 ('Wed', 0): 1413.0055559001296,
 ('Wed', 100): 1813.546186490315,
 ('Wed', 200): 2200.8873259700867,
 ('Wed', 300): 0,
 ('Thu', 0): 828.2915189850011,
 ('Thu', 100): 1280.424626614422,
 ('Thu', 200): 1675.8661846955831,
 ('Thu', 300): 0,
 ('Fri', 0): 345.52991944823583,
 ('Fri', 100): 677.4358179389413,
 ('Fri', 200): 1094.8263154150825,
 ('Fri', 300): 0,
 ('Weekend', 0): 0,
 ('Weekend', 100): 0,
 ('Weekend', 200): 0,
 ('Weekend', 300): 0}

In [None]:
print({s: np.round(v[s]) for s in v})

True values
{('Mon', 0): 2515.0,
 ('Tue', 0): 1960.0,
 ('Tue', 100): 2360.0,
 ('Tue', 200): 2760.0,
 ('Tue', 300): 3205.0,
 ('Wed', 0): 1405.0,
 ('Wed', 100): 1805.0,
 ('Wed', 200): 2205.0,
 ('Wed', 300): 2650.0,
 ('Thu', 0): 850.0000000000001,
 ('Thu', 100): 1250.0,
 ('Thu', 200): 1650.0,
 ('Thu', 300): 2095.0,
 ('Fri', 0): 295.00000000000006,
 ('Fri', 100): 695.0000000000001,
 ('Fri', 200): 1095.0,
 ('Fri', 300): 1400.0,
 ('Weekend', 0): 0,
 ('Weekend', 100): 0,
 ('Weekend', 200): 0,
 ('Weekend', 300): 0}

In [118]:
def sarsa(env, gamma, eps, alpha, n_iter):
    np.random.seed(0)
    states = env.state_space
    actions = env.action_space
    Q = {s: {a: 0 for a in actions} for s in states}
    policy = get_random_policy(states, actions)
    s = env.reset()
    a = choose_action(s, policy)
    for i in range(n_iter):
        if i % 100000 == 0:
            print("Iteration:", i)
        s_next, reward, done, info = env.step(a)
        a_best = max(Q[s_next].items(), 
                     key=operator.itemgetter(1))[0]
        policy[s_next] = get_eps_greedy(actions, eps, a_best)
        a_next = choose_action(s_next, policy)
        Q[s][a] += alpha * (reward 
                            + gamma * Q[s_next][a_next] 
                            - Q[s][a])
        if done:
            s = env.reset()
            a_best = max(Q[s].items(), 
                         key=operator.itemgetter(1))[0]
            policy[s] = get_eps_greedy(actions, eps, a_best)
            a = choose_action(s, policy)
        else:
            s = s_next
            a = a_next
    return policy, Q

In [119]:
policy, Q = sarsa(foodtruck, 1, 0.1, 0.01, 1000000)

Iteration: 0
Iteration: 100000
Iteration: 200000
Iteration: 300000
Iteration: 400000
Iteration: 500000
Iteration: 600000
Iteration: 700000
Iteration: 800000
Iteration: 900000


In [120]:
policy

{('Mon', 0): {0: 0.02, 100: 0.02, 200: 0.02, 300: 0.92, 400: 0.02},
 ('Tue', 0): {0: 0.02, 100: 0.02, 200: 0.02, 300: 0.92, 400: 0.02},
 ('Tue', 100): {0: 0.02, 100: 0.02, 200: 0.92, 300: 0.02, 400: 0.02},
 ('Tue', 200): {0: 0.02, 100: 0.92, 200: 0.02, 300: 0.02, 400: 0.02},
 ('Tue', 300): {0: 0.92, 100: 0.02, 200: 0.02, 300: 0.02, 400: 0.02},
 ('Wed', 0): {0: 0.02, 100: 0.02, 200: 0.02, 300: 0.92, 400: 0.02},
 ('Wed', 100): {0: 0.02, 100: 0.02, 200: 0.02, 300: 0.92, 400: 0.02},
 ('Wed', 200): {0: 0.02, 100: 0.02, 200: 0.92, 300: 0.02, 400: 0.02},
 ('Wed', 300): {0: 0.92, 100: 0.02, 200: 0.02, 300: 0.02, 400: 0.02},
 ('Thu', 0): {0: 0.02, 100: 0.02, 200: 0.02, 300: 0.92, 400: 0.02},
 ('Thu', 100): {0: 0.02, 100: 0.02, 200: 0.92, 300: 0.02, 400: 0.02},
 ('Thu', 200): {0: 0.02, 100: 0.92, 200: 0.02, 300: 0.02, 400: 0.02},
 ('Thu', 300): {0: 0.92, 100: 0.02, 200: 0.02, 300: 0.02, 400: 0.02},
 ('Fri', 0): {0: 0.02, 100: 0.02, 200: 0.92, 300: 0.02, 400: 0.02},
 ('Fri', 100): {0: 0.02, 100: 

In [121]:
Q[('Mon', 0)]

{0: 2099.8661156763687,
 100: 2399.8190742726747,
 200: 2604.6629056622382,
 300: 2670.098987213351,
 400: 2632.8387133517112}

In [122]:
def q_learning(env, gamma, eps, alpha, n_iter):
    np.random.seed(0)
    states =  env.state_space
    actions = env.action_space
    Q = {s: {a: 0 for a in actions} for s in states}
    policy = get_random_policy(states, actions)
    s = env.reset()
    for i in range(n_iter):
        if i % 100000 == 0:
            print("Iteration:", i)
        a_best = max(Q[s].items(), 
                     key=operator.itemgetter(1))[0]
        policy[s] = get_eps_greedy(actions, eps, a_best)
        a = choose_action(s, policy)
        s_next, reward, done, info = env.step(a)
        Q[s][a] += alpha * (reward 
                            + gamma * max(Q[s_next].values()) 
                            - Q[s][a])
        if done:
            s = env.reset()
        else:
            s = s_next
    policy = {s: {max(policy[s].items(), 
                 key=operator.itemgetter(1))[0]: 1}
                 for s in states}
    return policy, Q

In [123]:
policy, Q = q_learning(foodtruck, 1, 0.1, 0.01, 1000000)
policy

Iteration: 0
Iteration: 100000
Iteration: 200000
Iteration: 300000
Iteration: 400000
Iteration: 500000
Iteration: 600000
Iteration: 700000
Iteration: 800000
Iteration: 900000


{('Mon', 0): {400: 1},
 ('Tue', 0): {400: 1},
 ('Tue', 100): {300: 1},
 ('Tue', 200): {200: 1},
 ('Tue', 300): {100: 1},
 ('Wed', 0): {400: 1},
 ('Wed', 100): {300: 1},
 ('Wed', 200): {200: 1},
 ('Wed', 300): {100: 1},
 ('Thu', 0): {300: 1},
 ('Thu', 100): {200: 1},
 ('Thu', 200): {100: 1},
 ('Thu', 300): {0: 1},
 ('Fri', 0): {200: 1},
 ('Fri', 100): {100: 1},
 ('Fri', 200): {0: 1},
 ('Fri', 300): {0: 1},
 ('Weekend', 0): {0: 1},
 ('Weekend', 100): {0: 1},
 ('Weekend', 200): {0: 1},
 ('Weekend', 300): {0: 1}}

In [124]:
q_learning(foodtruck, 1, 0.1, 0.01, 20000000)


Iteration: 0
Iteration: 100000
Iteration: 200000
Iteration: 300000
Iteration: 400000
Iteration: 500000
Iteration: 600000
Iteration: 700000
Iteration: 800000
Iteration: 900000
Iteration: 1000000
Iteration: 1100000
Iteration: 1200000
Iteration: 1300000
Iteration: 1400000
Iteration: 1500000
Iteration: 1600000
Iteration: 1700000
Iteration: 1800000
Iteration: 1900000
Iteration: 2000000
Iteration: 2100000
Iteration: 2200000
Iteration: 2300000
Iteration: 2400000
Iteration: 2500000
Iteration: 2600000
Iteration: 2700000
Iteration: 2800000
Iteration: 2900000
Iteration: 3000000
Iteration: 3100000
Iteration: 3200000
Iteration: 3300000
Iteration: 3400000
Iteration: 3500000
Iteration: 3600000
Iteration: 3700000
Iteration: 3800000
Iteration: 3900000
Iteration: 4000000
Iteration: 4100000
Iteration: 4200000
Iteration: 4300000
Iteration: 4400000
Iteration: 4500000
Iteration: 4600000
Iteration: 4700000
Iteration: 4800000
Iteration: 4900000
Iteration: 5000000
Iteration: 5100000
Iteration: 5200000
Iteratio

({('Mon', 0): {400: 1},
  ('Tue', 0): {300: 1},
  ('Tue', 100): {300: 1},
  ('Tue', 200): {200: 1},
  ('Tue', 300): {100: 1},
  ('Wed', 0): {400: 1},
  ('Wed', 100): {300: 1},
  ('Wed', 200): {200: 1},
  ('Wed', 300): {100: 1},
  ('Thu', 0): {300: 1},
  ('Thu', 100): {200: 1},
  ('Thu', 200): {100: 1},
  ('Thu', 300): {0: 1},
  ('Fri', 0): {200: 1},
  ('Fri', 100): {100: 1},
  ('Fri', 200): {0: 1},
  ('Fri', 300): {0: 1},
  ('Weekend', 0): {0: 1},
  ('Weekend', 100): {0: 1},
  ('Weekend', 200): {0: 1},
  ('Weekend', 300): {0: 1}},
 {('Mon', 0): {0: 2225.749496682385,
   100: 2528.178263359892,
   200: 2752.245336408776,
   300: 2833.598086662411,
   400: 2865.5080973287336},
  ('Tue', 0): {0: 1627.1674196319675,
   100: 1926.185189822399,
   200: 2130.63971600556,
   300: 2235.794644930646,
   400: 2202.700921685597},
  ('Tue', 100): {0: 2323.642847941712,
   100: 2546.146008882256,
   200: 2622.2014944709003,
   300: 2704.7958165719538,
   400: 2254.9865435917945},
  ('Tue', 200): {0:

In [None]:
Q

{('Mon', 0): 2880.0,
 ('Tue', 0): 2250.0,
 ('Tue', 100): 2650.0,
 ('Tue', 200): 3050.0,
 ('Tue', 300): 3450.0,
 ('Wed', 0): 1620.0,
 ('Wed', 100): 2020.0,
 ('Wed', 200): 2420.0,
 ('Wed', 300): 2820.0,
 ('Thu', 0): 990.0,
 ('Thu', 100): 1390.0,
 ('Thu', 200): 1790.0,
 ('Thu', 300): 2190.0,
 ('Fri', 0): 390.00000000000006,
 ('Fri', 100): 790.0000000000001,
 ('Fri', 200): 1190.0,
 ('Fri', 300): 1400.0,
 ('Weekend', 0): 0,
 ('Weekend', 100): 0,
 ('Weekend', 200): 0,
 ('Weekend', 300): 0}