In [4]:
# imports
import numpy as np

# Question 1: Implementation of Easy21

Below I create an environment implementing the game.
- reset(): generates the start state
- _draw_card() -> card: returns a card represented as a dictionary with "value" and "colour" keys
- step(action) -> (state, reward, terminated): takes an action (either "stick" or "hit") and returns the new state, a reward (either -1, 0, or +1), and a Boolean denoting whether the terminal state has been reached


In [162]:
class Environment():
    """Easy21 card-game environment.

    The state is a dict with two entries:

    - ``"dealer_showing"``: the dealer's first card, drawn uniformly from 1..10.
    - ``"player_sum"``: the player's running total, initially drawn uniformly
      from 1..10.

    Cards have a value in 1..10 and a colour; black cards (probability 2/3)
    are added to a sum and red cards (probability 1/3) are subtracted.
    A sum above 21 or below 1 is "bust".
    """

    def __init__(self):
        # Start from a freshly sampled state; reset() assigns self.state.
        self.reset()

    def reset(self):
        """Sample a new start state, store it in ``self.state`` and return it.

        Returns:
            dict: the new state. This is the environment's own dict object,
            which later ``step("hit")`` calls mutate in place; copy it if a
            snapshot is needed.
        """
        self.state = {
            "dealer_showing": np.random.choice(np.arange(1, 11)),
            "player_sum": np.random.choice(np.arange(1, 11)),
        }
        return self.state

    def _draw_card(self):
        """Draw one card.

        Returns:
            dict: ``{"value": v, "colour": c}`` with ``v`` uniform in 1..10 and
            ``c`` equal to ``"red"`` with probability 1/3 or ``"black"`` with
            probability 2/3.
        """
        value = np.random.choice(np.arange(1, 11))
        colour = np.random.choice(["red", "black"], p=[1/3, 2/3])
        return {
            "value": value,
            "colour": colour,
        }

    @staticmethod
    def _apply_card(total, card):
        """Return ``total`` with ``card`` applied: black adds, any other colour subtracts."""
        if card["colour"] == "black":
            return total + card["value"]
        return total - card["value"]

    @staticmethod
    def _is_bust(total):
        """Return True when ``total`` lies outside the valid range 1..21."""
        return total > 21 or total < 1

    def step(self, action):
        """Advance the game by one player action.

        Args:
            action: ``"hit"`` to draw a card for the player, or ``"stick"`` to
                end the player's turn and let the dealer play out.

        Returns:
            tuple: ``(state, reward, terminated)`` where ``state`` is
            ``self.state`` (the live dict, not a copy), ``reward`` is -1, 0 or
            +1, and ``terminated`` is True once the episode has ended.

        Raises:
            ValueError: if ``action`` is neither ``"hit"`` nor ``"stick"``.
        """
        if action == "hit":
            card = self._draw_card()
            self.state["player_sum"] = self._apply_card(self.state["player_sum"], card)

            # The player loses immediately on going bust; otherwise play continues.
            if self._is_bust(self.state["player_sum"]):
                return self.state, -1, True
            return self.state, 0, False

        if action == "stick":
            dealer_sum = self.state["dealer_showing"]

            # The dealer hits below 17 but must stop as soon as it is bust.
            # Without the bust check a dealer whose sum fell below 1 would keep
            # drawing and could climb back into the valid range.
            while not self._is_bust(dealer_sum) and dealer_sum < 17:
                dealer_sum = self._apply_card(dealer_sum, self._draw_card())

            player_sum = self.state["player_sum"]
            if self._is_bust(dealer_sum) or player_sum > dealer_sum:
                # Player wins: dealer went bust or player holds the larger sum.
                reward = 1
            elif player_sum == dealer_sum:
                # Draw.
                reward = 0
            else:
                # Player loses.
                reward = -1
            return self.state, reward, True

        raise ValueError(f'action must be "hit" or "stick", got {action!r}')

# Question 2: Monte-Carlo Control

In [234]:
def uniform_random_policy():
    """Pick an action at random, ignoring the state.

    Returns:
        str: ``"hit"`` when a uniform draw from [0, 1) exceeds 0.5,
        otherwise ``"stick"``.
    """
    # The original expression was evaluated but never returned, so the
    # function always yielded None.
    return "hit" if np.random.uniform() > 0.5 else "stick"

# Two zero-initialised 21 x 10 tables of identical shape and dtype.
# NOTE(review): presumably rows are player sums 1..21 and columns are the
# dealer's showing card 1..10 — confirm against the code that indexes them.
values = np.zeros(shape=(21, 10))
returns = np.zeros(values.shape, dtype=values.dtype)

# Build the environment, then sample a fresh start state for it.
env = Environment()
env.reset()


array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 