In [1]:
# imports
import numpy as np

# Question 1: Implementation of Easy21

Below I create an environment implementing the game.
- reset(): generates the start state
- _draw_card() -> card: returns a card represented as a dictionary with "value" and "colour" keys
- step(action): -> (state, reward, terminated): takes an action (either "stick" or "hit") and returns a new state, a rewards (either -1, 0, or +1), and a Boolean denoting whether the terminal state has been reached


In [127]:
class Environment():
    def __init__(self):
        self.state = {
            "dealer_showing" : np.random.choice(np.arange(1, 11)),
            "player_sum": np.random.choice(np.arange(1, 11))
        }
        
    def reset(self):
        self.state = {
            "dealer_showing" : np.random.choice(np.arange(1, 11)),
            "player_sum": np.random.choice(np.arange(1, 11))
        }

        return self.state
        
    
    def _draw_card(self):
        value = np.random.choice(np.arange(1, 11))
        colour = np.random.choice(["red", "black"], p=[1/3, 2/3])
        card = {
            "value": value,
            "colour": colour,
        }
        return card
    
    
    def step(self, action):
        if action == "hit":
            card = self._draw_card()
            # Update the state based on the drawn card
            if card["colour"] == "black":
                self.state["player_sum"] = self.state["player_sum"] + card["value"]
            else:
                self.state["player_sum"] = self.state["player_sum"] - card["value"]

            # Terminate and return -1 if player went bust
            if self.state["player_sum"] > 21 or self.state["player_sum"] < 1:
                reward = -1
                terminated = True
            # Otherwise return a reward of 0
            else:
                terminated = False
                reward = 0
        
        if action == "stick":
            # We can already terminate as the dealer will keep drawing until a conclusion is reached
            terminated = True
            dealer_sum = self.state["dealer_showing"]
    
            while dealer_sum < 17:
                dealer_card = self._draw_card()
                if dealer_card["colour"] == "black":
                    dealer_sum = dealer_sum + dealer_card["value"]
                else:
                    dealer_sum = dealer_sum - dealer_card["value"]

            # Player wins either by dealer going bust or by having a greater sum
            if (dealer_sum > 21 or dealer_sum < 1) or self.state["player_sum"] > dealer_sum:
                reward = 1
            # Player draws
            elif dealer_sum == self.state["player_sum"]:
                reward = 0
            # Player loses
            else:
                reward = -1
    
        return self.state.copy(), reward, terminated

# Question 2: Monte-Carlo Control

In [153]:
def uniform_random_policy():
    return "hit" if np.random.uniform() > 0.5 else "stick"

values = np.zeros((21, 10))
total_returns = np.zeros_like(values)
counts = np.zeros_like(values)

env = Environment()

In [158]:
for _ in range(100000):
    env.reset()
    
    episode = {
        "states":[],
        "rewards":[],
        "actions":[],
    }
    
    terminated = False
    
    while not terminated:
        action = uniform_random_policy()
        state, reward, terminated = env.step(action)
        episode["states"].append(state.copy())
        episode["rewards"].append(reward)
        episode["actions"].append(action)
    
    for t, state in enumerate(episode["states"][:-2]):
        counts[state["player_sum"]-1, state["dealer_showing"]-1] += 1
        total_returns[state["player_sum"]-1, state["dealer_showing"]-1] += sum(episode["rewards"][t:])

In [159]:
values = total_returns / counts

In [161]:
print(values.round(3))

[[-0.63  -0.44  -0.493 -0.361 -0.474 -0.467 -0.247 -0.645 -0.639 -0.711]
 [-0.561 -0.485 -0.489 -0.628 -0.45  -0.682 -0.407 -0.72  -0.527 -0.655]
 [-0.518 -0.547 -0.462 -0.664 -0.378 -0.579 -0.565 -0.452 -0.584 -0.758]
 [-0.549 -0.471 -0.551 -0.582 -0.581 -0.66  -0.402 -0.547 -0.551 -0.736]
 [-0.557 -0.477 -0.604 -0.479 -0.435 -0.476 -0.521 -0.503 -0.472 -0.659]
 [-0.568 -0.508 -0.57  -0.61  -0.477 -0.5   -0.49  -0.548 -0.527 -0.618]
 [-0.598 -0.516 -0.519 -0.51  -0.518 -0.472 -0.605 -0.531 -0.48  -0.519]
 [-0.557 -0.472 -0.518 -0.56  -0.52  -0.429 -0.585 -0.59  -0.482 -0.566]
 [-0.613 -0.409 -0.468 -0.511 -0.449 -0.458 -0.405 -0.47  -0.511 -0.511]
 [-0.416 -0.431 -0.535 -0.495 -0.434 -0.391 -0.49  -0.471 -0.46  -0.478]
 [-0.49  -0.346 -0.438 -0.403 -0.403 -0.39  -0.441 -0.386 -0.439 -0.414]
 [-0.444 -0.402 -0.478 -0.382 -0.449 -0.317 -0.338 -0.416 -0.403 -0.432]
 [-0.427 -0.435 -0.38  -0.416 -0.388 -0.303 -0.391 -0.407 -0.316 -0.423]
 [-0.493 -0.414 -0.412 -0.423 -0.363 -0.371 -0.349 