In [1]:
import pandas as pd
import numpy as np
from blackjack import SimplifiedBlackjackMDP
from snake import SnakeMDP, hashable_state, array_state
import pickle
import random
import matplotlib.pyplot as plt
import time

from tqdm.notebook import tqdm

## Construct MDP Examples

In [2]:
# The MDP instances under study, keyed by the display name used in reports.
problems = dict(
    Snake=SnakeMDP(board_length=3, max_snake_length=3),
    Blackjack=SimplifiedBlackjackMDP(),
)

Trying max snake length 1 with 9 base combos
100 states found...
200 states found...
Trying max snake length 2 with 36 base combos
300 states found...
400 states found...


In [3]:
# Report the size of each problem's state space.
for name, problem in problems.items():
    n_states = len(problem.states)
    print(f'{name} MDP has {n_states} states.')

Snake MDP has 464 states.
Blackjack MDP has 1271 states.


## Learn on these examples

In [4]:
# Hyperparameter grids: one list of keyword-argument dicts per algorithm.
params = {
    # PI and VI each get their own copy of the same (gamma, epsilon) grid,
    # with gamma varying slowest.
    algo: [
        {'gamma': gamma, 'epsilon': epsilon}
        for gamma in (0.99, 0.75, 0.50, 0.25)
        for epsilon in (0.001, 0.25)
    ]
    for algo in ('PI', 'VI')
}
# Q-learning: every combination of decay pattern (slowest), exploration
# strategy, and initialization (fastest). Key order inside each dict is kept
# as decay_pattern / initialization / exploration.
params['Q-learning'] = [
    {'decay_pattern': decay, 'initialization': init, 'exploration': explore}
    for decay in ('mitchell', 'iteration_based')
    for explore in ('uniform', 'q-optimal')
    for init in ('zeros', 'first_reward')
]

In [5]:
def params_to_text(d):
    """Render a parameter mapping as a single ``key=value; key=value`` string.

    Parameters
    ----------
    d : mapping
        Parameter names mapped to their values; iteration order is preserved.

    Returns
    -------
    str
        ``'k1=v1; k2=v2; ...'`` with entries separated by ``'; '`` and no
        trailing separator. An empty mapping yields ``''``.
    """
    # str.join places the separator only between entries, so no index
    # bookkeeping is needed to avoid a trailing '; '.
    return '; '.join(f'{k}={d[k]}' for k in d)

In [6]:
# Debugging scratch: bind the Snake MDP to the top-level name `self` so that
# expressions written as `self.<attr>` can be evaluated directly in cells
# (presumably pasted from method bodies — TODO confirm and remove when done).
self = problems['Snake']

In [7]:
# Run Q-learning on the Snake MDP with its default arguments.
# NOTE(review): the recorded run of this cell aborts during iteration i=1 with
# KeyError: (None, 'right').
self.Q_learning()

Beginning iteration i=1
((0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 2.0), 'left') left
((0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 2.0, 0.0), 'left') down
(None, 'down') right


KeyError: (None, 'right')

In [None]:
# Zero-initialised counter with one entry per (state, action) pair that the
# MDP bound to `self` reports as available.
visits = dict.fromkeys(
    (
        (state, action)
        for state in self.states
        for action in self.available_actions(state)
    ),
    0,
)

In [8]:
# Check whether the pair (None, 'left') has an entry in `visits`.
# NOTE(review): the recorded output is a NameError — `visits` was not defined
# in the kernel when this cell ran, so its defining cell must be executed first.
visits[(None, 'left')]

NameError: name 'visits' is not defined

In [9]:
# Probe which actions the Snake MDP offers for the state (None, 'left').
# The recorded output lists all four directions.
self.available_actions((None, 'left'))

['up', 'down', 'left', 'right']

In [None]:
# Run every algorithm/parameterization on every problem and collect the raw
# outputs as results[problem_name][algo][params_to_text(parameterization)].
start = time.time()
results = {}
for problem_name in tqdm(problems):
    results[problem_name] = {}
    for algo in tqdm(params):
        results[problem_name][algo] = {}
        for parameterization in tqdm(params[algo]):
            # Elapsed wall-clock time is converted to minutes, so it is
            # labelled "min" rather than "s".
            elapsed_minutes = (time.time() - start) / 60
            print(f'{problem_name}-{algo}-{parameterization} @ {elapsed_minutes:.1f}min')

            # Each parameterization dict is expanded into keyword arguments of
            # the solver that matches the algorithm key.
            if algo == 'PI':
                output = problems[problem_name].policy_iteration(**parameterization)
            elif algo == 'VI':
                output = problems[problem_name].value_iteration(**parameterization)
            elif algo == 'Q-learning':
                output = problems[problem_name].Q_learning(**parameterization)
            else:
                raise Exception('Unexpected...')

            results[problem_name][algo][params_to_text(parameterization)] = output
    

## Test accuracy with handpicked examples

## Visualize the learned Snake and Blackjack policies

In [None]:
# Load a previously pickled Snake MDP from the working directory.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load pickles produced by this project. The filename suggests
# board_length=5 / max_snake_length=7 — TODO confirm.
# The following cell rebinds `snake_mdp` to a freshly constructed instance.
with open('snake_bl5_msl7_20221124.pkl', 'rb') as f:
    snake_mdp = pickle.load(f)

In [None]:
# Construct a Snake MDP from scratch. This rebinds `snake_mdp`, replacing the
# instance loaded from the pickle file in the previous cell.
snake_mdp = SnakeMDP(board_length=3, max_snake_length=4)

In [None]:
# Learn a Snake policy with Q-learning; the call is unpacked as a 2-tuple.
# NOTE(review): the `params` grid drives Q_learning with decay_pattern /
# initialization / exploration keywords, while this call passes gamma and
# epsilon — confirm that Q_learning accepts these arguments.
snake_policy, snake_q = snake_mdp.Q_learning(gamma=0.9, epsilon=0.001)

In [None]:
# Solve the Snake MDP with value iteration; rebinds `snake_policy` and
# `snake_q` from the Q-learning cell.
# NOTE(review): three values are unpacked here, whereas the Blackjack cell
# unpacks two from value_iteration — confirm the return arity.
snake_policy, snake_q, stats = snake_mdp.value_iteration(gamma=0.9, epsilon=0.01)

In [None]:
# Solve the Snake MDP with policy iteration, then spot-check the resulting
# policy on a handful of random states.
# Uses `snake_mdp` (built in the cells above): the name `mdp` is not defined on
# a fresh kernel at this point and is later bound to the Blackjack MDP.
# NOTE(review): the unit of `max_allowed_time` is not visible here.
snake_policy, snake_value = snake_mdp.policy_iteration(gamma=0.5, epsilon=0.0001, max_allowed_time=720)

# Scratch (kept commented out): single-step inspection of one 25-cell state.
# s = ((1.0, 1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0), 'down')
# array_state(s)
# array_state(mdp.accessible_states(s, 'down')[0])

# random.sample requires a sequence (sets are rejected from Python 3.11), so
# materialise the states as a list whatever container `.states` is.
for s in random.sample(list(snake_mdp.states), k=5):
    print(f'\nFor state s = \n{array_state(s)}')
    print(f'the recommended action is: {snake_policy[s]}')

In [None]:
# take a state and visualize it

def plot(s):
    """Draw the board of Snake state ``s`` as an image.

    ``s`` is converted with ``array_state``, which yields a two-element
    result; the first element is rendered with ``plt.imshow``. The figure
    title shows ``s[1]`` as the last move. The figure is shown immediately
    and nothing is returned.
    """
    # Only the board is drawn; the second element of the pair is not needed.
    grid, _ = array_state(s)
    heading = f"Snake with last move='{s[1]}'"
    plt.imshow(grid, interpolation='none')
    plt.title(heading)
    plt.show()

In [None]:
# Visualise the hand-built state `example_4`.
# NOTE(review): `example_4` is defined in a later cell (the `example_*`
# definitions), so that cell must be run before this one.
plot(example_4)

In [None]:
# Look up the action the current `snake_policy` recommends for `example_4`.
# NOTE(review): `example_4` is defined in a later cell (the `example_*`
# definitions), so that cell must be run before this one.
snake_policy[example_4]

In [None]:

# Build the Blackjack MDP, sanity-check its dynamics, solve it, and tabulate
# the policy and state values for every (dealer, player) 'hitting' state.
mdp = SimplifiedBlackjackMDP()

# NOTE(review): the return value is discarded; presumably this helper prints
# its own output — TODO confirm.
mdp._sample_state_layout()

# Successor states (and their rewards) when hitting from (11, 3, 'hitting').
for s in mdp.accessible_states((11, 3, 'hitting'), 'hit'):
    print(s, mdp.reward(s))
# A bare expression in the middle of a cell is never rendered, so intermediate
# results are shown explicitly. `display` is provided by the IPython kernel.
display(mdp.transition_model((19, 6, 'hitting'), 'hold', (19, 6, 'stand')))

# Keep the policy-iteration result under its own names so it is not silently
# overwritten; the tables below use the value-iteration result.
# NOTE(review): the Snake cell unpacks three values from value_iteration —
# confirm the return arity.
blackjack_pi_policy, blackjack_pi_value = mdp.policy_iteration(gamma=0.99999, epsilon=1e-5)
blackjack_policy, blackjack_value = mdp.value_iteration(gamma=0.99999, epsilon=1e-5)

# Table axes: rows are dealer values 2..11, columns are player values 2..21.
dealer_values = range(2, 12)
player_values = range(2, 22)

# Build each table in one shot instead of assigning cell by cell.
policy_visualization = pd.DataFrame(
    [
        [blackjack_policy[(player, dealer, 'hitting')] for player in player_values]
        for dealer in dealer_values
    ],
    index=pd.Index(dealer_values, name='Dealer Value'),
    columns=pd.Index(player_values, name='Player Value'),
    dtype='string',
)

# Columns 5..14 correspond to player values 7..16.
display(policy_visualization.iloc[:, 5:15])

value_visualization = pd.DataFrame(
    [
        [np.round(blackjack_value[(player, dealer, 'hitting')], 1) for player in player_values]
        for dealer in dealer_values
    ],
    index=pd.Index(dealer_values, name='Dealer Value'),
    columns=pd.Index(player_values, name='Player Value'),
    dtype='float',
)

value_visualization.iloc[:, 5:15]


In [None]:
# Hand-built Snake states used for spot checks. Each is a (board, last_move)
# pair passed through `hashable_state`.
# Cell encoding: presumably 1 = snake body, 2 = head, 3 = food — TODO confirm
# against SnakeMDP.

# 5x5 board: snake along the bottom row, last move 'right'.
example_1 = hashable_state(
    (np.array([[0., 0., 0., 0., 0.],
               [0., 0., 0., 0., 0.],
               [0., 0., 0., 0., 0.],
               [0., 0., 0., 0., 0.],
               [3., 1., 1., 1., 2.]]),
     'right')
)

# 5x5 board: last move 'up', with the 3 two rows above the 2 in the same column.
example_2 = hashable_state(
    (np.array([[0., 0., 0., 0., 0.],
               [0., 0., 0., 3., 0.],
               [0., 0., 0., 0., 0.],
               [0., 0., 0., 2., 0.],
               [0., 1., 1., 1., 0.]]),
     'up')
)

# 4x4 board: two-cell snake on the bottom row, last move 'right'.
example_3 = hashable_state(
    (np.array([[0., 0., 0., 0.],
               [0., 0., 3., 0.],
               [0., 0., 0., 0.],
               [0., 1., 2., 0.]]),
     'right')
)

# 3x3 board: last move 'down', with the 3 immediately right of the 2.
example_4 = hashable_state(
    (np.array([[0., 0., 0.],
               [0., 1., 1.],
               [0., 2., 3.]]),
     'down')
)
# Convenience alias for interactive inspection.
s = example_2