In [1]:
import pandas as pd
import numpy as np
from blackjack import SimplifiedBlackjackMDP
from snake import SnakeMDP, hashable_state, array_state
import pickle
import random
import matplotlib.pyplot as plt
import time

from tqdm.notebook import tqdm

# Construct MDP Examples

In [2]:
# Registry of the MDPs under study, keyed by the display name used in
# progress messages and result keys further down the notebook.
problems = {}
problems['Snake'] = SnakeMDP(board_length=4, max_snake_length=5)
problems['Blackjack'] = SimplifiedBlackjackMDP()

Trying max snake length 1 with 16 base combos
100 states found...
200 states found...
300 states found...
400 states found...
500 states found...
600 states found...
700 states found...
800 states found...
900 states found...
Trying max snake length 2 with 120 base combos
1,000 states found...
1,100 states found...
1,200 states found...
1,300 states found...
1,400 states found...
1,500 states found...
1,600 states found...
Trying max snake length 3 with 560 base combos
1,700 states found...
1,800 states found...
1,900 states found...
2,000 states found...
2,100 states found...
2,200 states found...
2,300 states found...
2,400 states found...
2,500 states found...
2,600 states found...
2,700 states found...
2,800 states found...
2,900 states found...
Trying max snake length 4 with 1,820 base combos
3,000 states found...
3,100 states found...
3,200 states found...
3,300 states found...
3,400 states found...
3,500 states found...
3,600 states found...
3,700 states found...
3,800 states fo

In [3]:
# Report the size of each MDP's state space.
for key, mdp in problems.items():
    state_count = len(mdp.states)
    print(f'{key} MDP has {state_count} states.')

Snake MDP has 5776 states.
Blackjack MDP has 1271 states.


In [4]:
def _gamma_epsilon_grid():
    """Return a fresh list of the gamma/epsilon settings shared by PI and VI.

    A new list of new dicts is built on every call so the 'PI' and 'VI'
    entries never alias each other.
    """
    # gamma values 0.75 and 0.50 are currently disabled in this grid.
    return [
        {'gamma': gamma, 'epsilon': epsilon}
        for gamma in (0.99, 0.25)
        for epsilon in (0.001, 0.25)
    ]


# Column order matters: it fixes the key order of every Q-learning dict,
# which in turn fixes the text label derived from it.
_Q_LEARNING_FIELDS = ('decay_pattern', 'initialization', 'exploration', 'epsilon')
_Q_LEARNING_ROWS = (
    ('mitchell', 'zeros', 'introduce-randomness', 0.01),
    ('mitchell', 'first_reward', 'introduce-randomness', 0.05),
    ('mitchell', 'zeros', 'q-optimal', 0.01),
    ('mitchell', 'first_reward', 'q-optimal', 0.10),
    # disabled: ('iteration_based', 'zeros', 'introduce-randomness', 0.01),
    ('iteration_based', 'first_reward', 'introduce-randomness', 0.10),
    # disabled: ('iteration_based', 'zeros', 'q-optimal', 0.01),
    ('iteration_based', 'first_reward', 'q-optimal', 0.05),
)

# Hyper-parameter settings to run, grouped by algorithm label.
params = {
    'PI': _gamma_epsilon_grid(),
    'VI': _gamma_epsilon_grid(),
    'Q-learning': [dict(zip(_Q_LEARNING_FIELDS, row)) for row in _Q_LEARNING_ROWS],
}

In [5]:
def params_to_text(d):
    """Render a parameter mapping as a single ``key=value; key=value`` string.

    Entries appear in the mapping's iteration order, each formatted as
    ``{key}={value}`` and separated by ``'; '``. An empty mapping yields ``''``.

    Args:
        d: Mapping of parameter names to values (e.g. one entry of ``params``).

    Returns:
        str: The joined text, with no trailing separator.
    """
    # str.join places the separator only between items, so no index
    # bookkeeping is needed to avoid a trailing '; '.
    return '; '.join(f'{k}={d[k]}' for k in d)

In [6]:
# Run every (problem, algorithm, parameterization) combination and collect the
# outputs as results[problem_name][algo][params_to_text(parameterization)].
start = time.time()
results = {}

# Algorithm label -> name of the MDP method that implements it.
solver_method_names = {
    'PI': 'policy_iteration',
    'VI': 'value_iteration',
    'Q-learning': 'Q_learning',
}

for problem_name in tqdm(problems):
    results[problem_name] = {}
    for algo in tqdm(params):
        results[problem_name][algo] = {}
        for parameterization in tqdm(params[algo]):
            # Elapsed minutes since the sweep began, printed before each run starts.
            print(f'{problem_name}-{algo}-{parameterization} @ {(time.time() - start) / 60:.1f}m')

            if algo not in solver_method_names:
                raise Exception('Unexpected...')

            solve = getattr(problems[problem_name], solver_method_names[algo])
            output = solve(**parameterization)

            results[problem_name][algo][params_to_text(parameterization)] = output

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Snake-PI-{'gamma': 0.99, 'epsilon': 0.001} @ 0.0m
Snake-PI-{'gamma': 0.99, 'epsilon': 0.25} @ 9.4m
Snake-PI-{'gamma': 0.25, 'epsilon': 0.001} @ 13.0m
Snake-PI-{'gamma': 0.25, 'epsilon': 0.25} @ 15.4m


  0%|          | 0/4 [00:00<?, ?it/s]

Snake-VI-{'gamma': 0.99, 'epsilon': 0.001} @ 17.7m
Snake-VI-{'gamma': 0.99, 'epsilon': 0.25} @ 18.8m
Snake-VI-{'gamma': 0.25, 'epsilon': 0.001} @ 19.7m
Snake-VI-{'gamma': 0.25, 'epsilon': 0.25} @ 20.1m


  0%|          | 0/6 [00:00<?, ?it/s]

Snake-Q-learning-{'decay_pattern': 'mitchell', 'initialization': 'zeros', 'exploration': 'introduce-randomness', 'epsilon': 0.01} @ 20.4m
Snake-Q-learning-{'decay_pattern': 'mitchell', 'initialization': 'first_reward', 'exploration': 'introduce-randomness', 'epsilon': 0.05} @ 40.5m
Snake-Q-learning-{'decay_pattern': 'mitchell', 'initialization': 'zeros', 'exploration': 'q-optimal', 'epsilon': 0.01} @ 40.7m
Snake-Q-learning-{'decay_pattern': 'mitchell', 'initialization': 'first_reward', 'exploration': 'q-optimal', 'epsilon': 0.1} @ 60.8m
Snake-Q-learning-{'decay_pattern': 'iteration_based', 'initialization': 'first_reward', 'exploration': 'introduce-randomness', 'epsilon': 0.1} @ 61.0m
Snake-Q-learning-{'decay_pattern': 'iteration_based', 'initialization': 'first_reward', 'exploration': 'q-optimal', 'epsilon': 0.05} @ 61.0m


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Blackjack-PI-{'gamma': 0.99, 'epsilon': 0.001} @ 61.0m
Blackjack-PI-{'gamma': 0.99, 'epsilon': 0.25} @ 61.1m
Blackjack-PI-{'gamma': 0.25, 'epsilon': 0.001} @ 61.1m
Blackjack-PI-{'gamma': 0.25, 'epsilon': 0.25} @ 61.1m


  0%|          | 0/4 [00:00<?, ?it/s]

Blackjack-VI-{'gamma': 0.99, 'epsilon': 0.001} @ 61.1m
Blackjack-VI-{'gamma': 0.99, 'epsilon': 0.25} @ 61.2m
Blackjack-VI-{'gamma': 0.25, 'epsilon': 0.001} @ 61.2m
Blackjack-VI-{'gamma': 0.25, 'epsilon': 0.25} @ 61.2m


  0%|          | 0/6 [00:00<?, ?it/s]

Blackjack-Q-learning-{'decay_pattern': 'mitchell', 'initialization': 'zeros', 'exploration': 'introduce-randomness', 'epsilon': 0.01} @ 61.2m
Blackjack-Q-learning-{'decay_pattern': 'mitchell', 'initialization': 'first_reward', 'exploration': 'introduce-randomness', 'epsilon': 0.05} @ 61.2m
Blackjack-Q-learning-{'decay_pattern': 'mitchell', 'initialization': 'zeros', 'exploration': 'q-optimal', 'epsilon': 0.01} @ 76.7m
Blackjack-Q-learning-{'decay_pattern': 'mitchell', 'initialization': 'first_reward', 'exploration': 'q-optimal', 'epsilon': 0.1} @ 76.7m
Blackjack-Q-learning-{'decay_pattern': 'iteration_based', 'initialization': 'first_reward', 'exploration': 'introduce-randomness', 'epsilon': 0.1} @ 76.7m
Blackjack-Q-learning-{'decay_pattern': 'iteration_based', 'initialization': 'first_reward', 'exploration': 'q-optimal', 'epsilon': 0.05} @ 76.7m


In [7]:
# Persist the full results dictionary to disk so later analysis can reuse it.
with open('results.pkl', 'wb') as results_file:
    pickle.dump(results, results_file)

# Testing for accuracy

In [None]:
def plot(s):
    """Draw a snake state as an image and show the figure.

    The state is converted with ``array_state`` to obtain the board that is
    rendered; the figure title reports ``s[1]`` as the last move.

    Args:
        s: A state accepted by ``array_state``; it must also support ``s[1]``.
    """
    # array_state returns a (board, last_direction) pair; only the board is drawn.
    grid, _ = array_state(s)
    plt.imshow(grid, interpolation='none')
    plt.title(f"Snake with last move='{s[1]}'")
    plt.show()

In [16]:
# Hand-written snake states used to spot-check learned policies.
# Each example is a (4x4 board array, direction string) pair passed through
# hashable_state so it can be used as a lookup key into a policy.
# NOTE(review): cell codes look like 0 = empty, 1 = snake body, 2 = snake head,
# 3 = food, and the string looks like the last move taken — confirm against
# the snake module.
snake_example_0 = hashable_state(
    (np.array([[0., 0., 3., 0.],
               [0., 0., 0., 0.],
               [0., 0., 0., 0.],
               [0., 0., 2., 0.]]),
     'right')
)

snake_example_1 = hashable_state(
    (np.array([[0., 0., 2., 0.],
               [0., 0., 1., 0.],
               [0., 0., 0., 0.],
               [0., 3., 0., 0.]]),
     'up')
)

snake_example_2 = hashable_state(
    (np.array([[0., 2., 1., 0.],
               [0., 0., 0., 0.],
               [0., 0., 0., 0.],
               [0., 3., 0., 0.]]),
     'left')
)

snake_example_3 = hashable_state(
    (np.array([[0., 0., 0., 3.],
               [0., 1., 0., 0.],
               [0., 1., 0., 0.],
               [0., 2., 0., 0.]]),
     'down')
)

snake_example_4 = hashable_state(
    (np.array([[0., 0., 0., 0.],
               [0., 0., 0., 0.],
               [0., 0., 1., 1.],
               [0., 0., 2., 3.]]),
     'down')
)

# (state, expected action) pairs; a policy is scored by comparing
# policy[state] with the expected action for each pair.
correct_snake_answers = [
    (snake_example_0, 'up'),
    (snake_example_1, 'left'),
    (snake_example_2, 'down'),
    (snake_example_3, 'right'),
    (snake_example_4, 'left')
]

In [20]:
# Hand-written blackjack states used to spot-check learned policies.
# NOTE(review): each tuple looks like (player total, dealer card, phase) —
# confirm against the blackjack module.
(
    blackjack_example_0,
    blackjack_example_1,
    blackjack_example_2,
    blackjack_example_3,
    blackjack_example_4,
) = (
    (2, 7, 'hitting'),
    (10, 7, 'hitting'),
    (18, 6, 'hitting'),
    (16, 6, 'hitting'),
    (11, 11, 'hitting'),
)

# (state, expected action) pairs, in the same order as the examples above.
correct_blackjack_answers = list(zip(
    (
        blackjack_example_0,
        blackjack_example_1,
        blackjack_example_2,
        blackjack_example_3,
        blackjack_example_4,
    ),
    ('hit', 'hit', 'hold', 'hold', 'hit'),
))

In [21]:
# Score every learned policy against the hand-labelled example states.
# One row per '<problem>-<algo>-<parameterization>' run, one 'Problem<i>'
# column per example; a cell is True when the policy picks the expected action.
#
# Rows are collected first and the DataFrame is built once at the end, rather
# than enlarging an empty frame cell by cell with .loc, which re-allocates on
# every new row/column and leaves transient NaNs that force object dtype.
accuracy_rows = {}

for problem_name in results:
    # Any problem other than 'Snake' is checked against the blackjack answers.
    if problem_name == 'Snake':
        test = correct_snake_answers
    else:
        test = correct_blackjack_answers

    for algo in results[problem_name]:
        for parameterization in results[problem_name][algo]:
            # Each stored output is a 3-tuple; only the policy is needed here.
            policy, value_or_q, stats = results[problem_name][algo][parameterization]
            key = f'{problem_name}-{algo}-{parameterization}'
            accuracy_rows[key] = {
                f'Problem{i}': policy[question] == answer
                for i, (question, answer) in enumerate(test)
            }

# Row labels follow run order; columns follow first appearance (Problem0, Problem1, ...).
accuracy = pd.DataFrame(list(accuracy_rows.values()), index=list(accuracy_rows))
                

In [23]:
# Display the correctness table: one row per run, one column per example state.
accuracy

Unnamed: 0,Problem0,Problem1,Problem2,Problem3,Problem4
Snake-PI-gamma=0.99; epsilon=0.001,True,True,True,True,True
Snake-PI-gamma=0.99; epsilon=0.25,True,True,True,True,True
Snake-PI-gamma=0.25; epsilon=0.001,True,True,True,True,True
Snake-PI-gamma=0.25; epsilon=0.25,True,True,True,True,True
Snake-VI-gamma=0.99; epsilon=0.001,True,True,True,True,True
Snake-VI-gamma=0.99; epsilon=0.25,True,True,True,True,True
Snake-VI-gamma=0.25; epsilon=0.001,True,True,True,True,True
Snake-VI-gamma=0.25; epsilon=0.25,True,True,True,True,True
Snake-Q-learning-decay_pattern=mitchell; initialization=zeros; exploration=introduce-randomness; epsilon=0.01,True,True,True,True,True
Snake-Q-learning-decay_pattern=mitchell; initialization=first_reward; exploration=introduce-randomness; epsilon=0.05,True,True,True,True,True
