In [1]:
import pandas as pd
import numpy as np
from blackjack import SimplifiedBlackjackMDP
from snake import SnakeMDP, hashable_state, array_state
import pickle
import random
import matplotlib.pyplot as plt
import time

from tqdm.notebook import tqdm

# Construct MDP Examples

In [2]:
# The MDPs under study, keyed by the display name used when reporting on them.
problems = dict(
    Snake=SnakeMDP(board_length=4, max_snake_length=5),
    Blackjack=SimplifiedBlackjackMDP(),
)

Trying max snake length 1 with 9 base combos
100 states found...
200 states found...
Trying max snake length 2 with 36 base combos
300 states found...
400 states found...


In [3]:
# Report the size of each MDP's `states` collection.
for name, mdp in problems.items():
    state_count = len(mdp.states)
    print(f'{name} MDP has {state_count} states.')

Snake MDP has 464 states.
Blackjack MDP has 1271 states.


In [4]:
# Hyper-parameter settings to try, keyed by algorithm name. Each entry in a
# list is one keyword-argument dict that is unpacked into the matching solver
# call (policy_iteration / value_iteration / Q_learning).
# Commented-out entries are settings currently excluded from the run.
# NOTE(review): gamma is presumably the discount factor and epsilon the
# convergence threshold -- confirm against the MDP implementations.
params = {
    'PI': [
        {'gamma': 0.99, 'epsilon': 0.001},
        {'gamma': 0.99, 'epsilon': 0.25},
        #{'gamma': 0.75, 'epsilon': 0.001},
        #{'gamma': 0.75, 'epsilon': 0.25},
        #{'gamma': 0.50, 'epsilon': 0.001},
        #{'gamma': 0.50, 'epsilon': 0.25},
        {'gamma': 0.25, 'epsilon': 0.001},
        {'gamma': 0.25, 'epsilon': 0.25},
    ],
    'VI': [
        {'gamma': 0.99, 'epsilon': 0.001},
        {'gamma': 0.99, 'epsilon': 0.25},
        #{'gamma': 0.75, 'epsilon': 0.001},
        #{'gamma': 0.75, 'epsilon': 0.25},
        #{'gamma': 0.50, 'epsilon': 0.001},
        #{'gamma': 0.50, 'epsilon': 0.25},
        {'gamma': 0.25, 'epsilon': 0.001},
        {'gamma': 0.25, 'epsilon': 0.25},
    ],
    # Each Q-learning entry holds exactly three options. The stray trailing
    # `, ''` element that used to end every dict was not a key/value pair and
    # made the whole cell a SyntaxError, so it has been removed.
    'Q-learning': [
        {'decay_pattern': 'mitchell',        'initialization': 'zeros',        'exploration': 'introduce-randomness'},
        {'decay_pattern': 'mitchell',        'initialization': 'first_reward', 'exploration': 'introduce-randomness'},
        {'decay_pattern': 'mitchell',        'initialization': 'zeros',        'exploration': 'q-optimal'},
        {'decay_pattern': 'mitchell',        'initialization': 'first_reward', 'exploration': 'q-optimal'},
        #{'decay_pattern': 'iteration_based', 'initialization': 'zeros',        'exploration': 'introduce-randomness'},
        {'decay_pattern': 'iteration_based', 'initialization': 'first_reward', 'exploration': 'introduce-randomness'},
        #{'decay_pattern': 'iteration_based', 'initialization': 'zeros',        'exploration': 'q-optimal'},
        {'decay_pattern': 'iteration_based', 'initialization': 'first_reward', 'exploration': 'q-optimal'},
    ]
}

In [5]:
def params_to_text(d):
    """Render a parameter dict as one 'key=value; key=value' string.

    Pairs appear in the dict's iteration order and are separated by '; '.
    An empty dict yields an empty string.

    Args:
        d: dict mapping parameter names to their values.

    Returns:
        str: the formatted parameter description.
    """
    # str.join places the separator only between items, so no index
    # bookkeeping is needed to avoid a trailing '; '.
    return '; '.join(f'{key}={value}' for key, value in d.items())

In [6]:
# Run every configured parameterization of every algorithm on every MDP.
# Outputs are collected as
#   results[problem_name][algo][params_to_text(parameterization)]
start = time.time()

# Maps each key of `params` to the name of the MDP method that implements it.
solver_methods = {
    'PI': 'policy_iteration',
    'VI': 'value_iteration',
    'Q-learning': 'Q_learning',
}

results = {}
for problem_name in tqdm(problems, desc='problems'):
    results[problem_name] = {}
    for algo in tqdm(params, desc=f'{problem_name} algorithms'):
        results[problem_name][algo] = {}
        for parameterization in tqdm(params[algo], desc=f'{problem_name} {algo}'):
            # Log which run is starting and the minutes elapsed so far.
            print(f'{problem_name}-{algo}-{parameterization} @ {(time.time() - start) / 60:.1f}m')

            if algo not in solver_methods:
                # Name the offending key so a typo in `params` is easy to find.
                raise ValueError(
                    f'Unexpected algorithm {algo!r}; expected one of {sorted(solver_methods)}'
                )
            solve = getattr(problems[problem_name], solver_methods[algo])
            output = solve(**parameterization)

            results[problem_name][algo][params_to_text(parameterization)] = output

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

Snake-Q-learning-{'decay_pattern': 'mitchell', 'initialization': 'zeros', 'exploration': 'introduce-randomness'} @ 0.0m
At iteration 1, max change in value: 0.85833; avg_change_in_value=0.16765
At iteration 2, max change in value: 1.02142; avg_change_in_value=0.13843
At iteration 3, max change in value: 0.85833; avg_change_in_value=0.14702
At iteration 4, max change in value: 0.90248; avg_change_in_value=0.22459
At iteration 5, max change in value: 0.81900; avg_change_in_value=0.25573
At iteration 6, max change in value: 0.93985; avg_change_in_value=0.24512
At iteration 7, max change in value: 0.82847; avg_change_in_value=0.18038
At iteration 8, max change in value: 0.80217; avg_change_in_value=0.16528
At iteration 9, max change in value: 0.83830; avg_change_in_value=0.12331
At iteration 10, max change in value: 0.72110; avg_change_in_value=0.11278
At iteration 11, max change in value: 0.67902; avg_change_in_value=0.08829
At iteration 12, max change in value: 0.73991; avg_change_in_val

At iteration 109, max change in value: 0.11623; avg_change_in_value=0.03162
At iteration 110, max change in value: 0.11629; avg_change_in_value=0.03100
At iteration 111, max change in value: 0.11599; avg_change_in_value=0.02973
At iteration 112, max change in value: 0.11599; avg_change_in_value=0.03243
At iteration 113, max change in value: 0.11469; avg_change_in_value=0.02627
At iteration 114, max change in value: 0.11599; avg_change_in_value=0.03098
At iteration 115, max change in value: 0.11473; avg_change_in_value=0.02712
At iteration 116, max change in value: 0.11594; avg_change_in_value=0.02947
At iteration 117, max change in value: 0.11625; avg_change_in_value=0.02875
At iteration 118, max change in value: 0.11621; avg_change_in_value=0.02872
At iteration 119, max change in value: 0.11587; avg_change_in_value=0.02621
At iteration 120, max change in value: 0.11606; avg_change_in_value=0.02771
At iteration 121, max change in value: 0.11612; avg_change_in_value=0.02795
At iteration

KeyboardInterrupt: 