In [10]:
import random

def epsilon_greedy_policy(state, Q, epsilon, env):
    """Choose an action for `state` with an epsilon-greedy rule.

    Args:
        state: Current state; used as the first element of the Q lookup key.
        Q: Table indexed by ``(state, action)`` tuples.
        epsilon: Threshold compared against a draw from ``random.uniform(0, 1)``.
        env: Object exposing ``action_space.sample()`` and ``action_space.n``.

    Returns:
        The result of ``env.action_space.sample()`` when the draw is below
        `epsilon`; otherwise the action index in ``range(env.action_space.n)``
        with the largest ``Q[(state, action)]`` (the lowest index wins ties).
    """
    explore = random.uniform(0, 1) < epsilon
    if explore:
        return env.action_space.sample()

    def action_value(action):
        # Value currently stored for taking `action` in `state`.
        return Q[(state, action)]

    candidate_actions = range(env.action_space.n)
    return max(candidate_actions, key=action_value)

## Blackjack with on-policy Monte Carlo (MC) control using an epsilon-greedy policy

In [8]:
import gym
import pandas as pd
import random
from collections import defaultdict
from functools import partial

In [3]:
# Build the environment registered under the id 'Blackjack-v0'.
# It is kept in the module-level name `env`, which the episode-generation code reads.
env = gym.make('Blackjack-v0')

In [6]:
# Tables used by the Monte Carlo loop, each keyed by (state, action) tuples.
# defaultdict supplies 0.0 / 0 for keys that have not been written yet.
# Q: current action-value estimate, set to total_return / N for the pair.
Q = defaultdict(float)
# total_return: running sum of the returns credited to the pair.
total_return = defaultdict(float)
# N: number of returns credited to the pair.
N = defaultdict(int)

In [11]:
# Pre-bind epsilon=0.5 and the environment so the policy can be called as
# eps_greedy_policy(state, Q).
# NOTE(review): no cell visible here calls this partial — confirm whether it is still needed.
eps_greedy_policy = partial(epsilon_greedy_policy, epsilon=0.5, env=env)

In [12]:
def generate_episode(Q, epsilon=0.5, num_timesteps=100):
    """Roll out one episode in the module-level `env` with an epsilon-greedy policy.

    Args:
        Q: Table indexed by ``(state, action)``; passed to
            `epsilon_greedy_policy` to choose each action.
        epsilon: Exploration threshold forwarded to `epsilon_greedy_policy`.
            Defaults to 0.5, the value that used to be hard-coded here.
        num_timesteps: Maximum number of steps before the rollout is cut off.
            Defaults to 100, the value that used to be hard-coded here.

    Returns:
        A list of ``(state, action, reward)`` tuples, one per step taken, in
        the order the steps occurred. The list ends with the step on which the
        environment reported `done`, or after `num_timesteps` steps.
    """
    episode = []
    # The unpacking below expects reset() to return the initial state directly
    # and step() to return a 4-tuple (next_state, reward, done, info).
    state = env.reset()
    for _ in range(num_timesteps):
        action = epsilon_greedy_policy(state, Q, epsilon, env)
        next_state, reward, done, _info = env.step(action)

        # Record the state the action was taken *from*, not the resulting state.
        episode.append((state, action, reward))

        if done:
            break
        state = next_state
    return episode
        

In [13]:
# Number of episodes the training loop samples.
num_iterations = 500000

In [14]:
# First-visit Monte Carlo update of Q, repeated for num_iterations episodes.
for i in range(num_iterations):
    # Sample one episode using the current Q estimates.
    episode = generate_episode(Q)
    all_state_action_pairs = [(step[0], step[1]) for step in episode]
    rewards = [step[2] for step in episode]

    for t, (state, action, _) in enumerate(episode):
        pair = (state, action)

        # Only the first occurrence of a pair within the episode is credited.
        if pair in all_state_action_pairs[:t]:
            continue

        # Undiscounted return: sum of rewards from step t to the end of the episode.
        R = sum(rewards[t:])
        total_return[pair] += R
        N[pair] += 1

        # Q is the mean of all returns credited to this pair so far.
        Q[pair] = total_return[pair] / N[pair]
    

In [15]:
# Tabulate Q: one row per (state, action) key, with its estimated value in the second column.
df = pd.DataFrame(Q.items(), columns=['state_action_pair', 'value'])

In [16]:
# Preview the first rows (up to 11) of the Q-value table.
df.head(11)

Unnamed: 0,state_action_pair,value
0,"((18, 8, False), 0)",0.113172
1,"((18, 8, False), 1)",-0.681306
2,"((13, 10, False), 0)",-0.576257
3,"((13, 10, False), 1)",-0.580718
4,"((12, 8, False), 0)",-0.566108
5,"((12, 8, False), 1)",-0.420564
6,"((10, 2, False), 1)",-0.032925
7,"((14, 2, False), 0)",-0.3102
8,"((9, 5, False), 0)",-0.174026
9,"((19, 6, False), 0)",0.476665


In [17]:
# Row slice: show the two rows at positions 124 and 125.
df[124:126]

Unnamed: 0,state_action_pair,value
124,"((14, 3, True), 0)",-0.275862
125,"((14, 3, True), 1)",-0.033033
