## Libraries

In [1]:
import sys
import gym
import numpy as np
from collections import defaultdict,deque

#### Black Jack Environment
> Actions => 1: continue 0: stop

> States => (a,b,c) a:hand, b:table open card c: usable ace (True|False)

In [2]:
env = gym.make('Blackjack-v0')
print(env.observation_space)
print(env.action_space)

Tuple(Discrete(32), Discrete(11), Discrete(2))
Discrete(2)


In [3]:
env.reset()

(13, 7, True)

## Create Episodes

In [4]:
# Random Episode
def random(env):
    '''
    Chooses actions completely randomly independent from the current state
    args:
        env: Black Jack Environment
    returns:
        episode list: list of state,action,reward pairs
        rewards list: list of rewards for the episode created. Last value is the reward 1,-1 or 0 previous rewards are 0
    '''
    state = env.reset()
    episode = []
    rewards = []
    while True:
        action = np.random.choice((1,0)) # randomly chooce 1 or 0
        current_state,reward,done,info = env.step(action)
        episode.append((state,action,reward))
        state = current_state
        rewards.append(reward)
        if done: # episode ends with a reward
            break
    return episode,rewards
# 
def stoch(env):
    '''
    Chooses actions by a hard coded probability. If current sum is > 17 9 times out of 10 will choose action 0
    '''
    state = env.reset()
    episode = []
    rewards = []
    while True:
        if state[0]>17: 
            p = [0.1,0.9]
        else:
            p = [0.5,0.5]
        action = np.random.choice((0,1),p=p) # choose action with a probability
        current_state,reward,done,info = env.step(action)
        episode.append((state,action,reward))
        state = current_state
        rewards.append(reward)
        if done: # episode ends with a reward
            break
    return episode,rewards

## Evaluate Episodes

In [8]:
def evaluate_episode_func(env,episode_gen_function,iterations,decay_factor=0.9):
    '''
    Evaluates the episode generation function and policy 
    args:
        env 
        episode_gen_function
        iterations: number of episodes to be created. 
        decay factor float: Rewards are given at the end of the episode. So if the state occured at the very beginning
        then it should have a decayed reward. 
    returns:
        Q defaultdict : cumilative sum of action state pair values
        N defaultdict : number of times a given state action pair occured
        V defaultdict : average value of a given state action pair
    '''
    total_wins = 0 
    Q=defaultdict(lambda : np.zeros(2)) # action state value colection
    N = defaultdict(lambda : np.zeros(2)) # store action state pair counts
    V = defaultdict(lambda : np.zeros(2))  # store average action state values
    for iteration in range(iterations):
        episode,rewards = episode_gen_function(env) # actual reward is given in the end of the episode
        if rewards[-1] == 1:
            total_wins +=1
        decay =[decay_factor**k for k in  range(len(episode))]
        decay = deque(decay) # deque is faster
        for inst in episode:
            state = inst[0]
            action = inst[1]
            Q[state][action] += rewards[-1]*decay.pop() # this is a right removal so that we read the decay in a reverse order
            N[state][action] +=1
    # calculate average state action value
    for state,value in Q.items():
    #print(state,value)
        for action in range(2):
            if N[state][action] !=0:
                V[state][action] = value[action]/N[state][action]
    return Q,N,V,total_wins

In [9]:
Q,N,V,total_wins = evaluate_episode_func(env,random,iterations=500000,decay_factor=0.9)

In [11]:
print('total wins are {} \npercentage wins {} '.format(total_wins,total_wins/500000))

total wins are 140419 
percentage wins 0.280838 


In [12]:
Q,N,V,total_wins = evaluate_episode_func(env,stoch,iterations=500000,decay_factor=0.9)

In [13]:
print('total wins are {} \npercentage wins {} '.format(total_wins,total_wins/500000))

total wins are 81965 
percentage wins 0.16393 


## Create Policy Using Monte Carlo Approach

In [14]:
def get_action(state,V,epsilon,nA):
    '''
    Chooses an action using history up to the current point
    args:
        state tuple
        V defaultdict : Action State pair average value
        epsilon float : between 0 and 1 metric to set how much to eplore and how much to exploit
        nA int : number of actions available
    returns:
        action int
    '''
    probs = epsilon*np.ones(nA)/nA # initiate probs its sum is equal to epsilon
    if sum(V[state] ==0) !=nA: # check if we have zero value for all the actions
        best_action = np.argmax(V[state]) # get the best action based on the history so far
        prob_best_action = 1-epsilon # probability of best action is equal to 1 - epsilon
        probs[best_action] = prob_best_action+epsilon/nA # sum of probs is equal to 1 
        action = np.random.choice(np.arange(nA),p=probs)
    else:
        action = np.random.choice(np.arange(nA))
    return action

def create_episode(env,V,epsilon,nA):
    '''
    create an episode by using actions chosen by get action function
    args:
        env
        V
        epsilon
        nA
    returns:
        episode
        rewards
    '''
    state = env.reset()
    episode = []
    rewards = []
    while True:
        action = get_action(state,V,epsilon,nA) # use get action
        current_state,reward,done,info = env.step(action)
        episode.append((state,action,reward))
        state = current_state
        rewards.append(reward)
        if done: # episode ends with a reward
            break
    return episode,rewards

def collect_values(env,epsilon,nA,iterations,episode_gen_function,decay_factor):
    Q = defaultdict(lambda : np.zeros(nA)) # action state value colection
    N = defaultdict(lambda : np.zeros(nA)) # store action state pair counts
    V = defaultdict(lambda : np.zeros(nA))  # store average action state values
    for iteration in range(iterations):
        episode,rewards = episode_gen_function(env,V,epsilon,nA) # actual reward is given in the end of the episode
        decay =[decay_factor**k for k in  range(len(episode))]
        decay = deque(decay) # deque is faster
        for inst in episode:
            state = inst[0]
            action = inst[1]
            Q[state][action] += rewards[-1]*decay.pop() # this is a right removal so that we read the decay in a reverse order
            N[state][action] +=1
    # calculate average state action value
    for state,value in Q.items():
    #print(state,value)
        for action in range(2):
            if N[state][action] !=0:
                V[state][action] = value[action]/N[state][action]
    return Q,N,V

In [15]:
Q,N,V = collect_values(env,epsilon=0.05,nA=2,iterations=500000,episode_gen_function=create_episode,decay_factor=0.9)

In [20]:
Policy={}
for key,value in V.items():
    Policy[key]=np.argmax(value)

In [29]:
def select_action_from_policy(state,Policy,nA):
    try:
        action = Policy[state]
    except:
        action = np.random.choice(np.arange(nA))
    return action


def create_episode(env,Policy,nA):
    state = env.reset()
    episode = []
    rewards = []
    while True:
        action = select_action_from_policy(state,Policy,nA)
        current_state,reward,done,info = env.step(action)
        episode.append((state,action,reward))
        state = current_state
        rewards.append(reward)
        if done: # episode ends with a reward
            break
    return episode,rewards    

def run_episodes(env,Policy,nA,num_iterations):
    total_wins = 0
    for k in range(num_iterations):
        episode,rewards = create_episode(env,Policy,nA)
        if rewards[-1] == 1:
            total_wins +=1
    return total_wins

In [30]:
create_episode(env,Policy,nA=2)

([((5, 1, False), 1, 0), ((15, 1, False), 0, -1.0)], [0, -1.0])

#### Evaluate created policy

In [32]:
total_wins = run_episodes(env,Policy,2,500000)

In [33]:
print('total wins are {} \npercentage wins {} '.format(total_wins,total_wins/500000))

total wins are 207211 
percentage wins 0.414422 
