In [1]:
import numpy as np
from tqdm.notebook import tqdm

In [2]:
def policy_evaluation(pi, P, gamma=1.0, thetha=1e-10):
    """Iteratively estimate the state-value function of policy ``pi``.

    Parameters
    ----------
    pi : callable
        Maps a state index to the action taken in that state.
    P : mapping or sequence
        ``P[s][a]`` is an iterable of ``(prob, next_state, reward, done)``
        transition tuples. States are indexed ``0 .. len(P) - 1``.
    gamma : float
        Discount factor applied to the value of the next state.
    thetha : float
        Convergence threshold: sweeps stop once the largest absolute change
        between two consecutive value estimates is below it. (The spelling
        of the parameter name is kept so existing callers keep working.)

    Returns
    -------
    numpy.ndarray
        One value estimate per state.
    """
    n_states = len(P)
    prev_V = np.zeros(n_states)

    # np.max raises on an empty array, so an empty MDP is answered directly.
    if n_states == 0:
        return prev_V

    while True:
        V = np.zeros(n_states)

        for s in range(n_states):
            # Each transition tuple has a probability, next state, reward and
            # a done flag indicating whether 'next_state' is terminal.
            for prob, next_state, reward, done in P[s][pi(s)]:
                # (not done) zeroes the bootstrapped value when the
                # transition lands on a terminal state.
                V[s] += prob * (reward + gamma * prev_V[next_state] * (not done))

        # The original compared against the undefined name `theta`.
        if np.max(np.abs(prev_V - V)) < thetha:
            break

        prev_V = V.copy()

    return V
                  

In [3]:
def policy_improvement(V, P, gamma=1.0):
    """Build the policy that is greedy with respect to the values ``V``.

    Parameters
    ----------
    V : indexable
        Value estimate for every state, indexed by state number.
    P : mapping or sequence
        ``P[s][a]`` is an iterable of ``(prob, next_state, reward, done)``
        transition tuples. Actions are indexed ``0 .. len(P[0]) - 1``.
    gamma : float
        Discount factor applied to the value of the next state.

    Returns
    -------
    callable
        ``new_pi(s)`` returns the action with the highest action-value in
        state ``s`` (ties go to the lowest action index, as with
        ``np.argmax``).
    """
    n_states = len(P)
    n_actions = len(P[0])
    Q = np.zeros((n_states, n_actions), dtype=np.float64)

    for s in range(n_states):
        for a in range(n_actions):
            for prob, next_state, reward, done in P[s][a]:
                # (not done) drops the bootstrap term on terminal transitions.
                Q[s][a] += prob * (reward + gamma * V[next_state] * (not done))

    # Compute the greedy table once instead of on every policy lookup.
    greedy_actions = dict(enumerate(np.argmax(Q, axis=1)))

    def new_pi(s):
        return greedy_actions[s]

    return new_pi

In [4]:
def policy_iteration(P, gamma=1.0, thetha=1e-10):
    """Alternate policy evaluation and greedy improvement until the policy is stable.

    Parameters
    ----------
    P : mapping or sequence
        ``P[s][a]`` is an iterable of ``(prob, next_state, reward, done)``
        transition tuples. The action set is read from ``P[0].keys()``.
    gamma : float
        Discount factor forwarded to evaluation and improvement.
    thetha : float
        Convergence threshold forwarded to ``policy_evaluation``. (The
        spelling of the parameter name is kept for backward compatibility.)

    Returns
    -------
    tuple
        ``(V, pi)``: the value estimates from the last evaluation and the
        final policy as a callable ``pi(s) -> action``.
    """
    n_states = len(P)

    # Start from a uniformly random action per state; the table is built
    # once rather than on every policy lookup.
    random_actions = np.random.choice(tuple(P[0].keys()), n_states)
    initial_actions = dict(enumerate(random_actions))

    def pi(s):
        return initial_actions[s]

    while True:
        old_pi = {s: pi(s) for s in range(n_states)}

        # The original passed the undefined name `theta` here.
        V = policy_evaluation(pi, P, gamma, thetha)

        pi = policy_improvement(V, P, gamma)

        # Stop once improvement no longer changes any state's action.
        if old_pi == {s: pi(s) for s in range(n_states)}:
            break

    return V, pi

In [5]:
def value_iteration(P, gamma=1.0, theta=1e-10):
    """Run value iteration and derive the greedy policy from the result.

    Parameters
    ----------
    P : mapping or sequence
        ``P[s][a]`` is an iterable of ``(prob, next_state, reward, done)``
        transition tuples. States and actions are indexed from 0.
    gamma : float
        Discount factor applied to the value of the next state.
    theta : float
        Convergence threshold: sweeps stop once the largest absolute change
        of the value estimates is below it.

    Returns
    -------
    tuple
        ``(V, pi)``: the value estimates and a callable ``pi(s) -> action``
        that is greedy with respect to the last action-value table.
    """
    n_states = len(P)
    n_actions = len(P[0])
    V = np.zeros(n_states, dtype=np.float64)

    while True:
        Q = np.zeros((n_states, n_actions), dtype=np.float64)

        for s in range(n_states):
            for a in range(len(P[s])):
                for prob, next_state, reward, done in P[s][a]:
                    # (not done) drops the bootstrap term on terminal transitions.
                    Q[s][a] += prob * (reward + gamma * V[next_state] * (not done))

        best_values = np.max(Q, axis=1)
        if np.max(np.abs(V - best_values)) < theta:
            break

        V = best_values

    # Policy extraction and the return belong after the loop: inside it the
    # function returned after a single sweep, and returned None when the
    # very first sweep had already converged.
    greedy_actions = dict(enumerate(np.argmax(Q, axis=1)))

    def pi(s):
        return greedy_actions[s]

    return V, pi

In [6]:
def pure_exploitation(env, n_episodes=5000):
    """Bandit strategy that always takes the currently best-looking action.

    Parameters
    ----------
    env : object
        Must expose ``action_space.n`` and ``step(action)`` returning a
        4-tuple whose second element is the reward.
    n_episodes : int
        Number of single-step episodes to run.

    Returns
    -------
    tuple
        ``(name, returns, Qe, actions)``: the strategy label, the reward
        per episode, the action-value estimates after each episode (shape
        ``(n_episodes, n_actions)``) and the action taken per episode.
    """
    n_actions = env.action_space.n
    Q = np.zeros(n_actions)  # running mean reward per action
    N = np.zeros(n_actions)  # number of times each action was taken

    Qe = np.empty((n_episodes, n_actions))
    returns = np.empty(n_episodes)
    # The original re-assigned `returns` here, leaving `actions` undefined,
    # and used the `np.int` alias that newer NumPy releases no longer have.
    actions = np.empty(n_episodes, dtype=int)

    name = 'Pure exploitation'
    for e in tqdm(range(n_episodes), desc='Episodes for: '+name, leave=False):
        action = np.argmax(Q)
        _, reward, _, _ = env.step(action)

        # Incremental mean update of the chosen action's estimate.
        N[action] += 1
        Q[action] = Q[action] + (reward - Q[action]) / N[action]

        Qe[e] = Q
        returns[e] = reward
        actions[e] = action

    return name, returns, Qe, actions

In [7]:
def pure_exploration(env, n_episodes=5000):
    """Bandit strategy that picks a uniformly random action every episode.

    Parameters
    ----------
    env : object
        Must expose ``action_space.n`` and ``step(action)`` returning a
        4-tuple whose second element is the reward.
    n_episodes : int
        Number of single-step episodes to run.

    Returns
    -------
    tuple
        ``(name, returns, Qe, actions)``: the strategy label, the reward
        per episode, the action-value estimates after each episode (shape
        ``(n_episodes, n_actions)``) and the action taken per episode.
    """
    n_actions = env.action_space.n
    Q = np.zeros(n_actions)  # running mean reward per action
    N = np.zeros(n_actions)  # number of times each action was taken

    Qe = np.empty((n_episodes, n_actions))
    returns = np.empty(n_episodes)
    # The original re-assigned `returns` here, leaving `actions` undefined,
    # and used the `np.int` alias that newer NumPy releases no longer have.
    actions = np.empty(n_episodes, dtype=int)

    name = 'Pure Exploration'
    for e in tqdm(range(n_episodes), desc='Episode for: ' + name, leave=False):
        action = np.random.randint(len(Q))

        # Same bookkeeping as pure_exploitation; without it the returned
        # buffers would contain uninitialised memory.
        _, reward, _, _ = env.step(action)
        N[action] += 1
        Q[action] = Q[action] + (reward - Q[action]) / N[action]

        Qe[e] = Q
        returns[e] = reward
        actions[e] = action

    return name, returns, Qe, actions
        
    

In [9]:
def epsilon_greedy(env, epsilon=0.01, n_episodes=5000):
    """Bandit strategy that explores with fixed probability ``epsilon``.

    With probability ``1 - epsilon`` the action with the highest current
    estimate is taken; otherwise an action is drawn uniformly at random.

    Parameters
    ----------
    env : object
        Must expose ``action_space.n`` and ``step(action)`` returning a
        4-tuple whose second element is the reward.
    epsilon : float
        Exploration probability.
    n_episodes : int
        Number of single-step episodes to run.

    Returns
    -------
    tuple
        ``(name, returns, Qe, actions)``: the strategy label, the reward
        per episode, the action-value estimates after each episode (shape
        ``(n_episodes, n_actions)``) and the action taken per episode.
    """
    n_actions = env.action_space.n
    Q = np.zeros(n_actions)  # running mean reward per action
    N = np.zeros(n_actions)  # number of times each action was taken

    Qe = np.empty((n_episodes, n_actions))
    returns = np.empty(n_episodes)
    actions = np.empty(n_episodes, dtype=int)

    name = 'E-greedy {}'.format(epsilon)
    for e in tqdm(range(n_episodes), desc='Episodes for: ' + name, leave=False):
        if np.random.random() > epsilon:
            action = np.argmax(Q)
        else:
            action = np.random.randint(len(Q))

        _, reward, _, _ = env.step(action)

        # Incremental mean update of the chosen action's estimate.
        N[action] += 1
        Q[action] = Q[action] + (reward - Q[action]) / N[action]

        Qe[e] = Q
        returns[e] = reward
        actions[e] = action

    return name, returns, Qe, actions

In [10]:

## LINEARLY DECAYING EPSILON GREEDY

def lin_dec_epsilon_greedy(env, init_epsilon=1.0, min_epsilon=0.01,
                           decay_ratio=0.05, n_episodes=5000):
    """Epsilon-greedy bandit strategy with a linearly decaying epsilon.

    Epsilon falls linearly from ``init_epsilon`` to ``min_epsilon`` over the
    first ``n_episodes * decay_ratio`` episodes and stays at ``min_epsilon``
    afterwards.

    Parameters
    ----------
    env : object
        Must expose ``action_space.n`` and ``step(action)`` returning a
        4-tuple whose second element is the reward.
    init_epsilon : float
        Exploration probability in the first episode.
    min_epsilon : float
        Exploration probability once the decay has finished.
    decay_ratio : float
        Fraction of ``n_episodes`` over which epsilon decays.
    n_episodes : int
        Number of single-step episodes to run.

    Returns
    -------
    tuple
        ``(name, returns, Qe, actions)``: the strategy label, the reward
        per episode, the action-value estimates after each episode (shape
        ``(n_episodes, n_actions)``) and the action taken per episode.
    """
    n_actions = env.action_space.n
    Q = np.zeros(n_actions)  # running mean reward per action
    N = np.zeros(n_actions)  # number of times each action was taken

    Qe = np.empty((n_episodes, n_actions))
    returns = np.empty(n_episodes)
    actions = np.empty(n_episodes, dtype=int)

    # Loop-invariant, so computed once (the original also misspelled
    # `decay_ratio` here).
    decay_episodes = n_episodes * decay_ratio

    name = 'Lin e-greedy {} {} {}'.format(init_epsilon, min_epsilon, decay_ratio)
    for e in tqdm(range(n_episodes), desc='Episodes for: '+name, leave=False):
        # Fraction of the decay still remaining; a zero-length decay phase
        # means epsilon is already at its minimum (avoids dividing by zero).
        remaining = 1 - e / decay_episodes if decay_episodes > 0 else 0.0
        epsilon = remaining * (init_epsilon - min_epsilon) + min_epsilon
        epsilon = np.clip(epsilon, min_epsilon, init_epsilon)

        if np.random.random() > epsilon:
            action = np.argmax(Q)
        else:
            action = np.random.randint(len(Q))

        _, reward, _, _ = env.step(action)

        # Incremental mean update of the chosen action's estimate.
        N[action] += 1
        Q[action] = Q[action] + (reward - Q[action]) / N[action]

        Qe[e] = Q
        returns[e] = reward
        actions[e] = action

    return name, returns, Qe, actions

In [11]:
## EXPONENTIALLY DECAYING EPSILON GREEDY

def exp_dec_epsilon_greedy(env, init_epsilon=1.0, min_epsilon= 0.01,
                           decay_ratio = 0.1, n_episodes=5000):
    """Epsilon-greedy bandit strategy with an exponentially decaying epsilon.

    Epsilon follows a precomputed schedule that decays from ``init_epsilon``
    towards ``min_epsilon`` over the first ``int(n_episodes * decay_ratio)``
    episodes; the last scheduled value is held for the remaining episodes.

    Parameters
    ----------
    env : object
        Must expose ``action_space.n`` and ``step(action)`` returning a
        4-tuple whose second element is the reward.
    init_epsilon : float
        Exploration probability in the first episode.
    min_epsilon : float
        Floor added to every scheduled epsilon.
    decay_ratio : float
        Fraction of ``n_episodes`` over which epsilon decays.
    n_episodes : int
        Number of single-step episodes to run.

    Returns
    -------
    tuple
        ``(name, returns, Qe, actions)``: the strategy label, the reward
        per episode, the action-value estimates after each episode (shape
        ``(n_episodes, n_actions)``) and the action taken per episode.
    """
    n_actions = env.action_space.n
    Q = np.zeros(n_actions)  # running mean reward per action
    N = np.zeros(n_actions)  # number of times each action was taken

    Qe = np.empty((n_episodes, n_actions))
    returns = np.empty(n_episodes)
    actions = np.empty(n_episodes, dtype=int)

    # Clamp so the schedule never gets longer than the run itself.
    decay_episodes = min(max(int(n_episodes * decay_ratio), 0), n_episodes)
    rem_episodes = n_episodes - decay_episodes

    if decay_episodes > 0:
        # 0.01 / logspace(-2, 0) runs from 1 down to 0.01, i.e. an
        # exponential decay factor scaled into [min_epsilon, init_epsilon].
        epsilons = 0.01 / np.logspace(-2, 0, decay_episodes)
        epsilons *= init_epsilon - min_epsilon
        epsilons += min_epsilon
        epsilons = np.pad(epsilons, (0, rem_episodes), 'edge')
    else:
        # 'edge' padding cannot extend an empty array; with no decay phase
        # epsilon is simply at its minimum throughout.
        epsilons = np.full(n_episodes, min_epsilon, dtype=np.float64)

    name = 'Exp e-greedy {} {} {}'.format(init_epsilon, min_epsilon, decay_ratio)
    for e in tqdm(range(n_episodes), desc='Episodes for: '+ name, leave=False):
        # A uniform draw in [0, 1) is needed here; the original called
        # np.random.randint() without its required argument.
        if np.random.random() > epsilons[e]:
            action = np.argmax(Q)
        else:
            action = np.random.randint(len(Q))

        _, reward, _, _ = env.step(action)

        # Incremental mean update of the chosen action's estimate.
        N[action] += 1
        Q[action] = Q[action] + (reward - Q[action]) / N[action]

        Qe[e] = Q
        returns[e] = reward
        actions[e] = action

    return name, returns, Qe, actions