## Implementation of $RL^2$: Fast Reinforcement Learning via Slow Reinforcement Learning
The goal of this implementation is to use an RNN agent to be able to learn in itself a Reinforcement Learning algorithm that is able to perfect the exploration and exploitation dilemma of the Multi-Armed Bandit context (Meta-RL). The algorithm is tested in multiple environments and is tested for efficacy with other state of the art agents.

1. The implemenation uses a Recurrent Neural Network (RNN) which is a Gated Recurrent Unit (GRU) being fed the last action, reward and timestep.
2. The policy is optimized using the basic REINFORCE or Vanilla Policy Gradient.
3. Training is being performed for 20,000 tasks(environments) where each task is picked from a random and uniform distribution of means and standard deviations.!

Following image shows the visualization of our implementation.


![Meta RL Implementation](Implementation.jpg)

In [1]:
import random
import numpy as np

import torch
import torch.optim as optim
import torch.nn as nn
from torch.nn import functional as F
from torch.distributions.categorical import Categorical
from torch.autograd import Variable

from gym import spaces
from gym.utils import seeding
from tqdm import tqdm

#### The classes below are the environments that are used to test the efficacy of our model and the agents

In [2]:
class ArmedBanditsEnv():
    """
    The k-Armed Bandit Environment
    Initialization requires an array of length equals to k, where each item is
    a function which samples from a specified distribution.
    """
    
    def __init__(self, mean, stddev):
        assert len(mean.shape) == 2 
        assert len(stddev.shape) == 2
        self.num_actions = mean.shape[1] 
        self.num_experiments = mean.shape[0]
        
        self.mean = mean
        self.stddev = stddev
        
    def step(self, action):
    
        sampled_means = self.mean[np.arange(self.num_experiments),action]
        sampled_stddevs = self.stddev[np.arange(self.num_experiments),action]
        
        reward = np.random.normal(loc=sampled_means, scale=sampled_stddevs, size=(1,self.num_experiments))
        
        observation, done, info = 0, False, dict()
        return observation, reward, done, info
    
    def reset(self):
        return 0
    
    def reset_env(self):
        self.mean = np.random.normal(size = (1, self.num_actions))
        self.stddev = np.ones((1, self.num_actions))
        
    def render(self, mode='human', close=False):
        pass
    
    def _seed(self, seed=None):
        self.np_random, seed = seeding.np.random(seed)
        return [seed]
    
    def close(self):
        pass

class ArmedBanditsGaussian(ArmedBanditsEnv):
    def __init__(self, num_experiments=1, num_bandits=3):
        self.means = np.random.normal(size=(num_experiments, num_bandits))
        
        ArmedBanditsEnv.__init__(self, self.means, np.ones((num_experiments, num_bandits)))

In [3]:
class ArmBanditBernoulli():
    def __init__(self, num_actions):
        self.num_actions = num_actions
        self.probs = np.random.uniform(low = 0, high = 1, size = self.num_actions)

    def reset_env(self):
        self.probs = np.random.uniform(low = 0, high = 1, size = self.num_actions)

    def reset(self):
        return 0

    def step(self, action):
        reward = np.random.binomial(n = 1, p = self.probs[action], size=1)[0]
        observation, done, info = 0, False, dict()
        return observation, reward, done, info


In [4]:
class IndependentArms():
    def __init__(self, num_actions):
        self.num_actions = num_actions
        self.probs = np.random.uniform(low = 0, high = 1, size = self.num_actions)

    def reset_env(self):
        self.probs = np.random.uniform(low = 0, high = 1, size = self.num_actions)
    
    def step(self, action):
        reward = 1 if random.uniform(0,1) <= self.probs[action] else 0
        observation, done, info = 0, False, dict()
        return observation, reward, done, info

    def reset(self):
        return 0

#### Following classes define our RNN Agent. The agent is a neural network with the input layer a linear layer with an input size of 3 as it takes as input the last action, last reward and timestep a tuple $<a,r,t>$  and the second layer is a GRU with a hidden size of 48, and 2 hidden layers, the output layer is again a linear layer with an ouput size of the number of bandits or actions to take.

In [6]:
class RNNAgent(nn.Module):
    def __init__(self, hiddenSize, layers,  outputSize, inputSize = 3, timesteps = 1):
        super(RNNAgent, self).__init__()
        self.timesteps = timesteps
        self.hiddenSize = hiddenSize
        self.layers = layers
        self.inputSize = inputSize
        self.outputSize = outputSize

        self.layer1 = nn.Linear(inputSize, hiddenSize)
        self.layer2 = nn.GRU(hiddenSize, hiddenSize, layers)
        self.layer3 = nn.Linear(hiddenSize, outputSize)

        self.hiddenLayerReset()
    
    def hiddenLayerReset(self):
        self.hiddenLayer = self.hidden()
    
    def hidden(self):
        layers = self.layers
        hiddenSize = self.hiddenSize
        timesteps   = self.timesteps

        return torch.randn(layers, timesteps, hiddenSize)

    def forward(self, phi):

        inputs = torch.relu(self.layer1(phi))
        outputs, self.hiddenLayer = self.layer2(inputs, self.hiddenLayer)
        logits = torch.relu(self.layer3(outputs))
        action_probs = F.softmax(logits, dim = 2).view(-1)
        
        return action_probs

#### The input to the RNN is a tuple that of $<s,a,r,t>$ and is represented as $\phi$ Since for the armed bandit is stateless we'll ignore the state for now and that $\phi$ is a tuple of the values $<a,r,t>$

In [7]:
def createphi(state, action, reward, t):
    phi = [action, reward, t]
    phi = np.reshape(phi, (1,1,3)).astype(np.float32)
    
    return torch.tensor(phi)

#### This is our slow policy optimization algorithm which is a simple REINFORCE policy gradient

In [8]:
def slow_RL(optimizer, action_probs, rewards, gamma, action):
    #We are using REINFORCE policy gradient here
    Gt = 0
    discountedRewards = []
    policyLoss = []
    
    for i in reversed(rewards):
        Gt = i + gamma * Gt
        discountedRewards.insert(0, Gt)
    discountedRewards = torch.tensor(discountedRewards)

    for action_prob, r in zip(action_probs, discountedRewards):
        policyLoss.append(-action_prob * r)

    optimizer.zero_grad()
    policyLoss = torch.cat(policyLoss).sum()
    policyLoss.backward(retain_graph=True)
    optimizer.step()

#### This function is used to train our agent. The training parameters are fixed on 

In [9]:
def trainRL2(num_episodes, num_tasks, gamma, env, lr):

    MetaLearner = RNNAgent(hiddenSize = 48, outputSize = env.num_actions, layers = 2)
    optimizer = optim.Adam(MetaLearner.parameters(), lr)
    total_rewards = []
    
    for i in tqdm(range(num_tasks)):
        MetaLearner.hiddenLayerReset()
        action_probs = []
        actions = []
        rewards = []
        action = 0
        reward = 0
        env.reset_env()
        for t in range(num_episodes):

                phi = createphi(0, action, reward, t)
                action_prob = MetaLearner.forward(phi)
                action_probs.append(action_prob)
                
                action_distribution = Categorical(action_prob)
                action = action_distribution.sample()
                actions.append(action)
        
                _, reward, _, _ = env.step(action)
                rewards.append(reward)
            
        slow_RL(optimizer, action_probs, rewards, gamma, action)

In [10]:
num_tasks = 1000
num_episodes = 100
gamma = 0.9
lr = 0.01
num_bandits = 5
mean = np.random.normal(size=(1,num_bandits))
stddev = np.ones((1,num_bandits))
#env = ArmedBanditsEnv(mean, stddev)
env = IndependentArms(num_bandits)
trainRL2(num_episodes, num_tasks, gamma, env, lr)

100%|██████████| 1000/1000 [00:32<00:00, 31.07it/s]


In [27]:
# def testRL2(num_episodes, num_tasks, gamma, env, lr):

#     MetaLearner = RNNAgent(hiddenSize = 48, outputSize = env.num_actions, layers = 2)
#     optimizer = optim.Adam(MetaLearner.parameters(), lr)
#     total_rewards = []
#     optimal = np.argmax(env.mean, axis = 1)

#     for i in range(num_tasks):
#         MetaLearner.hiddenLayerReset()
#         action_probs = []
#         actions = []
#         rewards = []
#         action = 0
#         reward = 0
#         env.reset()
#         sub_optimal_pulls = 0
#         for t in range(num_episodes):

#                 phi = createphi(0, action, reward, t)
#                 action_prob = MetaLearner.forward(phi)
#                 action_probs.append(action_prob)
                
#                 action_distribution = Categorical(action_prob)
#                 action = action_distribution.sample()
#                 actions.append(action)
#                 if action.item() != optimal : sub_optimal_pulls+=1

#                 _, reward, _, _ = env.step(action)
#                 rewards.append(reward)
#         total_rewards.append(np.sum(rewards))    
#         slow_RL(optimizer, action_probs, rewards, gamma, action)
#         printresults (rewards, i, num_tasks, sub_optimal_pulls, total_rewards)

In [28]:
# def printresults(rewards, i, num_tasks, sub_optimal_pulls, total_rewards):
#     print("For the task {}/{} Avearge Reward {:.2f} and Sub-optimal Pulls {}" .format(i+1, num_tasks, np.mean(total_rewards), sub_optimal_pulls))

In [29]:
# num_bandits = 5
# mean = np.random.normal(size=(1,num_bandits))
# stddev = np.ones((1,num_bandits))
# env = ArmedBanditsEnv(mean, stddev)
# #env = IndependentArms(num_bandits)
# testRL2(10, 100, gamma, env, lr=0)

In [30]:
def testRL21(num_episodes, num_tasks, gamma, env, lr):

    MetaLearner = RNNAgent(hiddenSize = 48, outputSize = env.num_actions, layers = 2)
    optimizer = optim.Adam(MetaLearner.parameters(), lr)
    total_rewards = []
    #optimal = np.argmax(env.mean, axis = 1)

    for i in range(num_tasks):
        MetaLearner.hiddenLayerReset()
        action_probs = []
        actions = []
        rewards = []
        action = 0
        reward = 0
        env.reset()
        sub_optimal_pulls = 0
        
        for t in range(num_episodes):

                phi = createphi(0, action, reward, t)
                action_prob = MetaLearner.forward(phi)
                action_probs.append(action_prob)
                
                action_distribution = Categorical(action_prob)
                action = action_distribution.sample()
                actions.append(action)
                #if action.item() != optimal : sub_optimal_pulls+=1
                _, reward, _, _ = env.step(action)
                rewards.append(reward)
        total_rewards.append(np.sum(rewards))    
        slow_RL(optimizer, action_probs, rewards, gamma, action)
        printresults2(rewards, i, num_tasks, total_rewards)

In [31]:
def printresults2(rewards, i, num_tasks, total_rewards):
    print("For the task {}/{} Avearge Reward {:.2f}" .format(i+1, num_tasks, np.mean(total_rewards)))

In [32]:
num_bandits = 5
mean = np.random.normal(size=(1,num_bandits))
stddev = np.ones((1,num_bandits))
#env = ArmedBanditsEnv(mean, stddev)
env = IndependentArms(num_bandits)
testRL21(10, 1000, gamma, env, lr=0)

For the task 1/1000 Avearge Reward 5.00
For the task 2/1000 Avearge Reward 6.50
For the task 3/1000 Avearge Reward 6.33
For the task 4/1000 Avearge Reward 6.75
For the task 5/1000 Avearge Reward 6.00
For the task 6/1000 Avearge Reward 6.00
For the task 7/1000 Avearge Reward 6.00
For the task 8/1000 Avearge Reward 6.38
For the task 9/1000 Avearge Reward 6.33
For the task 10/1000 Avearge Reward 6.30
For the task 11/1000 Avearge Reward 6.36
For the task 12/1000 Avearge Reward 6.58
For the task 13/1000 Avearge Reward 6.69
For the task 14/1000 Avearge Reward 6.71
For the task 15/1000 Avearge Reward 6.73
For the task 16/1000 Avearge Reward 6.69
For the task 17/1000 Avearge Reward 6.82
For the task 18/1000 Avearge Reward 6.94
For the task 19/1000 Avearge Reward 7.00
For the task 20/1000 Avearge Reward 7.00
For the task 21/1000 Avearge Reward 7.00
For the task 22/1000 Avearge Reward 7.09
For the task 23/1000 Avearge Reward 7.04
For the task 24/1000 Avearge Reward 7.12
For the task 25/1000 Avea

In [33]:
class GreedyAgent:
    def __init__(self, reward_estimates):
        """
        Our agent takes as input the initial reward estimates.
        This estimates will be updated incrementally after each 
        interaction with the environment.
        """
        assert len(reward_estimates.shape) == 2
        
        self.num_bandits = reward_estimates.shape[1]
        self.num_experiments = reward_estimates.shape[0]
        self.reward_estimates = reward_estimates.astype(np.float64)
        self.action_count = np.zeros(reward_estimates.shape)
        
    def get_action(self):
        action = argmax(self.reward_estimates)
        self.action_count[np.arange(self.num_experiments), action] += 1
        return action
    
    def update_estimates(self, reward, action):
        # rew is a matrix with the obtained rewards from our previuos
        # action. Use this to update our estimates incrementally
        n = self.action_count[np.arange(self.num_experiments), action]
        prev_reward_estimates = self.reward_estimates[np.arange(self.num_experiments), action]
        
        # Update the reward estimates incementally
        self.reward_estimates[np.arange(self.num_experiments), action] = inc_avg(prev_reward_estimates,reward,n)

class UCBAgent(GreedyAgent):
    def __init__(self, reward_estimates, num_steps, c):
        GreedyAgent.__init__(self, reward_estimates)
        self.c = c
        self.num_steps = num_steps
        self.ucb = np.zeros((self.num_experiments, self.num_bandits))
        self.lcb = np.zeros((self.num_experiments, self.num_bandits))
        
    def get_action(self):
        for i in range(self.num_bandits):
            if self.action_count[:,i].any() == 0:
                action = i                                                        #Making sure each arm is played atleast once.
                self.action_count[np.arange(self.num_experiments), action] += 1
                return action
        for i in range(self.num_bandits):
            self.ucb[:, i] = self.reward_estimates[:, i] + self.c * np.sqrt(np.log(self.num_steps)/self.action_count[:,i])   #Calculating the UCBs
            self.lcb[:, i] = self.reward_estimates[:, i] - self.c * np.sqrt(np.log(self.num_steps)/self.action_count[:,i])   #Calculating the LCBs
        action = argmax(self.ucb)                                                 #Using the arm that has the highest UCB
        self.action_count[np.arange(self.num_experiments), action] += 1
        return action
    
    def update_estimates(self, reward, action):
        n = self.action_count[np.arange(self.num_experiments), action]
        prev_reward_estimates = self.reward_estimates[np.arange(self.num_experiments), action]
        self.reward_estimates[np.arange(self.num_experiments), action] = inc_avg(prev_reward_estimates,reward,n)
def inc_avg(prev_avg, new_val, n):
    return prev_avg + 1/n*(new_val - prev_avg)

def argmax(q_values):
    """
    Takes in a matrix of n*k q_values and returns the index
    of the item with the highest value for each row. 
    Breaks ties randomly.
    returns: vector of size n, where each item is the index of
    the highest value in q_values for each row.
    """
    # Generate a mask of the max values for each row
    mask = q_values == q_values.max(axis=1)[:, None]
    # Generate noise to be added to the ties
    r_noise = 1e-6*np.random.random(q_values.shape)
    # Get the argmax of the noisy masked values
    return np.argmax(r_noise*mask,axis=1)

In [34]:
def run_experiment(num_experiments=1000, num_steps=1000, num_actions=10, c=1):
    env = IndependentArms(num_actions)
    agent = UCBAgent(np.zeros((num_experiments, num_actions)), num_steps, c)
    averages = np.zeros((num_steps))
    optimality = np.zeros((num_steps))
    scores = np.zeros((num_experiments, num_steps+1))
    #optimal = np.argmax(env.mean, axis=1)
    
    for i in tqdm(range(num_steps)):
        action = agent.get_action()
        _, reward, _, _ = env.step(action)
        agent.update_estimates(reward, action)
        
        scores[:,i+1] = scores[:,i] + reward
        avg_score = np.mean(scores[:,i+1]/(i+1))
        averages[i] = avg_score
        
        #current_optimality = np.mean(action == optimal)
        optimality = 0

    return optimality, averages

In [35]:
num_experiments = 1000
num_steps = 10
c=1
agent = UCBAgent(np.zeros((num_experiments,num_bandits)), num_steps, c)
_, reward = run_experiment(num_experiments, num_steps, num_bandits, c)

print("Average Reward is {}".format(np.mean(reward)))

 50%|█████     | 5/10 [00:00<00:00, 5102.56it/s]


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()