## Imports

In [None]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F     
import torch.optim as optim
from torch.distributions import Categorical
from torch.autograd import Variable
import copy

import gym
from collections import namedtuple

import matplotlib.pyplot as plt

import pandas as pd

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


## Definitions

In [None]:
# One record per time step: the log-probability of the sampled action and the
# critic's value estimate for the state (None when no baseline is used).
SavedAction = namedtuple('SavedAction', 'log_prob value')

# Smallest float32 increment, used to keep divisions by a standard deviation finite.
eps = float(np.finfo(np.float32).eps)

## Policy Network


In [None]:
class Policy(nn.Module):
    """Two-layer perceptron that maps a state to action probabilities."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Layers are created in this order so seeded initialisation is stable.
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, output_size)

    def forward(self, state):
        """
        Compute the probability of each action.

        Param state is a torch tensor whose last dimension is input_size.
        Returns a tensor of probabilities that sum to 1 along the last dimension.
        """
        hidden = F.relu(self.linear1(state))
        logits = self.linear2(hidden)
        # Softmax over the last axis turns the logits into a distribution.
        return F.softmax(logits, dim=-1)

## Critic (state-value network used as the baseline)

In [None]:
class Critic(nn.Module):
    """Two-layer perceptron used as a value estimator.

    The learning rate is stored on the module so that whoever builds the
    optimizer for this network can read it from ``learning_rate``.
    """

    def __init__(self, input_size, hidden_size, output_size, learning_rate):
        super().__init__()
        # Layers are created in this order so seeded initialisation is stable.
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, output_size)
        self.learning_rate = learning_rate

    def forward(self, state):
        """
        Evaluate the network on a state.

        Param state is a torch tensor whose last dimension is input_size.
        Returns the raw (unactivated) output of the second linear layer.
        """
        hidden = F.relu(self.linear1(state))
        return self.linear2(hidden)

## Agent

In [None]:
class MCReinforceAgent:
    """Monte-Carlo REINFORCE agent with an optional learned value baseline.

    The policy is updated once per episode from the discounted returns passed
    to ``update``. When a ``Critic`` is supplied as ``baseline`` it is trained
    with one-step TD targets collected through ``update_critic``:

    * ``toggle_target=False``: TD targets and predictions are buffered during
      the episode and the critic takes one gradient step in ``update``.
    * ``toggle_target=True``: the critic takes a gradient step on every call
      to ``update_critic``, bootstrapping from a slowly tracking target
      network (soft update with rate ``tau``).
    """

    def __init__(self, env:gym.Env, hidden_size=256, learning_rate = 0.01, gamma=0.99, baseline: Critic = None, toggle_target = False):
        """
        env: environment with a 1-D observation space and a discrete action space
        hidden_size: width of the policy's hidden layer
        learning_rate: Adam step size for the policy
        gamma: discount factor used for the critic's TD targets
        baseline: optional Critic; its ``learning_rate`` attribute sets the critic's Adam step size
        toggle_target: if True, train the critic online against a target network
        """
        # Params
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.n
        self.gamma = gamma
        self.toggle_target = toggle_target

        # Policy Network
        self.policy = Policy(self.num_states, hidden_size, self.num_actions).to(device)

        self.baseline = baseline is not None
        # Buffered [TD targets, critic predictions] for the per-episode critic update.
        self.critic_td_losses = [[], []]
        if self.baseline:
            self.critic_network = baseline.to(device)
            # The critic optimizer has to step the critic's own weights;
            # handing it the policy parameters would leave the critic untrained.
            self.critic_optimizer = optim.Adam(
                self.critic_network.parameters(), lr=self.critic_network.learning_rate)
            self.critic_criterion = nn.MSELoss()
            if toggle_target:
                # The deep copy already carries identical weights; it only
                # needs to be frozen so no gradients are tracked for it.
                self.critic_target = copy.deepcopy(self.critic_network).to(device)
                self.tau = 0.01
                for target_param in self.critic_target.parameters():
                    target_param.requires_grad = False

        # Action & Reward Buffer (For a given episode)
        self.saved_actions = []  # Elements are SavedAction namedtuples
        self.rewards = []

        # Training
        self.optimizer = optim.Adam(self.policy.parameters(), lr=learning_rate)

    def get_action(self, state):
        """Sample an action for ``state`` and remember its log-probability.

        Appends a ``SavedAction`` (log-prob, value estimate or None) to
        ``self.saved_actions`` and returns the sampled action as a Python int.
        """
        state = torch.as_tensor(state, dtype=torch.float32, device=device)
        probs = self.policy(state)

        if self.baseline:
            # Only the numeric value is used as a baseline, so no graph is kept.
            with torch.no_grad():
                state_value = self.critic_network(state)
        else:
            state_value = None

        # create a categorical distribution over the list of probabilities of actions
        m = Categorical(probs)

        # and sample an action using the distribution
        action = m.sample()

        # save to action buffer
        self.saved_actions.append(SavedAction(m.log_prob(action), state_value))

        # the action to take
        return action.item()

    def update(self, returns):
        """Run the end-of-episode policy update (and the buffered critic update).

        returns: sequence of discounted returns, one per saved action, in time order.
        Clears the per-episode buffers afterwards.
        """
        if self.saved_actions:
            returns = torch.as_tensor(returns, dtype=torch.float32, device=device)

            if self.baseline:
                # Subtract the baseline on the raw return scale; the critic is
                # trained on unnormalised rewards, so normalising first would
                # compare quantities on different scales.
                values = torch.tensor(
                    [saved.value.item() for saved in self.saved_actions],
                    dtype=torch.float32, device=device)
                advantages = returns - values
            else:
                advantages = returns

            # Standardise for variance reduction. A single-step episode has an
            # undefined standard deviation (NaN), so it is left untouched.
            if advantages.numel() > 1:
                advantages = (advantages - advantages.mean()) / (advantages.std() + eps)

            policy_losses = [
                -saved.log_prob * advantage
                for saved, advantage in zip(self.saved_actions, advantages)
            ]
            loss = torch.stack(policy_losses).sum()

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        if self.baseline and not self.toggle_target:
            td_targets, predictions = self.critic_td_losses
            if td_targets:
                # Fit the critic on every transition of the episode, not only the first one.
                critic_loss = self.critic_criterion(torch.cat(predictions), torch.cat(td_targets))
                self.critic_optimizer.zero_grad()
                critic_loss.backward()
                self.critic_optimizer.step()

        self.critic_td_losses = [[], []]
        self.saved_actions = []
        self.rewards = []

    def update_critic(self,state,reward,next_state,done):
        """Record (or immediately apply) a one-step TD update for the critic.

        state, next_state: observations before and after the transition
        reward: scalar reward of the transition
        done: True when ``next_state`` is terminal, i.e. its value is zero
        """
        state = torch.as_tensor(state, dtype=torch.float32, device=device)
        next_state = torch.as_tensor(next_state, dtype=torch.float32, device=device)
        reward = torch.as_tensor([reward], dtype=torch.float32, device=device)

        # The bootstrap value is treated as a constant (semi-gradient TD).
        with torch.no_grad():
            if done:
                bootstrap = torch.zeros(1, device=device)
            elif self.toggle_target:
                bootstrap = self.critic_target(next_state)
            else:
                bootstrap = self.critic_network(next_state)
        td_target = reward + self.gamma * bootstrap

        # The prediction is V(s) alone; adding the reward here would cancel it
        # out of the TD error.
        prediction = self.critic_network(state)

        if not self.toggle_target:
            self.critic_td_losses[0].append(td_target)
            self.critic_td_losses[1].append(prediction)
            return

        loss = self.critic_criterion(prediction, td_target)
        self.critic_optimizer.zero_grad()
        loss.backward()
        self.critic_optimizer.step()

        # Soft (Polyak) update of the target network towards the online critic.
        with torch.no_grad():
            for target_param, param in zip(self.critic_target.parameters(), self.critic_network.parameters()):
                target_param.copy_(self.tau * param + (1 - self.tau) * target_param)
     

## Run Environment

In [None]:
def train(env: gym.Env, agent: MCReinforceAgent, running_reward_init, num_episodes, log_interval=10, terminate_on_threshold=False):
    '''
    Train `agent` on `env` for up to `num_episodes` episodes.

    running_reward_init: starting value of the exponentially smoothed episode reward
    log_interval: print progress every `log_interval` episodes (None disables logging)
    terminate_on_threshold: stop early once the smoothed reward exceeds
        env.spec.reward_threshold (ignored when the environment defines none)

    Returns episode wise returns: one discounted return (from the first step)
    per completed episode.
    '''
    running_reward = running_reward_init
    returns = []

    spec = getattr(env, 'spec', None)
    reward_threshold = getattr(spec, 'reward_threshold', None)

    for i_episode in range(num_episodes):

        # reset environment and episode reward
        state, _ = env.reset()
        ep_reward = 0

        for t in range(1, 10000):

            # select action from policy
            action = agent.get_action(state)

            # take the action
            next_state, reward, terminated, truncated, _ = env.step(action)
            if agent.baseline:
                # Only a genuinely terminal state has zero future value. A
                # time-limit truncation must still bootstrap from next_state,
                # so the critic is told about termination only.
                agent.update_critic(state, reward, next_state, terminated)
            agent.rewards.append(reward)
            ep_reward += reward
            state = next_state
            if terminated or truncated:
                break

        # update cumulative reward
        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward

        # Calculate Returns: accumulate backwards in time, then restore time
        # order with a single reverse (avoids quadratic front insertions).
        R = 0
        returns_for_agent = []
        for r in reversed(agent.rewards):
            R = r + agent.gamma * R
            returns_for_agent.append(R)
        returns_for_agent.reverse()
        returns.append(R)
        agent.update(returns_for_agent)

        # log results
        if log_interval is not None and i_episode % log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                i_episode, ep_reward, running_reward))

        # check if we have "solved" the problem
        if (terminate_on_threshold and reward_threshold is not None
                and running_reward > reward_threshold):
            print("Solved! Running reward is now {} and "
                  "the last episode runs to {} time steps!".format(running_reward, t))
            break
    return returns


## Train and Plot

Train each hyperparameter set with 5 random seeds and plot the mean return (± one standard deviation) per episode.

In [None]:
def test5(env: gym.Env, agent_params: np.ndarray, critic_params: np.ndarray, running_reward_init, num_episodes, log_interval=10, terminate_on_threshold=False, plot=True, savefile=None, toggle_target = False):
    '''
    Train every hyperparameter set with 5 random seeds and summarise the runs.

    agent_params = sequence of (hidden_size, learning_rate), one per parameter set
    critic_params = matching sequence; each entry is None for no critic,
        (hidden_size, learning_rate) otherwise
    savefile: optional CSV path for the per-set hyperparameters and average regret
    plot: if True, plot the mean return curve of every parameter set

    Returns a list (one entry per parameter set) of 5 per-seed return lists.

    Note: the per-episode averages require all 5 runs to have the same length,
    which early stopping via terminate_on_threshold does not guarantee.
    '''
    num_seeds = 5
    gamma = 0.99
    env_id = env.spec.id

    # Reference discounted return per episode used for the regret; only
    # defined for the two environments this experiment targets.
    if env_id == 'Acrobot-v1':
        reference_return = -(1 - gamma**100) / 0.01
    elif env_id == 'CartPole-v1':
        reference_return = (1 - gamma**500) / 0.01
    else:
        reference_return = None

    agent_lrs = []
    agent_hls = []
    critic_lrs = []
    critic_hls = []
    avg_regrets = []
    labels = []
    returns_all_params = []

    for i, agent_cfg in enumerate(agent_params):
        print(f"Parameter Set {i+1}:")
        agent_hidden_size = agent_cfg[0]
        agent_learning_rate = agent_cfg[1]
        agent_lrs.append(agent_learning_rate)
        agent_hls.append(agent_hidden_size)

        critic_cfg = critic_params[i]
        label = f"policy(h={agent_hidden_size}, lr={agent_learning_rate})"
        if critic_cfg is not None:
            critic_hidden_size = critic_cfg[0]
            critic_learning_rate = critic_cfg[1]
            critic_lrs.append(critic_learning_rate)
            critic_hls.append(critic_hidden_size)
            label += f", critic(h={critic_hidden_size}, lr={critic_learning_rate})"
        else:
            critic_lrs.append(np.nan)
            critic_hls.append(np.nan)
            label += ", no baseline"
        labels.append(label)

        returns = []
        for _ in range(num_seeds):
            seed = int(np.random.randint(low=1, high=1034300))
            env.reset(seed=seed)
            torch.manual_seed(seed)
            if critic_cfg is not None:
                critic = Critic(
                    env.observation_space.shape[0], hidden_size=critic_hidden_size,
                    output_size=1, learning_rate=critic_learning_rate)
            else:
                critic = None
            agent = MCReinforceAgent(env, hidden_size=agent_hidden_size,
                                     learning_rate=agent_learning_rate, gamma=gamma,
                                     baseline=critic, toggle_target=toggle_target)
            returns.append(train(env, agent, running_reward_init, num_episodes,
                                 log_interval, terminate_on_threshold))

        returns_all_params.append(returns)

        # Regret = summed gap between the reference return and the seed-averaged return.
        if reference_return is not None:
            mean_returns = np.mean(np.array(returns), axis=0)
            avg_regrets.append(np.sum(reference_return - mean_returns))
        else:
            # Keep every column the same length so the summary can still be saved.
            avg_regrets.append(np.nan)

    if savefile is not None:
        # Saving is best-effort: a failed write must not discard the training results.
        try:
            df = pd.DataFrame({
                "agent_lr": agent_lrs,
                "agent_hl": agent_hls,
                "critic_lr": critic_lrs,
                "critic_hl": critic_hls,
                "avg_regrets": avg_regrets,
            })
            df.to_csv(savefile)
        except (OSError, ValueError) as e:
            print(f"Error Saving!{e}")

    if plot:
        # One label per parameter set, derived from its own configuration, and
        # a title that names the environment actually being trained.
        plot_returns(returns_all_params, title=f"{env_id} with REINFORCE", labels=labels)

    return returns_all_params


def plot_returns(returns_all_params,labels,title:str = None):
    """Plot the mean return per episode for each parameter set.

    returns_all_params: list with one entry per parameter set; each entry is a
        list of equally long per-run return sequences
    labels: legend label for each parameter set, in the same order
    title: optional figure title

    Each curve is the mean across runs, with a shaded band of one standard
    deviation on either side.
    """
    num_episodes = len(returns_all_params[0][0])
    episodes = np.arange(1, num_episodes + 1)

    plt.figure()
    for idx, runs in enumerate(returns_all_params):
        runs = np.array(runs)
        mean_curve = np.mean(runs, axis=0)
        spread = np.std(runs, axis=0)
        plt.plot(episodes, mean_curve, label=labels[idx])
        lower = mean_curve - spread
        upper = mean_curve + spread
        # The band carries no legend entry of its own.
        plt.fill_between(episodes, lower, upper, alpha=0.5, label=None)

    plt.xlabel('Episode')
    plt.ylabel('Average return per episode')
    if title is not None:
        plt.title(title)
    plt.legend()
    plt.show()


### Hyperparameter Tuning:


In [None]:
## Without baseline
# Each policy entry is [hidden_size, learning_rate].
policy_params = [[64, 3e-2], [64, 3e-3], [64, 3e-4]]
no_critic_params = [None] * len(policy_params)

# With baseline
# Each critic entry is [hidden_size, learning_rate]; both hidden sizes sweep
# the same three learning rates.
critic_params_ = [[32, 3e-2], [32, 3e-3], [32, 3e-4],
                  [64, 3e-2], [64, 3e-3], [64, 3e-4]]

# Full grid: every critic configuration is paired with every policy
# configuration. Each critic entry is repeated once per policy entry, while
# the policy list is tiled once per critic entry, so the two lists line up.
critic_params = [cfg for cfg in critic_params_ for _ in policy_params]
policy_params_baseline = policy_params * len(critic_params_)



### Without Baseline

In [None]:
# CartPole sweep over the policy learning rates, no critic.
env = gym.make('CartPole-v1')
test5(
    env,
    policy_params,
    no_critic_params,
    running_reward_init=0,
    num_episodes=1000,
    log_interval=100,
    savefile='cp_wb.csv',
    plot=True,
)

In [None]:
# Acrobot sweep over the policy learning rates, no critic.
env = gym.make('Acrobot-v1')
test5(
    env,
    policy_params,
    no_critic_params,
    running_reward_init=-500,
    num_episodes=2000,
    log_interval=100,
    savefile='ac_wb.csv',
    plot=True,
)

### With Baseline

In [None]:
# CartPole sweep over the paired policy/critic grid.
env = gym.make('CartPole-v1')
test5(
    env,
    policy_params_baseline,
    critic_params,
    running_reward_init=0,
    num_episodes=2000,
    log_interval=100,
    savefile='cp_b.csv',
    plot=True,
)

In [None]:
# Acrobot sweep over the paired policy/critic grid.
env = gym.make('Acrobot-v1')
test5(
    env,
    policy_params_baseline,
    critic_params,
    running_reward_init=-500,
    num_episodes=2000,
    log_interval=100,
    savefile='ac_b.csv',
    plot=True,
)

### Test Obtained Params

In [None]:
# Final comparison on Acrobot with the same policy settings twice:
# the first parameter set uses a critic baseline, the second uses none.
# NOTE(review): the result name says "cart" but the environment is Acrobot-v1.
env = gym.make('Acrobot-v1')
returns_final_cart = test5(
    env,
    [[64, 3e-3], [64, 3e-3]],
    [[32, 3e-4], None],
    running_reward_init=-500,
    num_episodes=1000,
    log_interval=10,
    savefile=None,
    plot=True,
    toggle_target=False,
)

