# Assignment 2: Actor-Critic & DQN algorithms

In [None]:
import collections
import datetime
import statistics
import time
from typing import Any, List, Sequence, Tuple

import gym
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_probability as tfp
import tqdm
from tensorflow.keras import layers

## 1. Defining Model and Agent classes

### 1.1 Actor Critic model

In [None]:
class ActorCriticModel(keras.Model):
    """Shared-trunk network with a policy head and a state-value head.

    Three fully connected ReLU layers feed two output layers: a softmax layer
    with one probability per action, and a single linear unit for the state
    value.

    Args:
        action_size: Number of discrete actions (width of the policy head).
        n_hidden1: Units in the first hidden layer.
        n_hidden2: Units in the second hidden layer.
        n_hidden3: Units in the third hidden layer. Defaults to 128, the
            width that used to be hard-coded.
    """

    def __init__(self, action_size, n_hidden1=1024, n_hidden2=512, n_hidden3=128):
        super().__init__()

        # Shared hidden layers.
        self.fc1 = layers.Dense(n_hidden1, activation='relu')
        self.fc2 = layers.Dense(n_hidden2, activation='relu')
        self.fc3 = layers.Dense(n_hidden3, activation='relu')

        # Policy head: softmax, so callers receive probabilities, not logits.
        self.pi_out = layers.Dense(action_size, activation='softmax')
        # Value head: one unbounded scalar per input state.
        self.v_out = layers.Dense(1)

    def call(self, state):
        """Return ``(pi, v)`` for a batch of states.

        Args:
            state: Batched state tensor.

        Returns:
            pi: Output of the softmax policy head (action probabilities).
            v: Output of the value head (one value per batch row).
        """
        features = self.fc3(self.fc2(self.fc1(state)))
        return self.pi_out(features), self.v_out(features)

### 1.2 Agent Class

In [None]:
class Agent:
    """One-step actor-critic agent built on ``ActorCriticModel``.

    ``sample_action`` remembers the action it returned in ``self.action``;
    ``learn`` then scores that action for the transition it is given, so the
    two must be called in that order for every environment step.

    Args:
        action_size: Number of discrete actions.
        lr: Learning rate of the Adam optimizer.
        gamma: Discount factor used in the TD error.
    """

    def __init__(self, action_size, lr=0.001, gamma=0.99):
        self.gamma = gamma
        # Most recent sample returned by sample_action(); consumed by learn().
        self.action = None
        self.ac_model = ActorCriticModel(action_size=action_size)
        self.ac_model.compile(tf.keras.optimizers.Adam(learning_rate=lr))

    def sample_action(self, state):
        """Sample one action from the policy for ``state``.

        Args:
            state: A single (unbatched) observation.

        Returns:
            The sampled action index, taken from the first batch entry.
        """
        state = tf.convert_to_tensor([state])
        pi, _ = self.ac_model(state)

        action_dist = tfp.distributions.Categorical(probs=pi)
        sample = action_dist.sample()
        self.action = sample

        return sample.numpy()[0]

    def learn(self, state, reward, next_state, done):
        """Update the network from one transition ``(s, a, r, s')``.

        The action ``a`` is the one stored by the last ``sample_action`` call.

        Args:
            state: Observation before the action.
            reward: Reward received for the action.
            next_state: Observation after the action.
            done: Truthy when ``next_state`` is terminal; this removes the
                bootstrapped ``gamma * V(s')`` term from the TD error.

        Raises:
            RuntimeError: If no action has been sampled yet.
        """
        if self.action is None:
            raise RuntimeError("sample_action() must be called before learn()")

        state = tf.convert_to_tensor([state], dtype=tf.float32)
        next_state = tf.convert_to_tensor([next_state], dtype=tf.float32)
        reward = tf.convert_to_tensor(reward, dtype=tf.float32)

        # Only one gradient is taken, so a non-persistent tape is enough and
        # its resources are released right after tape.gradient().
        with tf.GradientTape() as tape:
            pi, v_s = self.ac_model(state)
            _, v_s_next = self.ac_model(next_state)

            v_s = tf.squeeze(v_s)
            v_s_next = tf.squeeze(v_s_next)

            log_prob = tfp.distributions.Categorical(probs=pi).log_prob(self.action)

            # TD error; (1 - done) drops the bootstrap term on terminal steps.
            delta = reward + (self.gamma * v_s_next * (1 - int(done))) - v_s

            # In the actor term the TD error is only a weight on the
            # log-probability. Without stop_gradient the actor loss would also
            # push gradients into the critic through delta.
            loss_a = -log_prob * tf.stop_gradient(delta)
            # The critic minimises the squared TD error (gradient still flows
            # through both V(s) and V(s'), as before).
            loss_c = delta ** 2
            loss_total = loss_a + loss_c

        gradient = tape.gradient(loss_total, self.ac_model.trainable_variables)
        self.ac_model.optimizer.apply_gradients(
            zip(gradient, self.ac_model.trainable_variables))

## 2. Training Actor-Critic algorithm

In [None]:
"""### Train the Network"""

avg_list = []

ENVIRONMENTS = ['Acrobot-v1', 'CartPole-v1', 'MountainCar-v0']

env = gym.make('CartPole-v1')
# Initializing Agent
agent = Agent(lr=1e-5, action_size=env.action_space.n)
# Maximum number of training episodes
episodes = 1500

filename = 'cartpole_1e-5_1024x512_1800games.png'

figure_file = 'plots/' + filename

reward_list = []          # total reward of every finished episode
average_reward_list = []  # running mean over the last (up to) 100 episodes
start = time.time()

for episode in range(1, episodes + 1):
    state, _ = env.reset()
    done = False
    episode_reward = 0
    while not done:
        action = agent.sample_action(state)
        # env.step returns (obs, reward, terminated, truncated, info). The
        # episode must stop on either flag, otherwise it keeps running past
        # the environment's time limit.
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        episode_reward += reward
        # Only a real termination removes the bootstrap term; a time-limit
        # truncation still has a meaningful next-state value.
        agent.learn(state, reward, next_state, terminated)
        state = next_state
    reward_list.append(episode_reward)
    average_reward_list.append(np.mean(reward_list[-100:]))

    if episode % 10 == 0:
        avg_reward = np.mean(reward_list[-10:])
        print('Episode ', episode, 'Reward %f' % episode_reward, 'Average Reward %f' % avg_reward)

    # Stop once the 100-episode average clears the threshold. The previous
    # `if episode % 100:` was truthy on every episode EXCEPT multiples of
    # 100; check on every episode as soon as a full window exists instead.
    if len(reward_list) >= 100:
        avg_100 = np.mean(reward_list[-100:])
        if avg_100 > 495.0:
            print('Stopped at Episode ', episode-100)
            break

end = time.time()
print(f"Total training time for AC algorithm: {end-start}s")

### 2.1 Plotting Reward curve

In [None]:
"""
Plotting total reward curve (moving average over 100 episodes)
"""
# Take the number of points from the recorded data. A hard-coded count only
# matches one particular training run and makes plt.plot fail with mismatched
# x/y lengths for any other run.
episodes = len(average_reward_list)

# Variance of the rewards in the 100-episode window starting at each episode
# (the window shrinks towards the end of the run).
avg_var = [np.var(reward_list[i:100 + i]) for i in range(episodes)]

episode_axis = np.arange(episodes)

# Plot of total reward vs episode
print(f"Maximum Reward obtained = {np.max(average_reward_list)} at episode {np.argmax(average_reward_list)+1}")
plt.plot(episode_axis, average_reward_list)
plt.title("Running Avg of 100 rewards")
plt.xlabel("Episodes")
plt.ylabel("Total Reward")
plt.show()

# Plot of Reward Variance vs episode
plt.plot(episode_axis, avg_var)
plt.title("Reward Variance Plot")
plt.xlabel("Episodes")
plt.ylabel("Variance Of Rewards")
plt.show()

### 2.2 Rendering an episode and saving as a GIF file

In [None]:
# Render an episode and save as a GIF file

# display = Display(visible=0, size=(400, 300))
# display.start()
from PIL import Image


def render_episode(env: gym.Env, model: tf.keras.Model, max_steps: int):
    """Play one greedy episode and collect rendered frames.

    Uses the same Gym API as the training loop: ``reset()`` returns
    ``(observation, info)``, ``step()`` returns five values and ``render()``
    takes no ``mode`` argument, so ``env`` must have been created with
    ``render_mode='rgb_array'``.

    Args:
        env: Environment whose ``render()`` returns an RGB array.
        model: Model returning ``(action_probs, value)`` for a batched state.
        max_steps: Maximum number of environment steps to play.

    Returns:
        A list of PIL images: the frame after reset plus one frame every
        10 steps.
    """
    # Reset first so the initial frame shows the episode's real start state.
    state, _ = env.reset()
    images = [Image.fromarray(env.render())]

    for i in range(1, max_steps + 1):
        batched_state = tf.expand_dims(tf.constant(state, dtype=tf.float32), 0)
        action_probs, _ = model(batched_state)
        # Greedy action: pick the most probable one.
        action = int(np.argmax(np.squeeze(action_probs)))
        state, _, terminated, truncated, _ = env.step(action)

        # Render screen every 10 steps
        if i % 10 == 0:
            images.append(Image.fromarray(env.render()))

        if terminated or truncated:
            break

    return images


# Save GIF image. A dedicated environment is created because frames are only
# produced when render_mode is chosen at construction time.
render_env = gym.make('CartPole-v1', render_mode='rgb_array')
images = render_episode(render_env, agent.ac_model, 200)
render_env.close()
image_file = 'cartpole-v1.gif'
# loop=0: loop forever, duration=1: play each frame for 1ms
images[0].save(
    image_file, save_all=True, append_images=images[1:], loop=0, duration=1)

# import tensorflow_docs.vis.embed as embed
# embed.embed_file(image_file)

In [None]:
"""# Full Step Returns"""

# Wrap OpenAI Gym's `env.step` call as an operation in a TensorFlow function.
# This would allow it to be included in a callable TensorFlow graph.

def env_step(action: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
  """Step the module-level ``env`` and return NumPy values.

  ``env.step`` returns ``(obs, reward, terminated, truncated, info)`` in the
  Gym API used by the rest of this file; the episode is reported as finished
  when either flag is set.

  Args:
    action: Action to apply to ``env``.

  Returns:
    ``(state, reward, done)`` as a float32 array, an int32 scalar array and an
    int32 scalar array (1 when the episode ended, else 0).
  """
  state, reward, terminated, truncated, _ = env.step(action)
  done = terminated or truncated
  return (state.astype(np.float32),
          np.array(reward, np.int32),
          np.array(done, np.int32))


def tf_env_step(action: tf.Tensor) -> List[tf.Tensor]:
  """Call ``env_step`` through ``tf.numpy_function``.

  Args:
    action: Action tensor forwarded to ``env_step``.

  Returns:
    Three tensors ``[state, reward, done]`` typed float32, int32 and int32.
  """
  output_dtypes = [tf.float32, tf.int32, tf.int32]
  return tf.numpy_function(env_step, [action], output_dtypes)

# Added to the standard deviation when returns are standardized below, so the
# division stays finite when every return in an episode is identical.
eps = 1e-2
# Collects one running-average reward curve per training run (appended to by
# the training script further down).
Avg_List = []

def run_episode(
    initial_state: tf.Tensor,
    model: tf.keras.Model,
    max_steps: int) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
  """Run a single episode and collect the data needed for one update.

  Args:
    initial_state: Unbatched first observation of the episode.
    model: Model returning ``(action_probabilities, value)`` for a batched
      state. The first output must already be a probability distribution, as
      produced by the softmax policy head of ``ActorCriticModel``.
    max_steps: Maximum number of environment steps.

  Returns:
    ``(action_probs, values, rewards)``: for every step taken, the probability
    of the chosen action, the critic value and the reward.
  """

  action_probs = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
  values = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
  rewards = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)

  initial_state_shape = initial_state.shape
  state = initial_state

  for t in tf.range(max_steps):
    # Convert state into a batched tensor (batch size = 1)
    state = tf.expand_dims(state, 0)

    # The policy head applies softmax itself, so these are probabilities.
    action_probs_t, value = model(state)

    # tf.random.categorical expects (unnormalized) log-probabilities. Feeding
    # it the probabilities directly, and applying softmax to them a second
    # time, would flatten the policy: softmax over values in [0, 1] can never
    # put more than e / (e + 1) ~= 0.73 on one of two actions.
    action = tf.random.categorical(tf.math.log(action_probs_t), 1)[0, 0]

    # Store critic values
    values = values.write(t, tf.squeeze(value))

    # Store the probability of the action chosen (the loss takes the log)
    action_probs = action_probs.write(t, action_probs_t[0, action])

    # Apply action to the environment to get next state and reward
    state, reward, done = tf_env_step(action)
    state.set_shape(initial_state_shape)

    # Store reward
    rewards = rewards.write(t, reward)

    if tf.cast(done, tf.bool):
      break

  action_probs = action_probs.stack()
  values = values.stack()
  rewards = rewards.stack()

  return action_probs, values, rewards

class FullReturnsAgentTF:
  """Actor-critic agent updated once per episode from full or n-step returns.

  Args:
    action_size: Number of discrete actions.
    n_hidden1: Units in the first hidden layer of the network.
    n_hidden2: Units in the second hidden layer of the network.
    n_hidden3: Accepted for interface compatibility; currently not forwarded
      to the network.
    lr: Learning rate of the Adam optimizer.
    gamma: Discount factor stored on the agent (``train_step`` receives its
      own ``gamma`` argument).
    seed: Seed passed to ``np.random.seed``.
  """

  # Kept as a class attribute for backward compatibility; compute_loss uses
  # the mean-squared-error loss created in __init__ instead.
  huber_loss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)

  def __init__(self, action_size, n_hidden1=1024, n_hidden2=512, n_hidden3=None, lr=0.001, gamma=0.99, seed=85):
    self.gamma = gamma
    self.ac_model = ActorCriticModel(action_size=action_size, n_hidden1=n_hidden1, n_hidden2=n_hidden2)
    self.ac_model.compile(tf.keras.optimizers.Adam(learning_rate=lr))
    self.mse = tf.keras.losses.MeanSquaredError()
    np.random.seed(seed)

  def get_expected_return(self,
      rewards: tf.Tensor,
      gamma: float,
      standardize: bool = True) -> tf.Tensor:
    """Compute the discounted reward-to-go for every timestep.

    Args:
      rewards: 1-D tensor of per-step rewards in time order.
      gamma: Discount factor.
      standardize: If true, subtract the mean and divide by the standard
        deviation plus the module-level ``eps``.

    Returns:
      Float32 tensor with one return per timestep.
    """

    n = tf.shape(rewards)[0]
    returns = tf.TensorArray(dtype=tf.float32, size=n)

    # Start from the end of `rewards` and accumulate reward sums
    # into the `returns` array
    rewards = tf.cast(rewards[::-1], dtype=tf.float32)
    discounted_sum = tf.constant(0.0)
    discounted_sum_shape = discounted_sum.shape
    for i in tf.range(n):
      reward = rewards[i]
      discounted_sum = reward + gamma * discounted_sum
      discounted_sum.set_shape(discounted_sum_shape)
      returns = returns.write(i, discounted_sum)
    returns = returns.stack()[::-1]

    if standardize:
      returns = ((returns - tf.math.reduce_mean(returns)) /
                (tf.math.reduce_std(returns) + eps))

    return returns

  def get_expected_return_n_step(self, rewards: tf.Tensor, values: tf.Tensor, gamma: float, no_of_steps: int, standardize: bool = True) -> tf.Tensor:
    """Compute n-step bootstrapped returns for every timestep.

    For step ``t`` of an episode with ``T`` steps and ``N = no_of_steps``::

        G_t = r_t + gamma*r_{t+1} + ... + gamma^(N-1)*r_{t+N-1}
              + gamma^N * V(s_{t+N})          if t + N < T
        G_t = r_t + gamma*r_{t+1} + ...       otherwise (no bootstrap)

    The previous implementation walked the reversed reward sequence but
    indexed the un-reversed ``values`` with the reversed index and subtracted
    a reward that had not entered the running sum yet, so it did not produce
    these targets.

    Args:
      rewards: 1-D tensor of per-step rewards in time order.
      values: 1-D tensor of critic values ``V(s_t)`` in time order.
      gamma: Discount factor.
      no_of_steps: Window length ``N`` (>= 1). The value 1000 keeps its
        previous meaning of "use the full return".
      standardize: If true, subtract the mean and divide by the standard
        deviation plus the module-level ``eps``.

    Returns:
      Float32 tensor with one return per timestep.

    Raises:
      ValueError: If ``no_of_steps`` is an int smaller than 1.
    """
    if isinstance(no_of_steps, int) and no_of_steps < 1:
      raise ValueError("no_of_steps must be >= 1")

    rewards = tf.cast(rewards, dtype=tf.float32)
    n = tf.shape(rewards)[0]

    # R_t: discounted reward-to-go until the end of the episode.
    full_returns = self.get_expected_return(rewards, gamma, standardize=False)

    if no_of_steps == 1000:
      returns = full_returns
    else:
      # G_t = R_t + gamma^N * (V(s_{t+N}) - R_{t+N}); the bracket replaces
      # everything after the window by the critic's estimate. The bootstrap
      # value is a target, so no gradient may flow through it.
      tail = tf.stop_gradient(values) - full_returns
      # Shift left by N and pad with zeros: steps whose window reaches the
      # end of the episode keep the plain reward-to-go.
      pad_len = tf.minimum(no_of_steps, n)
      shifted_tail = tf.concat(
          [tail[no_of_steps:], tf.zeros([pad_len], dtype=tf.float32)], axis=0)
      returns = full_returns + (gamma ** no_of_steps) * shifted_tail

    if standardize:
      returns = ((returns - tf.math.reduce_mean(returns)) /
                (tf.math.reduce_std(returns) + eps))

    return returns

  def compute_loss(self,
      action_probs: tf.Tensor,
      values: tf.Tensor,
      returns: tf.Tensor) -> tf.Tensor:
    """Compute the combined actor-critic loss.

    Args:
      action_probs: Probabilities of the actions that were taken.
      values: Critic values for the visited states.
      returns: Return targets for the visited states.

    Returns:
      Scalar tensor: actor loss plus critic loss.
    """

    advantage = returns - values

    action_log_probs = tf.math.log(action_probs)
    actor_loss = -tf.math.reduce_sum(action_log_probs * advantage)
    # MeanSquaredError averages over the steps; multiplying by the number of
    # steps turns it into a sum, matching the summed actor loss.
    num_steps = tf.cast(tf.shape(values)[0], tf.float32)
    critic_loss = num_steps * self.mse(values, returns)

    return actor_loss + critic_loss

  @tf.function
  def train_step(self,
      initial_state: tf.Tensor,
      model: tf.keras.Model,
      optimizer: tf.keras.optimizers.Optimizer,
      gamma: float,
      max_steps_per_episode: int,
      no_of_steps: int = 7) -> tf.Tensor:
    """Run one episode and apply one gradient update.

    Args:
      initial_state: Unbatched first observation of the episode.
      model: Model to run and update.
      optimizer: Optimizer applying the gradients to ``model``.
      gamma: Discount factor for the returns.
      max_steps_per_episode: Step limit passed to ``run_episode``.
      no_of_steps: Window length of the n-step return (default 7, the value
        that used to be hard-coded).

    Returns:
      Sum of the rewards collected in the episode.
    """

    with tf.GradientTape() as tape:

      # Run the model for one episode to collect training data
      action_probs, values, rewards = run_episode(
          initial_state, model, max_steps_per_episode)

      # Calculate expected returns
      returns = self.get_expected_return_n_step(
          rewards, values, gamma, no_of_steps)

      # Convert training data to appropriate TF tensor shapes
      action_probs, values, returns = [
          tf.expand_dims(x, 1) for x in [action_probs, values, returns]]

      # Calculating loss values to update our network
      loss = self.compute_loss(action_probs, values, returns)

    # Compute the gradients from the loss
    grads = tape.gradient(loss, model.trainable_variables)

    # Apply the gradients to the model's parameters
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    episode_reward = tf.math.reduce_sum(rewards)

    return episode_reward

# Commented out IPython magic to ensure Python compatibility.
# %time
env = gym.make('CartPole-v1')

# Initializing Agent
agent = FullReturnsAgentTF(lr=1e-4, action_size=env.action_space.n)
# Number of episodes
episodes = 500
tf.compat.v1.reset_default_graph()

reward_list = []
average_reward_list = []
begin_time = datetime.datetime.now()

avg_rew_prev = 0

min_episodes_criterion = 100
max_episodes = 5000 # their given value = 10000
# NOTE(review): with at most 100 steps per episode the episode reward on
# CartPole cannot exceed 100, so neither the model-saving condition nor the
# early-stop threshold below can trigger - confirm this limit is intended.
max_steps_per_episode = 100

# Cartpole-v1 is considered solved if average reward is >= 495 over 100
# consecutive trials
reward_threshold = 495
running_reward = 0

# Discount factor for future rewards
gamma = 0.99

# Keep last episodes reward
episodes_reward: collections.deque = collections.deque(maxlen=min_episodes_criterion)

with tqdm.trange(max_episodes) as t:
  for i in t:
    # env.reset() returns (observation, info) in the Gym API used by the
    # rest of this file; only the observation is a valid tensor value.
    observation, _ = env.reset()
    initial_state = tf.constant(observation, dtype=tf.float32)
    episode_reward = int(agent.train_step(
        initial_state, agent.ac_model, agent.ac_model.optimizer, gamma, max_steps_per_episode))

    episodes_reward.append(episode_reward)
    running_reward = statistics.mean(episodes_reward)

    reward_list.append(episode_reward)
    average_reward_list.append(np.mean(reward_list[-100:]))

    t.set_description(f'Episode {i}')
    t.set_postfix(
        episode_reward=episode_reward, running_reward=running_reward)

    if i % 10 == 0:
        avg_rew = np.mean(reward_list[-10:])
        print('Episode ', i, 'Reward %f' % episode_reward, 'Average Reward %f' % avg_rew)
        # Save a checkpoint whenever the 10-episode average improved.
        if (avg_rew > avg_rew_prev and avg_rew > 100):
          model_dir_name = 'my_model'+str(i//10)
          tf.saved_model.save(agent.ac_model,model_dir_name)
        avg_rew_prev = avg_rew

    # The previous `if i % 100:` ran on every episode EXCEPT multiples of
    # 100; check on every episode once a full 100-episode window exists.
    if len(reward_list) >= min_episodes_criterion:
        avg_100 =  np.mean(reward_list[-100:])
        if avg_100 > 475.0:
            print('Stopped at Episode ',i-100)
            break

# Record this run's curve exactly once. Appending the same list a second
# time only duplicated the run, which forces the variance below to zero.
Avg_List.append(average_reward_list)
time_taken = datetime.datetime.now() - begin_time
print(time_taken)

print(len(Avg_List))

mx = -1
for i in Avg_List:
  print(len(i))
  mx = max(mx, len(i))
print(mx)

# Pad shorter curves with their last value so all runs have equal length.
for i in range(len(Avg_List)):
  Avg_List[i] += [Avg_List[i][-1]] * (mx - len(Avg_List[i]))

Avg_val = np.mean(Avg_List, axis = 0)
Avg_var = np.var(Avg_List, axis = 0)

print(len(Avg_val), len(Avg_var))

### Plot of total reward vs episode
plt.plot(np.arange(len(average_reward_list)), average_reward_list)
plt.title("Running Avg of 100 rewards")
plt.xlabel("Episodes")
plt.ylabel("Total Reward")
plt.show()

### Plot of Reward Variance vs episode
plt.plot(np.arange(len(Avg_var)), Avg_var)
plt.title("Reward Variance Plot")
plt.xlabel("Episodes")
plt.ylabel("Variance Of Rewards")
plt.show()

In [None]:
#%%
import numpy as np
import random
import torch
import torch.nn as nn  
import torch.nn.functional as F
from collections import namedtuple, deque
import torch.optim as optim
import datetime
import gym
from gym.wrappers.record_video import RecordVideo
import glob
import io
import base64
import matplotlib.pyplot as plt
from IPython.display import HTML
from pyvirtualdisplay import Display
import tensorflow as tf
from IPython import display as ipythondisplay
from PIL import Image
import tensorflow_probability as tfp

'''
Please refer to the first tutorial for more details on the specifics of environments
We've only added important commands you might find useful for experiments.
'''

'''
List of example environments
(Source - https://gym.openai.com/envs/#classic_control)

'Acrobot-v1'
'CartPole-v1'
'MountainCar-v0'
'''

#%%

env = gym.make('CartPole-v1')
seed = 0
env.reset(seed=seed)

state_shape = env.observation_space.shape[0]
no_of_actions = env.action_space.n

print(state_shape, no_of_actions, env.action_space.sample(), sep="\n")
print("----")

# Understanding State, Action, Reward Dynamics
#
# The agent decides an action to take depending on the state.
#
# The environment keeps a variable specifically for the current state.
# - Every time an action is passed to the environment, it calculates the new
#   state and updates the current state variable.
# - It returns the new current state and reward for the agent to take the
#   next action.

# reset() returns a tuple; its first element is the initial state, reshaped
# here into a single row.
state = env.reset(seed=seed)[0].reshape(1, -1)

print(state, "----", sep="\n")

# Take a random action.
action = env.action_space.sample()

print(action, "----", sep="\n")

# env.step computes the new state and the reward from the old state and the
# action taken; it returns five values.
next_state, reward, done, other, _ = env.step(action)

print(next_state, reward, done, other, "----", sep="\n")

"""## DQN

Using NNs as substitutes isn't something new. It has been tried earlier, but the 'human control' paper really popularised using NNs by providing a few stability ideas (Q-Targets, Experience Replay & Truncation). The 'Deep-Q Network' (DQN) Algorithm can be broken down into having the following components. 

### Q-Network:
The neural network used as a function approximator is defined below
"""

'''
### Q Network & Some 'hyperparameters'

QNetwork1:
Input Layer - 4 nodes (State Shape) \
Hidden Layer 1 - 64 nodes \
Hidden Layer 2 - 64 nodes \
Output Layer - 2 nodes (Action Space) \
Optimizer - zero_grad()

QNetwork2: Feel free to experiment more
'''
#%%
import torch
import torch.nn as nn  
import torch.nn.functional as F


'''
Bunch of Hyper parameters (Which you might have to tune later **wink wink**)
'''
# Defaults for the DQN agent; the bracketed values are alternatives that were
# tried. The hyperparameter sweep further down overwrites these globals.
BUFFER_SIZE = 100_000   # replay buffer size [int(1e6), same]
BATCH_SIZE = 64         # minibatch size [512, 128]
GAMMA = 0.99            # discount factor [same, 0.999] [0.5, 0.7, 0.99]
LR = 0.0005             # learning rate [same, 1e-4]
UPDATE_EVERY = 20       # steps between target-network updates [same, 30], [10, 20, 40]


class QNetwork1(nn.Module):
    """Fully connected Q-network: state -> one value per action."""

    def __init__(self, state_size, action_size, seed, fc1_units, fc2_units):
        """Seed torch and create the three linear layers.

        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed passed to ``torch.manual_seed``
            fc1_units (int): Number of nodes in first hidden layer
            fc2_units (int): Number of nodes in second hidden layer
        """
        super().__init__()
        # Seeding happens before the layers are created, so two networks
        # built with the same seed start from identical weights.
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(in_features=state_size, out_features=fc1_units)
        self.fc2 = nn.Linear(in_features=fc1_units, out_features=fc2_units)
        self.fc3 = nn.Linear(in_features=fc2_units, out_features=action_size)

    def forward(self, state):
        """Map a batch of states to action values."""
        hidden = state
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        # The output layer is linear: Q-values are unbounded.
        return self.fc3(hidden)

"""### Replay Buffer:

This is a 'deque' that helps us store experiences. Recall why we use such a technique.
"""

#%%
import random
import torch
import numpy as np
from collections import deque, namedtuple

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class ReplayBuffer:
    """Fixed-size FIFO store of experience tuples with uniform sampling."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Create an empty buffer.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.action_size = action_size
        # Once full, the deque drops the oldest experience on every append.
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"],
        )
        # random.seed() seeds the global `random` module and returns None,
        # so this attribute holds None.
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Append one experience to memory."""
        self.memory.append(
            self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw ``batch_size`` experiences and return them as tensors.

        Returns a tuple ``(states, actions, rewards, next_states, dones)``;
        each field is stacked row-wise and moved to ``device``. Actions are
        long tensors, everything else is float.
        """
        drawn = random.sample(self.memory, k=self.batch_size)
        batch = [exp for exp in drawn if exp is not None]

        def stacked(field):
            # One row per experience for the requested field.
            return np.vstack([getattr(exp, field) for exp in batch])

        states = torch.from_numpy(stacked("state")).float().to(device)
        actions = torch.from_numpy(stacked("action")).long().to(device)
        rewards = torch.from_numpy(stacked("reward")).float().to(device)
        next_states = torch.from_numpy(stacked("next_state")).float().to(device)
        dones = torch.from_numpy(
            stacked("done").astype(np.uint8)).float().to(device)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return how many experiences are currently stored."""
        return len(self.memory)

"""## Truncation:
We add a line (optionally) in the code to truncate the gradient in hopes that it would help with the stability of the learning process.

## Tutorial Agent Code:

"""

#%%
class TutorialAgent():
    """DQN agent with experience replay, a target network and gradient clamping.

    Reads the module-level hyperparameters ``LR``, ``BUFFER_SIZE``,
    ``BATCH_SIZE``, ``GAMMA`` and ``UPDATE_EVERY``.
    """

    def __init__(self, state_size, action_size, seed, fc1_units, fc2_units):
        # Agent / environment interaction
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None; the call seeds the global RNG.
        self.seed = random.seed(seed)

        # Online network (trained) and target network (periodically synced)
        self.qnetwork_local = QNetwork1(
            state_size, action_size, seed, fc1_units, fc2_units).to(device)
        self.qnetwork_target = QNetwork1(
            state_size, action_size, seed, fc1_units, fc2_units).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Step counter used to sync the target network every UPDATE_EVERY steps
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition, learn from a sampled batch, sync the target."""
        self.memory.add(state, action, reward, next_state, done)

        # Learn as soon as the memory holds at least one full batch.
        if len(self.memory) >= BATCH_SIZE:
            self.learn(self.memory.sample(), GAMMA)

        # Copy the online weights into the target network every
        # UPDATE_EVERY calls.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            self.qnetwork_target.load_state_dict(
                self.qnetwork_local.state_dict())

    def act(self, state, eps=0.):
        """Return an epsilon-greedy action for ``state`` (a NumPy array)."""
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Explore with probability eps, otherwise act greedily.
        if random.random() <= eps:
            return random.choice(np.arange(self.action_size))
        return np.argmax(action_values.cpu().data.numpy())

    def learn(self, experiences, gamma):
        """Run one gradient step on a batch of experiences."""
        states, actions, rewards, next_states, dones = experiences

        # Best next-state value according to the target network (no gradient).
        target_q = self.qnetwork_target(next_states).detach()
        best_next_q = target_q.max(dim=1, keepdim=True).values

        # Bootstrapped targets; (1 - dones) removes the bootstrap term for
        # transitions flagged as done.
        td_targets = rewards + (gamma * best_next_q * (1 - dones))

        # Q-values the online network assigns to the actions actually taken.
        predicted_q = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(predicted_q, td_targets)

        self.optimizer.zero_grad()
        loss.backward()

        # Truncation: clamp every gradient entry to [-1, 1] before stepping.
        for parameter in self.qnetwork_local.parameters():
            parameter.grad.data.clamp_(-1, 1)

        self.optimizer.step()

"""### Here, we present the DQN algorithm code."""

''' Defining DQN Algorithm '''
#%%
# Length of the observation vector and number of discrete actions of the
# module-level env; passed to TutorialAgent as state_size / action_size below.
state_shape = env.observation_space.shape[0]
action_shape = env.action_space.n

def dqn(n_episodes=10000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Train the module-level ``agent`` on the module-level ``env``.

    Args:
        n_episodes: Number of episodes to play.
        max_t: Upper bound on the steps of one episode.
        eps_start: Initial epsilon of the epsilon-greedy policy.
        eps_end: Lower bound for epsilon.
        eps_decay: Factor applied to epsilon after every episode.

    Returns:
        ``(scores, steps, i_episode - 100)`` where ``scores`` holds, for every
        10th episode, the mean score of the last 10 episodes, and ``steps``
        holds the index of the final step of every 10th episode.
    """
    scores = []   # mean score of the last 10 episodes, sampled every 10 episodes
    steps = []    # final step index, sampled every 10 episodes
    steps_window = []

    scores_window_printing = deque(maxlen=10)   # feeds `scores`
    scores_window = deque(maxlen=100)           # feeds the printed average

    eps = eps_start

    for i_episode in range(1, n_episodes+1):
        # Seed the environment once per call. Passing the same seed to every
        # reset re-creates the environment's RNG each time, so every episode
        # would start from the identical initial state.
        if i_episode == 1:
            observation, _ = env.reset(seed=seed)
        else:
            observation, _ = env.reset()
        state = observation.reshape(1,-1)
        score = 0
        step_episode = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            # env.step returns (obs, reward, terminated, truncated, info).
            next_state, reward, terminated, truncated, _ = env.step(action)
            # Only a real termination is stored as `done`: a time-limit
            # truncation must still bootstrap from the next state.
            agent.step(state, action, reward, next_state, terminated)
            state = next_state
            score += reward
            # Either flag ends the episode; ignoring `truncated` let episodes
            # continue past the environment's time limit up to max_t.
            if terminated or truncated:
                step_episode = t
                break

        # Save most recent score
        scores_window.append(score)
        scores_window_printing.append(score)
        steps_window.append(step_episode)

        # Decrease epsilon
        eps = max(eps_end, eps_decay*eps)

        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 10 == 0:
            scores.append(np.mean(scores_window_printing))
            # NOTE(review): this records only the latest episode's value;
            # steps_window is collected but never averaged - confirm intent.
            steps.append(np.mean(step_episode))
        if i_episode % 100 == 0:
           print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
    return np.array(scores), np.array(steps), i_episode-100

#%%
total_runs = 10
n_episodes = 1500
GAMMA = 0.99

# One row per configuration, in the order given by `parameter_names`.
combinations = [
    [5e-4, int(1e5), 20, 64, [128, 64]],
    [1e-3, int(1e5), 20, 64, [128, 64]],
    [5e-4, int(1e5), 25, 128, [128, 64]],
    [5e-4, int(1e3), 50, 64, [128, 64]],
    [5e-4, int(1e7), 20, 32, [64, 64]],
    [1e-4, int(1e5), 5, 64, [128, 64]],
    [5e-4, int(1e5), 25, 128, [256, 128]],
    [1e-4, int(1e5), 20, 128, [128, 128]],
    [1e-3, int(1e5), 20, 256, [256, 128]],
    [5e-4, int(1e3), 10, 256, [256, 128]],
    [1e-4, int(1e5), 50, 256, [256, 128]],
    [5e-4, int(1e5), 20, 256, [256, 256]],
]

parameter_names = ["LR", "BUFFER_SIZE", "UPDATE_EVERY", "BATCH_SIZE", "ARCHITECTURE"]

# Column-wise view of `combinations`: parameter name -> value per combination.
combinationsDict = {
    name: [combo[column] for combo in combinations]
    for column, name in enumerate(parameter_names)
}

combo_stats = []
# TutorialAgent reads these hyperparameters from module globals, so they are
# bound directly as the loop targets before each agent is built.
for i, (LR, BUFFER_SIZE, UPDATE_EVERY, BATCH_SIZE, ARCHITECTURE) in enumerate(combinations):
    begin_time = datetime.datetime.now()
    run_stats = []
    for run in range(total_runs):
        # Build a fresh agent for every run. Reusing one agent meant runs
        # 2..N continued from already-trained weights and a filled replay
        # buffer, so the "runs" being averaged were not independent. The run
        # index is used as the seed so the runs differ from each other.
        agent = TutorialAgent(state_size=state_shape, action_size=action_shape,
                              seed=run, fc1_units=ARCHITECTURE[0], fc2_units=ARCHITECTURE[1])
        run_stats.append(dqn(n_episodes=n_episodes))
    combo_stats.append(run_stats)

    time_taken = datetime.datetime.now() - begin_time
    print("Combination " + str(i+1) + " - " + str(time_taken))

#%%
# Stack the score curves (first element of every dqn() result) into a
# (combination, run, sample) array.
n_points = len(combo_stats[0][0][0])
run_stats_matrix = np.zeros((len(combo_stats), total_runs, n_points))
for i, combo_runs in enumerate(combo_stats):
    for j in range(total_runs):
        run_stats_matrix[i, j, :] = combo_runs[j][0][:n_points]

#%%
# Scores were sampled every 10 episodes, hence the x-axis stride. One figure
# per combination shows the mean curve over its runs.
episode_ticks = np.arange(10, n_episodes + 1, 10)
for i in range(len(combo_stats)):
    plt.figure()
    plt.plot(episode_ticks, run_stats_matrix[i].mean(axis=0))
    plt.savefig("DQNcomb_" + str(i) + ".png")
    plt.show()
# %%
