# Reinforce Algorithm

REINFORCE is a simple, easy-to-understand RL algorithm that belongs to the policy-gradient class of algorithms.
In the policy-gradient class of algorithms, the agent learns a policy, which is a recipe for the action to take in a given state.

The agent goes through the following steps:
- The agent acts on the environment and collects the next state, the reward, and the action taken.
- The agent continues to take actions on the environment until it reaches the end state.
- The next state, reward and the action taken at each step are then used to update the policy once the episode has ended.

In [None]:
import tensorflow as tf
import numpy as np
import gym 
import tensorflow_probability as tfp
from tqdm import tqdm

In [None]:
# Create the CartPole environment and keep the observation-space bounds
# around for inspection; then report how many discrete actions exist.
env = gym.make("CartPole-v1")
low, high = env.observation_space.low, env.observation_space.high
print(f'Number of action spaces:  {env.action_space.n}')

In [None]:
class model(tf.keras.Model):
  """Policy network: two 30-unit ReLU layers followed by a softmax head.

  The softmax head has one unit per action of the module-level ``env``
  (``env.action_space.n``), so its output is a probability for each action.
  """

  def __init__(self):
    super().__init__()
    dense = tf.keras.layers.Dense
    self.d1 = dense(30, activation='relu')
    self.d2 = dense(30, activation='relu')
    self.out = dense(env.action_space.n, activation='softmax')

  def call(self, input_data):
    """Run the forward pass and return the softmax action probabilities.

    ``input_data`` is converted to a tensor first; the callers in this
    file pass a NumPy array holding a single state (a batch of one).
    """
    hidden = self.d2(self.d1(tf.convert_to_tensor(input_data)))
    return self.out(hidden)

In [None]:
class agent():
  """REINFORCE agent: a softmax policy network plus its Adam optimizer.

  Args:
    gamma: discount factor applied to future rewards when computing the
      return of each step. The default of 1 means no discounting.
    learning_rate: learning rate handed to the Adam optimizer.
  """

  def __init__(self, gamma=1, learning_rate=0.001):
    self.model = model()
    self.opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    self.gamma = gamma

  def act(self, state):
    """Sample an action for ``state`` from the current policy.

    The state is wrapped into a batch of one, the network's output is used
    as the probabilities of a categorical distribution, and one sample is
    drawn from it and returned as a plain ``int``.
    """
    prob = self.model(np.array([state]))
    dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
    action = dist.sample()
    return int(action.numpy()[0])

  def a_loss(self, prob, action, reward):
    """Return the policy-gradient loss ``-log_prob(action) * reward``.

    Args:
      prob: action probabilities produced by the policy network.
      action: the action that was actually taken.
      reward: the weight for this step; ``train`` passes the discounted
        return from this step onwards.
    """
    dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
    log_prob = dist.log_prob(action)
    loss = -log_prob * reward
    return loss

  def _discounted_returns(self, rewards):
    """Return the discounted return-to-go for every step of an episode."""
    returns = []
    running = 0
    # Walk the episode backwards so each return builds on the next one.
    # reversed() leaves the caller's list untouched, whereas the previous
    # in-place rewards.reverse() left it reversed after the call.
    for r in reversed(rewards):
      running = r + self.gamma * running
      returns.append(running)
    returns.reverse()
    return returns

  def train(self, states, rewards, actions):
    """Update the policy from one finished episode.

    Performs one gradient step per recorded (state, action) pair, weighting
    the log-probability of the action by the discounted return from that
    step onwards. None of the input lists are modified.

    Args:
      states: states visited during the episode, in order.
      rewards: reward received after each step, in the same order.
      actions: action taken in each state, in the same order.
    """
    discnt_rewards = self._discounted_returns(rewards)

    for state, reward, action in zip(states, discnt_rewards, actions):
      with tf.GradientTape() as tape:
        # forward pass
        p = self.model(np.array([state]), training=True)
        # compute loss
        loss = self.a_loss(p, action, reward)
      # compute gradients
      grads = tape.gradient(loss, self.model.trainable_variables)
      # update weights
      self.opt.apply_gradients(zip(grads, self.model.trainable_variables))

In [None]:
# Train the agent: play one full episode, then update the policy from it.
agentoo7 = agent()
steps = 1000  # number of training episodes
rewards_tracked = []  # total reward of each episode, in order
for s in tqdm(range(steps)):
  done = False
  # reset() returns (observation, info); only the observation is used.
  state = env.reset()[0]
  total_reward = 0
  rewards = []
  states = []
  actions = []
  while not done:
    action = agentoo7.act(state)
    # step() returns (obs, reward, terminated, truncated, info). The episode
    # is over when either flag is set; looking only at `terminated` keeps
    # stepping an environment that has already hit its time limit.
    next_state, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    rewards.append(reward)
    states.append(state)
    actions.append(action)
    state = next_state
    total_reward += reward
  # The episode has finished: learn from it and record its total reward.
  agentoo7.train(states, rewards, actions)
  rewards_tracked.append(total_reward)

## Analysis

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Plot the total reward of every training episode against its index.
y_axis = np.asarray(rewards_tracked)
x_axis = np.asarray(range(0, len(rewards_tracked)))
plt.figure(0, figsize=(16, 4))
plt.plot(x_axis, y_axis, 'green')
plt.title('Rewards per episode')
plt.xlabel('Episodes')
plt.ylabel('Rewards')
plt.grid()
plt.show()