In [None]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1
!pip install git+https://github.com/tensorflow/docs > /dev/null 2>&1
!pip install gym[classic_control]

In [6]:
import numpy as np
import random
import gym
# from gym.wrappers import Monitor
import glob
import io
import matplotlib.pyplot as plt
from IPython.display import HTML
import torch
import torch.nn as nn
from collections import deque, namedtuple
import random
import torch.nn.functional as F
import itertools

In [None]:
# Build the CartPole environment and seed it.
env = gym.make('CartPole-v1')
env.seed(0)

# Length of the observation vector and number of discrete actions.
state_shape = env.observation_space.shape[0]
no_of_actions = env.action_space.n

# Sanity-check output: observation size, action count, and one randomly sampled action.
print(state_shape)
print(no_of_actions)
print(env.action_space.sample())

# Dueling DQN


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F


# Hyperparameters read by the DQN Agent and ReplayBuffer defined in the following cells.
BUFFER_SIZE = 100_000   # capacity of the replay buffer (number of stored transitions)
BATCH_SIZE = 64         # transitions sampled per learning step
GAMMA = 0.99            # discount factor
LR = 0.001              # learning rate passed to Adam
UPDATE_EVERY = 20       # agent steps between copies of the local network into the target network


class QNetwork1(nn.Module):
    """Dueling Q-network: one shared layer feeding separate value and advantage streams.

    The streams are recombined into Q-values with the rule selected by ``typ``:

    * ``typ == 1``: ``Q = V + A - mean_a(A)``
    * ``typ == 2``: ``Q = V + A - max_a(A)``
    """

    def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=256, typ = 1):
        """Build the layers.

        Args:
            state_size: length of the input state vector.
            action_size: number of actions (width of the advantage head).
            seed: value passed to ``torch.manual_seed`` before the layers are created.
            fc1_units: width of the shared hidden layer.
            fc2_units: width of the hidden layer inside each stream.
            typ: aggregation rule, 1 (mean) or 2 (max).

        Raises:
            AssertionError: if ``typ`` is neither 1 nor 2.
        """
        super(QNetwork1, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)

        self.value_fc = nn.Linear(fc1_units, fc2_units)
        self.adv_fc = nn.Linear(fc1_units, fc2_units)

        self.value = nn.Linear(fc2_units, 1)
        self.adv = nn.Linear(fc2_units, action_size)

        assert typ == 1 or typ == 2, "Type should either be 1 or 2"
        self.typ = typ

    def forward(self, state):
        """Map a batch of states of shape (batch, state_size) to Q-values of shape (batch, action_size)."""
        x = F.relu(self.fc1(state))
        val = self.value(F.relu(self.value_fc(x)))
        adv = self.adv(F.relu(self.adv_fc(x)))

        if self.typ == 1:
            offset = adv.mean(dim=1, keepdim=True)
        else:
            # With `dim` given, max() returns a (values, indices) pair; only the values
            # may be subtracted. Using the pair directly raises a TypeError.
            offset = adv.max(dim=1, keepdim=True).values
        return val + adv - offset

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random minibatch sampling."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Create an empty buffer holding at most ``buffer_size`` transitions.

        ``seed`` is applied to Python's global ``random`` module, which ``sample`` draws from.
        """
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"],
        )
        # random.seed() returns None, so this attribute is always None; the call seeds the global RNG.
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Append one transition; the oldest entry is dropped once the buffer is full."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw ``batch_size`` transitions without replacement and return them as tensors on ``device``.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)``; actions are long
            tensors, the other four are float tensors, each with one row per transition.
        """
        batch = [e for e in random.sample(self.memory, k=self.batch_size) if e is not None]

        def column(field):
            # Stack one field of every sampled transition row-wise.
            return np.vstack([getattr(e, field) for e in batch])

        states = torch.from_numpy(column("state")).float().to(device)
        actions = torch.from_numpy(column("action")).long().to(device)
        rewards = torch.from_numpy(column("reward")).float().to(device)
        next_states = torch.from_numpy(column("next_state")).float().to(device)
        dones = torch.from_numpy(column("done").astype(np.uint8)).float().to(device)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

In [None]:
class Agent():
    """DQN agent: a local QNetwork1 trained every step plus a periodically synced target copy."""

    def __init__(self, state_size, action_size, seed, typ = 1):
        """Create both networks, the Adam optimizer and the replay memory.

        Args:
            state_size: length of the state vector.
            action_size: number of discrete actions.
            seed: seed forwarded to the networks, the replay buffer and ``random.seed``.
            typ: aggregation rule forwarded to QNetwork1 (1 or 2).
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None; the call is kept for its effect on the global RNG.
        self.seed = random.seed(seed)

        network_args = dict(state_size=state_size, action_size=action_size, seed=seed, typ=typ)
        self.qnetwork_local = QNetwork1(**network_args).to(device)
        self.qnetwork_target = QNetwork1(**network_args).to(device)

        self.optimizer = torch.optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store a transition, learn from a sampled batch, and sync the target net every UPDATE_EVERY calls."""
        self.memory.add(state, action, reward, next_state, done)

        # Learning starts as soon as one full minibatch is available.
        if len(self.memory) >= BATCH_SIZE:
            self.learn(self.memory.sample(), GAMMA)

        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step != 0:
            return
        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

    def act(self, state, eps=0.01):
        """Return an epsilon-greedy action for a single NumPy ``state``."""
        observation = torch.from_numpy(state).float().unsqueeze(0).to(device)

        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(observation)
        self.qnetwork_local.train()

        greedy = random.random() > eps
        if not greedy:
            return random.choice(np.arange(self.action_size))
        return np.argmax(action_values.cpu().data.numpy())

    def learn(self, experiences, gamma):
        """Take one optimizer step on the MSE between predicted Q-values and one-step TD targets."""
        states, actions, rewards, next_states, dones = experiences

        with torch.no_grad():
            target_q = self.qnetwork_target(next_states).detach()
            Q_targets_next = target_q.max(1)[0].unsqueeze(1)

        # (1 - dones) removes the bootstrap term for terminal transitions.
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

In [None]:
def dueling_dqn(env, agent, typ, avg_window_score, n_episodes=10000, max_t=1000, eps_start=0.3, eps_end=0.01, eps_decay=0.995):
    """Train ``agent`` on ``env`` with a decaying epsilon-greedy policy.

    Args:
        env: environment whose ``reset()`` returns a state and whose ``step(action)``
            returns ``(next_state, reward, done, info)``.
        agent: object exposing ``act(state, eps)`` and ``step(state, action, reward, next_state, done)``.
        typ: accepted but never read by this function.
        avg_window_score: training stops once the mean of the last (up to 100) episode
            scores reaches this value.
        n_episodes: maximum number of episodes.
        max_t: maximum number of steps per episode.
        eps_start: initial epsilon.
        eps_end: lower bound on epsilon.
        eps_decay: factor applied to epsilon after every episode.

    Returns:
        ``(agent, rewards)`` where ``rewards`` has length ``n_episodes``; entries for
        episodes that were not run because of an early stop stay at 0.
    """
    recent_scores = deque(maxlen=100)
    rewards = np.zeros((n_episodes,))
    eps = eps_start

    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        for _ in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done, _info = env.step(action)
            agent.step(state, action, reward, next_state, done)
            score += reward
            state = next_state
            if done:
                break

        recent_scores.append(score)
        rewards[i_episode - 1] = score
        eps = max(eps_end, eps_decay*eps)

        window_mean = np.mean(recent_scores)
        # Carriage return keeps the running status on a single console line.
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, window_mean), end="")

        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, window_mean))
        if window_mean >= avg_window_score:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, window_mean))
            break

    return agent, np.array(rewards)


## CARTPOLE

In [None]:
# CartPole, aggregation type 1: train `runs` independent agents and keep one reward curve per run.
runs = 5
episodes = 10000
# One row per run; episodes not reached before an early stop remain 0.
rewards_avg = np.zeros((runs, episodes,))
state_shape = env.observation_space.shape[0]
action_shape = env.action_space.n
for run in range(runs) :
  # Seed drawn from NumPy's global RNG, so it differs between notebook executions unless that RNG is seeded.
  s = np.random.randint(0, 1000)
  print("-"*45)
  print(f"Running experiment with seed value as {s}")
  print("-"*45)
  # Agent is built with its default typ (1).
  agent = Agent(state_size=state_shape,action_size = action_shape, seed = s)
  agent, rewards = dueling_dqn(env, agent,  n_episodes = episodes, typ = 1, avg_window_score = 195)
  rewards_avg[run] = rewards

In [None]:
# Reward curves for CartPole keyed by experiment label; "type1" stores the current rewards_avg array.
cartpole_run = {}
cartpole_run["type1"] = rewards_avg

In [None]:
# CartPole run intended for aggregation type 2.
# NOTE(review): `typ = 2` is only passed to dueling_dqn, whose body never reads it, while
# Agent is constructed with its default typ = 1. As written this cell trains the same
# typ-1 network as the "type1" experiment; forward typ to Agent to really compare the rules.
runs = 5
# Width is hard-coded to 10000, which matches dueling_dqn's default n_episodes used below.
rewards_avg = np.zeros((runs, 10000,))
for run in range(runs) :
  # Seed drawn from NumPy's global RNG.
  s = np.random.randint(0, 1000)
  print("-"*45)
  print(f"Running experiment with seed value as {s}")
  print("-"*45)
  agent = Agent(state_size=state_shape,action_size = action_shape, seed = s)
  agent, rewards = dueling_dqn(env, agent, typ = 2, avg_window_score=195)
  rewards_avg[run] = rewards

In [None]:
# Store the current rewards_avg array under the "type2" label.
cartpole_run["type2"] = rewards_avg

In [None]:
def plot_rewards_episodes(steps, message = "Rewards vs Episodes"):
  """Plot the across-run mean reward per episode with a +/- one standard deviation band.

  Args:
      steps: iterable of 2-D arrays indexed [run, episode]; one curve is drawn per array.
      message: figure title.
  """
  fig, ax = plt.subplots()
  for run_scores in steps:
    episode_axis = np.arange(len(run_scores[0]))
    mean_curve = np.mean(run_scores, axis=0)
    lower = mean_curve - np.std(run_scores, axis = 0)
    upper = mean_curve + np.std(run_scores, axis = 0)
    ax.plot(episode_axis, mean_curve)
    ax.fill_between(episode_axis, upper, lower, alpha = 0.3)
  ax.set_xlabel('Episode')
  ax.set_ylabel('Average of Total Rewards')
  fig.suptitle(message)
  plt.show()


# Mean +/- std reward over the first 100 episodes of the "type1" CartPole runs.
plot_rewards_episodes([cartpole_run["type1"][:, :100]], "Episode Rewards vs Episode")

In [None]:
# Mean +/- std reward over the first 100 episodes of the "type2" CartPole runs.
plot_rewards_episodes([cartpole_run["type2"][:, :100]], "Episode Rewards vs Episode")

In [None]:
# Overlay both CartPole experiments (first 100 episodes) on one set of axes.
plot_rewards_episodes([cartpole_run["type1"][:, :100], cartpole_run["type2"][:, :100]], "Episode Rewards vs Episode")

## ACROBOT

In [None]:
# Rebind `env` to the Acrobot environment and seed it.
env = gym.make('Acrobot-v1')
env.seed(0)

# Length of the observation vector and number of discrete actions.
state_shape = env.observation_space.shape[0]
no_of_actions = env.action_space.n

# Sanity-check output: observation size, action count, and one randomly sampled action.
print(state_shape)
print(no_of_actions)
print(env.action_space.sample())

In [None]:
# Acrobot, aggregation type 1: train `runs` independent agents and keep one reward curve per run.
runs = 5
episodes = 10000
# One row per run; episodes not reached before an early stop remain 0.
rewards_avg = np.zeros((runs, episodes,))
state_shape = env.observation_space.shape[0]
action_shape = env.action_space.n
# Rebinds the global learning rate that Agent.__init__ reads; it stays in effect for later cells until reassigned.
LR = 3e-3
for run in range(runs) :
  # Seed drawn from NumPy's global RNG.
  s = np.random.randint(0, 1000)
  print("-"*45)
  print(f"Running experiment with seed value as {s}")
  print("-"*45)
  # Agent is built with its default typ (1).
  agent = Agent(state_size=state_shape,action_size = action_shape, seed = s)
  agent, rewards = dueling_dqn(env, agent,  n_episodes = episodes, typ = 1, avg_window_score = -80)
  rewards_avg[run] = rewards

In [None]:
# Reward curves for Acrobot keyed by experiment label; "type1" stores the current rewards_avg array.
acrobot_run = {}
acrobot_run["type1"] = rewards_avg

In [None]:
# Acrobot run intended for aggregation type 2.
# NOTE(review): `typ = 2` is only passed to dueling_dqn, whose body never reads it, while
# Agent is constructed with its default typ = 1. As written this cell trains the same
# typ-1 network as the "type1" experiment; forward typ to Agent to really compare the rules.
runs = 5
episodes = 10000
# One row per run; episodes not reached before an early stop remain 0.
rewards_avg = np.zeros((runs, episodes,))
state_shape = env.observation_space.shape[0]
action_shape = env.action_space.n
# Rebinds the global learning rate that Agent.__init__ reads.
LR = 3e-3
for run in range(runs) :
  # Seed drawn from NumPy's global RNG.
  s = np.random.randint(0, 1000)
  print("-"*45)
  print(f"Running experiment with seed value as {s}")
  print("-"*45)
  agent = Agent(state_size=state_shape,action_size = action_shape, seed = s)
  agent, rewards = dueling_dqn(env, agent,  n_episodes = episodes, typ = 2, avg_window_score = -80)
  rewards_avg[run] = rewards

In [None]:
# Store the current rewards_avg array under the "type2" label.
acrobot_run["type2"] = rewards_avg

In [None]:
# Mean +/- std reward over the first 600 episodes of the "type1" Acrobot runs.
plot_rewards_episodes([acrobot_run["type1"][:, :600]], "Episode Rewards vs Episode")

In [None]:
# Mean +/- std reward over the first 580 episodes of the "type2" Acrobot runs.
plot_rewards_episodes([acrobot_run["type2"][:, :580]], "Episode Rewards vs Episode")

In [None]:
# Overlay both Acrobot experiments (first 580 episodes) on one set of axes.
plot_rewards_episodes([acrobot_run["type1"][:, :580], acrobot_run["type2"][:, :580]], "Episode Rewards vs Episode")

# Monte Carlo REINFORCE

In [2]:
# Hyperparameters for the REINFORCE section; these rebind the globals set for the DQN section.
BATCH_SIZE = 64   # minibatch size
GAMMA = 0.99      # discount factor
LR = 0.001        # learning rate

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import gym
from tqdm import tqdm_notebook
import numpy as np
from collections import deque

class PolicyNetwork(nn.Module):
    """Three-layer MLP that maps a batch of states to a probability distribution over actions."""

    def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=256):
        """Seed torch's RNG with ``seed`` and create the three linear layers."""
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)

    def forward(self, state):
        """Return action probabilities; softmax is taken over dim 1, so ``state`` must be batched."""
        hidden = F.relu(self.fc1(state))
        hidden = F.relu(self.fc2(hidden))
        logits = self.fc3(hidden)
        return F.softmax(logits, dim = 1)


class StateValueNetwork(nn.Module) :
    """Three-layer MLP that maps a state to a single scalar value estimate."""

    def __init__(self, state_size, seed, fc1_units = 64, fc2_units = 128):
        """Seed torch's RNG with ``seed`` and create the three linear layers."""
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, 1)

    def forward(self, state):
        """Return the value estimate; the last dimension of the output has size 1."""
        hidden = F.relu(self.fc1(state))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)


In [None]:
class Agent() :
    """REINFORCE agent: a stochastic policy network with an optional learned state-value baseline."""

    def __init__(self, state_size, action_size, seed, gamma, baseline = True):
        """Create the policy network (and the value network when ``baseline`` is true).

        Args:
            state_size: length of the state vector.
            action_size: number of discrete actions.
            seed: seed forwarded to the networks and to ``random.seed``.
            gamma: discount factor, stored on the agent as ``self.gamma``.
            baseline: when true, also build the value network and its optimizer.
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None; the call is kept for its effect on the global RNG.
        self.seed = random.seed(seed)
        self.gamma = gamma
        self.baseline = baseline

        self.policy = PolicyNetwork(state_size = state_size, action_size= action_size, seed= seed).to(device)
        self.policyoptimizer = torch.optim.Adam(self.policy.parameters(), lr=LR)
        if baseline :
            self.value = StateValueNetwork(state_size = state_size, seed= seed).to(device)
            self.valueoptimizer = torch.optim.Adam(self.value.parameters(), lr=LR)

    def act(self, state):
        """Sample an action for a single NumPy ``state``.

        Returns:
            ``(action, log_prob)``: the action as a Python int and the log-probability
            tensor of that action, still attached to the policy's autograd graph.
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        dist = Categorical(self.policy(state))
        action = dist.sample()
        return action.item(), dist.log_prob(action)

    def update_policy(self, ret, log_probs):
        """Take one policy-gradient step on ``-sum(ret * log_probs)``.

        Args:
            ret: 1-D tensor with one return (or advantage) per time step.
            log_probs: list of log-probability tensors as returned by ``act``.
        """
        # Concatenate the tensors rather than rebuilding them with torch.tensor(...):
        # that would copy the values into a fresh leaf tensor with no autograd history,
        # so backward() could not reach the policy parameters.
        stacked_log_probs = torch.cat([lp.reshape(1) for lp in log_probs])
        policy_loss = -(ret * stacked_log_probs).sum()

        self.policyoptimizer.zero_grad()
        policy_loss.backward()
        self.policyoptimizer.step()

    def update_value(self, ret, states):
        """Fit the value network to ``ret`` with one MSE step.

        Args:
            ret: 1-D tensor of regression targets, one per state.
            states: sequence of state arrays visited in the episode.

        Returns:
            The value predictions (before the step), shape ``(len(states),)``,
            still attached to the autograd graph.
        """
        value_device = next(self.value.parameters()).device
        # Stack into a single array first; building a tensor from a list of arrays is slow.
        states = torch.as_tensor(np.asarray(states), dtype=torch.float32, device=value_device)
        # squeeze(-1) keeps a length-1 batch one-dimensional so it still lines up with ret.
        values = self.value(states).squeeze(-1)

        target = ret.to(dtype=values.dtype, device=values.device)
        val_loss = F.mse_loss(values, target)
        self.valueoptimizer.zero_grad()
        val_loss.backward()
        self.valueoptimizer.step()
        return values

  and should_run_async(code)


In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def reinforce(env, agent, avg_window_score, n_episodes = 10000, max_t=1000, baseline = True):
    """Train ``agent`` with Monte Carlo REINFORCE, one policy update per episode.

    Args:
        env: environment whose ``reset()`` returns a state and whose ``step(action)``
            returns ``(next_state, reward, done, info)``.
        agent: object exposing ``act(state) -> (action, log_prob)``,
            ``update_policy(ret, log_probs)``, a ``gamma`` attribute and, when
            ``baseline`` is true, ``update_value(ret, states)``.
        avg_window_score: training stops once the mean of the last (up to 100) episode
            scores reaches this value.
        n_episodes: maximum number of episodes.
        max_t: maximum number of steps per episode.
        baseline: when true, subtract the agent's value estimates from the returns.

    Returns:
        ``(agent, scores)`` where ``scores`` has length ``n_episodes``; entries for
        episodes that were not run because of an early stop stay at 0.
    """
    scores = np.zeros((n_episodes,))
    scores_window = deque(maxlen=100)
    for i_episode in range(n_episodes):
        state = env.reset()
        states = []
        log_probs = []
        rewards = []
        score = 0
        for t in range(max_t):
            action, log_prob = agent.act(state)
            new_state, reward, done, _ = env.step(action)
            score += reward
            states.append(state)
            log_probs.append(log_prob)
            rewards.append(reward)
            if done:
                break
            state = new_state
        scores[i_episode] = score
        scores_window.append(score)

        # Discounted return G_t = r_t + gamma * G_{t+1}, accumulated from the last step
        # backwards. The reward VALUES are summed (not their list indices), using the
        # discount factor stored on the agent.
        returns = []
        running_return = 0.0
        for reward in reversed(rewards):
            running_return = reward + agent.gamma * running_return
            returns.append(running_return)
        returns.reverse()
        # An explicit float32 dtype keeps the returns compatible with the float32
        # networks even when the environment hands back float64 rewards.
        ret = torch.tensor(returns, dtype=torch.float32).to(device)

        # Standardize the returns. std() of a single element is NaN, so fall back to 0
        # there; the small epsilon keeps the division finite when the spread is zero.
        spread = ret.std() if ret.numel() > 1 else torch.zeros((), device=ret.device)
        ret = (ret - ret.mean()) / (spread + 1e-8)

        if baseline :
            values = agent.update_value(ret, states)
            ret = ret - values.detach()

        agent.update_policy(ret, log_probs)

        mean_score = np.mean(scores_window)
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, mean_score), end="")

        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, mean_score))
        if mean_score >= avg_window_score:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, mean_score))
            break

    return agent, np.array(scores)

## CARTPOLE

In [None]:
# Rebind `env` to the CartPole environment and seed it.
env = gym.make('CartPole-v1')
env.seed(0)

# Length of the observation vector and number of discrete actions.
state_shape = env.observation_space.shape[0]
no_of_actions = env.action_space.n

# Sanity-check output: observation size, action count, and one randomly sampled action.
print(state_shape)
print(no_of_actions)
print(env.action_space.sample())

In [None]:
# CartPole, REINFORCE with baseline: train `runs` independent agents, one reward curve per run.
runs = 5
episodes = 10000
# One row per run; episodes not reached before an early stop remain 0.
rewards_avg = np.zeros((runs, episodes))
state_shape = env.observation_space.shape[0]
action_shape = env.action_space.n
for run in range(runs) :
    # Seed drawn from NumPy's global RNG.
    s = np.random.randint(0, 1000)
    print("-"*45)
    print(f"Running experiment with seed value as {s}")
    print("-"*45)
    # The baseline flag is given both to the agent (builds the value network) and to the training loop.
    agent = Agent(state_size=state_shape,action_size = action_shape, seed = s, gamma = GAMMA, baseline = True)
    agent, rewards = reinforce(env, agent,  n_episodes = episodes, avg_window_score = 195, baseline = True)
    rewards_avg[run] = rewards

In [None]:
# Store the current rewards_avg array under "with baseline"; cartpole_run must already exist in the kernel.
cartpole_run["with baseline"] = rewards_avg

In [None]:
def plot_rewards_episodes(steps, message = "Rewards vs Episodes"):
  """Plot the across-run mean reward per episode with a +/- one standard deviation band.

  Args:
      steps: iterable of 2-D arrays indexed [run, episode]; one curve is drawn per array.
      message: figure title.
  """
  fig, ax = plt.subplots()
  for run_scores in steps:
    episode_axis = np.arange(len(run_scores[0]))
    mean_curve = np.mean(run_scores, axis=0)
    lower = mean_curve - np.std(run_scores, axis = 0)
    upper = mean_curve + np.std(run_scores, axis = 0)
    ax.plot(episode_axis, mean_curve)
    ax.fill_between(episode_axis, upper, lower, alpha = 0.3)
  ax.set_xlabel('Episode')
  ax.set_ylabel('Average of Total Rewards')
  fig.suptitle(message)
  plt.show()

# Mean +/- std reward over the first 200 episodes of the CartPole runs with baseline.
plot_rewards_episodes([cartpole_run["with baseline"][:, :200]])

In [None]:
# CartPole, REINFORCE without baseline: train `runs` independent agents, one reward curve per run.
runs = 5
episodes = 10000
# One row per run; episodes not reached before an early stop remain 0.
rewards_avg = np.zeros((runs, episodes))
state_shape = env.observation_space.shape[0]
action_shape = env.action_space.n
for run in range(runs) :
    # Seed drawn from NumPy's global RNG.
    s = np.random.randint(0, 1000)
    print("-"*45)
    print(f"Running experiment with seed value as {s}")
    print("-"*45)
    agent = Agent(state_size=state_shape,action_size = action_shape, seed = s, gamma = GAMMA, baseline = False)
    # NOTE(review): the stop threshold here is 198, while the with-baseline CartPole run uses 195 - confirm this is intended.
    agent, rewards = reinforce(env, agent,  n_episodes = episodes, avg_window_score = 198, baseline = False)
    rewards_avg[run] = rewards

In [None]:
# Store the current rewards_avg array under the "without baseline" label.
cartpole_run["without baseline"] = rewards_avg

In [None]:
# Mean +/- std reward over the first 200 episodes of the CartPole runs without baseline.
plot_rewards_episodes([cartpole_run["without baseline"][:, :200]])

In [None]:
# Overlay the CartPole runs without and with baseline (first 200 episodes) on one set of axes.
plot_rewards_episodes([cartpole_run["without baseline"][:, :200], cartpole_run["with baseline"][:, :200]])

## ACROBOT

In [None]:
# Rebind `env` to the Acrobot environment and seed it.
env = gym.make('Acrobot-v1')
env.seed(0)

# Length of the observation vector and number of discrete actions.
state_shape = env.observation_space.shape[0]
no_of_actions = env.action_space.n

# Sanity-check output: observation size, action count, and one randomly sampled action.
print(state_shape)
print(no_of_actions)
print(env.action_space.sample())

In [None]:
# Acrobot, REINFORCE without baseline: train `runs` independent agents, one reward curve per run.
runs = 5
episodes = 10000
# One row per run; episodes not reached before an early stop remain 0.
rewards_avg = np.zeros((runs, episodes,))
state_shape = env.observation_space.shape[0]
action_shape = env.action_space.n
for run in range(runs) :
  # Seed drawn from NumPy's global RNG.
  s = np.random.randint(0, 1000)
  print("-"*45)
  print(f"Running experiment with seed value as {s}")
  print("-"*45)
  agent = Agent(state_size=state_shape,action_size = action_shape, seed = s, gamma  = GAMMA, baseline = False)
  agent, rewards = reinforce(env, agent, avg_window_score = -80,  n_episodes = episodes, baseline = False)
  rewards_avg[run] = rewards

In [None]:
# Store the current rewards_avg array under "without baseline"; acrobot_run must already exist in the kernel.
acrobot_run["without baseline"] = rewards_avg

In [None]:
# Acrobot, REINFORCE with baseline: train `runs` independent agents, one reward curve per run.
runs = 5
episodes = 10000
# One row per run; episodes not reached before an early stop remain 0.
rewards_avg = np.zeros((runs, episodes))
state_shape = env.observation_space.shape[0]
action_shape = env.action_space.n
for run in range(runs) :
    # Seed drawn from NumPy's global RNG.
    s = np.random.randint(0, 1000)
    print("-"*45)
    print(f"Running experiment with seed value as {s}")
    print("-"*45)
    # The baseline flag is given both to the agent (builds the value network) and to the training loop.
    agent = Agent(state_size=state_shape,action_size = action_shape, seed = s, gamma = GAMMA, baseline = True)
    agent, rewards = reinforce(env, agent,  n_episodes = episodes, avg_window_score = -80, baseline = True)
    rewards_avg[run] = rewards

In [None]:
# Store the current rewards_avg array under the "with baseline" label.
acrobot_run["with baseline"] = rewards_avg

In [None]:
# Acrobot results: this section stores its runs in acrobot_run, so plot those
# (first 1000 episodes) rather than the CartPole curves.
plot_rewards_episodes([acrobot_run["with baseline"][:, :1000]])

In [None]:
# Acrobot results: this section stores its runs in acrobot_run, so plot those
# (first 1000 episodes) rather than the CartPole curves.
plot_rewards_episodes([acrobot_run["without baseline"][:, :1000]])

In [None]:
# Acrobot comparison: overlay the runs without and with baseline from acrobot_run
# (first 1000 episodes) rather than the CartPole curves.
plot_rewards_episodes([acrobot_run["without baseline"][:, :1000], acrobot_run["with baseline"][:, :1000]])