In [1]:
import pdb
import gym
import math
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer
from baselines.common.schedules import LinearSchedule
from copy import deepcopy

In [2]:
# Classic-control CartPole task; every later cell shares this single environment.
env = gym.make("CartPole-v0")

In [3]:
def init_weights(m):
    """Initialise a linear layer in place; intended for ``module.apply(init_weights)``.

    Weights of every ``nn.Linear`` get Xavier/Glorot-uniform initialisation and
    the bias (when the layer has one) is set to the constant 0.01. Modules of
    any other type are left untouched.

    Args:
        m: a module visited by ``nn.Module.apply``.
    """
    if not isinstance(m, nn.Linear):
        return
    nn.init.xavier_uniform_(m.weight)
    # Layers built with bias=False expose ``bias`` as None; skip instead of crashing.
    if m.bias is not None:
        nn.init.constant_(m.bias, 0.01)
        
class DQN(nn.Module):
    """Fully connected Q-network: obs_dim -> 16 -> 64 -> 256 -> n_actions.

    The three hidden layers use ReLU; the output layer is linear so that each
    output is an unconstrained Q-value estimate for one action.

    Args:
        obs_dim: length of the observation vector (default 4).
        n_actions: number of discrete actions / output Q-values (default 2).
    """

    def __init__(self, obs_dim=4, n_actions=2):
        super(DQN, self).__init__()

        self.l1 = nn.Linear(obs_dim, 16)
        self.l2 = nn.Linear(16, 64)
        self.l3 = nn.Linear(64, 256)
        self.l4 = nn.Linear(256, n_actions)

    def forward(self, x):
        """Map a batch of observations (N, obs_dim) to Q-values (N, n_actions)."""
        x = F.relu(self.l1(x))
        x = F.relu(self.l2(x))
        x = F.relu(self.l3(x))
        # No activation on the head: a ReLU here would clamp Q-values at zero and
        # give zero gradient for any action whose pre-activation goes negative.
        return self.l4(x)

In [4]:
class DQN_learn():
    """Epsilon-greedy Double-DQN agent backed by a uniform replay buffer.

    Usage per episode: call ``reset(first_obs)`` to get the first action, then
    after every environment step call ``step(next_obs, reward, done)`` to store
    the transition, (possibly) train, and obtain the next action.

    Args:
        model: online Q-network; a deep copy of it becomes the target network.
        learning_rate: step size handed to the Adam optimiser.
        epsilon_sched: object exposing ``value(t)`` that returns the exploration
            probability for agent step ``t``.
        gamma: discount factor applied to the bootstrapped next-state value.
        batch_size: number of transitions sampled for each gradient update.
        act_space: object exposing ``sample()`` that returns a random action.
    """

    def __init__(self, model, learning_rate, epsilon_sched, gamma, batch_size, act_space):
        self.dqn = model
        self.target_dqn = deepcopy(model)
        self.optim = optim.Adam(lr=learning_rate, params=self.dqn.parameters())
        self.epsilon = epsilon_sched
        # Keep the discount on the instance so train() does not depend on a
        # notebook-level global named ``gamma``.
        self.gamma = gamma
        self.batch_size = batch_size
        self.act_space = act_space

        self.replay_buffer = ReplayBuffer(50000)
        self.update_freq = 100  # steps between target-network syncs
        self.train_freq = 1     # steps between gradient updates
        self.t = 1              # agent-side step counter
        self.obs_t = np.array([])
        self.act_t = None       # set by reset(); action taken from obs_t

    def train(self):
        """Run one gradient step on a minibatch sampled from the replay buffer."""
        obs_b, act_b, rew_b, obs1_b, done_b = self.replay_buffer.sample(self.batch_size)
        obs = torch.from_numpy(np.asarray(obs_b)).float()
        act = torch.from_numpy(np.asarray(act_b)).long()
        rew = torch.from_numpy(np.asarray(rew_b)).float()
        obs1 = torch.from_numpy(np.asarray(obs1_b)).float()
        dones = torch.from_numpy(np.asarray(done_b, dtype=np.float32))

        # Q-values of the actions that were actually taken.
        # squeeze(1) rather than squeeze() so a batch of one stays 1-D.
        val = self.dqn(obs).gather(1, act.view(-1, 1)).squeeze(1)

        with torch.no_grad():
            # Double DQN: the online net picks the next action, the (frozen)
            # target net evaluates it, so the bootstrap target stays fixed
            # between target-network syncs.
            next_act = self.dqn(obs1).argmax(dim=1, keepdim=True)
            val1 = self.target_dqn(obs1).gather(1, next_act).squeeze(1)
            # Terminal transitions contribute no bootstrapped value.
            targets = rew + self.gamma * val1 * (1 - dones)

        self.optim.zero_grad()
        loss = F.mse_loss(val, targets)
        loss.backward()
        self.optim.step()

    def step(self, obs_t1, rew_t, done_t):
        """Record the latest transition, train/sync if due, and pick the next action.

        Args:
            obs_t1: observation returned by the environment after the last action.
            rew_t: reward received for the last action.
            done_t: whether the episode terminated on this transition.

        Returns:
            The action to take from ``obs_t1``.

        Raises:
            RuntimeError: if called before ``reset``.
        """
        if self.act_t is None:
            raise RuntimeError("DQN_learn.reset() must be called before step()")

        # Field order follows baselines' ReplayBuffer.add(obs, action, reward, next_obs, done).
        self.replay_buffer.add(self.obs_t, self.act_t, rew_t, obs_t1, done_t)
        self.obs_t = obs_t1
        self.t = self.t + 1

        if self.t > self.batch_size and self.t % self.train_freq == 0:
            self.train()

        if self.t > self.update_freq and self.t % self.update_freq == 0:
            self.update_target()

        self.act_t = self.act(obs_t1)
        return self.act_t

    def act(self, obs):
        """Choose an action for ``obs`` with an epsilon-greedy policy.

        A random action is sampled with probability ``epsilon.value(self.t)``
        or whenever ``obs`` is empty; otherwise the action with the highest
        online-network Q-value is returned.
        """
        eps = self.epsilon.value(self.t)
        obs = np.asarray(obs)

        if obs.size == 0 or np.random.random() < eps:
            # Act randomly with epsilon probability
            if self.t % 100 == 0:
                print("Exploring with prob " + str(eps))
            return self.act_space.sample()

        with torch.no_grad():
            q_values = self.dqn(torch.as_tensor(obs, dtype=torch.float32).unsqueeze(0))
        return int(q_values.argmax(dim=1).item())

    def reset(self, obs):
        """Start a new episode from ``obs`` and return the first action.

        The action is sampled once, so the action returned to the caller is the
        same one that will be written to the replay buffer by the next step().
        """
        self.obs_t = obs
        self.act_t = self.act(obs)
        return self.act_t

    def update_target(self):
        """Copy the online network's weights into the target network."""
        self.target_dqn.load_state_dict(self.dqn.state_dict())

In [None]:
# ---- Training configuration ----
steps = 30000      # total number of environment steps
lr = 2e-4          # passed to DQN_learn as the optimiser learning rate
batch_size = 32
gamma = 1          # no discounting
# LinearSchedule(timesteps, final value, initial value): epsilon goes 1.0 -> 0.05.
epsilon = LinearSchedule(steps, 0.05, 1.0)

# ---- Agent ----
dqn = DQN()
dqn.apply(init_weights)
agent = DQN_learn(dqn, lr, epsilon, gamma, batch_size, env.action_space)

# ---- Interaction loop ----
episode_rew, episode_count = 0, 0
done = True  # forces an environment reset on the very first iteration

for t in range(steps):
    if done:
        obs = env.reset()
        act = agent.reset(obs)
        # Reports the episode that just ended (the first line logged is a dummy 0).
        print("Steps = " + str(t) + ", Episode " + str(episode_count) + " with reward = " + str(episode_rew))

        episode_count += 1
        episode_rew = 0

    obs, rew, done, _ = env.step(act)
    # The transition that ends an episode is stored and counted with zero reward.
    rew = 0 if done else rew
    episode_rew += rew
    act = agent.step(obs, rew, done)


In [6]:
# Evaluation: start and end values are equal, so epsilon stays at a flat 1%.
steps = 1000
agent.epsilon = LinearSchedule(steps, final_p=0.01, initial_p=0.01)

In [7]:
# Testing Agent
# Rolls the agent out until 101 entries are logged. The first entry is a
# placeholder 0 recorded before any episode has run, so res[1:] holds the
# 100 real episode returns.
# NOTE: actions still come from agent.step(), so transitions keep being
# stored and the network keeps training during this loop.

res = []
episode_count = 0
episode_rew = 0
done = True  # triggers the initial reset

while episode_count <= 100:
    if done:
        obs = env.reset()
        act = agent.reset(obs)

        print("Episode " + str(episode_count) + " with reward = " + str(episode_rew))
        res.append(episode_rew)
        episode_count += 1
        episode_rew = 0

    obs, rew, done, _ = env.step(act)
    episode_rew += rew
    act = agent.step(obs, rew, done)

# Uncomment to watch the rollout:
#     env.render()
# env.close()

Episode 0 with reward = 0
Episode 1 with reward = 200.0
Episode 2 with reward = 200.0
Episode 3 with reward = 200.0
Episode 4 with reward = 200.0
Episode 5 with reward = 200.0
Episode 6 with reward = 200.0
Exploring with prob 0.01
Episode 7 with reward = 200.0
Episode 8 with reward = 200.0
Episode 9 with reward = 200.0
Episode 10 with reward = 200.0
Episode 11 with reward = 200.0
Episode 12 with reward = 200.0
Episode 13 with reward = 200.0
Episode 14 with reward = 200.0
Episode 15 with reward = 200.0
Episode 16 with reward = 200.0
Episode 17 with reward = 200.0
Episode 18 with reward = 200.0
Episode 19 with reward = 200.0
Episode 20 with reward = 200.0
Episode 21 with reward = 200.0
Episode 22 with reward = 200.0
Episode 23 with reward = 200.0
Episode 24 with reward = 200.0
Episode 25 with reward = 200.0
Episode 26 with reward = 200.0
Episode 27 with reward = 200.0
Episode 28 with reward = 200.0
Episode 29 with reward = 200.0
Episode 30 with reward = 200.0
Episode 31 with reward = 200

In [8]:
# Average evaluation return; res[0] is the placeholder logged before the first episode.
np.mean(res[1:])

200.0

__Result for 400th episode:__
![last_episode](img/cartpole4.gif "Cartpole")
__Result for 500th episode:__
![last_episode](img/cartpole5.gif "Cartpole")
__Result after 30,000 steps:__
![last_episode](img/cartpole_final.gif "Cartpole")