In [1]:
import pdb
import gym
import math
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer
from baselines.common.schedules import LinearSchedule
from copy import deepcopy

In [2]:
env = gym.make('CartPole-v0')

In [3]:
def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)
        
class DQN(nn.Module):
    
    def __init__(self):
        super(DQN, self).__init__()
        
        self.l1 = nn.Linear(4, 16);
        self.l2 = nn.Linear(16, 64);        
        self.l3 = nn.Linear(64, 256);
        self.l4 = nn.Linear(256, 2);
        
    def forward(self, x):
        x = F.relu(self.l1(x))
        x = F.relu(self.l2(x))
        x = F.relu(self.l3(x))                
        x = F.relu(self.l4(x))                
        return x

In [4]:
class DQN_learn():
    def __init__(self, model, learning_rate, epsilon_sched, gamma, batch_size, act_space):
        self.dqn = model
        self.target_dqn = deepcopy(model)        
        self.optim = optim.Adam(lr= learning_rate, params= self.dqn.parameters())
        self.epsilon = epsilon_sched
        self.batch_size = batch_size
        self.act_space = act_space
        
        self.replay_buffer = ReplayBuffer(50000)
        self.update_freq = 100
        self.train_freq = 1
        self.t = 1
        self.obs_t = np.array([])
        
    def train(self):        
        
        buffer_sample = self.replay_buffer.sample(self.batch_size)
        obs = torch.from_numpy(buffer_sample[0]).float()
        act = torch.from_numpy(buffer_sample[1]).long()
        obs1 = torch.from_numpy(buffer_sample[2]).float()
        rew = torch.from_numpy(buffer_sample[3]).float()
        dones = torch.from_numpy(buffer_sample[4].astype(int)).float()                 
                      
        val = self.dqn(obs)
        val = val.gather(1, act.view(-1, 1)).squeeze() # Q-values of chosen actions
        
        _, max_act = self.target_dqn(obs1).detach().max(1)
        val1 = self.dqn(obs1).detach()
        val1 = val1.gather(1, max_act.view(-1, 1)).squeeze()
        
        targets = rew + gamma*torch.mul(val1, (1 - dones))
        
        self.optim.zero_grad()
        loss = nn.MSELoss()(val, targets)                
        loss.backward()
        self.optim.step()
                
    def step(self, obs_t1, rew_t, done_t):
        
        self.replay_buffer.add(self.obs_t, self.act_t, obs_t1, rew_t, done_t)
        self.obs_t = obs_t1
        self.t = self.t + 1        
        
        if self.t > self.batch_size and self.t%self.train_freq == 0:
            self.train()
            
        if self.t > self.update_freq and self.t%self.update_freq == 0:
            self.update_target()
        
        self.act_t = self.act(obs_t1)
        return self.act_t
        
    def act(self, obs):
        random_prob = np.random.binomial(1, self.epsilon.value(t))
        
        if random_prob == 1 or not obs.tolist() :
            
            # Act randomly with epsilon probability
            curr_act = self.act_space.sample()
            
            if self.t%100 == 0:
                print("Exploring with prob " + str(self.epsilon.value(t)))
                                
        else:                           
            curr_act = np.argmax(self.dqn(torch.from_numpy(obs).float().unsqueeze(0)).detach().numpy())                        
            
        return curr_act
    
    def reset(self, obs):
        
        self.obs_t = obs
        self.act_t = self.act(obs)        
        return self.act(obs)
        
    def update_target(self):
        self.target_dqn.load_state_dict(self.dqn.state_dict())     

In [5]:
steps = 30000
epsilon = LinearSchedule(steps, 0.05, 1.0)
lr = 2e-4
batch_size = 32
gamma = 1

dqn = DQN()
dqn.apply(init_weights)
agent = DQN_learn(dqn, lr, epsilon, gamma, batch_size, env.action_space)

done = True

episode_rew = 0
episode_count = 0

for t in range(steps):
    if done:
        obs = env.reset()
        act = agent.reset(obs)                    
        print("Steps = " + str(t) + ", Episode " + str(episode_count) + " with reward = " + str(episode_rew))
        
        episode_rew = 0
        episode_count = episode_count + 1
        
    obs, rew, done, _ = env.step(act)   
    if done:        
        rew = 0
    act = agent.step(obs, rew, done)
    episode_rew = episode_rew + rew


Steps = 0, Episode 0 with reward = 0
Steps = 11, Episode 1 with reward = 10.0
Steps = 40, Episode 2 with reward = 28.0
Steps = 56, Episode 3 with reward = 15.0
Steps = 86, Episode 4 with reward = 29.0
Exploring with prob 0.9968966666666667
Steps = 108, Episode 5 with reward = 21.0
Steps = 125, Episode 6 with reward = 16.0
Steps = 138, Episode 7 with reward = 12.0
Steps = 153, Episode 8 with reward = 14.0
Steps = 169, Episode 9 with reward = 15.0
Exploring with prob 0.99373
Steps = 208, Episode 10 with reward = 38.0
Steps = 262, Episode 11 with reward = 53.0
Steps = 272, Episode 12 with reward = 9.0
Steps = 282, Episode 13 with reward = 9.0
Steps = 296, Episode 14 with reward = 13.0
Exploring with prob 0.9905633333333334
Steps = 324, Episode 15 with reward = 27.0
Steps = 369, Episode 16 with reward = 44.0
Steps = 383, Episode 17 with reward = 13.0
Steps = 396, Episode 18 with reward = 12.0
Exploring with prob 0.9873966666666667
Steps = 425, Episode 19 with reward = 28.0
Steps = 446, Epi

Steps = 3347, Episode 158 with reward = 12.0
Exploring with prob 0.8923966666666667
Steps = 3424, Episode 159 with reward = 76.0
Steps = 3437, Episode 160 with reward = 12.0
Steps = 3447, Episode 161 with reward = 9.0
Steps = 3462, Episode 162 with reward = 14.0
Steps = 3479, Episode 163 with reward = 16.0
Steps = 3488, Episode 164 with reward = 8.0
Exploring with prob 0.88923
Exploring with prob 0.8891983333333333
Exploring with prob 0.8891983333333333
Steps = 3499, Episode 165 with reward = 10.0
Steps = 3557, Episode 166 with reward = 57.0
Steps = 3577, Episode 167 with reward = 19.0
Exploring with prob 0.8860633333333333
Steps = 3600, Episode 168 with reward = 22.0
Steps = 3626, Episode 169 with reward = 25.0
Steps = 3648, Episode 170 with reward = 21.0
Steps = 3657, Episode 171 with reward = 8.0
Steps = 3671, Episode 172 with reward = 13.0
Steps = 3690, Episode 173 with reward = 18.0
Exploring with prob 0.8828966666666667
Steps = 3729, Episode 174 with reward = 38.0
Steps = 3745, E

Steps = 7349, Episode 307 with reward = 10.0
Steps = 7364, Episode 308 with reward = 14.0
Steps = 7397, Episode 309 with reward = 32.0
Steps = 7424, Episode 310 with reward = 26.0
Steps = 7459, Episode 311 with reward = 34.0
Steps = 7498, Episode 312 with reward = 38.0
Steps = 7561, Episode 313 with reward = 62.0
Steps = 7575, Episode 314 with reward = 13.0
Steps = 7598, Episode 315 with reward = 22.0
Exploring with prob 0.7593966666666667
Steps = 7622, Episode 316 with reward = 23.0
Steps = 7678, Episode 317 with reward = 55.0
Steps = 7691, Episode 318 with reward = 12.0
Exploring with prob 0.75623
Steps = 7733, Episode 319 with reward = 41.0
Steps = 7763, Episode 320 with reward = 29.0
Steps = 7775, Episode 321 with reward = 11.0
Steps = 7815, Episode 322 with reward = 39.0
Steps = 7843, Episode 323 with reward = 27.0
Steps = 7872, Episode 324 with reward = 28.0
Exploring with prob 0.7498966666666667
Steps = 7964, Episode 325 with reward = 91.0
Exploring with prob 0.74673
Steps = 800

Steps = 14752, Episode 448 with reward = 74.0
Exploring with prob 0.5313966666666667
Steps = 14846, Episode 449 with reward = 93.0
Exploring with prob 0.52823
Steps = 14958, Episode 450 with reward = 111.0
Exploring with prob 0.5250633333333333
Steps = 15022, Episode 451 with reward = 63.0
Exploring with prob 0.5218966666666667
Steps = 15136, Episode 452 with reward = 113.0
Steps = 15165, Episode 453 with reward = 28.0
Steps = 15255, Episode 454 with reward = 89.0
Steps = 15282, Episode 455 with reward = 26.0
Exploring with prob 0.5123966666666667
Steps = 15445, Episode 456 with reward = 162.0
Steps = 15537, Episode 457 with reward = 91.0
Exploring with prob 0.5060633333333333
Steps = 15676, Episode 458 with reward = 138.0
Exploring with prob 0.5028966666666668
Steps = 15840, Episode 459 with reward = 163.0
Exploring with prob 0.49656333333333336
Steps = 15905, Episode 460 with reward = 64.0
Exploring with prob 0.4933966666666667
Steps = 16008, Episode 461 with reward = 102.0
Steps = 1

<Figure size 432x288 with 0 Axes>

In [6]:
steps = 1000
agent.epsilon = LinearSchedule(steps, 0.01, 0.01)

In [7]:
#Testing Agent

done = True

episode_rew = 0
episode_count = 0
res = []

while episode_count <= 100:
    if done:
        obs = env.reset()
        act = agent.reset(obs)            
        
        print("Episode " + str(episode_count) + " with reward = " + str(episode_rew))
        res.append(episode_rew)
        episode_rew = 0
        episode_count = episode_count + 1
        
    obs, rew, done, _ = env.step(act)       
    act = agent.step(obs, rew, done)
    episode_rew = episode_rew + rew

#     env.render()
# env.close()

Episode 0 with reward = 0
Episode 1 with reward = 200.0
Episode 2 with reward = 200.0
Episode 3 with reward = 200.0
Episode 4 with reward = 200.0
Episode 5 with reward = 200.0
Episode 6 with reward = 200.0
Exploring with prob 0.01
Episode 7 with reward = 200.0
Episode 8 with reward = 200.0
Episode 9 with reward = 200.0
Episode 10 with reward = 200.0
Episode 11 with reward = 200.0
Episode 12 with reward = 200.0
Episode 13 with reward = 200.0
Episode 14 with reward = 200.0
Episode 15 with reward = 200.0
Episode 16 with reward = 200.0
Episode 17 with reward = 200.0
Episode 18 with reward = 200.0
Episode 19 with reward = 200.0
Episode 20 with reward = 200.0
Episode 21 with reward = 200.0
Episode 22 with reward = 200.0
Episode 23 with reward = 200.0
Episode 24 with reward = 200.0
Episode 25 with reward = 200.0
Episode 26 with reward = 200.0
Episode 27 with reward = 200.0
Episode 28 with reward = 200.0
Episode 29 with reward = 200.0
Episode 30 with reward = 200.0
Episode 31 with reward = 200

In [8]:
np.array(res[1:]).mean()

200.0

__Result for 400th episode:__
![last_episode](img/cartpole4.gif "Cartpole")
__Result for 500th episode:__
![last_episode](img/cartpole5.gif "Cartpole")
__Result after 30,000 steps:__
![last_episode](img/cartpole_final.gif "Cartpole")