In [2]:
#!nvidia-smi

In [3]:
import numpy as np
import random

import gym
from baselines.common.atari_wrappers import wrap_deepmind


from tqdm import tqdm


import statistics

import argparse
import os

import tensorflow as tf

import matplotlib.pyplot as plt
from copy import deepcopy

from IPython import display
from time import time
%matplotlib inline

from tensorflow import summary
#%load_ext tensorboard

import datetime




In [4]:
class ExperienceBuffer():
    """Fixed-capacity ring buffer of transitions for experience replay.

    Transitions are stored by reference (no copy is made). Once `size`
    transitions have been added, every further `add` overwrites the oldest
    stored transition.
    """

    def __init__(self, size=10000):
        """
        Args:
            size: maximum number of transitions kept in the buffer.
        """
        self.size = size
        # Slot that the next `add` overwrites once the buffer is full.
        self.cursor = 0
        self.buffer = []

    def add(self, exp):
        """Store one transition, evicting the oldest one when the buffer is full.

        Args:
            exp: a sequence describing one transition; it is stored as-is.
        """
        if len(self.buffer) < self.size:
            self.buffer.append(exp)
        else:
            self.buffer[self.cursor] = exp
        self.cursor += 1
        if self.cursor == self.size:
            self.cursor = 0

    def sample(self, sample_size):
        """Draw `sample_size` distinct transitions uniformly at random.

        Args:
            sample_size: number of transitions to draw.

        Returns:
            An object-dtype array of shape (sample_size, fields_per_transition),
            so that `batch[:, k]` selects field `k` of every sampled transition.

        Raises:
            ValueError: (from `random.sample`) if `sample_size` exceeds the
                number of stored transitions.
        """
        batch = random.sample(self.buffer, k=sample_size)
        n_fields = len(batch[0]) if batch else 0
        # Fill an object array cell by cell. Calling np.array() on rows that mix
        # arrays and scalars is a ragged construction, which NumPy >= 1.24
        # rejects with a ValueError (older versions only warned).
        out = np.empty((len(batch), n_fields), dtype=object)
        for row, exp in enumerate(batch):
            for col, field in enumerate(exp):
                out[row, col] = field
        return out


In [5]:
def conv2d(x, W, b, strides=1):
    """Convolve `x` with `W` (VALID padding), add the bias `b`, then apply ReLU.

    Args:
        x: input tensor handed to `tf.nn.conv2d`.
        W: filter tensor laid out as
            [filter_height, filter_width, in_channels, out_channels].
        b: bias added to the convolution output with `+`.
        strides: stride value, passed to `tf.nn.conv2d` as a one-element list.

    Returns:
        The rectified, biased convolution output.
    """
    pre_activation = tf.nn.conv2d(x, W, strides=[strides], padding='VALID') + b
    return tf.nn.relu(pre_activation)


In [6]:
class QNetwork():
    """Q-value network assembled from raw TF1 variables inside one variable scope.

    Attributes:
        X: float32 placeholder for a batch of observations, shape [None] + obs_dim.
        Y: float32 placeholder for the regression targets (shape unconstrained).
        M: float32 placeholder, shape [None, num_actions]; the network output is
            multiplied element-wise by it.
        layers: list of dicts describing each layer ('conv' or 'fc').
        wb: list of [weights, bias] variable pairs, one per entry of `layers`.
        forward: output tensor, i.e. the masked Q-values.
    """

    def __init__(self, num_actions, obs_dim, scope, atari):
        """
        Args:
            num_actions: size of the output layer.
            obs_dim: observation shape without the batch dimension.
            scope: name of the variable scope that will hold the variables.
            atari: if true, build the convolutional stack and scale inputs by
                1/255; otherwise build a small fully connected network.
        """
        self.scope = scope
        self.out_dim = num_actions
        self.obs_dim = obs_dim
        self.atari = atari
        # Assign ops created by copy_model_parameters, keyed by source scope.
        self._copy_ops = {}
        with tf.variable_scope(scope):

            self.X = tf.placeholder(tf.float32, [None] + list(obs_dim))
            self.Y = tf.placeholder(tf.float32)
            self.M = tf.placeholder(tf.float32, [None, self.out_dim])
            fc_nb = 256
            if atari:
                # 'prev' of the first fc layer is the flattened conv output.
                # 9*9*32 assumes 84x84 input frames:
                # (84-8)/4+1 = 20, then (20-4)/2+1 = 9 -- TODO confirm input size.
                self.layers = [
                    {'type': 'conv', 'in': self.obs_dim[-1], 'out': 16, 'height': 8, 'width': 8, 'stride': 4},
                    {'type': 'conv', 'in': 16, 'out': 32, 'height': 4, 'width': 4, 'stride': 2},
                    {'type': 'fc', 'n': fc_nb, 'prev': 9*9*32},
                    {'type': 'fc', 'n': self.out_dim, 'prev': fc_nb}
                ]
                # A larger alternative that was tried: conv 32@8x8/4, conv 64@4x4/2,
                # conv 64@3x3/1, fc(7*7*64 -> fc_nb), fc(fc_nb -> num_actions).
            else:
                self.layers = [
                    {'type': 'fc', 'n': 64, 'prev': self.obs_dim[-1]},
                    {'type': 'fc', 'n': 64, 'prev': 64},
                    {'type': 'fc', 'n': self.out_dim, 'prev': 64}
                ]

            self.wb = []
            initializer = tf.initializers.variance_scaling(scale=2.0)
            # Variables are created in layer order, weights before bias;
            # copy_model_parameters pairs variables of two networks by sorted name.
            for layer in self.layers:
                if layer['type'] == 'conv':
                    self.wb.append([tf.Variable(initializer([layer['height'], layer['width'], layer['in'], layer['out']])),
                                    tf.Variable(tf.zeros([layer['out']]))])
                elif layer['type'] == 'fc':
                    self.wb.append([tf.Variable(initializer([layer['prev'], layer['n']])),
                                    tf.Variable(tf.zeros([layer['n']]))])

            self.forward = self.graph(self.X, self.M)

    def graph(self, x, mask):
        """Build the forward pass.

        Args:
            x: input tensor (a batch of observations).
            mask: tensor multiplied element-wise with the final layer output.

        Returns:
            The masked output of the last layer.
        """
        if self.atari:
            x = x/255.0
        last = len(self.wb) - 1
        for i, (weights, bias) in enumerate(self.wb):
            if self.layers[i]['type'] == 'conv':
                x = conv2d(x, weights, bias, strides=self.layers[i]['stride'])
            else:
                x = tf.contrib.layers.flatten(x)
                x = tf.matmul(x, weights) + bias
                # The last layer stays linear.
                if i < last:
                    x = tf.nn.relu(x)
        return x*mask

    def copy_model_parameters(self, sess, copy_scope = 'main'):
        """Overwrite this network's variables with those of another scope.

        Trainable variables whose name starts with `copy_scope` (source) and
        with `self.scope` (destination) are each sorted by name and paired up
        in that order.

        The assign ops are created on the first call for a given `copy_scope`
        and reused afterwards, so repeated synchronisation does not keep adding
        nodes to the TensorFlow graph.

        Args:
            sess: TensorFlow session used to run the assignments.
            copy_scope: name prefix of the variables to copy from.

        Raises:
            ValueError: if no (source, destination) variable pair is found,
                e.g. because `copy_scope` is misspelled.
        """
        # setdefault keeps this working on instances created without __init__.
        cache = self.__dict__.setdefault('_copy_ops', {})
        update_ops = cache.get(copy_scope)
        if update_ops is None:
            trainable = tf.trainable_variables()
            source = sorted((v for v in trainable if v.name.startswith(copy_scope)),
                            key=lambda v: v.name)
            target = sorted((v for v in trainable if v.name.startswith(self.scope)),
                            key=lambda v: v.name)
            update_ops = [dst.assign(src) for src, dst in zip(source, target)]
            if not update_ops:
                raise ValueError(
                    "no trainable variables to copy from scope %r to scope %r"
                    % (copy_scope, self.scope))
            cache[copy_scope] = update_ops

        sess.run(update_ops)



In [7]:
class DQN():
    """Deep Q-learning agent with a target network and optional Double-DQN targets.

    Transitions are stored in the replay buffer as
    ``[obs, action, reward, next_obs, not_done]``.
    """

    # Hard cap on the number of steps of one evaluation / recorded episode.
    MAX_EPISODE_STEPS = 50000
    # Upper bound (inclusive) for the random number of initial action-0 steps
    # taken in each evaluation episode (see https://arxiv.org/pdf/1511.06581.pdf).
    MAX_NOOP_STEPS = 10

    def __init__(self, env, sess, lr=1e-4, gamma=0.99, buffer_size=100000,
                 device=None, epoch_steps=5e4, evaluation_runs=5, batch_size=32,
                 multiplier=1, target_network_update_freq = 2000, epsilon_decay_steps = 5e5,
                 start_buffer=20000, atari=True, double_dqn=True):
        """Build both networks, the loss and optimizer, and pre-fill the buffer.

        Args:
            env: Gym environment with a discrete action space.
            sess: TensorFlow session used for every graph evaluation.
            lr: learning rate of the Adam optimizer.
            gamma: discount factor.
            buffer_size: capacity of the replay buffer.
            device: unused; kept for interface compatibility.
            epoch_steps: number of training iterations per epoch (cast to int).
            evaluation_runs: number of episodes played by `run_evaluation`.
            batch_size: minibatch size; it is multiplied by `multiplier`.
            multiplier: number of environment steps taken per training iteration.
            target_network_update_freq: the target network is synchronised every
                this many training iterations.
            epsilon_decay_steps: number of training iterations over which epsilon
                decays linearly from 1 to 0.1.
            start_buffer: number of random-policy transitions collected up front.
            atari: forwarded to `QNetwork`.
            double_dqn: if true (default), pick the next action with the online
                network and evaluate it with the target network; otherwise use
                the maximum target-network Q-value.
        """
        self.env = env
        self.sess = sess
        self.atari = atari
        self.double_dqn = double_dqn
        self.epoch = 0

        self.exp_buf = ExperienceBuffer(buffer_size)

        self.gamma = gamma
        self.multiplier = multiplier
        self.batch_size = batch_size*multiplier

        self.evaluation_runs = evaluation_runs
        # Callers pass floats such as 5e4; an int keeps the progress-bar total
        # from rendering as "10000.0".
        self.epoch_steps = int(epoch_steps)
        self.num_actions = self.env.action_space.n
        self.obs_dim = self.env.observation_space.shape
        self.epsilon_start = 1
        self.epsilon_end = 0.1
        self.epsilon_decay_steps = epsilon_decay_steps

        self.qnet = QNetwork(self.num_actions, self.obs_dim, atari=atari, scope='main')
        self.target_network = QNetwork(self.num_actions, self.obs_dim, atari=atari, scope='target')
        self.target_network_update_freq = target_network_update_freq

        self.qnet_loss = tf.reduce_mean(tf.losses.huber_loss(predictions = self.qnet.forward, labels=self.qnet.Y, delta=5.0))

        # The loss value is computed in Python and fed back in through this
        # placeholder to be written as a scalar summary.
        self.tf_loss_ph = tf.placeholder(tf.float32,shape=None,name='loss_summary')
        self.loss_summary = tf.summary.scalar('loss', self.tf_loss_ph)
        self.writer = tf.summary.FileWriter('logs/', self.sess.graph)

        self.qnet_optimizer = tf.train.AdamOptimizer(lr).minimize(self.qnet_loss)

        self._saver = None  # created lazily by save_model

        # Number of training iterations performed so far (drives epsilon).
        self.steps = 0
        # Best per-epoch mean of the training episode returns.
        self.best_eval = -99999

        self.initialize_buffer(start_buffer)

        self.sess.run(tf.global_variables_initializer())

    def save_model(self):
        """Write a checkpoint named after the current epoch into the working directory."""
        # Build the Saver once: each tf.train.Saver() adds save/restore ops to the graph.
        if getattr(self, '_saver', None) is None:
            self._saver = tf.train.Saver()
        self._saver.save(self.sess, './Epoch' + str(self.epoch) + '-Pong.ckpt')
        print('saved')

    def initialize_buffer(self, steps=20000):
        """Fill the replay buffer with `steps` transitions of a uniformly random policy."""
        done = True
        obs = None
        print("\nInitializing buffer:")
        for _ in tqdm(range(steps)):
            if done:
                obs = self.env.reset()
            init_obs = obs
            act = self.env.action_space.sample()
            obs, rew, done, _ = self.env.step(act)
            self.exp_buf.add([init_obs, act, rew, obs, not done])

    def _q_values(self, network, obs_batch):
        """Evaluate `network` on `obs_batch` with all actions unmasked."""
        feed = {network.X: obs_batch,
                network.M: np.ones((len(obs_batch), self.num_actions))}
        return network.forward.eval(feed, session=self.sess)

    def _greedy_action(self, obs):
        """Return the action with the highest online-network Q-value for `obs`."""
        return np.argmax(self._q_values(self.qnet, [obs])[0])

    def choose_action(self, obs):
        """Epsilon-greedy action selection.

        Args:
            obs: current observation.

        Returns:
            A random action with probability `self.epsilon`, otherwise the
            greedy action of the online network.
        """
        if random.random() < self.epsilon:
            # Sample from the agent's own environment, not a notebook global.
            return self.env.action_space.sample()
        return self._greedy_action(obs)

    def _train_step(self):
        """Sample one minibatch, run one optimizer step and log the loss.

        Returns:
            The loss value of the minibatch.
        """
        sample = self.exp_buf.sample(self.batch_size)
        next_obs = np.stack(sample[:,3])

        if self.double_dqn:
            # Double DQN: the online network picks the next action ...
            maxq_index = np.argmax(self._q_values(self.qnet, next_obs), axis=1)

        nextqs = self._q_values(self.target_network, next_obs)

        if self.double_dqn:
            # ... and the target network supplies its value.
            target_q = nextqs[range(len(nextqs)), maxq_index]
        else:
            target_q = np.amax(nextqs, 1)

        # sample[:,4] is `not done`, so terminal transitions get no bootstrap term.
        ys = sample[:,2] + sample[:,4] * self.gamma * target_q

        # One-hot rows select the action actually taken; the network output is
        # masked the same way, so only that action contributes to the loss.
        action_mask = np.eye(self.num_actions)[sample[:,1].astype(int)]
        ys = np.multiply(ys, action_mask.T).T

        if not self.steps%150000 and self.steps:
            print('ys', ys[:8])

        _, c = self.sess.run([self.qnet_optimizer, self.qnet_loss],
                             feed_dict = {self.qnet.X: np.stack(sample[:,0]),
                                          self.qnet.M: action_mask,
                                          self.qnet.Y: ys})
        loss_summ = self.sess.run(self.loss_summary, feed_dict = {self.tf_loss_ph: c})
        self.writer.add_summary(loss_summ, self.steps)
        return c

    def train_epoch(self):
        """Run `epoch_steps` training iterations, print statistics, then evaluate.

        Each iteration takes up to `multiplier` epsilon-greedy environment steps
        followed by one minibatch update. The target network is synchronised
        whenever `steps` is a multiple of `target_network_update_freq`.
        """
        self.epoch += 1
        i = 0
        print("====Epoch:", self.epoch, "====")
        print('steps', self.steps)
        print('epsilon', self.epsilon)

        losss = []
        rew_list = []
        with tqdm(total=self.epoch_steps) as pbar:

            while i < self.epoch_steps:

                obs = self.env.reset()
                done = False
                tot_rew = 0

                while (not done) and (i < self.epoch_steps):

                    for _repeat in range(self.multiplier):
                        init_obs = obs
                        act = self.choose_action(obs)
                        obs, rew, done, _ = self.env.step(act)

                        tot_rew += rew
                        self.exp_buf.add([init_obs, act, rew, obs, not done])

                        if done:
                            rew_list.append(tot_rew)
                            break

                    if not (self.steps)%self.target_network_update_freq:
                        self.target_network.copy_model_parameters(self.sess, self.qnet.scope)

                    losss.append(self._train_step())

                    self.steps += 1
                    i += 1
                    pbar.update(1)

        # An epoch may end before a single episode terminates; statistics.mean
        # would raise on the empty list.
        if rew_list:
            avg_rew = statistics.mean(rew_list)
            print("\nAvg rew:", avg_rew)
            if self.best_eval < avg_rew:
                self.best_eval = avg_rew
        else:
            print("\nAvg rew: n/a (no episode finished in this epoch)")
        if losss:
            print("losss:", statistics.mean(list(map(float, losss))))
        self.run_evaluation()

    def run_evaluation(self):
        """Play `evaluation_runs` greedy episodes on `self.env` and print their returns.

        Each episode starts with a random number (0..MAX_NOOP_STEPS) of action-0
        steps before the greedy policy takes over.
        """
        rs = []
        for _run in tqdm(range(self.evaluation_runs)):
            tot_rew = 0
            obs = self.env.reset()
            done = False
            noopAct = random.randint(0, self.MAX_NOOP_STEPS)
            for i in range(self.MAX_EPISODE_STEPS):
                if done:
                    break

                if i < noopAct:
                    act = 0
                else:
                    act = self._greedy_action(obs)
                obs, rew, done, _ = self.env.step(act)
                tot_rew += rew
            rs.append(tot_rew)

        print("test rewards:", rs)

    def display_agent(self):
        """Record one greedy episode and return it as an embeddable HTML video.

        The episode is recorded through `gym.wrappers.Monitor` into
        ``./gym-results`` (existing content is overwritten, `force=True`).

        Returns:
            An `IPython.display.HTML` object embedding the recorded mp4. Make the
            call the last expression of a cell (or pass the result to
            `display`) to render it.
        """
        import io
        import base64
        from IPython.display import HTML

        env = gym.wrappers.Monitor(self.env, "./gym-results", force=True)
        try:
            obs = env.reset()
            done = False
            for _ in range(self.MAX_EPISODE_STEPS):
                if done:
                    break
                act = self._greedy_action(obs)
                obs, rew, done, _ = env.step(act)
        finally:
            env.close()

        video_path = './gym-results/openaigym.video.%s.video000000.mp4' % env.file_infix
        with io.open(video_path, 'rb') as video_file:
            video = video_file.read()
        encoded = base64.b64encode(video)
        # Return the HTML object: building it without returning or displaying it
        # shows nothing in the notebook.
        return HTML(data='''
            <video width="360" height="auto" alt="test" controls><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''
        .format(encoded.decode('ascii')))

    @property
    def epsilon(self):
        """Current exploration rate.

        Decays linearly with `self.steps` from `epsilon_start` to `epsilon_end`
        over `epsilon_decay_steps` steps and stays at `epsilon_end` afterwards.
        """
        eps = self.epsilon_start - (self.epsilon_start-self.epsilon_end) * self.steps / self.epsilon_decay_steps
        return max(self.epsilon_end, eps)

    


In [8]:
#if __name__ == '__main__':

# Environment to train on; uncomment one of the alternatives to switch.
env = gym.make('CartPole-v1')
#env = gym.make('Pong-v4')
#env = gym.make('PongNoFrameskip-v4')
#env = gym.make('PongDeterministic-v4')
#env = gym.make('Breakout-v0')
#env = gym.make('BreakoutDeterministic-v4')


# allow_growth makes TensorFlow allocate GPU memory on demand.
# (allow_soft_placement=True was another option considered here.)
config = tf.ConfigProto(log_device_placement=True)
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
#sess = tf.Session()

# Decide between the convolutional (Atari) and the dense network by looking at
# the innermost environment; `unwrapped` works however many wrappers are stacked.
atari = 'atari' in str(env.unwrapped).lower()
if atari:
    print('atari')
    env = wrap_deepmind(env, frame_stack=True, episode_life=False)
    dqn_args = {
        "env": env,
        "sess": sess,
        "multiplier": 1,
        'lr':2e-4,
        'buffer_size':100000,
        'epoch_steps':5e4,
        'gamma': 0.99,
        'target_network_update_freq': 5000,
        'epsilon_decay_steps': 6e5,
        'start_buffer': 40000,
        'batch_size': 32,
        'atari': atari
    }
else:
    dqn_args = {
        'env': env,
        'sess': sess,
        'lr': 1e-3,
        'buffer_size':50000,
        'epoch_steps':1e4,
        'gamma': 0.95,
        'target_network_update_freq': 1000,
        'epsilon_decay_steps': 5e4,
        'atari': atari,
    }


epochs = 100

dqn = DQN(**dqn_args)

# Each train_epoch() call trains for `epoch_steps` iterations and then evaluates.
for e in range(epochs):
    print(e)
    dqn.train_epoch()


 36%|███▌      | 7114/20000 [00:00<00:00, 71133.79it/s]


Initializing buffer:


100%|██████████| 20000/20000 [00:00<00:00, 42226.04it/s]
  0%|          | 0/10000.0 [00:00<?, ?it/s]

0
====Epoch: 1 ====
steps 0
epsilon 1.0


100%|██████████| 10000/10000.0 [00:52<00:00, 189.65it/s]
  0%|          | 0/5 [00:00<?, ?it/s]


Avg rew: 25.713917525773194
losss: 0.02063869594070711


100%|██████████| 5/5 [00:00<00:00, 11.69it/s]
  0%|          | 1/10000.0 [00:00<17:23,  9.58it/s]

test rewards: [10.0, 9.0, 172.0, 221.0, 10.0]
1
====Epoch: 2 ====
steps 10000
epsilon 0.8200000000000001


100%|██████████| 10000/10000.0 [00:51<00:00, 195.18it/s]
  0%|          | 0/5 [00:00<?, ?it/s]


Avg rew: 31.50473186119874
losss: 0.046496653694927226


100%|██████████| 5/5 [00:00<00:00,  6.96it/s]
  0%|          | 1/10000.0 [00:00<26:46,  6.22it/s]

test rewards: [9.0, 9.0, 199.0, 279.0, 9.0]
2
====Epoch: 3 ====
steps 20000
epsilon 0.64


  0%|          | 19/10000.0 [00:00<03:03, 54.46it/s]


KeyboardInterrupt: 

In [None]:
"""
import gym
from gym import wrappers
import time

#env = gym.make('Pong-v0')
env = gym.make('Breakout-v0')

env = wrap_deepmind(env, frame_stack=True, episode_life=False)


env = wrappers.Monitor(env, "./gym-results", force=True)

env.reset()
done = False
for _ in range(50000):
    if done:
        break
    act = env.action_space.sample()
    #time.sleep(0.003)
    obs, rew, done, _ = env.step(act)
    #env.render()
env.close()
"""

In [None]:
"""
import io
import base64
from IPython.display import HTML

video = io.open('./gym-results/openaigym.video.%s.video000000.mp4' % env.file_infix, 'r+b').read()
encoded = base64.b64encode(video)
HTML(data='''
    <video width="360" height="auto" alt="test" controls><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''
.format(encoded.decode('ascii')))
"""

In [None]:

#


#%tensorboard --logdir logs
#print(dqn.epoch)
#dqn.run_evaluation()
# Highest per-epoch mean of the training episode returns recorded by train_epoch
# (stays at -99999 until an epoch has produced one).
print(dqn.best_eval)

# Play `evaluation_runs` greedy episodes and print their total rewards.
dqn.run_evaluation()

In [None]:
# Record one greedy episode through gym's Monitor wrapper into ./gym-results.
dqn.display_agent()
    #!pip uninstall tensorflow-gpu
  #!pip install tensorflow-gpu