In [None]:
#!nvidia-smi

In [None]:
import numpy as np
import random

import gym
from atari_wrappers import wrap_deepmind

from tqdm import tqdm

import statistics

import argparse
import os

import tensorflow as tf

import matplotlib.pyplot as plt
from copy import deepcopy

from IPython import display
from time import time
%matplotlib inline

from tensorflow import summary
#%load_ext tensorboard

import datetime

In [None]:
class ExperienceBuffer():
    """Fixed-capacity replay buffer that overwrites its oldest entries when full.

    Experiences are stored as-is in a Python list. Once `size` entries have
    been added, each new experience replaces the slot pointed to by `cursor`,
    which wraps around to 0 after reaching `size`.
    """

    def __init__(self, size=10000):
        """Create an empty buffer holding at most `size` experiences."""
        self.size = size
        self.cursor = 0      # next slot to overwrite once the buffer is full
        self.buffer = []

    def __len__(self):
        """Number of experiences currently stored."""
        return len(self.buffer)

    def add(self, exp):
        """Store one experience, replacing the oldest one when at capacity."""
        if len(self.buffer) < self.size:
            self.buffer.append(exp)
        else:
            self.buffer[self.cursor] = exp
        self.cursor += 1
        if self.cursor == self.size:
            self.cursor = 0

    def sample(self, sample_size):
        """Draw `sample_size` distinct experiences uniformly at random.

        Returns:
            A 2-D object array of shape (sample_size, fields_per_experience),
            so callers can slice one field across the batch with `batch[:, k]`.

        Raises:
            ValueError: if `sample_size` exceeds the number of stored
                experiences (raised by `random.sample`).
        """
        batch = random.sample(self.buffer, k=sample_size)
        width = len(batch[0]) if batch else 0
        # Experiences mix arrays with scalars, i.e. they are ragged. Letting
        # np.array() infer the layout is an error on recent NumPy releases, so
        # the object array is allocated explicitly and filled cell by cell.
        out = np.empty((len(batch), width), dtype=object)
        for row, exp in enumerate(batch):
            for col, field in enumerate(exp):
                out[row, col] = field
        return out
        

In [None]:
def conv2d(x, W, b, strides=1):
    """Apply a VALID-padded 2-D convolution, add the bias and pass through ReLU.

    Args:
        x: input tensor handed to `tf.nn.conv2d`.
        W: convolution filter, laid out as
            [filter_height, filter_width, in_channels, out_channels].
        b: bias added to the convolution output.
        strides: stride value, forwarded to TensorFlow as a one-element list.

    Returns:
        The ReLU-activated result.
    """
    convolved = tf.nn.conv2d(x, W, strides=[strides], padding='VALID')
    biased = convolved + b
    return tf.nn.relu(biased)


In [None]:
class QNetwork():
    """Q-value network built from raw TensorFlow variables.

    The architecture is described by `self.layers`, a list of layer specs.
    A plain dict is a single layer; a two-element list holds the parallel
    value / advantage layers of a dueling head. `self.wb` mirrors that
    structure with `[weights, bias]` pairs.

    Attributes:
        X: observation placeholder, shape [None] + obs_dim.
        Y: target placeholder (no static shape).
        M: action-mask placeholder, shape [None, num_actions].
        forward: network output multiplied element-wise by `M`.
    """

    def __init__(self, num_actions, obs_dim, scope, atari, dueling=True):
        """Create placeholders, variables and the forward graph under `scope`.

        Args:
            num_actions: size of the output layer.
            obs_dim: observation shape (without the batch dimension).
            scope: variable-scope name; also used to look the variables up in
                `copy_model_parameters`.
            atari: if True use the convolutional architecture and divide the
                input by 255; otherwise use a small fully connected network.
            dueling: if True build separate value and advantage streams.
        """
        self.scope = scope
        self.out_dim = num_actions
        self.obs_dim = obs_dim
        self.atari = atari
        self.dueling = dueling
        # Assign ops built by copy_model_parameters, keyed by source scope, so
        # repeated target-network syncs do not keep adding ops to the graph.
        self._copy_ops = {}
        with tf.variable_scope(scope):

            self.X = tf.placeholder(tf.float32, [None] + list(obs_dim))
            self.Y = tf.placeholder(tf.float32)
            self.M = tf.placeholder(tf.float32, [None, self.out_dim])
            self.layers = self._layer_specs(num_actions, obs_dim, atari, dueling)

            self.wb = []
            initializer = tf.initializers.variance_scaling(scale=2.0)
            # Variables are created in spec order (weights before bias) so the
            # auto-generated variable names line up between networks.
            for layer in self.layers:
                if type(layer) is not list:
                    self.wb.append(self._make_wb(initializer, layer))
                else:
                    self.wb.append([self._make_wb(initializer, l) for l in layer])
            self.forward = self.graph(self.X, self.M)

    @staticmethod
    def _layer_specs(num_actions, obs_dim, atari, dueling, fc_nb=512):
        """Return the layer specification list for the requested architecture."""
        if atari:
            if dueling:
                return [
                    {'type': 'conv', 'in': obs_dim[-1], 'out': 32, 'height': 8, 'width': 8, 'stride': 4},
                    {'type': 'conv', 'in': 32, 'out': 64, 'height': 4, 'width': 4, 'stride': 2},
                    {'type': 'conv', 'in': 64, 'out': 64, 'height': 3, 'width': 3, 'stride': 1},
                    [{'type': 'fc', 'n': fc_nb, 'prev': 7*7*64}, {'type': 'fc', 'n': fc_nb, 'prev': 7*7*64}],
                    [{'type': 'fc', 'n': 1, 'prev': fc_nb}, {'type': 'fc', 'n': num_actions, 'prev': fc_nb}]
                ]
            return [
                {'type': 'conv', 'in': obs_dim[-1], 'out': 16, 'height': 8, 'width': 8, 'stride': 4},
                {'type': 'conv', 'in': 16, 'out': 32, 'height': 4, 'width': 4, 'stride': 2},
                {'type': 'fc', 'n': fc_nb, 'prev': 9*9*32},
                {'type': 'fc', 'n': num_actions, 'prev': fc_nb}
            ]
        if dueling:
            # Without these parallel streams graph() has nothing to combine in
            # its dueling step and fails on xx[0]; mirror the plain MLP sizes.
            return [
                {'type': 'fc', 'n': 64, 'prev': obs_dim[-1]},
                [{'type': 'fc', 'n': 64, 'prev': 64}, {'type': 'fc', 'n': 64, 'prev': 64}],
                [{'type': 'fc', 'n': 1, 'prev': 64}, {'type': 'fc', 'n': num_actions, 'prev': 64}]
            ]
        return [
            {'type': 'fc', 'n': 64, 'prev': obs_dim[-1]},
            {'type': 'fc', 'n': 64, 'prev': 64},
            {'type': 'fc', 'n': num_actions, 'prev': 64}
        ]

    @staticmethod
    def _make_wb(initializer, spec):
        """Create the `[weights, bias]` variable pair for one layer spec."""
        if spec['type'] == 'conv':
            w_shape = [spec['height'], spec['width'], spec['in'], spec['out']]
            n_out = spec['out']
        elif spec['type'] == 'fc':
            w_shape = [spec['prev'], spec['n']]
            n_out = spec['n']
        else:
            raise ValueError('unknown layer type: %r' % (spec['type'],))
        return [tf.Variable(initializer(w_shape)), tf.Variable(tf.zeros([n_out]))]

    def graph(self, x, mask):
        """Build the forward pass for input `x` and return the masked output.

        Every layer except the last is followed by a ReLU. With a dueling
        head the two streams are merged as
        `value + (advantage - mean(advantage))` before the mask is applied.
        """
        if self.atari:
            x = x/255.0
        xx = []  # [value stream, advantage stream] once the dueling head starts
        for i, wb in enumerate(self.wb):

            if type(wb[0]) is not list:
                if self.layers[i]['type'] == 'conv':
                    x = conv2d(x, wb[0], wb[1], strides=self.layers[i]['stride'])
                else:
                    x = tf.contrib.layers.flatten(x)
                    x = tf.matmul(x, wb[0])+wb[1]
                    if i+1<len(self.wb):
                        x = tf.nn.relu(x)
            else:
                if not xx:
                    # First dueling layer: both streams branch off the shared trunk.
                    x = tf.contrib.layers.flatten(x)
                    xx = [tf.matmul(x, wb[0][0])+wb[0][1], tf.matmul(x, wb[1][0])+wb[1][1]]
                else:
                    xx[0] = tf.matmul(xx[0], wb[0][0])+wb[0][1]
                    xx[1] = tf.matmul(xx[1], wb[1][0])+wb[1][1]
                if i+1<len(self.wb):
                    xx[0] = tf.nn.relu(xx[0])
                    xx[1] = tf.nn.relu(xx[1])

        if self.dueling:
            x = xx[0] + tf.subtract(xx[1], tf.reduce_mean(xx[1], axis=1, keepdims=True))

        x = x*mask
        return x

    def copy_model_parameters(self, sess, copy_scope = 'main'):
        """Overwrite this network's trainable variables with those of another.

        Variables are matched by sorting both sets by name and pairing them
        in order, so both networks must have been built with the same
        architecture.

        Args:
          sess: Tensorflow session instance used to run the assignments.
          copy_scope: name prefix of the variables to copy the values from.
        """
        update_ops = self._copy_ops.get(copy_scope)
        if update_ops is None:
            e1_params = [t for t in tf.trainable_variables() if t.name.startswith(copy_scope)]
            e1_params = sorted(e1_params, key=lambda v: v.name)

            e2_params = [t for t in tf.trainable_variables() if t.name.startswith(self.scope)]
            e2_params = sorted(e2_params, key=lambda v: v.name)

            update_ops = [e2_v.assign(e1_v) for e1_v, e2_v in zip(e1_params, e2_params)]
            self._copy_ops[copy_scope] = update_ops

        sess.run(update_ops)


In [None]:
class DQN():
    """Deep Q-learning agent with optional double-Q targets and dueling head.

    The `algo` string selects the variant: a lowercase 'd' enables the
    dueling network, an uppercase 'D' enables double-DQN action selection
    when computing the bootstrap target.
    """

    def __init__(self, env, sess, lr=1e-4, gamma=0.99, buffer_size=100000,
                 device=None, epoch_steps=5e4, evaluation_runs=5, batch_size=32,
                 multiplier=1, target_network_update_freq = 2000, epsilon_decay_steps = 5e5,
                 start_buffer=20000, atari=True, algo='dDQ', eval_rate=1):
        """Build both networks, the loss/optimizer and pre-fill the replay buffer.

        Args:
            env: environment exposing `reset`, `step`, `action_space` and
                `observation_space` (gym style).
            sess: TensorFlow session used for every graph evaluation.
            lr: learning rate of the Adam optimizer.
            gamma: discount factor applied to the bootstrapped Q-value.
            buffer_size: capacity of the replay buffer.
            device: unused.
            epoch_steps: number of training updates per `train_epoch` call.
            evaluation_runs: default number of episodes for `run_evaluation`.
            batch_size: base minibatch size (scaled by `multiplier`).
            multiplier: environment steps taken per training update; also
                multiplies the batch size.
            target_network_update_freq: training steps between target syncs.
            epsilon_decay_steps: steps over which epsilon decays from 1 to 0.1.
            start_buffer: random transitions collected before training.
            atari: forwarded to `QNetwork` to pick the architecture.
            algo: variant flags, see the class docstring.
            eval_rate: run an evaluation every `eval_rate` epochs.
        """
        self.env = env
        self.sess = sess
        self.atari = atari
        self.epoch = 0
        self.losss = []
        self.eval_rate = eval_rate
        self.eval_rec = {}
        self.algo = algo

        self.exp_buf = ExperienceBuffer(buffer_size)

        self.gamma = gamma
        self.multiplier = multiplier
        self.batch_size = batch_size*multiplier

        self.evaluation_runs = evaluation_runs
        self.epoch_steps = epoch_steps
        self.num_actions = self.env.action_space.n
        self.obs_dim = self.env.observation_space.shape
        self.epsilon_start = 1
        self.epsilon_end = 0.1
        self.epsilon_decay_steps = epsilon_decay_steps

        self.dueling = 'd' in algo

        self.qnet = QNetwork(self.num_actions, self.obs_dim, atari=atari, scope='main', dueling=self.dueling)

        self.target_network = QNetwork(self.num_actions, self.obs_dim, atari=atari, scope='target', dueling=self.dueling)

        self.target_network_update_freq = target_network_update_freq

        self.qnet_loss = tf.reduce_mean(tf.losses.huber_loss(predictions = self.qnet.forward, labels=self.qnet.Y, delta=5.0))

        self.qnet_optimizer = tf.train.AdamOptimizer(lr).minimize(self.qnet_loss)

        self.steps = 0
        self.best_train_eval = -99999
        self.best_test_eval = -99999

        self.initialize_buffer(start_buffer)

        self.sess.run(tf.global_variables_initializer())

    def save_model(self, info='', folder='models'):
        """Save a checkpoint to ./<folder>/<env id>/Epoch<epoch><info>.ckpt."""
        target_dir = './' + folder + '/' + self.env.unwrapped.spec.id
        os.makedirs(target_dir, exist_ok=True)

        saver = tf.train.Saver()
        saver.save(self.sess, target_dir + '/Epoch' + str(self.epoch) + str(info) + '.ckpt')
        print('saved')

    def initialize_buffer(self, steps=20000):
        """Fill the replay buffer with `steps` transitions from random actions."""
        done = True
        obs = None
        print("\nInitializing buffer:")
        for _ in tqdm(range(steps)):
            if done:
                obs = self.env.reset()
            init_obs = obs
            act = self.env.action_space.sample()
            obs, rew, done, _ = self.env.step(act)
            # The last field is "not done": it later gates the bootstrap term.
            self.exp_buf.add([init_obs, act, rew, obs, not done])

    def _q_values(self, network, observations):
        """Evaluate `network` on a batch of observations with an all-ones mask."""
        feed = {network.X: observations,
                network.M: np.ones((len(observations), self.num_actions))}
        return network.forward.eval(feed, session=self.sess)

    def _greedy_action(self, obs):
        """Return the action with the highest online-network Q-value for `obs`."""
        return np.argmax(self._q_values(self.qnet, [obs])[0])

    def choose_action(self, obs):
        """Epsilon-greedy action: random with probability epsilon, else greedy."""
        if random.random() < self.epsilon:
            # Sample from the agent's own environment rather than from a
            # notebook-level global of the same name.
            return self.env.action_space.sample()
        return self._greedy_action(obs)

    def _train_step(self, use_double):
        """Run one optimizer update on a sampled minibatch and return the loss."""
        sample = self.exp_buf.sample(self.batch_size)
        next_obs = np.stack(sample[:,3])

        nextqs = self._q_values(self.target_network, next_obs)
        if use_double:
            # Double DQN: the online network picks the action, the target
            # network supplies its value.
            maxq_index = np.argmax(self._q_values(self.qnet, next_obs), axis=1)
            target_q = nextqs[np.arange(len(nextqs)), maxq_index]
        else:
            target_q = np.amax(nextqs, 1)

        # The sampled columns are object arrays; convert them to numeric ones
        # so the targets fed to TensorFlow have a real float dtype.
        rewards = sample[:,2].astype(np.float64)
        not_done = sample[:,4].astype(np.float64)
        ys = rewards + not_done * self.gamma * target_q

        action_mask = np.eye(self.num_actions)[sample[:,1].astype(int)]
        # Put each target in the column of the action that was taken; the
        # network output is masked the same way.
        ys = action_mask * ys[:, None]

        _, loss = self.sess.run([self.qnet_optimizer, self.qnet_loss],
                                feed_dict={self.qnet.X: np.stack(sample[:,0]),
                                           self.qnet.M: action_mask,
                                           self.qnet.Y: ys})
        return loss

    def train_epoch(self):
        """Train for `epoch_steps` updates, then optionally evaluate and save.

        Each update is preceded by up to `multiplier` environment steps. The
        target network is synced whenever `steps` is a multiple of
        `target_network_update_freq`. A checkpoint is written when the mean
        evaluation reward improves on the best one seen so far.
        """
        self.epoch += 1
        i = 0
        print("====Epoch:", self.epoch, "====")
        print('steps', self.steps)
        print('epsilon', self.epsilon)

        epoch_losss = []
        rew_list = []
        use_double = 'D' in self.algo

        with tqdm(total=self.epoch_steps) as pbar:

            while i < self.epoch_steps:

                obs = self.env.reset()
                done = False
                tot_rew = 0

                while (not done) and (i < self.epoch_steps):

                    for _repeat in range(self.multiplier):
                        init_obs = obs
                        act = self.choose_action(obs)
                        obs, rew, done, _info = self.env.step(act)

                        tot_rew += rew
                        self.exp_buf.add([init_obs, act, rew, obs, not done])

                        if done:
                            rew_list.append(tot_rew)
                            break

                    if not (self.steps)%self.target_network_update_freq:
                        self.target_network.copy_model_parameters(self.sess, self.qnet.scope)

                    epoch_losss.append(self._train_step(use_double))

                    self.steps += 1
                    i += 1
                    pbar.update(1)

        if rew_list:
            mean_acc = statistics.mean(rew_list)
            print("\nAvg rew:", mean_acc)
            if self.best_train_eval < mean_acc:
                self.best_train_eval = mean_acc
        else:
            # No episode terminated within this epoch, so there is nothing to average.
            print("\nAvg rew: n/a (no episode finished during this epoch)")
        if epoch_losss:
            print("losss:", statistics.mean(list(map(float, epoch_losss))))
        self.losss += epoch_losss
        if not self.epoch%self.eval_rate:
            results = self.run_evaluation()
            self.eval_rec[self.epoch] = results
            mean_acc = statistics.mean(results)
            if self.best_test_eval < mean_acc:
                self.best_test_eval = mean_acc
                info = 'acc:' + str(mean_acc)
                self.save_model(info)

    def _play_greedy_episode(self, env, max_steps):
        """Play one greedy episode on `env` and return its total reward.

        The episode starts with a random number (0-30) of action-0 steps, as
        in https://arxiv.org/pdf/1511.06581.pdf, and is cut off after
        `max_steps` steps.
        """
        tot_rew = 0
        obs = env.reset()
        done = False
        noopAct = random.randint(0,30)
        for i in range(max_steps):
            if done:
                break
            if i < noopAct:
                act = 0
            else:
                act = self._greedy_action(obs)
            obs, rew, done, _ = env.step(act)
            tot_rew += rew
        return tot_rew

    def run_evaluation(self, evaluation_runs=None):
        """Play greedy episodes and return the list of their total rewards.

        Args:
            evaluation_runs: number of episodes; defaults to the value given
                to the constructor.
        """
        if evaluation_runs is None:
            evaluation_runs = self.evaluation_runs

        rs = []
        for _run in tqdm(range(evaluation_runs)):
            rs.append(self._play_greedy_episode(self.env, 100000))
        print("test rewards:", rs)
        return rs

    def display_agent(self):
        """Record one greedy episode under ./gym-results and return the Monitor env."""
        uid = self.env.unwrapped.spec.id + '-' + 'Epoch' + str(self.epoch)
        env = gym.wrappers.Monitor(self.env, "./gym-results", force=True, uid=uid)
        try:
            self._play_greedy_episode(env, 50000)
        finally:
            # Close even if the rollout fails so the recorder is shut down.
            env.close()

        return env

    @property
    def epsilon(self):
        """Exploration rate, decayed linearly with `steps` and floored at `epsilon_end`."""
        eps = self.epsilon_start - (self.epsilon_start-self.epsilon_end) * self.steps / self.epsilon_decay_steps
        eps = max(self.epsilon_end, eps)
        return eps

In [None]:
# Experiments to run. The `algo` flags are: d = dueling, D = double.
# Other environment ids (e.g. 'CartPole-v1', 'BreakoutDeterministic-v4')
# can be added as further entries.
tests = [
    {'env': 'PongDeterministic-v4', 'algo': 'dDQ', 'epochs': 200},
]

for test in tests:
    env = gym.make(test['env'])

    # Let TensorFlow grow GPU memory on demand and log device placement.
    config = tf.ConfigProto(log_device_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # The wrapped environment's repr decides which hyper-parameters apply.
    atari = 'atari' in str(env.env).lower()
    if atari:
        print('atari')
        env = wrap_deepmind(env, frame_stack=True, episode_life=False)
        dqn_args = dict(
            env=env,
            sess=sess,
            multiplier=1,
            lr=1e-4,
            buffer_size=100000,
            epoch_steps=5e4,
            gamma=0.99,
            target_network_update_freq=8000,
            epsilon_decay_steps=6e5,
            start_buffer=20000,
            batch_size=32,
            atari=atari,
        )
    else:
        dqn_args = dict(
            env=env,
            sess=sess,
            lr=1e-3,
            buffer_size=50000,
            epoch_steps=1e4,
            gamma=0.95,
            target_network_update_freq=1000,
            epsilon_decay_steps=5e4,
            atari=atari,
        )
    dqn_args['algo'] = test['algo']

    epochs = test['epochs']

    dqn = DQN(**dqn_args)

    for e in range(epochs):
        print(e)
        dqn.train_epoch()



In [None]:
# Best mean episode reward observed during any training epoch so far.
print(dqn.best_train_eval)

# Play greedy evaluation episodes with the current network; the list of
# per-episode rewards is shown as the cell output.
dqn.run_evaluation()

In [None]:
# Record one greedy episode and embed the resulting video in the notebook.
# NOTE: this rebinds the global `env` to the Monitor wrapper returned by
# display_agent(), which has already been closed at that point.
env = dqn.display_agent()
import io
import base64
from IPython.display import HTML

video_path = './gym-results/openaigym.video.%s.video000000.mp4' % env.file_infix
# Read-only binary mode inside a context manager: the file is only read, and
# the handle is closed as soon as its bytes are in memory.
with io.open(video_path, 'rb') as video_file:
    video = video_file.read()
encoded = base64.b64encode(video)
HTML(data='''
            <video width="360" height="auto" alt="test" controls><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''
        .format(encoded.decode('ascii')))

In [None]:
#sess.close()