In [None]:
import tensorflow as tf
import numpy as np
import random
from collections import deque
import math

import gym
from tensorflow.python.framework import ops


def conv(inputs, kernel_shape, bias_shape, strides, weights, bias=None, activation=tf.nn.relu):
    """Apply a 2-D convolution with 'SAME' padding, optional bias and activation.

    The variables are created with ``tf.get_variable`` under the fixed names
    'weights' and 'biases', so each call has to happen inside its own
    ``tf.variable_scope``.

    Args:
        inputs: input tensor handed to ``tf.nn.conv2d``.
        kernel_shape: shape of the convolution kernel variable.
        bias_shape: shape of the bias variable, or None to skip the bias.
        strides: strides handed to ``tf.nn.conv2d``.
        weights: initializer for the kernel variable.
        bias: initializer for the bias variable.
        activation: callable applied to the result, or None for a linear output.

    Returns:
        The convolved tensor, with bias and activation applied when requested.
    """
    kernel = tf.get_variable('weights', shape=kernel_shape, initializer=weights)
    output = tf.nn.conv2d(inputs, kernel, strides=strides, padding='SAME')

    if bias_shape is not None:
        offset = tf.get_variable('biases', shape=bias_shape, initializer=bias)
        output = output + offset

    if activation is None:
        return output
    return activation(output)


def dense(inputs, units, bias_shape, weights, bias=None, activation=tf.nn.relu):
    """Apply a fully connected layer with optional bias and activation.

    Non-tensor inputs are converted to a float tensor, and inputs with more
    than two dimensions are flattened first. The variables are created with
    ``tf.get_variable`` under the fixed names 'weights' and 'biases', so each
    call has to happen inside its own ``tf.variable_scope``.

    Args:
        inputs: tensor (or value convertible to one) to transform.
        units: number of output units.
        bias_shape: shape of the bias variable (its first entry must equal
            ``units``), or None to skip the bias.
        weights: initializer for the weight matrix.
        bias: initializer for the bias variable.
        activation: callable applied to the result, or None for a linear output.

    Returns:
        The transformed tensor, with bias and activation applied when requested.
    """
    if not isinstance(inputs, ops.Tensor):
        inputs = ops.convert_to_tensor(inputs, dtype='float')
    if len(inputs.shape) > 2:
        inputs = tf.contrib.layers.flatten(inputs)

    # Second dimension of the (now at most 2-D) input is the feature count.
    in_features = inputs.shape[1]
    kernel = tf.get_variable('weights', shape=[in_features, units], initializer=weights)
    output = tf.matmul(inputs, kernel)

    if bias_shape is not None:
        assert bias_shape[0] == units
        offset = tf.get_variable('biases', shape=bias_shape, initializer=bias)
        output = output + offset

    if activation is None:
        return output
    return activation(output)

# Bounds of the fixed value support used by the agent (lowest / highest atom).
v_min = 0
v_max = 1000
# Number of support points (atoms) in the value distribution.
atoms = 51
gamma = 0.99  # discount factor passed to Categorical_DQN.train
# Number of stored transitions required before the training loop starts learning.
batch_size = 10
update_target_net = 50  # the target-network update ops are run every this many train() calls
# Probability with which select_action picks a random action.
epsilon = 0.5

# Replay memory: a bounded deque, so the oldest transitions are dropped once
# `buffer_length` entries are stored.
buffer_length = 20000
replay_buffer = deque(maxlen=buffer_length)

def sample_transitions(batch_size, buffer=None):
    """Draw up to `batch_size` distinct transitions uniformly at random.

    Args:
        batch_size: maximum number of transitions to return.
        buffer: indexable collection of transitions to sample from. Defaults
            to the module-level ``replay_buffer``.

    Returns:
        A list holding the sampled transitions (the stored objects themselves,
        not copies). It is shorter than `batch_size` when the buffer holds
        fewer entries and empty when the buffer is empty.
    """
    if buffer is None:
        buffer = replay_buffer

    picks = np.random.permutation(len(buffer))[:batch_size]

    # Index the buffer directly instead of going through np.array(buffer):
    # a transition mixes arrays, scalars and lists, so NumPy would have to
    # build a ragged object array (an error on recent NumPy releases) and
    # would copy the entire buffer on every call.
    return [buffer[int(k)] for k in picks]



class Categorical_DQN():
    """Categorical DQN agent implemented with the TensorFlow 1.x graph API.

    For a (state, action) pair the networks output a softmax distribution over
    `atoms` fixed support points spaced evenly between `v_min` and `v_max`;
    the Q-value is the expectation of that distribution. A `main_net` is
    trained and a `target_net` supplies the bootstrap targets.

    The class reads the module-level settings ``v_min``, ``v_max``, ``atoms``,
    ``epsilon`` and ``update_target_net`` and uses the module-level ``conv``
    and ``dense`` layer helpers.
    """

    def __init__(self, env):
        """Create the session, placeholders and networks and initialise variables.

        Args:
            env: environment exposing ``observation_space.shape`` and
                ``action_space.n``.
        """
        self.sess = tf.InteractiveSession()
        self.v_max = v_max
        self.v_min = v_min
        self.atoms = atoms

        self.epsilon = epsilon
        self.state_shape = env.observation_space.shape
        self.action_shape = env.action_space.n

        # Counts calls to train(); drives the periodic target-network update.
        self.time_step = 0

        # All placeholders hold exactly one transition (leading dimension 1).
        target_state_shape = [1]
        target_state_shape.extend(self.state_shape)

        self.state_ph = tf.placeholder(tf.float32, target_state_shape)
        self.action_ph = tf.placeholder(tf.int32, [1, 1])
        # Projected target distribution fed in by train().
        self.m_ph = tf.placeholder(tf.float32, [self.atoms])

        # Evenly spaced support z_0 .. z_{atoms-1} between v_min and v_max.
        self.delta_z = (self.v_max - self.v_min) / (self.atoms - 1)
        self.z = [self.v_min + i * self.delta_z for i in range(self.atoms)]

        self.build_categorical_DQN()
        self.sess.run(tf.global_variables_initializer())

    def build_network(self, state, action, name, units_1, units_2, weights, bias, reg=None):
        """Build one distribution network: two conv layers and three dense layers.

        Args:
            state: state tensor; the first convolution uses a kernel with
                3 input channels.
            action: action tensor, cast to float and concatenated to the
                output of the second dense layer.
            name: currently unused.
            units_1: width of the first dense layer.
            units_2: width of the second dense layer.
            weights: initializer used for every weight variable.
            bias: initializer used for every bias variable.
            reg: currently unused.

        Returns:
            Softmax output of the last dense layer, one probability per atom.
        """
        with tf.variable_scope('conv1'):
            conv1 = conv(state, [5, 5, 3, 6], [6], [1, 2, 2, 1], weights, bias)
        with tf.variable_scope('conv2'):
            conv2 = conv(conv1, [3, 3, 6, 12], [12], [1, 2, 2, 1], weights, bias)
        with tf.variable_scope('flatten'):
            flatten = tf.contrib.layers.flatten(conv2)

        with tf.variable_scope('dense1'):
            dense1 = dense(flatten, units_1, [units_1], weights, bias)
        with tf.variable_scope('dense2'):
            dense2 = dense(dense1, units_2, [units_2], weights, bias)
        with tf.variable_scope('concat'):
            concatenated = tf.concat([dense2, tf.cast(action, tf.float32)], 1)
        with tf.variable_scope('dense3'):
            dense3 = dense(concatenated, self.atoms, [self.atoms], weights, bias)
        return tf.nn.softmax(dense3)

    def build_categorical_DQN(self):
        """Create both networks, the Q-value ops, the loss and the update ops."""
        with tf.variable_scope('target_net'):
            name = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]

            weights = tf.random_uniform_initializer(-0.1, 0.1)
            bias = tf.constant_initializer(0.1)

            self.target_p = self.build_network(self.state_ph, self.action_ph, name, 24, 24, weights, bias)

        with tf.variable_scope('main_net'):
            name = ['main_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            weights = tf.random_uniform_initializer(-0.1, 0.1)
            bias = tf.constant_initializer(0.1)

            self.main_p = self.build_network(self.state_ph, self.action_ph, name, 24, 24, weights, bias)

        # Q-value = expectation of the predicted distribution over the support.
        self.main_Q = tf.reduce_sum(self.main_p * self.z)
        self.target_Q = tf.reduce_sum(self.target_p * self.z)

        # Look the parameters up by variable scope. Nothing ever adds variables
        # to the custom 'main_net_params' / 'target_net_params' collections, so
        # reading those collections yields empty lists and the target update
        # would silently do nothing. TRAINABLE_VARIABLES is used so optimizer
        # slot variables created under the same scope prefix are not picked up.
        main_net_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='main_net')
        target_net_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='target_net')

        # The small constant keeps log() finite when a softmax output
        # underflows to exactly 0 (0 * log(0) would otherwise give NaN).
        self.cross_entropy_loss = -tf.reduce_sum(self.m_ph * tf.log(self.main_p + 1e-8))
        # Only the main network is trained; the target network changes solely
        # through the assign ops below.
        self.optimizer = tf.train.AdamOptimizer(0.01).minimize(
            self.cross_entropy_loss, var_list=main_net_params)

        # Both networks create their variables in the same order, so zip pairs
        # each target variable with its main-network counterpart.
        self.update_target_net = [tf.assign(t, e) for t, e in zip(target_net_params, main_net_params)]

    def _project_distribution(self, p, r, gamma):
        """Project the shifted target distribution back onto the fixed support.

        Every atom z_j is mapped to clip(r + gamma * z_j, v_min, v_max) and its
        probability p[j] is split between the two neighbouring support points
        in proportion to proximity.

        Args:
            p: sequence of `atoms` probabilities predicted for the next state.
            r: reward of the transition.
            gamma: discount factor.

        Returns:
            NumPy array of length `atoms` with the projected probabilities.
        """
        m = np.zeros(self.atoms)
        last = self.atoms - 1
        for j in range(self.atoms):
            Tz = min(self.v_max, max(self.v_min, r + gamma * self.z[j]))
            # Clamp so float rounding can never push the index off the support.
            bj = min(max((Tz - self.v_min) / self.delta_z, 0), last)
            l, u = int(math.floor(bj)), int(math.ceil(bj))

            if l == u:
                # bj sits exactly on an atom: both interpolation weights would
                # be zero and the probability mass would be lost.
                m[l] += p[j]
            else:
                m[l] += p[j] * (u - bj)
                m[u] += p[j] * (bj - l)
        return m

    def train(self, s, r, action, s_, gamma):
        """Run one optimisation step on a single transition.

        Args:
            s: state the action was taken in.
            r: reward received.
            action: action taken, as a one-element list (fed to the [1, 1]
                action placeholder).
            s_: next state.
            gamma: discount factor.

        NOTE(review): episode termination is not passed in, so the target
        always bootstraps from `s_` - confirm this is intended.
        """
        self.time_step += 1

        # Greedy next action according to the target network's Q-values.
        list_q_ = [self.sess.run(self.target_Q, feed_dict={self.state_ph: [s_], self.action_ph: [[a]]})
                   for a in range(self.action_shape)]

        # Use NumPy here: tf.argmax(...).eval() would add a new op to the graph
        # on every training step, growing it without bound.
        a_ = int(np.argmax(list_q_))

        p = self.sess.run(self.target_p, feed_dict={self.state_ph: [s_], self.action_ph: [[a_]]})[0]
        m = self._project_distribution(p, r, gamma)

        self.sess.run(self.optimizer, feed_dict={self.state_ph: [s], self.action_ph: [action], self.m_ph: m})
        if self.time_step % update_target_net == 0:
            self.sess.run(self.update_target_net)

    def select_action(self, s):
        """Pick an action epsilon-greedily with respect to the main network.

        Args:
            s: current state.

        Returns:
            A uniformly random action index with probability `epsilon`,
            otherwise the index of the action with the highest main-network
            Q-value.
        """
        if random.random() <= self.epsilon:
            return random.randint(0, self.action_shape - 1)
        else:
            return np.argmax([self.sess.run(self.main_Q, feed_dict={self.state_ph: [s], self.action_ph: [[a]]})
                              for a in range(self.action_shape)])

num_episodes = 800
env = gym.make("Tennis-v0")
agent = Categorical_DQN(env)


# Play `num_episodes` episodes, learning online from the replay buffer.
for i in range(num_episodes):
    state = env.reset()
    done = False
    Return = 0

    while not done:
        env.render()

        action = agent.select_action(state)
        next_state, reward, done, info = env.step(action)
        Return += reward

        # The action is stored wrapped in a list so it can be fed as one row
        # of the agent's [1, 1] action placeholder.
        replay_buffer.append([state, reward, [action], next_state])

        # Start learning once enough transitions are stored; each environment
        # step then trains on two sampled transitions.
        if len(replay_buffer) >= batch_size:
            for s, r, a, s_next in sample_transitions(2):
                agent.train(s, r, a, s_next, gamma)

        state = next_state

    print("Episode:{}, Return: {}".format(i, Return))
    


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.flatten instead.


In [None]:
# Shut down the Gym environment created for training.
env.close()