In [1]:
import numpy as np
import tensorflow as tf
import pickle
import glob
from misc import *

In [2]:
class localNetwork():
    """Fully connected feed-forward network built with the TensorFlow 1.x graph API.

    Constructing an instance adds to the current default graph:

    * ``self.observations`` - float32 input placeholder, ``[None, n_features]``
    * ``self.estimator``    - output of the dense stack, ``n_actions`` linear units
    * ``self.q_target``     - float32 target placeholder, ``[None, n_actions]``
    * ``self.loss`` / ``self.train_op`` - only when ``trainable`` is true

    Layer parameters are kept in ``self.theta`` as
    ``{layer_id: {'w': tf.Variable, 'b': tf.Variable}}``.

    ``self.sess`` is initialised to ``None`` and never assigned inside this
    class. ``theta_values`` and ``theta_update`` hand it straight to
    TensorFlow, so unless the caller sets it they have to be invoked while a
    default session is active (e.g. inside ``with sess.as_default():``).
    """

    def __init__(self, n_features, n_actions, lr=None, trainable = False):
        """Build the graph for the network.

        Args:
            n_features: width of the input placeholder.
            n_actions: width of the output layer and of the target placeholder.
            lr: learning rate stored on ``self.hp.lr`` and passed to the Adam
                optimizer when ``trainable`` is true.
            trainable: when true, also create ``self.loss`` and ``self.train_op``.
        """
        # HP is not defined in this file; presumably a hyper-parameter
        # container provided by `from misc import *` - confirm.
        self.hp = HP()
        self.hp.lr = lr
        self.next_layer_id = 0
        self.n_features = n_features
        self.n_actions = n_actions
        self.theta = {}
        self.estimator = self.vanilla_network()
        self.q_target = tf.placeholder(tf.float32, [None, n_actions])
        if trainable:
            # The cross-entropy op yields one loss value per example; it is
            # handed to the optimizer without an explicit reduction.
            # Alternatives the original left commented out: a mean
            # absolute-difference loss, GradientDescent and RMSProp optimizers.
            self.loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.q_target, logits=self.estimator)
            self.train_op = tf.train.AdamOptimizer(self.hp.lr).minimize(self.loss)
        self.sess = None

    def get_layer_id(self):
        """Return the next unused layer id and advance the counter."""
        this_layer_id = self.next_layer_id
        self.next_layer_id +=1
        return this_layer_id

    def vanilla_network(self, layer_size = [None]+[200]*2+[ None]):
        """Build the input placeholder plus the dense stack and return the output tensor.

        Args:
            layer_size: sequence of layer widths. The first and last entries
                are placeholders that get replaced by ``n_features`` and
                ``n_actions``; the entries in between are the hidden widths.

        Returns:
            The tensor produced by the final (linear) layer.
        """
        # Work on a private copy so that neither the shared default list nor a
        # list supplied by the caller is modified by the two writes below.
        layer_size = list(layer_size)
        layer_size[0] = self.n_features
        layer_size[-1] = self.n_actions
        next_l = self.input_layer() #todo currently the  number of features in the input layer is defined elsewhere
        self.observations = next_l
        for ll_size in layer_size[1:-1]:
            next_l = self.dense_ff_layer(next_l, ll_size)
        # Output layer: identity non-linearity and a very small weight scale
        # (g=1e-10), so the initial outputs are dominated by the bias terms.
        next_l = self.dense_ff_layer(next_l, layer_size[-1], nl= lambda x: x,g=1e-10)
        return next_l


    def dense_ff_layer(self, previous_layer, output_size, nl=tf.nn.relu, theta = None,g=1.0):
        """Append a dense layer ``nl(previous_layer @ w + b)`` and register its parameters.

        Args:
            previous_layer: input tensor; its last dimension is the fan-in.
            output_size: number of units in the new layer.
            nl: callable applied to the affine output (ReLU by default).
            theta: must be ``None``; explicit parameters are not supported.
            g: multiplier on the standard deviation of the weight initialiser.

        Returns:
            The output tensor of the new layer.

        Raises:
            NotImplementedError: if ``theta`` is given and the ``error`` helper
                did not already raise.
        """
        if theta is not None:
            error('explicit theta is still unsupported')
            # `error` is not defined in this file (presumably it comes from
            # `from misc import *`). Should it return instead of raising, stop
            # here rather than failing further down on an unbound `this_theta`.
            raise NotImplementedError('explicit theta is still unsupported')

        fan_in = np.shape(previous_layer)[-1].value
        this_theta = {}
        # Weights: zero-mean normal with stddev g * 2 / sqrt(fan_in).
        this_theta['w'] = tf.Variable(
            tf.random_normal(shape=[fan_in, output_size],
                             mean=0.0,
                             stddev=g*2.0 / np.sqrt(fan_in)))
        # Biases: one row of small zero-mean normal values.
        this_theta['b'] = tf.Variable(
            tf.random_normal(shape=[1, output_size],
                             mean=0.0,
                             stddev=0.01))
        self.theta[self.get_layer_id()] = this_theta
        ff_layer = nl(tf.matmul(previous_layer, this_theta['w']) + this_theta['b'])
        return ff_layer

    def input_layer(self):
        """Create the float32 input placeholder of shape ``[None, n_features]``."""
        return tf.placeholder(tf.float32, [None, self.n_features])

    def assign_param_prep(self,source_nwk): #todo support more elaborated structures than double dictionary
        """Build the ops that copy every parameter of ``source_nwk`` into this network.

        The ops are stored in ``self.assign_param_op``; run them with ``update``.
        Both networks need the same layer ids and parameter names.
        """
        self.assign_param_op = []
        for ll in source_nwk.theta.keys():
            for this_param in source_nwk.theta[ll]:
                self.assign_param_op.append(tf.assign(self.theta[ll][this_param],
                                                 source_nwk.theta[ll][this_param]))

    def theta_values(self): #todo support more elaborated structures than double dictionary
        """Evaluate all parameters and return them as ``{layer_id: {name: value}}``."""
        t = {}
        for ll in self.theta.keys():
            t[ll] = {}
            for this_param in self.theta[ll]:
                t[ll][this_param] = self.theta[ll][this_param].eval(self.sess)
        return t

    def theta_update(self,t): #todo support more elaborated structures than double dictionary
        """Overwrite parameters with the values in ``t`` (``{layer_id: {name: value}}``).

        NOTE: every call creates a fresh assign op per parameter, so calling
        this repeatedly keeps adding nodes to the graph.
        """
        for ll in t.keys():
            for this_param in t[ll]:
                self.theta[ll][this_param].assign(t[ll][this_param]).op.run(session=self.sess)

    def update(self, sess):
        """Run the copy ops prepared by ``assign_param_prep`` in ``sess``."""
        sess.run(self.assign_param_op)

In [3]:
    def training_step(nwk, observations, q_target):
        """Run one optimizer step on ``nwk`` and return ``[train_op result, loss]``.

        Uses the module-level ``sess``; ``nwk`` must expose ``train_op``,
        ``loss``, ``observations`` and ``q_target``.
        """
        feed = {nwk.observations: observations,
                nwk.q_target: q_target}
        fetches = [nwk.train_op, nwk.loss]
        return sess.run(fetches, feed_dict=feed)

    def calc_loss(nwk, observations, q_target):
        """Evaluate ``nwk.loss`` for the given batch without training.

        Uses the module-level ``sess``. The loss is fetched inside a
        one-element list, so the return value is a one-element list as well.
        """
        feed = {nwk.observations: observations,
                nwk.q_target: q_target}
        fetches = [nwk.loss]
        return sess.run(fetches, feed_dict=feed)
    
    def a_eval(nwk,observations):
        """Return the network outputs (``nwk.estimator``) for ``observations``.

        Uses the module-level ``sess``.
        """
        feed = {nwk.observations: observations}
        return sess.run(nwk.estimator, feed_dict=feed)

In [4]:
def one_hot(a,depth):
    """One-hot encode a sequence of integer class indices.

    Args:
        a: sequence of integer indices, one per row of the result.
        depth: number of columns (classes) in the encoding.

    Returns:
        Float array of shape ``(len(a), depth)`` holding a 1 at
        ``[i, a[i]]`` and 0 everywhere else.
    """
    n_rows = len(a)
    encoded = np.zeros((n_rows, depth))
    row_idx = np.arange(n_rows)
    encoded[row_idx, a] = 1
    return encoded

In [5]:
# Trainable network: 256*4 input features, 3 output actions, Adam learning rate 0.0025.
network = localNetwork(
    n_features=256 * 4,
    n_actions=3,
    lr=0.0025,
    trainable=True,
)

In [6]:
# Create the session and initialise every variable that exists in the graph at
# this point. The helper functions training_step / calc_loss / a_eval read this
# module-level `sess`, so this cell must run before any of them is called.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [7]:
def read_observation_action_from_path(path = None, filenames = None, max_file=1e7):
    """Concatenate pickled ``(observations, actions)`` dumps into two flat lists.

    Args:
        path: glob pattern used to find the dump files; only consulted when
            ``filenames`` is None. Matches are read in sorted order.
        filenames: explicit list of files to read, in the given order.
        max_file: upper bound on the number of files that are read.

    Returns:
        ``(observation_list, a_list)`` with the contents of all files read,
        in file order.

    Note:
        Files are read with ``pickle.load``, which can execute arbitrary code;
        only point this at dumps from a trusted source.
    """
    if filenames is None:
        filenames = sorted(glob.glob(path))

    all_observations = []
    all_actions = []
    for file_idx, file_name in enumerate(filenames):
        # Stop as soon as the file budget is exhausted.
        if not file_idx < max_file:
            break
        with open(file_name, 'rb') as dump:
            file_observations, file_actions = pickle.load(dump)
        all_observations.extend(file_observations)
        all_actions.extend(file_actions)
    return all_observations, all_actions

In [8]:
import random

In [9]:
# Number of (observation, action) pairs sampled for each evaluation step.
batch_size = 512

In [10]:
# Load every dump file matching the glob pattern (resolved against the current
# working directory) into two flat lists: observations and the matching actions.
Xobservation_list,Xa_list = read_observation_action_from_path('XXobservation_action_dump_*')

In [11]:
# Restore previously saved network parameters.
# NOTE: pickle.load can execute arbitrary code - only load checkpoints you trust.
with open('nwk200x200xrelu_0p1.nwk', 'rb') as f:
    theta_list = pickle.load(f)

# network.sess is None, so theta_update runs its assign ops in the default
# session; make `sess` the default for the duration of the update.
with sess.as_default():
    network.theta_update(theta_list)

In [12]:
# Evaluate the restored network on 100 random mini-batches (no training happens
# here): for each batch print the step, the fraction of argmax predictions that
# match the recorded action, and the mean cross-entropy loss.

# Pair observations with actions once up front; the original rebuilt this full
# list on every iteration although it never changes inside the loop.
Xpairs = list(zip(Xobservation_list, Xa_list))
print_every = 1  # report every `print_every` steps (1 = every step)

for step in range(100):
    this_batch = random.sample(Xpairs, batch_size)
    # Transpose the sampled pairs into [observations, actions].
    this_batch = list(zip(*this_batch))
    these_actions = np.argmax(a_eval(network, this_batch[0]), axis=1)
    loss = calc_loss(network, this_batch[0], one_hot(this_batch[1], 3))
    if not step % print_every:
        print(step, np.mean(these_actions == this_batch[1]), np.mean(loss))

0 0.794921875 3.8290036
1 0.7578125 4.7180567
2 0.765625 6.643584
3 0.771484375 3.5995011
4 0.771484375 5.1158695
5 0.740234375 5.138471
6 0.76953125 7.1452675
7 0.75 4.7170377
8 0.791015625 3.1270542
9 0.765625 5.439272
10 0.775390625 5.6071243
11 0.783203125 4.9340715
12 0.75 7.4632463
13 0.767578125 3.67767
14 0.7578125 7.266601
15 0.76171875 4.2382774
16 0.748046875 5.8354063
17 0.74609375 7.99998
18 0.7578125 5.5590196
19 0.732421875 6.5605264
20 0.765625 7.5470247
21 0.767578125 3.3008876
22 0.76953125 5.626027
23 0.763671875 7.414331
24 0.765625 4.010283
25 0.75390625 4.648808
26 0.763671875 4.371417
27 0.7734375 5.1772175
28 0.75 7.2572346
29 0.779296875 6.2004766
30 0.767578125 4.05719
31 0.775390625 4.143829
32 0.73828125 8.332233
33 0.77734375 5.7977467
34 0.787109375 4.854104
35 0.755859375 4.6827345
36 0.783203125 4.6360598
37 0.78125 4.155876
38 0.7421875 6.3077455
39 0.7265625 3.9159138
40 0.7734375 3.9879005
41 0.759765625 6.380677
42 0.744140625 6.4409037
43 0.75390625

In [19]:

# Evaluate on the complete dataset in a single pass: e_batch[0] holds all
# observations, e_batch[1] the recorded actions.
e_batch = [Xobservation_list,Xa_list]
# Predicted action = index of the largest network output per observation.
e_actions= np.argmax(a_eval(network,e_batch[0]),axis=1)
# Loss against the one-hot encoded recorded actions (3 action classes).
e_loss = calc_loss(network,e_batch[0],one_hot(e_batch[1],3))
# Prints: fraction of predictions equal to the recorded action, then mean loss.
print(np.mean(e_actions==e_batch[1]),np.mean(e_loss))

0.7644071781208369 5.379124
