In [1]:
import numpy as np
import tensorflow as tf
import pickle
import glob
from misc import *

In [2]:
# Stride handed to subsample() as `s` in the training cell (every 8th position is kept).
subrate = 8

In [3]:
class localNetwork():
    """Feed-forward network built with the TensorFlow 1.x graph API.

    Building an instance adds placeholders, variables and (optionally) a loss
    and an Adam training op to the current default graph.

    Attributes set by the constructor:
        hp           -- HP() instance (HP is not defined in this file; presumably
                        it comes from ``from misc import *`` -- TODO confirm).
                        Only ``hp.lr`` is written/read here.
        observations -- input placeholder of shape [None, n_features].
        estimator    -- output tensor of the final (linear) layer, n_actions wide.
        q_target     -- placeholder of shape [None, n_actions] used as labels.
        theta        -- dict: layer id -> {'w': tf.Variable, 'b': tf.Variable}.
        loss, train_op -- only created when ``trainable`` is True.
        sess         -- initialised to None; nothing in this class assigns it.
    """

    def __init__(self, n_features, n_actions, lr=None, trainable = False):
        """Build the graph for a network with n_features inputs and n_actions outputs.

        lr is stored on self.hp.lr and handed to AdamOptimizer when
        trainable is True; otherwise no loss / train op is created.
        """
        self.hp = HP()
        self.hp.lr = lr
        self.next_layer_id = 0
        self.n_features = n_features
        self.n_actions = n_actions
        self.theta = {}
        self.estimator = self.vanilla_network()
        self.q_target = tf.placeholder(tf.float32, [None, n_actions])
        if trainable:
            # The cross-entropy is not reduced here; the notebook's callers
            # average the fetched value with np.mean.
            self.loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.q_target, logits=self.estimator)
            self.train_op = tf.train.AdamOptimizer(self.hp.lr).minimize(self.loss)
        self.sess = None

    def get_layer_id(self):
        """Return the next unused layer id and advance the internal counter."""
        this_layer_id = self.next_layer_id
        self.next_layer_id +=1
        return this_layer_id

    def vanilla_network(self, layer_size = [None]+[81]*1+[ None]):
        """Build input placeholder -> hidden dense layers (+dropout) -> linear output.

        layer_size lists the layer widths; its first and last entries are
        ignored and replaced by self.n_features / self.n_actions. The list
        passed in (including the shared default) is no longer modified: the
        widths are filled in on a private copy.

        Returns the output tensor of the last layer.
        """
        # Copy first so neither the mutable default nor a caller's list is mutated.
        layer_size = list(layer_size)
        layer_size[0] = self.n_features
        layer_size[-1] = self.n_actions
        next_l = self.input_layer() #todo currently the  number of features in the input layer is defined elsewhere
        self.observations = next_l
        for hidden_size in layer_size[1:-1]:
            next_l = self.dense_ff_layer(next_l, hidden_size)
            # Dropout is wired into the graph unconditionally, so it is also
            # active whenever the estimator is evaluated. 0.98 is passed
            # positionally (keep_prob in the TF1 signature -- confirm for the
            # installed TensorFlow version).
            next_l = tf.nn.dropout(next_l, 0.98)
        # Final layer: identity non-linearity and a very small weight init scale.
        next_l = self.dense_ff_layer(next_l, layer_size[-1], nl= lambda x: x,g=1e-10)
        return next_l


    def dense_ff_layer(self, previous_layer, output_size, nl=tf.nn.relu, theta = None,g=1.0):
        """Append a dense layer nl(previous_layer @ w + b) and register its parameters.

        previous_layer -- input tensor; its last static dimension is the fan-in.
        output_size    -- number of units.
        nl             -- callable applied to the affine output (default ReLU).
        theta          -- must be None; explicit parameters are not supported.
        g              -- multiplier on the weight init standard deviation.

        The new {'w', 'b'} variables are stored in self.theta under a fresh
        layer id. Returns the layer's output tensor.
        """
        if theta is None:
            fan_in = np.shape(previous_layer)[-1].value
            this_theta = {}
            # Weights ~ N(0, (g * 2 / sqrt(fan_in))^2); biases ~ N(0, 0.01^2).
            this_theta['w'] = tf.Variable(
                tf.random_normal(shape=[fan_in, output_size],
                                 mean=0.0,
                                 stddev=g*2.0 / np.sqrt(fan_in)))
            this_theta['b'] = tf.Variable(
                tf.random_normal(shape=[1, output_size],
                                 mean=0.0,
                                 stddev=0.01))
        else:
            # NOTE(review): `error` is not defined in this file; presumably it is
            # provided by `from misc import *` and raises -- TODO confirm. If it
            # returns, `this_theta` is unbound on the next statement.
            error('explicit theta is still unsupported')
        self.theta[self.get_layer_id()] = this_theta
        ff_layer = nl(tf.matmul(previous_layer, this_theta['w']) + this_theta['b'])
        return ff_layer

    def input_layer(self):
        """Return a float32 placeholder of shape [None, self.n_features]."""
        return tf.placeholder(tf.float32, [None, self.n_features])

    def assign_param_prep(self,source_nwk): #todo support more elaborated structures than double dictionary
        """Build (but do not run) assign ops copying source_nwk's parameters here.

        For every layer id / parameter name in source_nwk.theta an op assigning
        that variable to self.theta[layer][name] is appended to
        self.assign_param_op. Run them with update(sess).
        """
        self.assign_param_op = []
        for ll in source_nwk.theta.keys():
            for this_param in source_nwk.theta[ll]:
                self.assign_param_op.append(tf.assign(self.theta[ll][this_param],
                                                 source_nwk.theta[ll][this_param]))

    def theta_values(self): #todo support more elaborated structures than double dictionary
        """Return the current parameter values as a nested dict mirroring self.theta.

        Each variable is evaluated with ``.eval(self.sess)``. self.sess is None
        unless set from outside, in which case a default session must be
        active (the notebook calls this inside ``sess.as_default()``).
        """
        t = {}
        for ll in self.theta.keys():
            t[ll] = {}
            for this_param in self.theta[ll]:
                t[ll][this_param] = self.theta[ll][this_param].eval(self.sess)
        return t

    def theta_update(self,t): #todo support more elaborated structures than double dictionary
        """Write the values in nested dict t (layer id -> name -> value) into self.theta.

        A new assign op is created and run for every parameter on every call,
        using self.sess as the session.
        """
        for ll in t.keys():
            for this_param in t[ll]:
                self.theta[ll][this_param].assign(t[ll][this_param]).op.run(session=self.sess)

    def update(self, sess):
        """Run the ops prepared by assign_param_prep() in the given session."""
        sess.run(self.assign_param_op)

In [4]:
    def training_step(nwk, observations, q_target):
        """Run nwk.train_op once on the given batch and also fetch nwk.loss.

        Uses the notebook-level session `sess`. Returns the list that
        sess.run produces for [train_op, loss].
        """
        fetches = [nwk.train_op, nwk.loss]
        feed = {nwk.observations: observations,
                nwk.q_target: q_target}
        return sess.run(fetches, feed_dict=feed)

    def calc_loss(nwk, observations, q_target):
        """Fetch nwk.loss for the given batch without running the training op.

        Uses the notebook-level session `sess`. The loss is requested as a
        one-element fetch list, so the result is a one-element list.
        """
        feed = {nwk.observations: observations,
                nwk.q_target: q_target}
        return sess.run([nwk.loss], feed_dict=feed)
    
    def a_eval(nwk,observations):
        """Evaluate nwk.estimator on `observations` using the notebook-level `sess`."""
        feed = {nwk.observations: observations}
        return sess.run(nwk.estimator, feed_dict=feed)

In [5]:
def one_hot(a,depth):
    """Return a (len(a), depth) float array with a 1 in column a[i] of row i, 0 elsewhere."""
    n_rows = len(a)
    encoded = np.zeros((n_rows, depth))
    row_idx = np.arange(n_rows)
    # Paired fancy indexing: one (row, column) cell is set per label.
    encoded[row_idx, a] = 1
    return encoded

def subsample(data,s):
    """Keep positions 0, s, 2s, ... of every entry in `data`.

    The positions are derived from len(data[0]) and applied to each entry by
    indexing it with the list of positions, so entries must support
    list-based indexing (e.g. numpy arrays).
    """
    positions = list(range(0, len(data[0]), s))
    picked = []
    for entry in data:
        picked.append(entry[positions])
    return picked

In [6]:
# Trainable network: 128 input features, 3 output actions, learning rate 0.0025.
network = localNetwork( 128, 3, lr=0.0025, trainable = True)
# Alternative kept for reference: same network with 256*4 input features.
# network = localNetwork( 256*4, 3, lr=0.0025, trainable = True)

In [7]:
# Notebook-level session; training_step / calc_loss / a_eval read this global.
sess = tf.Session()
# Initialise every variable created so far (i.e. the weights of `network`).
sess.run(tf.global_variables_initializer())

In [8]:
def read_observation_action_from_path(path = None, filenames = None, max_file=1e7):
    """Concatenate pickled (observations, actions) pairs from several files.

    path      -- glob pattern; only used when `filenames` is None, in which
                 case the matching files are read in sorted order.
    filenames -- explicit list of files to read, in the given order.
    max_file  -- read at most this many files.

    Each file must unpickle to a 2-item (observations, actions) pair; the
    items of every pair are appended to two running lists.

    Returns (observation_list, a_list).
    """
    if filenames is None:
        filenames = sorted(glob.glob(path))
    all_observations = []
    all_actions = []
    for file_idx, file_name in enumerate(filenames):
        if not (file_idx < max_file):
            break
        with open(file_name, 'rb') as handle:
            # NOTE: unpickling can execute arbitrary code -- only read trusted dumps.
            file_observations, file_actions = pickle.load(handle)
        all_observations.extend(file_observations)
        all_actions.extend(file_actions)
    return all_observations, all_actions

In [9]:
# Load every dump matching the 'YY' pattern; this is the data the training cell uses.
observation_list,a_list = read_observation_action_from_path('offline_learning_data/YYobservation_action_dump_*')

In [10]:
# Second data set ('XX' dumps); in this notebook it is only referenced from a commented-out line.
Xobservation_list,Xa_list = read_observation_action_from_path('offline_learning_data/XXobservation_action_dump_*')

In [11]:
import random

In [12]:
# Number of (observation, action) pairs drawn by random.sample for each batch.
batch_size = 512

In [13]:
# Train on everything except the last `n_holdout` samples and report loss and
# accuracy on those held-out samples every `report_every` steps.
sub_observ=subsample(observation_list,subrate)
n_holdout = 5000
n_classes = 3
n_steps = 10000
report_every = 100

# Built once: the sampling population and the held-out split do not change
# between steps, so there is no need to re-zip the whole data set each step.
train_pairs = list(zip(sub_observ[:-n_holdout],a_list[:-n_holdout]))
e_batch = [sub_observ[-n_holdout:],a_list[-n_holdout:]]
e_targets = one_hot(e_batch[1],n_classes)

for step in range(n_steps):
    # Draw batch_size distinct pairs, then transpose into (observations, actions).
    this_batch = random.sample(train_pairs,batch_size)
    this_batch = list(zip(*this_batch))
    _,loss= training_step(network,this_batch[0],one_hot(this_batch[1],n_classes))
    if not step%report_every:
        print('train',step, np.mean(loss))
        # Predicted action = index of the largest network output.
        e_actions= np.argmax(a_eval(network,e_batch[0]),axis=1)
        e_loss = calc_loss(network,e_batch[0],e_targets)
        print('eval:',np.mean(e_actions==e_batch[1]),np.mean(e_loss))

train 0 1.0967304
eval: 0.6908 1.0654695
train 100 0.5342269
eval: 0.8132 0.45803475
train 200 0.43722582
eval: 0.8158 0.4261295
train 300 0.3868209
eval: 0.8184 0.40573344
train 400 0.4323538
eval: 0.846 0.38229063
train 500 0.38983053
eval: 0.8482 0.3724771
train 600 0.34090889
eval: 0.854 0.36231536
train 700 0.38302025
eval: 0.856 0.3540086
train 800 0.32950607
eval: 0.861 0.33960122
train 900 0.33009207
eval: 0.8662 0.3319243
train 1000 0.31940037
eval: 0.8676 0.32630983
train 1100 0.3411628
eval: 0.8676 0.32402325
train 1200 0.4319547
eval: 0.871 0.31904104
train 1300 0.3658003
eval: 0.8722 0.31500906
train 1400 0.3055213
eval: 0.876 0.31335786
train 1500 0.3676196
eval: 0.8708 0.31350103
train 1600 0.3299188
eval: 0.871 0.306704
train 1700 0.3098334
eval: 0.8744 0.31187183
train 1800 0.2705818
eval: 0.8744 0.30541897
train 1900 0.3090287
eval: 0.8742 0.305995
train 2000 0.43554428
eval: 0.8766 0.3040884
train 2100 0.29352897
eval: 0.8722 0.30498534
train 2200 0.2957105
eval: 0.8

In [14]:
# Dump the current weights (nested dict from theta_values) to disk. The session
# is made the default one because theta_values() evaluates the variables
# without being given an explicit session.
with open('nwk81xrelu_0p1_in128.nwk', 'wb') as f, sess.as_default():
    pickle.dump(network.theta_values(), f)

In [15]:
# NOTE(review): leftover cell -- localNetwork defines no attribute `sa`, so this
# raises AttributeError (see the recorded output) and stops a Run-All at this point.
network.sa

AttributeError: 'localNetwork' object has no attribute 'sa'

In [None]:
# Quick manual check of one_hot: six labels encoded with depth 5.
one_hot([0,1,2,3,2,1],5)

In [None]:
# Number of action labels loaded from the 'YY' dumps.
len(a_list)

In [None]:
# Evaluate the trained network on 100 random batches and print, per batch,
# the accuracy and the mean loss.
# NOTE(review): this samples from observation_list, whereas training fed the
# network sub_observ (the subsampled observations) -- confirm the feature
# count matches the network input before running.
eval_pairs = list(zip(observation_list,a_list))  # built once, reused for every draw
for step in range(100):
    this_batch = random.sample(eval_pairs,batch_size)
    this_batch = list(zip(*this_batch))
    # Predicted action = index of the largest network output.
    these_actions= np.argmax(a_eval(network,this_batch[0]),axis=1)
    loss = calc_loss(network,this_batch[0],one_hot(this_batch[1],3))
    print(step, np.mean(these_actions==this_batch[1]),np.mean(loss))

In [None]:
# Predicted action indices (argmax of the network output) for the last evaluated batch.
these_actions

In [None]:
# Recorded actions of the last sampled batch, to compare against the predictions.
this_batch[1]

In [None]:
# Shape of the loaded observations before subsampling.
np.shape(observation_list)

In [None]:
# Shape of the third held-out observation (e_batch is set in the training cell).
np.shape(e_batch[0][2])

In [None]:
# Shape of the observations after subsample(observation_list, subrate).
np.shape(sub_observ)