In [None]:
import warnings
import timeit
import json
import os

import numpy

from keras.callbacks import Callback


class MyCallback(Callback):
    """Episode logger that also checkpoints the weights of the best episode.

    For every finished episode the callback prints one summary line (duration,
    step count, reward and action statistics, averaged metrics) and, when the
    episode's total reward is strictly higher than the best total seen so far,
    saves the model weights to ``<output_path>/best_weight.hdf5``.

    Args:
        output_path: Directory that receives ``best_weight.hdf5``. It is
            created on the first checkpoint if it does not exist yet.
    """

    def __init__(self, output_path="."):
        # Let the base class set up whatever state it expects on its instances.
        super(MyCallback, self).__init__()
        # Some algorithms compute multiple episodes at once since they are multi-threaded.
        # We therefore use a dictionary that is indexed by the episode to separate episodes
        # from each other.
        self.episode_start = {}
        self.observations = {}
        self.rewards = {}
        self.actions = {}
        self.metrics = {}
        # Total number of steps seen across all episodes.
        self.step = 0
        # Best episode reward so far; -inf guarantees the first episode is checkpointed.
        self.lastreward = -numpy.inf
        self.output_path = output_path

    def on_train_begin(self, logs=None):
        """Record the start time and the model's metric names, then announce training."""
        self.train_start = timeit.default_timer()
        self.metrics_names = self.model.metrics_names
        print('Training for {} steps ...'.format(self.params['nb_steps']))

    def on_train_end(self, logs=None):
        """Print the wall-clock time elapsed since ``on_train_begin``."""
        duration = timeit.default_timer() - self.train_start
        print('done, took {:.3f} seconds'.format(duration))

    def on_episode_begin(self, episode, logs=None):
        """Start the timer and open empty per-episode buffers for ``episode``."""
        self.episode_start[episode] = timeit.default_timer()
        self.observations[episode] = []
        self.rewards[episode] = []
        self.actions[episode] = []
        self.metrics[episode] = []

    def on_episode_end(self, episode, logs=None):
        """Print the episode summary, checkpoint if it is the best, free buffers."""
        duration = timeit.default_timer() - self.episode_start[episode]
        episode_steps = len(self.observations[episode])
        rewards = self.rewards[episode]
        actions = self.actions[episode]
        episode_reward = numpy.sum(rewards)

        # Width of the step counter: one more than the number of digits of nb_steps.
        nb_step_digits = str(int(numpy.ceil(numpy.log10(self.params['nb_steps']))) + 1)
        template = (
            '{step: ' + nb_step_digits + 'd}/{nb_steps}: episode: {episode}, '
            'duration: {duration:.3f}s, episode steps: {episode_steps}, '
            'steps per second: {sps:.0f}, episode reward: {episode_reward:.3f}, '
            'mean reward: {reward_mean:.3f} [{reward_min:.3f}, {reward_max:.3f}], '
            'mean action: {action_mean:.3f} [{action_min:.3f}, {action_max:.3f}], '
            '{metrics}'
        )
        variables = {
            'step': self.step,
            'nb_steps': self.params['nb_steps'],
            'episode': episode + 1,
            'duration': duration,
            'episode_steps': episode_steps,
            # A zero duration (coarse timer) must not abort training with ZeroDivisionError.
            'sps': float(episode_steps) / duration if duration > 0 else float('inf'),
            'episode_reward': episode_reward,
            'reward_mean': numpy.mean(rewards),
            'reward_min': numpy.min(rewards),
            'reward_max': numpy.max(rewards),
            'action_mean': numpy.mean(actions),
            'action_min': numpy.min(actions),
            'action_max': numpy.max(actions),
            'metrics': self._format_metrics(episode),
        }
        print(template.format(**variables))

        # Save the weights if the episode reward is higher than the best one so far.
        self._save_if_best(episode_reward)

        # Free up resources.
        del self.episode_start[episode]
        del self.observations[episode]
        del self.rewards[episode]
        del self.actions[episode]
        del self.metrics[episode]

    def on_step_end(self, step, logs):
        """Append this step's observation, reward, action and metrics to its episode."""
        episode = logs['episode']
        self.observations[episode].append(logs['observation'])
        self.rewards[episode].append(logs['reward'])
        self.actions[episode].append(logs['action'])
        self.metrics[episode].append(logs['metrics'])
        self.step += 1

    def _format_metrics(self, episode):
        """Return ``"name: mean, ..."`` for the metrics collected during ``episode``.

        A metric whose NaN-ignoring mean triggers a warning is rendered as ``--``.
        """
        metrics = numpy.array(self.metrics[episode])
        averaged = []
        with warnings.catch_warnings():
            # Turn warnings into exceptions so a failed mean can be detected below.
            warnings.filterwarnings('error')
            for idx, name in enumerate(self.metrics_names):
                try:
                    value = numpy.nanmean(metrics[:, idx])
                except Warning:
                    value = None
                averaged.append((name, value))
        return ', '.join(
            '{}: {}'.format(name, '--') if value is None else '{}: {:f}'.format(name, value)
            for name, value in averaged
        )

    def _save_if_best(self, episode_reward):
        """Write ``best_weight.hdf5`` when ``episode_reward`` beats the best so far."""
        if episode_reward > self.lastreward:
            print("The reward is higher than the best one, saving checkpoint weights")
            # The target directory may not exist yet (e.g. a fresh "tmp" folder).
            if self.output_path and not os.path.isdir(self.output_path):
                os.makedirs(self.output_path)
            weights_file = os.path.join(self.output_path, "best_weight.hdf5")
            # overwrite=True replaces the previous checkpoint, so the old file is
            # not deleted beforehand and survives if this save fails.
            self.model.save_weights(weights_file, overwrite=True)
            # Only remember the new best once it is safely written.
            self.lastreward = episode_reward
        else:
            print("The reward is lower than the best one, checkpoint weights not updated")


In [None]:
import gym.spaces
import numpy
import pandas

def calc_profit(action, df, index):
    """Return the value of column ``"c"`` at ``index``, signed by ``action``.

    Action ``0`` keeps the sign, action ``1`` flips it, and any other action
    multiplies the value by zero.
    """
    direction = 1 if action == 0 else (-1 if action == 1 else 0)
    value = df["c"][index]
    return direction * value

def calc_observation(df, index, columns):
    """Collect ``df[col][index]`` for every ``col`` in ``columns`` into a numpy array."""
    row_values = []
    for column_name in columns:
        row_values.append(df[column_name][index])
    return numpy.array(row_values)

class Game(gym.core.Env):
    """Two-action environment that walks through the rows of a data frame.

    Each step rewards the agent with the signed value of column ``"c"`` at the
    current row (see ``calc_profit``), advances to the next row and returns the
    values of ``columns`` at that new row as the observation. The episode is
    done when the last row is reached.

    Args:
        df: Data frame holding the observation columns and a ``"c"`` column.
            Its index is reset so rows can be addressed by position 0..len-1.
        columns: Names of the columns that make up an observation.
    """
    metadata = {'render.modes': ['human', 'rgb_array']}

    def __init__(self, df, columns):
        self.df = df.reset_index(drop=True)
        self.columns = columns
        self.action_space = gym.spaces.Discrete(2)
        # One [0, 1] interval per observation column.
        low_bound = numpy.array([0]*len(columns))
        high_bound = numpy.array([1]*len(columns))
        self.observation_space = gym.spaces.Box(low=low_bound, high=high_bound)
        # Current row index and the reward accumulated in the running episode.
        self.time = 0
        self.profit = 0

    def step(self, action):
        """Apply ``action`` at the current row and move to the next one.

        Returns:
            ``(observation, reward, done, info)`` where ``observation`` belongs
            to the row *after* the one that produced ``reward`` and ``info`` is
            always an empty dict.
        """
        reward = calc_profit(action, self.df, self.time)
        self.time += 1
        self.profit += reward
        # The episode ends once the last row has become the current row.
        done = self.time == (len(self.df) - 1)
        if done:
            print("profit___{}".format(self.profit))
        info = {}
        observation = calc_observation(self.df, self.time, self.columns)
        return observation, reward, done, info

    def reset(self):
        """Rewind to the first row, clear the profit and return the first observation."""
        self.time = 0
        self.profit = 0
        return calc_observation(self.df, self.time, self.columns)

    def render(self, mode='human'):
        """No-op; ``mode`` is optional so a bare ``env.render()`` call works."""
        pass

    def close(self):
        """No-op; the environment holds no external resources."""
        pass

    def seed(self, seed=None):
        """No-op: the environment uses no randomness; ``seed`` is accepted and ignored."""
        pass


In [None]:
from keras.models import Sequential, Model
from keras.layers import (
    Dense,
    Activation,
    Flatten,
    Input,
    concatenate,
    Dropout
)

class Network(object):
    """Builds and holds the Keras model used by the agent."""

    def __init__(self):
        # Set by sample_model(); None until a model has been built.
        self.model = None

    def sample_model(self, obsrvation_shape, n_action=None):
        """Build a small fully connected network and store it on ``self.model``.

        The model flattens an input of shape ``(1,) + obsrvation_shape`` and
        applies Dense(32) -> ReLU -> Dropout(0.6) -> Dense(16) -> ReLU ->
        Dropout(0.6) -> Dense(n_action) with a linear activation.

        Args:
            obsrvation_shape: Shape tuple of a single observation.
            n_action: Number of output units. When omitted, the module-level
                name ``n_action`` is used, which is what this method relied on
                before the parameter existed.

        Returns:
            The newly built ``Sequential`` model.

        Raises:
            NameError: If ``n_action`` is neither passed nor defined at module level.
        """
        if n_action is None:
            # Backward compatibility with callers that set a global n_action
            # before calling sample_model(shape).
            try:
                n_action = globals()['n_action']
            except KeyError:
                raise NameError(
                    "name 'n_action' is not defined; pass n_action to sample_model()"
                )

        model = Sequential()
        # The leading 1 is an extra axis in front of the observation shape.
        # NOTE(review): presumably the agent memory's window_length=1 — confirm.
        model.add(Flatten(input_shape=(1,) + obsrvation_shape))
        model.add(Dense(32))
        model.add(Activation('relu'))
        model.add(Dropout(0.6))
        model.add(Dense(16))
        model.add(Activation('relu'))
        model.add(Dropout(0.6))
        model.add(Dense(n_action))
        model.add(Activation('linear'))
        self.model = model
        return model

    def from_json(self, file_path):
        """Placeholder: not implemented yet, currently does nothing and returns None."""
        pass

    def to_json(self, output_path):
        """Placeholder: not implemented yet, currently does nothing and returns None."""
        pass


In [None]:
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory
from keras.optimizers import Adam

def agent(model, n_action, memory_limit=50000, eps=0.1, nb_steps_warmup=100,
          target_model_update=1e-2, learning_rate=1e-3):
    """Create and compile a DQN agent around ``model``.

    The defaults reproduce the previously hard-coded configuration, so
    ``agent(model, n_action)`` behaves exactly as before.

    Args:
        model: Keras model passed to ``DQNAgent``.
        n_action: Number of actions, passed as ``nb_actions``.
        memory_limit: ``limit`` of the ``SequentialMemory`` replay buffer.
        eps: ``eps`` of the ``EpsGreedyQPolicy``.
        nb_steps_warmup: Forwarded to ``DQNAgent``.
        target_model_update: Forwarded to ``DQNAgent``.
        learning_rate: Learning rate of the Adam optimizer used to compile the agent.

    Returns:
        The compiled ``DQNAgent`` (compiled with the ``'mae'`` metric).
    """
    # window_length stays 1 (not configurable here).
    # NOTE(review): presumably must match the model's (1,) + shape input — confirm.
    memory = SequentialMemory(limit=memory_limit, window_length=1)
    policy = EpsGreedyQPolicy(eps=eps)
    dqn_agent = DQNAgent(model=model, nb_actions=n_action,
                         memory=memory, nb_steps_warmup=nb_steps_warmup,
                         target_model_update=target_model_update, policy=policy)
    dqn_agent.compile(Adam(lr=learning_rate), metrics=['mae'])
    return dqn_agent

In [None]:
import pandas
import numpy

# Synthetic data: two uniform [0, 1) features and a target c = a^2 - b^2.
# A private, seeded generator makes the frame identical on every
# "Restart & Run All" without touching numpy's global random state.
DATA_SEED = 42
N_ROWS = 1000
data_rng = numpy.random.RandomState(DATA_SEED)
df = pandas.DataFrame({"a": data_rng.rand(N_ROWS), "b": data_rng.rand(N_ROWS)})
df["c"] = df["a"] * df["a"] - df["b"] * df["b"]

In [None]:
# Preview the first ten rows of the generated data.
df.iloc[:10]

In [None]:
# Folder that receives best_weight.hdf5; create it up front so that neither the
# checkpoint save during training nor the later load_weights cell hits a
# missing directory.
if not os.path.isdir("tmp"):
    os.makedirs("tmp")
callback = MyCallback("tmp")
columns = ["a", "b"]
env = Game(df, columns)
# Take the action count from the environment so the network output size and the
# agent's nb_actions cannot drift apart. It must be set before sample_model().
n_action = int(env.action_space.n)
network = Network()
model = network.sample_model(env.observation_space.shape)
agent_v6 = agent(model, n_action)

In [None]:
# Train on the environment; the callback logs every episode and checkpoints
# the best-scoring weights.
training_options = {
    "nb_steps": 5000,
    "visualize": False,
    "verbose": 2,
    "callbacks": [callback],
}
agent_v6.fit(env, **training_options)

In [None]:
# Restore the checkpoint written by the callback during training.
best_weights_file = "tmp/best_weight.hdf5"
agent_v6.load_weights(best_weights_file)

In [None]:
# Ask the agent for an action on a hand-written observation (values for "a" and "b").
sample_observation = [0.5, 0.4]
agent_v6.forward(sample_observation)

In [None]:
# Feed back a reward of 100 for the previous action, marking the step as non-terminal.
feedback_reward, feedback_terminal = 100, False
agent_v6.backward(feedback_reward, feedback_terminal)

In [None]:
# Show the documentation of DQNAgent.backward using plain Python instead of the
# IPython-only `?` suffix, so the cell also runs outside IPython.
help(agent_v6.backward)