In [196]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc
import tensorflow as tf
import numpy as np

from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts

In [197]:
class CardGameEnv(py_environment.PyEnvironment):
    """Single-player card game exposed as a TF-Agents Python environment.

    The observation is the running total of the cards drawn so far, as an
    int32 array of shape ``(1,)``. Action 0 draws another card worth 1 to 10;
    action 1 stops. The episode terminates when the player stops or when the
    total reaches 21 or more. The terminal reward is ``total - 21`` when the
    total is at most 21 and ``-21`` otherwise; every other step has reward 0.
    """

    def __init__(self, seed=None):
        """Set up the specs and the initial game state.

        Args:
            seed: Optional seed for a private ``np.random.RandomState`` used
                to draw cards. With the default ``None`` cards are drawn from
                the global ``np.random`` state, exactly as before.
        """
        # Initialise the PyEnvironment base class so that its own internal
        # state exists before the first reset()/step() call.
        super(CardGameEnv, self).__init__()
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(1,), dtype=np.int32, minimum=0, name='observation')
        # The global module and a RandomState both provide randint(low, high).
        self._rng = np.random if seed is None else np.random.RandomState(seed)
        self._state = 0
        self._episode_ended = False

    def action_spec(self):
        """Return the spec of the scalar int32 action (0 = draw, 1 = stop)."""
        return self._action_spec

    def observation_spec(self):
        """Return the spec of the shape-(1,) int32 observation (card total)."""
        return self._observation_spec

    def _reset(self):
        """Start a new episode with a total of 0 and return the first step."""
        self._state = 0
        self._episode_ended = False
        return ts.restart(np.array([self._state], dtype=np.int32))

    def _step(self, action):
        """Apply ``action`` and return the resulting time step.

        Args:
            action: 0 to draw another card, 1 to stop.

        Returns:
            A restart time step if the previous step ended the episode, a
            termination time step if this step ends it, otherwise a
            transition with reward 0.0 and discount 1.0.

        Raises:
            ValueError: If ``action`` is neither 0 nor 1.
        """
        if self._episode_ended:
            # The last action ended the episode. Ignore the current action
            # and start a new episode.
            return self.reset()

        if action == 1:
            # The player stops; the episode ends with the current total.
            self._episode_ended = True
        elif action == 0:
            # Draw a card worth 1..10 (randint's upper bound is exclusive).
            self._state += self._rng.randint(1, 11)
        else:
            raise ValueError('`action` should be 0 or 1.')

        observation = np.array([self._state], dtype=np.int32)
        if self._episode_ended or self._state >= 21:
            # Reaching 21 or more by drawing also ends the episode. Record it
            # so that the next step starts a fresh episode instead of drawing
            # further cards on top of a finished game.
            self._episode_ended = True
            reward = self._state - 21 if self._state <= 21 else -21
            return ts.termination(observation, reward)
        return ts.transition(observation, reward=0.0, discount=1.0)

In [198]:
# Instantiate the card game and run TF-Agents' environment validator on it
# for 5 episodes.
environment = CardGameEnv()
utils.validate_py_environment(environment, episodes=5)

## 2. MPG Environment

In [199]:
from mpg.games import mpg


In [364]:
# Import the project modules and force-reload them so that edits made on disk
# are picked up without restarting the kernel.
from mpg.games import strategy,mpg
from mpg.rl import model_free,environment as rl_env
import importlib
importlib.reload(strategy)
importlib.reload(mpg)
importlib.reload(model_free)
importlib.reload(rl_env)
# Load the game graph from a path relative to the notebook's working directory.
G=mpg.mpg_from_file("data/test01.in",ignore_header=1)
# NOTE(review): a bare expression that is not the last statement of a cell is
# not displayed, so the next line has no visible effect.
G
# Build the MPG environment and validate it for 5 episodes. This rebinds
# `environment`, which previously held the CardGameEnv instance.
# NOTE(review): the meaning of the positional arguments 0, 0, 10 is not visible
# here — confirm against MPGEnvironment's signature.
environment = rl_env.MPGEnvironment(G,0,0,10,bad_action_penalty=-40)
utils.validate_py_environment(environment, episodes=5)

In [365]:
# Display the reward spec reported by the current `environment`.
environment.reward_spec()

ArraySpec(shape=(), dtype=dtype('float32'), name='reward')

In [366]:
# Rebind `environment` to a new MPGEnvironment on the same graph with
# max_turns=100 and bad_action_penalty=-100.
# NOTE(review): the meaning of the positional arguments 1 and 0 is not visible
# here — confirm against MPGEnvironment's signature.
environment = rl_env.MPGEnvironment(G,1,0,max_turns=100,bad_action_penalty=-100)

In [367]:
# Wrap the environment together with a GreedyStrategy built on the
# environment's graph for turn=player1.
# NOTE(review): presumably the strategy plays that player's moves so the
# learner only controls the other side — confirm in FixedStrategyMPGEnvironment.
fixed_env=rl_env.FixedStrategyMPGEnvironment(environment,strategy.GreedyStrategy(environment.graph,turn=mpg.MeanPayoffGraph.player1))
fixed_env.reset()
# NOTE(review): `agent` is rebound to a DqnAgent in a later cell, so this
# RLearningAgent is only reachable until that cell runs.
agent=model_free.RLearningAgent(fixed_env)
# Wrap fixed_env again; the later cells feed this wrapper to the DQN agent and
# the driver.
fo_env=rl_env.FullyObservableMPGEnvironment(fixed_env)

1

In [204]:
# Overwrite the environment's private `_vertex` attribute with a 0-d array
# holding 0.
# NOTE(review): this bypasses the public API and leaves state that a fresh
# top-to-bottom run depends on; presumably a debugging aid — confirm or remove.
environment._vertex=np.array(0)

In [205]:
# Scratch cell: evaluates a dict literal; the result is displayed but not stored.
{1:2,5:5,3:5,4:5}

{1: 2, 5: 5, 3: 5, 4: 5}

In [206]:
# Scratch cell: a dict mapping the keys 0 and 1 to length-5 zero vectors.
A={0:np.zeros(5),1:np.zeros(5)}

In [207]:
# Scratch cell: read element 1 of the vector stored under key 1.
A[1][1]

0.0

In [208]:
# Build the visualisation object for the loaded graph and display it as the
# cell output.
from mpg.visualisation import game as vgame
VG=vgame.MPGVisualisation(G)
VG

MPGVisualisation(layout=Layout(height='500px', width='100%'))

In [423]:
# Import the remaining RL helper modules and force-reload each of them so
# that on-disk edits are picked up without restarting the kernel.
# NOTE(review): these imports are scattered mid-notebook; objects created
# before a reload keep referring to the pre-reload classes.
importlib.reload(rl_env)
import mpg.rl.driver as rl_driver
importlib.reload(rl_driver)
import tf_agents as tfa
import mpg.rl.replay_buffers as rl_replay
importlib.reload(rl_replay)
import mpg.rl.agents as rl_agents
importlib.reload(rl_agents)
import mpg.rl.architectures.example as rl_arch
# As the last expression of the cell, this reload's return value (the module)
# is what gets displayed.
importlib.reload(rl_arch)


<module 'mpg.rl.architectures.example' from '/home/ramizouari/Academic/AI/MeanPayOffGames/notebooks/mpg/rl/architectures/example.py'>

In [424]:
# Build a matrix extractor (matrix="both", graph_size=8) and display the spec
# it reports for fixed_env.
# NOTE(review): rl_env is reloaded here after fixed_env was created, so
# fixed_env is an instance of the pre-reload classes — confirm that is intended.
importlib.reload(rl_env)
E=rl_env.MPGMatrixExtractor(matrix="both",graph_size=8)
E.get_env_specs(fixed_env)

ArraySpec(shape=(2, 8, 8), dtype=dtype('float32'), name=None)

In [425]:
# Apply the extractor to fixed_env and display the array it returns.
E(fixed_env)

array([[[ 0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.],
        [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.],
        [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
        [ 0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.],
        [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]],

       [[ 0.,  5.,  0.,  0.,  0.,  4.,  0.,  0.],
        [ 0.,  0., -7.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  5.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [-3.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0., -3.,  0.],
        [ 0.,  0.,  0.,  0.,  3.,  0.,  0.,  0.],
        [ 5.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]]])

In [426]:
# Adam optimizer with default hyper-parameters; passed to the DQN agent in the
# next cell.
optimizer=tf.keras.optimizers.Adam()

# Trajectory converter built from fixed_env and the matrix extractor E; only
# its data_spec["environment"] entry is read below.
converter=rl_env.MPGTrajectoryConverter(fixed_env,E)

# Stock TF-Agents Q-network with two fully connected layers (75 and 40 units)
# over the "environment" part of the converter's data spec.
# NOTE(review): `qnet` is not referenced by any later cell visible here; the
# agent is built from `net` instead — confirm whether this is dead code.
qnet=tfa.networks.q_network.QNetwork(
    input_tensor_spec=converter.data_spec["environment"],
    action_spec=fixed_env.action_spec(),
    preprocessing_layers=None,
    preprocessing_combiner=None,
    conv_layer_params=None,
    fc_layer_params=(75, 40),
    dropout_layer_params=None,
    activation_fn=tf.keras.activations.relu,
    kernel_initializer=None,
    batch_squash=True,
    dtype=tf.float32,
    q_layer_activation_fn=None,
    name='QNetwork'
)

# Project-specific network built from the fully observable environment's
# observation spec and its `count_vertices` attribute.
net=rl_arch.MPGNetworkExample(fo_env.observation_spec(),fo_env.count_vertices)


In [427]:
# DQN agent over the fully observable environment, using the project network
# `net`, epsilon-greedy exploration with epsilon 0.1 and 1-step updates.
# This rebinds `agent`, which an earlier cell bound to an RLearningAgent.
agent=tfa.agents.DqnAgent(
    time_step_spec=fo_env.time_step_spec(),
    action_spec= fo_env.action_spec(),
    q_network= net,
    optimizer= optimizer,
    observation_and_action_constraint_splitter= None,
    epsilon_greedy= 0.1,
    n_step_update = 1,
#    training_data_spec= converter.data_spec
)

In [373]:
# Display the trajectory spec the agent expects for training data.
agent.training_data_spec

_TupleWrapper(Trajectory(
{'action': BoundedTensorSpec(shape=(), dtype=tf.int32, name='action', minimum=array(0, dtype=int32), maximum=array(7, dtype=int32)),
 'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'next_step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type'),
 'observation': DictWrapper({'state': BoundedTensorSpec(shape=(), dtype=tf.int32, name='observation', minimum=array(0, dtype=int32), maximum=array(7, dtype=int32)), 'environment': TensorSpec(shape=(2, 8, 8), dtype=tf.float32, name='environment')}),
 'policy_info': (),
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')}))

In [374]:
# Pass the DQN agent and fixed_env through the project's agent wrapper.
# NOTE(review): `patched_agent` is only displayed afterwards; the driver below
# is built from `agent.collect_policy`, not from this object — confirm intent.
patched_agent=rl_agents.FullyObservableMPGAgentWrapper(agent,fixed_env)

In [375]:
# Display the wrapped agent (the recorded output shows a DqnAgent repr).
patched_agent

<tf_agents.agents.dqn.dqn_agent.DqnAgent at 0x7fc2244a8b80>

In [428]:
# Build the project driver over fo_env with the agent's collect policy and no
# observers.
# NOTE(review): the replay buffer creation is commented out here, yet later
# cells use `buffer`; those cells fail with NameError on a fresh kernel.
#buffer=rl_replay.MPGMatrixBuffer(converter.data_spec["environment"],10)
driver=rl_driver.MPGDriver(fo_env,agent.collect_policy,total_observers=[],partial_observers=[])

In [429]:
# Run the driver starting from a freshly reset environment.
# NOTE(review): the recorded output is an InvalidArgumentError from a
# Concatenate layer receiving shapes (2, 64) and (1, 1). It looks like the
# (2, 8, 8) "environment" observation reaches the network without a batch
# dimension, so its first axis is treated as the batch — confirm whether
# fo_env must be batched before being driven by a TF policy.
driver.run(fo_env.reset())

InvalidArgumentError: Exception encountered when calling layer 'concatenate_7' (type Concatenate).

{{function_node __wrapped__ConcatV2_N_2_device_/job:localhost/replica:0/task:0/device:CPU:0}} ConcatOp : Dimension 0 in both shapes must be equal: shape[0] = [2,64] vs. shape[1] = [1,1] [Op:ConcatV2] name: concat

Call arguments received by layer 'concatenate_7' (type Concatenate):
  • inputs=['tf.Tensor(shape=(2, 64), dtype=float32)', 'tf.Tensor(shape=(1, 1), dtype=float32)']

In [380]:
# Hand-built time steps and policy step for exercising observers/buffers.
# t0 wraps every field in a 0-d NumPy array.
t0=tfa.trajectories.time_step.TimeStep(step_type=np.array(tfa.trajectories.time_step.StepType.FIRST),observation=np.array(0),reward=np.array(0),discount=np.array(1))
s=tfa.trajectories.PolicyStep(action=np.array(0))
# t1 carries the same values but leaves step_type, reward and discount as
# plain Python/enum values.
# NOTE(review): t1 is also marked FIRST; if it is meant to be the successor of
# t0 a different step type is presumably wanted — confirm.
t1=tfa.trajectories.time_step.TimeStep(step_type=tfa.trajectories.time_step.StepType.FIRST,observation=np.array(0),reward=0,discount=1)
#driver.observers[0](tfa.trajectories.from_transition(t0,s,t1))

In [382]:
# Display the observation spec of the fully observable environment.
fo_env.observation_spec()

{'state': BoundedArraySpec(shape=(), dtype=dtype('int32'), name='observation', minimum=0, maximum=7),
 'environment': ArraySpec(shape=(2, 8, 8), dtype=dtype('float32'), name='environment')}

In [169]:
# Display the "state" entry of everything stored in the replay buffer.
# NOTE(review): `buffer` is not defined by any cell visible in this notebook
# (its creation is commented out), so this relies on leftover kernel state;
# the full array is also dumped — consider slicing it.
buffer.gather_all()["state"]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [272]:
# Try to add the hand-built time step t1 to the replay buffer.
# NOTE(review): the recorded output is "IndexError: tuple index out of range";
# presumably add_batch expects items with a leading batch dimension, which
# t1's 0-d fields lack — confirm. `buffer` itself is not defined by any
# visible cell.
buffer.add_batch(t1)

IndexError: tuple index out of range

In [113]:
# Debugging aid: ask TF-Agents for the outer (batch) shape of t0 relative to
# the buffer's data spec; the recorded result is the empty tuple.
tfa.utils.nest_utils.get_outer_array_shape(t0, buffer._data_spec)

()

In [122]:
# Display the buffer's private data spec (relies on `buffer` existing in the
# kernel; it is not created by any visible cell).
buffer._data_spec

{'state': ArraySpec(shape=(), dtype=dtype('int32'), name=None),
 'environment': ArraySpec(shape=(2, 8, 8), dtype=dtype('float32'), name=None)}

In [121]:
# Debugging aid: outer shape of a (2, 2) int32 array against a spec of shape
# [2]; the recorded result is (2,), i.e. one leading batch axis of size 2.
tfa.utils.nest_utils.get_outer_array_shape(np.array([[5,3],[5,2]],dtype=np.int32),spec=tfa.specs.ArraySpec(shape=[2],dtype=np.int32))

(2,)

In [123]:
# Display the hand-built time step t0.
# NOTE(review): the recorded output shows length-1 arrays, whereas the visible
# definition of t0 uses 0-d arrays; the output presumably comes from an
# earlier definition — re-run to refresh.
t0

TimeStep(
{'discount': array([1]),
 'observation': array([0]),
 'reward': array([0]),
 'step_type': array([0], dtype=int32)})

In [283]:
# Display the trajectory spec the agent's collect policy produces.
agent.collect_data_spec

Trajectory(
{'action': BoundedTensorSpec(shape=(), dtype=tf.int32, name='action', minimum=array(0, dtype=int32), maximum=array(7, dtype=int32)),
 'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'next_step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type'),
 'observation': BoundedTensorSpec(shape=(), dtype=tf.int32, name='observation', minimum=array(0, dtype=int32), maximum=array(7, dtype=int32)),
 'policy_info': (),
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')})

In [None]:
# Unexecuted duplicate of the earlier outer-shape check on a (2, 2) int32
# array against a spec of shape [2].
tfa.utils.nest_utils.get_outer_array_shape(np.array([[5,3],[5,2]],dtype=np.int32),spec=tfa.specs.ArraySpec(shape=[2],dtype=np.int32))