In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc
import tensorflow as tf
import numpy as np

from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts

2023-05-14 14:31:41.362632: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [317]:
class CardGameEnv(py_environment.PyEnvironment):
    def __init__(self):
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(1,), dtype=np.int32, minimum=0, name='observation')
        self._state = 0
        self._episode_ended = False


    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def _reset(self):
        self._state = 0
        self._episode_ended = False
        return ts.restart(np.array([self._state], dtype=np.int32))

    def _step(self, action):

        if self._episode_ended:
          # The last action ended the episode. Ignore the current action and start
          # a new episode.
            return self.reset()

        # Make sure episodes don't go on forever.
        if action == 1:
            self._episode_ended = True
        elif action == 0:
            new_card = np.random.randint(1, 11)
            self._state += new_card
        else:
            raise ValueError('`action` should be 0 or 1.')

        if self._episode_ended or self._state >= 21:
            reward = self._state - 21 if self._state <= 21 else -21
            return ts.termination(np.array([self._state], dtype=np.int32), reward)
        else:
            return ts.transition(
              np.array([self._state], dtype=np.int32), reward=0.0, discount=1.0)

In [318]:
environment = CardGameEnv()
utils.validate_py_environment(environment, episodes=5)

## 2. MPG Environment

In [319]:
from mpg.games import mpg


In [320]:
from mpg.games import strategy,mpg
from mpg.rl import model_free,environment as rl_env
import importlib
importlib.reload(strategy)
importlib.reload(mpg)
importlib.reload(model_free)
importlib.reload(rl_env)
G=mpg.mpg_from_file("data/test01.in",ignore_header=1)
G
environment = rl_env.MPGEnvironment(G,0,0,10,bad_action_penalty=None)
#utils.validate_py_environment(environment, episodes=5)

In [321]:
environment.reward_spec()

ArraySpec(shape=(), dtype=dtype('float32'), name='reward')

In [322]:
environment = rl_env.MPGEnvironment(G,1,0,max_turns=100,bad_action_penalty=-1000)

In [323]:
fixed_env=rl_env.FixedStrategyMPGEnvironment(environment,strategy.GreedyStrategy(environment.graph,turn=mpg.MeanPayoffGraph.player1))
fixed_env.reset()
agent=model_free.RLearningAgent(fixed_env)
fo_env=rl_env.FullyObservableMPGEnvironment(fixed_env)

In [324]:
environment._vertex=np.array(0)

In [325]:
{1:2,5:5,3:5,4:5}

{1: 2, 5: 5, 3: 5, 4: 5}

In [326]:
A={0:np.zeros(5),1:np.zeros(5)}

In [327]:
A[1][1]

0.0

In [328]:
from mpg.visualisation import game as vgame
VG=vgame.MPGVisualisation(G)
VG

MPGVisualisation(layout=Layout(height='500px', width='100%'))

In [329]:
importlib.reload(rl_env)
import mpg.rl.driver as rl_driver
importlib.reload(rl_driver)
import tf_agents as tfa
import mpg.rl.replay_buffers as rl_replay
importlib.reload(rl_replay)
import mpg.rl.agents as rl_agents
importlib.reload(rl_agents)
import mpg.rl.architectures.example as rl_arch
importlib.reload(rl_arch)


<module 'mpg.rl.architectures.example' from '/home/ramizouari/Academic/AI/MeanPayOffGames/notebooks/mpg/rl/architectures/example.py'>

In [330]:
importlib.reload(rl_env)
E=rl_env.MPGMatrixExtractor(matrix="both",graph_size=8)
E.get_env_specs(fixed_env)

ArraySpec(shape=(2, 8, 8), dtype=dtype('float32'), name=None)

In [331]:
E(fixed_env)

array([[[ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.],
        [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.],
        [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.]],

       [[ 0.,  5.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  5.,  0.,  4.,  0.,  0.,  0.],
        [ 0.,  0.,  0., -7.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  5.,  0.],
        [ 0.,  0.,  0.,  0.,  0., -3.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  3.,  0.],
        [ 0., -3.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]]], dtype=float32)

In [332]:
optimizer=tf.keras.optimizers.Adam()

converter=rl_env.MPGTrajectoryConverter(fixed_env,E)

qnet=tfa.networks.q_network.QNetwork(
    input_tensor_spec=converter.data_spec["environment"],
    action_spec=fixed_env.action_spec(),
    preprocessing_layers=None,
    preprocessing_combiner=None,
    conv_layer_params=None,
    fc_layer_params=(75, 40),
    dropout_layer_params=None,
    activation_fn=tf.keras.activations.relu,
    kernel_initializer=None,
    batch_squash=True,
    dtype=tf.float32,
    q_layer_activation_fn=None,
    name='QNetwork'
)



In [362]:
#buffer=rl_replay.MPGMatrixBuffer(converter.data_spec["environment"],10)
from mpg.rl import observers as rl_observer
from mpg.rl import utils as rl_utils
importlib.reload(rl_observer)
importlib.reload(rl_utils)
train_env=tfa.environments.tf_py_environment.TFPyEnvironment(fo_env)
T=tfa.specs.tensor_spec.add_outer_dims_nest(
    train_env.time_step_spec(), [1]
)
A=tfa.specs.tensor_spec.add_outer_dims_nest(
    train_env.action_spec(), [1]
)
O=tfa.specs.tensor_spec.add_outer_dims_nest(
    train_env.observation_spec(), [1]
)


def normal_splitter(observation):
    if isinstance(observation["state"],tfa.specs.TensorSpec):
        return observation,tf.TensorSpec(shape=(tf.cast(observation["state"].maximum,dtype=tf.int32)+1,))
    C=tf.gather_nd(observation["environment"],tf.concat([tf.constant([0,0]),tf.cast(observation["state"],dtype=tf.int32)],0))
    return observation,tf.reshape(C,shape=(1,-1))
net=rl_arch.MPGNetworkExample(O,fo_env.count_vertices)
splitter=rl_utils.MPGActionConstraintSplitter()
agent=tfa.agents.DqnAgent(
    time_step_spec=train_env.time_step_spec(),
    action_spec= train_env.action_spec(),
    q_network= net,
    optimizer= optimizer,
    observation_and_action_constraint_splitter= splitter,
    epsilon_greedy= 0.1,
    n_step_update = 3,
#    training_data_spec= converter.data_spec
)
buffer=tfa.replay_buffers.tf_uniform_replay_buffer.TFUniformReplayBuffer(agent.collect_data_spec,batch_size=1,max_length=3)
driver=rl_driver.MPGDriver(train_env,agent.collect_policy,total_observers=[],partial_observers=[buffer.add_batch,rl_observer.PrinterObserver()],num_episodes=10,max_steps=3)

In [363]:
import traceback
driver.run(train_env.reset())

S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[6]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[6]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[2]
S:[3], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]
S:[1], A:[4]
S:[5], A:[7]

(TimeStep(
 {'discount': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>,
  'observation': {'environment': <tf.Tensor: shape=(1, 2, 8, 8), dtype=float32, numpy=
 array([[[[ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
          [ 0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.],
          [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
          [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.],
          [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
          [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.],
          [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
          [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.]],
 
         [[ 0.,  5.,  0.,  0.,  0.,  0.,  0.,  0.],
          [ 0.,  0.,  5.,  0.,  4.,  0.,  0.,  0.],
          [ 0.,  0.,  0., -7.,  0.,  0.,  0.,  0.],
          [ 0.,  0.,  0.,  0.,  0.,  0.,  5.,  0.],
          [ 0.,  0.,  0.,  0.,  0., -3.,  0.,  0.],
          [ 0.,  0.,  0.,  0.,  0.,  0.,  3.,  0.],
          [ 0., -3.,  0.,  0.,  0.,  0.,  0.,  0.],
          [ 0.,  0.,  0.,  0., 

In [None]:
t0=tfa.trajectories.time_step.TimeStep(step_type=np.array(tfa.trajectories.time_step.StepType.FIRST),observation=np.array(0),reward=np.array(0),discount=np.array(1))
s=tfa.trajectories.PolicyStep(action=np.array(0))
t1=tfa.trajectories.time_step.TimeStep(step_type=tfa.trajectories.time_step.StepType.FIRST,observation=np.array(0),reward=0,discount=1)
#driver.observers[0](tfa.trajectories.from_transition(t0,s,t1))

In [None]:
agent.collect_data_spec

In [None]:
train_env.observation_spec()


In [311]:
for k,_ in enumerate(buffer.as_dataset()):
    print(_[0].observation["state"])
    if k > 100:
        break

InvalidArgumentError: {{function_node __wrapped__IteratorGetNext_output_types_9_device_/job:localhost/replica:0/task:0/device:CPU:0}} assertion failed: [TFUniformReplayBuffer is empty. Make sure to add items before sampling the buffer.] [Condition x > y did not hold element-wise:] [x (TFUniformReplayBuffer/get_next/SelectV2_1:0) = ] [0] [y (TFUniformReplayBuffer/get_next/SelectV2:0) = ] [0]
	 [[{{function_node TFUniformReplayBuffer_get_next_assert_greater_Assert_AssertGuard_false_9950585}}{{node TFUniformReplayBuffer/get_next/assert_greater/Assert/AssertGuard/Assert}}]] [Op:IteratorGetNext]

In [None]:
driver.env.time_step_spec()

In [None]:
for k,element in enumerate(buffer.as_dataset()):
    print(element)
    print(f"****::{k}:****")
    if k > 2:
        break

In [None]:
fo_env.batched

In [None]:
A=tf.random.uniform(shape=[3,3,3])

In [None]:
A[2,tf.constant(1)]

In [312]:
tf.gather_nd(A,tf.constant([[0,0,0],[1,1,1]]))

ValueError: Attempt to convert a value ({0: array([0., 0., 0., 0., 0.]), 1: array([0., 0., 0., 0., 0.])}) with an unsupported type (<class 'dict'>) to a Tensor.

In [313]:
A

{0: array([0., 0., 0., 0., 0.]), 1: array([0., 0., 0., 0., 0.])}

In [271]:
A[1,1,1]

<tf.Tensor: shape=(), dtype=float32, numpy=0.6443225>

In [339]:
fo_env.env.graph.adjacency_matrix

array([[0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 1.],
       [0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 1.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.]], dtype=float32)