In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc
import tensorflow as tf
import numpy as np

from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts

2023-05-16 17:16:52.520203: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
class CardGameEnv(py_environment.PyEnvironment):
    """Single-player card game exposed as a TF-Agents ``PyEnvironment``.

    The observation is the running sum of the cards drawn so far, as an int32
    array of shape ``(1,)``. Action 0 draws a card whose value is sampled from
    ``np.random.randint(1, 11)`` (1 to 10 inclusive); action 1 ends the round.
    The round also ends as soon as the sum reaches 21 or more.

    The terminal reward is ``sum - 21`` when the sum is at most 21 and ``-21``
    otherwise. Every non-terminal step yields a reward of 0.
    """

    def __init__(self):
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(1,), dtype=np.int32, minimum=0, name='observation')
        self._state = 0
        self._episode_ended = False

    def action_spec(self):
        """Return the spec of the scalar action (0 = draw, 1 = stop)."""
        return self._action_spec

    def observation_spec(self):
        """Return the spec of the observation (running card sum)."""
        return self._observation_spec

    def _reset(self):
        """Start a new round with an empty hand and return the first time step."""
        self._state = 0
        self._episode_ended = False
        return ts.restart(np.array([self._state], dtype=np.int32))

    def _step(self, action):
        """Apply ``action`` and return the resulting time step.

        Args:
            action: 0 to draw another card, 1 to end the round.

        Returns:
            A termination time step when the round ends, a transition time
            step otherwise, or the first time step of a new round when the
            previous step already ended the round.

        Raises:
            ValueError: if ``action`` is neither 0 nor 1.
        """
        if self._episode_ended:
            # The last action ended the episode. Ignore the current action and
            # start a new episode.
            return self.reset()

        if action == 1:
            self._episode_ended = True
        elif action == 0:
            new_card = np.random.randint(1, 11)
            self._state += new_card
        else:
            raise ValueError('`action` should be 0 or 1.')

        if self._episode_ended or self._state >= 21:
            # Record the end of the episode for every terminal transition, not
            # only when the player chose to stop. Otherwise a direct `_step`
            # call after reaching 21 or more would keep drawing cards instead
            # of starting a new episode.
            self._episode_ended = True
            reward = self._state - 21 if self._state <= 21 else -21
            return ts.termination(np.array([self._state], dtype=np.int32), reward)
        return ts.transition(
            np.array([self._state], dtype=np.int32), reward=0.0, discount=1.0)

In [3]:
# Instantiate the card game and run the TF-Agents environment validator on it
# for 5 episodes.
# NOTE: `environment` is rebound to an MPG environment in later cells.
environment = CardGameEnv()
utils.validate_py_environment(environment, episodes=5)

## 2. MPG Environment

In [4]:
from mpg.games import mpg


In [5]:
# Import the project modules and force-reload them so that edits made on disk
# are picked up without restarting the kernel.
from mpg.games import strategy,mpg
from mpg.rl import model_free,environment as rl_env
import importlib
importlib.reload(strategy)
importlib.reload(mpg)
importlib.reload(model_free)
importlib.reload(rl_env)
# Load the example game graph from a path relative to the notebook's working
# directory.
# NOTE(review): ignore_header=1 presumably skips one header line — confirm
# against mpg_from_file.
G=mpg.mpg_from_file("data/test01.in",ignore_header=1)
# NOTE: a bare expression that is not the last statement of a cell is not
# displayed, so this line has no visible effect.
G
# Rebinds `environment`, which referred to the CardGameEnv instance before.
# NOTE(review): the meaning of the positional arguments (0, 0, 10) is defined
# in mpg.rl.environment and is not visible here.
environment = rl_env.MPGEnvironment(G,0,0,10,bad_action_penalty=None)
#utils.validate_py_environment(environment, episodes=5)

In [6]:
# Inspect the reward spec reported by the MPG environment.
environment.reward_spec()

ArraySpec(shape=(), dtype=dtype('float32'), name='reward')

In [7]:
# Rebuild the MPG environment with different settings: positional arguments
# (1, 0), max_turns=100 and a bad-action penalty of -10.
# NOTE(review): the meaning of the two positional arguments is not visible
# here — confirm against MPGEnvironment.
environment = rl_env.MPGEnvironment(G,1,0,max_turns=100,bad_action_penalty=-10)

In [8]:
# Wrap the environment together with a GreedyStrategy built on the same graph
# for `player1`.
# NOTE(review): presumably the wrapper plays that strategy for the opponent —
# confirm against FixedStrategyMPGEnvironment.
fixed_env=rl_env.FixedStrategyMPGEnvironment(environment,strategy.GreedyStrategy(environment.graph,turn=mpg.MeanPayoffGraph.player1))
fixed_env.reset()
# NOTE: `agent` is rebound to a DqnAgent in a later cell.
agent=model_free.RLearningAgent(fixed_env)
# Wrapper used below to build the TF environment that the DQN agent trains on.
fo_env=rl_env.FullyObservableMPGEnvironment(fixed_env)

In [9]:
# Overwrite the environment's private `_vertex` attribute directly with 0,
# bypassing the public API.
# NOTE(review): presumably this forces the current vertex — confirm.
environment._vertex=np.array(0)

In [10]:
# Scratch cell: echoes a dict literal; the value is not bound to any name.
{1:2,5:5,3:5,4:5}

{1: 2, 5: 5, 3: 5, 4: 5}

In [11]:
# Scratch cell: dict mapping 0 and 1 to length-5 zero vectors.
# NOTE: the name `A` is rebound several times by later cells.
A={0:np.zeros(5),1:np.zeros(5)}

In [12]:
# Scratch cell: read element 1 of the array stored under key 1.
A[1][1]

0.0

In [13]:
# Build the visualisation object for the loaded game graph and display it as
# the cell output.
from mpg.visualisation import game as vgame
VG=vgame.MPGVisualisation(G)
VG

MPGVisualisation(layout=Layout(height='500px', width='100%'))

In [14]:
# Import the remaining RL helper modules and force-reload each one so that
# on-disk edits are picked up without restarting the kernel.
# The last reload call is the cell's final expression, so the reloaded module
# object is echoed as the cell output.
importlib.reload(rl_env)
import mpg.rl.driver as rl_driver
importlib.reload(rl_driver)
import tf_agents as tfa
import mpg.rl.replay_buffers as rl_replay
importlib.reload(rl_replay)
import mpg.rl.agents as rl_agents
importlib.reload(rl_agents)
import mpg.rl.architectures.example as rl_arch
importlib.reload(rl_arch)


<module 'mpg.rl.architectures.example' from '/home/ramizouari/Academic/AI/MeanPayOffGames/notebooks/mpg/rl/architectures/example.py'>

In [15]:
# Build a matrix extractor for graphs of size 8 with matrix="both" and show the
# spec it derives from `fixed_env`; the recorded spec has shape (2, 8, 8).
# NOTE: `E` is rebound to a plain dict in a later cell.
importlib.reload(rl_env)
E=rl_env.MPGMatrixExtractor(matrix="both",graph_size=8)
E.get_env_specs(fixed_env)

ArraySpec(shape=(2, 8, 8), dtype=dtype('float32'), name=None)

In [16]:
# Apply the extractor to the environment and display the resulting array.
E(fixed_env)

array([[[ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.],
        [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.],
        [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.]],

       [[ 0.,  5.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  5.,  0.,  4.,  0.,  0.,  0.],
        [ 0.,  0.,  0., -7.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  5.,  0.],
        [ 0.,  0.,  0.,  0.,  0., -3.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  3.,  0.],
        [ 0., -3.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]]], dtype=float32)

In [17]:
# Adam optimizer with default hyper-parameters; reused when the DQN agent is
# built in a later cell.
optimizer=tf.keras.optimizers.Adam()

# Converter built from the environment and the matrix extractor `E`; its
# data_spec["environment"] entry is the input spec of the Q-network below.
converter=rl_env.MPGTrajectoryConverter(fixed_env,E)

# Plain TF-Agents Q-network with two fully connected layers (75 and 40 units)
# and ReLU activations.
# NOTE: `qnet` is not referenced by any later cell visible in this notebook.
qnet=tfa.networks.q_network.QNetwork(
    input_tensor_spec=converter.data_spec["environment"],
    action_spec=fixed_env.action_spec(),
    preprocessing_layers=None,
    preprocessing_combiner=None,
    conv_layer_params=None,
    fc_layer_params=(75, 40),
    dropout_layer_params=None,
    activation_fn=tf.keras.activations.relu,
    kernel_initializer=None,
    batch_squash=True,
    dtype=tf.float32,
    q_layer_activation_fn=None,
    name='QNetwork'
)



In [94]:
#buffer=rl_replay.MPGMatrixBuffer(converter.data_spec["environment"],10)
# Import and force-reload the observer and utility helpers.
from mpg.rl import observers as rl_observer
from mpg.rl import utils as rl_utils
importlib.reload(rl_observer)
importlib.reload(rl_utils)
# TensorFlow wrapper around the fully observable Python environment.
train_env=tfa.environments.tf_py_environment.TFPyEnvironment(fo_env)
# Specs with an extra outer dimension added (size 1 for T and A, unknown size
# for O).
# NOTE: T and O are not referenced again in the visible cells, and `A` is
# rebound by later cells.
T=tfa.specs.tensor_spec.add_outer_dims_nest(
    train_env.time_step_spec(), [1]
)
A=tfa.specs.tensor_spec.add_outer_dims_nest(
    train_env.action_spec(), [1]
)
O=tfa.specs.tensor_spec.add_outer_dims_nest(
    train_env.observation_spec(), [None]
)


def normal_splitter(observation):
    """Split an observation into ``(observation, second_element)``.

    The first element of the returned pair is always ``observation`` itself.

    * When ``observation["state"]`` is a ``TensorSpec``, the second element is
      a ``tf.TensorSpec`` of shape ``(state.maximum + 1,)``.
    * Otherwise the second element is the slice of
      ``observation["environment"]`` addressed by the index ``[0, 0, *state]``,
      reshaped to a single row.

    NOTE(review): presumably that slice is the adjacency row of the current
    vertex and is meant as an action mask — confirm against the observation
    layout. The concatenation requires ``observation["state"]`` to be rank 1.
    """
    state = observation["state"]
    if isinstance(state, tfa.specs.TensorSpec):
        size = tf.cast(state.maximum, dtype=tf.int32) + 1
        return observation, tf.TensorSpec(shape=(size,))

    environment_tensor = observation["environment"]
    leading_index = tf.constant([0, 0])
    state_index = tf.cast(state, dtype=tf.int32)
    full_index = tf.concat([leading_index, state_index], 0)
    selected = tf.gather_nd(environment_tensor, full_index)
    return observation, tf.reshape(selected, shape=(1, -1))
# Q-network for the fully observable environment; it receives the observation
# spec and `fo_env.count_vertices`.
net=rl_arch.MPGNetworkExample(train_env.observation_spec(),fo_env.count_vertices)
# Splitter passed to the agent as `observation_and_action_constraint_splitter`.
splitter=rl_utils.MPGActionConstraintSplitter()
# NOTE: rebinds `agent`, which previously held the RLearningAgent.
agent=tfa.agents.DqnAgent(
    time_step_spec=train_env.time_step_spec(),
    action_spec= train_env.action_spec(),
    q_network= net,
    optimizer= optimizer,
    observation_and_action_constraint_splitter= splitter,
    epsilon_greedy= 0.1,
    n_step_update = 3,
#    training_data_spec= converter.data_spec
)
# TF-Agents expects `initialize()` to be called once after construction and
# before collecting or training. For DQN this synchronises the target network
# with the Q-network.
agent.initialize()
# Small uniform replay buffer (capacity 20 per batch entry, batch size 1) fed
# by the driver through `buffer.add_batch`.
buffer=tfa.replay_buffers.tf_uniform_replay_buffer.TFUniformReplayBuffer(agent.collect_data_spec,batch_size=1,max_length=20)
driver=rl_driver.MPGDriver(train_env,agent.collect_policy,total_observers=[],partial_observers=[buffer.add_batch],num_episodes=10,max_steps=8)

In [95]:
# Dataset view over the replay buffer: batches of 8 sequences of 4 consecutive
# steps each.
# NOTE(review): num_steps=4 matches n_step_update=3 plus one, which is what the
# TF-Agents DQN n-step loss expects — confirm if n_step_update changes.
dataset = buffer.as_dataset(
    num_parallel_calls=8,
    sample_batch_size=8,
    num_steps=4).prefetch(3)
# Fill the buffer with one driver run, then draw a single batch and display it.
driver.run(train_env.reset())
batch=next(iter(dataset))
batch

(Trajectory(
 {'action': <tf.Tensor: shape=(8, 4), dtype=int32, numpy=
 array([[2, 7, 2, 7],
        [2, 7, 2, 7],
        [7, 2, 7, 2],
        [2, 7, 4, 7],
        [7, 2, 7, 2],
        [2, 7, 2, 7],
        [7, 2, 7, 2],
        [7, 2, 7, 2]], dtype=int32)>,
  'discount': <tf.Tensor: shape=(8, 4), dtype=float32, numpy=
 array([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]], dtype=float32)>,
  'next_step_type': <tf.Tensor: shape=(8, 4), dtype=int32, numpy=
 array([[1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1]], dtype=int32)>,
  'observation': DictWrapper({'state': <tf.Tensor: shape=(8, 4), dtype=int32, numpy=
 array([[1, 3, 1, 3],
        [1, 3, 1, 3],
        [3, 1, 3, 1],
        [1, 3, 1, 5],
        [3, 1, 3, 

In [96]:
# Sanity check of the splitter on sampled data: take a fresh batch, keep only
# the first step of every sequence for each observation key, and pass the
# resulting dict to the splitter.
# NOTE: this rebinds `E`, which previously held the MPGMatrixExtractor.
importlib.reload(rl_utils)
batch=next(iter(dataset))
E={key:batch[0].observation[key][:,0] for key in batch[0].observation}
splitter(E)

({'state': <tf.Tensor: shape=(8,), dtype=int32, numpy=array([3, 1, 3, 3, 3, 1, 3, 1], dtype=int32)>,
  'environment': <tf.Tensor: shape=(8, 2, 8, 8), dtype=float32, numpy=
  array([[[[ 0.,  1.,  0., ...,  0.,  0.,  0.],
           [ 0.,  0.,  1., ...,  0.,  0.,  0.],
           [ 0.,  0.,  0., ...,  0.,  0.,  0.],
           ...,
           [ 0.,  0.,  0., ...,  0.,  1.,  1.],
           [ 0.,  1.,  0., ...,  0.,  0.,  0.],
           [ 0.,  1.,  0., ...,  0.,  0.,  0.]],
  
          [[ 0.,  5.,  0., ...,  0.,  0.,  0.],
           [ 0.,  0.,  5., ...,  0.,  0.,  0.],
           [ 0.,  0.,  0., ...,  0.,  0.,  0.],
           ...,
           [ 0.,  0.,  0., ...,  0.,  3.,  0.],
           [ 0., -3.,  0., ...,  0.,  0.,  0.],
           [ 0.,  0.,  0., ...,  0.,  0.,  0.]]],
  
  
         [[[ 0.,  1.,  0., ...,  0.,  0.,  0.],
           [ 0.,  0.,  1., ...,  0.,  0.,  0.],
           [ 0.,  0.,  0., ...,  0.,  0.,  0.],
           ...,
           [ 0.,  0.,  0., ...,  0.,  1.,  1.],


In [97]:


# Training loop: alternate between one driver run (experience collection) and
# one gradient step on a batch sampled from the replay buffer.
iterator=iter(dataset)
iterations=100
time_step=train_env.reset()
log_interval=1
# NOTE: eval_interval is assigned but never used in this cell.
eval_interval=1
for k in range(iterations):
    # Collect more experience, resuming from the time step returned by the
    # previous driver run.
    time_step, _ = driver.run(time_step)

    # Sample a batch of data from the buffer and update the agent's network.
    experience, unused_info = next(iterator)
    train_loss = agent.train(experience).loss

    step = agent.train_step_counter.numpy()

    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss))



Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))


Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))


step = 1: loss = 1.1339476108551025
step = 2: loss = 2.201998472213745
step = 3: loss = 1.8107719421386719
step = 4: loss = 1.2546463012695312
step = 5: loss = 0.5603786706924438
step = 6: loss = 1.3612220287322998
step = 7: loss = 0.4488934278488159
step = 8: loss = 0.4845980405807495
step = 9: loss = 0.35422948002815247
step = 10: loss = 0.45054376125335693
step = 11: loss = 0.4913141429424286
step = 12: loss = 0.4865337908267975
step = 13: loss = 0.5816371440887451
step = 14: loss = 1.6906977891921997
step = 15: loss = 2.4146690368652344
step = 16: loss = 2.189333200454712
step = 17: loss = 1.0436348915100098
step = 18: loss = 2.8249311447143555
step = 19: loss = 1.458680510520935
step = 20: loss = 1.2377842664718628
step = 21: loss = 1.9340684413909912
step = 22: loss = 0.8917479515075684
step = 23: loss = 1.4628241062164307
step = 24: loss = 0.8024151921272278
step = 25: loss = 1.2241432666778564
step = 26: loss = 1.204770803451538
step = 27: loss = 1.0098650455474854
step = 28: l

## Z=iter(dataset)

In [None]:
# `Z` is only mentioned in the markdown heading above this cell and is never
# assigned in a code cell, so create the iterator here before drawing from it.
Z=iter(dataset)
next(Z)

In [None]:
# Print the `state` observation of the first elements read from the replay
# buffer (stops after index 101).
# The loop variable is named `element`, as in the sibling inspection loop,
# rather than `_`: `_` is IPython's last-output variable, and binding it by
# hand stops IPython from updating it.
for k,element in enumerate(buffer.as_dataset()):
    print(element[0].observation["state"])
    if k > 100:
        break

In [None]:
# Inspect the time-step spec of the environment held by the driver.
driver.env.time_step_spec()

In [None]:
# Dump the first four raw elements of the replay-buffer dataset, each followed
# by a separator line carrying its index.
for k,element in enumerate(buffer.as_dataset()):
    print(element)
    print(f"****::{k}:****")
    if k > 2:
        break

In [None]:
# Inspect the `batched` attribute of the fully observable environment.
fo_env.batched

In [None]:
# Scratch tensor for the gather_nd / indexing experiments below.
# NOTE: rebinds `A`, which earlier cells bound to other objects.
A=tf.random.uniform(shape=[3,3,3])

In [111]:
# Reset the training environment and query the agent's `policy` (as opposed to
# `collect_policy`, which the driver uses) for the action at the first step.
t0=train_env.reset()
agent.policy.action(t0)

PolicyStep(action=<tf.Tensor: shape=(1,), dtype=int32, numpy=array([4], dtype=int32)>, state=(), info=())

In [312]:
# Scratch cell: gather the elements at positions [0,0,0] and [1,1,1] of `A`.
# NOTE: the recorded ValueError comes from hidden state. When this cell ran
# (execution count 312), `A` was the dict of zero arrays rather than a tensor.
tf.gather_nd(A,tf.constant([[0,0,0],[1,1,1]]))

ValueError: Attempt to convert a value ({0: array([0., 0., 0., 0., 0.]), 1: array([0., 0., 0., 0., 0.])}) with an unsupported type (<class 'dict'>) to a Tensor.

In [313]:
# Echo the current value of `A`; the recorded output shows the dict of zero
# arrays.
A

{0: array([0., 0., 0., 0., 0.]), 1: array([0., 0., 0., 0., 0.])}

In [271]:
# Read a single element of `A`. The recorded output is a scalar float32
# tensor, so `A` was a tensor when this cell ran (execution count 271).
A[1,1,1]

<tf.Tensor: shape=(), dtype=float32, numpy=0.6443225>

In [339]:
# Display the adjacency matrix of the graph behind the wrapped environment.
fo_env.env.graph.adjacency_matrix

array([[0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 1.],
       [0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 1.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.]], dtype=float32)

In [424]:
# Scratch cell: check the static shape reported for a zeros tensor.
tf.zeros(shape=(2,2,2)).shape

TensorShape([2, 2, 2])

In [46]:
# Scratch tensor with two leading dimensions (2, 1), used by the batched
# gather_nd experiment in the next cell.
# NOTE: rebinds `A` again.
A=tf.random.uniform([2,1,5,5,5])

In [49]:
# Batched gather: with batch_dims=2 the two leading axes of `A` and `I` are
# treated as batch axes, and each index pair selects one length-5 vector from
# the remaining (5, 5, 5) block. The result has shape (2, 1, 5).
# `I` is defined here so the cell also works on a top-to-bottom run; in the
# notebook its only other definition sits in a cell placed after this one.
I=tf.constant([[[1,1]],[[2,2]]])
tf.gather_nd(A,indices=I,batch_dims=2)

<tf.Tensor: shape=(2, 1, 5), dtype=float32, numpy=
array([[[0.76097524, 0.13867462, 0.5122886 , 0.18104172, 0.08846366]],

       [[0.1883893 , 0.79562414, 0.60452783, 0.92311394, 0.04559219]]],
      dtype=float32)>

In [48]:
# Index tensor of shape (2, 1, 2) for the batched gather_nd experiment.
# NOTE: execution counts show this cell (48) ran before the gather_nd cell
# above (49), even though it is placed after it in the notebook.
I=tf.constant([[[1,1]],[[2,2]]])

In [113]:
# Attempt to read the agent's Q-network. The recorded output is an
# AttributeError: this DqnAgent object has no public `q_network` attribute.
agent.q_network

AttributeError: 'DqnAgent' object has no attribute 'q_network'

In [125]:
# Scratch cell: sum a ragged tensor with rows [0, 1] and [1] along axis 0; the
# recorded result is [1, 1].
tf.math.reduce_sum(tf.ragged.constant([[0,1],[1]]),axis=0)

<tf.Tensor: shape=(2,), dtype=int32, numpy=array([1, 1], dtype=int32)>

In [132]:
# Scratch cell: attempt to wrap a SparseTensor in a tf.Variable. The recorded
# output is a TypeError because the required `dense_shape` argument of
# SparseTensor is missing.
tf.Variable(tf.sparse.SparseTensor([0,5],[0,0]))

TypeError: SparseTensor.__init__() missing 1 required positional argument: 'dense_shape'