# Trainer TF2 scratchpad

Generate training data
```
python rgi/main.py --game connect4 --player1 random --player2 random --num_games 100 --save_trajectories
```

In [None]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import os
from typing import Any, NamedTuple
from rgi.games import connect4
from rgi.core import trajectory

print(f"TensorFlow version: {tf.__version__}")

# Check for GPU availability
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

import numpy as np
np.set_printoptions(linewidth=150)

TensorFlow version: 2.18.0-rc2
Num GPUs Available:  1


In [3]:
# Use GPU if available
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

print(gpus)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [4]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
print("Is built with CUDA:", tf.test.is_built_with_cuda())
print("Is GPU available:", tf.test.is_gpu_available())
print("GPU devices:", tf.config.list_physical_devices('GPU'))

Num GPUs Available:  1
Is built with CUDA: True
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
Is GPU available: True
GPU devices: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


I0000 00:00:1729124454.084847     979 gpu_device.cc:2022] Created device /device:GPU:0 with 5708 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2070 SUPER, pci bus id: 0000:2d:00.0, compute capability: 7.5


In [5]:
import tensorflow as tf
print("TensorFlow version:", tf.__version__)
print("CUDA version:", tf.sysconfig.get_build_info()["cuda_version"])
print("cuDNN version:", tf.sysconfig.get_build_info()["cudnn_version"])

TensorFlow version: 2.18.0-rc2
CUDA version: 12.5.1
cuDNN version: 9


In [6]:
with tf.device('/GPU:0'):
    a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
    b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
    c = tf.matmul(a, b)
print(c)

tf.Tensor(
[[22. 28.]
 [49. 64.]], shape=(2, 2), dtype=float32)


I0000 00:00:1729124456.177457     979 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5708 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2070 SUPER, pci bus id: 0000:2d:00.0, compute capability: 7.5


# Load Trajectories

In [7]:
# Define TrajectoryStep NamedTuple
class TrajectoryStep(NamedTuple):
    move_index: int
    state: tf.Tensor
    action: tf.Tensor
    next_state: tf.Tensor
    reward: tf.Tensor

def fixup_reward(x: float) -> float:
    return (x + 1) / 2

# Load trajectories
game_name = "connect4"
trajectories_glob = os.path.join("..", "data", "trajectories", game_name, "*.trajectory.npy")
trajectories = trajectory.load_trajectories(trajectories_glob)

print(f'trajectories_glob: {trajectories_glob}')
print(f'num_trajectories: {len(trajectories)}')

# Unroll trajectories
def unroll_trajectory(encoded_trajectories):
    for t in encoded_trajectories:
        for i in range(t.length - 1):
            state = tf.convert_to_tensor(t.states[i], dtype=tf.float32)
            next_state = tf.convert_to_tensor(t.states[i + 1], dtype=tf.float32)
            action = tf.convert_to_tensor(t.actions[i], dtype=tf.int32)
            reward = tf.convert_to_tensor(fixup_reward(t.final_rewards[0]), dtype=tf.float32)
            
            yield TrajectoryStep(i, state, action, next_state, reward)

all_trajectory_steps = list(unroll_trajectory(trajectories))


trajectories_glob: ../data/trajectories/connect4/*.trajectory.npy
num_trajectories: 1100


2024-10-17 00:21:00.087765: W external/xla/xla/service/gpu/nvptx_compiler.cc:893] The NVIDIA driver's CUDA version is 12.4 which is older than the PTX compiler version 12.6.77. Because the driver is older than the PTX compiler version, XLA is disabling parallel compilation, which may slow down compilation. You should update your NVIDIA driver or use the NVIDIA-provided CUDA forward compatibility packages.


In [8]:
# Prepare batches
state_batch = tf.stack([t.state for t in all_trajectory_steps])
action_batch = tf.stack([t.action for t in all_trajectory_steps])
reward_batch = tf.stack([t.reward for t in all_trajectory_steps])

print(f'state shape:  {state_batch.shape}')
print(f'action shape: {action_batch.shape}')
print(f'reward shape: {reward_batch.shape}')

state shape:  (22835, 43)
action shape: (22835,)
reward shape: (22835,)


# Define Models

In [42]:
# Define models - original. Fails.
class Connect4StateEmbedder(keras.Model):
    def __init__(self, embedding_dim=64, hidden_dim=256):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        
        self.conv1 = keras.layers.Conv2D(32, kernel_size=3, padding='same', activation='relu')
        self.conv2 = keras.layers.Conv2D(64, kernel_size=3, padding='same', activation='relu')
        self.flatten = keras.layers.Flatten()
        self.dense1 = keras.layers.Dense(self.hidden_dim, activation='relu')
        self.dense2 = keras.layers.Dense(self.embedding_dim)
    
    def call(self, inputs):
        x = self._state_to_array(inputs)
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.flatten(x)
        x = self.dense1(x)
        return self.dense2(x)
    
    def _state_to_array(self, encoded_state_batch):
        return tf.reshape(encoded_state_batch[:, :-1], [-1, 6, 7, 1])

In [43]:
def test_state_embedder():
    state_embedder = Connect4StateEmbedder()
    sample_state = tf.random.normal((2, 43))  # 2 sample states
    output = state_embedder(sample_state)
    print("State Embedder output shape:", output.shape)
    assert output.shape == (2, 64), f"Expected shape (2, 64), got {output.shape}"

# Force CPU mode.
#with tf.device('/CPU:0'):
#    test_state_embedder() 

In [44]:
# Test on GPU
test_state_embedder()

State Embedder output shape: (2, 64)


In [45]:
class Connect4ActionEmbedder(keras.Model):
    def __init__(self, embedding_dim: int = 64, num_actions: int = 7):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.num_actions = num_actions
        self.embedding = keras.layers.Embedding(num_actions, embedding_dim)
    
    def call(self, inputs: tf.Tensor) -> tf.Tensor:
        return self.embedding(inputs - 1)
    
    def all_action_embeddings(self) -> tf.Tensor:
        return self.embedding(tf.range(self.num_actions))

In [46]:
# After defining the Connect4ActionEmbedder class
def test_action_embedder():
    action_embedder = Connect4ActionEmbedder()
    sample_actions = tf.constant([1, 3, 5, 7])  # 4 sample actions
    output = action_embedder(sample_actions)
    print("Action Embedder output shape:", output.shape)
    assert output.shape == (4, 64), f"Expected shape (4, 64), got {output.shape}"

test_action_embedder()

Action Embedder output shape: (4, 64)


In [47]:
class PredictionModel(keras.Model):
    def __init__(self, state_embedder: keras.Model, action_embedder: keras.Model, embedding_dim: int = 64, num_actions: int = 7):
        super().__init__()
        self.state_embedder = state_embedder
        self.action_embedder = action_embedder
        self.embedding_dim = embedding_dim
        self.num_actions = num_actions
        self.reward_head = keras.layers.Dense(1)
        
        # Call build method to initialize weights
        self.build((None, 43))  # Assuming input shape is (batch_size, 43)
    
    def build(self, input_shape):
        # This ensures that all layers are built
        self.state_embedder.build(input_shape)
        self.action_embedder.build((None,))
        self.reward_head.build((None, self.embedding_dim))
        super().build(input_shape)
    
    def call(self, inputs):
        state_embeddings = self.state_embedder(inputs)
        action_embeddings = self.action_embedder.all_action_embeddings()
        action_logits = tf.matmul(state_embeddings, action_embeddings, transpose_b=True)
        reward_pred = self.reward_head(state_embeddings)
        return action_logits, reward_pred
    
    def action_probs(self, state_batch):
        action_logits, _ = self(state_batch)
        return tf.nn.softmax(action_logits)
    
    def reward_pred(self, state_batch):
        _, reward_pred = self(state_batch)
        return reward_pred

In [48]:
# After defining the PredictionModel class
def test_prediction_model():
    state_embedder = Connect4StateEmbedder()
    action_embedder = Connect4ActionEmbedder()
    prediction_model = PredictionModel(state_embedder, action_embedder)
    
    sample_state = tf.random.normal((3, 43))  # 3 sample states
    action_logits, reward_pred = prediction_model(sample_state)
    
    print("Action logits shape:", action_logits.shape)
    print("Reward prediction shape:", reward_pred.shape)
    
    assert action_logits.shape == (3, 7), f"Expected action logits shape (3, 7), got {action_logits.shape}"
    assert reward_pred.shape == (3, 1), f"Expected reward prediction shape (3, 1), got {reward_pred.shape}"
    
    action_probs = prediction_model.action_probs(sample_state)
    print("Action probabilities shape:", action_probs.shape)
    assert action_probs.shape == (3, 7), f"Expected action probabilities shape (3, 7), got {action_probs.shape}"
    
    # Check if probabilities sum to 1
    prob_sums = tf.reduce_sum(action_probs, axis=1)
    tf.debugging.assert_near(prob_sums, tf.ones_like(prob_sums), atol=1e-6)

test_prediction_model()

Action logits shape: (3, 7)
Reward prediction shape: (3, 1)
Action probabilities shape: (3, 7)




In [49]:
# Loss function
def loss_fn(prediction_model, batch, l2_weight=1e-4):
    action_logits, reward_pred = prediction_model(batch['state'])
    action_labels = batch['action'] - 1
    action_data_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=action_labels, logits=action_logits))
    
    reward_data_loss = tf.reduce_mean(tf.square(reward_pred - tf.expand_dims(batch['reward'], -1)))
    
    l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in prediction_model.trainable_variables])
    
    total_loss = action_data_loss + reward_data_loss + l2_weight * l2_loss
    return total_loss, (action_logits, reward_pred)

In [50]:
# After defining the loss_fn
def test_loss_fn():
    state_embedder = Connect4StateEmbedder()
    action_embedder = Connect4ActionEmbedder()
    prediction_model = PredictionModel(state_embedder, action_embedder)
    
    sample_batch = {
        'state': tf.random.normal((5, 43)),
        'action': tf.random.uniform((5,), minval=1, maxval=8, dtype=tf.int32),
        'reward': tf.random.uniform((5,))
    }
    
    loss, (action_logits, reward_pred) = loss_fn(prediction_model, sample_batch)
    
    print("Loss:", loss.numpy())
    print("Action logits shape:", action_logits.shape)
    print("Reward prediction shape:", reward_pred.shape)
    
    assert isinstance(loss, tf.Tensor), f"Expected loss to be a tensor, got {type(loss)}"
    assert action_logits.shape == (5, 7), f"Expected action logits shape (5, 7), got {action_logits.shape}"
    assert reward_pred.shape == (5, 1), f"Expected reward prediction shape (5, 1), got {reward_pred.shape}"

test_loss_fn()


Loss: 2.3576984
Action logits shape: (5, 7)
Reward prediction shape: (5, 1)




In [51]:

# Training step
def train_step(prediction_model, optimizer, batch):
    with tf.GradientTape() as tape:
        loss, (logits, reward_pred) = loss_fn(prediction_model, batch)
    gradients = tape.gradient(loss, prediction_model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, prediction_model.trainable_variables))
    return loss, logits, reward_pred

# Main training loop
def train_model(print_logits=False, num_epochs=10, batch_size=64):
    state_embedder = Connect4StateEmbedder()
    action_embedder = Connect4ActionEmbedder()
    prediction_model = PredictionModel(state_embedder, action_embedder)
    optimizer = keras.optimizers.Adam(learning_rate=0.0005)
    
    dataset = tf.data.Dataset.from_tensor_slices((state_batch, action_batch, reward_batch))
    dataset = dataset.shuffle(buffer_size=len(all_trajectory_steps)).batch(batch_size)
    
    for epoch in range(num_epochs):
        total_loss = 0
        for i, (states, actions, rewards) in enumerate(dataset):
            batch = {'state': states, 'action': actions, 'reward': rewards}
            loss, logits, reward_pred = train_step(prediction_model, optimizer, batch)
            total_loss += loss
            
            if (i + 1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(all_trajectory_steps)//batch_size}], Loss: {loss.numpy():.4f}')
        
        avg_loss = total_loss / (len(all_trajectory_steps) // batch_size)
        print(f'Epoch [{epoch+1}/{num_epochs}], Average Loss: {avg_loss:.4f}')
        
        if print_logits:
            for j in range(4):
                state = tf.expand_dims(states[j], 0)
                action_probs = prediction_model.action_probs(state)
                reward_pred = prediction_model.reward_pred(state).numpy()[0][0]
                reward_true = rewards[j].numpy()
                print(j, f'r={reward_true:.4f} p={reward_pred:.4f}', action_probs.numpy())
            print()
    
    return prediction_model

In [52]:
# Train the model
prediction_model = train_model(print_logits=True, num_epochs=10)




Epoch [1/10], Step [100/356], Loss: 2.1972
Epoch [1/10], Step [200/356], Loss: 2.1949
Epoch [1/10], Step [300/356], Loss: 2.1764


2024-10-17 00:46:21.653708: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch [1/10], Average Loss: 2.2127
0 r=0.0000 p=0.5832 [[0.14328542 0.14001538 0.14318919 0.14771304 0.13766432 0.14400549
  0.14412719]]
1 r=0.0000 p=0.2180 [[0.14174928 0.14107995 0.14175102 0.14766948 0.13927938 0.1445238
  0.14394705]]
2 r=0.0000 p=0.2965 [[0.14365692 0.14041342 0.14397071 0.14524876 0.14057101 0.14363462
  0.14250456]]
3 r=0.0000 p=0.4129 [[0.14202298 0.14172916 0.1426362  0.14665827 0.14078775 0.14340623
  0.14275937]]

Epoch [2/10], Step [100/356], Loss: 2.1618
Epoch [2/10], Step [200/356], Loss: 2.1618
Epoch [2/10], Step [300/356], Loss: 2.1151


2024-10-17 00:46:35.411995: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch [2/10], Average Loss: 2.1578
0 r=1.0000 p=0.9765 [[0.14767376 0.12999563 0.1361254  0.1570923  0.13549763 0.14772983
  0.14588548]]
1 r=0.0000 p=0.6613 [[0.13871919 0.13632214 0.13953085 0.1502517  0.14188133 0.14216723
  0.15112758]]
2 r=0.0000 p=0.4926 [[0.14630662 0.14108334 0.14351256 0.14545064 0.13720173 0.1471451
  0.1393    ]]
3 r=1.0000 p=0.8637 [[0.14439763 0.13277633 0.13718966 0.15168189 0.13982089 0.14985637
  0.14427726]]

Epoch [3/10], Step [100/356], Loss: 2.0713
Epoch [3/10], Step [200/356], Loss: 2.1377
Epoch [3/10], Step [300/356], Loss: 2.0935
Epoch [3/10], Average Loss: 2.1179
0 r=1.0000 p=0.7513 [[0.14494765 0.15739755 0.14113164 0.14886838 0.13742116 0.12277129
  0.14746237]]
1 r=0.0000 p=0.4556 [[0.14181429 0.14000657 0.14385055 0.14906852 0.13937187 0.14267984
  0.14320838]]
2 r=1.0000 p=0.6179 [[0.15268938 0.14359729 0.13978714 0.13929805 0.1371026  0.14649598
  0.1410296 ]]
3 r=1.0000 p=0.6753 [[0.14517663 0.14222877 0.14288227 0.14371711 0.14083879 0.1

In [None]:



# Save the model
prediction_model.save_weights('connect4_prediction_model')

# Load the model
loaded_model = PredictionModel(Connect4StateEmbedder(), Connect4ActionEmbedder())
loaded_model.load_weights('connect4_prediction_model')

# Test the loaded model
game = connect4.Connect4Game()
serializer = connect4.Connect4Serializer()

print('Move 0:')
s_0 = game.initial_state()
state_array = serializer.state_to_jax_array(game, s_0)
j_0 = tf.convert_to_tensor(state_array, dtype=tf.float32)
j_0 = tf.expand_dims(j_0, 0)

print(j_0, prediction_model.reward_pred(j_0).numpy()[0][0], prediction_model.action_probs(j_0).numpy())
print(j_0, loaded_model.reward_pred(j_0).numpy()[0][0], loaded_model.action_probs(j_0).numpy())