In [1]:
import torch
import numpy
from ml_modules import *
from sequence_modules import *
from llm_modules import *
from agent import *
from simulation import *

from utils import *

from ml_ops_utils import *

import gc

from hand_strength import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!nvidia-smi

Sun Feb 22 18:12:38 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.119.02             Driver Version: 580.119.02     CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 5090        Off |   00000000:01:00.0  On |                  N/A |
|  0%   25C    P8             24W /  450W |   15175MiB /  32607MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

In [3]:
num_players = 2
small_blind = 1
big_blind = 2
num_streets = 4
starting_stack_sizes = 400
max_seq_len = 1024
embed_dim = 256
device = "cuda"


softmax_prob = torch.nn.Softmax(dim=-1)

action_validator = PokerActionValidator(
    num_players = num_players,
    small_blind = small_blind,
    big_blind = big_blind,
    starting_stack_sizes = starting_stack_sizes
)

street_embedder_p1 = StreetPositionalEncoding(
    num_streets = num_streets,
    embedding_dim = embed_dim,
    max_seq_len = max_seq_len,
    device = device
)

table_position_embedder_p1 = TablePositionalEncoding(
    num_players = num_players,
    embedding_dim = embed_dim,
    max_seq_len = max_seq_len,
    device = device
)

action_embedder_p1 = ActionEncoding(
    #num_actions = 21,
    embedding_dim = embed_dim,
    max_seq_len = max_seq_len,
    device = device
)

pot_size_embedder_p1 = PotSizeSequenceEmbedder(
    max_seq_len = max_seq_len,
    pad_value = -1,
    device = device
)

poker_sequence_embedder_p1 = PokerSequenceEmbedder(
    street_input_dimension = embed_dim,
    table_position_input_dimension = embed_dim,
    action_input_dimension = embed_dim,
    latent_dimensions = [embed_dim, embed_dim * 2, embed_dim * (2**2), embed_dim * (2**3)],
    device = device
)
cards_p1 = Cards(device = device)

self_position_embedder_p1 = SelfPositionEmbedder(number_of_positions = num_players, device = device)


street_embedder_p2 = StreetPositionalEncoding(
    num_streets = num_streets,
    embedding_dim = embed_dim,
    max_seq_len = max_seq_len,
    device = "cuda"
)

table_position_embedder_p2 = TablePositionalEncoding(
    num_players = num_players,
    embedding_dim = embed_dim,
    max_seq_len = max_seq_len,
    device = device
)

action_embedder_p2 = ActionEncoding(
    #num_actions = 21,
    embedding_dim = embed_dim,
    max_seq_len = max_seq_len,
    device = device
)

pot_size_embedder_p2 = PotSizeSequenceEmbedder(
    max_seq_len = max_seq_len,
    pad_value = -1,
    device = device
)

poker_sequence_embedder_p2 = PokerSequenceEmbedder(
    street_input_dimension = embed_dim,
    table_position_input_dimension = embed_dim,
    action_input_dimension = embed_dim,
    latent_dimensions = [embed_dim, embed_dim * 2, embed_dim * (2**2), embed_dim * (2**3)],
    device = device
)

cards_p2 = Cards(device = device)

self_position_embedder_p2 = SelfPositionEmbedder(number_of_positions = num_players, device = device)

In [4]:
model_name = "./models/qwen3-1point7b/"

tokenizer, model = load_model(model_name)

model

`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  5.81it/s]
The module name  (originally ) is not a valid Python identifier. Please rename the original module to avoid import issues.


Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 2048)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=2048, out_features=6144, bias=False)
          (up_proj): Linear(in_features=2048, out_features=6144, bias=False)
          (down_proj): Linear(in_features=6144, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen3RMSNorm((2048,), eps=1e-06)
        (post_attention_layer

In [5]:
policy_model_p1 = PolicyModel(
    num_players = num_players,
    self_position_embedder = self_position_embedder_p1,
    active_players_hidden_dims = [1024, 2048],
    stack_size_hidden_dims = [1024, 2048],
    card_embeddings_hidden_dims = [2048, 2048],
    final_output_hidden_dims = [1024, 512, 256],
    value_output_hidden_dims = [1024, 512, 256],
    dropout_rate = 0,
    device = 'cuda',
)

policy_model_p2 = PolicyModel(
    num_players = num_players,
    self_position_embedder = self_position_embedder_p2,
    active_players_hidden_dims = [1024, 2048],
    stack_size_hidden_dims = [1024, 2048],
    card_embeddings_hidden_dims = [2048, 2048],
    final_output_hidden_dims = [1024, 512, 256],
    value_output_hidden_dims = [1024, 512, 256],
    dropout_rate = 0,
    device = 'cuda',
)

poker_player_p1 = PokerAgent(
    cards_p1,
    street_embedder_p1,
    table_position_embedder_p1,
    action_embedder_p1,
    pot_size_embedder_p1,
    poker_sequence_embedder_p1,
    model,
    policy_model_p1,
    device = 'cuda',
    llm_train = False
)

poker_player_p2 = PokerAgent(
    cards_p2,
    street_embedder_p2,
    table_position_embedder_p2,
    action_embedder_p2,
    pot_size_embedder_p2,
    poker_sequence_embedder_p2,
    model,
    policy_model_p2,
    device = 'cuda',
    llm_train = False
)

p1_optim = torch.optim.AdamW(poker_player_p1.parameters(), lr = 1e-4)
p2_optim = torch.optim.AdamW(poker_player_p2.parameters(), lr = 1e-4)

In [6]:
batch_size = 1024

deck_order_shuffled = torch.argsort(torch.rand(1,52))


street_idxs = (torch.zeros((batch_size, 1024)) + 6).long()
street_idxs[:, :2] = 0 # posting small blind and big blind

table_position_idxs = (torch.zeros((batch_size, 1024)) + 2).long()
table_position_idxs[:, 0] = 0 # sb/b
table_position_idxs[:, 1] = 1 # bb

action_idxs = (torch.zeros((batch_size, 1024)) + 21).long()
action_idxs[:, 0] = 0 # post sb
action_idxs[:, 1] = 1 # post bb

pot_size_sequence = (torch.zeros((batch_size, 1024)) - 1)
pot_size_sequence[:, 0] = 1
pot_size_sequence[:, 1] = 3


active_players = torch.Tensor([
    [1, 1]
]).tile(batch_size, 1)
print(active_players.shape)
stack_size = torch.Tensor(
    [
        [399, 398],
    ]
).tile(batch_size, 1)
stack_size.shape

sb_cards = torch.zeros((batch_size,2,7)).to(dtype=torch.long).to('cuda')
bb_cards = torch.zeros((batch_size,2,7)).to(dtype=torch.long).to('cuda')

sb_cards[:, 0, :2] = deck_order_shuffled[0, :2]%13
sb_cards[:, 1, :2] = deck_order_shuffled[0, :2]//13

sb_cards[:, 0, 2:] = 13
sb_cards[:, 1, 2:] = 4


bb_cards[:, 0, :2] = deck_order_shuffled[0, 2:4]%13
bb_cards[:, 1, :2] = deck_order_shuffled[0, 2:4]//13

bb_cards[:, 0, 2:] = 13
bb_cards[:, 1, 2:] = 4

p1_position_player = torch.Tensor([0]).to('cuda').tile((batch_size))
p2_position_player = torch.Tensor([1]).to('cuda').tile((batch_size))

table = {
    0 : [poker_player_p1, sb_cards, p1_position_player],
    1 : [poker_player_p2, bb_cards, p2_position_player]
}

(
    sim_street_idxs, 
    sim_table_position_idxs, 
    sim_action_idxs, 
    sim_pot_size_sequence, 
    sim_active_players, 
    sim_stack_size,
    sim_table
) = simulate_hand(
    num_players,
    street_idxs,
    table_position_idxs,
    action_idxs,
    pot_size_sequence,
    active_players,
    stack_size,
    table,
    action_validator,
    deck_order_shuffled,
)

torch.Size([1024, 2])
after action 2 


2
1
tensor([0, 1])
tensor([1., 3.])
tensor([0, 1])
tensor([0, 0])
tensor([399., 398.])
tensor([1., 1.])

 

0 is the next to act
4
tensor(399.)
after action 3 


3
2
tensor([0, 1, 9])
tensor([ 1.,  3., 19.])
tensor([0, 1, 0])
tensor([0, 0, 0])
tensor([383., 398.])
tensor([1., 1.])

 

1 is the next to act
tensor(32.)
tensor(398.)
after action 4 


4
3
tensor([ 0,  1,  9, 16])
tensor([  1.,   3.,  19., 417.])
tensor([0, 1, 0, 1])
tensor([0, 0, 0, 0])
tensor([383.,   0.])
tensor([1., 1.])

 

0 is the next to act
after action 5 


5
4
tensor([ 0,  1,  9, 16,  2])
tensor([  1.,   3.,  19., 417., 417.])
tensor([0, 1, 0, 1, 0])
tensor([0, 0, 0, 0, 0])
tensor([383.,   0.])
tensor([0., 1.])

 

-1 is the next to act


In [7]:
rewards = determine_winner(
    sim_active_players[-1],
    sim_pot_size_sequence[-1],
    sim_stack_size[-1],
    sim_table
)

for i in range(num_players):

    sim_table[i].append(rewards[i])

p0_action_batch_indices = torch.where(
    action_validator.get_next_to_act(
        sim_street_idxs,
        sim_table_position_idxs,
        sim_action_idxs,
        sim_active_players
    ) == 0
)[0]

p1_action_batch_indices = torch.where(
    action_validator.get_next_to_act(
        sim_street_idxs,
        sim_table_position_idxs,
        sim_action_idxs,
        sim_active_players
    ) == 1
)[0]

In [8]:
mask_p0 = (sim_action_idxs[p0_action_batch_indices + 1] == 21)
mask_p0 = mask_p0.float().argmax(dim = 1)

mask_p1 = (sim_action_idxs[p1_action_batch_indices + 1] == 21)
mask_p1 = mask_p1.float().argmax(dim = 1)

In [10]:
position = sim_table[0][2][p0_action_batch_indices]
cards = sim_table[0][1][p0_action_batch_indices]
street_idxs = sim_street_idxs[p0_action_batch_indices]
table_position_idxs = sim_table_position_idxs[p0_action_batch_indices]
action_idxs = sim_action_idxs[p0_action_batch_indices]
pot_size_sequence = sim_pot_size_sequence[p0_action_batch_indices]
active_players = sim_active_players[p0_action_batch_indices]
stack_size = sim_stack_size[p0_action_batch_indices]

In [12]:
outputs = sim_table[0][0](
                position,
                cards,
                street_idxs,
                table_position_idxs,
                action_idxs,
                pot_size_sequence,
                active_players.to('cuda'),
                stack_size.to('cuda')
)

In [17]:
neg_log_probs = outputs['probits'][torch.arange(outputs['probits'].shape[0]), sim_action_idxs[p0_action_batch_indices+1, mask_p0 - 1]]

In [19]:
neg_log_probs

tensor([ 0.8303, -0.8246], device='cuda:0', grad_fn=<IndexBackward0>)

In [29]:
outputs.keys()

dict_keys(['street_idxs', 'street_embedding', 'table_position_idxs', 'table_position_embedding', 'action_idxs', 'action_embedding', 'pot_size_sequence', 'active_players', 'stack_size', 'card_embeddings', 'attention_mask', 'llm_state', 'self_position', 'probits', 'value_pred'])

In [21]:
sim_action_idxs[p0_action_batch_indices+1, mask_p0 - 1]

tensor([9, 2])

In [22]:
model_rewards = neg_log_probs * sim_table[0][-1]

In [24]:
model_rewards.mean()

tensor(-0.0484, device='cuda:0', grad_fn=<MeanBackward0>)

In [27]:
model_rewards.mean().backward()

In [28]:
p1_optim.step()