In [1]:
import numpy as np
import pandas as pd


import json
import os

import subprocess

from matplotlib import pyplot as plt

In [2]:
%matplotlib inline
%config Completer.use_jedi = False

In [3]:
import torch
from torch import nn
from torch import optim

from tensorboardX import SummaryWriter

In [4]:
from importlib import reload
from utils import dataset

reload(dataset)

<module 'utils.dataset' from '/Users/sergmiller/Documents/my/lux-ai-v1/research/utils/dataset.py'>

In [33]:
def learn(train, val, model_ff, criterion, iter_data, epochs=5, batch_size=64, shuffle=True, freq=10,lr=1e-3, l2=1e-5, optimizer=None, writer=None): 
    assert iter_data is not None
    if writer is None:
        writer = SummaryWriter()
    
#     np.random.seed(1)
    ids_nn = np.arange(train.targets.shape[0])
    
    reshape_to_last = lambda x: torch.reshape(x, [np.prod(x.shape[:-1]), x.shape[-1]])

    if optimizer is None:
        optimizer = optim.Adam(model_ff.parameters(), lr=lr, weight_decay=l2)

    time_for_print_loss = lambda i: (i + 1) % freq == 0
    
    n_iter = 0


    for epoch in np.arange(epochs):
        np.random.shuffle(ids_nn)

        model_ff.train(True)

        for b in np.arange(0, train.targets.shape[0], batch_size):
            X_batch = torch.FloatTensor(train.features[ids_nn[b:b+batch_size]])
            y_batch = torch.FloatTensor(train.weights[ids_nn[b:b+batch_size]])  # reward(advantage)
            a_batch = torch.LongTensor(train.targets[ids_nn[b:b+batch_size]])  # action

            optimizer.zero_grad()
            y_pred_logits = model_ff(X_batch)

            loss = criterion(y_pred_logits, y_batch, a_batch, X_batch, model_ff, (writer, iter_data["n_iter"]))
            loss.backward()

            optimizer.step()

            if (b // batch_size + 1) % freq == 0:
                print('train loss in %d epoch in %d batch: %.5f' %
                  (epoch + 1, b // batch_size + 1, loss.item()))

                writer.add_scalar('data/train_loss', loss.item(), iter_data["n_iter"])
                writer.add_scalar('data/epoch', epoch + 1, iter_data["n_iter"])
                writer.add_scalar('data/batch', b // batch_size + 1, iter_data["n_iter"])

                val_loss = 0
                its = 0
                model_ff.train(False)
                for b in np.arange(0, val.targets.shape[0], batch_size):
                    its += 1
                    X_batch = torch.FloatTensor(val.features[b:b+batch_size])
#                     X_batch = reshape_to_last(X_batch)

                    y_batch = torch.FloatTensor(val.weights[b:b+batch_size])
                    a_batch = torch.LongTensor(val.targets[b:b+batch_size])
                    with torch.no_grad():
                        y_pred_logits = model_ff(X_batch)
                    loss = criterion(y_pred_logits, y_batch, a_batch, X_batch, model_ff, None)
                    val_loss += loss.item()
                val_loss /= its
                print('val loss in %d epoch: %.5f' % (epoch + 1, val_loss))

                writer.add_scalar('data/val_loss', val_loss, iter_data["n_iter"])
                n_iter += 1
                iter_data["n_iter"] += 1

In [6]:
# datasets = dataset.read_datasets_from_dir("features_v3/")

In [7]:
# dataset.read_columns_from_random_file("features_v3")

In [8]:
# dataset.CAT_FEATURES

In [9]:
FLOAT_FEATURES = [i for i in range(42 + 32*32*7) if i not in dataset.CAT_FEATURES_V4]

In [10]:
FLOAT_FEATURES[:20], FLOAT_FEATURES[-10:]

([0, 1, 5, 6, 7, 9, 10, 11, 12, 13, 15, 17, 18, 19, 20, 21, 23, 26, 27, 28],
 [7200, 7201, 7202, 7203, 7204, 7205, 7206, 7207, 7208, 7209])

In [11]:
import pickle
from sklearn.preprocessing import OneHotEncoder

In [12]:
with open("../submissions/simple/models/ohe_v2", "rb") as f:
    OHE = pickle.load(f)

In [13]:
def prepare_features(t: dataset.Dataset, v: dataset.Dataset, ohe=None, categories=None) -> (dataset.Dataset, dataset.Dataset):
    create_ohe = ohe is None
    if create_ohe:
         ohe = OneHotEncoder(sparse=False, categories=categories)
    def prepare(d, is_train):
        cf = d.features[:, dataset.CAT_FEATURES_V4]
        ff = d.features[:, FLOAT_FEATURES]
        cf[cf == "False"] = False
        cf[cf == "True"] = True
        cf[cf == None] = "None"
        cf[cf == "1"] = 1
        cf[cf == "2"] = 2
        cf[cf == "3"] = 3
        ff[ff == "None"] = 0
        cf_o = ohe.fit_transform(cf) if is_train and create_ohe else ohe.transform(cf)
        return dataset.Dataset(
            features=np.array(np.concatenate([cf_o, ff], axis=1), dtype=np.float),
            targets=np.array(d.targets, dtype=np.float),
            weights=np.array(d.weights, dtype=np.float),
            next_state_id = d.next_state_id
        )
    t = prepare(t, True)
    v = prepare(v, False)
    return (t,v, ohe)

In [14]:
# dt,dv,OHE = prepare_features(data, data, None, [
#      np.array(["None", False, True], dtype=object),
#      np.array([1, 2, 3], dtype=object),
#      np.array(['None', 'bcity', 'e', 'n', 'p', 's', 'w'], dtype=object),
#      np.array(['None', 'c', 'e', 'n', 's', 'w'], dtype=object),
#      np.array(["None", False, True], dtype=object),
#      np.array(['None', 'e', 'n', 's', 'w'], dtype=object),
#      np.array(["None", False, True], dtype=object),
#      np.array(['None', 'c', 'e', 'n', 's', 'w'], dtype=object),
#      np.array(['None', 'coal', 'uranium', 'wood'], dtype=object),
#      np.array(["None", False, True], dtype=object),
#      np.array(["None", False, True], dtype=object),
#      np.array(["None", False, True], dtype=object),
#      np.array(["None", False, True], dtype=object),
#      np.array(["None", False, True], dtype=object)])

In [15]:
OHE.categories_

[array(['None', False, True], dtype=object),
 array([1, 2, 3], dtype=object),
 array(['None', 'bcity', 'e', 'n', 'p', 's', 'w'], dtype=object),
 array(['None', 'c', 'e', 'n', 's', 'w'], dtype=object),
 array(['None', False, True], dtype=object),
 array(['None', 'e', 'n', 's', 'w'], dtype=object),
 array(['None', False, True], dtype=object),
 array(['None', 'c', 'e', 'n', 's', 'w'], dtype=object),
 array(['None', 'coal', 'uranium', 'wood'], dtype=object),
 array(['None', False, True], dtype=object),
 array(['None', False, True], dtype=object),
 array(['None', False, True], dtype=object),
 array(['None', False, True], dtype=object),
 array(['None', False, True], dtype=object)]

In [16]:
# with open("../submissions/simple/models/ohe_v2", "wb") as f:
#     pickle.dump(OHE, f)

In [17]:
# with open("../submissions/simple/models/ohe_v1", "wb") as f:
#     pickle.dump(ohe, f)

In [18]:
from torch.nn import functional as F

In [26]:
MAP_F = 32 * 32 * 7

class NNWithCustomFeatures(nn.Module):
    def __init__(self, INPUT_F, DROP_P, H, A=6):
        super().__init__()
        INPUT_F_C = INPUT_F + 128
        self.model_q =  nn.Sequential(
            nn.Dropout(DROP_P),
            nn.Linear(INPUT_F_C, H),
            nn.LayerNorm(H),
            nn.ReLU(),
            nn.Dropout(DROP_P),
            nn.Linear(H, H),
            nn.ReLU(),
            nn.Dropout(DROP_P),
            nn.Linear(H, H),
            nn.ReLU(),
            nn.Linear(H, A)
        )
        
        self.model_p =  nn.Sequential(
            nn.Dropout(DROP_P),
            nn.Linear(INPUT_F_C, H),
            nn.LayerNorm(H),
            nn.ReLU(),
            nn.Dropout(DROP_P),
            nn.Linear(H, H),
            nn.ReLU(),
            nn.Dropout(DROP_P),
            nn.Linear(H, H),
            nn.ReLU(),
            nn.Linear(H, A)
        )

        self.map_model = nn.Sequential(
            nn.Conv2d(7, 64, 3),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),  # after -> (16,16)
            nn.Conv2d(64, 128, 3),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),  # after -> (8, 8)
            nn.Conv2d(128, 256, 3),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),  # after -> (4, 4)
        )
        self.avgpool = nn.AdaptiveAvgPool2d((4, 4))
        self.proj = nn.Sequential(
            nn.Dropout(p=DROP_P),
            nn.Linear(256 * 4 * 4, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(p=DROP_P),
            nn.Linear(256, 128)
        )
        
    #quriosity block
            
        self.model_encoder = nn.Sequential(
            nn.Dropout(DROP_P),
            nn.Linear(INPUT_F_C, H),
            nn.LayerNorm(H),
            nn.ReLU(),
            nn.Dropout(DROP_P),
            nn.Linear(H, H),
            nn.ReLU(),
            nn.Dropout(DROP_P),
            nn.Linear(H, H),
            nn.ReLU(),
            nn.Linear(H, H)
        )
                
        self.forward_model =  nn.Sequential(
            nn.Dropout(DROP_P),
            nn.Linear(H + A, H),
            nn.LayerNorm(H),
            nn.ReLU(),
            nn.Dropout(DROP_P),
            nn.Linear(H, H),
            nn.ReLU(),
            nn.Dropout(DROP_P),
            nn.Linear(H, H)
        )
        
        self.inverse_model =  nn.Sequential(
            nn.Dropout(DROP_P),
            nn.Linear(H * 2, H),
            nn.LayerNorm(H),
            nn.ReLU(),
            nn.Dropout(DROP_P),
            nn.Linear(H, H),
            nn.ReLU(),
            nn.Dropout(DROP_P),
            nn.Linear(H, A)
        )
        
    def forward(self, x):
        L = x.shape[1]
        cur_r = self.forward_impl(x[:, :L // 2])
        next_r =  self.forward_impl(x[:, L // 2:])
        return torch.cat([cur_r, next_r],dim=1)

    def forward_impl(self, x):
        mapp = x[:, -MAP_F:].reshape(-1, 32, 32, 7)
        rest = x[:, :-MAP_F]
        mapp = torch.transpose(mapp, 1, -1)
        mapp = self.avgpool(self.map_model(mapp))
        mapp = torch.flatten(mapp, 1)
        mapp_f = self.proj(mapp)
        input_x = torch.cat([rest, mapp_f], dim=1)
        return torch.cat([self.model_q(input_x), self.model_p(input_x)], dim=1)

    def phi(self, x):
        mapp = x[:, -MAP_F:].reshape(-1, 32, 32, 7)
        rest = x[:, :-MAP_F]
        mapp = torch.transpose(mapp, 1, -1)
        mapp = self.avgpool(self.map_model(mapp))
        mapp = torch.flatten(mapp, 1)
        mapp_f = self.proj(mapp)
        input_x = torch.cat([rest, mapp_f], dim=1)
        return self.model_encoder(input_x)

    def calc_curiosity_reward_and_restore_action(self, x, a_batch):
        L = x.shape[1]
        cur_s = x[:, :L // 2]
        next_s = x[:, L // 2:]
        phi_cur = self.phi(cur_s)
        phi_next = self.phi(next_s)
        forward_input = torch.cat([phi_cur, F.one_hot(a_batch, 6)], dim=1)
        phi_hat_next = self.forward_model(forward_input)
        reward = torch.mean((phi_hat_next - phi_next) ** 2)
        phi_cur_and_next = torch.cat([phi_cur, phi_next], dim=1)
        inv_action_pred = self.inverse_model(phi_cur_and_next)
        inv_loss = torch.mean(torch.nn.CrossEntropyLoss(reduction='none')(inv_action_pred, a_batch))
        return reward, inv_loss

In [27]:
torch.version

<module 'torch.version' from '/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/torch/version.py'>

In [28]:
model = NNWithCustomFeatures(63, 0.05, 64)

In [29]:
model.forward_impl(torch.Tensor(4, 63 + 32*32*7))

tensor([[-0.0459, -0.2247,  0.0816, -0.0187, -0.2221,  0.0039,  0.0616, -0.1225,
          0.1290,  0.0431,  0.0778,  0.0308],
        [-0.0899, -0.1996,  0.0961, -0.0575, -0.2508,  0.0254,  0.1081, -0.0994,
          0.1516,  0.0927,  0.0568,  0.0088],
        [-0.1035, -0.1845,  0.1133, -0.0828, -0.2799, -0.0075,  0.0454, -0.1296,
          0.1411,  0.0249,  0.1404,  0.0361],
        [-0.1028, -0.1537,  0.1107, -0.0798, -0.2707, -0.0121,  0.0728, -0.1136,
          0.1850,  0.0318,  0.0977,  0.0129]], grad_fn=<CatBackward0>)

In [30]:
model.calc_curiosity_reward_and_restore_action(torch.Tensor(4, 2 * (63 + 32*32*7)), torch.LongTensor([1,2,3,4]))

(tensor(0.0470, grad_fn=<MeanBackward0>),
 tensor(1.9528, grad_fn=<MeanBackward0>))

In [34]:
ENTROPY_REG = 1.0
PI_REG = 1.0
CURIOSITY_WEIGTH = 1
CURIOSITY_BETA = 0.1
def policy_loss(pi_logits, reward_batch, a_batch, X_batch, _):
    pi_probs = torch.nn.Softmax(dim=1)(pi_logits)
    return torch.mean(torch.nn.CrossEntropyLoss(reduction='none')(pi_logits, a_batch) * reward_batch 
                      - torch.sum(pi_probs * torch.log(pi_probs) * ENTROPY_REG, dim=1))

def q_loss(q_vals, reward_batch, a_batch, X_batch, _):
    q_vals_per_reward = q_vals[np.arange(q_vals.shape[0]), a_batch]
    return torch.nn.MSELoss()(q_vals_per_reward, reward_batch) * 0.01

gamma = 0.99

def get_is_last_state(x):
    t = torch.sum(torch.isclose(x, torch.ones_like(x) * (-1)), dim=1) == x.shape[1]
    return t.float()

def q_loss_pair(q_vals_cur_and_next, reward_batch, a_batch, X_batch, _):
    q_vals = q_vals_cur_and_next[:, :6]
    q_vals_next = q_vals_cur_and_next[:, 6:12]
    q_vals_per_reward_cur = q_vals[np.arange(q_vals.shape[0]), a_batch]
    X_batch_next = X_batch[:, X_batch.shape[1] // 2:]
    best_q_vals_next = torch.max(q_vals_next,dim=1)[0] * (1 - get_is_last_state(X_batch_next))
#     print(list(enumerate([q_vals_per_reward_cur, reward_batch, best_q_vals_next, q_vals_next])))
#     print(0.99 * best_q_vals_next)
#     return torch.nn.MSELoss()(target=q_vals_per_reward_cur.detach(), input=reward_batch + gamma * best_q_vals_next)
    return torch.nn.SmoothL1Loss()(target=q_vals_per_reward_cur.detach(), input=reward_batch + gamma * best_q_vals_next)


def actor_critic_loss(q_pi_payload, _reward_batch, a_batch, X_batch, model_ff, writer_and_iter):
#     curiosity_reward_batch = torch.zeros_like(_reward_batch)
    curiosity_reward_batch, inv_loss = model_ff.calc_curiosity_reward_and_restore_action(X_batch, a_batch)
    reward_batch = _reward_batch + curiosity_reward_batch
    q_vals = q_pi_payload[:, :6]
    pi_logits =  q_pi_payload[:, 6:12]
    pi_probs = torch.nn.Softmax(dim=1)(pi_logits)
    q_vals_next = q_pi_payload[:, 12:18]
    q_vals_per_reward_cur = q_vals[np.arange(q_vals.shape[0]), a_batch]
    X_batch_next = X_batch[:, X_batch.shape[1] // 2:]
    best_q_vals_next = torch.max(q_vals_next,dim=1)[0] * (1 - get_is_last_state(X_batch_next))
    with torch.no_grad():
        reference_q_val = reward_batch + gamma * best_q_vals_next
        advantage = reference_q_val - q_vals_per_reward_cur
    q_loss = torch.nn.SmoothL1Loss()(target=reference_q_val.detach(), input=q_vals_per_reward_cur)
    pi_loss =  torch.mean(torch.nn.CrossEntropyLoss(reduction='none')(pi_logits, a_batch) * advantage.detach()) * PI_REG
    entropy = torch.mean(torch.sum(pi_probs * torch.log(pi_probs), dim=1)) * ENTROPY_REG
    if writer_and_iter is not None:
        writer, n_iter = writer_and_iter
        writer.add_scalar('data/batch_reward', torch.mean(_reward_batch).item(), n_iter)
        writer.add_scalar('data/curiosity_reward', torch.mean(curiosity_reward_batch).item(), n_iter)
        writer.add_scalar('data/train_q_loss', q_loss.item(), n_iter)
        writer.add_scalar('data/train_pi_loss', pi_loss.item(), n_iter)
        writer.add_scalar('data/train_entropy_loss', entropy.item(), n_iter)
    return q_loss + pi_loss + entropy + CURIOSITY_WEIGTH * (CURIOSITY_BETA * inv_loss + (1 - CURIOSITY_BETA) * curiosity_reward_batch)

In [37]:
simple_bot = "../submissions/simple/main.py"
replays = "replays"

def run_game(left_bot=simple_bot, right_bot=simple_bot, seed=42, loglevel=2):
    replay_path = "replay.json"
    python_v = "python3.7"
    
    replay_path = os.path.join(replays, str(np.random.randint(1e9)) + ".json")
    
    size = np.random.choice([12,16,24,32], size=1)[0]
    
    res = subprocess.run([
        "lux-ai-2021",
        left_bot,
        right_bot,
#         "--statefulReplay",
        "--width={}".format(size),
        "--height={}".format(size),
        "--loglevel={}".format(loglevel),
        "--python={}".format(python_v),
        "--maxtime=100000",
        "--maxConcurrentMatches=4",
        "--seed={}".format(seed),
        "--out={}".format(replay_path)], stdout=subprocess.PIPE)
    
    if loglevel > 0:
        print(res.stdout.decode())

    assert res.returncode == 0

    with open(replay_path, "r") as f:
        result = json.load(f)
    return result, res.stdout.decode()

In [432]:
run_game(simple_bot, simple_bot)  # <-- test run one game with default bot

In [38]:
import hashlib

def build_runnable_bot_with_flags(flags: dict, origin = simple_bot, base_path = '../submissions/simple/') -> str:
    lines = []
    with open(origin, "r") as f:
        for line in f:
            lines.append(line[:-1])
    text = '\n'.join(lines)
    f = json.dumps(flags)
    text = text.format(f)
    h = int(hashlib.sha256(f.encode('utf-8')).hexdigest(), 16) % (10 ** 18)
    path = base_path + "main_" + str(h) + ".py"
    with open(path, "w") as f:
        f.write(text)
    return path

In [39]:
def count_series(results: list):
    wins = []
    for i, r in enumerate(results):
        ranks = r[0]['results']['ranks']
        teams = r[0]['teamDetails']
        if ranks[0]['rank'] == 1 and ranks[1]['rank'] == 2:
            if ranks[0]["agentID"] == i % 2:
                wins.append(1)
            else:
                wins.append(0)
        else:
            wins.append(0.5)
    return wins

In [40]:
from joblib import Parallel, delayed
import tqdm

In [41]:
def sample_dataset(d, p=0.5):
    N = len(d.features)
    ids = np.random.choice(N, size=int(N * p))
    return dataset.Dataset(features = d.features[ids], weights = d.weights[ids], targets = d.targets[ids])

In [42]:
def add_next_features(d):
    assert d.next_state_id is not None
    coupled_features = []
    weights = []
    targets = []
    for i in np.arange(d.features.shape[0]):
        next_i = d.next_state_id[i]
        if d.next_state_id[i] != -1:
            next_f = d.features[next_i]
        else:
            next_f = np.ones_like(d.features[i]) * (-1)
        coupled_features.append(np.concatenate([d.features[i], next_f]))
        weights.append(d.weights[i])
        targets.append(d.targets[i])
    return dataset.Dataset(
        features=np.array(coupled_features),
        weights=np.array(weights),
        targets=np.array(targets))

In [None]:
t = 0  #  1778 - value_iter
B = 8

model = NNWithCustomFeatures(83, 0.05, 64)

writer = SummaryWriter()

#optimizer = optim.Adam(model.parameters(), lr=1e-3)

iter_data = {"n_iter": 0}

while True:
    t += 1
    np.random.seed(t)
    torch.save(model.state_dict(), '../submissions/simple/models/ac_last')
    trainD = []
    valD = []
    for i in np.arange(B):
        seed = t * B + i
#         _f = str(seed) + ".txt"
        _f = "log_{}.txt".format(i)
        bot = build_runnable_bot_with_flags({
            "model_path": "models/ac_last",
            "use_policy": True,
            "is_neural": True,
            "prob_use_default_agent": 0.0,
            "prob_use_random": 0.05,
            "ohe_path": "models/ohe_v2",
            "use_old_units_cargo_rules": False,
            "log_features_path": "../../research/features_iter/", "log_path_file_name": _f
        })
        if i % 2 == 0:
            _r = run_game(bot, simple_bot, loglevel=0, seed=seed)
        else:
            _r = run_game(simple_bot, bot, loglevel=0, seed=seed)
        wins = np.mean(count_series([_r]))
        if i % 2 == 1:
            wins = 1 - wins
        game_set = dataset.get_dataset_from_file(os.path.join("features_iter/", _f), wins)
        reward = np.sum(game_set.weights) / (np.sum(game_set.weights != 0) + 1e-9)
        trainD_ohe, valD_ohe, _ = prepare_features(game_set, game_set, OHE)
        max_step = np.max(game_set.features[:, 31])
        trainD_ohe_with_next = add_next_features(trainD_ohe)
        trainD_ohe_with_next_sampled = sample_dataset(trainD_ohe_with_next, 0.1)
        print("Round {}, game: {}, is_win: {}, max_step: {}, reward: {}, example: {}".format(t, i, wins, max_step, reward, _r[0]['results']))
        writer.add_scalar('data/reward', reward, seed)
        writer.add_scalar('data/winrate', wins, seed)
        writer.add_scalar('data/max_step', max_step, seed)
        trainD.append(trainD_ohe_with_next_sampled)
        valD.append(trainD_ohe_with_next_sampled)
    trainD = dataset.concat_datasets(trainD)
    valD = dataset.concat_datasets(valD)
    try:
        learn(trainD, valD, model, actor_critic_loss, iter_data=iter_data,
              batch_size=64, epochs=8, freq=1, writer=writer, lr=1e-4)
    except Exception as e:
        print(e)

Round 1, game: 0, is_win: 0.0, max_step: 69, reward: -0.9999999995, example: {'ranks': [{'rank': 1, 'agentID': 1}, {'rank': 2, 'agentID': 0}], 'replayFile': 'replays/717354021.json'}
Round 1, game: 1, is_win: 0.0, max_step: 115, reward: -0.9999999989999999, example: {'ranks': [{'rank': 1, 'agentID': 0}, {'rank': 2, 'agentID': 1}], 'replayFile': 'replays/224766667.json'}
Round 1, game: 2, is_win: 0.0, max_step: 158, reward: -0.9999999996666666, example: {'ranks': [{'rank': 1, 'agentID': 1}, {'rank': 2, 'agentID': 0}], 'replayFile': 'replays/592322119.json'}
Round 1, game: 3, is_win: 0.0, max_step: 110, reward: -0.99999999975, example: {'ranks': [{'rank': 1, 'agentID': 0}, {'rank': 2, 'agentID': 1}], 'replayFile': 'replays/732152370.json'}
Round 1, game: 4, is_win: 0.0, max_step: 109, reward: -0.9999999989999999, example: {'ranks': [{'rank': 1, 'agentID': 1}, {'rank': 2, 'agentID': 0}], 'replayFile': 'replays/542837249.json'}
Round 1, game: 5, is_win: 0.0, max_step: 118, reward: -0.99999

  if (await self.run_code(code, result,  async_=asy)):


Round 3, game: 0, is_win: 0.0, max_step: 234, reward: -0.9999999996666666, example: {'ranks': [{'rank': 1, 'agentID': 1}, {'rank': 2, 'agentID': 0}], 'replayFile': 'replays/218175338.json'}
Round 3, game: 1, is_win: 0.0, max_step: 70, reward: -0.9999999989999999, example: {'ranks': [{'rank': 1, 'agentID': 0}, {'rank': 2, 'agentID': 1}], 'replayFile': 'replays/446268890.json'}
Round 3, game: 2, is_win: 0.0, max_step: 239, reward: -0.9999999995, example: {'ranks': [{'rank': 1, 'agentID': 1}, {'rank': 2, 'agentID': 0}], 'replayFile': 'replays/39738949.json'}
Round 3, game: 3, is_win: 0.0, max_step: 30, reward: -0.9999999995, example: {'ranks': [{'rank': 1, 'agentID': 0}, {'rank': 2, 'agentID': 1}], 'replayFile': 'replays/800511694.json'}
Round 3, game: 4, is_win: 0.0, max_step: 69, reward: -0.9999999989999999, example: {'ranks': [{'rank': 1, 'agentID': 1}, {'rank': 2, 'agentID': 0}], 'replayFile': 'replays/740382262.json'}


  if (await self.run_code(code, result,  async_=asy)):


Round 3, game: 5, is_win: 0.0, max_step: 311, reward: -0.9999999998, example: {'ranks': [{'rank': 1, 'agentID': 0}, {'rank': 2, 'agentID': 1}], 'replayFile': 'replays/159508645.json'}
Round 3, game: 6, is_win: 0.0, max_step: 110, reward: -0.9999999995, example: {'ranks': [{'rank': 1, 'agentID': 1}, {'rank': 2, 'agentID': 0}], 'replayFile': 'replays/864914992.json'}
Round 3, game: 7, is_win: 0.0, max_step: 229, reward: -0.9999999998, example: {'ranks': [{'rank': 1, 'agentID': 0}, {'rank': 2, 'agentID': 1}], 'replayFile': 'replays/890484912.json'}
train loss in 1 epoch in 1 batch: -1.47844
val loss in 1 epoch: -1.41544
train loss in 1 epoch in 2 batch: -1.33407
val loss in 1 epoch: -1.43045
train loss in 2 epoch in 1 batch: -1.42573
val loss in 2 epoch: -1.43713
train loss in 2 epoch in 2 batch: -1.51668
val loss in 2 epoch: -1.44076
train loss in 3 epoch in 1 batch: -1.45928
val loss in 3 epoch: -1.44504
train loss in 3 epoch in 2 batch: -1.51803
val loss in 3 epoch: -1.44872
train loss

  if (await self.run_code(code, result,  async_=asy)):


Round 6, game: 6, is_win: 1.0, max_step: 154, reward: 0.9999999998333333, example: {'ranks': [{'rank': 1, 'agentID': 0}, {'rank': 2, 'agentID': 1}], 'replayFile': 'replays/826826325.json'}
Round 6, game: 7, is_win: 1.0, max_step: 119, reward: 0.9999999999, example: {'ranks': [{'rank': 1, 'agentID': 1}, {'rank': 2, 'agentID': 0}], 'replayFile': 'replays/15608344.json'}
train loss in 1 epoch in 1 batch: -1.41086
val loss in 1 epoch: -1.52527
train loss in 1 epoch in 2 batch: -1.53956
val loss in 1 epoch: -1.54257
train loss in 2 epoch in 1 batch: -1.45815
val loss in 2 epoch: -1.55034
train loss in 2 epoch in 2 batch: -1.56252
val loss in 2 epoch: -1.56179
train loss in 3 epoch in 1 batch: -1.48381
val loss in 3 epoch: -1.56437
train loss in 3 epoch in 2 batch: -1.57120
val loss in 3 epoch: -1.56550
train loss in 4 epoch in 1 batch: -1.49603
val loss in 4 epoch: -1.56791
train loss in 4 epoch in 2 batch: -1.56815
val loss in 4 epoch: -1.57433
train loss in 5 epoch in 1 batch: -1.52079
va

  if (await self.run_code(code, result,  async_=asy)):


Round 8, game: 3, is_win: 0.0, max_step: 193, reward: -0.9999999996666666, example: {'ranks': [{'rank': 1, 'agentID': 0}, {'rank': 2, 'agentID': 1}], 'replayFile': 'replays/441574153.json'}
Round 8, game: 4, is_win: 0.0, max_step: 70, reward: -0.9999999995, example: {'ranks': [{'rank': 1, 'agentID': 1}, {'rank': 2, 'agentID': 0}], 'replayFile': 'replays/627688022.json'}
Round 8, game: 5, is_win: 0.0, max_step: 114, reward: -0.9999999995, example: {'ranks': [{'rank': 1, 'agentID': 0}, {'rank': 2, 'agentID': 1}], 'replayFile': 'replays/60817287.json'}
Round 8, game: 6, is_win: 0.0, max_step: 115, reward: -0.9999999989999999, example: {'ranks': [{'rank': 1, 'agentID': 1}, {'rank': 2, 'agentID': 0}], 'replayFile': 'replays/592756178.json'}
Round 8, game: 7, is_win: 0.0, max_step: 29, reward: -0.9999999995, example: {'ranks': [{'rank': 1, 'agentID': 0}, {'rank': 2, 'agentID': 1}], 'replayFile': 'replays/300371104.json'}
train loss in 1 epoch in 1 batch: -1.54891
val loss in 1 epoch: -1.5942

  if (await self.run_code(code, result,  async_=asy)):


Round 9, game: 3, is_win: 0.0, max_step: 277, reward: -0.9999999999230769, example: {'ranks': [{'rank': 1, 'agentID': 0}, {'rank': 2, 'agentID': 1}], 'replayFile': 'replays/402714107.json'}
Round 9, game: 4, is_win: 0.0, max_step: 118, reward: -0.9999999989999999, example: {'ranks': [{'rank': 1, 'agentID': 1}, {'rank': 2, 'agentID': 0}], 'replayFile': 'replays/288759037.json'}
Round 9, game: 5, is_win: 0.0, max_step: 29, reward: -0.9999999995, example: {'ranks': [{'rank': 1, 'agentID': 0}, {'rank': 2, 'agentID': 1}], 'replayFile': 'replays/34982697.json'}
Round 9, game: 6, is_win: 0.0, max_step: 70, reward: -0.9999999995, example: {'ranks': [{'rank': 1, 'agentID': 1}, {'rank': 2, 'agentID': 0}], 'replayFile': 'replays/320655317.json'}
Round 9, game: 7, is_win: 0.0, max_step: 30, reward: -0.9999999989999999, example: {'ranks': [{'rank': 1, 'agentID': 0}, {'rank': 2, 'agentID': 1}], 'replayFile': 'replays/243847411.json'}
train loss in 1 epoch in 1 batch: -1.55535
val loss in 1 epoch: -1

  if (await self.run_code(code, result,  async_=asy)):


Round 10, game: 6, is_win: 0.0, max_step: 230, reward: -0.9999999998, example: {'ranks': [{'rank': 1, 'agentID': 1}, {'rank': 2, 'agentID': 0}], 'replayFile': 'replays/50629214.json'}
Round 10, game: 7, is_win: 0.0, max_step: 114, reward: -0.9999999995, example: {'ranks': [{'rank': 1, 'agentID': 0}, {'rank': 2, 'agentID': 1}], 'replayFile': 'replays/458918533.json'}
train loss in 1 epoch in 1 batch: -1.54775
val loss in 1 epoch: -1.58892
train loss in 2 epoch in 1 batch: -1.54906
val loss in 2 epoch: -1.59005
train loss in 3 epoch in 1 batch: -1.55794
val loss in 3 epoch: -1.58996
train loss in 4 epoch in 1 batch: -1.56587
val loss in 4 epoch: -1.59248
train loss in 5 epoch in 1 batch: -1.56843
val loss in 5 epoch: -1.59789
train loss in 6 epoch in 1 batch: -1.56429
val loss in 6 epoch: -1.59959
train loss in 7 epoch in 1 batch: -1.55366
val loss in 7 epoch: -1.60022
train loss in 8 epoch in 1 batch: -1.56256
val loss in 8 epoch: -1.59591
Round 11, game: 0, is_win: 0.0, max_step: 189, 

  if (await self.run_code(code, result,  async_=asy)):


Round 11, game: 4, is_win: 0.0, max_step: 235, reward: -0.9999999995, example: {'ranks': [{'rank': 1, 'agentID': 1}, {'rank': 2, 'agentID': 0}], 'replayFile': 'replays/285045419.json'}
Round 11, game: 5, is_win: 0.0, max_step: 35, reward: -0.9999999996666666, example: {'ranks': [{'rank': 1, 'agentID': 0}, {'rank': 2, 'agentID': 1}], 'replayFile': 'replays/935655093.json'}
Round 11, game: 6, is_win: 0.0, max_step: 198, reward: -0.9999999996666666, example: {'ranks': [{'rank': 1, 'agentID': 1}, {'rank': 2, 'agentID': 0}], 'replayFile': 'replays/224986811.json'}
Round 11, game: 7, is_win: 0.0, max_step: 30, reward: -0.9999999995, example: {'ranks': [{'rank': 1, 'agentID': 0}, {'rank': 2, 'agentID': 1}], 'replayFile': 'replays/762488942.json'}
train loss in 1 epoch in 1 batch: -1.56545
val loss in 1 epoch: -1.61540
train loss in 2 epoch in 1 batch: -1.56946
val loss in 2 epoch: -1.62082
train loss in 3 epoch in 1 batch: -1.60239
val loss in 3 epoch: -1.62576
train loss in 4 epoch in 1 batc

  if (await self.run_code(code, result,  async_=asy)):


Round 12, game: 0, is_win: 0.0, max_step: 314, reward: -0.9999999999333333, example: {'ranks': [{'rank': 1, 'agentID': 1}, {'rank': 2, 'agentID': 0}], 'replayFile': 'replays/662124363.json'}
Round 12, game: 1, is_win: 0.0, max_step: 154, reward: -0.9999999989999999, example: {'ranks': [{'rank': 1, 'agentID': 0}, {'rank': 2, 'agentID': 1}], 'replayFile': 'replays/227425080.json'}
Round 12, game: 2, is_win: 0.0, max_step: 70, reward: -0.9999999989999999, example: {'ranks': [{'rank': 1, 'agentID': 1}, {'rank': 2, 'agentID': 0}], 'replayFile': 'replays/75057355.json'}
Round 12, game: 3, is_win: 0.0, max_step: 358, reward: -0.9999999998, example: {'ranks': [{'rank': 1, 'agentID': 0}, {'rank': 2, 'agentID': 1}], 'replayFile': 'replays/169335405.json'}
Round 12, game: 4, is_win: 0.0, max_step: 29, reward: -0.9999999989999999, example: {'ranks': [{'rank': 1, 'agentID': 1}, {'rank': 2, 'agentID': 0}], 'replayFile': 'replays/215478765.json'}
Round 12, game: 5, is_win: 0.0, max_step: 110, reward:

  if (await self.run_code(code, result,  async_=asy)):


Round 13, game: 0, is_win: 0.0, max_step: 118, reward: -0.9999999998333333, example: {'ranks': [{'rank': 1, 'agentID': 1}, {'rank': 2, 'agentID': 0}], 'replayFile': 'replays/118980946.json'}
Round 13, game: 1, is_win: 0.0, max_step: 30, reward: -0.9999999995, example: {'ranks': [{'rank': 1, 'agentID': 0}, {'rank': 2, 'agentID': 1}], 'replayFile': 'replays/772441077.json'}
Round 13, game: 2, is_win: 0.0, max_step: 110, reward: -0.9999999996666666, example: {'ranks': [{'rank': 1, 'agentID': 1}, {'rank': 2, 'agentID': 0}], 'replayFile': 'replays/419108120.json'}
Round 13, game: 3, is_win: 0.0, max_step: 150, reward: -0.9999999999, example: {'ranks': [{'rank': 1, 'agentID': 0}, {'rank': 2, 'agentID': 1}], 'replayFile': 'replays/534433658.json'}
Round 13, game: 4, is_win: 0.0, max_step: 278, reward: -0.9999999989999999, example: {'ranks': [{'rank': 1, 'agentID': 1}, {'rank': 2, 'agentID': 0}], 'replayFile': 'replays/865398223.json'}
Round 13, game: 5, is_win: 0.0, max_step: 70, reward: -0.9