In [1]:

import torch
import numpy as np
import pandas as pd
import random

import matplotlib.pyplot as plt
from tournamentgym import TournamentEnv
import os
import random
from dataclasses import dataclass
import torch
from torch.utils.data import Dataset
from transformers import DecisionTransformerConfig, DecisionTransformerModel, Trainer, TrainingArguments

  from tqdm.autonotebook import tqdm


In [2]:
from stable_baselines3 import PPO,A2C,SAC,DQN,DDPG

from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnNoModelImprovement
from stable_baselines3.common.env_util import make_vec_env
import torch
from torch.utils.data import DataLoader, Dataset


In [3]:
from imitation.data.rollout import TrajectoryAccumulator
from imitation.data.huggingface_utils import trajectories_to_dataset
# from imitation.data import serialize
from imitation.data.rollout import rollout_stats

In [4]:
MW='M'

team_data=pd.read_csv(f'Process_data/{MW}_pm_encoded.csv')
env=TournamentEnv(team_stats=team_data,
                     season=2022,
                     verbose=False)


  0%|          | 0/10000 [00:00<?, ?it/s]

In [5]:
num_dems=1000
traj_acum=TrajectoryAccumulator()
demonstrations=[]
for i in range(num_dems):
    obs,info=env.reset()
    terminated=False
    step={'obs':obs}
    traj_acum.add_step(key=i,step_dict=step)
    j=0
    while not terminated:
        act=env.cheat_action() 
        j+=1
        if i%10!=0:
            if (j%20)==0:
                act=np.random.choice([0,1])
        next_obs, reward, terminated,truncated, info=env.step(action=act,)
        
        step={'obs':np.array(obs),
            'acts':np.array([act]),
            'rews':np.array(reward),
            'infos':info}
        traj_acum.add_step(key=i,
                        step_dict=step,
                        )
        # print(reward,train_env.trade_week_start,train_env.current_time)
        obs=next_obs
    
    
    traj=traj_acum.finish_trajectory(key=i,terminal=terminated)
    demonstrations.append(traj)

In [6]:
traj_dataset=trajectories_to_dataset(demonstrations)

In [7]:
stats = rollout_stats(demonstrations)
print(stats["return_mean"])
stats

292.331


{'n_traj': 1000,
 'return_min': 173.5,
 'return_mean': 292.331,
 'return_std': 31.8413165399925,
 'return_max': 320.0,
 'len_min': 64,
 'len_mean': 65.704,
 'len_std': 1.4860632557196212,
 'len_max': 67}

In [8]:
@dataclass
class DecisionTransformerGymDataCollator:
    return_tensors: str = "pt"
    max_len: int = 20 #subsets of the episode we use for training
    state_dim: int = 17  # size of state space

    p_sample: np.array = None  
    # n_traj: int = 0 

    def __init__(self, demonstrations) -> None:
        traj_stats=rollout_stats(demonstrations)
        self.n_traj=traj_stats['n_traj'] # to store the number of trajectories in the dataset
        self.max_ep_len=traj_stats['len_max'] # max episode length in the dataset
        self.scale = traj_stats['return_max']  # normalization of rewards/returns
        dataset=trajectories_to_dataset(demonstrations) ## turn list of trajectories into a dataset
        self.dataset = dataset
        self.act_dim = len(dataset[0]['acts'][0])
        # calculate dataset stats for normalization of states
        states = []
        traj_lens = []
        for obs in dataset["obs"]:
            obs=np.array(obs)
            states.append(obs)
            traj_lens.append(len(obs))
        
        states = np.vstack(states)
        self.state_dim=states.shape[1]
        self.state_mean = np.mean(states, axis=0) # to store state means

        self.state_std=np.std(states, axis=0) + 1e-6 # to store state stds
        traj_lens = np.array(traj_lens)
        self.p_sample = traj_lens / sum(traj_lens) # a distribution to take account trajectory lengths

    def _discount_cumsum(self, x, gamma):

        discount_cumsum = np.zeros_like(x)

        discount_cumsum[-1] = x[-1]
        for t in reversed(range(x.shape[0] - 1)):
            discount_cumsum[t] = x[t] + gamma * discount_cumsum[t + 1]
        return discount_cumsum

    def __call__(self, features):
        batch_size = len(features)
        # this is a bit of a hack to be able to sample of a non-uniform distribution
        batch_inds = np.random.choice(
            np.arange(self.n_traj),
            size=batch_size,
            replace=True,
            p=self.p_sample,  # reweights so we sample according to timesteps
        )
        # a batch of dataset features
        s, a, r, rtg, timesteps, mask = [], [], [], [], [], []
        
        for ind in batch_inds:
            # for feature in features:
            feature = self.dataset[int(ind)]
            obs=np.array(feature["obs"])
            rews=np.array(feature["rews"]).reshape(-1,1)
            acts=np.array(feature["acts"])

            si = random.randint(1, len(rews)-1 )
            

            # get sequences from dataset
            s.append(np.array(obs[si : si + self.max_len]).reshape(1, -1, self.state_dim))
            a.append(np.array(acts[si : si + self.max_len]).reshape(1, -1, self.act_dim))
            r.append(np.array(rews[si : si + self.max_len]).reshape(1, -1, 1))

            timesteps.append(np.arange(si, si + s[-1].shape[1]).reshape(1, -1))
            timesteps[-1][timesteps[-1] >= self.max_ep_len] = self.max_ep_len - 1  # padding cutoff
            
            try:
                rews[si:]
                discount_rew=self._discount_cumsum(np.array(rews[si:]), gamma=1.0)
            except:
                print(rews[si:])
            discount_rew=self._discount_cumsum(np.array(rews[si:]), gamma=1.0)

            slen=s[-1].shape[1]

            rew_tg=discount_rew[: slen].reshape(1, -1, 1)
            rtg.append(rew_tg)
            rtg_len=rtg[-1].shape[1]
            rtg_pad=np.zeros((1,self.max_len-rtg_len, 1))
            rtg[-1] = np.concatenate([rtg[-1],rtg_pad], axis=1)
            rtg[-1]=rtg[-1]/ self.scale

            # padding and state + reward normalization
            s_pad=np.zeros((1, self.max_len - slen, self.state_dim))
            s[-1] = np.concatenate([s_pad, s[-1]], axis=1)
            s[-1] = (s[-1] - self.state_mean) / self.state_std

            alen=a[-1].shape[1]
            act_pad=np.ones((1, self.max_len - alen, self.act_dim))
            a[-1] = np.concatenate([ act_pad* -10.0, a[-1]], axis=1,)

            rlen=r[-1].shape[1]
            rew_pad=np.zeros((1, self.max_len - rlen, 1))
            r[-1] = np.concatenate([rew_pad, r[-1]], axis=1)

            tlen=timesteps[-1].shape[1]
            time_pad=np.zeros((1, self.max_len - tlen))
            timesteps[-1] = np.concatenate([time_pad, timesteps[-1]], axis=1)

            masked=np.zeros((1, self.max_len - alen))
            unmasked=np.ones((1, alen))
            mask.append(np.concatenate([masked,unmasked], axis=1))

        s = torch.from_numpy(np.concatenate(s, axis=0)).float()
        a = torch.from_numpy(np.concatenate(a, axis=0)).float()
        r = torch.from_numpy(np.concatenate(r, axis=0)).float()
        rtg = torch.from_numpy(np.concatenate(rtg, axis=0)).float()
        timesteps = torch.from_numpy(np.concatenate(timesteps, axis=0)).long()
        mask = torch.from_numpy(np.concatenate(mask, axis=0)).float()
        return {
            "states": s,
            "actions": a,
            "rewards": r,
            "returns_to_go": rtg,
            "timesteps": timesteps,
            "attention_mask": mask,
        }

In [9]:
class TrainableDT(DecisionTransformerModel):
    def __init__(self, config):
        super().__init__(config)
        self.soft=torch.nn.Softmax()

    def forward(self, **kwargs):
        output = super().forward(**kwargs)
        # add the DT loss
        action_preds = output[1]
        action_targets = kwargs["actions"]
        attention_mask = kwargs["attention_mask"]
        act_dim = action_preds.shape[2]
        action_preds = action_preds.reshape(-1, act_dim)[attention_mask.reshape(-1) > 0]
        action_targets = action_targets.reshape(-1, act_dim)[attention_mask.reshape(-1) > 0]
        action_preds=self.soft(action_preds)        
        losses = torch.binary_cross_entropy_with_logits(action_preds ,action_targets) 
        loss=torch.mean(losses)
        return {"loss": loss}

    def original_forward(self, **kwargs):
        return super().forward(**kwargs)
    
    def get_action(self, states, actions, rewards, returns_to_go, timesteps):
        # This implementation does not condition on past rewards

        states = states.reshape(1, -1, self.config.state_dim)
        actions = actions.reshape(1, -1, self.config.act_dim)
        returns_to_go = returns_to_go.reshape(1, -1, 1)
        timesteps = timesteps.reshape(1, -1)

        states = states[:, -self.config.max_length :]
        actions = actions[:, -self.config.max_length :]
        returns_to_go = returns_to_go[:, -self.config.max_length :]
        timesteps = timesteps[:, -self.config.max_length :]
        padding = self.config.max_length - states.shape[1]
        # pad all tokens to sequence length
        attention_mask = torch.cat([torch.zeros(padding), torch.ones(states.shape[1])])
        attention_mask = attention_mask.to(dtype=torch.long).reshape(1, -1)
        states = torch.cat([torch.zeros((1, padding, self.config.state_dim)), states], dim=1).float()
        returns_to_go = torch.cat([torch.zeros((1, padding, 1)), returns_to_go], dim=1).float()
        timesteps = torch.cat([torch.zeros((1, padding), dtype=torch.long), timesteps], dim=1)

        state_preds, action_preds, return_preds = self.original_forward(
            states=states,
            actions=actions,
            rewards=rewards,
            returns_to_go=returns_to_go,
            timesteps=timesteps,
            attention_mask=attention_mask,
            return_dict=False,
        )
        action_preds=self.soft(action_preds) 
        action=int(action_preds[0, -1])

        return action

    

In [10]:

collator = DecisionTransformerGymDataCollator(demonstrations)

config = DecisionTransformerConfig(state_dim=collator.state_dim, act_dim=collator.act_dim)
model = TrainableDT(config)
     

In [12]:
training_args = TrainingArguments(
    output_dir="decision_models/",
    logging_dir="Logs/DTRL/",
    remove_unused_columns=False,
    num_train_epochs=100,
    per_device_train_batch_size=60,
    logging_strategy='epoch',
    learning_rate=1e-4,
    weight_decay=1e-4,
    warmup_ratio=0.1,
    optim="adamw_torch",
    max_grad_norm=0.25,

    report_to='tensorboard'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=traj_dataset,
    data_collator=collator,
)

trainer.train()

  0%|          | 0/1700 [00:00<?, ?it/s]

{'loss': 0.8484, 'learning_rate': 1e-05, 'epoch': 1.0}
{'loss': 0.8481, 'learning_rate': 2e-05, 'epoch': 2.0}


In [None]:

best_trial = trainer.hyperparameter_search(
    direction="maximize", 
    backend="ray", 
    n_trials=10 # number of trials
)

In [None]:

model = model.to("cpu")
env = env

max_ep_len = 67
device = "cpu"
scale = 1000.0  # normalization for rewards/returns
TARGET_RETURN = 600 / scale  # evaluation is conditioned on a return of 600, scaled accordingly

state_mean = collator.state_mean.astype(np.float32)
state_std = collator.state_std.astype(np.float32)
print(state_mean)

state_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]
# Create the decision transformer model

state_mean = torch.from_numpy(state_mean).to(device=device)
state_std = torch.from_numpy(state_std).to(device=device)

In [None]:
env=TournamentEnv(team_stats=team_data,
                     season=2023,
                     verbose=False,
                     exclude_seasons=[])

In [None]:
# Interact with the environment and create a video
episode_return, episode_length = 0, 0
state = env.reset(season=2023)
target_return = torch.tensor(TARGET_RETURN, device=device, dtype=torch.float32).reshape(1, 1)
states = torch.from_numpy(state).reshape(1, state_dim).to(device=device, dtype=torch.float32)
actions = torch.zeros((0, act_dim), device=device, dtype=torch.float32)
rewards = torch.zeros(0, device=device, dtype=torch.float32)

timesteps = torch.tensor(0, device=device, dtype=torch.long).reshape(1, 1)
done=False
while not done:
    actions = torch.cat([actions, torch.zeros((1, act_dim), device=device)], dim=0)
    rewards = torch.cat([rewards, torch.zeros(1, device=device)])

    action = model.get_action(
        (states - state_mean) / state_std,
        actions,
        rewards,
        target_return,
        timesteps,
    )
    actions[-1] = action
    action = action.detach().cpu().numpy()

    state, reward, done,truncated, info = env.step(action)

    cur_state = torch.from_numpy(state).to(device=device).reshape(1, state_dim)
    states = torch.cat([states, cur_state], dim=0)
    rewards[-1] = reward

    pred_return = target_return[0, -1] - (reward / scale)
    target_return = torch.cat([target_return, pred_return.reshape(1, 1)], dim=1)
    timesteps = torch.cat([timesteps, torch.ones((1, 1), device=device, dtype=torch.long) * (t + 1)], dim=1)

    episode_return += reward
    episode_length += 1

info
