In [None]:
from ray.rllib.agents.registry import get_trainer_class
import ray
import numpy as np
import gym
from gym import spaces
from ray.rllib.env.env_context import EnvContext

In [None]:
class GymEnvUbiquant(gym.Env):
    """
    Dummy environment that only exposes ``action_space`` and ``observation_space``.

    It exists so the tuner/trainer can read the space dimensions; ``step`` and
    ``reset`` return placeholder ``None`` values and simulate nothing.
    """

    def __init__(self, config: EnvContext):
        """
        Build both Box spaces from the sizes given in ``config``.

        Reads ``config['num_investment']`` and ``config['num_features']``.
        """
        n_investments = config['num_investment']  # evaluated one by one
        n_features = config['num_features']  # fixed to 300 features by the caller

        self.num_investment = n_investments
        self.num_features = n_features
        self.pred_dict = {}

        # One action value per investment, bounded to [-15, 15].
        self.action_space = spaces.Box(low=-15, high=15, shape=(n_investments,))

        # One row of features per investment, each value bounded to [-100, 100].
        self.observation_space = spaces.Box(
            low=-100,
            high=100,
            shape=(n_investments, n_features),
        )

    def step(self, a):
        """Placeholder step: ignores action ``a`` and returns a tuple of four ``None``."""
        return (None,) * 4

    def reset(self):
        """Placeholder reset: returns ``None`` instead of a first observation."""
        return None

    def render(self):
        """No-op."""
        pass

    def close(self):
        """No-op."""
        pass
    

In [None]:
# PPO settings used to build the trainer that the checkpoint is restored into.
config = {
    # --- environment (only provides the action/observation space definitions) ---
    "env": GymEnvUbiquant,
    "env_config": {
        "num_features": 300,
        "num_investment": 1,
    },
    "num_envs_per_worker": 1,
    "num_workers": 1,
    # --- framework / logging ---
    "framework": "torch",
    "log_level": "INFO",
    # --- network: fully connected stack followed by an LSTM ---
    "model": {
        "fcnet_hiddens": [512, 512, 256],
        "fcnet_activation": "swish",
        "use_lstm": True,
        "lstm_cell_size": 128,
    },
    # --- PPO optimisation ---
    "lr": 1e-4,
    "lambda": 0.95,
    "vf_clip_param": 100.0,
    "vf_loss_coeff": 0.5,
    "sgd_minibatch_size": 512,
    "shuffle_sequences": False,
    "batch_mode": "complete_episodes",
}


# Previous action and reward are not used as LSTM inputs, so both stay None.
prev_a = None
init_prev_a = prev_a
prev_r = None
init_prev_r = prev_r

# Initial recurrent state: two zero vectors of length lstm_cell_size
# (presumably the LSTM hidden and cell state -- confirm against RLlib).
lstm_cell_size = config["model"]["lstm_cell_size"]
state = [np.zeros(lstm_cell_size, dtype=np.float32) for _ in range(2)]
init_state = state

In [None]:
# Start the local Ray runtime. ignore_reinit_error=True makes this cell safe to
# re-run in the same kernel: a second ray.init() would otherwise raise.
ray.init(ignore_reinit_error=True)

In [None]:
# Look up the PPO trainer class, then instantiate it with `config`.
# The resulting trainer holds the policy used to compute actions.
ppo_trainer_cls = get_trainer_class("PPO")
trainer = ppo_trainer_cls(config=config)

In [None]:
# Restore the trainer from the saved checkpoint file.
checkpoint_path = '../input/rllibchkpt-run100/checkpoint_005100/checkpoint-5100'
trainer.restore(checkpoint_path)

In [None]:
# Naive per-investment cache of model outputs. It lets the recurrent (LSTM)
# state returned for one row be fed back in on the next call for the same
# investment_id.
#
# Layout, as written by update_dict:
#   investment_id: {
#       time_id: {'state': <state returned by the policy>,
#                 'a': <action returned by the policy>},
#       'last_time': <time_id of the most recently stored entry>,
#   }
# Note: the "time_id" key is the row's row_id value; features are not stored.
MAIN_DICT = dict()

In [None]:
def update_dict(df, df_out):
    """
    Predict a target for every investment in ``df`` and write it into ``df_out``.

    For each ``investment_id`` group the function looks up the recurrent state
    stored in ``MAIN_DICT`` by the previous call (zeros if there is none), runs
    the restored policy on the group's feature row, stores the new state and
    action back into ``MAIN_DICT`` and writes the action into ``df_out``.

    Args:
        df: test DataFrame with ``investment_id``, ``row_id`` and the feature
            columns (names starting with ``f_``). Each investment_id must
            appear on exactly one row.
        df_out: prediction DataFrame with ``row_id`` and ``target`` columns;
            it is modified in place.

    Returns:
        The same ``df_out`` object, with ``target`` filled in for every row
        whose ``row_id`` was seen in ``df``.

    Raises:
        AssertionError: if a group's feature matrix is not of shape (1, 300).
    """
    # MAIN_DICT is only mutated in place (never rebound), so no `global` is needed.

    # Select features by their 'f_' prefix. A plain substring test ('f' in name)
    # would also pick up any other column whose name happens to contain an "f".
    # The selection is identical for every group, so it is computed once.
    feature_cols = [col for col in df.columns if str(col).startswith('f_')]

    # Distinct loop names: the original rebound `df` inside its own groupby loop.
    for investment_id, rows in df.groupby('investment_id'):
        # The row_id uniquely identifies this entry for the investment.
        time_id = rows.row_id.iloc[0]

        obs = rows.loc[:, feature_cols].to_numpy()
        assert obs.shape == (1,300), f'obs shape: {obs.shape}'

        # Fetch (or create) the history for this investment. Checking for
        # 'last_time' rather than mere presence of the investment_id keeps a
        # retry working if an earlier call failed before storing its result.
        history = MAIN_DICT.setdefault(investment_id, {})
        if 'last_time' in history:
            state = history[history['last_time']]['state']
        else:
            state = [np.zeros([lstm_cell_size], np.float32) for _ in range(2)]

        a, state_out, _ = trainer.compute_single_action(
            observation=obs,
            state=state,
            prev_action=prev_a,
            prev_reward=prev_r,
            explore=False,  # deterministic action; no exploration noise
            policy_id="default_policy",  # <- default value
        )

        # Store the result and point 'last_time' at it for quick lookup next call.
        history.update({
            time_id: {'state': state_out, 'a': a},
            'last_time': time_id,
        })

        # Write the prediction into the competition output frame.
        df_out.loc[df_out.row_id == time_id, 'target'] = a

    return df_out
        

In [None]:
import ubiquant

# Competition API: create the environment once, then obtain the iterator that
# yields (test dataframe, sample submission dataframe) pairs.
env = ubiquant.make_env()
iter_test = env.iter_test()

for test_df, sample_prediction_df in iter_test:
    # Fill in the predictions for this batch, then hand them back to the API.
    sample_prediction_df = update_dict(test_df, sample_prediction_df)
    env.predict(sample_prediction_df)
