## Tabular experiments
Run this notebook to generate the data for the tabular experiments. Then use the `plot_results_tabular.ipynb` notebook to plot the results.

In [1]:
import os
import time
import numpy as np
import lzma
import pickle
import multiprocessing
from tabular.agents.agent import Experience
from tabular.make_agent import make_agent, AgentParameters
from tabular.simulation_parameters import make_env, SimulationParameters
from tabular.utils.utils import Results
from typing import List, NamedTuple, Sequence
from tabular.config import CONFIG
from copy import deepcopy

class DataResults(NamedTuple):
    """Record that is pickled to disk for one agent on one environment.

    It bundles the settings a batch of simulations was run with, the agent
    that was evaluated, and the results collected from every simulation.
    """
    # Environment and simulation settings used to produce the data.
    simulation_parameters: SimulationParameters
    # NOTE(review): annotated as str, but the saving code in this file stores
    # the whole agent configuration object here -- confirm which is intended.
    agent_type: str
    # One entry per simulation seed; each entry is the list of Results
    # returned by run() for that seed.
    data: Sequence[Sequence[Results]]

def run(seed:int, agent_parameters: any, p: SimulationParameters) -> List[Results]:
    """Simulate a single agent on a single environment for one seed.

    The agent interacts with the environment for ``p.env_parameters.horizon``
    steps. Every ``p.sim_parameters.freq_eval`` steps the agent's empirical
    transition model is evaluated against a fixed set of reward functions and
    a snapshot of the agent's statistics is recorded.

    Args:
        seed (int): seed given to NumPy's global random generator
        agent_parameters (any): parameters forwarded to ``make_agent``
        p (SimulationParameters): environment and simulation settings

    Returns:
        List[Results]: one ``Results`` entry per evaluation, in step order
    """
    np.set_printoptions(formatter={'float': "{0:0.3f}".format})
    np.random.seed(seed)
    env = make_env(env = p.env_parameters)
    # Printed as a quick check of the seeding. The draw also advances the
    # global random stream, so it has to stay here to keep results unchanged.
    print(np.random.uniform())

    sim = p.sim_parameters
    t0 = time.time()
    state = env.reset()
    gamma = sim.discount_factor
    agent = make_agent(agent_parameters)

    # Reward functions used at every evaluation: the boundary rewards first,
    # followed by the randomly generated ones.
    R = np.vstack([
        env.generate_boundary_rewards(),
        env.generate_random_rewards(N=sim.num_rewards),
    ])

    # Agent attributes that are copied into every recorded snapshot.
    tracked = ('omega', 'total_state_visits', 'last_visit', 'exp_visits')
    history = []

    for t in range(p.env_parameters.horizon):
        action = agent.forward(state, t)
        next_state, _ = env.step(action)
        must_reset = agent.backward(Experience(state, action, next_state), t)

        # The agent decides when the environment has to be restarted.
        if must_reset:
            state = env.reset()
        else:
            state = next_state

        # Only evaluate the agent on multiples of the evaluation frequency.
        if (t + 1) % sim.freq_eval != 0:
            continue

        V_res, pi_res, Q_res = env.eval_transition(
            Phat=agent.empirical_transition(), R=R, discount_factor=gamma)
        print(f'[{t}] {agent.U_t} {agent.Z_t} - {agent.beta} -  {V_res.mean()} - {pi_res.mean()} - {agent.state_action_visits}')
        print('--------')

        # Deep-copy the agent's statistics so that later steps cannot modify
        # what has already been recorded.
        snapshot = {name: deepcopy(getattr(agent, name)) for name in tracked}
        history.append(
            Results(step=t, **snapshot, V_res=V_res, Q_res=Q_res,
                    pi_res=pi_res, elapsed_time=time.time() - t0))

    return history


In [None]:
cfg = CONFIG
NUM_CPUS = 10 # Change this parameter to define the number of vCPUs to use

# Loop through the configurations: each entry pairs an environment with the
# agents that have to be evaluated on it.
for env_params, agents in cfg.envs:
    # The environment is only instantiated here to read its dimensions.
    env = make_env(env_params)
    agent_parameters = AgentParameters(
        dim_state_space=env.dim_state,
        dim_action_space=env.dim_action,
        discount_factor=cfg.sim_parameters.discount_factor,
        horizon=env_params.horizon,
        frequency_evaluation=cfg.sim_parameters.freq_eval,
        delta=cfg.sim_parameters.delta,
    )

    # Results are grouped by environment type and horizon.
    path = os.path.join(
        '.', 'tabular', 'data',
        f'{env_params.env_type.value}', f'{env_params.horizon}')

    # Loop through the agents
    for agent in agents:
        # Flush so that the progress message is visible while the
        # simulations are still running.
        print(f'> Evaluating {agent.type} on {env_params.env_type.value}({env_params.horizon})',
              end='... ', flush=True)
        configured_agent = agent._replace(agent_parameters=agent_parameters)

        # Create the output directory; exist_ok avoids the race between
        # checking for the directory and creating it.
        os.makedirs(path, exist_ok=True)

        simulation_parameters = SimulationParameters(
            env_parameters=env_params,
            sim_parameters=cfg.sim_parameters,
        )

        # One simulation per seed, all with the same agent and settings.
        num_sims = simulation_parameters.sim_parameters.num_sims
        iterations = [
            (seed, configured_agent, simulation_parameters)
            for seed in range(num_sims)
        ]
        start_time = time.time()

        # Run simulations in parallel. The results are collected inside the
        # `with` block because leaving it terminates the pool; collecting
        # them in submission order keeps them ordered by seed.
        with multiprocessing.Pool(NUM_CPUS) as pool:
            pending = [pool.apply_async(run, args) for args in iterations]
            data_returned = [job.get() for job in pending]

        print(f'done in {np.round(time.time() - start_time, 2)} seconds.')

        data = DataResults(
            simulation_parameters=simulation_parameters,
            agent_type=configured_agent,
            data=data_returned,
        )

        # Save compressed results
        output_file = os.path.join(path, f'{configured_agent.type}.pkl.lzma')
        with lzma.open(output_file, 'wb') as f:
            pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)
