In [1]:
'''
The code is used to train BC imitator, or pretrained GAIL imitator
'''

import argparse
import tempfile
import os.path as osp
import gym
import logging
from tqdm import tqdm

import tensorflow as tf

##from baselines.common.mpi_adam import MpiAdam

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [3]:
def generate_expert_data_with_rewards(scenario_file, num_interactions=1000, file_name="expert_data", act_ndim=19):
    #from gfrl.base.run_my_ppo2 import create_single_scenic_environment
    import numpy as np
    expert_observations = []
    expert_actions = []
    expert_rewards = []

    gf_env_settings = {
        "stacked": True,
        "rewards": 'scoring',
        "representation": 'extracted',
        "players": [f"agent:left_players=1"],
        "action_set": "default",#"default" "v2"
    }

    from scenic.simulators.gfootball.rl_interface import GFScenicEnv
    from scenic.simulators.gfootball.utilities.scenic_helper import buildScenario
    scenario = buildScenario(scenario_file)
    env = GFScenicEnv(initial_scenario=scenario, gf_env_settings=gf_env_settings, use_scenic_behavior_in_step=True, constraints_checking=True)
    
    obs = env.reset()
    tr = 0
    for i in tqdm(range(num_interactions)):
        expert_observations.append(obs)

        obs, reward, done, info = env.step(env.action_space.sample())
        tr+=reward
        # print(info)
        action = info["action_taken"]
        expert_actions.append(action)

        if done:
            obs = env.reset()
            expert_rewards.append(tr)
            tr = 0

    expert_observations = np.array(expert_observations)
    #expert_observations = np.moveaxis(expert_observations, [3], [1])
    acts = np.array(expert_actions)

    acts_oh = np.zeros((acts.shape[0], act_ndim))
    acts_oh[np.arange(acts.shape[0]), acts] = 1

    expert_rewards = np.array(expert_rewards)
    print("Expert observation shape: ", expert_observations.shape)
    print("Expert actions shape: ", acts_oh.shape)
    print("Num Episode: ", expert_rewards.shape[0])
    print("Mean Reward: ", expert_rewards.mean())

    #np.savez(expert_file_name, obs=mb_obs, acs=mb_act_oh, num_epi = num_episodes, mean_reward = np.sum(mb_rewards)/num_episodes)
    np.savez_compressed(
        file_name,
        acs=acts_oh,
        obs=expert_observations,
        num_epi = expert_rewards.shape[0],
        mean_reward = expert_rewards.mean(),
        rewards = expert_rewards
    )
    return expert_observations, acts_oh, expert_rewards



In [4]:
def generate_expert_successful_data(scenario_file, num_interactions=1000, file_name="expert_data", act_ndim=19):
    #from gfrl.base.run_my_ppo2 import create_single_scenic_environment
    import numpy as np
    expert_observations = []
    expert_actions = []
    expert_rewards = []

    gf_env_settings = {
        "stacked": True,
        "rewards": 'scoring',
        "representation": 'extracted',
        "players": [f"agent:left_players=1"],
        "action_set": "default",#"default" "v2"
    }

    from scenic.simulators.gfootball.rl_interface import GFScenicEnv
    from scenic.simulators.gfootball.utilities.scenic_helper import buildScenario
    scenario = buildScenario(scenario_file)
    env = GFScenicEnv(initial_scenario=scenario, gf_env_settings=gf_env_settings, use_scenic_behavior_in_step=True, constraints_checking=True)
    
    
    tr = 0
    
    obs_buf, acts_buf, rew_buf  = [], [], []
    policy_rews = []
    
    with tqdm(total=num_interactions) as pbar:
        
        while(len(expert_observations)<num_interactions):
            
            
            if len(obs_buf)==0:
                obs = env.reset()
            
            obs_buf.append(obs)
            obs, reward, done, info = env.step(env.action_space.sample())
            
            #rew_buf.append(reward)

            tr+=reward
            # print(info)
            action = info["action_taken"]
            acts_buf.append(action)

            if done:
                
                #print(f"New Epi: {len(obs_buf)} R: {tr}")
                
                policy_rews.append(tr)
                
                if tr>0: 
                    expert_observations.extend(obs_buf)
                    expert_actions.extend(acts_buf)
                    expert_rewards.append(tr)
                    
                    if len(expert_observations)> num_interactions:
                        pbar.update(num_interactions)
                    else:
                        pbar.update(len(obs_buf)) 
                        
                    #print("Added new Api. Current Size: ", len(expert_observations))


                obs_buf, acts_buf, rew_buf  = [], [], []
                obs = env.reset()
                tr = 0

            
    print("Collection Done")        
        

    expert_observations = np.array(expert_observations)
    #expert_observations = np.moveaxis(expert_observations, [3], [1])
    acts = np.array(expert_actions)

    acts_oh = np.zeros((acts.shape[0], act_ndim))
    acts_oh[np.arange(acts.shape[0]), acts] = 1

    expert_rewards = np.array(expert_rewards)
    
    print("Expert observation shape: ", expert_observations.shape)
    print("Expert actions shape: ", acts_oh.shape)
    print("Num Expert Episode: ", expert_rewards.shape[0])
    print("Mean Expert Reward: ", expert_rewards.mean())
    
    print("Num Trajectories Collected: ", len(policy_rews))
    print("Mean Policy Reward: ", np.mean(policy_rews))
    

    #np.savez(expert_file_name, obs=mb_obs, acs=mb_act_oh, num_epi = num_episodes, mean_reward = np.sum(mb_rewards)/num_episodes)
    np.savez_compressed(
        file_name,
        acs=acts_oh,
        obs=expert_observations,
        num_epi = expert_rewards.shape[0],
        mean_reward = expert_rewards.mean(),
        rewards = expert_rewards,
        policy_mean_reward = np.mean(policy_rews),
        policy_total_trajectories = len(policy_rews)
    )
    return expert_observations, acts_oh, expert_rewards



In [7]:
#from gfrl.base.run_my_ppo2 import create_single_scenic_environment

#scenario = "/Users//codebase/scenic/training/gfrl/_scenarios/sc4rl/wb/fg_11v1_wb_rns_rand1.scenic"
scenario ="/Users//codebase/scenic/training/gfrl/_scenarios/sc4rl/wb/ps_3v2_0_wb_0.scenic"
num_interactions = 10000
expert_file_name = f"../_data/sc4rl_ps_3v2_0_v0_rand0_{num_interactions}.npz"

obs, act, rew = generate_expert_successful_data(scenario_file=scenario, num_interactions=num_interactions, file_name=expert_file_name)



  0%|          | 0/10000 [00:00<?, ?it/s]

Environment will ignore actions passed to step() and take action provided by Scenic


19973it [43:26,  7.66it/s]                            


Collection Done
Expert observation shape:  (10024, 72, 96, 16)
Expert actions shape:  (10024, 19)
Num Expert Episode:  143
Mean Expert Reward:  1.0
Num Trajectories Collected:  813
Mean Policy Reward:  0.17589175891758918


In [2]:

#num_interactions=1000
#expert_file_name = f"../_data/sc4rl_3v1_v1_succ_{num_interactions}.npz"

expert_file_name = "/Users//codebase/scenic/training/gfrl/_data/sc4rl_fg11v1_rns_rand1_succ_10000.npz"
from gfrl.common.mybase.cloning.dataset import GFDset
dataset = GFDset(expert_file_name)
print(dataset.num_epi, dataset.mean_reward, dataset.size, dataset.acts.shape, "policy episodes, mean rew: ", dataset.policy_total_trajectories, dataset.policy_mean_reward)


92 1.0 10089 (10089, 19) policy episodes, mean rew:  143 0.6433566433566433


In [None]:
from gfrl.base.run_my_ppo2 import create_single_scenic_environment

scenario = "../_scenarios/academy/wb/pass_n_shoot_wb.scenic"

num_interactions=1000
expert_file_name = f"../_data/pns_{num_interactions}.npz"

obs, act, rew = generate_expert_data_with_rewards(scenario_file=scenario, num_interactions=num_interactions, file_name=expert_file_name)

In [None]:
import numpy as np
td = np.load(expert_file_name)

In [None]:
list(td.keys())

In [None]:
scenario = "../_scenarios/academy/wb/pass_n_shoot_wb.scenic"

num_interactions=5000
expert_file_name = f"../_data/pns_{num_interactions}.npz"

obs, act, rew = generate_expert_data_with_rewards(scenario_file=scenario, num_interactions=num_interactions, file_name=expert_file_name)

from gfrl.common.mybase.cloning.dataset import GFDset
dataset = GFDset(expert_file_name)
print(dataset.num_epi, dataset.mean_reward, dataset.size, dataset.acts.shape)


In [None]:
from gfrl.base.run_my_ppo2 import create_single_scenic_environment

scenario = "../_scenarios/academy/wb/pass_n_shoot_wb.scenic"

num_interactions=10000
expert_file_name = f"../_data/pns_{num_interactions}.npz"

obs, act, rew = generate_expert_data_with_rewards(scenario_file=scenario, num_interactions=num_interactions, file_name=expert_file_name)

In [None]:
from gfrl.common.mybase.cloning.dataset import GFDset
dataset = GFDset(expert_file_name)
print(dataset.num_epi, dataset.mean_reward, dataset.size, dataset.acts.shape)

In [None]:
"""
scenario = "../_scenarios/academy/wb/pass_n_shoot_wb.scenic"

num_interactions=20000
expert_file_name = f"../_data/pns_{num_interactions}.npz"

obs, act, rew = generate_expert_data_with_rewards(scenario_file=scenario, num_interactions=num_interactions, file_name=expert_file_name)
from gfrl.common.mybase.cloning.dataset import GFDset
dataset = GFDset(expert_file_name)
print(dataset.num_epi, dataset.mean_reward, dataset.size, dataset.acts.shape)
"""

In [None]:
scenario = "../_scenarios/academy/wb/pass_n_shoot_wb.scenic"

num_interactions=50000
expert_file_name = f"../_data/pns_{num_interactions}.npz"

obs, act, rew = generate_expert_data_with_rewards(scenario_file=scenario, num_interactions=num_interactions, file_name=expert_file_name)

from gfrl.common.mybase.cloning.dataset import GFDset
dataset = GFDset(expert_file_name)
print(dataset.num_epi, dataset.mean_reward, dataset.size, dataset.acts.shape)