In [1]:
from scenic.simulators.gfootball import rl_interface
from stable_baselines3 import PPO
from scenic.simulators.gfootball.rl_interface import GFScenicEnv
import pretrain_template
from gfootball_impala_cnn import GfootballImpalaCNN
import gym
from tqdm import tqdm
import numpy as np
import torch as th
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from stable_baselines3 import PPO, A2C, SAC, TD3
from stable_baselines3.common.evaluation import evaluate_policy

from torch.utils.data.dataset import Dataset, random_split
import os



In [2]:
class ExpertDataSet(Dataset):
    def __init__(self, expert_observations, expert_actions):
        self.observations = expert_observations
        self.actions = expert_actions

    def __getitem__(self, index):
        return (self.observations[index], self.actions[index])

    def __len__(self):
        return len(self.observations)


In [3]:
def pretrain_agent(
        student,
        env,
        expert_dataset,
        batch_size=64,
        epochs=10,
        scheduler_gamma=0.7,
        learning_rate=1.0,
        log_interval=100,
        no_cuda=True,
        seed=1,
        test_batch_size=64,
):
    train_size = int(0.8 * len(expert_dataset))

    test_size = len(expert_dataset) - train_size

    train_expert_dataset, test_expert_dataset = random_split(
        expert_dataset, [train_size, test_size]
    )

    print("test_expert_dataset: ", len(test_expert_dataset))
    print("train_expert_dataset: ", len(train_expert_dataset))


    use_cuda = not no_cuda and th.cuda.is_available()
    th.manual_seed(seed)
    device = th.device("cuda" if use_cuda else "cpu")
    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}

    if isinstance(env.action_space, gym.spaces.Box):
        criterion = nn.MSELoss()
    else:
        criterion = nn.CrossEntropyLoss()

    # Extract initial policy
    model = student.policy.to(device)

    def train(model, device, train_loader, optimizer):
        model.train()

        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()

            if isinstance(env.action_space, gym.spaces.Box):
                # A2C/PPO policy outputs actions, values, log_prob
                # SAC/TD3 policy outputs actions only
                if isinstance(student, (A2C, PPO)):
                    action, _, _ = model(data)
                else:
                    # SAC/TD3:
                    action = model(data)
                action_prediction = action.double()
            else:
                # Retrieve the logits for A2C/PPO when using discrete actions
                latent_pi, _, _ = model._get_latent(data)
                logits = model.action_net(latent_pi)
                action_prediction = logits
                target = target.long()

            loss = criterion(action_prediction, target)
            loss.backward()
            optimizer.step()
            if batch_idx % log_interval == 0:
                print(
                    "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                        epoch,
                        batch_idx * len(data),
                        len(train_loader.dataset),
                        100.0 * batch_idx / len(train_loader),
                        loss.item(),
                    )
                )

    def test(model, device, test_loader):
        model.eval()
        test_loss = 0
        with th.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)

                if isinstance(env.action_space, gym.spaces.Box):
                    # A2C/PPO policy outputs actions, values, log_prob
                    # SAC/TD3 policy outputs actions only
                    if isinstance(student, (A2C, PPO)):
                        action, _, _ = model(data)
                    else:
                        # SAC/TD3:
                        action = model(data)
                    action_prediction = action.double()
                else:
                    # Retrieve the logits for A2C/PPO when using discrete actions
                    latent_pi, _, _ = model._get_latent(data)
                    logits = model.action_net(latent_pi)
                    action_prediction = logits
                    target = target.long()

                test_loss = criterion(action_prediction, target)
        test_loss /= len(test_loader.dataset)
        print(f"Test set: Average loss: {test_loss:.4f}")

    # Here, we use PyTorch `DataLoader` to our load previously created `ExpertDataset` for training
    # and testing
    train_loader = th.utils.data.DataLoader(
        dataset=train_expert_dataset, batch_size=batch_size, shuffle=True, **kwargs
    )
    test_loader = th.utils.data.DataLoader(
        dataset=test_expert_dataset, batch_size=test_batch_size, shuffle=True, **kwargs,
    )

    # Define an Optimizer and a learning rate schedule.
    optimizer = optim.Adadelta(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=1, gamma=scheduler_gamma)

    # Now we are finally ready to train the policy model.
    for epoch in range(1, epochs + 1):
        train(model, device, train_loader, optimizer)
        test(model, device, test_loader)
        scheduler.step()

    # Implant the trained policy network back into the RL student agent
    student.policy = model


In [4]:
cwd = os.getcwd()
rewards = "scoring"
scenario_name = f"{cwd}/exp_0_4/academy_run_to_score.scenic"

save_dir = f"{cwd}/saved_models"
logdir = f"{cwd}/tboard/dev"
tracedir = f"{cwd}/game_trace"



In [5]:
#Create Environment

gf_env_settings = {
        "stacked": True,
        "rewards": rewards,
        "representation": 'extracted',
        "players": [f"agent:left_players=1"],
        "real_time": False,
        "action_set": "default"
    }

from scenic.simulators.gfootball.utilities.scenic_helper import buildScenario
scenario = buildScenario(scenario_name)
env = GFScenicEnv(initial_scenario=scenario, gf_env_settings=gf_env_settings)

pygame 2.0.1 (SDL 2.0.14, Python 3.9.1)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [6]:
#get the PPO object with required parameters
model, parameter_dict = pretrain_template.get_model_and_params(
    env=env, ALGO=PPO, features_extractor_class = GfootballImpalaCNN, scenario_name=scenario_name,
    logdir=logdir, override_params={}, rewards=rewards)

print("env (from model) observation space: ", model.get_env().observation_space)

Using scoring Parameters
Using cpu device
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
device:  cpu
env (from model) observation space:  Box(0, 255, (16, 72, 96), uint8)


In [13]:
#Generate random data for now as proxy to expert data

num_interactions = 1000
expert_observations = []
expert_actions = []

obs = env.reset()

for i in tqdm(range(num_interactions)):
    action = env.action_space.sample()
    expert_observations.append(obs)
    expert_actions.append(action)
    obs, reward, done, info = env.step(action)
    if done:
        obs = env.reset()

expert_observations = np.array(expert_observations)
print("expert collected obs shape", expert_observations.shape)
expert_observations = np.moveaxis(expert_observations, [3], [1])
print("expert obs shape updated: ", expert_observations.shape)
expert_actions = np.array(expert_actions)

np.savez_compressed(
    "dummy_expert_data",
    expert_actions=expert_actions,
    expert_observations=expert_observations,
)

100%|██████████| 1000/1000 [00:04<00:00, 222.87it/s]


expert collected obs shape (1000, 72, 96, 16)
expert obs shape updated:  (1000, 16, 72, 96)


In [14]:
loaded_data = np.load("dummy_expert_data.npz")

In [15]:
expert_observations = loaded_data["expert_observations"]
expert_actions = loaded_data["expert_actions"]
print(f"Loaded data obs: {expert_observations.shape}, actions: {expert_actions.shape}")
expert_dataset = ExpertDataSet(expert_observations, expert_actions)

Loaded data obs: (1000, 16, 72, 96), act: (1000,)


In [18]:
pretrain_agent(
    student=model,
    env=env,
    expert_dataset=expert_dataset,
    epochs=2
)


test_expert_dataset:  200
train_expert_dataset:  800
Test set: Average loss: 0.0150
Test set: Average loss: 0.0152


In [20]:
#do training
pretrain_template.train(model=model, parameters=parameter_dict,
                            n_eval_episodes=5, total_training_timesteps=5000,
                            eval_freq=5000,
                            save_dir=save_dir, logdir=logdir, dump_info={"rewards": rewards})

Logging to /Users/azadsalam/codebase/scenic/rl_training/tboard/dev/HM_19_40__DM_25_3_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 131      |
|    ep_rew_mean     | 0        |
| time/              |          |
|    fps             | 97       |
|    iterations      | 1        |
|    time_elapsed    | 21       |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 135          |
|    ep_rew_mean          | 0            |
| time/                   |              |
|    fps                  | 17           |
|    iterations           | 2            |
|    time_elapsed         | 236          |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0018331283 |
|    clip_fraction        | 0            |
|    clip_range           | 0.115        |
|    entropy_l