In [1]:
from scenic.simulators.gfootball import rl_interface
from stable_baselines3 import PPO
from scenic.simulators.gfootball.rl_interface import GFScenicEnv
import pretrain_template
#from gfootball_impala_cnn import GfootballImpalaCNN
import gym
from tqdm import tqdm
import numpy as np
import torch as th
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from stable_baselines3 import PPO, A2C, SAC, TD3
from stable_baselines3.common.evaluation import evaluate_policy

from torch.utils.data.dataset import Dataset, random_split
import os

from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.preprocessing import is_image_space
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from torch import nn
import torch as th
import torch
import os



In [2]:
class GfootballImpalaCNN(BaseFeaturesExtractor):
    """
    gfootball_impala_cnn is architecture used in the paper
    (https://arxiv.org/pdf/1907.11180.pdf).
    It is illustrated in the appendix. It is similar to Large architecture
    from IMPALA paper; we use 4 big blocks instead of 3 though.
    """

    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 256):
        super(GfootballImpalaCNN, self).__init__(observation_space, features_dim)
        # We assume CxHxW images (channels first)
        # Re-ordering will be done by pre-preprocessing or wrapper
        assert is_image_space(observation_space), (
            "You should use CNN only with images"
        )
        assert features_dim==256, "To replicate the same network"
        n_input_channels = observation_space.shape[0]


        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        print("device: ", self.device)

        self.conv_layers_config = [(16, 2), (32, 2), (32, 2), (32, 2)]
        self.pool = nn.MaxPool2d(kernel_size=3, stride=2)


        self.conv_blocks = [
            nn.Conv2d(in_channels=n_input_channels, out_channels=16, kernel_size=3, stride=1, padding=1),
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1, padding=1),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=1, padding=1)
        ]
            

        

        #if "cuda" in self.device.type:
        #    self.conv_blocks = [c.cuda() for c in self.conv_blocks]

        #https://www.tensorflow.org/api_docs/python/tf/nn/pool  -> If padding = "SAME": output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides[i])
        self.pools = [nn.MaxPool2d(kernel_size=3, stride=2, padding=1) for _ in range(4)]

        self.resblocks_1 = [
            self.create_basic_res_block(16, 16),
            self.create_basic_res_block(32, 32),
            self.create_basic_res_block(32, 32),
            self.create_basic_res_block(32, 32)
        ]
        self.resblocks_2 = [
            self.create_basic_res_block(16, 16),
            self.create_basic_res_block(32, 32),
            self.create_basic_res_block(32, 32),
            self.create_basic_res_block(32, 32)
        ]


        
        if "cuda" in self.device.type:
            self.conv_blocks = [c.cuda() for c in self.conv_blocks]
            self.resblocks_1 = [c.cuda() for c in self.resblocks_1]
            self.resblocks_2 = [c.cuda() for c in self.resblocks_2]

        self.relu = nn.ReLU()
        self.flatten = nn.Flatten()
        
        
        #hack so that all the weights are saved correctly, by default stable_baselines3 only saves weights all torch tensors
        #So it
        self.conv_0 = self.conv_blocks[0]
        self.conv_1 = self.conv_blocks[1]
        self.conv_2 = self.conv_blocks[2]
        self.conv_3 = self.conv_blocks[3]
        
        
        self.res_1_0 = self.resblocks_1[0]
        self.res_1_1 = self.resblocks_1[1]
        self.res_1_2 = self.resblocks_1[2]
        self.res_1_3 = self.resblocks_1[3]

        self.res_2_0 = self.resblocks_2[0]
        self.res_2_1 = self.resblocks_2[1]
        self.res_2_2 = self.resblocks_2[2]
        self.res_2_3 = self.resblocks_2[3]
        #################################################



        # Compute shape by doing one forward pass
        """
        with th.no_grad():
            n_flatten = self.feat_extract(
                th.as_tensor(observation_space.sample()[None]).float()
            )
            n_flatten = n_flatten.shape[1]
        """
        n_flatten = 960
        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim), nn.ReLU()) #n_flatten=960




    def create_basic_res_block(self, in_channel, out_channel):
        return nn.Sequential(
            nn.ReLU(),
            nn.Conv2d(in_channels=in_channel, out_channels=out_channel, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=out_channel, out_channels=out_channel, kernel_size=3, stride=1, padding=1),
        )

    def feat_extract(self, observations: th.Tensor) -> th.Tensor:
        #observations = observations.to(self.device)
        observations = observations.float()
        observations /= 255

        conv_out = observations
        for i in range(4):
            #print("", i)
            #print(" 1. conv_out.is_cuda() ", conv_out.is_cuda)
            #print("     conv block weight", self.conv_blocks[i].weight.is_cuda)
            conv_out = self.conv_blocks[i](conv_out)
            #print(" 2. conv_out.is_cuda() ", conv_out.is_cuda)
            conv_out = self.pools[i](conv_out)

            block_input = conv_out
            conv_out = self.resblocks_1[i](conv_out)
            conv_out += block_input

            block_input = conv_out
            conv_out = self.resblocks_2[i](conv_out)
            conv_out += block_input
            #print(" 3. conv_out.is_cuda() ", conv_out.is_cuda)

        #print(" before relu . conv_out.is_cuda() ", conv_out.is_cuda)
        conv_out = self.relu(conv_out)
        #print(" after relu . conv_out.is_cuda() ", conv_out.is_cuda)
        conv_out = self.flatten(conv_out)
        #print(" after flatten . conv_out.is_cuda() ", conv_out.is_cuda)
        return conv_out

    def forward(self, observations: th.Tensor) -> th.Tensor:
        conv_out = self.feat_extract(observations)
        conv_out = self.linear(conv_out)

        return conv_out
        



In [3]:

def get_weight_sum(model):
    """model is a PPO object"""
    
    tensors = [
        model.policy.features_extractor.conv_blocks[0].weight.sum().detach(),
        model.policy.features_extractor.conv_blocks[1].weight.sum().detach(),
        model.policy.features_extractor.conv_blocks[2].weight.sum().detach(),
        model.policy.features_extractor.conv_blocks[3].weight.sum().detach(),
        model.policy.features_extractor.resblocks_1[0][1].weight.sum().detach(),
        model.policy.features_extractor.resblocks_1[0][3].weight.sum().detach(),
        model.policy.features_extractor.resblocks_1[1][1].weight.sum().detach(),
        model.policy.features_extractor.resblocks_1[1][3].weight.sum().detach(),
        model.policy.features_extractor.resblocks_1[2][1].weight.sum().detach(),
        model.policy.features_extractor.resblocks_1[2][3].weight.sum().detach(),
        model.policy.features_extractor.resblocks_1[3][1].weight.sum().detach(),
        model.policy.features_extractor.resblocks_1[3][3].weight.sum().detach(),
        model.policy.features_extractor.resblocks_2[0][1].weight.sum().detach(),
        model.policy.features_extractor.resblocks_2[0][3].weight.sum().detach(),
        model.policy.features_extractor.resblocks_2[1][1].weight.sum().detach(),
        model.policy.features_extractor.resblocks_2[1][3].weight.sum().detach(),
        model.policy.features_extractor.resblocks_2[2][1].weight.sum().detach(),
        model.policy.features_extractor.resblocks_2[2][3].weight.sum().detach(),
        model.policy.features_extractor.resblocks_2[3][1].weight.sum().detach(),
        model.policy.features_extractor.resblocks_2[3][3].weight.sum().detach(),
    ]
    if tensors[0].is_cuda:
        saved_sums = [tens.cpu().numpy() for tens in tensors]
    else:
        saved_sums = [tens.numpy() for tens in tensors]

    return saved_sums

In [4]:
class ExpertDataSet(Dataset):
    def __init__(self, expert_observations, expert_actions):
        self.observations = expert_observations
        self.actions = expert_actions

    def __getitem__(self, index):
        return (self.observations[index], self.actions[index])

    def __len__(self):
        return len(self.observations)

In [5]:
def pretrain_agent(
        student,
        env,
        expert_dataset,
        batch_size=64,
        epochs=10,
        scheduler_gamma=0.7,
        learning_rate=1.0,
        log_interval=100,
        no_cuda=True,
        seed=1,
        test_batch_size=64,
):
    train_size = int(0.8 * len(expert_dataset))

    test_size = len(expert_dataset) - train_size

    train_expert_dataset, test_expert_dataset = random_split(
        expert_dataset, [train_size, test_size]
    )

    print("test_expert_dataset: ", len(test_expert_dataset))
    print("train_expert_dataset: ", len(train_expert_dataset))


    use_cuda = not no_cuda and th.cuda.is_available()
    th.manual_seed(seed)
    device = th.device("cuda" if use_cuda else "cpu")
    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}

    if isinstance(env.action_space, gym.spaces.Box):
        criterion = nn.MSELoss()
    else:
        criterion = nn.CrossEntropyLoss()

    # Extract initial policy
    model = student.policy.to(device)

    def train(model, device, train_loader, optimizer):
        model.train()

        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()

            if isinstance(env.action_space, gym.spaces.Box):
                # A2C/PPO policy outputs actions, values, log_prob
                # SAC/TD3 policy outputs actions only
                if isinstance(student, (A2C, PPO)):
                    action, _, _ = model(data)
                else:
                    # SAC/TD3:
                    action = model(data)
                action_prediction = action.double()
            else:
                # Retrieve the logits for A2C/PPO when using discrete actions
                latent_pi, _, _ = model._get_latent(data)
                logits = model.action_net(latent_pi)
                action_prediction = logits
                target = target.long()

            loss = criterion(action_prediction, target)
            loss.backward()
            optimizer.step()
            if batch_idx % log_interval == 0:
                print(
                    "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                        epoch,
                        batch_idx * len(data),
                        len(train_loader.dataset),
                        100.0 * batch_idx / len(train_loader),
                        loss.item(),
                    )
                )

    def test(model, device, test_loader):
        model.eval()
        test_loss = 0
        with th.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)

                if isinstance(env.action_space, gym.spaces.Box):
                    # A2C/PPO policy outputs actions, values, log_prob
                    # SAC/TD3 policy outputs actions only
                    if isinstance(student, (A2C, PPO)):
                        action, _, _ = model(data)
                    else:
                        # SAC/TD3:
                        action = model(data)
                    action_prediction = action.double()
                else:
                    # Retrieve the logits for A2C/PPO when using discrete actions
                    latent_pi, _, _ = model._get_latent(data)
                    logits = model.action_net(latent_pi)
                    action_prediction = logits
                    target = target.long()

                test_loss = criterion(action_prediction, target)
        test_loss /= len(test_loader.dataset)
        print(f"Test set: Average loss: {test_loss:.4f}")

    # Here, we use PyTorch `DataLoader` to our load previously created `ExpertDataset` for training
    # and testing
    train_loader = th.utils.data.DataLoader(
        dataset=train_expert_dataset, batch_size=batch_size, shuffle=True, **kwargs
    )
    test_loader = th.utils.data.DataLoader(
        dataset=test_expert_dataset, batch_size=test_batch_size, shuffle=True, **kwargs,
    )

    # Define an Optimizer and a learning rate schedule.
    optimizer = optim.Adadelta(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=1, gamma=scheduler_gamma)

    # Now we are finally ready to train the policy model.
    for epoch in range(1, epochs + 1):
        train(model, device, train_loader, optimizer)
        test(model, device, test_loader)
        scheduler.step()

    # Implant the trained policy network back into the RL student agent
    student.policy = model


In [6]:
def generate_expert_data(env, num_interactions=1000, save_file_name="expert_data"):

    expert_observations = []
    expert_actions = []

    obs = env.reset()

    for i in tqdm(range(num_interactions)):
        expert_observations.append(obs)

        obs, reward, done, info = env.step(env.action_space.sample())
        #print(info)
        action = info["action_taken"]
        expert_actions.append(action)

        if done:
            obs = env.reset()
    
    expert_observations = np.array(expert_observations)
    expert_observations = np.moveaxis(expert_observations, [3], [1])
    expert_actions = np.array(expert_actions)
    print("Expert observation shape: ", expert_observations.shape)
    print("Expert actions shape: ", expert_actions.shape)

    np.savez_compressed(
        save_file_name,
        expert_actions=expert_actions,
        expert_observations=expert_observations,
    )
    return expert_observations, expert_actions

In [7]:
def mean_perf_random_agent(env, num_trials=1):

    obs = env.reset()
    #env.render()
    num_epi = 0
    all_rewards = []
    from tqdm import tqdm
    for i in tqdm(range(0, num_trials)):

        done = False
        total_r = 0
        while not done:
            action = env.action_space.sample()
            obs, reward, done, info = env.step(action)
            #env.render()
            total_r+=reward
            if done:
                obs = env.reset()
                all_rewards.append(total_r)
                total_r=0
                num_epi +=1
                
    all_rewards = np.array(all_rewards)
    return np.mean(all_rewards), np.std(all_rewards)

In [8]:
def mean_perf_agent(agent, env, num_trials=5):
    
    #env.render()
    num_epi = 0
    all_rewards = []
    from tqdm import tqdm
    for i in tqdm(range(0, num_trials)):

        done = False
        total_r = 0
        obs = env.reset()
        while not done:
            action = agent.predict(obs, deterministic=True)[0]
            obs, reward, done, info = env.step(action)
            #env.render()
            total_r+=reward
            if done:
                all_rewards.append(total_r)
                num_epi +=1 
                
    all_rewards = np.array(all_rewards)
    return np.mean(all_rewards), np.std(all_rewards)

In [9]:
cwd = os.getcwd()
print("Current Directory:", cwd)
rewards = "scoring"
target_scenario_name = f"{cwd}/pretrain/run_to_score.scenic"



save_dir = f"{cwd}/pretrain_saved_models"
logdir = f"{cwd}/tboard/dev"
tracedir = f"{cwd}/game_trace"

Current Directory: /home/ubuntu/ScenicGFootBall/rl_training


In [10]:
#create target environment
gf_env_settings = {
        "stacked": True,
        "rewards": rewards,
        "representation": 'extracted',
        "players": [f"agent:left_players=1"],
        "real_time": False,
        "action_set": "default"
    }

from scenic.simulators.gfootball.utilities.scenic_helper import buildScenario
scenario = buildScenario(target_scenario_name)
target_env = GFScenicEnv(initial_scenario=scenario, gf_env_settings=gf_env_settings)

pygame 2.0.1 (SDL 2.0.14, Python 3.8.7)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [11]:
#generate expert data
num_interactions=20000
expert_data_file = f"pretrain/expert_data/expertdata_rts_{num_interactions}"


#create data_gen_environment
gf_env_settings = {
    "stacked": True,
    "rewards": 'scoring',
    "representation": 'extracted',
    "players": [f"agent:left_players=1"],
    "real_time": False,
    "action_set": "default",#"default" "v2"
}

datagen_scenario_file = f"{cwd}/pretrain/run_to_score_with_behave.scenic"
datagen_scenario = buildScenario(datagen_scenario_file)
from scenic.simulators.gfootball.rl_interface import GFScenicEnv

#import scenic.syntax.veneer as veneer
#veneer.reset()

datagen_env = GFScenicEnv(initial_scenario=datagen_scenario, gf_env_settings=gf_env_settings, use_scenic_behavior_in_step=True)
print("Mean Reward and STD of Scenic Behavior Agent", mean_perf_random_agent(datagen_env, num_trials=5))

expert_observations, expert_actions = generate_expert_data(datagen_env, num_interactions=num_interactions, save_file_name=expert_data_file) 



Environment will ignore actions passed to step() and take action provided by Scenic


100%|██████████| 5/5 [00:02<00:00,  2.19it/s]
  0%|          | 20/20000 [00:00<01:43, 193.43it/s]

Mean Reward and STD of Scenic Behavior Agent (1.0, 0.0)


100%|██████████| 20000/20000 [02:10<00:00, 152.73it/s]


Expert observation shape:  (20000, 16, 72, 96)
Expert actions shape:  (20000,)


Pretrain Agent

In [12]:
#Create the PPO object with required parameters

ppo_agent, parameter_dict = pretrain_template.get_model_and_params(
    env=target_env, ALGO=PPO, features_extractor_class = GfootballImpalaCNN, scenario_name=target_scenario_name,
    logdir=logdir, override_params={}, rewards=rewards)

print("env (from model) observation space: ", ppo_agent.get_env().observation_space)


Using scoring Parameters
Using cuda device
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
device:  cuda:0
env (from model) observation space:  Box(0, 255, (16, 72, 96), uint8)


In [13]:
#Load expert Data
saved_exp_data = f"pretrain/expert_data/expertdata_rts_{num_interactions}"
loaded_data = np.load(f"{saved_exp_data}.npz")
expert_observations = loaded_data["expert_observations"]
expert_actions = loaded_data["expert_actions"]
expert_dataset = ExpertDataSet(expert_observations, expert_actions)

print(f"Loaded data obs: {expert_observations.shape}, actions: {expert_actions.shape}")

Loaded data obs: (20000, 16, 72, 96), actions: (20000,)


In [14]:
pretrain_agent(
    student=ppo_agent,
    env=target_env,
    expert_dataset=expert_dataset,
    epochs=5
)


test_expert_dataset:  4000
train_expert_dataset:  16000
Test set: Average loss: 0.0003
Test set: Average loss: 0.0003
Test set: Average loss: 0.0003
Test set: Average loss: 0.0003
Test set: Average loss: 0.0003


In [15]:
checsum_pretrained = np.sum(get_weight_sum(ppo_agent))
print(f"Checksum of pretrained agent: {checsum_pretrained}")

Checksum of pretrained agent: -198.20127868652344


In [16]:
mean_performance_pretrained = mean_perf_agent(agent=ppo_agent, env=target_env, num_trials=20)
print(mean_performance_pretrained)


100%|██████████| 20/20 [00:36<00:00,  1.84s/it]

(0.0, 0.0)





In [17]:
#save pretrained agent
saved_pretrained_agent_file = f"pretrain/saved_agents/pretrained_PPO_rts_{num_interactions}"
ppo_agent.save(saved_pretrained_agent_file)
del ppo_agent

<h1>Run normal training</h1>

In [None]:
assert False

In [31]:
#Load PPO Agent
saved_pretrained_agent_file = f"pretrain/saved_agents/pretrained_PPO_rts_{20000}"
loaded_PPO = PPO.load(saved_pretrained_agent_file)
checksum_pretrain_loaded = np.sum(get_weight_sum(loaded_PPO))
print(f"Checksum of loaded pretrained agent: {checksum_pretrain_loaded}")

device:  cuda:0
Checksum of loaded pretrained agent: -198.2012939453125


In [None]:
mean_performance_pretrained = mean_perf_agent(agent=loaded_PPO, env=target_env, num_trials=20)
print("Pretrained Agent Performance: ", mean_performance_pretrained)

In [33]:
final_agent, parameter_dict = pretrain_template.get_model_and_params(
    env=target_env, ALGO=PPO, features_extractor_class = GfootballImpalaCNN, scenario_name=target_scenario_name,
    logdir=logdir, override_params={}, rewards=rewards)

#print("weight sum with default loading: ", np.sum(get_weight_sum(final_agent)))
final_agent.policy = loaded_PPO.policy
print("weight sum after initializing weight from saved policy: ", np.sum(get_weight_sum(final_agent)))
print("env (from model) observation space: ", final_agent.get_env().observation_space)

Using scoring Parameters
Using cuda device
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
device:  cuda:0
weight sum after initializing weight from saved policy:  -198.2013
env (from model) observation space:  Box(0, 255, (16, 72, 96), uint8)


In [21]:
mean_performance_warm = mean_perf_agent(agent=final_agent, env=target_env, num_trials=20)
print("Mean Performance Loaded Pretrained Agent", mean_performance_warm)

100%|██████████| 20/20 [00:30<00:00,  1.53s/it]

Mean Performance Loaded Pretrained Agent (0.0, 0.0)





In [34]:
total_training_timesteps = 5000

In [35]:
pretrain_template.train(model=final_agent, parameters=parameter_dict,
                            n_eval_episodes=10, total_training_timesteps=total_training_timesteps,
                            eval_freq=10000,
                            save_dir=save_dir, logdir=logdir, dump_info={"rewards": rewards})

Logging to /home/ubuntu/ScenicGFootBall/rl_training/tboard/dev/HM_0_55__DM_1_4_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 78.6     |
|    ep_rew_mean     | 0.68     |
| time/              |          |
|    fps             | 92       |
|    iterations      | 1        |
|    time_elapsed    | 22       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 78.4        |
|    ep_rew_mean          | 0.635       |
| time/                   |             |
|    fps                  | 71          |
|    iterations           | 2           |
|    time_elapsed         | 57          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.002079396 |
|    clip_fraction        | 0.0297      |
|    clip_range           | 0.115       |
|    entropy_loss         | -1.23

In [24]:
mean_performance_warm = mean_perf_agent(agent=final_agent, env=target_env, num_trials=20)
print(mean_performance_warm)

100%|██████████| 20/20 [00:35<00:00,  1.78s/it]

(0.0, 0.0)





In [36]:
print("weight sum after initializing weight from saved policy: ", np.sum(get_weight_sum(final_agent)))

weight sum after initializing weight from saved policy:  -177.51831


In [37]:
#save final agent
saved_final_agent_file = f"pretrain/saved_agents/final_PPO_rts_{total_training_timesteps}"
final_agent.save(saved_final_agent_file)
del final_agent

In [40]:
#Load RL Model
loaded_final = PPO.load(f"pretrain/saved_agents/final_PPO_rts_{total_training_timesteps}")

device:  cuda:0


In [42]:
print("weight sum after initializing weight from saved policy: ", np.sum(get_weight_sum(loaded_final)))
mean_performance_warm = mean_perf_agent(agent=loaded_final, env=target_env, num_trials=20)
print(mean_performance_warm)

  0%|          | 0/20 [00:00<?, ?it/s]

weight sum after initializing weight from saved policy:  -177.51831


100%|██████████| 20/20 [00:27<00:00,  1.35s/it]

(0.0, 0.0)



