In [52]:
from scenic.simulators.gfootball import rl_interface
from stable_baselines3 import PPO
from scenic.simulators.gfootball.rl_interface import GFScenicEnv
import pretrain_template
import gym
from tqdm import tqdm
import numpy as np
import torch as th
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from stable_baselines3 import PPO, A2C, SAC, TD3
from stable_baselines3.common.evaluation import evaluate_policy
from torch.utils.data.dataset import Dataset, random_split
import os
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.preprocessing import is_image_space
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from torch import nn
import torch as th
import torch
import os
from pretrain.helper import *

In [96]:
def pretrain_agent(
        student,
        env,
        expert_dataset,
        batch_size=64,
        epochs=10,
        scheduler_gamma=0.7,
        learning_rate=1.0,
        log_interval=100,
        no_cuda=True,
        seed=1,
        test_batch_size=64,
):
    train_size = int(0.8 * len(expert_dataset))

    test_size = len(expert_dataset) - train_size

    train_expert_dataset, test_expert_dataset = random_split(
        expert_dataset, [train_size, test_size]
    )

    print("test_expert_dataset: ", len(test_expert_dataset))
    print("train_expert_dataset: ", len(train_expert_dataset))


    use_cuda = th.cuda.is_available()
    th.manual_seed(seed)
    device = th.device("cuda" if use_cuda else "cpu")
    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}

    if isinstance(env.action_space, gym.spaces.Box):
        criterion = nn.MSELoss()
    else:
        criterion = nn.CrossEntropyLoss()

    # Extract initial policy
    model = student.policy.to(device)

    def train(model, device, train_loader, optimizer):
        model.train()

        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()

            if isinstance(env.action_space, gym.spaces.Box):
                # A2C/PPO policy outputs actions, values, log_prob
                # SAC/TD3 policy outputs actions only
                if isinstance(student, (A2C, PPO)):
                    action, _, _ = model(data)
                else:
                    # SAC/TD3:
                    action = model(data)
                action_prediction = action.double()
            else:
                # Retrieve the logits for A2C/PPO when using discrete actions
                latent_pi, _, _ = model._get_latent(data)
                logits = model.action_net(latent_pi)
                action_prediction = logits
                target = target.long()

            loss = criterion(action_prediction, target)
            loss.backward()
            optimizer.step()
            if batch_idx % log_interval == 0:
                print(
                    "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                        epoch,
                        batch_idx * len(data),
                        len(train_loader.dataset),
                        100.0 * batch_idx / len(train_loader),
                        loss.item(),
                    )
                )

    def test(model, device, test_loader):
        model.eval()
        test_loss = 0
        with th.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)

                if isinstance(env.action_space, gym.spaces.Box):
                    # A2C/PPO policy outputs actions, values, log_prob
                    # SAC/TD3 policy outputs actions only
                    if isinstance(student, (A2C, PPO)):
                        action, _, _ = model(data)
                    else:
                        # SAC/TD3:
                        action = model(data)
                    action_prediction = action.double()
                else:
                    # Retrieve the logits for A2C/PPO when using discrete actions
                    latent_pi, _, _ = model._get_latent(data)
                    logits = model.action_net(latent_pi)
                    action_prediction = logits
                    target = target.long()

                test_loss = criterion(action_prediction, target)
        test_loss /= len(test_loader.dataset)
        print(f"Test set: Average loss: {test_loss:.4f}")
        return test_loss

    # Here, we use PyTorch `DataLoader` to our load previously created `ExpertDataset` for training
    # and testing
    train_loader = th.utils.data.DataLoader(
        dataset=train_expert_dataset, batch_size=batch_size, shuffle=True, **kwargs
    )
    test_loader = th.utils.data.DataLoader(
        dataset=test_expert_dataset, batch_size=test_batch_size, shuffle=True, **kwargs,
    )

    # Define an Optimizer and a learning rate schedule.
    #1.
    #optimizer = optim.Adadelta(model.parameters(), lr=learning_rate)
    #scheduler = StepLR(optimizer, step_size=1, gamma=scheduler_gamma)
    
    optimizer = optim.Adam(model.parameters())
    
    
    cons_lim = 2
    cons=0
    eps = 1e-5
    # Now we are finally ready to train the policy model.
    for epoch in range(1, epochs + 1):
        train(model, device, train_loader, optimizer)
        test_loss = test(model, device, test_loader)
        
        if test_loss < eps:
            cons+=1 
            if cons==cons_lim:
                break
        else:
            cons=0
            
        #scheduler.step()

    # Implant the trained policy network back into the RL student agent
    print(f"Trained for {epoch} epochs. Test Loss: {test_loss}")
    student.policy = model

In [97]:
def mean_perf_agent(agent, env, num_trials=5):
    
    #env.render()
    num_epi = 0
    all_rewards = []
    from tqdm import tqdm
    for i in tqdm(range(0, num_trials)):

        done = False
        total_r = 0
        obs = env.reset()
        while not done:
            action = agent.predict(obs, deterministic=True)[0]
            obs, reward, done, info = env.step(action)
            #env.render()
            total_r+=reward
            if done:
                all_rewards.append(total_r)
                num_epi +=1 
                
    all_rewards = np.array(all_rewards)
    return np.mean(all_rewards), np.std(all_rewards), all_rewards

In [98]:
cwd = os.getcwd()
print("Current Directory:", cwd)
rewards = "scoring"
#target_scenario_name = f"{cwd}/pretrain/run_to_score.scenic"
target_scenario_name = f"{cwd}/pretrain/pass_n_shoot.scenic"

save_dir = f"{cwd}/pretrain/saved_models_hp"
logdir = f"{cwd}/tboard/dev/pretrain"
tracedir = f"{cwd}/game_trace"

Current Directory: /home/ubuntu/ScenicGFootBall/rl_training


In [99]:
#create target environment
gf_env_settings = {
        "stacked": True,
        "rewards": rewards,
        "representation": 'extracted',
        "players": [f"agent:left_players=1"],
        "real_time": False,
        "action_set": "default"
    }

from scenic.simulators.gfootball.utilities.scenic_helper import buildScenario
scenario = buildScenario(target_scenario_name)
target_env = GFScenicEnv(initial_scenario=scenario, gf_env_settings=gf_env_settings)

In [100]:
#Load Expert Data
num_interactions = 2500
saved_exp_data = f"pretrain/expert_data/pass_n_shoot_{num_interactions}"
loaded_data = np.load(f"{saved_exp_data}.npz")
expert_observations = loaded_data["expert_observations"]
expert_actions = loaded_data["expert_actions"]
expert_rewards = loaded_data["expert_rewards"]
expert_dataset = ExpertDataSet(expert_observations, expert_actions)

print(f"Loaded data obs: {expert_observations.shape}, actions: {expert_actions.shape}")


Loaded data obs: (2500, 16, 72, 96), actions: (2500,)


In [101]:
print(f"Expert data mean rewards(std): {np.mean(expert_rewards)}({np.std(expert_rewards)})")
print(f"Total {expert_rewards.shape[0]} Trajectories of mean length: {expert_actions.shape[0]/expert_rewards.shape[0]}")

Expert data mean rewards(std): 0.546875(0.4977978850648122)
Total 64 Trajectories of mean length: 39.0625


In [102]:
ppo_agent = PPO("CnnPolicy", target_env, verbose=1)
#ppo_agent.policy

n_epochs = 50
pretrain_agent(
    student=ppo_agent,
    env=target_env,
    expert_dataset=expert_dataset,
    epochs=n_epochs
)


Using cuda device
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
test_expert_dataset:  500
train_expert_dataset:  2000
Test set: Average loss: 0.0027
Test set: Average loss: 0.0019
Test set: Average loss: 0.0004
Test set: Average loss: 0.0006
Test set: Average loss: 0.0004
Test set: Average loss: 0.0003
Test set: Average loss: 0.0002
Test set: Average loss: 0.0003
Test set: Average loss: 0.0001
Test set: Average loss: 0.0001
Test set: Average loss: 0.0001
Test set: Average loss: 0.0000
Test set: Average loss: 0.0001
Test set: Average loss: 0.0004
Test set: Average loss: 0.0004
Test set: Average loss: 0.0000
Test set: Average loss: 0.0003
Test set: Average loss: 0.0001
Test set: Average loss: 0.0004
Test set: Average loss: 0.0002
Test set: Average loss: 0.0000
Test set: Average loss: 0.0004
Test set: Average loss: 0.0003
Test set: Average loss: 0.0002
Test set: Average loss: 0.0003
Test set: Average loss: 0.0001
Test set: Average loss: 0.0002
Test set: Avera

In [103]:

mean_performance_pretrained = mean_perf_agent(agent=ppo_agent, env=target_env, num_trials=20)
print(mean_performance_pretrained)

100%|██████████| 20/20 [00:06<00:00,  3.01it/s]

(0.5, 0.5, array([0., 1., 1., 1., 0., 1., 1., 0., 0., 1., 1., 1., 0., 1., 0., 0., 0.,
       1., 0., 0.]))





In [104]:
ppo_agent.save(f"cnn_adam_pass_n_shoot_{num_interactions}_{n_epochs}")
del ppo_agent

In [105]:
loaded_agent = PPO.load(f"cnn_adam_pass_n_shoot_{num_interactions}_{n_epochs}")

In [106]:
mean_performance_pretrained = mean_perf_agent(agent=loaded_agent, env=target_env, num_trials=100)
print(mean_performance_pretrained)

100%|██████████| 100/100 [00:34<00:00,  2.92it/s]

(0.45, 0.49749371855331004, array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 1., 1.,
       1., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0.,
       1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 1., 1., 0., 1., 1.,
       1., 0., 1., 1., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 1.,
       1., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0.,
       1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0.]))



