In [1]:
import os
import random
import time
from dataclasses import dataclass

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import tyro
from torch.distributions.categorical import Categorical
from tqdm import tqdm
import torchvision.transforms as transforms
import cv2
import sklearn
import pandas as pd

In [2]:
def make_env(env_id, idx, capture_video, run_name):
    def thunk():
        if capture_video and idx == 0:
            env = gym.make(env_id, render_mode="rgb_array")
            env = gym.wrappers.RecordVideo(env, f"videos/{run_name}")
        else:
            env = gym.make(env_id)
        env = gym.wrappers.RecordEpisodeStatistics(env)
        return env

    return thunk

In [3]:
# initializing layers with better starting weights for training. 
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer

In [4]:
class Agent(nn.Module):
    def __init__(self, envs):
        
        self.mult = torch.tensor([512,384])
        self.mult = self.mult.to(device)
        super().__init__()
        self.critic = nn.Sequential(
            layer_init(nn.Conv2d(in_channels=4, out_channels=64, kernel_size=4)),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            layer_init(nn.Conv2d(in_channels=64, out_channels=64, kernel_size=4)),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Flatten(1),
            layer_init(nn.Linear(378, 64)),
            nn.Sigmoid(),
            layer_init(nn.Linear(64, 64)),
            nn.Sigmoid(),
            layer_init(nn.Linear(64, 1), std=1.0),
        )
        self.actor = nn.Sequential(
            layer_init(nn.Conv2d(in_channels=4, out_channels=64, kernel_size=4)),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            layer_init(nn.Conv2d(in_channels=64, out_channels=64, kernel_size=4)),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Flatten(1),
            layer_init(nn.Linear(378, 64)),
            nn.Sigmoid(),
            layer_init(nn.Linear(64, 64)),
            nn.Sigmoid(),
            layer_init(nn.Linear(64, 2), std=0.01),
        )
        self.logstd = nn.Parameter(torch.zeros(1, np.prod(envs.single_action_space.shape)))

    def get_value(self, x):
        return self.critic(x)

    def get_action_and_value(self, x, action=None):
        means = self.actor(x)
        logstd = self.logstd.expand_as(means)
        std = torch.exp(logstd)
        probs = torch.distributions.Normal(means, std)
        if action is None:
            action = probs.sample()
        return action, probs.log_prob(action).sum(1), probs.entropy().sum(1), self.critic(x)

In [5]:
class OsuEnv(gym.Env):
    
    def __init__(self, data):
        self.data = data
        self.misses = 0
        self.count = 0
        self.single_action_space = gym.spaces.Box(0, 512, [2])
        self.single_observation_space = gym.spaces.Box(0, 1, [4, 68, 120])
        
    def _get_obs(self):
        return self.data['data'].iloc[self.count]
    
    def _get_info(self):
        return self.misses, self.count
    
    def reset(self):
        # reset env, so set misses to 0 and counter back to 0
        
        self.misses = 0
        self.count = 0

        observation = self._get_obs()
        info = self._get_info()

        return observation, info
    
    def step(self, action):
        # reward is just a recalced loss. 
        # the maximium possible score is 438400, so subtract from 1/2 so close values are positive and far are negative. 
        gt = np.array(self.data['labels'].iloc[self.count])
        reward = 219200 - np.linalg.norm(action-gt)
        terminated = False
        # An environment is completed if and only if the agent has finished the map or dies
        if self.misses == 5:
            terminated = True
            # dying is bad
            reward += -10000
        if self.count == self.data.shape[0]:
            terminated = True
            # finishing is good. 
            reward += 10000
        # i never truncate
        truncated = False
        observation = self._get_obs()
        info = self._get_info()
        self.count += 1

        return observation, reward, terminated, truncated, info
    

interpret arguments and setup wandb writer (online dashboard for monitoring). 

In [6]:
# This was a dataclass I found from clean RL and organizes all the values in a convenient way. 
# the code required significant debugging to function in jupyter notebook. 
@dataclass
class Args:
    exp_name: str = 'OsuPPO'
    """the name of this experiment"""
    seed: int = 39
    """seed of the experiment"""
    torch_deterministic: bool = True
    """if toggled, `torch.backends.cudnn.deterministic=False`"""
    cuda: bool = True
    """if toggled, cuda will be enabled by default"""
    track: bool = False
    """if toggled, this experiment will be tracked with Weights and Biases"""
    wandb_project_name: str = "cleanRL"
    """the wandb's project name"""
    wandb_entity: str = None
    """the entity (team) of wandb's project"""
    capture_video: bool = True
    """whether to capture videos of the agent performances (check out `videos` folder)"""

    # Algorithm specific arguments
    env_id: str = "LunarLander-v3"
    """the id of the environment"""
    total_timesteps: int = 4000
    """total timesteps of the experiments"""
    learning_rate: float = 2.5e-4
    """the learning rate of the optimizer"""
    num_envs: int = 4
    """the number of parallel game environments"""
    num_steps: int = 128
    """the number of steps to run in each environment per policy rollout"""
    anneal_lr: bool = True
    """Toggle learning rate annealing for policy and value networks"""
    gamma: float = 0.99
    """the discount factor gamma"""
    gae_lambda: float = 0.95
    """the lambda for the general advantage estimation"""
    num_minibatches: int = 4
    """the number of mini-batches"""
    update_epochs: int = 4
    """the K epochs to update the policy"""
    norm_adv: bool = True
    """Toggles advantages normalization"""
    clip_coef: float = 0.2
    """the surrogate clipping coefficient"""
    clip_vloss: bool = True
    """Toggles whether or not to use a clipped loss for the value function, as per the paper."""
    ent_coef: float = 0.01
    """coefficient of the entropy"""
    vf_coef: float = 0.5
    """coefficient of the value function"""
    max_grad_norm: float = 0.5
    """the maximum norm for the gradient clipping"""
    target_kl: float = None
    """the target KL divergence threshold"""

    # to be filled in runtime
    batch_size: int = 0
    """the batch size (computed in runtime)"""
    minibatch_size: int = 0
    """the mini-batch size (computed in runtime)"""
    num_iterations: int = 0
    """the number of iterations (computed in runtime)"""


def make_env(env_id, idx, capture_video, run_name):
    def thunk():
        if capture_video and idx == 0:
            env = gym.make(env_id, render_mode="rgb_array", continuous = False)
            env = gym.wrappers.RecordVideo(env, f"videos/{run_name}")
        else:
            env = gym.make(env_id, render_mode="rgb_array", continuous = False)
        env = gym.wrappers.RecordEpisodeStatistics(env)
        return env

    return thunk
args = Args()
args.batch_size = int(args.num_envs * args.num_steps)
args.minibatch_size = int(args.batch_size // args.num_minibatches)
args.num_iterations = args.total_timesteps // args.batch_size
run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}"
if args.track:
    import wandb

    wandb.init(
        project=args.wandb_project_name,
        entity=args.wandb_entity,
        sync_tensorboard=True,
        config=vars(args),
        name=run_name,
        monitor_gym=True,
        save_code=True,
    )



In [7]:
# compile dataset.
dataset_path = "C:/Users/Yile0/PycharmProjects/osutime/map1_data.csv"
data = pd.read_csv(dataset_path)

print(data.head())
print(data.columns)
# small data for changing, basically just for trialing new changes.
small_data = sklearn.utils.resample(data, n_samples= 1000)

# frame 4 is the latest/ most recent.

#originally these were one piece, changed for the dataloader to function
processed_data = []
processed_labels = []


def process_img(paths):
    # I had another self-made thing here that I decided to replace with premade functions
    images = []
    transform = transforms.Compose([transforms.ToTensor()])
    for path in paths:
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        # 100 by 75 because slightly better quality
        # over 80 by 60
        img = cv2.resize(img, (120, 68), interpolation=cv2.INTER_AREA)
        img_normalized = cv2.normalize(img, None, 0, 1.0,
                                       cv2.NORM_MINMAX, dtype=cv2.CV_32F)
        images.append(transform(img_normalized))
    out = torch.stack(images)
    # i played with trying to reshape to 120 16, but I came back
    # to this resolution because it was just so much better and faster
    # with regards to my training speed.
    out = out.reshape(len(paths),68,120)
    # played with preloading here and loading later, seems like loading later is better.
    # img = torch.from_numpy(img)
    return out


for index, row in tqdm(data.iterrows()):
    # trying without normalization.
    processed_labels.append([row['x'], row['y']])
    # try to predict just on 1 frame for testing
    processed_data.append(process_img([row['frame 4'],row['frame 3'],row['frame 2'],row['frame 1']]))
    
df = pd.DataFrame({'data':processed_data,
                   'labels': processed_labels})

          x         y                                            frame 4  \
0  253.3333  256.4445  C:/Users/Yile0/PycharmProjects/osutime/frames/...   
1  253.3333  256.0000  C:/Users/Yile0/PycharmProjects/osutime/frames/...   
2  252.8889  256.0000  C:/Users/Yile0/PycharmProjects/osutime/frames/...   
3  252.8889  256.0000  C:/Users/Yile0/PycharmProjects/osutime/frames/...   
4  252.8889  256.0000  C:/Users/Yile0/PycharmProjects/osutime/frames/...   

                                             frame 3  \
0  C:/Users/Yile0/PycharmProjects/osutime/frames/...   
1  C:/Users/Yile0/PycharmProjects/osutime/frames/...   
2  C:/Users/Yile0/PycharmProjects/osutime/frames/...   
3  C:/Users/Yile0/PycharmProjects/osutime/frames/...   
4  C:/Users/Yile0/PycharmProjects/osutime/frames/...   

                                             frame 2  \
0  C:/Users/Yile0/PycharmProjects/osutime/frames/...   
1  C:/Users/Yile0/PycharmProjects/osutime/frames/...   
2  C:/Users/Yile0/PycharmProjects/osut

8930it [02:30, 59.34it/s]


In [8]:
# Seeding
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic

device = torch.device("cuda:1" if torch.cuda.is_available() and args.cuda else "cpu")

# env setup
env = OsuEnv(df)

agent = Agent(env).to(device)
optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5)

# ALGO Logic: Storage setup
obs = torch.zeros((args.num_steps, args.num_envs) + env.single_observation_space.shape).to(device)
actions = torch.zeros((args.num_steps, 64,2)).to(device)
logprobs = torch.zeros((args.num_steps, 64)).to(device)
rewards = torch.zeros((args.num_steps, 64)).to(device)
dones = torch.zeros((args.num_steps, 1)).to(device)
values = torch.zeros((args.num_steps, 64)).to(device)

# TRY NOT TO MODIFY: start the game
global_step = 0
start_time = time.time()
next_obs, _ = env.reset()
next_obs = torch.Tensor(next_obs).to(device)
next_done = torch.zeros((1)).to(device)

Training 

In [9]:
for iteration in range(1, args.num_iterations + 1):
        # Annealing the rate if instructed to do so.
        if args.anneal_lr:
            frac = 1.0 - (iteration - 1.0) / args.num_iterations
            lrnow = frac * args.learning_rate
            optimizer.param_groups[0]["lr"] = lrnow

        for step in range(0, args.num_steps):
            global_step += args.num_envs
            obs[step] = next_obs
            dones[step] = next_done

            # ALGO LOGIC: action logic
            with torch.no_grad():
                action, logprob, _, value = agent.get_action_and_value(next_obs)
                values[step] = value.flatten()
            actions[step] = action
            logprobs[step] = logprob

            # TRY NOT TO MODIFY: execute the game and log data.
            next_obs, reward, terminations, truncations, infos = env.step(action.cpu().numpy())
            next_done = 1 if terminations else 0
            next_done = torch.tensor(next_done)
            rewards[step] = torch.tensor(reward).to(device).view(-1)
            next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(next_done).to(device)
            if "final_info" in infos:
                for info in infos["final_info"]:
                    if info and "episode" in info:
                        print(f"global_step={global_step}, episodic_return={info['episode']['r']}")

        # bootstrap value if not done
        with torch.no_grad():
            next_value = agent.get_value(next_obs).reshape(1, -1)
            advantages = torch.zeros_like(rewards).to(device)
            lastgaelam = 0
            for t in reversed(range(args.num_steps)):
                if t == args.num_steps - 1:
                    nextnonterminal = (1.0 - next_done).item()
                    nextvalues = next_value
                else:
                    nextnonterminal = (1.0 - dones[t + 1]).item()
                    nextvalues = values[t + 1]
                print(nextvalues.shape)
                print(values[t].shape)
                print(nextvalues - values[t])
                print(rewards[t].shape)
                delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
                advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam
            returns = advantages + values

        # flatten the batch
        b_obs = obs.reshape((-1,) + env.single_observation_space.shape)
        b_logprobs = logprobs.reshape(-1)
        b_actions = actions.reshape((-1,) + env.single_action_space.shape)
        b_advantages = advantages.reshape(-1)
        b_returns = returns.reshape(-1)
        b_values = values.reshape(-1)

        # Optimizing the policy and value network
        b_inds = np.arange(args.batch_size)
        clipfracs = []
        for epoch in range(args.update_epochs):
            np.random.shuffle(b_inds)
            for start in range(0, args.batch_size, args.minibatch_size):
                end = start + args.minibatch_size
                mb_inds = b_inds[start:end]

                _, newlogprob, entropy, newvalue = agent.get_action_and_value(b_obs[mb_inds], b_actions[mb_inds])
                logratio = newlogprob - b_logprobs[mb_inds]
                ratio = logratio.exp()

                with torch.no_grad():
                    # calculate approx_kl http://joschu.net/blog/kl-approx.html
                    old_approx_kl = (-logratio).mean()
                    approx_kl = ((ratio - 1) - logratio).mean()
                    clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]

                mb_advantages = b_advantages[mb_inds]
                if args.norm_adv:
                    mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8)

                # Policy loss
                pg_loss1 = -mb_advantages * ratio
                pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - args.clip_coef, 1 + args.clip_coef)
                pg_loss = torch.max(pg_loss1, pg_loss2).mean()

                # Value loss
                newvalue = newvalue.view(-1)
                if args.clip_vloss:
                    v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
                    v_clipped = b_values[mb_inds] + torch.clamp(
                        newvalue - b_values[mb_inds],
                        -args.clip_coef,
                        args.clip_coef,
                    )
                    v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
                    v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                    v_loss = 0.5 * v_loss_max.mean()
                else:
                    v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()

                entropy_loss = entropy.mean()
                loss = pg_loss - args.ent_coef * entropy_loss + v_loss * args.vf_coef

                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
                optimizer.step()

            if args.target_kl is not None and approx_kl > args.target_kl:
                break

        y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
        var_y = np.var(y_true)
        explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y  


torch.Size([1, 64])
torch.Size([64])
tensor([[ 4.0267e-03, -1.5619e-02,  6.2262e-03, -2.2824e-03,  9.6192e-03,
         -4.4835e-04, -7.4052e-03, -2.6205e-03,  9.0958e-04,  1.1818e-02,
          9.8451e-04, -7.3536e-04,  4.8540e-03, -1.7064e-03, -1.2626e-02,
          1.5895e-02,  1.8678e-03,  7.0287e-03, -1.0684e-02,  6.0613e-03,
         -5.4848e-03,  1.8587e-03,  2.5119e-04, -4.7136e-03,  2.8132e-04,
          5.7146e-03, -1.3537e-02,  2.0213e-05, -6.7881e-04, -1.0238e-02,
          3.8046e-03, -2.6544e-03, -2.0081e-03,  1.6871e-03,  1.5294e-04,
          1.7695e-03, -5.8654e-03, -5.5060e-06, -5.7167e-03, -1.3752e-03,
         -1.5497e-02,  1.6035e-04, -2.2808e-03, -9.1036e-03, -3.0328e-03,
          1.1242e-02,  2.7449e-04, -1.4126e-03, -2.3052e-04,  5.1909e-03,
         -2.2349e-03, -3.0704e-04,  5.0258e-04,  2.5097e-03,  8.1580e-04,
         -2.9919e-03,  3.6722e-04,  2.2529e-03, -2.4678e-03, -5.6951e-03,
         -1.2553e-02,  3.6332e-03,  9.9390e-04, -1.1203e-02]], device='cuda

RuntimeError: mat1 and mat2 shapes cannot be multiplied (128x24192 and 378x64)

closing things out. 

In [None]:
 if args.save_model:
    model_path = f"runs/{run_name}/{args.exp_name}.cleanrl_model"
    torch.save(agent.state_dict(), model_path)
    print(f"model saved to {model_path}")

In [None]:
env.close()