# Meta-World+SAC and related scripts
- many of these cells were standalone scripts, that I am placing in notebook for convenience

## Example SB3

In [None]:
import gymnasium as gym

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

# Parallel environments
vec_env = make_vec_env("CartPole-v1", n_envs=4)

model = PPO("MlpPolicy", vec_env, verbose=1)
model.learn(total_timesteps=25000)
model.save("ppo_cartpole")

del model # remove to demonstrate saving and loading

model = PPO.load("ppo_cartpole")

obs = vec_env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = vec_env.step(action)
    vec_env.render("human")

## Generating Semantic Embeddings 

In [15]:
all_env_mt10=['reach-v2', 'push-v2','pick-place-v2', 'door-open-v2', 'drawer-open-v2','drawer-close-v2','button-press-topdown-v2','peg-insert-side-v2','window-open-v2','window-close-v2']
chat_phrase_mt10={'reach-v2':'Reach target with robotic arm', 'push-v2': 'Push object to target position', 'pick-place-v2':'Pick and place object precisely', 'door-open-v2':'Open door by rotating handle','drawer-open-v2':'Open drawer by pulling outward','drawer-close-v2':'Close drawer by pushing backward', 'button-press-topdown-v2':'Press button top-down until depressed','peg-insert-side-v2':' Insert peg side into slot', 'window-open-v2':'Open window by sliding panel','window-close-v2':'Close window by sliding panel'}
for key in chat_phrase_mt10:
    print(key in all_env_mt10)

True
True
True
True
True
True
True
True
True
True


In [None]:
# Pasting one-hot PCA embeddings from other script...
import numpy as np
contextArray = np.array([
    [ 2.55721937e+00,  2.31504669e+00,  1.53086022e+00,  3.67148980e+00,
     -7.59686234e-01, -7.61520546e-01,  7.11087296e-01, -2.76179305e-01,
     -9.09933731e-03, -4.92187850e-16],
    [ 3.19798408e+00,  2.29742495e-01,  1.50240787e+00, -7.02709710e-01,
     -1.11196737e-01,  2.74924940e+00, -1.53620950e+00, -1.18464051e-01,
      3.12674877e-02, -4.92187850e-16],
    [ 3.28570139e+00,  1.92692870e+00,  2.16563078e-01, -3.17126572e+00,
      1.22878090e+00, -1.05485421e+00,  1.25747231e+00,  3.66652176e-01,
      2.40985359e-02, -4.92187850e-16],
    [-2.08496350e+00,  4.64285575e-01,  5.41231677e-01,  6.38912049e-01,
      2.08218306e+00, -1.46809404e+00, -2.13341308e+00,  7.39046881e-01,
     -2.00950098e-01, -4.92187850e-16],
    [ 4.27420596e-01, -8.70185256e-01, -2.86820046e+00,  3.71623910e-01,
      1.72775859e+00, -2.57396544e-02, -3.41897534e-02, -1.64198974e+00,
     -1.41869863e-01, -4.92187850e-16],
    [ 1.15915385e-01, -1.24388372e+00, -2.73867371e+00,  1.45164344e+00,
      5.18528498e-01,  1.21674038e+00,  9.57762804e-01,  1.37271833e+00,
      3.60078564e-01, -4.92187850e-16],
    [ 1.14941507e+00, -4.93749060e+00,  2.04079053e+00, -2.26013967e-01,
     -7.33097844e-01, -1.07109692e+00,  3.29521154e-01, -4.60878162e-02,
     -9.44210158e-03, -4.92187850e-16],
    [-1.88857968e-01,  8.08267667e-01, -2.38279611e+00, -1.02331244e+00,
     -3.41088830e+00, -9.48025784e-01, -1.05931369e+00,  2.31463462e-02,
      9.69085706e-02, -4.92187850e-16],
    [-4.71406517e+00,  8.02744073e-01,  1.52039680e+00, -5.83708654e-01,
      3.05733353e-02,  4.24301726e-01,  5.31541815e-01, -4.67244928e-01,
      1.12817178e+00, -4.92187850e-16],
    [-3.74576926e+00,  5.04544376e-01,  6.37420105e-01, -4.26658712e-01,
     -5.72955276e-01,  9.39039647e-01,  9.75740641e-01,  4.84021130e-02,
     -1.27916354e+00, -4.92187850e-16]
], dtype=float)

## MT-SAC with one-hot embeddings!
 - adjust timesteps, algorithmic hyperparameters accordingly (they are hardcoded rn)

In [None]:
import random
import numpy as np
import metaworld
import gymnasium as gym
from gymnasium import ObservationWrapper, spaces
from stable_baselines3 import SAC
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
import torch.nn as nn
from stable_baselines3.common.vec_env import VecNormalize


#have to write this to control env creation in multi-task setting
class OneHotTaskWrapper(ObservationWrapper):
    def __init__(self, env, env_idx, all_tasks, n_envs):
        super().__init__(env)
        self.env_idx= env_idx
        self.all_tasks= all_tasks
        self.n_envs=n_envs

        self.n_configs= len(all_tasks) // n_envs

        orig_low,orig_high=env.observation_space.low,env.observation_space.high
        low=np.concatenate([orig_low,np.zeros(n_envs,dtype=np.float32)])
        high=np.concatenate([orig_high,np.ones(n_envs,dtype=np.float32)])
        self.observation_space=spaces.Box(low=low,high=high,dtype=np.float32)

#sample a task and config within task upon each env restart. task is random
#can also restrict range of new tasks (<n for MTn) if you'd like
    def reset(self, **kwargs):
        cfg_idx=random.randrange(self.n_configs)
        task=self.all_tasks[self.env_idx*self.n_configs+cfg_idx]
        self.env.unwrapped.set_task(task)
        obs,info=self.env.reset(**kwargs)
        return self._append_one_hot(obs),info

    def observation(self,obs):
        return self._append_one_hot(obs)

    def _append_one_hot(self, obs):
        one_hot = np.zeros(self.n_envs,dtype=np.float32)
        one_hot[self.env_idx]=1.0
        return np.concatenate([obs, one_hot],axis=-1)

#wrappers 
def make_mt10_env(env_idx: int, seed: int = None):
    def _init():
        ml10=metaworld.MT10()
        all_tasks= ml10.train_tasks  
        n_envs=len(ml10.train_classes)   

        block_size=len(all_tasks)//n_envs
        env_name=all_tasks[env_idx * block_size].env_name
        raw_env=ml10.train_classes[env_name]()
        if seed is not None:
            raw_env.seed(seed)
        monitored=Monitor(raw_env)
        wrapped=OneHotTaskWrapper(monitored,env_idx,all_tasks,n_envs)
        return wrapped

    return _init

#calling wrappers for env creation
n_envs=10
env_fns=[make_mt10_env(i, seed=i) for i in range(n_envs)]
vec_env=DummyVecEnv(env_fns)
vec_env=VecNormalize(vec_env, norm_obs=True, norm_reward=True)


## SAC!!!!!

# Very confident hyperparameters could be better!

policy_kwargs = dict(net_arch=[256, 256], activation_fn=nn.ReLU)
model = SAC(
    "MlpPolicy",
    vec_env,
    verbose=1,
    tensorboard_log="./sac_mt10_1Hot_norm/", #TODO: organize logging process
    learning_rate=3e-4,
    buffer_size=100_000,
    gamma=0.99,
    batch_size=500,
    train_freq=(1, "step"),
    gradient_steps=1,
    tau=5e-3,
    ent_coef="auto",
    policy_kwargs=policy_kwargs,
)

TOTAL_STEPS=5_000_000
model.learn(total_timesteps=TOTAL_STEPS)

model.save(f"norm/sac_mt10_{TOTAL_STEPS}steps_1Hot_allTask")
print("Model saved!")
model.env.save("vecnorm_stats.pkl")


  gym.logger.warn(
  gym.logger.warn(


Using cuda device
Logging to ./sac_mt10_1Hot_norm/SAC_7
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 500      |
|    ep_rew_mean     | 209      |
| time/              |          |
|    episodes        | 4        |
|    fps             | 750      |
|    time_elapsed    | 6        |
|    total_timesteps | 5000     |
| train/             |          |
|    actor_loss      | -9.23    |
|    critic_loss     | 0.0493   |
|    ent_coef        | 0.864    |
|    ent_coef_loss   | -0.984   |
|    learning_rate   | 0.0003   |
|    n_updates       | 489      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 500      |
|    ep_rew_mean     | 209      |
| time/              |          |
|    episodes        | 8        |
|    fps             | 750      |
|    time_elapsed    | 6        |
|    total_timesteps | 5000     |
---------------------------------
--------------------------

## Evaluation of multi-SAC

In [None]:
import numpy as np
import pandas as pd
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.evaluation import evaluate_policy


#eval loop
n_envs=10
n_eval_episodes=100
records=[]
ml10=metaworld.MT10()
all_tasks=ml10.train_tasks 
block_size=50
for env_idx in range(n_envs):
    print(env_idx)
    print(all_tasks[env_idx * block_size].env_name)

    eval_env=DummyVecEnv([make_mt10_env(env_idx, seed=None)])

    eval_env=VecNormalize.load("vecnorm_stats.pkl", eval_env)

    eval_env.training=False
    eval_env.norm_reward=False

    mean_reward, std_reward = evaluate_policy(
        model,
        eval_env,
        n_eval_episodes=n_eval_episodes,
        deterministic=True,
    )
    #manually count the amount of successes for each episode!
    successes=0
    for _ in range(n_eval_episodes):
        print(_)
        obs=eval_env.reset()
        done=False
        while not done:
            action, _=model.predict(obs,deterministic=True)
            obs,rewards,dones,infos=eval_env.step(action)
            done=dones[0]
        successes+=int(infos[0].get("success", False))

    success_rate=successes/n_eval_episodes

    records.append({
        "env_idx": env_idx,
        "mean_reward":mean_reward,
        "std_reward":std_reward,
        "success_rate":success_rate,
    })
df = pd.DataFrame(records)
print(df)


0
reach-v2


  gym.logger.warn(
  gym.logger.warn(


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
1
push-v2
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
2
pick-place-v2
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
3
door-open-v2
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32


## Do it again but include context this time
- basically the same but you multiply onehot by semantic matrix

In [None]:
#pasted again
contextArray = np.array([
    [ 2.55721937e+00,  2.31504669e+00,  1.53086022e+00,  3.67148980e+00,
     -7.59686234e-01, -7.61520546e-01,  7.11087296e-01, -2.76179305e-01,
     -9.09933731e-03, -4.92187850e-16],
    [ 3.19798408e+00,  2.29742495e-01,  1.50240787e+00, -7.02709710e-01,
     -1.11196737e-01,  2.74924940e+00, -1.53620950e+00, -1.18464051e-01,
      3.12674877e-02, -4.92187850e-16],
    [ 3.28570139e+00,  1.92692870e+00,  2.16563078e-01, -3.17126572e+00,
      1.22878090e+00, -1.05485421e+00,  1.25747231e+00,  3.66652176e-01,
      2.40985359e-02, -4.92187850e-16],
    [-2.08496350e+00,  4.64285575e-01,  5.41231677e-01,  6.38912049e-01,
      2.08218306e+00, -1.46809404e+00, -2.13341308e+00,  7.39046881e-01,
     -2.00950098e-01, -4.92187850e-16],
    [ 4.27420596e-01, -8.70185256e-01, -2.86820046e+00,  3.71623910e-01,
      1.72775859e+00, -2.57396544e-02, -3.41897534e-02, -1.64198974e+00,
     -1.41869863e-01, -4.92187850e-16],
    [ 1.15915385e-01, -1.24388372e+00, -2.73867371e+00,  1.45164344e+00,
      5.18528498e-01,  1.21674038e+00,  9.57762804e-01,  1.37271833e+00,
      3.60078564e-01, -4.92187850e-16],
    [ 1.14941507e+00, -4.93749060e+00,  2.04079053e+00, -2.26013967e-01,
     -7.33097844e-01, -1.07109692e+00,  3.29521154e-01, -4.60878162e-02,
     -9.44210158e-03, -4.92187850e-16],
    [-1.88857968e-01,  8.08267667e-01, -2.38279611e+00, -1.02331244e+00,
     -3.41088830e+00, -9.48025784e-01, -1.05931369e+00,  2.31463462e-02,
      9.69085706e-02, -4.92187850e-16],
    [-4.71406517e+00,  8.02744073e-01,  1.52039680e+00, -5.83708654e-01,
      3.05733353e-02,  4.24301726e-01,  5.31541815e-01, -4.67244928e-01,
      1.12817178e+00, -4.92187850e-16],
    [-3.74576926e+00,  5.04544376e-01,  6.37420105e-01, -4.26658712e-01,
     -5.72955276e-01,  9.39039647e-01,  9.75740641e-01,  4.84021130e-02,
     -1.27916354e+00, -4.92187850e-16]
], dtype=float)

In [None]:
import random
import numpy as np
import metaworld
import gymnasium as gym
from gymnasium import ObservationWrapper, spaces
from stable_baselines3 import SAC
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
import torch.nn as nn
from stable_baselines3.common.vec_env import VecNormalize


#have to write this to control env creation in multi-task setting
class OneHotTaskWrapper(ObservationWrapper):
    def __init__(self, env, env_idx, all_tasks, n_envs):
        super().__init__(env)
        self.env_idx= env_idx
        self.all_tasks= all_tasks
        self.n_envs=n_envs

        self.n_configs= len(all_tasks) // n_envs

        orig_low,orig_high=env.observation_space.low,env.observation_space.high
        low=np.concatenate([orig_low,np.zeros(n_envs,dtype=np.float32)])
        high=np.concatenate([orig_high,np.ones(n_envs,dtype=np.float32)])
        self.observation_space=spaces.Box(low=low,high=high,dtype=np.float32)

#sample a task and config within task upon each env restart. task is random
#can also restrict range of new tasks (<n for MTn) if you'd like
    def reset(self, **kwargs):
        cfg_idx=random.randrange(self.n_configs)
        task=self.all_tasks[self.env_idx*self.n_configs+cfg_idx]
        self.env.unwrapped.set_task(task)
        obs,info=self.env.reset(**kwargs)
        return self._append_one_hot(obs),info

    def observation(self,obs):
        return self._append_one_hot(obs)

    def _append_one_hot(self, obs):
        one_hot = np.zeros(self.n_envs, dtype=np.float32)
        one_hot[self.env_idx] = 1.0
        #to get the semantic embedding from semantic matrix + onehot
        #use jax here in the future
        one_hot_mod=np.einsum('ik,kj->ij', np.array([one_hot]),contextArray)[0]
        return np.concatenate([obs, one_hot_mod], axis=-1)

#wrappers 
def make_mt10_env(env_idx: int, seed: int = None):
    def _init():
        ml10=metaworld.MT10()
        all_tasks= ml10.train_tasks  
        n_envs=len(ml10.train_classes)   

        block_size=len(all_tasks)//n_envs
        env_name=all_tasks[env_idx * block_size].env_name
        raw_env=ml10.train_classes[env_name]()
        if seed is not None:
            raw_env.seed(seed)
        monitored=Monitor(raw_env)
        wrapped=OneHotTaskWrapper(monitored,env_idx,all_tasks,n_envs)
        return wrapped

    return _init

#calling wrappers for env creation
n_envs=10
env_fns=[make_mt10_env(i, seed=i) for i in range(n_envs)]
vec_env=DummyVecEnv(env_fns)
vec_env=VecNormalize(vec_env, norm_obs=True, norm_reward=True)


## SAC!!!!!

# Very confident hyperparameters could be better!

policy_kwargs = dict(net_arch=[256, 256], activation_fn=nn.ReLU)
model = SAC(
    "MlpPolicy",
    vec_env,
    verbose=1,
    tensorboard_log="./sac_mt10_1Hot_norm/", #TODO: organize logging process
    learning_rate=3e-4,
    buffer_size=100_000,
    gamma=0.99,
    batch_size=500,
    train_freq=(1, "step"),
    gradient_steps=1,
    tau=5e-3,
    ent_coef="auto",
    policy_kwargs=policy_kwargs,
)

TOTAL_STEPS=5_000_000
model.learn(total_timesteps=TOTAL_STEPS)

model.save(f"norm/sac_mt10_{TOTAL_STEPS}steps_1Hot_allTask")
print("Model saved!")
model.env.save("vecnorm_stats.pkl")


  gym.logger.warn(
  gym.logger.warn(


Using cuda device
Logging to ./sac_mt10_1Hot_norm/SAC_9
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 500      |
|    ep_rew_mean     | 230      |
| time/              |          |
|    episodes        | 4        |
|    fps             | 743      |
|    time_elapsed    | 6        |
|    total_timesteps | 5000     |
| train/             |          |
|    actor_loss      | -9.21    |
|    critic_loss     | 0.0553   |
|    ent_coef        | 0.864    |
|    ent_coef_loss   | -0.984   |
|    learning_rate   | 0.0003   |
|    n_updates       | 489      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 500      |
|    ep_rew_mean     | 230      |
| time/              |          |
|    episodes        | 8        |
|    fps             | 743      |
|    time_elapsed    | 6        |
|    total_timesteps | 5000     |
---------------------------------
--------------------------

# Evaluation script

In [None]:
import numpy as np
import pandas as pd
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.evaluation import evaluate_policy


#eval loop
n_envs=10
n_eval_episodes=100
records=[]
ml10=metaworld.MT10()
all_tasks=ml10.train_tasks 
block_size=50
for env_idx in range(n_envs):
    print(env_idx)
    print(all_tasks[env_idx * block_size].env_name)

    eval_env=DummyVecEnv([make_mt10_env(env_idx, seed=None)])

    eval_env=VecNormalize.load("vecnorm_stats.pkl", eval_env)

    eval_env.training=False
    eval_env.norm_reward=False

    mean_reward, std_reward = evaluate_policy(
        model,
        eval_env,
        n_eval_episodes=n_eval_episodes,
        deterministic=True,
    )
    #manually count the amount of successes for each episode!
    successes=0
    for _ in range(n_eval_episodes):
        print(_)
        obs=eval_env.reset()
        done=False
        while not done:
            action, _=model.predict(obs,deterministic=True)
            obs,rewards,dones,infos=eval_env.step(action)
            done=dones[0]
        successes+=int(infos[0].get("success", False))

    success_rate=successes/n_eval_episodes

    records.append({
        "env_idx": env_idx,
        "mean_reward":mean_reward,
        "std_reward":std_reward,
        "success_rate":success_rate,
    })
df = pd.DataFrame(records)
print(df)


0
reach-v2


  gym.logger.warn(
  gym.logger.warn(


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
1
push-v2
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
2
pick-place-v2
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
3
door-open-v2
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32


In [None]:
#all_env_mt10=['reach-v2', 'push-v2','pick-place-v2', 'door-open-v2', 'drawer-open-v2','drawer-close-v2','button-press-topdown-v2','peg-insert-side-v2','window-open-v2','window-close-v2']
