# 1. Import Dependencies

In [1]:
!pip install stable-baselines3[extra]

Collecting stable-baselines3[extra]
  Downloading stable_baselines3-1.6.0-py3-none-any.whl (177 kB)
Collecting gym==0.21
  Downloading gym-0.21.0.tar.gz (1.5 MB)
Collecting matplotlib
  Downloading matplotlib-3.5.3-cp38-cp38-win_amd64.whl (7.2 MB)
Collecting torch>=1.11
  Downloading torch-1.12.1-cp38-cp38-win_amd64.whl (161.9 MB)
Collecting cloudpickle
  Downloading cloudpickle-2.2.0-py3-none-any.whl (25 kB)
Collecting pillow
  Downloading Pillow-9.2.0-cp38-cp38-win_amd64.whl (3.3 MB)
Collecting protobuf~=3.19.0
  Using cached protobuf-3.19.4-cp38-cp38-win_amd64.whl (895 kB)
Collecting tensorboard>=2.2.0
  Downloading tensorboard-2.10.0-py3-none-any.whl (5.9 MB)
Collecting ale-py==0.7.4
  Downloading ale_py-0.7.4-cp38-cp38-win_amd64.whl (904 kB)
Collecting opencv-python
  Downloading opencv_python-4.6.0.66-cp36-abi3-win_amd64.whl (35.6 MB)
Collecting autorom[accept-rom-license]~=0.4.2
  Downloading AutoROM-0.4.2-py3-none-any.whl (16 kB)
Collecting psutil
  Downloading psutil-5.9.2-cp3

In [2]:
import stable_baselines3


In [3]:
import gym 
from gym import Env
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete 
import numpy as np
import random
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy

# 3. Building an Environment

In [4]:
class ShowerEnv(Env):
    def __init__(self):
        
        # number of pentomino pieces
        no_pieces = 9 # + 1 + 1 #pieces + uncertainty + lang team
        
        self.uncertainty_action = 10
        self.lang_action = 11
        
        # ACTIONS
        '''for now: we only decide for one coordinate --> no of actions = number of coordinates)
        (for later: we only decide for one coordinate out of all absolute coordinates and moving coordinate, 
        boolean value expressing uncertainty)'''
        self.action_space = Discrete(no_pieces)
        
        # Temperature array
        self.observation_space = Box(low=0, high=100, shape=(no_pieces,))
        
        # POSSIBLE STATES
        '''Set start state (in our case start state is the only state and here we need to get our training data in or 
        construct a random formula that will randomly generate possible scenarios each time the function is called.
        for now: create random probabilities for 9 coordinates that add up to one
        uplevel: do the same and make sure that all follow the true distribution which is p=1 for the gold_coordinate and p=0 for all others
        uplevel: do the same + add possiblity for moving vector'''
        no_pieces = 9
        self.p_gold  = [0.0 for i in range(no_pieces-1)] + [1.0]
        random.shuffle(self.p_gold)


        noise_std = 0.1  
        p_noise = [np.abs(np.random.normal(scale = noise_std, loc = i)) for i in self.p_gold]
        smax = [i/np.sum(p_noise) for i in p_noise]

        self.state = smax
 
        
    def step(self, action):
        '''Our actions do not affect our state, because we only have one state, the start state. Also we don't need 
        the shower_length which represents the lenght of the sequence, but at the end we use it in the evalulation-
        function soo .. dunno.
        '''
    

        if action == np.argmax(self.p_gold):
            reward = 1
        else:
            reward = -1
        
        # Check if sequence is done
        '''we have a sequence of one state: the output that the other groups give us. based on that 
        we make a decision and a new round starts. so we don't need to count down the seconds of a 60min long
        shower to know when a round is over.'''
        done = True
        
        info = {}
        
        # Return step information
        return self.state, reward, done, info

    def render(self):
        # Implement viz
        pass
    
    def reset(self):
        '''Reset start state (output of previous groups) when new round starts (use "random-formula" that we need to 
        create above)'''
        bleh = [random.uniform(0, 0.628) for _ in range(8)]
        bleh.append(random.uniform(2, 5))

        self.state = np.exp(bleh)/np.sum(np.exp(bleh))

        return self.state

In [5]:
env=ShowerEnv()

In [6]:
#CLARA
env.observation_space 
env.observation_space.sample()
env.action_space.sample()

5

In [7]:
env.reset()

array([0.01074775, 0.01684012, 0.01446465, 0.01308205, 0.01130682,
       0.01182561, 0.0121615 , 0.01082418, 0.89874732])

# 4. Test Environment

In [8]:
episodes = 5
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:1
Episode:2 Score:-1
Episode:3 Score:-1
Episode:4 Score:-1
Episode:5 Score:-1


In [9]:
env.close()

# 5. Train Model

In [10]:
log_path = os.path.join('Training', 'Logs')

In [11]:
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path)

# Random Agent, before training   
'''stolen from: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/stable_baselines_getting_started.ipynb#scrollTo=xDHLMA6NFk95
since the evaluate_policy function from this tutorial errored'''
evaluate_policy(model, env, n_eval_episodes=100)            

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




(1.0, 0.0)

In [12]:
model.learn(total_timesteps=100000)

Logging to Training\Logs\PPO_3
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | -0.74    |
| time/              |          |
|    fps             | 1274     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1          |
|    ep_rew_mean          | -0.36      |
| time/                   |            |
|    fps                  | 878        |
|    iterations           | 2          |
|    time_elapsed         | 4          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.17418805 |
|    clip_fraction        | 0.768      |
|    clip_range           | 0.2        |
|    entropy_loss         | -2.01      |
|    explained_variance   | -0.00712   |
|    learning_

<stable_baselines3.ppo.ppo.PPO at 0x27d2e81b7f0>

# 6. Save Model

In [13]:
#ORIGINAL
#model.save('PPO')

In [14]:
#ORIGINAL
#evaluate_policy(model, env, n_eval_episodes=10, render=True)

In [15]:
# FROM YT - CLARA
shower_path = os.path.join('Training', 'Saved Models', 'Shower_Model_PPO')


In [16]:
model.save(shower_path)

In [17]:
del model

In [18]:
model = PPO.load(shower_path,env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [19]:
#evaluate_policy(model,env,n_eval_episodes=10,render=True)

In [20]:
# Random Agent, after training                            
'''stolen from: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/stable_baselines_getting_started.ipynb#scrollTo=xDHLMA6NFk95
since the evaluate_policy function from this tutorial errored'''
evaluate_policy(model, env, n_eval_episodes=100)           

(1.0, 0.0)

In [21]:
'''MODEL ACTUALLY LEARNS!

(mean shower length, variance in shower length)
(however i dont know why we re interested in shower length since the goal of our agent is to regulate the temperature)
total_timesteps = 20 000: (-57.76, 2.486443242867209)
total_timesteps = 100 000:(59.38, 0.9249864863877744)'''

'MODEL ACTUALLY LEARNS!\n\n(mean shower length, variance in shower length)\n(however i dont know why we re interested in shower length since the goal of our agent is to regulate the temperature)\ntotal_timesteps = 20 000: (-57.76, 2.486443242867209)\ntotal_timesteps = 100 000:(59.38, 0.9249864863877744)'

In [22]:
bleh = np.random.randn(1, 9)
sft = np.exp(bleh)/np.sum(np.exp(bleh))
print(np.argmax(sft))
print(np.max(sft))

4
0.4760460442234847


In [23]:
bleh = [random.uniform(0, 0.628) for _ in range(8)]
bleh.append(random.uniform(2, 5))

sft = np.exp(bleh)/np.sum(np.exp(bleh))
print(bleh)
random.shuffle(bleh)
print(bleh)
# print(sft)
# print(np.argmax(sft))
# print(np.max(sft))

[0.5358925115066222, 0.2706150227228086, 0.1720668895990129, 0.4377482911256423, 0.10029355568712371, 0.4589329246155352, 0.1346392009510646, 0.31398168912080665, 4.823779320462388]
[4.823779320462388, 0.1346392009510646, 0.1720668895990129, 0.5358925115066222, 0.2706150227228086, 0.31398168912080665, 0.4589329246155352, 0.10029355568712371, 0.4377482911256423]


In [24]:
print(sft)

[0.0126224  0.00968129 0.00877272 0.01144244 0.00816514 0.01168742
 0.00845045 0.01011038 0.91906775]


In [25]:
no_pieces = 9
p_gold  = [0.0 for i in range(no_pieces-1)] + [1.0]
random.shuffle(p_gold)


noise_std = 0.1  
p_noise = [np.abs(np.random.normal(scale = noise_std, loc = i)) for i in p_gold]
smax = [i/np.sum(p_noise) for i in p_noise]
print(p_noise)
print(smax)
print(sum(smax))

[0.17797209551960547, 0.023450190705157983, 0.22445618413934976, 0.12561120388192143, 0.07265502721625859, 1.035757304976535, 0.0978092801319897, 0.13646983626791512, 0.15077125988059803]
[0.08702994603861741, 0.011467352933653379, 0.10976108100907125, 0.061425001845219755, 0.035528957950426016, 0.5064945833111325, 0.04782961254184567, 0.06673497017394638, 0.07372849419608776]
1.0000000000000002
