# 1. Import Dependencies

In [10]:
#!pip install stable-baselines3[extra]

In [11]:
import stable_baselines3


In [12]:
import gym 
from gym import Env
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete 
import numpy as np
import random
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy

# 3. Building an Environment

In [13]:
class ShowerEnv(Env):
    def __init__(self):
        
        # number of pentomino pieces
        no_pieces = 9 # + 1 + 1 #pieces + uncertainty + lang team
        
        self.uncertainty_action = 10
        self.lang_action = 11
        
        # ACTIONS
        '''for now: we only decide for one coordinate --> no of actions = number of coordinates)
        (for later: we only decide for one coordinate out of all absolute coordinates and moving coordinate, 
        boolean value expressing uncertainty)'''
        self.action_space = Discrete(no_pieces)
        
        # Temperature array
        self.observation_space = Box(low=0, high=100, shape=(no_pieces,))
        
        # POSSIBLE STATES
        '''Set start state (in our case start state is the only state and here we need to get our training data in or 
        construct a random formula that will randomly generate possible scenarios each time the function is called.
        for now: create random probabilities for 9 coordinates that add up to one
        uplevel: do the same and make sure that all follow the true distribution which is p=1 for the gold_coordinate and p=0 for all others
        uplevel: do the same + add possiblity for moving vector'''
        no_pieces = 9
        self.p_gold  = [0.0 for i in range(no_pieces-1)] + [1.0]
        random.shuffle(p_gold)


        noise_std = 0.1  
        p_noise = [np.abs(np.random.normal(scale = noise_std, loc = i)) for i in p_gold]
        smax = [i/np.sum(p_noise) for i in p_noise]

        self.state = smax
 
        
    def step(self, action):
        '''Our actions do not affect our state, because we only have one state, the start state. Also we don't need 
        the shower_length which represents the lenght of the sequence, but at the end we use it in the evalulation-
        function soo .. dunno.
        '''
    

        if action == np.argmax(self.p_gold):
            reward = 1
        else:
            reward = -1
        
        # Check if sequence is done
        '''we have a sequence of one state: the output that the other groups give us. based on that 
        we make a decision and a new round starts. so we don't need to count down the seconds of a 60min long
        shower to know when a round is over.'''
        done = True
        
        info = {}
        
        # Return step information
        return self.state, reward, done, info

    def render(self):
        # Implement viz
        pass
    
    def reset(self):
        '''Reset start state (output of previous groups) when new round starts (use "random-formula" that we need to 
        create above)'''
        bleh = [random.uniform(0, 0.628) for _ in range(8)]
        bleh.append(random.uniform(2, 5))

        self.state = np.exp(bleh)/np.sum(np.exp(bleh))

        return self.state

In [14]:
env=ShowerEnv()

In [15]:
#CLARA
env.observation_space 
env.observation_space.sample()
env.action_space.sample()

2

In [16]:
env.reset()

array([0.06366576, 0.05938566, 0.04329654, 0.06496438, 0.06436485,
       0.05115506, 0.05863587, 0.05098671, 0.54354519])

# 4. Test Environment

In [17]:
episodes = 5
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:-1
Episode:2 Score:-1
Episode:3 Score:-1
Episode:4 Score:-1
Episode:5 Score:-1


In [18]:
env.close()

# 5. Train Model

In [19]:
log_path = os.path.join('Training', 'Logs')

In [20]:
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path)

# Random Agent, before training   
'''stolen from: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/stable_baselines_getting_started.ipynb#scrollTo=xDHLMA6NFk95
since the evaluate_policy function from this tutorial errored'''
evaluate_policy(model, env, n_eval_episodes=100)            

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




(-1.0, 0.0)

In [21]:
model.learn(total_timesteps=100000)

Logging to Training/Logs/PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | -0.8     |
| time/              |          |
|    fps             | 315      |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 2048     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1          |
|    ep_rew_mean          | -0.4       |
| time/                   |            |
|    fps                  | 417        |
|    iterations           | 2          |
|    time_elapsed         | 9          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.19942847 |
|    clip_fraction        | 0.754      |
|    clip_range           | 0.2        |
|    entropy_loss         | -2.01      |
|    explained_variance   | -0.000636  |
|    learning_

<stable_baselines3.ppo.ppo.PPO at 0x7f6c5d805390>

# 6. Save Model

In [22]:
#ORIGINAL
#model.save('PPO')

In [23]:
#ORIGINAL
#evaluate_policy(model, env, n_eval_episodes=10, render=True)

In [24]:
# FROM YT - CLARA
shower_path = os.path.join('Training', 'Saved Models', 'Shower_Model_PPO')


In [25]:
model.save(shower_path)



In [26]:
del model

In [27]:
model = PPO.load(shower_path,env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [28]:
#evaluate_policy(model,env,n_eval_episodes=10,render=True)

In [29]:
# Random Agent, after training                            
'''stolen from: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/stable_baselines_getting_started.ipynb#scrollTo=xDHLMA6NFk95
since the evaluate_policy function from this tutorial errored'''
evaluate_policy(model, env, n_eval_episodes=100)           



(1.0, 0.0)

In [30]:
'''MODEL ACTUALLY LEARNS!

(mean shower length, variance in shower length)
(however i dont know why we re interested in shower length since the goal of our agent is to regulate the temperature)
total_timesteps = 20 000: (-57.76, 2.486443242867209)
total_timesteps = 100 000:(59.38, 0.9249864863877744)'''

'MODEL ACTUALLY LEARNS!\n\n(mean shower length, variance in shower length)\n(however i dont know why we re interested in shower length since the goal of our agent is to regulate the temperature)\ntotal_timesteps = 20 000: (-57.76, 2.486443242867209)\ntotal_timesteps = 100 000:(59.38, 0.9249864863877744)'

In [31]:
bleh = np.random.randn(1, 9)
sft = np.exp(bleh)/np.sum(np.exp(bleh))
print(np.argmax(sft))
print(np.max(sft))

6
0.6175825595351309


In [32]:
bleh = [random.uniform(0, 0.628) for _ in range(8)]
bleh.append(random.uniform(2, 5))

sft = np.exp(bleh)/np.sum(np.exp(bleh))
print(bleh)
random.shuffle(bleh)
print(bleh)
# print(sft)
# print(np.argmax(sft))
# print(np.max(sft))

[0.12409718781736806, 0.012405218841902729, 0.005146497221804358, 0.5616135514282242, 0.3939891446264752, 0.47490561370025214, 0.10262597394999803, 0.1560173156794322, 4.185103944192127]
[0.12409718781736806, 0.1560173156794322, 4.185103944192127, 0.3939891446264752, 0.5616135514282242, 0.012405218841902729, 0.10262597394999803, 0.005146497221804358, 0.47490561370025214]


In [33]:
print(sft)

[0.01490203 0.01332718 0.01323079 0.02308109 0.01951902 0.02116409
 0.01458547 0.01538538 0.86480497]


In [34]:
no_pieces = 9
p_gold  = [0.0 for i in range(no_pieces-1)] + [1.0]
random.shuffle(p_gold)


noise_std = 0.1  
p_noise = [np.abs(np.random.normal(scale = noise_std, loc = i)) for i in p_gold]
smax = [i/np.sum(p_noise) for i in p_noise]
print(p_noise)
print(smax)
print(sum(smax))

[0.05416693546890216, 0.11098288949254514, 0.06179192314618518, 0.02083836311468572, 1.0202686584283205, 0.10698712777203301, 0.13641039920419593, 0.10682346425104382, 0.004223949757622388]
[0.033384989484911386, 0.06840266237400262, 0.03808453785745719, 0.012843416882351608, 0.6288274966740421, 0.06593993373948176, 0.08407453188263128, 0.065839062149092, 0.002603368956029949]
0.9999999999999998
