# 1. Import Dependencies

In [1]:
#!pip install stable-baselines3[extra]

In [2]:
import stable_baselines3


In [3]:
import gym 
from gym import Env
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete 
import numpy as np
import random
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy

# 3. Building an Environment

In [68]:
class ShowerEnv(Env):
    def __init__(self):
        
        # number of pentomino pieces
        no_pieces = 9 # + 1 + 1 #pieces + uncertainty + lang team
        
        #self.uncertainty_action = 10
        #self.lang_action = 11
        
        # ACTIONS
        '''for now: we only decide for one coordinate --> no of actions = number of coordinates)
        (for later: we only decide for one coordinate out of all absolute coordinates and moving coordinate, 
        boolean value expressing uncertainty)'''
        self.action_space = Discrete(no_pieces)
        
        # Temperature array
        self.observation_space = Box(low=0, high=100, shape=(no_pieces,))   ## it was shape = 1 before
        
        # POSSIBLE STATES
        '''Set start state (in our case start state is the only state and here we need to get our training data in or 
        construct a random formula that will randomly generate possible scenarios each time the function is called.
        for now: create random probabilities for 9 coordinates that add up to one
        uplevel: do the same and make sure that all follow the true distribution which is p=1 for the gold_coordinate and p=0 for all others
        uplevel: do the same + add possiblity for moving vector'''
        #no_pieces = 9
        self.p_gold  = [0.0 for i in range(no_pieces-1)] + [1.0]
        random.shuffle(self.p_gold)


        noise_std = 100
        p_noise = [np.abs(np.random.normal(scale = noise_std, loc = i)) for i in self.p_gold]
        smax = [i/np.sum(p_noise) for i in p_noise]

        self.state = smax
 
        
    def step(self, action):
        '''Our actions do not affect our state, because we only have one state, the start state. Also we don't need 
        the shower_length which represents the lenght of the sequence, but at the end we use it in the evalulation-
        function soo .. dunno.
        '''
    

        if action == np.argmax(self.p_gold):
            reward = 1
        else:
            reward = -1
        
        # Check if sequence is done
        '''we have a sequence of one state: the output that the other groups give us. based on that 
        we make a decision and a new round starts. so we don't need to count down the seconds of a 60min long
        shower to know when a round is over.'''
        done = True
        
        info = {}
        
        # Return step information
        return self.state, reward, done, info

    def render(self):
        # Implement viz
        pass
    
    def reset(self):
        '''Reset start state (output of previous groups) when new round starts (use "random-formula" that we need to 
        create above)'''
        #bleh = [random.uniform(0, 0.628) for _ in range(8)]
        #bleh.append(random.uniform(2, 5))
        #self.state = np.exp(bleh)/np.sum(np.exp(bleh))
        
                # number of pentomino pieces
        no_pieces = 9 # + 1 + 1 #pieces + uncertainty + lang team
        
        #self.uncertainty_action = 10
        #self.lang_action = 11
        
        # ACTIONS
        '''for now: we only decide for one coordinate --> no of actions = number of coordinates)
        (for later: we only decide for one coordinate out of all absolute coordinates and moving coordinate, 
        boolean value expressing uncertainty)'''
        self.action_space = Discrete(no_pieces)
        
        # Temperature array
        self.observation_space = Box(low=0, high=100, shape=(no_pieces,))   ## it was shape = 1 before
        
        # POSSIBLE STATES
        '''Set start state (in our case start state is the only state and here we need to get our training data in or 
        construct a random formula that will randomly generate possible scenarios each time the function is called.
        for now: create random probabilities for 9 coordinates that add up to one
        uplevel: do the same and make sure that all follow the true distribution which is p=1 for the gold_coordinate and p=0 for all others
        uplevel: do the same + add possiblity for moving vector'''
        #no_pieces = 9
        self.p_gold  = [0.0 for i in range(no_pieces-1)] + [1.0]
        random.shuffle(self.p_gold)


        noise_std = 100  
        p_noise = [np.abs(np.random.normal(scale = noise_std, loc = i)) for i in self.p_gold]
        smax = [i/np.sum(p_noise) for i in p_noise]

        self.state = smax

        return self.state

In [69]:
env=ShowerEnv()

In [70]:
#CLARA
#env.observation_space 
#env.observation_space.sample()
#env.action_space.sample()

In [71]:
env.reset()

[0.11800315187075001,
 0.15397184111856263,
 0.12730024338499382,
 0.040599978086624765,
 0.31851661857336744,
 0.025934738348204908,
 0.15509614585661943,
 0.008037654228970235,
 0.05253962853190663]

# 4. Test Environment

In [72]:
episodes = 5
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:1
Episode:2 Score:-1
Episode:3 Score:-1
Episode:4 Score:-1
Episode:5 Score:-1


In [73]:
env.close()

# 5. Train Model

In [74]:
log_path = os.path.join('Training', 'Logs')

In [75]:
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path)

# Random Agent, before training   
'''stolen from: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/stable_baselines_getting_started.ipynb#scrollTo=xDHLMA6NFk95
since the evaluate_policy function from this tutorial errored'''
evaluate_policy(model, env, n_eval_episodes=100)            

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


(-0.78, 0.6257795138864806)

In [76]:
model.learn(total_timesteps=100000)

Logging to Training/Logs/PPO_3
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | -0.74    |
| time/              |          |
|    fps             | 2609     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1            |
|    ep_rew_mean          | -0.76        |
| time/                   |              |
|    fps                  | 1822         |
|    iterations           | 2            |
|    time_elapsed         | 2            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0068495963 |
|    clip_fraction        | 0.00405      |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.2         |
|    explained_variance   |

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1           |
|    ep_rew_mean          | -0.74       |
| time/                   |             |
|    fps                  | 1436        |
|    iterations           | 11          |
|    time_elapsed         | 15          |
|    total_timesteps      | 22528       |
| train/                  |             |
|    approx_kl            | 0.008682967 |
|    clip_fraction        | 0.075       |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.11       |
|    explained_variance   | -0.00518    |
|    learning_rate        | 0.0003      |
|    loss                 | 0.246       |
|    n_updates            | 100         |
|    policy_gradient_loss | -0.00892    |
|    value_loss           | 0.41        |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1     

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1           |
|    ep_rew_mean          | -0.86       |
| time/                   |             |
|    fps                  | 1418        |
|    iterations           | 21          |
|    time_elapsed         | 30          |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008277015 |
|    clip_fraction        | 0.0424      |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.08       |
|    explained_variance   | -0.00174    |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0958      |
|    n_updates            | 200         |
|    policy_gradient_loss | -0.00659    |
|    value_loss           | 0.399       |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1   

---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 1         |
|    ep_rew_mean          | -0.88     |
| time/                   |           |
|    fps                  | 1408      |
|    iterations           | 31        |
|    time_elapsed         | 45        |
|    total_timesteps      | 63488     |
| train/                  |           |
|    approx_kl            | 0.0129722 |
|    clip_fraction        | 0.101     |
|    clip_range           | 0.2       |
|    entropy_loss         | -1.98     |
|    explained_variance   | -0.00243  |
|    learning_rate        | 0.0003    |
|    loss                 | 0.278     |
|    n_updates            | 300       |
|    policy_gradient_loss | -0.00939  |
|    value_loss           | 0.402     |
---------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1            |
|    ep_rew_mean          | -0.

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1           |
|    ep_rew_mean          | -0.7        |
| time/                   |             |
|    fps                  | 1406        |
|    iterations           | 41          |
|    time_elapsed         | 59          |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.009328717 |
|    clip_fraction        | 0.0773      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.58       |
|    explained_variance   | -0.00615    |
|    learning_rate        | 0.0003      |
|    loss                 | 0.176       |
|    n_updates            | 400         |
|    policy_gradient_loss | -0.00736    |
|    value_loss           | 0.393       |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1     

<stable_baselines3.ppo.ppo.PPO at 0x7f989b111d00>


# 6. Save Model

In [77]:
#ORIGINAL
#model.save('PPO')

In [78]:
#ORIGINAL
#evaluate_policy(model, env, n_eval_episodes=10, render=True)

In [79]:
# FROM YT - CLARA
shower_path = os.path.join('Training', 'Saved Models', 'Shower_Model_PPO')


In [80]:
model.save(shower_path)

In [81]:
del model

In [82]:
model = PPO.load(shower_path,env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [83]:
#evaluate_policy(model,env,n_eval_episodes=10,render=True)

In [84]:
# Random Agent, after training                            
'''stolen from: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/stable_baselines_getting_started.ipynb#scrollTo=xDHLMA6NFk95
since the evaluate_policy function from this tutorial errored'''


'''Mean reward per episode, std of reward per episode returns'''

evaluate_policy(model, env, n_eval_episodes=100)   


(-0.76, 0.6499230723708769)

In [85]:
'''MODEL ACTUALLY LEARNS!

(mean shower length, variance in shower length)
(however i dont know why we re interested in shower length since the goal of our agent is to regulate the temperature)
total_timesteps = 20 000: (-57.76, 2.486443242867209)
total_timesteps = 100 000:(59.38, 0.9249864863877744)'''

'MODEL ACTUALLY LEARNS!\n\n(mean shower length, variance in shower length)\n(however i dont know why we re interested in shower length since the goal of our agent is to regulate the temperature)\ntotal_timesteps = 20 000: (-57.76, 2.486443242867209)\ntotal_timesteps = 100 000:(59.38, 0.9249864863877744)'

In [48]:
bleh = np.random.randn(1, 9)
sft = np.exp(bleh)/np.sum(np.exp(bleh))
print(np.argmax(sft))
print(np.max(sft))

0
0.23763499846199734


In [32]:
bleh = [random.uniform(0, 0.628) for _ in range(8)]
bleh.append(random.uniform(2, 5))

sft = np.exp(bleh)/np.sum(np.exp(bleh))
print(bleh)
random.shuffle(bleh)
print(bleh)
# print(sft)
# print(np.argmax(sft))
# print(np.max(sft))

[0.12409718781736806, 0.012405218841902729, 0.005146497221804358, 0.5616135514282242, 0.3939891446264752, 0.47490561370025214, 0.10262597394999803, 0.1560173156794322, 4.185103944192127]
[0.12409718781736806, 0.1560173156794322, 4.185103944192127, 0.3939891446264752, 0.5616135514282242, 0.012405218841902729, 0.10262597394999803, 0.005146497221804358, 0.47490561370025214]


In [33]:
print(sft)

[0.01490203 0.01332718 0.01323079 0.02308109 0.01951902 0.02116409
 0.01458547 0.01538538 0.86480497]


In [34]:
no_pieces = 9
p_gold  = [0.0 for i in range(no_pieces-1)] + [1.0]
random.shuffle(p_gold)


noise_std = 0.1  
p_noise = [np.abs(np.random.normal(scale = noise_std, loc = i)) for i in p_gold]
smax = [i/np.sum(p_noise) for i in p_noise]
print(p_noise)
print(smax)
print(sum(smax))

[0.05416693546890216, 0.11098288949254514, 0.06179192314618518, 0.02083836311468572, 1.0202686584283205, 0.10698712777203301, 0.13641039920419593, 0.10682346425104382, 0.004223949757622388]
[0.033384989484911386, 0.06840266237400262, 0.03808453785745719, 0.012843416882351608, 0.6288274966740421, 0.06593993373948176, 0.08407453188263128, 0.065839062149092, 0.002603368956029949]
0.9999999999999998


In [1]:
class UnoEnv(Env, no_pieces, X,y):
    def __init__(self):
        
        # STATE
        self.state = X[i]
        
        # POSSIBLE OBSERVATION
        self.observation_space = Box(low=0, high=100, shape=(no_pieces,))  
 
        # POSSIBLE ACTION
        self.action_space = Discrete(self.no_pieces)
        
        
    def step(self, action):
        
        if action == y[i]:
            reward = 1
        else:
            reward = -1
        
        done = True

        return reward, done

    
    def reset(self):
        pass

NameError: name 'Env' is not defined