# Introduction

Follow tutorial https://www.youtube.com/watch?v=Mut_u40Sqz4

# Setup

## Imports

In [9]:
repo_root, *_ = !git rev-parse --show-toplevel
%cd {repo_root}

/mnt/batch/tasks/shared/LS_root/mounts/clusters/rubchume1/code/Users/rubchume/SalesReinforcer


In [1]:
from pathlib import Path

import gymnasium as gym 
from gymnasium import Env
from gymnasium.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete 
import numpy as np
import random
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.evaluation import evaluate_policy

## Variables

In [10]:
log_path = Path("data/training/logs")
log_path.mkdir(parents=True, exist_ok=True)

# Create environment

In [3]:
class UserFlowEnv(Env):
    def __init__(self):
        # Actions we can take, down, stay, up
        self.action_space = Discrete(3)
        # Temperature array
        self.observation_space = Box(0, 100, shape=(1,), dtype="int")
        # Set start temp
        self.state = 38 + random.randint(-3,3)
        # Set shower length
        self.shower_length = 60
        
    def _get_obs(self):
        return np.array([self.state])

    def _get_info(self):
        return {
            "degrees_from_ideal": np.abs(38) - self.state
        }
        
    def step(self, action):
        # Apply action
        # 0 -1 = -1 temperature
        # 1 -1 = 0 
        # 2 -1 = 1 temperature 
        self.state += action -1 
        # Reduce shower length by 1 second
        self.shower_length -= 1 
        
        # Calculate reward
        if self.state >=37 and self.state <=39: 
            reward =1 
        else: 
            reward = -1 
        
        # Check if shower is done
        if self.shower_length <= 0: 
            terminated = True
        else:
            terminated = False
        
        truncated = False
        return self._get_obs(), reward, terminated, truncated, self._get_info()

    def render(self):
        print(f"Temperature: {self.state}. Remaining Shower Length: {self.shower_length}")
    
    def reset(self, seed=0):
        self.state = 38 + random.randint(-3,3)
        self.shower_length = 60
        
        return self._get_obs(), self._get_info()

In [4]:
env = ShowerEnv()

In [5]:
check_env(ShowerEnv(), warn=True)

# Test environment

In [11]:
# episodes = 5
# for episode in range(1, episodes+1):
#     state = env.reset()
#     terminated = False
#     truncated = False
#     score = 0 
    
#     while not terminated and not truncated:
#         env.render()
#         action = env.action_space.sample()
#         state, reward, terminated, truncated, info = env.step(action)
#         score += reward
#     print('Episode:{} Score:{}'.format(episode, score))
# env.close()

# Train model

In [6]:
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path)


  return torch._C._cuda_getDeviceCount() > 0


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [16]:
model.learn(total_timesteps=100000)

Logging to data/training/logs/PPO_3
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 60       |
|    ep_rew_mean     | -27.1    |
| time/              |          |
|    fps             | 1018     |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 60          |
|    ep_rew_mean          | -27.2       |
| time/                   |             |
|    fps                  | 721         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007918391 |
|    clip_fraction        | 0.05        |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.996      |
|    explained_variance   | 0.000439

<stable_baselines3.ppo.ppo.PPO at 0x7fa38105bdf0>

In [17]:
evaluate_policy(model, env, n_eval_episodes=100, render=True, return_episode_rewards=True)

([60.0,
  60.0,
  60.0,
  60.0,
  60.0,
  60.0,
  60.0,
  58.0,
  58.0,
  60.0,
  60.0,
  60.0,
  60.0,
  60.0,
  60.0,
  60.0,
  60.0,
  58.0,
  60.0,
  60.0,
  60.0,
  58.0,
  60.0,
  60.0,
  60.0,
  60.0,
  60.0,
  58.0,
  60.0,
  58.0,
  60.0,
  60.0,
  58.0,
  60.0,
  60.0,
  58.0,
  60.0,
  58.0,
  60.0,
  60.0,
  60.0,
  60.0,
  60.0,
  60.0,
  60.0,
  60.0,
  58.0,
  60.0,
  58.0,
  60.0,
  60.0,
  60.0,
  60.0,
  60.0,
  58.0,
  60.0,
  60.0,
  60.0,
  58.0,
  60.0,
  60.0,
  60.0,
  60.0,
  60.0,
  60.0,
  60.0,
  60.0,
  60.0,
  60.0,
  60.0,
  58.0,
  60.0,
  60.0,
  60.0,
  58.0,
  60.0,
  60.0,
  58.0,
  60.0,
  58.0,
  60.0,
  60.0,
  60.0,
  60.0,
  60.0,
  60.0,
  58.0,
  58.0,
  58.0,
  58.0,
  58.0,
  60.0,
  58.0,
  60.0,
  60.0,
  60.0,
  60.0,
  60.0,
  60.0,
  60.0],
 [60,
  60,
  60,
  60,
  60,
  60,
  60,
  60,
  60,
  60,
  60,
  60,
  60,
  60,
  60,
  60,
  60,
  60,
  60,
  60,
  60,
  60,
  60,
  60,
  60,
  60,
  60,
  60,
  60,
  60,
  60,
  60,
  60,
 

In [18]:
from azureml.tensorboard import Tensorboard

# The TensorBoard constructor takes an array of jobs, so be sure and pass it in as a single-element array here
tb = Tensorboard([], local_root=log_path, port=6006)

# If successful, start() returns a string with the URI of the instance.
tb.start()

https://rubchume1-6006.westeurope.instances.azureml.ms


'https://rubchume1-6006.westeurope.instances.azureml.ms'