In [1]:
!pip install gymnasium[atari] --quiet
!pip install gymnasium --quiet
!pip install -U gymnasium[atari] --quiet
!pip install imageio_ffmpeg --quiet
!pip install npy_append_array --quiet
!pip install pyTelegramBotAPI --quiet
!pip install gymnasium[accept-rom-license] --quiet
!pip install gymnasium[box2d] --quiet
!pip install  mujoco-py

[0mCollecting mujoco-py
  Downloading mujoco_py-2.1.2.14-py3-none-any.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m63.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Collecting glfw>=1.4.0
  Downloading glfw-2.5.9-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-manylinux2014_x86_64.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.8/207.8 kB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: glfw, mujoco-py
Successfully installed glfw-2.5.9 mujoco-py-2.1.2.14
[0m

In [2]:
import numpy as np 

class ExperienceReplayBuffer: 
    def __init__(self, max_memory, input_shape, n_actions, batch_size, cer=False): 
        self.mem_size = max_memory
        self.mem_counter = 0
        self.state_memory = np.zeros((self.mem_size, *input_shape),
                                     dtype=np.float32)
        self.next_state_memory = np.zeros((self.mem_size, *input_shape),
                                         dtype=np.float32)

        self.action_memory = np.zeros((self.mem_size, n_actions))
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.int64)
        self.batch_size = batch_size
        self.cer = cer

    def store_experience(self, state, action, reward, next_state, done): 
        index = self.mem_counter % self.mem_size 

        self.state_memory[index] = state
        self.next_state_memory[index] = next_state
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = done
        self.mem_counter += 1

    def sample_experience(self, batch_size):
        # used to get the last transition
        offset = 1 if self.cer else 0

        max_mem = min(self.mem_counter, self.mem_size) - offset
        batch_index = np.random.choice(max_mem, batch_size - offset, replace=False)

        states = self.state_memory[batch_index]
        next_states = self.next_state_memory[batch_index]
        rewards = self.reward_memory[batch_index]
        actions = self.action_memory[batch_index]
        terminals = self.terminal_memory[batch_index]
        
        if self.cer: 
            last_index = self.mem_counter % self.mem_size - 1
            last_state = self.state_memory[last_index]
            last_action = self.action_memory[last_index]
            last_terminal = self.terminal_memory[last_index]
            last_next_state = self.next_state_memory[last_index]
            last_reward = self.reward_memory[last_index]

            # for 2d and 3d use vstack to append, for 1d array use append() to append the data
            states = np.vstack((self.state_memory[batch_index], last_state))
            next_states = np.vstack((self.next_state_memory[batch_index], last_next_state))

            actions = np.append(actions, last_action)
            terminals = np.append(terminals, last_terminal)
            rewards = np.append(rewards, last_reward)
    
        return states, actions, rewards, next_states, terminals
    
    
    def is_sufficient(self): 
        return self.mem_counter > self.batch_size

In [16]:
from tensorflow.keras.layers import Dense, Input, Flatten, Conv2D
import tensorflow as tf 
import tensorflow.keras as keras 

class CriticNetwork(tf.keras.Model):
    def __init__(self):
        super(CriticNetwork, self).__init__()
        self.conv1 = Conv2D(64, 3, activation='relu', kernel_initializer="he_uniform", data_format="channels_first")
        self.conv2 = Conv2D(32, 3, activation="relu", kernel_initializer="he_uniform", data_format="channels_first")
        self.fc1 = Dense(64, activation="relu",  kernel_initializer="he_uniform")
        self.fc2 = Dense(32, activation="relu", kernel_initializer="he_uniform")
        self.fc3 = Dense(1, activation=None)
        self.flatten = Flatten()
        
    def call(self, x):
        state = x[0]
        action = x[0]
      #  x = self.conv1(tf.concat([state, action], axis=1))
       # x = self.conv2(x)
        #x = self.flatten(x)
        #x = self.fc1(x)
        #x = self.fc3(x)
        x = self.fc1(tf.concat([state, action], axis=1))
        x = self.fc2(x)
        x = self.fc3(x)
        return x                                                                                                                                                        

In [17]:
class ActorNetwork(keras.Model):
    def __init__(self, noise, n_actions):
        super(ActorNetwork, self).__init__()
        self.noise = noise
        self.conv1 = Conv2D(64, 3, activation='relu', kernel_initializer="he_uniform", data_format="channels_first")
        self.conv2 = Conv2D(32, 3, activation="relu", kernel_initializer="he_uniform", data_format="channels_first")
        self.fc1 = Dense(64, activation='relu')
        self.fc2 = Dense(32, activation='relu')
        self.mu = Dense(n_actions, activation=None)
        self.sigma = Dense(n_actions, activation=None)
        self.flatten = Flatten()

    def call(self, state):
      #  x = self.conv1(state)
       # x = self.conv2(x)
        #x = self.flatten(x)
        #x = self.fc1(x)
        prob = self.fc1(state)
        prob = self.fc2(prob)
        mu = self.mu(prob)
        sigma = self.sigma(prob)
        
        sigma = tf.clip_by_value(sigma, self.noise, 1)
        return mu, sigma

In [18]:
class ValueNetwork(keras.Model):
    def __init__(self):
        super(ValueNetwork, self).__init__()
        self.conv1 = Conv2D(64, 3, activation='relu', kernel_initializer="he_uniform", data_format="channels_first")
        self.conv2 = Conv2D(32, 3, activation="relu", kernel_initializer="he_uniform", data_format="channels_first")
        self.fc1 = Dense(64, activation='relu', kernel_initializer="he_uniform")
        self.fc2 = Dense(32, activation='relu', kernel_initializer="he_uniform")
        self.v = Dense(1, activation="linear")
        self.flatten = Flatten()
        
    def call(self, state):
     #   x = self.conv1(state)
      #  x = self.conv2(x)
       # x = self.flatten(x)
       # x = self.fc1(x)
        state_value = self.fc1(state)
        state_value = self.fc2(state_value)
        v = self.v(state_value)
        return v

In [19]:
from tensorflow.keras.optimizers import Adam
import tensorflow as tf 
import tensorflow_probability as tfp
import numpy as np

class SACAgent:
  
    def __init__(self, input_dims, out_dims, gamma, alpha, beta,
                                            batch_size, noise, tau, mem_size,
                                            min_action, max_action, chkpt="models/"): 
        self.gamma = gamma 
        self.batch_size = batch_size 
        self.chkpt = chkpt
        self.min_action = min_action 
        self.max_action = max_action 
        self.scale = 5
        self.tau = tau
        
        self.memory = ExperienceReplayBuffer(mem_size, input_dims, out_dims, batch_size, False)
        self.actor = ActorNetwork(noise, out_dims)
        self.target_actor = ActorNetwork(noise, out_dims)
        self.critic_1 = CriticNetwork()
        self.critic_2 = CriticNetwork()
        self.target_critic = CriticNetwork()
        self.value_network = ValueNetwork()
        self.target_value_network = ValueNetwork()
        
        self.critic_1.compile(optimizer=Adam(learning_rate=beta))
        self.critic_2.compile(optimizer=Adam(learning_rate=beta))
        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.value_network.compile(optimizer=Adam(learning_rate=beta))
        self.target_value_network.compile(optimizer=Adam(learning_rate=beta))
    
    def sample_normal(self, state): 
        mu, std = self.actor(state)
        probs = tfp.distributions.Normal(mu, std)
        actions = probs.sample()
            
        action = tf.math.tanh(actions)*self.max_action
        log_probs = probs.log_prob(actions)
        log_probs -= tf.math.log(1-tf.math.pow(action, 2)+self.actor.noise)
        log_probs = tf.math.reduce_sum(log_probs, axis=1, keepdims=True)
        return actions, log_probs  
    
    def get_action(self, state): 
      #  state = tf.reshape(tf.convert_to_tensor(state), (1, *state.shape))
        state = tf.convert_to_tensor([state])
        actions, _ = self.sample_normal(state)
        return actions[0].numpy()

    def save_models(self):
        self.actor.save(self.chkpt + "_" + "actor_network")
        self.critic_1.save(self.chkpt + "_" + "critic_1_network")
        self.critic_2.save(self.chkpt + "_" + "critic_2_network")
        self.target_value_network.save(self.chkpt + "_" + "target_value_network")
        self.value_network.save(self.chkpt + "_" + "value_network")
        print("[+] Saving the model")


    def load_models(self):
        self.actor = tf.keras.models.load_model(self.chkpt + "_" + "actor_network") 
        self.critic_1 = tf.keras.models.load_model(self.chkpt + "_" + "critic_1_network") 
        self.critic_2 = tf.keras.models.load_model(self.chkpt + "_" + "critic_2_network") 
        self.value_network = tf.keras.models.load_model(self.chkpt + "_" + "value_network") 
        self.target_value_network = tf.keras.models.load_model(self.chkpt + "_" + "target_value_network") 
        
        print("[+] Loading the model")
        
    def store_experience(self, state, action, reward, state_, done):
        self.memory.store_experience(state, action, reward, state_, done)

    def sample_experience(self):
        state, action, reward, new_state, done = \
                                  self.memory.sample_experience(self.batch_size)
        states = tf.convert_to_tensor(state)
        rewards = tf.convert_to_tensor(reward)
        dones = tf.convert_to_tensor(done)
        actions = tf.convert_to_tensor(action, dtype=tf.int32)
        states_ = tf.convert_to_tensor(new_state)
        return states, actions, rewards, states_, dones
    
    def update_network_parameters(self): 
        value_weights = self.value_network.get_weights()
        target_value_weights = self.target_value_network.get_weights()
        
        for i in range(len(value_weights)): 
            target_value_weights[i] = self.tau * value_weights[i] + (1 - self.tau) * target_value_weights[i]
            
        self.target_value_network.set_weights(target_value_weights)
        #weights = []
        #targets = self.target_value_network.weights
        #for i, weight in enumerate(self.value_network.weights):
         #   weights.append(weight * tau + targets[i]*(1-tau))

        #self.target_value_network.set_weights(weights)
  
    def learn(self): 
        if not self.memory.is_sufficient(): 
            return 
        
        states, actions, rewards, states_, done = self.sample_experience()
        done = tf.cast(done, tf.float32)
        
        with tf.GradientTape() as tape:
            value = tf.squeeze(self.value_network(states), 1)
            current_policy_actions, log_probs = self.sample_normal(states)
            
            log_probs = tf.squeeze(log_probs, 1)
            q1_new_pi = self.critic_1((states, current_policy_actions))
            q2_new_pi = self.critic_2((states, current_policy_actions))
            critic_value = tf.squeeze(
                                tf.math.minimum(q1_new_pi, q2_new_pi), 1)

            value_target = critic_value - log_probs
            value_loss = 0.5 * keras.losses.MSE(value, value_target)
        params = self.value_network.trainable_variables
        grads = tape.gradient(value_loss, params)
        self.value_network.optimizer.apply_gradients(zip(grads, params))

        with tf.GradientTape() as tape:
            new_policy_actions, log_probs = self.sample_normal(states)

            log_probs = tf.squeeze(log_probs, 1)
            q1_new_policy = self.critic_1((states, new_policy_actions))
            q2_new_policy = self.critic_2((states, new_policy_actions))
            critic_value = tf.squeeze(tf.math.minimum(
                                        q1_new_policy, q2_new_policy), 1)
            actor_loss = self.actor_loss(log_probs, critic_value)
        params = self.actor.trainable_variables
        grads = tape.gradient(actor_loss, params)
        self.actor.optimizer.apply_gradients(zip(grads, params))

        with tf.GradientTape(persistent=True) as tape:
            value_ = tf.squeeze(self.target_value_network(states_), 1)            
            target_q_val = self.scale*rewards + self.gamma*value_*(1-done)
            q_val = tf.squeeze(self.critic_1((states, actions)), 1)
            critic_1_loss = self.critic_loss(q_val, target_q_val)
           
        params_1 = self.critic_1.trainable_variables
        grads_1 = tape.gradient(critic_1_loss, params_1)
        self.critic_1.optimizer.apply_gradients(zip(grads_1, params_1))
       
        
        with tf.GradientTape(persistent=True) as tape:
            value_ = tf.squeeze(self.target_value_network(states_), 1)
            target_q_val = self.scale*rewards + self.gamma*value_*(1-done)
            q_val = tf.squeeze(self.critic_2((states, actions)), 1)
            critic_2_loss = self.critic_loss(q_val, target_q_val)
            
        params_2 = self.critic_2.trainable_variables
        grads_2 = tape.gradient(critic_2_loss, params_2)
        self.critic_2.optimizer.apply_gradients(zip(grads_2, params_2))

        self.update_network_parameters()

    def actor_loss(self, log_probs, q_vals): 
        actor_loss = log_probs - q_vals
        actor_loss = tf.math.reduce_mean(actor_loss)
        return actor_loss
  
    def critic_loss(self, q_val, target_q_val): 
        return 0.5 * keras.losses.MSE(q_val, target_q_val)
                                

In [20]:
import time
from telebot import TeleBot
import datetime
import telebot

token = "6238487424:AAG0jRhvbiVa90qUcf2fAirQr_-quPMs7cU"
chat_id = "1055055706"
bot = TeleBot(token=token) 

def telegram_send(message, bot):
    chat_id = "1055055706"
    bot.send_message(chat_id=chat_id, text=message)

def welcome_msg(multi_step, double_dqn, dueling):
    st = 'Hi! Starting learning with DQN Multi-step = %d, Double DQN = %r, Dueling DQN = %r' % (multi_step, double_dqn, dueling)
    telegram_send(st, bot)
    
def info_msg(episode, max_episode, reward, best_score, loss): 
    st = f"Current Episode: {episode}, Current Reward: {reward}, Max Episode: {max_episode}, Best Score: {best_score}, loss: {loss}"
    telegram_send(st, bot)

def end_msg(learning_time):
    st = 'Finished! Learning time: ' + str(datetime.timedelta(seconds=int(learning_time)))
    telegram_send(st, bot)
    print(st)

In [21]:
import collections
import cv2
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
import tensorflow as tf
from gymnasium.wrappers import *


def manage_memory():
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
        except RuntimeError as e:
            print(e)


def plot_learning_curve(scores, epsilons, filename, lines=None):
    x = [_ for _ in range(len(scores))]
    fig=plt.figure()
    ax=fig.add_subplot(111, label="1")
    ax2=fig.add_subplot(111, label="2", frame_on=False)

    ax.plot(x, epsilons, color="C0")
    ax.set_xlabel("Training Steps", color="C0")
    ax.set_ylabel("Epsilon", color="C0")
    ax.tick_params(axis='x', colors="C0")
    ax.tick_params(axis='y', colors="C0")

    N = len(scores)
    running_avg = np.empty(N)
    for t in range(N):
	    running_avg[t] = np.mean(scores[max(0, t-20):(t+1)])

    ax2.scatter(x, running_avg, color="C1")
    ax2.axes.get_xaxis().set_visible(False)
    ax2.yaxis.tick_right()
    ax2.set_ylabel('Score', color="C1")
    ax2.yaxis.set_label_position('right')
    ax2.tick_params(axis='y', colors="C1")

    if lines is not None:
        for line in lines:
            plt.axvline(x=line)

    plt.savefig(filename)


def make_env(env_name, video_file_name, episode_freq_fo_video): 
    env = gym.make(env_name, render_mode="rgb_array")
    
    if len(env.observation_space.shape) >= 3: 
        #env = AtariPreprocessing(env, 10, 4, 84, False, True)
        env = ResizeObservation(env, 84)
        env = GrayScaleObservation(env, keep_dim=False)
        env = FrameStack(env, 4, lz4_compress=False)
        env = NormalizeObservation(env)

    return env

In [22]:
class Writer:
    def __init__(self, fname): 
        self.fname = fname 

    def write_to_file(self, content): 
        with open(self.fname, "a") as file: 
            file.write(content + "\n")

    def read_file(self, fname):
        with open(fname, "r") as file: 
            return file.read()
            

In [23]:
import numpy as np 
import imageio


class RecordVideo: 
    
    def __init__(self, prefix_fname,  out_directory="videos/", fps=10): 
        self.prefix_fname = prefix_fname
        self.out_directory = out_directory
        self.fps = fps
        self.images = []
        
    def add_image(self, image): 
        self.images.append(image)
    
    def save(self, episode_no): 
        name = self.out_directory + self.prefix_fname + "_" + str(episode_no) + ".mp4"
        imageio.mimsave(name, [np.array(img) for i, img in enumerate(self.images)], fps=self.fps)
        self.images = []

In [24]:
from npy_append_array import NpyAppendArray
import numpy as np

class Trainer:   
    def __init__(self, env, gamma, alpha, beta, batch_size, tau, noe, max_steps, 
                                                 is_tg, tg_bot_freq_epi, record, mem_size, noise, chkpt): 
       
        self.env = env 
        self.target_score = 0
        self.noe = noe
        self.max_steps = max_steps
        self.is_tg = is_tg
        self.tg_bot_freq_epi = tg_bot_freq_epi
        self.record = record 
        self.writer = Writer("model_training_results.txt")
        self.recorder = RecordVideo("sac", "videos/", 20)
        self.agent = SACAgent(env.observation_space.shape, env.action_space.shape[0], gamma, alpha, 
                                                beta, batch_size, noise, tau, mem_size, env.action_space.low[0],
                                                env.action_space.high[0], chkpt
                                        )

    def train_rl_model(self): 
        avg_rewards = []
        best_reward = float("-inf")
        episode_rewards = []

        for episode in range(self.noe): 
            n_steps = 0 
            state, _ = self.env.reset()
            reward = 0 

            if record and episode % 50 == 0:
                img = self.env.render()
                self.recorder.add_image(img)

            for step in range(self.max_steps): 
                
                action = self.agent.get_action(state)
                next_info = self.env.step(action)
                next_state, reward_prob, terminated, truncated, _ = next_info
                done = truncated or terminated
                reward += reward_prob

                self.agent.store_experience(state, action, reward_prob, next_state, done)
                self.agent.learn()

                # record
                if record and episode % 50 == 0:
                    img = self.env.render()
                    self.recorder.add_image(img)

                # next state
                state = next_state
                n_steps += 1        
                if done: 
                    break
            
            episode_rewards.append(reward)
            avg_reward = np.mean(episode_rewards[-100:])
            avg_rewards.append(avg_reward)

            result = f"Episode: {episode}, Steps: {n_steps}, Reward: {reward}, Best reward: {best_reward}, Avg reward: {avg_reward}"
            self.writer.write_to_file(result)
            print(result)

            # Recording.
            if record and episode % 50 == 0:
                self.recorder.save(episode)
            
            # Saving Best Model
            if reward > best_reward and episode!=0: 
                best_reward = reward
                self.agent.save_models()
                
            # Telegram bot
            if self.is_tg and episode % self.tg_bot_freq_epi == 0: 
                info_msg(episode+1, self.noe, reward, best_reward, "d")
                
            # Eatly Stopping
            if episode > 100 and np.mean(episode_rewards[-50:]) >= self.target_score: 
                break
                
                
        return episode_rewards, avg_rewards, best_reward
    

In [25]:
import gymnasium as gym
import time
import signal
import time
import sys
import pickle
import os 

env = make_env('LunarLanderContinuous-v2', None, None)
print(env)
record = True
gamma = 0.95
alpha = 0.0001# actor lr
beta = 0.001# critic lr 
batch_size = 64
tau = 0.05
noe = 500
max_steps = int(1e7)
is_tg = True 
tg_bot_freq_epi = 20
record = True 
mem_size = 25000
noise = 0.1
chkpt = "models/sac/"
  
if not os.path.exists("videos"): 
    os.mkdir("videos")

if not os.path.exists("test_videos"):
    os.mkdir("test_videos")


if __name__ == "__main__": 
    
    try: 
        manage_memory()
        trainer = Trainer(env, gamma, alpha, beta, batch_size, tau, noe,
                                          max_steps, is_tg, tg_bot_freq_epi, record, mem_size, noise, chkpt)
        episode_rewards, avg_rewards, best_reward = trainer.train_rl_model()
        
        with open("sac_episode_rewards.obj", "wb") as f: 
            pickle.dump(episode_rewards, f)
        
        with open("sac_avg_rewards.obj", "wb") as f: 
            pickle.dump(avg_rewards, f)
            
        x = [i+1 for i in range(noe)]
        plot_learning_curve(x, episode_rewards, "sac_con_mountain_car")

       # model_path = "models/lunarlander_DQN_q_value/"

        #evaluator = Eval(env, action_space, model_path, "vanilla_dqn_lunarlander", 10)
        #evaluator.test()
        
    except Exception as error: 
        raise error

<TimeLimit<OrderEnforcing<PassiveEnvChecker<LunarLander<LunarLanderContinuous-v2>>>>>
Episode: 0, Steps: 79, Reward: -545.9136988131911, Best reward: -inf, Avg reward: -545.9136988131911




Episode: 1, Steps: 128, Reward: -742.1603003576906, Best reward: -inf, Avg reward: -644.0369995854409
[+] Saving the model
Episode: 2, Steps: 68, Reward: -286.8447580403556, Best reward: -742.1603003576906, Avg reward: -524.9729190704124
[+] Saving the model
Episode: 3, Steps: 119, Reward: -320.75902756918424, Best reward: -286.8447580403556, Avg reward: -473.91944619510537
Episode: 4, Steps: 102, Reward: -163.458229018368, Best reward: -286.8447580403556, Avg reward: -411.8272027597579
[+] Saving the model
Episode: 5, Steps: 101, Reward: -295.1655839535804, Best reward: -163.458229018368, Avg reward: -392.383599625395
Episode: 6, Steps: 66, Reward: -199.15670951172348, Best reward: -163.458229018368, Avg reward: -364.7797581805848
Episode: 7, Steps: 110, Reward: -133.0645324930437, Best reward: -163.458229018368, Avg reward: -335.81535496964216
[+] Saving the model
Episode: 8, Steps: 131, Reward: -329.6269743009766, Best reward: -133.0645324930437, Avg reward: -335.1277571175682
Episo



Episode: 51, Steps: 163, Reward: -86.13370852287889, Best reward: 19.530170739125268, Avg reward: -272.0751988966334
Episode: 52, Steps: 101, Reward: -112.14820523251643, Best reward: 19.530170739125268, Avg reward: -269.05770845014064
Episode: 53, Steps: 78, Reward: -291.4682514907656, Best reward: 19.530170739125268, Avg reward: -269.47271850644853
Episode: 54, Steps: 91, Reward: -344.08749550944333, Best reward: 19.530170739125268, Avg reward: -270.8293508155939
Episode: 55, Steps: 78, Reward: -79.38704062125464, Best reward: 19.530170739125268, Avg reward: -267.41073813355206
Episode: 56, Steps: 108, Reward: -289.37565598101395, Best reward: 19.530170739125268, Avg reward: -267.79608756947243
Episode: 57, Steps: 79, Reward: -194.68888775121357, Best reward: 19.530170739125268, Avg reward: -266.5356186070887
Episode: 58, Steps: 106, Reward: -235.10124029677888, Best reward: 19.530170739125268, Avg reward: -266.00283253403256
Episode: 59, Steps: 79, Reward: -82.17088048414634, Best r



Episode: 101, Steps: 112, Reward: -50.03836668693256, Best reward: 19.530170739125268, Avg reward: -228.89981739325827
Episode: 102, Steps: 97, Reward: -101.46069612082917, Best reward: 19.530170739125268, Avg reward: -227.04597677406298
Episode: 103, Steps: 85, Reward: -219.147495235421, Best reward: 19.530170739125268, Avg reward: -226.02986145072538
Episode: 104, Steps: 196, Reward: -535.3645091823139, Best reward: 19.530170739125268, Avg reward: -229.74892425236482
Episode: 105, Steps: 95, Reward: -335.37669801358095, Best reward: 19.530170739125268, Avg reward: -230.15103539296484
Episode: 106, Steps: 85, Reward: -102.72401212842507, Best reward: 19.530170739125268, Avg reward: -229.18670841913186
Episode: 107, Steps: 81, Reward: -68.2052856699024, Best reward: 19.530170739125268, Avg reward: -228.53811595090042
Episode: 108, Steps: 85, Reward: -70.66796557588768, Best reward: 19.530170739125268, Avg reward: -225.94852586364956
Episode: 109, Steps: 202, Reward: -288.607931627457, 



Episode: 151, Steps: 156, Reward: -39.64237234893703, Best reward: 19.530170739125268, Avg reward: -225.79454640763777
Episode: 152, Steps: 75, Reward: -35.14954980002669, Best reward: 19.530170739125268, Avg reward: -225.0245598533128
Episode: 153, Steps: 145, Reward: -324.4745219325579, Best reward: 19.530170739125268, Avg reward: -225.35462255773075
Episode: 154, Steps: 116, Reward: -445.59755353585655, Best reward: 19.530170739125268, Avg reward: -226.36972313799487
Episode: 155, Steps: 77, Reward: -218.57189186531315, Best reward: 19.530170739125268, Avg reward: -227.76157165043549
Episode: 156, Steps: 148, Reward: -281.2429725552033, Best reward: 19.530170739125268, Avg reward: -227.6802448161774
Episode: 157, Steps: 68, Reward: -32.164907058218134, Best reward: 19.530170739125268, Avg reward: -226.0550050092474
Episode: 158, Steps: 88, Reward: -47.13068032773004, Best reward: 19.530170739125268, Avg reward: -224.1752994095569
Episode: 159, Steps: 99, Reward: -328.88899884366276,



Episode: 201, Steps: 88, Reward: -39.010480355282645, Best reward: 38.78506531892563, Avg reward: -232.196767293034
Episode: 202, Steps: 70, Reward: -86.08745038393312, Best reward: 38.78506531892563, Avg reward: -232.04303483566503
Episode: 203, Steps: 105, Reward: -128.33312728167562, Best reward: 38.78506531892563, Avg reward: -231.13489115612757
Episode: 204, Steps: 93, Reward: -94.91487229500733, Best reward: 38.78506531892563, Avg reward: -226.7303947872545
Episode: 205, Steps: 88, Reward: -317.9201645398332, Best reward: 38.78506531892563, Avg reward: -226.55582945251703
Episode: 206, Steps: 99, Reward: -361.56242311849525, Best reward: 38.78506531892563, Avg reward: -229.14421356241772
Episode: 207, Steps: 128, Reward: -155.92080087944348, Best reward: 38.78506531892563, Avg reward: -230.02136871451316
Episode: 208, Steps: 83, Reward: -324.87520050606713, Best reward: 38.78506531892563, Avg reward: -232.5634410638149
Episode: 209, Steps: 107, Reward: -418.6863691049844, Best re



Episode: 251, Steps: 83, Reward: -56.73204346719436, Best reward: 38.78506531892563, Avg reward: -212.07973310234595
Episode: 252, Steps: 102, Reward: -246.58018324939079, Best reward: 38.78506531892563, Avg reward: -214.19403943683966
Episode: 253, Steps: 104, Reward: -181.55819921097168, Best reward: 38.78506531892563, Avg reward: -212.76487620962382
Episode: 254, Steps: 97, Reward: -261.3545053138582, Best reward: 38.78506531892563, Avg reward: -210.92244572740387
Episode: 255, Steps: 103, Reward: -517.8692454415619, Best reward: 38.78506531892563, Avg reward: -213.91541926316634
Episode: 256, Steps: 116, Reward: -77.83400017800273, Best reward: 38.78506531892563, Avg reward: -211.88132953939427
Episode: 257, Steps: 83, Reward: -55.4321342480217, Best reward: 38.78506531892563, Avg reward: -212.11400181129233
Episode: 258, Steps: 106, Reward: -392.2021850441712, Best reward: 38.78506531892563, Avg reward: -215.5647168584567
Episode: 259, Steps: 96, Reward: -60.79140289572746, Best r

KeyboardInterrupt: 