In [1]:
!pip install gymnasium[atari] --quiet
!pip install gymnasium --quiet
!pip install -U gymnasium[atari] --quiet
!pip install imageio_ffmpeg --quiet
!pip install npy_append_array --quiet
!pip install pyTelegramBotAPI --quiet
!pip install gymnasium[accept-rom-license] --quiet
!pip install gymnasium[box2d] --quiet

[0m

In [2]:
import numpy as np 

class ExperienceReplayBuffer: 
    def __init__(self, max_memory, input_shape, batch_size, n_actions, cer=False): 
        self.mem_size = max_memory
        self.mem_counter = 0
        self.state_memory = np.zeros((self.mem_size, *input_shape),
                                     dtype=np.float32)
        self.next_state_memory = np.zeros((self.mem_size, *input_shape),
                                         dtype=np.float32)

        self.action_memory = np.zeros((self.mem_size, n_actions), dtype=np.float32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)
        self.batch_size = batch_size
        self.cer = cer

    def store_experience(self, state, action, reward, next_state, done): 
        index = self.mem_counter % self.mem_size 

        self.state_memory[index] = state
        self.next_state_memory[index] = next_state
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = done
        self.mem_counter += 1

    def sample_experience(self, batch_size):
        # used to get the last transition
        offset = 1 if self.cer else 0

        max_mem = min(self.mem_counter, self.mem_size) - offset
        batch_index = np.random.choice(max_mem, batch_size - offset, replace=False)

        states = self.state_memory[batch_index]
        next_states = self.next_state_memory[batch_index]
        rewards = self.reward_memory[batch_index]
        actions = self.action_memory[batch_index]
        terminals = self.terminal_memory[batch_index]

        if self.cer: 
            last_index = self.mem_counter % self.mem_size - 1
            last_state = self.state_memory[last_index]
            last_action = self.action_memory[last_index]
            last_terminal = self.terminal_memory[last_index]
            last_next_state = self.next_state_memory[last_index]
            last_reward = self.reward_memory[last_index]

            # for 2d and 3d use vstack to append, for 1d array use append() to append the data
            states = np.vstack((self.state_memory[batch_index], last_state))
            next_states = np.vstack((self.next_state_memory[batch_index], last_next_state))

            actions = np.append(actions, last_action)
            terminals = np.append(terminals, last_terminal)
            rewards = np.append(rewards, last_reward)
    
        return states, actions, rewards, next_states, terminals
    
    
    def is_sufficient(self): 
        return self.mem_counter > self.batch_size
        

In [18]:
import tensorflow as tf 
from tensorflow.keras.layers import Dense, Conv2D, Input, Lambda, concatenate
 
class ActorNetwork(tf.keras.Model):
    def __init__(self, input_dims, action_bound, action_dims, name):
        super(ActorNetwork, self).__init__()
        self.fc1 = Dense(64, activation="relu", kernel_initializer="he_uniform")
        self.fc2 = Dense(32, activation="relu", kernel_initializer="he_uniform")
        self.fc3 = Dense(16, activation="relu", kernel_initializer="he_uniform")
        
        self.out = Dense(action_dims, activation='tanh')

    def call(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        x = self.out(x)
        return x 


class CriticNetwork(tf.keras.Model):
    def __init__(self, input_dims, action_dims, name): 
        super(CriticNetwork, self).__init__()
        self.fc1 = Dense(64, activation="relu", kernel_initializer="he_uniform")
        self.fc2 = Dense(32, activation="relu", kernel_initializer="he_uniform")
        self.fc3 = Dense(16, activation="relu", kernel_initializer="he_uniform")
        self.out = Dense(1, activation='linear')

    def call(self, state, action):
        x = self.fc1(tf.concat([state, action], axis=1))
        x = self.fc2(x)
        x = self.fc3(x)
        x = self.out(x)
        return x 

In [27]:
from tensorflow.keras.optimizers import Adam
import tensorflow as tf 
import tensorflow_probability as tfp
import numpy as np

class TD3Agent:
  
    def __init__(self, input_dims, n_actions, gamma, alpha, beta, 
                                batch_size, mem_size, soft_update, 
                                tau, min_action, max_action, noise, eval_noise_scale): 
        self.gamma = gamma 
        self.explore_noise = noise
        self.n_actions = n_actions
        self.soft_update = soft_update
        self.tau = tau
        self.fname = "models/ddpg/"
        self.min_action = min_action
        self.max_action = max_action
        self.batch_size = batch_size
        self.eval_noise_scale = eval_noise_scale
        self.Normal = tfp.distributions.Normal


        self.memory = ExperienceReplayBuffer(mem_size, input_dims, batch_size, n_actions, cer=False)
        self.actor = ActorNetwork(input_dims[0], max_action, n_actions, "actor")
        self.target_actor = ActorNetwork(input_dims[0], max_action, n_actions, "target_actor")
        self.critic_1 = CriticNetwork(input_dims, 1, "critic_1")
        self.critic_2 = CriticNetwork(input_dims, 1, "critic_2")
        self.target_critic_1 = CriticNetwork(input_dims, 1, "target_critic_1")
        self.target_critic_2 = CriticNetwork(input_dims, 1, "target_critic_2")

        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.target_actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic_1.compile(optimizer=Adam(learning_rate=beta))
        self.critic_2.compile(optimizer=Adam(learning_rate=beta))
        self.target_critic_1.compile(optimizer=Adam(learning_rate=beta))
        self.target_critic_2.compile(optimizer=Adam(learning_rate=beta))

      #  self.update_target_networks()
    
    def evaluate(self, state, eval_noise_scale):
        """ 
        Generating action by the target actor, to pass it into the target critic
        """
        action = self.target_actor(state)
        action = self.max_action * action

        # add noise
        normal = self.Normal(0, 1)
        eval_noise_clip = 2 * eval_noise_scale
        noise = normal.sample(action.shape) * eval_noise_scale
        noise = tf.clip_by_value(noise, -eval_noise_clip, eval_noise_clip)
        action = action + noise
        return action
        
    def get_action(self, state, greedy=False): 
        state = tf.convert_to_tensor([state], dtype=tf.float32)
        mu = self.actor(state)[0]
        mu_prime = mu
        if not greedy: 
            mu_prime = mu_prime + np.random.normal(scale=self.explore_noise)
        mu_prime = tf.clip_by_value(mu_prime, self.min_action, self.max_action)
        
        return mu_prime

    def sample_action(self):
        """ generate random actions for exploration """
        mu = np.random.normal(scale=self.explore_noise, size=(self.n_actions,))
        mu_prime = mu + np.random.normal(scale=self.explore_noise)
        mu_prime = tf.clip_by_value(mu_prime, self.min_action, self.max_action)

        return mu_prime

    def store_experience(self, state, action, reward, state_, done):
        self.memory.store_experience(state, action, reward, state_, done)

    def sample_experience(self):
        state, action, reward, new_state, done = \
                                  self.memory.sample_experience(self.batch_size)
        states = tf.convert_to_tensor(state)
        rewards = tf.convert_to_tensor(reward)
        dones = tf.convert_to_tensor(done)
        actions = tf.convert_to_tensor(action)
        states_ = tf.convert_to_tensor(new_state)
        return states, actions, rewards, states_, dones
 
    def save_models(self):
        self.actor.save(self.fname + "td3_actor_network")
        self.target_actor.save(self.fname + "td3_target_actor_network")
        self.critic_1.save(self.fname  + "td3_critic_1_network")
        self.target_critic_1.save(self.fname  + "td3_target_critic_1_network")
        self.critic_2.save(self.fname  + "td3_critic_2_network")
        self.target_critic_2.save(self.fname  + "td3_target_critic_2_network")
        print("[+] Saving the models") 

    def load_models(self):
        self.actor = tf.keras.models.load_model(self.fname + "_" + "td3_actor_network") 
        self.target_actor = tf.keras.models.load_model(self.fname + "_" + "td3_target_actor_network") 
        self.critic_1 = tf.keras.models.load_model(self.fname + "_" + "td3_critic_1_network") 
        self.target_critic_1 = tf.keras.models.load_model(self.fname + "_" + "td3_target_critic_1_network") 
        self.critic_2 = tf.keras.models.load_model(self.fname + "_" + "td3_critic_2_network") 
        self.target_critic_2 = tf.keras.models.load_model(self.fname + "_" + "td3_target_critic_2_network") 
        print("[+] Loading the models")
  
    def learn(self, update_actor=False): 
        if not self.memory.is_sufficient():
            return
        states, actions, rewards, next_states, dones = self.sample_experience()
       
        target_actions = self.target_actor(next_states)
        target_actions = target_actions + tf.clip_by_value(np.random.normal(scale=0.2), -0.5, 0.5)
        target_actions = tf.clip_by_value(target_actions, self.min_action,
                                                                  self.max_action)
        one_step_lookahead_vals_1 = self.target_critic_1(next_states, target_actions)
        one_step_lookahead_vals_2 = self.target_critic_2(next_states, target_actions)
        one_step_lookahead_vals_1 = tf.squeeze(one_step_lookahead_vals_1, 1)
        one_step_lookahead_vals_2 = tf.squeeze(one_step_lookahead_vals_2, 1)
        one_step_lookahead_vals = tf.math.minimum(one_step_lookahead_vals_1, one_step_lookahead_vals_2)
        
        with tf.GradientTape() as tape:
            q_vals_1 = self.critic_1(states, actions)
            q_vals_1 = tf.squeeze(q_vals_1, 1)
            target_q_vals = rewards + self.gamma * one_step_lookahead_vals * ([1 - int(d) for d in dones])
            critic_1_loss = self.critic_loss(q_vals_1, target_q_vals)
      
        critic_1_params = self.critic_1.trainable_variables
        critic_1_grads = tape.gradient(critic_1_loss, critic_1_params)
        self.critic_1.optimizer.apply_gradients(zip(critic_1_grads, critic_1_params))
        
        with tf.GradientTape() as tape:
            q_vals_2 = self.critic_2(states, actions)
            q_vals_2 = tf.squeeze(q_vals_2, 1)
            target_q_vals = rewards + self.gamma * one_step_lookahead_vals * ([1 - int(d) for d in dones])
            critic_2_loss = self.critic_loss(q_vals_2, target_q_vals)
      
        critic_2_params = self.critic_2.trainable_variables
        critic_2_grads = tape.gradient(critic_2_loss, critic_2_params)
        self.critic_2.optimizer.apply_gradients(zip(critic_2_grads, critic_2_params))


        if not update_actor: 
            return 
        
        with tf.GradientTape() as tape:
            pred_actions = self.actor(states)
            q_vals = self.critic_1(states, pred_actions)
            actor_loss = self.actor_loss(q_vals)

        actor_params = self.actor.trainable_variables
        actor_grads = tape.gradient(actor_loss, actor_params)
        self.actor.optimizer.apply_gradients(zip(actor_grads, actor_params))
        self.update_target_networks()
    
    def critic_loss(self, q_vals, target_q_vals): 
        loss = tf.keras.losses.MSE(q_vals, target_q_vals)
        return loss
    
    def actor_loss(self, q_vals):
        return -tf.math.reduce_mean(q_vals)

    def update_target_networks(self):
        actor_weights = self.actor.get_weights()
        t_actor_weights = self.target_actor.get_weights()
        critic_1_weights = self.critic_1.get_weights()
        t_critic_1_weights = self.target_critic_1.get_weights()
        critic_2_weights = self.critic_2.get_weights()
        t_critic_2_weights = self.target_critic_2.get_weights()
        if self.soft_update: 
            for i in range(len(actor_weights)):
                t_actor_weights[i] = self.tau * actor_weights[i] + (1 - self.tau) * t_actor_weights[i]

            for i in range(len(critic_1_weights)):
                t_critic_1_weights[i] = self.tau * critic_1_weights[i] + (1 - self.tau) * t_critic_1_weights[i]

            for i in range(len(critic_2_weights)):
                t_critic_2_weights[i] = self.tau * critic_2_weights[i] + (1 - self.tau) * t_critic_2_weights[i]

        self.target_actor.set_weights(t_actor_weights)
        self.target_critic_1.set_weights(t_critic_1_weights)
        self.target_critic_2.set_weights(t_critic_2_weights)
  

In [28]:
import time
from telebot import TeleBot
import datetime
import telebot

token = "6238487424:AAG0jRhvbiVa90qUcf2fAirQr_-quPMs7cU"
chat_id = "1055055706"
bot = TeleBot(token=token) 

def telegram_send(message, bot):
    chat_id = "1055055706"
    bot.send_message(chat_id=chat_id, text=message)

def welcome_msg(multi_step, double_dqn, dueling):
    st = 'Hi! Starting learning with DQN Multi-step = %d, Double DQN = %r, Dueling DQN = %r' % (multi_step, double_dqn, dueling)
    telegram_send(st, bot)
    
def info_msg(episode, max_episode, reward, best_score, loss): 
    st = f"Current Episode: {episode}, Current Reward: {reward}, Max Episode: {max_episode}, Best Score: {best_score}, loss: {loss}"
    telegram_send(st, bot)

def end_msg(learning_time):
    st = 'Finished! Learning time: ' + str(datetime.timedelta(seconds=int(learning_time)))
    telegram_send(st, bot)
    print(st)

In [29]:
import collections
import cv2
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
import tensorflow as tf
from gymnasium.wrappers import *


def manage_memory():
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
        except RuntimeError as e:
            print(e)


def plot_learning_curve(scores, epsilons, filename, lines=None):
    x = [_ for _ in range(len(scores))]
    fig=plt.figure()
    ax=fig.add_subplot(111, label="1")
    ax2=fig.add_subplot(111, label="2", frame_on=False)

    ax.plot(x, epsilons, color="C0")
    ax.set_xlabel("Training Steps", color="C0")
    ax.set_ylabel("Epsilon", color="C0")
    ax.tick_params(axis='x', colors="C0")
    ax.tick_params(axis='y', colors="C0")

    N = len(scores)
    running_avg = np.empty(N)
    for t in range(N):
	    running_avg[t] = np.mean(scores[max(0, t-20):(t+1)])

    ax2.scatter(x, running_avg, color="C1")
    ax2.axes.get_xaxis().set_visible(False)
    ax2.yaxis.tick_right()
    ax2.set_ylabel('Score', color="C1")
    ax2.yaxis.set_label_position('right')
    ax2.tick_params(axis='y', colors="C1")

    if lines is not None:
        for line in lines:
            plt.axvline(x=line)

    plt.savefig(filename)


def make_env(env_name, video_file_name, episode_freq_fo_video): 
    env = gym.make(env_name, render_mode="rgb_array")
    
    if len(env.observation_space.shape) >= 3: 
        #env = AtariPreprocessing(env, 10, 4, 84, False, True)
        env = ResizeObservation(env, 84)
        env = GrayScaleObservation(env, keep_dim=False)
        env = FrameStack(env, 4, lz4_compress=False)
        env = NormalizeObservation(env)

    return env

In [30]:
class Writer:
    def __init__(self, fname): 
        self.fname = fname 

    def write_to_file(self, content): 
        with open(self.fname, "a") as file: 
            file.write(content + "\n")

    def read_file(self, fname):
        with open(fname, "r") as file: 
            return file.read()
            

In [31]:
import numpy as np 
import imageio


class RecordVideo: 
    
    def __init__(self, prefix_fname,  out_directory="videos/", fps=10): 
        self.prefix_fname = prefix_fname
        self.out_directory = out_directory
        self.fps = fps
        self.images = []
        
    def add_image(self, image): 
        self.images.append(image)
    
    def save(self, episode_no): 
        name = self.out_directory + self.prefix_fname + "_" + str(episode_no) + ".mp4"
        imageio.mimsave(name, [np.array(img) for i, img in enumerate(self.images)], fps=self.fps)
        self.images = []

In [32]:
from npy_append_array import NpyAppendArray
import numpy as np

class Trainer:   
    def __init__(self, env, gamma, alpha, beta, batch_size, 
                                     tau, sot_update, noe, max_steps, 
                                     is_tg, tg_bot_freq_epi, record, 
                                     mem_size, noise, explore_steps=1000, 
                                     policy_network_update_freq=3, eval_noise_scale=0.5): 
       
        self.env = env 
        self.target_score = 280
        self.explore_steps = explore_steps
        self.policy_network_update_freq = policy_network_update_freq
        self.noe = noe
        self.max_steps = max_steps
        self.is_tg = is_tg
        self.tg_bot_freq_epi = tg_bot_freq_epi
        self.record = record 
        self.writer = Writer("model_training_results.txt")
        self.recorder = RecordVideo("ddpg", "videos/", 20)
        self.agent = TD3Agent(env.observation_space.shape, 
                               env.action_space.shape[0], gamma, alpha,
                               beta, batch_size, mem_size,
                               soft_update, tau, env.action_space.low[0],
                               env.action_space.high[0], noise, eval_noise_scale
                            )
    def train_rl_model(self): 
        avg_rewards = []
        best_reward = float("-inf")
        steps_idx = 0
        episode_rewards = []

        for episode in range(self.noe): 
            n_steps = 0          
            state = self.env.reset()
            reward = 0 

            if record and episode % 50 == 0:
                img = self.env.render()
                self.recorder.add_image(img)

            for step in range(self.max_steps): 

                if type(state) == tuple: 
                    state = state[0]

                if steps_idx > self.explore_steps:
                    action = self.agent.get_action(state, greedy=False)

                else: 
                    action = self.agent.sample_action()

                next_info = self.env.step(action)
                next_state, reward_prob, terminated, truncated, _ = next_info
                done = truncated or terminated
                reward += reward_prob

                self.agent.store_experience(state, action, reward_prob, next_state, done)

                # updating agent for every n steps
                update_agent = False 
                if step % self.policy_network_update_freq == 0:
                    update_agent = True 
                self.agent.learn(update_agent)

                # record
                if record and episode % 50 == 0:
                    img = self.env.render()
                    self.recorder.add_image(img)

                # next state
                state = next_state
                n_steps += 1    
                steps_idx += 1            
                if done: 
                    break
            
            episode_rewards.append(reward)
            avg_reward = np.mean(episode_rewards[-100:])
            avg_rewards.append(avg_reward)

            result = f"Episode: {episode}, Steps: {n_steps}, Reward: {reward}, Best reward: {best_reward}, Avg reward: {avg_reward}"
            self.writer.write_to_file(result)
            print(result)

            # Recording.
            if record and episode % 50 == 0:
                self.recorder.save(episode)
            
            # Saving Best Model
            if reward > best_reward: 
                best_reward = reward
                self.agent.save_models()
                
            # Telegram bot
            if self.is_tg and episode % self.tg_bot_freq_epi == 0: 
                info_msg(episode+1, self.noe, reward, best_reward, "d")
                
            # Eatly Stopping
            if episode > 100 and np.mean(episode_rewards[-50:]) >= self.target_score: 
                break
                
                
        return episode_rewards, avg_rewards, best_reward
    

In [33]:
import gymnasium as gym
import time
import signal
import time
import sys
import pickle
import os 

env = make_env("LunarLanderContinuous-v2", "videos/", 50)
print(env.action_space)
record = True
gamma = 0.99
alpha = 0.0001# actor lr
beta = 0.001# critic lr 
batch_size = 64
tau = 0.01
soft_update = True 
noe = 500
max_steps = int(1e5)
is_tg = True 
tg_bot_freq_epi = 20
record = True 
mem_size = int(5e6)
noise = 0.5
explore_steps = 1000
policy_network_update_freq = 2
  
    
if not os.path.exists("videos"): 
    os.mkdir("videos")

if not os.path.exists("test_videos"):
    os.mkdir("test_videos")


if __name__ == "__main__": 
    
    try: 
        manage_memory()
        trainer = Trainer(env, gamma, alpha, beta, batch_size, tau,
                          soft_update, noe, max_steps, is_tg,
                          tg_bot_freq_epi, record, mem_size, noise,
                          explore_steps, policy_network_update_freq
                )
        episode_rewards, avg_rewards, best_reward = trainer.train_rl_model()
        
        with open("td3_episode_rewards.obj", "wb") as f: 
            pickle.dump(episode_rewards, f)
        
        with open("td3_avg_rewards.obj", "wb") as f:
            pickle.dump(avg_rewards, f)
            
        x = [i+1 for i in range(noe)]
        plot_learning_curve(x, episode_rewards, "td3_con_mountain_car")

       # model_path = "models/lunarlander_DQN_q_value/"

        #evaluator = Eval(env, action_space, model_path, "vanilla_dqn_lunarlander", 10)
        #evaluator.test()
        
    except Exception as error: 
        raise error

Box(-1.0, 1.0, (2,), float32)
Episode: 0, Steps: 121, Reward: -85.7920611581755, Best reward: -inf, Avg reward: -85.7920611581755




[+] Saving the models
Episode: 1, Steps: 95, Reward: -201.9269544407556, Best reward: -85.7920611581755, Avg reward: -143.85950779946555
Episode: 2, Steps: 106, Reward: -303.7181263031381, Best reward: -85.7920611581755, Avg reward: -197.14571396735641
Episode: 3, Steps: 64, Reward: -36.88672676612599, Best reward: -85.7920611581755, Avg reward: -157.0809671670488
[+] Saving the models
Episode: 4, Steps: 98, Reward: -293.1118251026838, Best reward: -36.88672676612599, Avg reward: -184.2871387541758
Episode: 5, Steps: 138, Reward: -265.6360749308625, Best reward: -36.88672676612599, Avg reward: -197.84529478362356
Episode: 6, Steps: 113, Reward: -105.69592777424819, Best reward: -36.88672676612599, Avg reward: -184.68109949656994
Episode: 7, Steps: 133, Reward: -207.1898456422914, Best reward: -36.88672676612599, Avg reward: -187.49469276478516
Episode: 8, Steps: 96, Reward: -261.0154022716815, Best reward: -36.88672676612599, Avg reward: -195.66366048777365
Episode: 9, Steps: 108, Rewa



Episode: 51, Steps: 109, Reward: 48.29906642859308, Best reward: 150.30938321492698, Avg reward: -229.7248981866764
Episode: 52, Steps: 1000, Reward: 132.26582581827552, Best reward: 150.30938321492698, Avg reward: -222.89488452620557
Episode: 53, Steps: 163, Reward: -15.978210661298164, Best reward: 150.30938321492698, Avg reward: -219.06309426944804
Episode: 54, Steps: 158, Reward: 14.250628285281607, Best reward: 150.30938321492698, Avg reward: -214.8210265866348
Episode: 55, Steps: 176, Reward: 34.320677523628945, Best reward: 150.30938321492698, Avg reward: -210.37206758466579
Episode: 56, Steps: 119, Reward: 21.39325185092818, Best reward: 150.30938321492698, Avg reward: -206.3060093489536
Episode: 57, Steps: 169, Reward: 13.01498309829033, Best reward: 150.30938321492698, Avg reward: -202.52461292744943
Episode: 58, Steps: 163, Reward: 10.965804426065361, Best reward: 150.30938321492698, Avg reward: -198.90613127738985
Episode: 59, Steps: 125, Reward: 40.974176038789096, Best re



Episode: 101, Steps: 181, Reward: 27.706508011557233, Best reward: 150.30938321492698, Avg reward: -115.61739675539579
Episode: 102, Steps: 290, Reward: -193.29250038773637, Best reward: 150.30938321492698, Avg reward: -114.51314049624177
Episode: 103, Steps: 331, Reward: -231.53539602498776, Best reward: 150.30938321492698, Avg reward: -116.45962718883042
Episode: 104, Steps: 223, Reward: -45.57045078451151, Best reward: 150.30938321492698, Avg reward: -113.98421344564868
Episode: 105, Steps: 199, Reward: 21.349024826393318, Best reward: 150.30938321492698, Avg reward: -111.1143624480761
Episode: 106, Steps: 205, Reward: 16.959141947114034, Best reward: 150.30938321492698, Avg reward: -109.88781175086247
Episode: 107, Steps: 262, Reward: -42.214880533004724, Best reward: 150.30938321492698, Avg reward: -108.2380620997696
Episode: 108, Steps: 266, Reward: -68.8634981398063, Best reward: 150.30938321492698, Avg reward: -106.31654305845086
Episode: 109, Steps: 1000, Reward: 65.0704286322



Episode: 151, Steps: 195, Reward: -151.7386061871189, Best reward: 150.30938321492698, Avg reward: -16.88721320479163
Episode: 152, Steps: 192, Reward: -20.520696474051107, Best reward: 150.30938321492698, Avg reward: -18.4150784277149
Episode: 153, Steps: 239, Reward: -1.5581032668744967, Best reward: 150.30938321492698, Avg reward: -18.27087735377066
Episode: 154, Steps: 1000, Reward: 146.49702923841355, Best reward: 150.30938321492698, Avg reward: -16.948413344239345
Episode: 155, Steps: 197, Reward: -27.805604630100476, Best reward: 150.30938321492698, Avg reward: -17.569676165776638
Episode: 156, Steps: 205, Reward: -71.78978265696377, Best reward: 150.30938321492698, Avg reward: -18.501506510855556
Episode: 157, Steps: 179, Reward: -44.96209197328977, Best reward: 150.30938321492698, Avg reward: -19.081277261571355
Episode: 158, Steps: 191, Reward: -24.50343070537653, Best reward: 150.30938321492698, Avg reward: -19.435969612885778
Episode: 159, Steps: 184, Reward: -20.9682823532



Episode: 201, Steps: 1000, Reward: 155.88625086083465, Best reward: 171.1722467345924, Avg reward: -22.965883790487283
Episode: 202, Steps: 1000, Reward: 40.28604554385778, Best reward: 171.1722467345924, Avg reward: -20.63009833117134
Episode: 203, Steps: 827, Reward: 126.39147682529858, Best reward: 171.1722467345924, Avg reward: -17.05082960266847
Episode: 204, Steps: 1000, Reward: -70.31059104372916, Best reward: 171.1722467345924, Avg reward: -17.298231005260646
Episode: 205, Steps: 1000, Reward: -62.14634504693565, Best reward: 171.1722467345924, Avg reward: -18.133184703993937
Episode: 206, Steps: 1000, Reward: -61.47178460447232, Best reward: 171.1722467345924, Avg reward: -18.917493969509803
Episode: 207, Steps: 1000, Reward: -59.5186700144834, Best reward: 171.1722467345924, Avg reward: -19.09053186432459
Episode: 208, Steps: 1000, Reward: -82.84604443770651, Best reward: 171.1722467345924, Avg reward: -19.23035732730359
Episode: 209, Steps: 1000, Reward: -205.4973876828546, 



Episode: 251, Steps: 1000, Reward: 104.47050518338948, Best reward: 270.06904296486215, Avg reward: -28.23428453789776
Episode: 252, Steps: 407, Reward: 178.0477573048347, Best reward: 270.06904296486215, Avg reward: -26.248600000108908
Episode: 253, Steps: 1000, Reward: 171.99786922697257, Best reward: 270.06904296486215, Avg reward: -24.51304027517044
Episode: 254, Steps: 1000, Reward: 81.25018376035658, Best reward: 270.06904296486215, Avg reward: -25.165508729951007
Episode: 255, Steps: 1000, Reward: 136.2845580970045, Best reward: 270.06904296486215, Avg reward: -23.524607102679955
Episode: 256, Steps: 1000, Reward: 142.64832254362196, Best reward: 270.06904296486215, Avg reward: -21.380226050674104
Episode: 257, Steps: 1000, Reward: 174.21533798552483, Best reward: 270.06904296486215, Avg reward: -19.18845175108595
Episode: 258, Steps: 1000, Reward: 156.41387764728802, Best reward: 270.06904296486215, Avg reward: -17.379278667559305
Episode: 259, Steps: 1000, Reward: 80.836548557



Episode: 301, Steps: 582, Reward: -182.49875179645926, Best reward: 270.06904296486215, Avg reward: -27.144481333918097
Episode: 302, Steps: 376, Reward: -97.87219496481451, Best reward: 270.06904296486215, Avg reward: -28.526063739004822
Episode: 303, Steps: 1000, Reward: 56.54930921668309, Best reward: 270.06904296486215, Avg reward: -29.22448541509097
Episode: 304, Steps: 1000, Reward: 140.07637302455956, Best reward: 270.06904296486215, Avg reward: -27.120615774408087
Episode: 305, Steps: 536, Reward: -138.15908970342312, Best reward: 270.06904296486215, Avg reward: -27.880743220972963
Episode: 306, Steps: 1000, Reward: 55.27302714160878, Best reward: 270.06904296486215, Avg reward: -26.713295103512152
Episode: 307, Steps: 1000, Reward: 74.47603312328778, Best reward: 270.06904296486215, Avg reward: -25.373348072134437
Episode: 308, Steps: 603, Reward: -137.37374938202612, Best reward: 270.06904296486215, Avg reward: -25.918625121577634
Episode: 309, Steps: 580, Reward: -180.383144



Episode: 351, Steps: 285, Reward: -126.68110423424349, Best reward: 270.06904296486215, Avg reward: 12.397198349178968
Episode: 352, Steps: 199, Reward: 41.979134948071106, Best reward: 270.06904296486215, Avg reward: 11.03651212561133
Episode: 353, Steps: 1000, Reward: 173.46040764238253, Best reward: 270.06904296486215, Avg reward: 11.051137509765432
Episode: 354, Steps: 257, Reward: -94.5213350066885, Best reward: 270.06904296486215, Avg reward: 9.29342232209498
Episode: 355, Steps: 273, Reward: -22.598958423786215, Best reward: 270.06904296486215, Avg reward: 7.704587156887075
Episode: 356, Steps: 318, Reward: 249.98698064923826, Best reward: 270.06904296486215, Avg reward: 8.777973737943235
Episode: 357, Steps: 1000, Reward: 38.37651343873393, Best reward: 270.06904296486215, Avg reward: 7.419585492475328
Episode: 358, Steps: 1000, Reward: 166.022160014957, Best reward: 270.06904296486215, Avg reward: 7.515668316152016
Episode: 359, Steps: 1000, Reward: 83.51758610253765, Best rew



Episode: 401, Steps: 1000, Reward: 141.78072032056463, Best reward: 270.06904296486215, Avg reward: 34.150617184589585
Episode: 402, Steps: 1000, Reward: 96.58115462592528, Best reward: 270.06904296486215, Avg reward: 36.09515068049698
Episode: 403, Steps: 1000, Reward: -120.43573253874571, Best reward: 270.06904296486215, Avg reward: 34.32530026294268
Episode: 404, Steps: 1000, Reward: 132.80705025286952, Best reward: 270.06904296486215, Avg reward: 34.25260703522578
Episode: 405, Steps: 1000, Reward: -69.21535077861257, Best reward: 270.06904296486215, Avg reward: 34.942044424473885
Episode: 406, Steps: 1000, Reward: 120.26212925119864, Best reward: 270.06904296486215, Avg reward: 35.59193544556978
Episode: 407, Steps: 1000, Reward: -109.40178571040276, Best reward: 270.06904296486215, Avg reward: 33.753157257232886
Episode: 408, Steps: 1000, Reward: 136.08048893137993, Best reward: 270.06904296486215, Avg reward: 36.48769964036694
Episode: 409, Steps: 1000, Reward: 158.7095402113503

KeyboardInterrupt: 