In [7]:
import gym
from gym import spaces
import numpy as np

class TrainSchedulingEnv(gym.Env):
    def __init__(self):
        # Environment configuration
        super(TrainSchedulingEnv, self).__init__()
        self.num_trains = 5  # Number of trains
        self.num_stations = 10  # Number of stations
        self.track_capacity = 3  # Maximum trains per track
        
        # Action space: Schedule for each train (departure station, departure time, and route)
        self.action_space = spaces.Box(
            low=0, high=self.num_stations - 1, shape=(self.num_trains, 2), dtype=np.int32
        )
        
        # Observation space: State of each train (position, time to next station, delay)
        self.observation_space = spaces.Box(
            low=0, high=100, shape=(self.num_trains, 3), dtype=np.float32
        )
        
        # Initial state
        self.state = None
        self.reset()
        
    def reset(self):
        # Reset train positions and times
        self.state = np.zeros((self.num_trains, 3))  # Position, time, delay
        return self.state
    
    def step(self, action):
        """
        action: Array of shape (num_trains, 2) where each row represents:
        [departure_station, departure_time]
        """
        reward = 0
        done = False
        
        # Update train positions based on the action
        for train_id, (station, departure_time) in enumerate(action):
            if self.state[train_id, 0] == station:  # If train is already at the station
                self.state[train_id, 1] += 1  # Simulate travel to the next station
                if self.state[train_id, 1] > departure_time:
                    # Calculate delay
                    delay = self.state[train_id, 1] - departure_time
                    reward -= delay  # Penalize delays
                else:
                    reward += 10  # Reward timely departure

        # Define terminal condition
        if np.all(self.state[:, 0] == self.num_stations - 1):  # All trains reach their destination
            done = True

        # Add noise for disruptions (e.g., random delays)
        self.state[:, 2] += np.random.randint(-1, 2, size=self.num_trains)
        
        return self.state, reward, done, {}
    
    def render(self, mode='human'):
        # Display train positions and delays
        for train_id, train_state in enumerate(self.state):
            print(f"Train {train_id}: Position: {train_state[0]}, Delay: {train_state[2]} mins")

# Instantiate and test the environment
env = TrainSchedulingEnv()
state = env.reset()

for _ in range(10):
    action = np.random.randint(0, env.num_stations, (env.num_trains, 2))
    state, reward, done, _ = env.step(action)
    env.render()
    if done:
        print("All trains reached their destinations!")
        break


Train 0: Position: 0.0, Delay: 0.0 mins
Train 1: Position: 0.0, Delay: 1.0 mins
Train 2: Position: 0.0, Delay: 0.0 mins
Train 3: Position: 0.0, Delay: 0.0 mins
Train 4: Position: 0.0, Delay: 0.0 mins
Train 0: Position: 0.0, Delay: 1.0 mins
Train 1: Position: 0.0, Delay: 0.0 mins
Train 2: Position: 0.0, Delay: 0.0 mins
Train 3: Position: 0.0, Delay: 1.0 mins
Train 4: Position: 0.0, Delay: 0.0 mins
Train 0: Position: 0.0, Delay: 1.0 mins
Train 1: Position: 0.0, Delay: 1.0 mins
Train 2: Position: 0.0, Delay: 1.0 mins
Train 3: Position: 0.0, Delay: 1.0 mins
Train 4: Position: 0.0, Delay: 0.0 mins
Train 0: Position: 0.0, Delay: 0.0 mins
Train 1: Position: 0.0, Delay: 1.0 mins
Train 2: Position: 0.0, Delay: 1.0 mins
Train 3: Position: 0.0, Delay: 0.0 mins
Train 4: Position: 0.0, Delay: 0.0 mins
Train 0: Position: 0.0, Delay: -1.0 mins
Train 1: Position: 0.0, Delay: 1.0 mins
Train 2: Position: 0.0, Delay: 2.0 mins
Train 3: Position: 0.0, Delay: 1.0 mins
Train 4: Position: 0.0, Delay: -1.0 min

In [6]:
pip install gym

Note: you may need to restart the kernel to use updated packages.Collecting gym
  Using cached gym-0.26.2.tar.gz (721 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting cloudpickle>=1.2.0 (from gym)
  Downloading cloudpickle-3.1.0-py3-none-any.whl.metadata (7.0 kB)
Collecting gym_notices>=0.0.4 (from gym)
  Downloading gym_notices-0.0.8-py3-none-any.whl.metadata (1.0 kB)
Downloading cloudpickle-3.1.0-py3-none-any.whl (22 kB)
Downloading gym_notices-0.0.8-py3-none-any.whl (3.0 kB)
Building wheels for collected packages: gym
  Building wheel for gym (pyproject.toml): started
  Building wheel for gym (pyproject.toml): finished with status 'done'
  Created wheel for gym: filename=gym-0.26.2-py3-none-a

DEPRECATION: Loading egg at c:\python311\lib\site-packages\vboxapi-1.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330

[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import gym
from gym import spaces
import numpy as np

class TrainSchedulingEnv(gym.Env):
    def __init__(self):
        super(TrainSchedulingEnv, self).__init__()
        
        # Environment Configuration
        self.num_trains = 5
        self.num_stations = 10
        self.track_capacity = 3
        
        # Action Space: Each train chooses its next station and departure time
        self.action_space = spaces.Box(low=0, high=1, shape=(self.num_trains, 2), dtype=np.float32)
        
        # Observation Space: Position, delay, passenger demand
        self.observation_space = spaces.Box(
            low=0, high=10, shape=(self.num_trains, 3), dtype=np.float32
        )
        
        self.reset()

    def reset(self):
        # Reset environment
        self.state = np.zeros((self.num_trains, 3))
        return self.state

    def step(self, action):
        # Update state based on actions
        reward = 0
        done = False
        
        for train_id, act in enumerate(action):
            self.state[train_id, 0] += act[0]  # Move train
            delay = max(0, self.state[train_id, 1] - act[1])
            reward -= delay
        
        return self.state, reward, done, {}

    def render(self, mode='human'):
        print(f"Train States: {self.state}")

env = TrainSchedulingEnv()


In [25]:
from stable_baselines3 import PPO

# Initialize environment
env = TrainSchedulingEnv()

# Train PPO Model
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10000)

# Save and Test the Model
model.save("train_scheduling_rl")

ModuleNotFoundError: No module named 'stable_baselines3'

In [10]:
pip install stable-baselines3

Collecting stable-baselines3
  Downloading stable_baselines3-2.4.0-py3-none-any.whl.metadata (4.5 kB)
Collecting gymnasium<1.1.0,>=0.29.1 (from stable-baselines3)
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting torch>=1.13 (from stable-baselines3)
  Downloading torch-2.5.1-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium<1.1.0,>=0.29.1->stable-baselines3)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Collecting filelock (from torch>=1.13->stable-baselines3)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4.3.0 (from gymnasium<1.1.0,>=0.29.1->stable-baselines3)
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting networkx (from torch>=1.13->stable-baselines3)
  Downloading networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting fsspec (from torch>=1.13->stable-baselines3)
  Downloading fsspec-202

DEPRECATION: Loading egg at c:\python311\lib\site-packages\vboxapi-1.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330
ERROR: Could not install packages due to an OSError: [Errno 13] Permission denied: 'C:\\Python311\\share\\man\\man1\\isympy.1'
Consider using the `--user` option or check the permissions.


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [24]:
from stable_baselines3 import PPO

# Initialize environment
env = TrainSchedulingEnv()

# Train PPO Model
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10000)

# Save and Test the Model
model.save("train_scheduling_rl")

ModuleNotFoundError: No module named 'stable_baselines3'

In [21]:
python3

NameError: name 'python3' is not defined

In [26]:
pip show stable-baselines3


Note: you may need to restart the kernel to use updated packages.


DEPRECATION: Loading egg at c:\python311\lib\site-packages\vboxapi-1.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330


In [1]:
import numpy as np
import random

class RailEnvironment:
    def __init__(self, grid_size=(10, 10), n_agents=3):
        self.grid_size = grid_size
        self.n_agents = n_agents
        self.grid = np.zeros(grid_size)  # 0: empty, 1: station, 2: track
        self.agents = []
        self._initialize_environment()

    def _initialize_environment(self):
        # Place stations randomly
        num_stations = 5
        self.stations = [tuple(np.random.randint(0, self.grid_size[i], size=num_stations)) for i in range(2)]
        for x, y in zip(self.stations[0], self.stations[1]):
            self.grid[x, y] = 1

        # Initialize agents
        for i in range(self.n_agents):
            start = (random.choice(self.stations[0]), random.choice(self.stations[1]))
            target = (random.choice(self.stations[0]), random.choice(self.stations[1]))
            while target == start:
                target = (random.choice(self.stations[0]), random.choice(self.stations[1]))
            self.agents.append({
                "id": i,
                "start": start,
                "target": target,
                "position": start,
                "done": False,
                "reward": 0
            })

    def step(self, actions):
        rewards = {}
        done = True
        for agent, action in zip(self.agents, actions):
            if agent["done"]:
                rewards[agent["id"]] = 0
                continue

            # Compute new position
            new_position = self._move(agent["position"], action)
            if not self._is_valid(new_position):
                rewards[agent["id"]] = -5  # Invalid move penalty
                continue

            # Update position and check if target is reached
            agent["position"] = new_position
            if new_position == agent["target"]:
                agent["done"] = True
                rewards[agent["id"]] = 10  # Reaching target reward
            else:
                rewards[agent["id"]] = -1  # Step penalty

            done = done and agent["done"]

        return self.grid, rewards, done

    def _move(self, position, action):
        moves = {
            0: (-1, 0),  # Up
            1: (1, 0),   # Down
            2: (0, -1),  # Left
            3: (0, 1),   # Right
            4: (0, 0)    # Wait
        }
        return (position[0] + moves[action][0], position[1] + moves[action][1])

    def _is_valid(self, position):
        return (0 <= position[0] < self.grid_size[0] and
                0 <= position[1] < self.grid_size[1] and
                self.grid[position] != 2)  # No collision with other tracks

    def reset(self):
        self.__init__(self.grid_size, self.n_agents)
        return self.grid


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random

class DQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DQN, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, output_dim)
        )

    def forward(self, x):
        return self.fc(x)


In [3]:
pip install torch


Collecting torch
  Using cached torch-2.5.1-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting fsspec (from torch)
  Using cached fsspec-2024.10.0-py3-none-any.whl.metadata (11 kB)
Using cached torch-2.5.1-cp311-cp311-win_amd64.whl (203.1 MB)
Using cached fsspec-2024.10.0-py3-none-any.whl (179 kB)
Installing collected packages: fsspec, torch
Note: you may need to restart the kernel to use updated packages.


DEPRECATION: Loading egg at c:\python311\lib\site-packages\vboxapi-1.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330
ERROR: Could not install packages due to an OSError: [WinError 2] The system cannot find the file specified: 'C:\\Python311\\Scripts\\convert-caffe2-to-onnx.exe' -> 'C:\\Python311\\Scripts\\convert-caffe2-to-onnx.exe.deleteme'


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
class DQLAgent:
    def __init__(self, state_size, action_size, learning_rate=0.001, gamma=0.99, epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.995):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Exploration rate
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.memory = deque(maxlen=2000)
        self.model = DQN(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return np.random.choice(self.action_size)  # Random action
        state = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            q_values = self.model(state)
        return torch.argmax(q_values).item()  # Exploitation

    def replay(self, batch_size):
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            state = torch.FloatTensor(state).unsqueeze(0)
            next_state = torch.FloatTensor(next_state).unsqueeze(0)
            target = reward
            if not done:
                target += self.gamma * torch.max(self.model(next_state)).item()
            target_f = self.model(state).detach()
            target_f[0][action] = target
            self.model.zero_grad()
            loss = nn.functional.mse_loss(self.model(state), target_f)
            loss.backward()
            self.optimizer.step()
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


In [6]:
if __name__ == "__main__":
    env = RailEnvironment(grid_size=(10, 10), n_agents=5)  # Single agent for simplicity
    agent = DQLAgent(state_size=env.grid.size, action_size=5)
    episodes = 1000
    batch_size = 32

    for episode in range(episodes):
        state = env.reset().flatten()  # Flatten grid to vector
        done = False
        total_reward = 0

        while not done:
            action = agent.act(state)
            next_state, rewards, done = env.step([action])
            next_state = next_state.flatten()
            reward = sum(rewards.values())
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward

        agent.replay(batch_size)
        print(f"Episode {episode + 1}/{episodes}, Total Reward: {total_reward}, Epsilon: {agent.epsilon:.2f}")


Episode 1/1000, Total Reward: -6, Epsilon: 1.00
Episode 2/1000, Total Reward: -30, Epsilon: 1.00
Episode 3/1000, Total Reward: -6, Epsilon: 1.00
Episode 4/1000, Total Reward: -41, Epsilon: 0.99
Episode 5/1000, Total Reward: -71, Epsilon: 0.99
Episode 6/1000, Total Reward: -7, Epsilon: 0.99
Episode 7/1000, Total Reward: -12, Epsilon: 0.98
Episode 8/1000, Total Reward: -78, Epsilon: 0.98
Episode 9/1000, Total Reward: -15, Epsilon: 0.97
Episode 10/1000, Total Reward: -19, Epsilon: 0.97
Episode 11/1000, Total Reward: -6, Epsilon: 0.96
Episode 12/1000, Total Reward: -13, Epsilon: 0.96
Episode 13/1000, Total Reward: -7, Epsilon: 0.95
Episode 14/1000, Total Reward: -33, Epsilon: 0.95
Episode 15/1000, Total Reward: -5, Epsilon: 0.94
Episode 16/1000, Total Reward: -11, Epsilon: 0.94
Episode 17/1000, Total Reward: -13, Epsilon: 0.93
Episode 18/1000, Total Reward: -10, Epsilon: 0.93
Episode 19/1000, Total Reward: -21, Epsilon: 0.92
Episode 20/1000, Total Reward: -17, Epsilon: 0.92
Episode 21/1000

Episode 166/1000, Total Reward: -7, Epsilon: 0.44
Episode 167/1000, Total Reward: -18, Epsilon: 0.44
Episode 168/1000, Total Reward: -5, Epsilon: 0.44
Episode 169/1000, Total Reward: -5, Epsilon: 0.44
Episode 170/1000, Total Reward: -18, Epsilon: 0.43
Episode 171/1000, Total Reward: -10, Epsilon: 0.43
Episode 172/1000, Total Reward: -6, Epsilon: 0.43
Episode 173/1000, Total Reward: -9, Epsilon: 0.43
Episode 174/1000, Total Reward: -9, Epsilon: 0.42
Episode 175/1000, Total Reward: -6, Epsilon: 0.42
Episode 176/1000, Total Reward: -16, Epsilon: 0.42
Episode 177/1000, Total Reward: -6, Epsilon: 0.42
Episode 178/1000, Total Reward: -13, Epsilon: 0.42
Episode 179/1000, Total Reward: -6, Epsilon: 0.41
Episode 180/1000, Total Reward: -19, Epsilon: 0.41
Episode 181/1000, Total Reward: -11, Epsilon: 0.41
Episode 182/1000, Total Reward: 6, Epsilon: 0.41
Episode 183/1000, Total Reward: -12, Epsilon: 0.41
Episode 184/1000, Total Reward: -15, Epsilon: 0.40
Episode 185/1000, Total Reward: -20, Epsil

Episode 330/1000, Total Reward: -10, Epsilon: 0.19
Episode 331/1000, Total Reward: -6, Epsilon: 0.19
Episode 332/1000, Total Reward: -10, Epsilon: 0.19
Episode 333/1000, Total Reward: -15, Epsilon: 0.19
Episode 334/1000, Total Reward: -12, Epsilon: 0.19
Episode 335/1000, Total Reward: -6, Epsilon: 0.19
Episode 336/1000, Total Reward: -8, Epsilon: 0.19
Episode 337/1000, Total Reward: -5, Epsilon: 0.19
Episode 338/1000, Total Reward: -8, Epsilon: 0.19
Episode 339/1000, Total Reward: -7, Epsilon: 0.19
Episode 340/1000, Total Reward: 5, Epsilon: 0.18
Episode 341/1000, Total Reward: -17, Epsilon: 0.18
Episode 342/1000, Total Reward: -6, Epsilon: 0.18
Episode 343/1000, Total Reward: 8, Epsilon: 0.18
Episode 344/1000, Total Reward: -6, Epsilon: 0.18
Episode 345/1000, Total Reward: -11, Epsilon: 0.18
Episode 346/1000, Total Reward: -9, Epsilon: 0.18
Episode 347/1000, Total Reward: -12, Epsilon: 0.18
Episode 348/1000, Total Reward: -9, Epsilon: 0.18
Episode 349/1000, Total Reward: -7, Epsilon: 

Episode 493/1000, Total Reward: -10, Epsilon: 0.09
Episode 494/1000, Total Reward: -6, Epsilon: 0.09
Episode 495/1000, Total Reward: -6, Epsilon: 0.08
Episode 496/1000, Total Reward: -6, Epsilon: 0.08
Episode 497/1000, Total Reward: 8, Epsilon: 0.08
Episode 498/1000, Total Reward: -6, Epsilon: 0.08
Episode 499/1000, Total Reward: -14, Epsilon: 0.08
Episode 500/1000, Total Reward: -11, Epsilon: 0.08
Episode 501/1000, Total Reward: -11, Epsilon: 0.08
Episode 502/1000, Total Reward: -13, Epsilon: 0.08
Episode 503/1000, Total Reward: -14, Epsilon: 0.08
Episode 504/1000, Total Reward: -13, Epsilon: 0.08
Episode 505/1000, Total Reward: -17, Epsilon: 0.08
Episode 506/1000, Total Reward: -12, Epsilon: 0.08
Episode 507/1000, Total Reward: -7, Epsilon: 0.08
Episode 508/1000, Total Reward: -16, Epsilon: 0.08
Episode 509/1000, Total Reward: 4, Epsilon: 0.08
Episode 510/1000, Total Reward: -15, Epsilon: 0.08
Episode 511/1000, Total Reward: 10, Epsilon: 0.08
Episode 512/1000, Total Reward: -11, Epsi

Episode 658/1000, Total Reward: -10, Epsilon: 0.04
Episode 659/1000, Total Reward: -12, Epsilon: 0.04
Episode 660/1000, Total Reward: -14, Epsilon: 0.04
Episode 661/1000, Total Reward: 9, Epsilon: 0.04
Episode 662/1000, Total Reward: -13, Epsilon: 0.04
Episode 663/1000, Total Reward: -12, Epsilon: 0.04
Episode 664/1000, Total Reward: -13, Epsilon: 0.04
Episode 665/1000, Total Reward: -10, Epsilon: 0.04
Episode 666/1000, Total Reward: -14, Epsilon: 0.04
Episode 667/1000, Total Reward: -11, Epsilon: 0.04
Episode 668/1000, Total Reward: -9, Epsilon: 0.04
Episode 669/1000, Total Reward: -5, Epsilon: 0.04
Episode 670/1000, Total Reward: -14, Epsilon: 0.04
Episode 671/1000, Total Reward: -16, Epsilon: 0.04
Episode 672/1000, Total Reward: 10, Epsilon: 0.03
Episode 673/1000, Total Reward: -6, Epsilon: 0.03
Episode 674/1000, Total Reward: -12, Epsilon: 0.03
Episode 675/1000, Total Reward: -7, Epsilon: 0.03
Episode 676/1000, Total Reward: -9, Epsilon: 0.03
Episode 677/1000, Total Reward: 10, Eps

Episode 822/1000, Total Reward: -11, Epsilon: 0.02
Episode 823/1000, Total Reward: -12, Epsilon: 0.02
Episode 824/1000, Total Reward: -11, Epsilon: 0.02
Episode 825/1000, Total Reward: -11, Epsilon: 0.02
Episode 826/1000, Total Reward: 9, Epsilon: 0.02
Episode 827/1000, Total Reward: -7, Epsilon: 0.02
Episode 828/1000, Total Reward: -10, Epsilon: 0.02
Episode 829/1000, Total Reward: 8, Epsilon: 0.02
Episode 830/1000, Total Reward: -8, Epsilon: 0.02
Episode 831/1000, Total Reward: -7, Epsilon: 0.02
Episode 832/1000, Total Reward: -12, Epsilon: 0.02
Episode 833/1000, Total Reward: -6, Epsilon: 0.02
Episode 834/1000, Total Reward: -6, Epsilon: 0.02
Episode 835/1000, Total Reward: -7, Epsilon: 0.02
Episode 836/1000, Total Reward: 3, Epsilon: 0.02
Episode 837/1000, Total Reward: -11, Epsilon: 0.02
Episode 838/1000, Total Reward: -6, Epsilon: 0.02
Episode 839/1000, Total Reward: -12, Epsilon: 0.02
Episode 840/1000, Total Reward: -14, Epsilon: 0.02
Episode 841/1000, Total Reward: -5, Epsilon:

Episode 986/1000, Total Reward: -6, Epsilon: 0.01
Episode 987/1000, Total Reward: -10, Epsilon: 0.01
Episode 988/1000, Total Reward: -10, Epsilon: 0.01
Episode 989/1000, Total Reward: -14, Epsilon: 0.01
Episode 990/1000, Total Reward: -11, Epsilon: 0.01
Episode 991/1000, Total Reward: -8, Epsilon: 0.01
Episode 992/1000, Total Reward: -6, Epsilon: 0.01
Episode 993/1000, Total Reward: -9, Epsilon: 0.01
Episode 994/1000, Total Reward: -6, Epsilon: 0.01
Episode 995/1000, Total Reward: -12, Epsilon: 0.01
Episode 996/1000, Total Reward: -14, Epsilon: 0.01
Episode 997/1000, Total Reward: 10, Epsilon: 0.01
Episode 998/1000, Total Reward: -8, Epsilon: 0.01
Episode 999/1000, Total Reward: 8, Epsilon: 0.01
Episode 1000/1000, Total Reward: -5, Epsilon: 0.01
