# 1. Import Dependencies

In [None]:
# This project is about training a reinforcement learning agent to play the Atari game Breakout.
# Note: it is better to create a separate virtual environment for every coding project.
# The name of my virtual environment is "rl_in_3hrs".

# Installing dependencies
!pip install stable-baselines3[extra]
!pip install gym


In [None]:
# If one chooses to use CUDA acceleration, one has to follow the installation instructions on the PyTorch website.
# One might need to check out some tutorials on how to do that.
# After installation, the kernel also needs to be restarted.

In [None]:
# Importing dependencies
import gym 
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env
import os

# 2. Test Environment

In [None]:
# Setting up the environment:
# create a single (non-vectorised) environment from its gym id.
environment_name = "Breakout-v0"
env = gym.make(environment_name)

In [None]:
# Reset the environment to the start of a new episode.
# As the last expression in the cell, the value returned by reset() is shown as the cell output.
env.reset()

In [None]:
# Inspect the action space and the observation space of the environment.
# Both are plain attributes, not methods: calling them (e.g. env.action_space())
# raises a TypeError.
# print() is used because a notebook cell only echoes its last expression,
# so the first of two bare expressions would not be displayed.
print(env.action_space)
print(env.observation_space)


In [None]:
# Testing the environment (no trained model is involved yet):
# play a number of episodes of Breakout using randomly sampled actions.

# Number of episodes we want to play
episodes = 5

try:
    # Loop through each one of those episodes
    for episode in range(1, episodes + 1):
        state = env.reset()
        done = False
        score = 0

        # Step the environment until it reports that the episode is over
        while not done:
            env.render()

            # Take a random action; we are playing randomly in this scenario.
            action = env.action_space.sample()
            n_state, reward, done, info = env.step(action)
            score += reward
        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Close the environment even if the loop is interrupted or raises,
    # so the render window is not left open.
    env.close()

In [None]:
# Draw one random sample from the action space.
env.action_space.sample()

In [None]:
# Draw one random sample from the observation space.
env.observation_space.sample()

# 3. Vectorise Environment and Train Model

In [None]:
# Here we vectorise our environment and train on four copies of it at the same time.
# The aim of vectorising the environment is to speed up the training process.

# In the next cell we pass in the environment 'Breakout-v0' that we are working with.
# Note that OpenAI Gym has two types of Breakout environment: "Breakout-ram-v0" and "Breakout-v0".
# "Breakout-ram-v0" uses the RAM as input, while "Breakout-v0" uses the image as input.

In [None]:
# Here we are using the "Breakout-v0" version.
# n_envs=4 requests four copies of the environment; seed=0 fixes the random seed.
env = make_atari_env('Breakout-v0', n_envs=4, seed=0)

In [None]:
# Wrap the vectorised environment in a frame-stacking wrapper (n_stack=4).
env = VecFrameStack(env, n_stack=4)

In [None]:
# Reset the vectorised environment
env.reset()

# Render the environment.
# It must not be closed here: the same env object is passed to the model
# for training further down.
env.render()

In [None]:
# log_path is where the TensorBoard logs are saved.
# The TensorBoard logs can be inspected to check how the model is performing.
# The tutor created a folder named "Training" in his project folder and,
# inside it, two folders named "Logs" and "Saved Models":
#   - TensorBoard logs are saved inside the "Logs" folder.
#   - Trained models are saved inside the "Saved Models" folder.
log_path = os.path.join('Training', 'Logs')

In [None]:
# Defining our model.
# We are using the A2C algorithm.
# "CnnPolicy" (Convolutional Neural Network policy) is the policy we are using;
# since the input to this model is an image, "CnnPolicy" is the better choice.
# A policy is the rule that tells an agent how to behave in an environment.
# "env", the environment, is passed as the second argument.
# "verbose=1" because we want the model to log its results.
# Finally we specify the folder path for the TensorBoard logs.
model = A2C("CnnPolicy", env, verbose=1, tensorboard_log=log_path)

In [None]:
# Train the model.
# The model trains for 400000 timesteps in this case.
model.learn(total_timesteps=400000)

# 4. Save and Reload Model

In [None]:
# Path the trained A2C model is saved to and later reloaded from.
a2c_path = os.path.join('Training', 'Saved Models', 'A2C_model')

In [None]:
# Save the trained model to a2c_path.
model.save(a2c_path)

In [None]:
# Delete the in-memory model; it is reloaded from a2c_path further down.
del model

In [None]:
# If one has a model that performs better, one can pass in its name as shown below:
# a2c_path = os.path.join('Training', 'Saved Models', 'model name')

# The tutor has a better model, which was trained for around 2 million steps.
# He passed that model's name as shown below:
# a2c_path = os.path.join('Training', 'Saved Models', 'A2C_2M_model')

In [None]:
# Rebuild the environment for evaluation: a single 'Breakout-v0' copy (n_envs=1),
# wrapped in the same 4-frame stacking wrapper that was used for training.
env = VecFrameStack(
    make_atari_env('Breakout-v0', n_envs=1, seed=0),
    n_stack=4,
)

In [None]:
# Reload the model saved at a2c_path, passing it the current environment.
model = A2C.load(a2c_path, env)

# 5. Evaluate and Test

In [None]:
# Evaluate the reloaded model over 10 episodes, rendering the game while it plays.
# As the last expression in the cell, the returned value is shown as the cell output.
evaluate_policy(model, env, n_eval_episodes=10, render=True)

In [None]:
# Watch the trained model play.
# NOTE: the loop has no exit condition, so this cell runs until the kernel is
# interrupted manually.
obs = env.reset()
while True:
    # Ask the model for the action to take given the current observation
    action, _states = model.predict(obs)
    # Apply the action; the returned observation feeds the next prediction
    obs, rewards, dones, info = env.step(action)
    env.render()

In [None]:
# Close the environment once we are done watching the model play.
env.close()