In [1]:
import gym_super_mario_bros
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.env_util import make_vec_env

# Train a small A2C agent on Super Mario Bros 1-1, evaluate it, then play a
# few episodes with the trained policy while recording gameplay to video.

# Set the random seed for reproducibility
set_random_seed(42)

# --- Tunable settings -------------------------------------------------------
# Number of playback episodes to run after training
n_episodes = 3

# Length, in environment steps, of each recorded video clip
episode_duration = 500

# Training budget, in environment steps
total_timesteps = 1500

# Number of episodes evaluate_policy averages over
n_eval_episodes = 10

# --- Environment ------------------------------------------------------------
# Create the Super Mario Bros environment. It gets its own name so the factory
# lambda below does not close over `env`, which is rebound by the very
# statement that uses the lambda.
base_env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')

# Wrap the environment in a DummyVecEnv
env = DummyVecEnv([lambda: base_env])

try:
    # Create a VecVideoRecorder to record the gameplay as a video file
    env = VecVideoRecorder(
        env,
        ".",
        record_video_trigger=lambda step: step == 0,
        video_length=episode_duration,
        name_prefix="mario_video",
    )

    # --- Training -----------------------------------------------------------
    # Create an A2C model.
    # NOTE(review): the run log shows SB3 wrapping the env in VecTransposeImage,
    # i.e. observations look like images; 'CnnPolicy' is probably a better fit
    # than 'MlpPolicy' here - confirm before changing, it alters the results.
    model = A2C('MlpPolicy', env, verbose=1)

    # Train the model for `total_timesteps` steps
    model.learn(total_timesteps=total_timesteps)

    # --- Evaluation ---------------------------------------------------------
    # Evaluate the model's performance
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=n_eval_episodes)

    print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

    # --- Playback -----------------------------------------------------------
    # Play the game with the trained model and record the gameplay as a video file.
    # The env is reset only once: this relies on the vectorized env starting a
    # new episode by itself after `done` (SB3 VecEnv behaviour - see its docs).
    obs = env.reset()
    for i in range(n_episodes):
        episode_reward = 0
        while True:
            # Second return value is named `_states` rather than `_`, which
            # IPython uses for the last cell output.
            action, _states = model.predict(obs)
            # step() on the vectorized env returns batched values, one entry
            # per sub-environment; there is exactly one sub-environment here.
            obs, reward, done, info = env.step(action)
            episode_reward += reward
            if done[0]:
                break
        print(f"Episode {i+1}: reward={episode_reward}")
finally:
    # Always release the environment and flush any open video file, even when
    # training, evaluation or playback raises.
    env.close()


2023-05-10 17:18:35.138258: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using cpu device
Wrapping the env in a VecTransposeImage.
Saving video to /Users/sanyamjain/Desktop/RAMario/Proposed RAMario/mario_video-step-0-to-step-500.mp4
------------------------------------
| time/                 |          |
|    fps                | 35       |
|    iterations         | 100      |
|    time_elapsed       | 14       |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -5.41    |
|    explained_variance | 1.19e-07 |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 21.4     |
|    value_loss         | 19.2     |
------------------------------------


  return (self.ram[0x86] - self.ram[0x071c]) % 256


------------------------------------
| time/                 |          |
|    fps                | 37       |
|    iterations         | 200      |
|    time_elapsed       | 26       |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -5.31    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 199      |
|    policy_loss        | -4.23    |
|    value_loss         | 0.877    |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 38       |
|    iterations         | 300      |
|    time_elapsed       | 39       |
|    total_timesteps    | 1500     |
| train/                |          |
|    entropy_loss       | -5.38    |
|    explained_variance | 2.56e-06 |
|    learning_rate      | 0.0007   |
|    n_updates          | 299      |
|    policy_loss        | 0.0798   |
|    value_loss         | 0.000266 |
------------------------------------



Saving video to /Users/sanyamjain/Desktop/RAMario/Proposed RAMario/mario_video-step-1500-to-step-2000.mp4
Mean reward: 60.00 +/- 0.00
Saving video to /Users/sanyamjain/Desktop/RAMario/Proposed RAMario/mario_video-step-11480-to-step-11980.mp4


  return (self.ram[0x86] - self.ram[0x071c]) % 256


Episode 1: reward=[266.]
Episode 2: reward=[229.]
Episode 3: reward=[505.]
