In [1]:
import numpy as np
import gym
import random
import stable_baselines3
from stable_baselines3 import DQN
import matplotlib.pyplot as plt
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common import results_plotter
from stable_baselines3.common.evaluation import evaluate_policy
import os
from stable_baselines3.common.env_checker import check_env

In [2]:
# method 1 - build from gym package
env = gym.make("gym_basic:basic-v0")

In [3]:
action_space_size = env.action_space.n
state_space_size = env.observation_space.n

q_table = np.zeros((state_space_size, action_space_size))

print(q_table)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [4]:
print(action_space_size)


81000


In [5]:
check_env(env,warn= True, skip_render_check=True)

In [6]:
log_dir = "/tmp/gym/"
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
env = Monitor(env, log_dir)

In [7]:
# generate the model by DQN
model = DQN("MlpPolicy", env, verbose=1)

Using cpu device
Wrapping the env in a DummyVecEnv.


In [8]:
model.learn(total_timesteps=100, log_interval=4)
# save the model
model.save("dqn_facts")

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1        |
|    ep_rew_mean      | 2.64e+05 |
|    exploration_rate | 0.62     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 4        |
|    time_elapsed     | 0        |
|    total_timesteps  | 4        |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1        |
|    ep_rew_mean      | 2.96e+05 |
|    exploration_rate | 0.24     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 4        |
|    time_elapsed     | 1        |
|    total_timesteps  | 8        |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1        |
|    ep_rew_mean      | 2.85e+05 |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes       

In [9]:

obs = env.reset()

#check how the model runs
for i in range(100):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    if done :
      obs = env.reset()

In [10]:
mean_reward_after, std_reward_after = evaluate_policy(model, env, n_eval_episodes=100)
print(f"mean_reward:{mean_reward_after:.2f} +/- std_reward:{std_reward_after:.2f}")

mean_reward:288445.60 +/- std_reward:145906.05


In [11]:
print(results_plotter.X_TIMESTEPS)

timesteps


In [12]:
print([log_dir])

['/tmp/gym/']


In [13]:
print(mean_reward_after)

288445.6


In [14]:
#results_plotter.plot_results([log_dir], 100, results_plotter.X_TIMESTEPS, "facts_gym")
Monitor.get_episode_rewards(env)

[252800.0,
 355380.0,
 132640.0,
 315080.0,
 238580.0,
 582720.0000000001,
 132840.0,
 356980.0,
 499760.0,
 244120.0,
 84240.0,
 223300.0,
 83040.0,
 248460.00000000006,
 250300.0,
 379600.0,
 254160.0,
 336600.0,
 395800.0,
 576700.0,
 321620.0,
 345440.0,
 122220.0,
 386860.0,
 306400.0,
 340920.0,
 225660.00000000006,
 472760.0,
 102460.0,
 370600.0,
 112060.0,
 606100.0,
 136820.0,
 370620.0,
 110980.0,
 350860.0,
 223020.0,
 613040.0,
 201000.0,
 242080.0,
 247939.99999999994,
 360439.99999999994,
 240500.0,
 153760.0,
 361040.0,
 478180.0,
 626600.0,
 227400.0,
 358080.0,
 148300.0,
 585240.0,
 368600.0,
 225080.0,
 607179.9999999999,
 131880.0,
 623360.0,
 243560.00000000006,
 169360.0,
 179320.0,
 358540.0,
 134000.0,
 322100.0,
 235160.00000000006,
 346300.0,
 242880.0,
 499980.0,
 362760.0,
 310160.00000000006,
 386980.0,
 573720.0,
 182000.0,
 360780.0,
 345920.0,
 359100.0,
 335620.0,
 83220.0,
 109400.0,
 278739.99999999994,
 156240.0,
 341880.0,
 605160.0,
 390040.0,
 82

In [15]:
Monitor.get_episode_times(env)

[0.3353080749511719,
 0.6844356060028076,
 0.8204357624053955,
 1.054636001586914,
 1.2242660522460938,
 1.5372631549835205,
 1.6657769680023193,
 1.9868550300598145,
 2.2702994346618652,
 2.5322256088256836,
 2.669865608215332,
 2.8322465419769287,
 2.9692065715789795,
 3.2425503730773926,
 3.411428213119507,
 3.6356585025787354,
 3.8023974895477295,
 4.019842147827148,
 4.281216382980347,
 4.711179733276367,
 4.976671934127808,
 5.340314626693726,
 5.476314306259155,
 5.793079137802124,
 6.027855634689331,
 6.283764362335205,
 6.452394962310791,
 6.724393844604492,
 6.8431525230407715,
 7.068845272064209,
 7.210384130477905,
 7.631153345108032,
 7.774442434310913,
 8.101755857467651,
 8.237948656082153,
 8.485658645629883,
 8.647959232330322,
 8.960553169250488,
 9.112516403198242,
 9.27268123626709,
 9.440158128738403,
 9.648119926452637,
 9.804703712463379,
 9.92470669746399,
 10.144574165344238,
 10.408573389053345,
 10.840940475463867,
 10.99976897239685,
 11.260639429092407,
 11

In [16]:
'''
import matplotlib.pyplot as plt
reward_y = Monitor.get_episode_rewards(env)
timestep_x = Monitor.get_episode_times(env)
plt.plot(timestep_x,reward_y)
'''

'\nimport matplotlib.pyplot as plt\nreward_y = Monitor.get_episode_rewards(env)\ntimestep_x = Monitor.get_episode_times(env)\nplt.plot(timestep_x,reward_y)\n'

In [17]:
episodes = 1000
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        #env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))

Episode:1 Score:108000.0
Episode:2 Score:297480.0
Episode:3 Score:110400.0
Episode:4 Score:326040.0
Episode:5 Score:499760.0
Episode:6 Score:134380.0
Episode:7 Score:111220.0
Episode:8 Score:199780.0
Episode:9 Score:367780.0
Episode:10 Score:350680.0
Episode:11 Score:358100.0
Episode:12 Score:390020.0
Episode:13 Score:242260.00000000006
Episode:14 Score:344200.0
Episode:15 Score:473100.0
Episode:16 Score:601620.0
Episode:17 Score:552920.0
Episode:18 Score:304960.0
Episode:19 Score:288700.0
Episode:20 Score:298860.00000000006
Episode:21 Score:630920.0000000001
Episode:22 Score:157320.0
Episode:23 Score:252600.0
Episode:24 Score:252239.99999999994
Episode:25 Score:134220.0
Episode:26 Score:182960.0
Episode:27 Score:286600.0
Episode:28 Score:136920.0
Episode:29 Score:134240.0
Episode:30 Score:371060.0
Episode:31 Score:273139.99999999994
Episode:32 Score:109120.0
Episode:33 Score:424960.0
Episode:34 Score:132320.0
Episode:35 Score:221880.0
Episode:36 Score:136640.0
Episode:37 Score:259060.