In [1]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import random
from collections import deque
import copy

In [2]:
model = tf.keras.Sequential([
                             layers.Dense(32,activation='relu', input_shape = (2,)),
                             layers.Dense(32,activation='relu'),
                             layers.Dense(3)
])

model.compile(optimizer='rmsprop',loss='mse', metrics=['accuracy'])

In [3]:
model_fixed_target = tf.keras.models.clone_model(model)
model_fixed_target.set_weights(model.get_weights()) 

In [None]:
env = gym.make('MountainCar-v0')
#actions: 0->go left 1->go right
#observations: [cart pos, cart vel, pole angle, pole velocity at tip]

###HYPERPARAMETERS###
num_episodes = 1000
replay_capacity = 5000
epsilon = 1
min_epsilon = 0.1
epsilon_dec = 0.000006
gamma = 1
minibatch_size = 32
train_every_n = 5
copy_weights_every_n = 500
#####################
replay = deque()
num_replays = 0
total_timesteps = 0

for i_episode in range(num_episodes):
    state = env.reset()    
    total_reward = 0
    t = 0
    
    while True:
        
        if(random.random()<epsilon):
            action = env.action_space.sample()
        else:
            action = np.argmax(model.predict( np.reshape( state, (1,2) ) ) )

        if(epsilon > min_epsilon):
            epsilon -= epsilon_dec
        
        next_state, reward, done, info = env.step(action)
        
        total_reward += reward

        if(num_replays<replay_capacity):
            replay.append([state, action, reward, next_state, done])
            num_replays += 1
        else:
            replay.popleft()
            replay.append([state, action, reward, next_state, done])
            if (total_timesteps % train_every_n) == 0:
                replay_arr = list(replay) 
                minibatch = random.sample(replay_arr, minibatch_size)
                curr_states = np.array([ minibatch[i][0] for i in range(minibatch_size) ])
                new_states = np.array([ minibatch[i][3] for i in range(minibatch_size) ])
                Y = model.predict(curr_states)
                new_state_Q_values = model_fixed_target.predict(new_states)
                
                for i,(state, action, reward, next_state, done) in enumerate(minibatch):
                    Y[i,action] = reward + gamma * np.max( new_state_Q_values[i]) * (1 - done)
                model.fit(curr_states, Y, epochs=10, verbose = 0)
            
            if (total_timesteps % copy_weights_every_n) == 0:
                model_fixed_target.set_weights(model.get_weights()) 

        state = next_state
        total_timesteps += 1
        t += 1          
        if done:
            print("Episode ", i_episode, ": ", t,"  epsilon: ", epsilon, "\n")
            break
env.close()

Episode  0 :  200   epsilon:  0.9988000000000099 

Episode  1 :  200   epsilon:  0.9976000000000198 

Episode  2 :  200   epsilon:  0.9964000000000297 

Episode  3 :  200   epsilon:  0.9952000000000396 

Episode  4 :  200   epsilon:  0.9940000000000495 

Episode  5 :  200   epsilon:  0.9928000000000594 

Episode  6 :  200   epsilon:  0.9916000000000693 

Episode  7 :  200   epsilon:  0.9904000000000792 

Episode  8 :  200   epsilon:  0.9892000000000891 

Episode  9 :  200   epsilon:  0.988000000000099 

Episode  10 :  200   epsilon:  0.9868000000001089 

Episode  11 :  200   epsilon:  0.9856000000001188 

Episode  12 :  200   epsilon:  0.9844000000001287 

Episode  13 :  200   epsilon:  0.9832000000001386 

Episode  14 :  200   epsilon:  0.9820000000001485 

Episode  15 :  200   epsilon:  0.9808000000001584 

Episode  16 :  200   epsilon:  0.9796000000001683 

Episode  17 :  200   epsilon:  0.9784000000001782 

Episode  18 :  200   epsilon:  0.9772000000001881 

Episode  19 :  200   ep

Episode  158 :  169   epsilon:  0.8229940000014606 

Episode  159 :  201   epsilon:  0.8217880000014706 

Episode  160 :  139   epsilon:  0.8209540000014774 

Episode  161 :  201   epsilon:  0.8197480000014874 

Episode  162 :  200   epsilon:  0.8185480000014973 

Episode  163 :  200   epsilon:  0.8173480000015072 

Episode  164 :  200   epsilon:  0.8161480000015171 

Episode  165 :  200   epsilon:  0.814948000001527 

Episode  166 :  69   epsilon:  0.8145340000015304 

Episode  167 :  201   epsilon:  0.8133280000015404 



In [0]:
model.save('my_mod_final.h5')

In [0]:
env = gym.make('MountainCar-v0')
print(env.action_space)
#> Discrete(2)
print(env.observation_space)

Discrete(3)
Box(2,)
