In [1]:
import gym
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
from gym import wrappers
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.optimizers import SGD, RMSprop, Adam, Adamax

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
env = gym.make('LunarLander-v2')

## Observation samples for scaler

In [3]:
observation_samples = []

for n in range(100):
    observation = env.reset()
    observation_samples.append(observation)
    done = False
    while not done:
        action = np.random.randint(0, env.action_space.n)
        observation, reward, done, _ = env.step(action)
        observation_samples.append(observation)
        
observation_samples = np.array(observation_samples)

# Create scaler and fit
sc = StandardScaler()
sc.fit(observation_samples)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [4]:
# Observation sample
#
# array([ 0.00379438,  1.4016738 ,  0.38431674, -0.41094953, -0.00438996,
#       -0.08705341,  0.        ,  0.        ], dtype=float32)

## Creat monitor

In [5]:
#env = wrappers.Monitor(env, 'monitor-folder', force=True) # Saves cubed episodes

## Build Neural Networks - One for each action

In [6]:
def build_neural_network():
    model = Sequential()
    
    model.add(Dense(128, kernel_initializer='uniform', input_shape=(8,)))
    model.add(Activation('relu'))
    model.add(Dense(256, kernel_initializer='uniform'))
    model.add(Activation('tanh'))
    model.add(Dense(1, kernel_initializer='uniform'))
    model.add(Activation('linear')) #linear output so we can have range of real-valued outputs

    model.compile(loss='mse', optimizer='adamax')
              
    return model

## Combined Model

In [7]:
class Model:
    
    def __init__(self, env, scaler):
        self.env = env
        self.scaler = scaler
        self.models = []
        for i in range(env.action_space.n):
            model = build_neural_network()
            self.models.append(model) 

    def predict(self, state):
        state = self.scaler.transform(np.atleast_2d(state))
        preds = []
        for m in self.models:
            preds.append(m.predict(np.array(state), verbose=0)[0])
        return np.array(preds)                 

    def fit(self, state, action, future_discounted_reward):
        state = self.scaler.transform(np.atleast_2d(state))
        model = self.models[action] # Different model depending on action
        model.fit(np.array(state), np.array([future_discounted_reward]), epochs=1, verbose=0)

    def action(self, state, EPSILON):
        if np.random.random() < EPSILON:
            return self.env.action_space.sample()
        else:
            return np.argmax(self.predict(state))

In [9]:
def plot(REWARD_BAG):
    running_avg = np.empty(len(REWARD_BAG))
    running_avg = list(map(lambda t: REWARD_BAG[max(0, t-25):(t+1)].mean(), range(len(REWARD_BAG))))
    plt.plot(running_avg)
    plt.title("Running Average")
    plt.show()

## Training

In [10]:
EPISODES = 5000
GAMMA = 0.99 # DISCOUNT
EPSILON = 1.0 / np.sqrt(1) # EXPLORATION RATE
EPSILON_DECAY_RATE = 0.001
EPSILON_MIN = 0.001
REWARD_BAG = np.empty(EPISODES)
COSTS = np.empty(EPISODES)

In [11]:
model = Model(env, sc)

for e in range(EPISODES):
    
    state = env.reset()
    done = False
    totalreward = 0
    iters = 0
    EPSILON = 1.0 / np.sqrt(e+1)
    
    # Q-learning - Learn model to map state to future discounted reward
    
    while not done:
        
        # Perform action 
        
        env.render()
        action = model.action(state, EPSILON)
        prev_state = state
        state, reward, done, info = env.step(action)

        # Fit model
        
        pred = model.predict(observation)
        future_discounted_reward = reward + GAMMA * np.max(pred)
        model.fit(prev_state, action, future_discounted_reward)
        totalreward += reward
        iters += 1
    
    REWARD_BAG[e] = totalreward
    
    if e % 25 == 0:
        print("Episode: ", e, "Itr", iters, "Reward:", totalreward, "Epsilon: %.3f", EPSILON, "Avg reward (25)):", REWARD_BAG[max(0, e-25):(e+1)].mean())
    if REWARD_BAG[max(0, e-100):(e+1)].mean() >= 200:
        break

print("Avg reward for last 100 episodes:", REWARD_BAG[-100:].mean())
print("Total steps:", REWARD_BAG.sum())

plt.plot(REWARD_BAG)
plt.title("Rewards")
plt.show()

Episode:  0 Itr 109 Reward: -143.4550578120585 Epsilon: %.3f 1.0 Avg reward (25)): -143.4550578120585
Episode:  25 Itr 475 Reward: 204.0853905394063 Epsilon: %.3f 0.19611613513818404 Avg reward (25)): -183.08322551812458
Episode:  50 Itr 226 Reward: -145.89284383075997 Epsilon: %.3f 0.14002800840280097 Avg reward (25)): -192.14104875165984
Episode:  75 Itr 195 Reward: -274.8553498076835 Epsilon: %.3f 0.11470786693528087 Avg reward (25)): -228.64698580067613
Episode:  100 Itr 216 Reward: -339.5144437709476 Epsilon: %.3f 0.09950371902099892 Avg reward (25)): -222.92016154425332
Episode:  125 Itr 232 Reward: -77.796420811839 Epsilon: %.3f 0.0890870806374748 Avg reward (25)): -229.78702146935004
Episode:  150 Itr 465 Reward: -204.16960169968937 Epsilon: %.3f 0.08137884587711594 Avg reward (25)): -184.2809514680524
Episode:  175 Itr 233 Reward: -100.51349379384666 Epsilon: %.3f 0.07537783614444091 Avg reward (25)): -203.89224734871067


KeyboardInterrupt: 

In [None]:
plot(REWARD_BAG)

In [None]:
env.close()