In [1]:
import gym
import os
import random
import sys
import numpy as np
import matplotlib.pyplot as plt
from gym import wrappers
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.optimizers import SGD, RMSprop, Adam, Adamax

Using TensorFlow backend.


In [2]:
env = gym.make('LunarLander-v2')

  result = entry_point.load(False)


## Observation samples for scaler

In [3]:
observation_samples = []

for n in range(100):
    observation = env.reset()
    observation_samples.append(observation)
    done = False
    while not done:
        action = np.random.randint(0, env.action_space.n)
        observation, reward, done, _ = env.step(action)
        observation_samples.append(observation)
        
observation_samples = np.array(observation_samples)

# Create scaler and fit
sc = StandardScaler()
sc.fit(observation_samples)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [4]:
# Observation sample
#
# array([ 0.00379438,  1.4016738 ,  0.38431674, -0.41094953, -0.00438996,
#       -0.08705341,  0.        ,  0.        ], dtype=float32)

## Creat monitor

In [5]:
#env = wrappers.Monitor(env, 'monitor-folder', force=True) # Saves cubed episodes

## Build Neural Networks - One for each action

In [6]:
def build_neural_network():
    model = Sequential()
    
    model.add(Dense(128, kernel_initializer='uniform', input_shape=(8,)))
    model.add(Activation('relu'))
    model.add(Dense(256, kernel_initializer='uniform'))
    model.add(Activation('relu'))
    model.add(Dense(1, kernel_initializer='uniform'))
    model.add(Activation('linear')) #linear output so we can have range of real-valued outputs

    model.compile(loss='mse', optimizer='adamax')

    return model

## Combined Model

In [7]:
class Model:
    
    def __init__(self, env, scaler):
        self.env = env
        self.scaler = scaler
        self.models = []
        for i in range(env.action_space.n):
            model = build_neural_network()
            self.models.append(model) 

    def predict(self, state):
        state = self.scaler.transform(np.atleast_2d(state))
        preds = []
        for m in self.models:
            preds.append(m.predict(np.array(state), verbose=0)[0])
        return np.array(preds)                 

    def _fit(self, state, action, future_discounted_reward):
        state = self.scaler.transform(np.atleast_2d(state))
        model = self.models[action] # Different model depending on action
        
        #print(type(future_discounted_reward))
        
        if type(future_discounted_reward) is np.ndarray:
            print(future_discounted_reward)
            #future_dsicounted_reward = future_discounted_reward[0][action]
            x = 1
            print(np.array(future_discounted_reward[0][action]).shape)
            print(np.array(state).shape)
            model.fit(np.array(state), 
                      np.array(future_discounted_reward[0][action]), 
                      epochs=1, verbose=0)
        else:
            #future_dsicounted_reward = future_discounted_reward[action]
            print(future_discounted_reward)
            model.fit(np.array(state), np.array([future_discounted_reward]), epochs=1, verbose=0)
        #future_discounted_reward = future_discounted_reward[action]
        #print("State: ", state)
        #print("Reward: ", [future_discounted_reward])
   

    def action(self, state, EPSILON):
        if np.random.random() < EPSILON:
            return self.env.action_space.sample()
        else:
            return np.argmax(self.predict(state))

In [8]:
def remember(prev_state, action, reward, state, done):
    memory.append((prev_state, action, reward, state, done))

In [9]:
def replay(memory, BATCH_SIZE):
    mini_batch = random.sample(memory, 100) # MAX 2000 in memory
    
    for prev_state, action, reward, state, done in mini_batch:
        if not done:
            next_state_pred_reward = reward + GAMMA * np.max(model.predict(state)[0])
        else:
            next_state_pred_reward = reward
        
        
        # PREDICT = INPUT ONE STATE -> 4 REWARDS (ARGMAX FOR ACTION)
        # FIT = 
        
        '''
    next_state_pred_reward:  -99.82350345313549
    
    state_pred_reward [[-0.21892993]
     [-0.12000708]
     [-0.14074107]
     [-0.5067307 ]]
    (1, 4)

    State:  [[-0.5119985  -2.050301    0.45516843 -0.79553884 -2.557886   -8.063909
      -0.12273416 -0.16050309]]
              
        '''
        
        #print("next_state_pred_reward: ", next_state_pred_reward)
        state_pred_reward = model.predict(state)
        #print("state_pred_reward", state_pred_reward)
        state_pred_reward = state_pred_reward.reshape(1, -1)
        state_pred_reward[0][action] = next_state_pred_reward
        #print("state_pred_reward_after", state_pred_reward)
        print(state_pred_reward.shape)
        model._fit(prev_state, action, state_pred_reward)

## Helpers

In [10]:
def plot(episode_rewards):
    running_avg = np.empty(len(episode_rewards))
    running_avg = list(map(lambda t: episode_rewards[max(0, t-25):(t+1)].mean(), range(len(episode_rewards))))
    plt.plot(running_avg)
    plt.title("Running Average")
    plt.show()

In [11]:
def mission_accomplished(episode_rewards, e):
    return episode_rewards[max(0, e-100):(e+1)].mean() >= 200

## Training

In [12]:
EPISODES = 10000
GAMMA = 0.99 # DISCOUNT
EPSILON = 1.0 / np.sqrt(1) # EXPLORATION RATE
EPSILON_DECAY_RATE = 0.001
EPSILON_MIN = 0.001
BATCH_SIZE = 10
episode_rewards = np.empty(1000)
from collections import deque
memory = deque(maxlen=2000)

In [13]:
model = Model(env, sc)

for e in range(EPISODES):
    
    EPSILON = 1.0 / np.sqrt(e+1)
    
    state = env.reset()
    done = False
    episode_reward = 0
    frames = 0
    
    while not done:
        
        # Perform action
        
        #env.render()
        action = model.action(state, EPSILON)
        prev_state = state
        state, reward, done, info = env.step(action)

        # Predict future discounted reward from current state and train model to find this mapping

        pred_reward = model.predict(state)
        future_discounted_reward = reward + GAMMA * np.max(pred_reward)
        model._fit(prev_state, action, future_discounted_reward)
        episode_reward += reward
        frames += 1
        
        remember(prev_state, action, future_discounted_reward, state, done)
        
    episode_rewards[e] = episode_reward
    
    print(len(memory))
    if len(memory) > 250: # 250
        print("Replay!")
        replay(memory, BATCH_SIZE)
    
    if len(memory) >= 1000:
        print("Clear!")
        memory.clear()
    
    if e % 5 == 0:
        print("Episode: ", e, "Itr", frames, "Reward:", episode_reward, "Epsilon: %.3f" % EPSILON, "Avg reward (25):", episode_rewards[max(0, e-25):(e+1)].mean())
    if mission_accomplished(episode_rewards, e):
        break

print("Avg reward for last 100 episodes:", episode_rewards[-100:].mean())
print("Total steps:", episode_rewards.sum())

plt.plot(episode_rewards)
plt.title("Rewards")
plt.show()

0.41557726110863996
-0.4334262347177867
1.2328217117343059
0.43095975843357903
-0.8917287965253207
-0.7778270502423675
-0.9787182979051022
-1.4755010298476408
0.7339660480842497
-1.4438465780456682
0.4072372838367141
-1.1602478694177842
-0.5392656202201636
-1.6085622715777868
-0.7618545310712797
-0.9579490221675939
-1.0567628823166673
-1.1574494336772125
-3.564872806610997
0.08758163849293737
-3.486787598099281
-2.2788145398988613
-2.4602990245187857
-2.0997673500730865
-1.4713679701141722
-0.2251665478958603
-0.1237736112109178
-0.3344470129015213
0.18918204378104064
-1.0219015984186626
0.09021891246642554
-0.9425880882767274
-2.0478856438784176
-1.207683733807106
-2.359453840506935
-0.6565460484875257
-0.6407305800536562
-2.6566328791110663
-2.6223063039920733
-1.7424252000286016
-0.8345233671905249
-2.6422714961831137
-1.9104223068799218
-0.8168740259279605
-2.795548079828062
-1.0883781967256563
-2.6749572357207567
-2.6999763193178876
-1.7194133915140084
-2.9103102251096717
-2.14903

ValueError: Error when checking target: expected activation_3 to have 2 dimensions, but got array with shape ()

In [None]:
plot(episode_rewards)

In [None]:
env.close()