# MountainCar Continuous

In [1]:
import itertools
import numpy as np
import gym

# Seeding is left disabled, so every run of the notebook samples differently.
#np.random.seed(1)
env = gym.make('MountainCarContinuous-v0')
# The original statement was truncated to `print(env.`, which is a syntax error;
# printing the environment spec is the assumed intent -- TODO confirm.
print(env.spec)
#env.seed(1)

In [38]:
# Inspect the environment's observation space and action space.
space = env.observation_space                     # observation space object
print("space: ", space.shape, space.dtype)        # shape and dtype of an observation
print("low:   ", space.low)                       # per-component lower bounds
print("high:  ", space.high)                      # per-component upper bounds

actions = env.action_space                        # action space object
print("actions:", actions)       # full description of the action space
print("actions:", actions.shape) # shape of a single action

space:  (2,) float32
low:    [-1.2  -0.07]
high:   [0.6  0.07]
actions: Box([-1.], [1.], (1,), float32)
actions: (1,)


In [22]:
# ZhiqingXiao
# https://github.com/ZhiqingXiao/OpenAIGymSolution/tree/master/MountainCarContinuous-v0
def policy(obs):
    """Hand-crafted switching rule: full force right or full force left.

    Args:
        obs: pair ``(position, velocity)``.

    Returns:
        ``np.ndarray`` of shape (1,) holding ``+1.0`` when either linear
        inequality on (position, velocity) holds, otherwise ``-1.0``.
    """
    position, velocity = obs
    push_right = position > -4*velocity or position < 13*velocity - 0.6
    force = 1.0 if push_right else -1.0
    return np.array([force,])

In [24]:
def policy(obs):
    """Push in the direction given by the sign of the second observation component.

    Args:
        obs: indexable observation; ``obs[1]`` is compared against zero.

    Returns:
        ``np.ndarray`` of shape (1,): ``+1.0`` if ``obs[1] > 0`` else ``-1.0``.
    """
    moving_right = obs[1] > 0
    force = 1.0 if moving_right else -1.0
    return np.array([force,])

In [36]:
def policy(obs):
    """Random baseline: ignore the observation and sample a force in [-1, 1).

    Args:
        obs: unused.

    Returns:
        ``np.ndarray`` of shape (1,) drawn from NumPy's global random state.
    """
    unit_sample = np.random.random((1,))   # uniform in [0, 1)
    return 2*unit_sample - 1

In [48]:
def run_episode(env, render=False, verbose=False, policy_fn=None):
    """Run one episode and return the accumulated reward.

    Args:
        env: environment exposing ``reset()``, ``step(action)`` and, when
            ``render`` is true, ``render()``.
        render: call ``env.render()`` before every step.
        verbose: print the total reward and the number of steps at the end.
        policy_fn: callable mapping an observation to an action. When omitted,
            the notebook-global ``policy`` is used, so the result depends on
            which ``policy`` cell was executed last.

    Returns:
        Sum of the rewards received until the environment reports the end of
        the episode.
    """
    if policy_fn is None:
        policy_fn = policy   # fall back to the notebook's current global policy

    # Newer environment APIs return (observation, info) from reset(); the
    # older API returns the observation alone.
    reset_result = env.reset()
    if (isinstance(reset_result, tuple) and len(reset_result) == 2
            and isinstance(reset_result[1], dict)):
        observation = reset_result[0]
    else:
        observation = reset_result

    episode_reward = 0.
    for step in itertools.count():
        if render:
            env.render()
        action = policy_fn(observation)
        step_result = env.step(action)
        if len(step_result) == 5:
            # (obs, reward, terminated, truncated, info): either flag ends the episode.
            observation, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            observation, reward, done, _ = step_result
        episode_reward += reward
        if done:
            break
    if verbose:
        print('get {} rewards in {} steps'.format(
                episode_reward, step + 1))
    return episode_reward

In [56]:
# Collect the total reward of 1000 independent episodes, then report
# mean ± standard deviation and the [min, max] range.
rews = list(map(run_episode, itertools.repeat(env, 1000)))
print(f"average episode rewards: {np.mean(rews):.2f} ± {np.std(rews):.2f}  [{np.min(rews):.2f}, {np.max(rews):.2f}]")

average episode rewards: 95.88 ± 0.84  [93.06, 97.58]


In [30]:
import datetime
import torch
import torch.nn as nn

# SECURITY NOTE: torch.load unpickles the file, which can execute arbitrary
# code -- only load checkpoints from a trusted source.
# map_location='cpu' keeps the load working on machines without a GPU even if
# the checkpoint was saved from one; the model below lives on the CPU anyway.
state = torch.load('models/MountainCar_Policy_2_32_64_1.98.1.pt', map_location='cpu')
print(state['info'])
print(state['date'])
print(state['model'])

# Rebuild the architecture: 2 inputs -> 32 -> 64 -> 1 output,
# with a sigmoid after every linear layer (so the output lies in (0, 1)).
nH = [32, 64]
model = nn.Sequential(
           nn.Linear(2, nH[0]),
           nn.Sigmoid(),
           nn.Linear(nH[0], nH[1]),
           nn.Sigmoid(),
           nn.Linear(nH[1], 1),
           nn.Sigmoid() )

model.load_state_dict(state['state'])

def policy(obs):
    """Map an observation to a continuous action with the global ``model``.

    The observation is fed to the network unscaled. The network ends in a
    sigmoid, so its output ``y`` lies in (0, 1) and ``2*y - 1`` lies in (-1, 1).

    Args:
        obs: observation convertible to a float32 tensor.

    Returns:
        ``np.ndarray`` of shape (1,) and dtype float32. Wrapping the output
        tensor in a list before ``np.array`` would add a spurious leading
        axis, so the tensor is converted directly instead.
    """
    with torch.no_grad():
        x = torch.tensor(obs, dtype=torch.float32)
        y = model(x)
    return (2*y - 1).numpy().reshape(1)

MountainCar best solution. Reward: -98.2  [-108...-83] std = 7
2022-06-13 11:55:40.431447
Sequential(
  (0): Linear(in_features=2, out_features=32, bias=True)
  (1): Sigmoid()
  (2): Linear(in_features=32, out_features=64, bias=True)
  (3): Sigmoid()
  (4): Linear(in_features=64, out_features=1, bias=True)
  (5): Sigmoid()
)


## Video

In [42]:
import math
import copy
import time
import gym
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn

class ActorModel(nn.Module):
    """Policy network pi(a|s) with a shared trunk and two heads.

    The trunk is a stack of Linear+ReLU layers. One head produces ``mu``
    through Tanh (values in [-1, 1]); the other produces ``std`` through
    Softplus (strictly positive).
    """

    def __init__(self, nS, nA, hiddens):
        """
        Args:
            nS: size of the input (state) vector.
            nA: size of the output (action) vector of each head.
            hiddens: list with the width of every hidden layer of the trunk.
        """
        super().__init__()

        widths = [nS] + hiddens
        trunk = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            trunk += [nn.Linear(n_in, n_out), nn.ReLU()]
        self.base = nn.Sequential(*trunk)

        features = widths[-1]          # width of the trunk output
        self.mu = nn.Sequential(nn.Linear(features, nA), nn.Tanh())
        self.std = nn.Sequential(nn.Linear(features, nA), nn.Softplus())

    def forward(self, x):
        """Return the pair ``(mu, std)`` computed from the shared trunk output."""
        hidden = self.base(x)
        return self.mu(hidden), self.std(hidden)


In [54]:
def scale(obs):
    """Linearly map an observation from the env's [low, high] box to [-1, 1].

    Reads the bounds from the global ``env.observation_space``.
    """
    bounds = env.observation_space
    low = bounds.low
    span = bounds.high - low
    return -1. + 2.*(obs - low)/span
        
def policy(state):
    """Sample a clipped action from the Gaussian predicted by the global ``actor``.

    The observation is rescaled with ``scale`` and moved to ``device``. The
    actor returns a mean and a standard deviation, and one sample is drawn
    from ``Normal(mu, std)`` with NumPy's global random state.

    Args:
        state: raw environment observation.

    Returns:
        ``np.ndarray`` with the sampled action clipped to [-1, 1].
    """
    obs = torch.tensor(scale(state), dtype=torch.float32).to(device)
    # Inference only: no_grad avoids building an autograd graph on every step
    # (replaces the legacy `.data` access).
    with torch.no_grad():
        mu, std = actor(obs)

    mu = mu.cpu().numpy()
    std = std.cpu().numpy()

    actions = np.random.normal(mu, std)

    return np.clip(actions, -1, 1)


# Use the first GPU when available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)

# SECURITY NOTE: torch.load unpickles the file, which can execute arbitrary
# code -- only load checkpoints from a trusted source.
# map_location=device remaps stored tensors onto the selected device, so a
# checkpoint saved on a GPU still loads when the notebook falls back to the CPU.
state = torch.load('models/MountainCarContinuous-v0.96.pt', map_location=device)

print(state['info'])
print(state['date'])
print(state['config'])
print(state['actor'])

# Rebuild the actor with the hidden-layer sizes stored in the checkpoint config
# (2 state inputs, 1 action output) and restore its weights.
actor = ActorModel(2, 1, state['config']['actor'])
actor.to(device)
actor.load_state_dict(state['actor_nn'])

device: cuda:0
MountainCarContinuous-v0: Q-function, Reward:  286
2022-07-27 20:26:47.089979
{'env': 'MountainCarContinuous-v0', 'ticks': 1000, 'timeout': True, 'method': 'ac', 'gamma': 0.99, 'decays': 1, 'capacity': 10000, 'actor': [64, 64], 'critic': [512, 512], 'online': 1, 'update': -1, 'std_min': 0.01, 'std_max': 3, 'scale': True, 'optimizer': 'adam', 'batch_act': 1000, 'batch_cri': 1000, 'lm_act': 1e-06, 'lm_cri': 0.0001, 'beta': 0.01}
ActorModel(
  (base): Sequential(
    (0): Linear(in_features=2, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU()
  )
  (mu): Sequential(
    (0): Linear(in_features=64, out_features=1, bias=True)
    (1): Tanh()
  )
  (std): Sequential(
    (0): Linear(in_features=64, out_features=1, bias=True)
    (1): Softplus(beta=1, threshold=20)
  )
)


<All keys matched successfully>

In [58]:
from PIL import Image, ImageDraw, ImageFont
import imageio

_FONT_CACHE = {}


def _load_font(size):
    """Return a cached font of the given size, falling back to PIL's default font."""
    if size not in _FONT_CACHE:
        try:
            _FONT_CACHE[size] = ImageFont.truetype("verdana.ttf", size)
        except OSError:
            # verdana.ttf is not installed on every system; keep rendering
            # with the built-in bitmap font rather than failing.
            _FONT_CACHE[size] = ImageFont.load_default()
    return _FONT_CACHE[size]


def render_frame(frame, text: str):
    """Convert a frame to a PIL image and draw a caption and watermark on it.

    Args:
        frame: image array accepted by ``Image.fromarray``.
        text: caption drawn at the top-left corner. When empty, the image is
            returned without any overlay (no caption and no watermark).

    Returns:
        ``PIL.Image.Image`` built from ``frame``.
    """
    im = Image.fromarray(frame)
    if text:
        drawer = ImageDraw.Draw(im)
        # White text on dark frames, black text on bright frames.
        text_color = (255,255,255) if np.mean(im) < 128 else (0,0,0)
        drawer.text((10, 10), text, fill=text_color, font=_load_font(18))
        # Watermark near the bottom-right corner.
        drawer.text((im.size[0]-100,im.size[1]-20), "QuData.com", fill=text_color, font=_load_font(14))
    return im


# Record 10 episodes of the current global `policy` into render.mp4.
# `last` holds the total reward of the most recently finished episode and is
# shown at the end of the caption.
frames, last = [], 0
for episode in range(1, 11):
    rew = 0
    print(f"\repisode:{episode:2d}", end="")
    s = env.reset()
    for t in range(1000):
        a = policy(s)
        s, r, done, _ = env.step(a)
        rew += r

        if done:
            # Update before drawing so the final frame already shows this episode's total.
            last = rew

        frame = env.render(mode='rgb_array')
        # The statistics in the caption are hard-coded from the 1000-episode
        # evaluation printed earlier in this notebook. Caption typo fixed:
        # "Acrtor-Critic" -> "Actor-Critic".
        frame = render_frame(frame, f"{episode:2d}: Actor-Critic  <rew> =  95.9 ± 0.8 [93.1, 97.6]  {last:4.0f}")
        frames.append(frame)

        if done:
            break

imageio.mimwrite("render.mp4", frames, fps=60)
env.close()

episode:10

