In [9]:
import time
import numpy as np
import matplotlib.pyplot as plt
import gym
import torch
from   torch import nn

In [20]:
# Environment and the discrete action indices the policy samples from.
env = gym.make('MountainCar-v0')
action_space = np.arange(env.action_space.n)

# Take the network's input/output widths from the environment instead of
# hard-coding 2 and 3, so the first and last layers always match the
# observation vector length and the number of discrete actions.
n_obs     = env.observation_space.shape[0]
n_actions = env.action_space.n

nH = [32, 64]                                   # hidden layer widths
# Layers stay positional (unnamed) so the state_dict keys remain '0.weight', '2.weight', ...
model = nn.Sequential(
           nn.Linear(n_obs, nH[0]),
           nn.ReLU(),
           nn.Linear(nH[0], nH[1]),
           nn.ReLU(),
           nn.Linear(nH[1], n_actions) )        # one raw score per action; softmax is applied by the callers

In [21]:
def discount_rewards(rewards, gamma=0.99):
    """Turn per-step rewards of one episode into mean-centred discounted tail sums.

    Each reward is first scaled by ``gamma**k`` where ``k`` is its absolute step
    index in the episode. Entry ``t`` of the result is then the sum of those
    scaled rewards over all steps ``k >= t``, minus the mean of all such sums.

    Args:
        rewards: sequence of rewards, one per step.
        gamma:   discount factor.

    Returns:
        1-D numpy array with one value per step.
    """
    weighted = np.array([gamma**step * reward for step, reward in enumerate(rewards)])

    # Reverse, take the running sum, reverse back -> suffix sums.
    # (slice syntax reminder: [begin : end : step])
    backwards = weighted[::-1]
    running = np.cumsum(backwards)
    tail_sums = running[::-1]

    return tail_sums - tail_sums.mean()


def policy(obs):
    """Sample one action for observation ``obs`` from the current policy network.

    The observation is passed through the module-level ``model`` with gradient
    tracking disabled, the outputs are converted to probabilities by a softmax
    over dimension 0, and one entry of the module-level ``action_space`` is
    drawn with those probabilities.
    """
    state = torch.FloatTensor(obs)
    with torch.no_grad():
        scores = model(state)
        action_probs = torch.softmax(scores, 0)

    # random draw weighted by the action probabilities
    return np.random.choice(action_space, p=action_probs.numpy())
    
#------------------------------------------------------------------------------

def _reset_env(env):
    """Reset ``env`` and return only the initial observation.

    Accepts both reset() conventions: a bare observation, or an
    ``(observation, info_dict)`` pair as returned by newer Gym/Gymnasium releases.
    """
    out = env.reset()
    if isinstance(out, tuple) and len(out) == 2 and isinstance(out[1], dict):
        return out[0]
    return out


def _step_env(env, action):
    """Advance ``env`` by one step and return ``(obs, reward, done)``.

    Accepts both the 4-tuple ``(obs, reward, done, info)`` and the 5-tuple
    ``(obs, reward, terminated, truncated, info)`` step() conventions.
    """
    out = env.step(action)
    if len(out) == 5:
        obs, rew, terminated, truncated, _ = out
        return obs, rew, bool(terminated or truncated)
    obs, rew, done, _ = out
    return obs, rew, bool(done)


def learn(env, model, num_iters=1000, num_episodes=100, gamma=0.99, ticks=200,
          lr=0.000001, optimizer=None):
    """Train ``model`` with a REINFORCE-style policy-gradient loop.

    Every iteration plays ``num_episodes`` episodes using the module-level
    ``policy`` function, converts each episode's rewards with the module-level
    ``discount_rewards`` function, and performs one optimizer step on the
    loss ``-(weight * log(prob(action) + 1e-5)).mean()`` over the whole batch.

    The last reward of every episode is increased by ``10 * max_x`` where
    ``max_x`` is the largest value of ``obs[0] - left`` seen in the episode
    (``left = -1.2``). The logged episode total does not include this bonus.

    Args:
        env:          Gym-style environment (``reset()`` / ``step(action)``).
        model:        torch module mapping observations to per-action scores.
        num_iters:    number of batches / optimizer steps.
        num_episodes: episodes per batch; must be >= 1.
        gamma:        discount factor passed to ``discount_rewards``.
        ticks:        maximum steps per episode; must be >= 1. Episodes cut off
                      by this limit are logged and shaped the same way as
                      episodes for which the environment reports ``done``.
        lr:           Adam learning rate; used only when ``optimizer`` is None.
        optimizer:    optional ready-made optimizer for ``model`` (for example
                      one whose state was restored from a checkpoint). When
                      None, a new Adam optimizer is created.

    Returns:
        List with the unshaped total reward of every played episode.

    Raises:
        ValueError: if ``num_episodes`` or ``ticks`` is smaller than 1.
    """
    if num_episodes < 1 or ticks < 1:
        raise ValueError("num_episodes and ticks must both be >= 1")

    if optimizer is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    tot_learn = 0
    # Must be a plain scalar: the former trailing comma made this a tuple, which
    # turned max_x (and then the shaped reward) into a 1-element array.
    left = -1.2
    last_num = 1000                       # window size for the running statistics
    beg = time.process_time()
    total_rewards = []

    for iteration in range(num_iters):
        batch_rewards, batch_actions, batch_states = [], [], []

        for _episode in range(num_episodes):
            rewards = []
            obs = _reset_env(env)
            max_x = float(obs[0]) - left

            for _tick in range(ticks):
                action = policy(obs)

                # plain Python lists/ints avoid tensor-construction warnings
                batch_states.append(np.asarray(obs).tolist())
                batch_actions.append(int(action))

                obs, rew, done = _step_env(env, action)
                max_x = max(max_x, float(obs[0]) - left)
                rewards.append(float(rew))

                if done:
                    break                 # episode finished

            # Finalise the episode whether it ended by `done` or by the tick limit.
            total_rewards.append(sum(rewards))
            rewards[-1] += 10 * max_x     # shaping bonus for the furthest point reached
            batch_rewards.extend(discount_rewards(rewards, gamma))

        X = torch.FloatTensor(batch_states)
        W = torch.FloatTensor(batch_rewards)
        A = torch.LongTensor(batch_actions)          # actions are used as indices, must be LongTensor

        probs = torch.softmax(model(X), 1)
        logprob = (probs + 1.e-5).log()              # log-probabilities of all actions
        # squeeze(1) keeps a 1-D result even when the batch holds a single step
        logprob = torch.gather(logprob, 1, A.view(-1, 1)).squeeze(1)
        loss = -(W * logprob).mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        tot_learn += 1

        recent = total_rewards[-last_num:]
        avg = np.mean(recent)
        std = np.std(recent) / len(recent) ** 0.5    # standard error of the mean
        print("\rIter: %4d  episode: %5d Reward of last %d trials:  %5.2f +/- %5.2f  loss:%.3e,  time: %5.0fs"
                 % (iteration + 1, num_episodes, last_num, avg, std, loss.item(), time.process_time() - beg), end = "")

    print(" tot_learn", tot_learn)
    return total_rewards

In [None]:
# Restore a previously saved checkpoint.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints that come from a trusted source.
state = torch.load('MountainCar_16_32_soft.9980.99.pt')                   # load the checkpoint file
# NOTE(review): the file name suggests hidden sizes 16/32 — confirm the checkpoint
# matches the architecture of `model`, otherwise load_state_dict will fail.
model.load_state_dict(state['model'])        # restore the model parameters
# `optimizer` was never defined at notebook scope (learn() builds its own local one),
# so this cell raised NameError. Create it here before restoring its state.
# Assumes the checkpoint was written from an Adam optimizer, as used in learn() — confirm.
optimizer = torch.optim.Adam(model.parameters())
optimizer.load_state_dict(state['optimizer'])    # restore the optimizer state
print(state['info'], state['date'])              # auxiliary information stored with the checkpoint


In [22]:
# Run training with learn()'s default settings; `rewards` receives the list of
# episode reward totals that learn() returns.
rewards = learn(env, model)

Iter:   33  episode:   100 Reward of last 1000 trials:  -200.00 +/-  0.00  loss:8.438e-08,  time:   435ss

KeyboardInterrupt: 