# Approximating the Policy Function in the MountainCar Problem
## Neural network with three outputs (with softmax)

In [None]:
def dataset():
    """Collect (state, action) training pairs from the solver's policy table.

    Walks the interior cells of ``solver.pi`` (the first and last index along
    both axes are skipped) and keeps every cell whose entry is not -1
    (presumably -1 marks a cell without an assigned action -- confirm against
    the solver).

    Returns:
        A pair ``(X, Y)`` of parallel lists: ``X`` holds
        ``solver.state((ix, iv)).tolist()`` for each kept cell and ``Y`` the
        corresponding policy entry converted to ``int``.
    """
    policy = solver.pi
    n_x, n_v = solver.bins[0], solver.bins[1]

    states, actions = [], []
    for ix in range(1, n_x - 1):
        for iv in range(1, n_v - 1):
            action = policy[ix, iv]
            if action == -1:
                continue
            states.append(solver.state((ix, iv)).tolist())
            actions.append(int(action))
    return states, actions


In [None]:
# Replace the solver's prev/next lists with empty ones to free memory.
# NOTE(review): `torch` is imported in a later cell; that cell has to run first.
solver.prev, solver.next = [], []

# Build the training set and convert it to tensors:
# float32 features, int64 class labels.
X, Y = dataset()
X = torch.tensor(X, dtype=torch.float32)
Y = torch.tensor(Y, dtype=torch.int64)

# Peek at the last three samples.
print(X[-3:])
print(Y[-3:])

# Label range, sample count, and the number of samples labelled 1 and 0.
print(Y.min(), Y.max(), len(Y), (Y == 1.).sum(), (Y == 0.).sum())

In [None]:
import torch
import torch.nn as nn

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    gpu = torch.device("cuda:0")
else:
    gpu = torch.device("cpu")
print(gpu)

## Network architecture

In [None]:
# Widths of the two hidden layers.
nH = [32, 64]

# Fully connected network: 2 inputs -> nH[0] -> nH[1] -> 3 raw scores.
# There is no softmax layer here; the scores are normalised where they are
# consumed (nn.Module.to() moves the parameters in place and returns the module).
model = nn.Sequential(
    nn.Linear(2, nH[0]),
    nn.ReLU(),
    nn.Linear(nH[0], nH[1]),
    nn.ReLU(),
    nn.Linear(nH[1], 3),
).to(gpu)
print(gpu)

In [None]:
# Cross-entropy over the network's three raw output scores.
loss = nn.CrossEntropyLoss()

# SGD with momentum; the Adam variant is kept as a commented-out alternative.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.8)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

# Device is re-derived here so this cell does not depend on the earlier one.
gpu = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Keep the whole dataset on the training device.
X, Y = X.to(gpu), Y.to(gpu)

print(X.shape, Y.shape)
print(X.device, Y.device)

In [None]:
def fit(model, X, Y, batch_size=1000, train=True, criterion=None, opt=None):
    """Run one pass over ``(X, Y)`` and return the mean loss and accuracy.

    The samples are shuffled, split into mini-batches of ``batch_size`` and
    fed through ``model``. Unlike a floor-division batching scheme, the final
    (possibly shorter) batch is processed too, so every sample contributes
    and datasets smaller than ``batch_size`` no longer divide by zero.

    Args:
        model: Module mapping a batch of inputs to per-class scores.
        X: Input tensor, one sample per row.
        Y: Integer class labels, same length and device as ``X``.
        batch_size: Number of samples per mini-batch.
        train: When true, the model is put in training mode and the optimizer
            takes a step after every batch; otherwise the pass is
            evaluation-only and no gradients are tracked.
        criterion: Loss callable ``criterion(scores, labels)``. Defaults to
            the notebook-level ``loss``.
        opt: Optimizer used when ``train`` is true. Defaults to the
            notebook-level ``optimizer``.

    Returns:
        ``(mean_loss, accuracy)``: the sample-weighted mean loss as a Python
        float and the fraction of correct predictions as a 0-dim tensor on
        the device of ``X``.

    Raises:
        ValueError: If ``X`` is empty.
    """
    # Fall back to the notebook globals only when no override is supplied.
    criterion = loss if criterion is None else criterion
    if train and opt is None:
        opt = optimizer

    n = len(X)
    if n == 0:
        raise ValueError("fit() needs at least one sample")

    model.train(train)

    # Shuffle on whatever device the data lives on, not on a global device.
    perm = torch.randperm(n).to(X.device)
    X, Y = X[perm], Y[perm]

    sum_loss = 0.0
    correct = torch.zeros((), device=X.device)

    # Evaluation passes do not need autograd graphs.
    with torch.set_grad_enabled(train):
        for start in range(0, n, batch_size):
            xb = X[start:start + batch_size]
            yb = Y[start:start + batch_size]

            scores = model(xb)
            batch_loss = criterion(scores, yb)

            if train:
                opt.zero_grad()
                batch_loss.backward()
                opt.step()

            # Weight by batch length so a short tail batch is not over-counted.
            sum_loss += batch_loss.item() * len(xb)
            # Softmax is monotonic, so argmax of the raw scores is the same class.
            correct += (scores.detach().argmax(dim=1) == yb).sum()

    return sum_loss / n, correct / n

In [None]:
import time
model.to(gpu)

# Baseline metrics from an evaluation-only pass (train=False).
print("before:      loss: %.3e accuracy: %.4f" % fit(model, X, Y, train=False))

# NOTE(review): time.process_time() reports CPU time of the process, not
# wall-clock time -- confirm that this is the intended measurement.
epochs = 1000
beg = time.process_time()
for epoch in range(epochs):
    L, A = fit(model, X, Y, batch_size=1000)

    # Report every 10th epoch (except epoch 0) and always on the last one.
    if epoch != epochs - 1 and (epoch == 0 or epoch % 10 != 0):
        continue

    print(f"epoch: {epoch:5d} loss: {L:.3e} accuracy: {A:.4f}   {time.process_time()-beg:.2f}s")
    beg = time.process_time()  # restart the timer for the next report interval

In [None]:
def table(model, bins=(101, 101), low=None, high=None):
    """Tabulate the model's greedy action on a regular grid of states.

    Grid point ``(ix, iv)`` is the state ``low + (ix, iv) * step`` with
    ``step = (high - low) / bins``. All grid points are evaluated in a single
    forward pass and the index of the largest output is stored per cell.

    The grid tensor is created on the CPU, so ``model`` must be on the CPU.

    Args:
        model: Callable mapping an ``(N, 2)`` float32 tensor to ``(N, C)``
            scores.
        bins: Number of grid points along each of the two state axes.
        low: Lower state bounds; defaults to ``solver.env.low``.
        high: Upper state bounds; defaults to ``solver.env.high``.

    Returns:
        A float64 NumPy array of shape ``bins`` holding the arg-max output
        index for every grid cell.
    """
    bins = tuple(bins)
    low = np.asarray(solver.env.low if low is None else low)
    high = np.asarray(solver.env.high if high is None else high)
    step = (high - low) / np.asarray(bins)

    # Row i of `cells` is (ix, iv) with iv varying fastest, i.e. the same
    # order as nested `for ix ... for iv ...` loops.
    cells = np.indices(bins).reshape(len(bins), -1).T
    X = torch.tensor(low + cells * step, dtype=torch.float32)

    print("model calc")
    with torch.no_grad():
        Y = model(X)
        # Softmax is monotonic, so argmax over the raw scores gives the same index.
        A = torch.argmax(Y, dim=1)
    print("X:", X.shape, "Y:", Y.shape, "A:", A.shape)

    return A.reshape(bins).cpu().numpy().astype(np.float64)


# model.cpu() moves the network to the CPU in place and returns it; the policy
# is then tabulated on a 1001 x 1001 grid.
pi = table(model.cpu(), bins=(1001, 1001))

In [None]:
# Draw the tabulated policy. `bins` repeats the 1001 x 1001 grid size used to
# build `pi`; plot() is defined outside this section.
plot(pi, 'Policy function', d_ticks=100, bins = (1001, 1001))

In [None]:
def test(model, episodes=10000, ticks=200, level=0.05):
    """Roll out the greedy policy of ``model`` on MountainCar-v0 and print reward statistics.

    For every episode the action with the largest model output is taken at
    each step until the environment signals the end of the episode or
    ``ticks`` steps have been made. Running statistics over all finished
    episodes are printed every 100 episodes and once more at the end.

    Both Gym API generations are accepted: ``reset()`` returning either the
    observation or ``(observation, info)``, and ``step()`` returning either
    4 or 5 values.

    Args:
        model: Callable mapping a float32 observation tensor to action scores;
            observations are passed as CPU tensors.
        episodes: Number of episodes to run.
        ticks: Maximum number of steps per episode.
        level: Not used by this function; kept so existing calls keep working.
    """
    rews = []

    def report(episode, end="\n"):
        # Mean, standard error of the mean, std, min and max of the total rewards.
        mean, std = np.mean(rews), np.std(rews)
        print(f"\r{episode:5d}:  Reward: {mean:7.2f} ± {std/len(rews)**0.5:.1f}, std: {std:.0f}, min: {np.min(rews):.0f}  max: {np.max(rews):.0f}", end=end)

    env = gym.make("MountainCar-v0")
    try:
        for episode in range(episodes):
            obs = env.reset()
            if isinstance(obs, tuple):  # newer Gym: (observation, info)
                obs = obs[0]

            tot = 0
            for _ in range(ticks):
                with torch.no_grad():
                    y = model(torch.tensor(obs, dtype=torch.float32))
                # Softmax is monotonic, so argmax of the raw scores is the same action.
                action = torch.argmax(y, 0).item()

                result = env.step(action)
                obs, rew, done = result[0], result[1], result[2]
                if len(result) == 5:  # newer Gym: (obs, rew, terminated, truncated, info)
                    done = done or result[3]
                tot += rew

                if done:
                    break

            rews.append(tot)
            if episode and episode % 100 == 0:
                report(episode, end="")
    finally:
        # Release the environment even if a roll-out fails.
        env.close()

    # Nothing to summarise when no episode was run.
    if rews:
        report(len(rews) - 1)
            
    
# Evaluate the trained network with the default number of episodes and steps.
# `level` is passed here, but test() as defined in this notebook never reads it.
test(model, level = 0.05)

In [None]:
import datetime
  
# Bundle the model and optimizer state into a single checkpoint dictionary.
state = {
    'info': "MountainCar",                    # description
    'date': datetime.datetime.now(),          # date and time of saving
    'model': model.state_dict(),              # model parameters
    'optimizer': optimizer.state_dict(),      # optimizer state
}

# Write the checkpoint to disk.
# NOTE(review): the file name says "16_32" while this notebook sets
# nH = [32, 64] -- confirm the name is still accurate.
torch.save(state, 'MountainCar_16_32_soft.9980.99.pt')