In [1]:
import gym
import numpy as np
import random
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd
import seaborn as sns
import gymGharrafa

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
import time
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from baselines.common import cmd_util
from tensorboardX import SummaryWriter

from collections import deque  

import warnings
warnings.filterwarnings('ignore')
import gc

import torch
import torch.nn as nn
import torch.nn.utils as nn_utils
import torch.nn.functional as F
import torch.optim as optim


In [2]:
class Experience:
    """Iterator that steps a vectorised environment and yields n-step transitions.

    Each call to ``next()`` tops up an internal window of ``nsteps + 1``
    consecutive transitions (one per call after the first fill), pops the
    oldest one and returns, per sub-environment, the discounted sum of the
    rewards of the ``nsteps`` oldest transitions in that window.

    Actions are drawn uniformly from ``possible_actions`` while ``policy`` is
    ``None``; otherwise they are sampled from the softmax of the policy output.

    Parameters
    ----------
    subEnvs : vectorised environment exposing ``num_envs``, ``reset()`` and
        ``step(actions)`` returning ``(obs, reward, done, info)`` with one
        entry per sub-environment.
    nsteps : int
        Number of rewards accumulated into each returned value.
    gamma : float
        Discount factor.
    policy : callable or None
        Maps a float tensor of shape ``(num_envs, features)`` to action logits.
    critic : stored and replaced by ``update`` but not used inside this class.
    possible_actions : sequence of actions to sample from.
    device : torch device on which observations are placed before calling
        ``policy``.
    """

    def __init__(self, subEnvs, nsteps, gamma, policy, critic, possible_actions, device):
        self.policy = policy
        self.critic = critic
        self.subEnvs = subEnvs
        self.steps = nsteps
        self.state = subEnvs.reset()
        self.last_steps = deque([])
        self.reward_accumulator = 0
        self.episodes_count = 0
        self.possible_actions = possible_actions
        self.gamma = gamma
        self.device = device

        # Monitoring counters, one slot per sub-environment.
        self.currentscores = np.zeros(subEnvs.num_envs)   # score of the running episode
        self.accscores = np.zeros(subEnvs.num_envs)       # sum of finished-episode scores
        self.finished = np.zeros(subEnvs.num_envs)        # number of finished episodes
        self.lastfinished = np.zeros(subEnvs.num_envs)    # not updated in this class
        self.lastaccscores = np.zeros(subEnvs.num_envs)   # not updated in this class

        # Action-probability arrays recorded at each policy-driven step.
        self.probs = []

    def getStats(self):
        """Return ``(mean_return, entropy, episodes_count)``.

        ``entropy`` averages the per-environment action entropy over the last
        100 recorded probability arrays; it is ``nan`` when no policy-driven
        step has been recorded yet. ``mean_return`` is the mean over
        sub-environments of their average finished-episode score, or ``0``
        until every sub-environment has finished at least one episode.
        As a side effect the probability history is trimmed to 100 entries
        and a garbage collection is triggered.
        """
        recent_probs = self.probs[-100:]
        if recent_probs:
            entropy = np.mean([
                np.mean([stats.entropy(a[pi]) for pi in range(a.shape[0])])
                for a in recent_probs
            ])
        else:
            # Nothing recorded yet (random-action phase): report "no data"
            # explicitly instead of averaging an empty list.
            entropy = float("nan")

        if 0 not in self.finished:
            mean_return = np.mean(self.accscores / self.finished)
        else:
            mean_return = 0

        self.probs = recent_probs
        gc.collect()
        return mean_return, entropy, self.episodes_count

    def update(self, policy, critic):
        """Replace the policy and critic used for subsequent steps."""
        self.policy = policy
        self.critic = critic

    def __iter__(self):
        return self

    def __next__(self):
        """Advance the environments as needed and return one n-step transition.

        Returns
        -------
        tuple
            ``(first_state, accumulated_reward, last_state, first_actions,
            was_end_reached)`` where ``last_state`` is zeroed for every
            sub-environment whose episode ended inside the window and
            ``was_end_reached`` is the matching boolean mask.
        """
        num_envs = self.subEnvs.num_envs

        # Top up the window to steps + 1 stored transitions.
        for _ in range((self.steps + 1) - len(self.last_steps)):
            if self.policy is not None:
                # Use the environment held by this instance (not a notebook
                # global) and flatten each observation to one feature row.
                obs = np.asarray(self.state).reshape(num_envs, -1)
                obsTensor = torch.FloatTensor(obs).to(self.device)
                with torch.no_grad():  # sampling only, no gradients needed
                    softmax = F.softmax(self.policy(obsTensor), dim=1)
                probs = softmax.cpu().numpy()
                self.probs.append(probs)
                actions = np.array([
                    np.random.choice(self.possible_actions, p=probs[c])
                    for c in range(num_envs)
                ])
            else:
                actions = np.array([
                    random.choice(self.possible_actions) for _ in range(num_envs)
                ])

            newstate, reward, episode_over, _info = self.subEnvs.step(actions)
            episode_over = np.asarray(episode_over, dtype=bool)

            # Add the final score of episodes that just finished ...
            self.currentscores += reward
            self.accscores += self.currentscores * episode_over
            self.finished += episode_over
            # ... and restart the running score for those environments.
            self.currentscores = self.currentscores * ~episode_over

            self.reward_accumulator += np.sum(reward)
            self.episodes_count += np.sum(episode_over)

            self.last_steps.append((self.state, reward, episode_over, actions))
            self.state = newstate

        first_state, first_reward, first_done, first_actions = self.last_steps.popleft()

        # Work on copies so arrays handed out by the environment are never
        # modified in place.
        accumulated_reward = np.array(first_reward, dtype=float)
        was_end_reached = np.array(first_done, dtype=bool)

        gammap = 1.0
        for i in range(self.steps - 1):
            gammap = gammap * self.gamma
            _, reward, end_reached, _ = self.last_steps[i]
            # Rewards after an episode end belong to the next episode: mask them.
            accumulated_reward += reward * gammap * ~was_end_reached
            was_end_reached |= np.asarray(end_reached, dtype=bool)

        # Zero the final state where the episode ended; the mask is reshaped so
        # it broadcasts over observations of any rank (per-env axis first).
        final_obs = np.asarray(self.last_steps[-1][0])
        alive = (~was_end_reached).reshape((-1,) + (1,) * (final_obs.ndim - 1))
        last_state = final_obs * alive

        return (first_state, accumulated_reward, last_state, first_actions, was_end_reached)

In [3]:
# --- Hyper-parameters and run configuration ---------------------------------
NUM_ENVS = 6            # number of parallel sub-environments
BATCH_SIZE = 32         # iterator items gathered before each optimisation step
nsteps = 20             # rewards accumulated per transition by Experience

GAMMA = 0.99            # discount factor handed to Experience
LEARNING_RATE = 1e-5    # Adam step size
ENTROPY_BETA = 1e-3     # weight of the entropy bonus in the loss

# NOTE(review): CLIP_GRAD is not referenced by any cell visible here.
CLIP_GRAD = 0.2

# Discrete action ids 0..10.
possible_actions = range(0, 11)

In [4]:
class NetPG(nn.Module):
    """Policy network: one hidden ReLU layer of 256 units producing raw logits.

    ``input_size`` is the number of input features per sample and
    ``n_actions`` the number of output logits. No softmax is applied here.
    """

    def __init__(self, input_size, n_actions):
        super().__init__()

        hidden_units = 256
        layers = [
            nn.Linear(input_size, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, n_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the unnormalised action scores for the batch ``x``."""
        logits = self.net(x)
        return logits

In [None]:
# Build the vectorised environment used for all rollouts.
# Arguments are positional: environment id, the GharrafaBasicEnv class,
# the number of parallel environments, and 1232 (presumably the RNG seed --
# confirm against the installed baselines `make_vec_env` signature).
subEnvs = cmd_util.make_vec_env(
    "gymGharrafa-v1",
    gymGharrafa.GharrafaBasicEnv,
    NUM_ENVS,
    1232,
)

Logging to /tmp/openai-2019-04-01-11-03-49-335466


In [None]:
# Run setup: the policy is created lazily by the training cell, so the
# experience source starts out sampling random actions.
policy = None
critic = None

# Prefer the second GPU as originally configured, but fall back to the first
# GPU or the CPU so the notebook still runs on machines without it.
if torch.cuda.is_available() and torch.cuda.device_count() > 1:
    device = torch.device('cuda:1')
elif torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

exp = Experience(subEnvs, nsteps, GAMMA, policy, critic, possible_actions, device)

# TensorBoard writer; the comment string tags the run with its main settings.
writer = SummaryWriter(comment="ANN-Roundabout_NOCRITIC. Nenv %d,batch length %d, nsteps %d, eta %2f" % (NUM_ENVS,BATCH_SIZE,nsteps,LEARNING_RATE))



In [None]:
# Critic-free policy-gradient training loop.
#
# Every BATCH_SIZE items pulled from `exp` (each item holds one transition per
# sub-environment) are flattened into NUM_ENVS * BATCH_SIZE samples and used
# for a single Adam step on
#     loss = -mean(return * log pi(a|s)) - ENTROPY_BETA * entropy.
# The loop runs until the kernel is interrupted.
LOG_EVERY_N_BATCHES = 1  # print a progress line every N optimisation steps

batch_states = []
batch_actions = []
batch_advantages = []
batch_last_states = []
batch_final_state = []
batch_count = 0
baseline = 0  # not used by this critic-free variant

# Network sizes are derived from the environment and action set rather than
# hard-coded, so they cannot drift away from the reshape below.
obs_dim = subEnvs.observation_space.shape[1]
n_actions = len(possible_actions)
n_samples = NUM_ENVS * BATCH_SIZE

for first_state, advantage, last_state, first_action, final_state in exp:
    batch_states.append(first_state)
    batch_advantages.append(advantage)
    batch_final_state.append(final_state)
    batch_last_states.append(last_state)
    batch_actions.append(first_action)

    if len(batch_states) < BATCH_SIZE:
        continue

    batch_states = np.array(batch_states).reshape(n_samples, obs_dim)
    batch_actions = np.array(batch_actions).reshape(n_samples)
    batch_advantages = np.array(batch_advantages).reshape(n_samples)
    batch_final_state = np.array(batch_final_state).reshape(n_samples)
    batch_last_states = np.array(batch_last_states).reshape(n_samples, obs_dim)

    # The first batch is gathered with random actions; the network and its
    # optimiser are created once that batch is complete.
    if policy is None:
        policy = NetPG(obs_dim, n_actions).to(device)
        exp.update(policy, critic)
        optimizer = optim.Adam(policy.parameters(), lr=LEARNING_RATE)

    delta = torch.FloatTensor(batch_advantages).to(device)
    actions_t = torch.from_numpy(batch_actions.astype(np.int64)).to(device)

    optimizer.zero_grad()
    logits_v = policy(torch.FloatTensor(batch_states).to(device))
    log_prob_v = F.log_softmax(logits_v, dim=1)

    # Pick, for each sample, the log-probability of the action actually taken.
    # (`log_prob_v[:, batch_actions]` would instead build an
    # (n_samples, n_samples) matrix mixing every sample with every action.)
    chosen_log_prob_v = log_prob_v.gather(1, actions_t.unsqueeze(1)).squeeze(1)
    log_prob_actions_v = delta * chosen_log_prob_v
    loss_policy_v = -log_prob_actions_v.mean()

    # Entropy bonus: subtracting it from the loss rewards higher-entropy policies.
    prob_v = F.softmax(logits_v, dim=1)
    entropy_v = -(prob_v * log_prob_v).sum(dim=1).mean()
    entropy_loss_v = -ENTROPY_BETA * entropy_v

    loss_v = loss_policy_v + entropy_loss_v
    loss_v.backward()
    # NOTE(review): CLIP_GRAD is defined in the configuration cell but no
    # gradient clipping is applied here -- confirm whether that is intended.
    optimizer.step()

    batch_states = []
    batch_actions = []
    batch_advantages = []
    batch_last_states = []
    batch_final_state = []

    gc.collect()
    batch_count += 1

    # getStats() trims history and runs gc, so call it once and reuse the result.
    episode_score, entropy, nepisodes = exp.getStats()
    writer.add_scalar("Episode Score", episode_score, nepisodes)
    writer.add_scalar("Entropy", entropy, nepisodes)

    if batch_count % LOG_EVERY_N_BATCHES == 0:
        print("Mean entropy of last action: %2f. Average score after %d batches and %d episodes: %2f" % (entropy,batch_count,nepisodes,episode_score))


Mean entropy of last action: nan. Average score after 1 batches and 0 episodes: 0.000000
Mean entropy of last action: 2.046667. Average score after 2 batches and 2 episodes: 0.000000
Mean entropy of last action: 2.068501. Average score after 3 batches and 3 episodes: 0.000000
Mean entropy of last action: 2.046592. Average score after 4 batches and 4 episodes: 0.000000
Mean entropy of last action: 2.046337. Average score after 5 batches and 6 episodes: 0.000000
Mean entropy of last action: 2.050170. Average score after 6 batches and 7 episodes: 665.833333
Mean entropy of last action: 2.052599. Average score after 7 batches and 10 episodes: 655.305556
Mean entropy of last action: 2.045501. Average score after 8 batches and 10 episodes: 655.305556
Mean entropy of last action: 2.022125. Average score after 9 batches and 11 episodes: 649.722222
Mean entropy of last action: 2.012846. Average score after 10 batches and 12 episodes: 618.138889
Mean entropy of last action: 2.002524. Average sco

In [None]:
# Persist the trained policy under a file name that encodes the run settings.
# NOTE(review): this pickles the whole module object, so loading it later needs
# the NetPG class to be importable; saving `policy.state_dict()` is the more
# portable alternative if the consumers of this file can be changed.
torch.save(
    policy,
    "ANN-Roundabout_NOCRITIC_Nenv_%d_batch_length_%d_nsteps_%d_eta_%2f.model"
    % (NUM_ENVS, BATCH_SIZE, nsteps, LEARNING_RATE),
)