In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from torch.nn import init
import copy
import numpy as np

import queue

In [16]:
gpu = False

In [17]:
class RND(nn.Module):
    def __init__(self,width = 240, height =256, channel = 3):
        super(RND,self).__init__()
        
        self.width = width
        self.height = height
        self.channel = channel
        
        self.predictor = nn.Sequential(
            nn.Conv2d(
                in_channels=3,
                out_channels=32,
                kernel_size=8,
                stride=4),
            nn.LeakyReLU(),
            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=4,
                stride=2),
            nn.LeakyReLU(),
            nn.Conv2d(
                in_channels=64,
                out_channels=64,
                kernel_size=3,
                stride=1),
            nn.LeakyReLU(),
            Flatten(),
            nn.Linear(46592, 512), # change
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512)
            )
        self.target = copy.deepcopy(self.predictor)
        for p in self.modules():
            if isinstance(p, nn.Conv2d):
                init.orthogonal_(p.weight, np.sqrt(2))
                p.bias.data.zero_()

            if isinstance(p, nn.Linear):
                init.orthogonal_(p.weight, np.sqrt(2))
                p.bias.data.zero_()

        for param in self.target.parameters():
            param.requires_grad = False
            
    def forward(self, state):
        target_feature = self.target(state)
        predict_feature = self.predictor(state)
        return predict_feature, target_feature

In [18]:
class Flatten(nn.Module):
    def forward(self, input):
        return input.view(input.size(0), -1)
class PPO(nn.Module):
    def __init__(self,width = 240, height =256, channel = 3, action_dim = 7):
        self.width = width
        self.height = height
        self.channel = channel
        self.action_dim = action_dim
        super(PPO,self).__init__()
        self.memory = []
        
        self.basic = nn.Sequential(\
                    nn.Conv2d(in_channels = 3,\
                             out_channels = 32,\
                             kernel_size = 8,\
                              stride = 4),
                    nn.ReLU(),
                                   nn.Conv2d(in_channels = 32,\
                                           out_channels = 64,\
                                           kernel_size = 4,\
                                           stride = 2),\
                    nn.ReLU(),\
                    nn.Conv2d(in_channels = 64,\
                             out_channels = 64,\
                             kernel_size = 3,\
                             stride = 1),
                                   nn.ReLU(),\
                                   Flatten(),
                    nn.Linear(46592,256), #have to change       
                    nn.ReLU(),\
                    nn.Linear(256,448),
                    nn.ReLU()
                    )
        self.actor = nn.Sequential(\
                                  nn.Linear(448,448),\
                                  nn.ReLU(),\
                                  nn.Linear(448,self.action_dim)\
                                  )
        
        self.extrinsic_critic = nn.Linear(448,1)
        self.intrinsic_critic = nn.Linear(448,1)
        
        init.orthogonal_(self.extrinsic_critic.weight, 0.01)
        self.extrinsic_critic.bias.data.zero_()

        init.orthogonal_(self.intrinsic_critic.weight, 0.01)
        self.intrinsic_critic.bias.data.zero_()

        for i in range(len(self.actor)):
            if type(self.actor[i]) == nn.Linear:
                init.orthogonal_(self.actor[i].weight, 0.01)
                self.actor[i].bias.data.zero_()
    def forward(self, x,dim = -1):
        x = self.basic(x)
        action = self.actor(x)
        action_prob = F.softmax(action,dim = dim)
        
        intrinsic = self.intrinsic_critic(x)
        extrinsic = self.extrinsic_critic(x)
        return action_prob,extrinsic,intrinsic    

    


In [19]:
learning_rate = 0.0001
gamma         = 0.98
lmbda         = 0.95
eps_clip      = 0.1
K_epoch       = 4
T_horizon     = 128
critic_coef = 0.5
ent_coef = 0.001
intrinsic_gamma = 0.99
extrinsic_gamma = 0.999
update_proportion = 0.25

extrinsic_advantage_coef = 2
intrinsic_advantage_coef = 1
class Agent(nn.Module):
    def __init__(self,width=240,height=256,channel = 3,action_dim=7,learning_rate=0.0005):
        self.width = width
        self.height = height
        self.channel = channel
        self.action_dim = action_dim
        
        super(Agent,self).__init__()
        
        self.memory = []
        self.intrinsic_queue = queue.Queue()
        self.intrinsic_input_queue = queue.Queue()
        self.ppo = PPO(self.width, self.height, self.channel, self.action_dim)
        self.rnd = RND(self.width, self.height , self.channel)
        self.optimizer = optim.Adam(self.parameters(),lr = learning_rate)
    def put_data(self,data):
        self.memory.append(data)
    def make_batch(self):
        state_list, action_list, extrinsic_reward_list, intrinsic_reward_list, next_state_list, \
        prob_list, extrinsic_done_list,intrinsic_done_list = [],[],[],[],[],[], [],[]
        for data in self.memory:
            state,action,extrinsic_reward, intrinsic_reward,next_state,prob,done = data
            state_list.append(state)
            action_list.append([action])
            extrinsic_reward_list.append([extrinsic_reward])
            intrinsic_reward_list.append([intrinsic_reward])

            next_state_list.append(next_state)
            extrinsic_done_mask = 0 if done else 1
            extrinsic_done_list.append([extrinsic_done_mask])
            intrinsic_done_list.append([1])
            prob_list.append([prob])
        self.memory = []

        s,a,er,ir,next_s,extrinsic_done_list,intrinsic_done_list,prob \
                                        = torch.tensor(state_list,dtype=torch.float),\
                                        torch.tensor(action_list),\
                                        torch.tensor(extrinsic_reward_list,dtype = torch.float),\
                                        torch.tensor(intrinsic_reward_list,dtype = torch.float),\
                                        torch.tensor(next_state_list,dtype=torch.float),\
                                        torch.tensor(extrinsic_done_list,dtype = torch.float),\
                                        torch.tensor(intrinsic_done_list,dtype = torch.float),\
                                        torch.tensor(prob_list,dtype = torch.float)
        if gpu:
            return s.cuda(),a.cuda(),er.cuda(),ir.cuda(),next_s.cuda(),extrinsic_done_list.cuda()\
            ,intrinsic_done_list.cuda(),prob.cuda() 
        else :
            return s,a,er,ir,next_s,extrinsic_done_list,intrinsic_done_list,prob  
    
    def train(self):
        state,action,extrinsic_reward,intrinsic_reward, next_state,extrinsic_done_list,\
        intrinsic_done_list,action_prob = self.make_batch()
        
        for i in range(K_epoch):
            state = state.squeeze()
            next_state = next_state.squeeze()
            predicted_action, predicted_extrinsic, predicted_intrinsic = self.ppo(state)
            predicted_next_action, predicted_next_extrinsic, predicted_next_intrinsic = self.ppo(next_state)

            if gpu:
                intrinsic_next_state_mean = torch.mean(torch.cat(list(self.intrinsic_input_queue.queue)),dim = 0).cuda()
                intrinsic_next_state_std = torch.std(torch.cat(list(model.intrinsic_input_queue.queue)),dim = 0).cuda()
            
                preprocessed_next_state = torch.clamp(((next_state - intrinsic_next_state_mean) / \
                                                   intrinsic_next_state_std + torch.tensor(1e-8).cuda()),-5,5)
            else:
                intrinsic_next_state_mean = torch.mean(torch.cat(list(self.intrinsic_input_queue.queue)),dim = 0)
                intrinsic_next_state_std = torch.std(torch.cat(list(model.intrinsic_input_queue.queue)),dim = 0)
            
                preprocessed_next_state = torch.clamp(((next_state - intrinsic_next_state_mean) / \
                                                   intrinsic_next_state_std + torch.tensor(1e-8)),-5,5)
            
            predict_feature, target_feature = self.rnd(next_state)
            td_error = extrinsic_reward + extrinsic_gamma * predicted_next_extrinsic * extrinsic_done_list
            delta = td_error - predicted_next_extrinsic
            if gpu:
                delta = delta.detach().cpu().numpy()
            else:
                delta = delta.detach().numpy()
            advantage_list = []
            
            advantage = 0.0
            for delta_t in delta[::-1]:
                advantage = gamma * lmbda * advantage + delta_t[0]
                advantage_list.append([advantage])
            advantage_list.reverse()
            if gpu:
                advantage = torch.tensor(advantage_list,dtype = torch.float).cuda()
            else:
                advantage = torch.tensor(advantage_list,dtype = torch.float)
            ##intrinsic_advantage
            intrinsic_td_error = intrinsic_reward + intrinsic_gamma * predicted_next_intrinsic * intrinsic_done_list
            intrinsic_delta = intrinsic_td_error - predicted_next_intrinsic
            if gpu:
                intrinsic_delta = intrinsic_delta.detach().cpu().numpy()
            else:
                intrinsic_delta = intrinsic_delta.detach().numpy()
                
            intrinsic_advantage_list = []
            intrinsic_advantage = 0.0
            
            for intrinsic_delta_t in intrinsic_delta[::-1]:
                intrinsic_advantage = gamma * lmbda * intrinsic_advantage + intrinsic_delta_t[0]
                intrinsic_advantage_list.append([intrinsic_advantage])
            intrinsic_advantage_list.reverse()
            if gpu:
                intrinsic_advantage = torch.tensor(intrinsic_advantage_list,dtype = torch.float).cuda()
            else:
                intrinsic_advantage = torch.tensor(intrinsic_advantage_list,dtype = torch.float)
            #### intrinsic_error
            intrinsic_error = (intrinsic_td_error - predicted_intrinsic.detach()).pow(2)
            if gpu:
                masking = torch.rand(len(intrinsic_error)).cuda()
                masking = (masking < update_proportion).type(torch.FloatTensor).cuda()
            else:
                masking = torch.rand(len(intrinsic_error))
                masking = (masking < update_proportion).type(torch.FloatTensor)
            if gpu:
                intrinsic_error = (intrinsic_error * masking).sum() / torch.max(intrinsic_error.sum(), torch.Tensor([1]).cuda())
            else:
                intrinsic_error = (intrinsic_error * masking).sum() / torch.max(masking.sum(), torch.Tensor([1]))

            
            now_action = predicted_action
            m = Categorical(now_action)
            entropy = m.entropy().mean()
            
            
            now_action = now_action.gather(1,action)
            
            
            ratio = torch.exp(torch.log(now_action) - torch.log(action_prob))
            advantage = extrinsic_advantage_coef * advantage +  intrinsic_advantage_coef * intrinsic_advantage
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio , 1-eps_clip, 1 + eps_clip) * advantage
            loss = - torch.min(surr1,surr2) + critic_coef * (F.smooth_l1_loss(predicted_extrinsic,td_error.detach()) +\
                    intrinsic_error) - ent_coef * entropy + F.mse_loss(predict_feature, target_feature)
            
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

In [20]:
epochs = 1000
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, SIMPLE_MOVEMENT)


In [21]:
if gpu:
    model = Agent().cuda()
else:
    model = Agent()


In [8]:
'''
try:
    if gpu:
        model.load_state_dict(torch.load("weights/2_episode_model"))
    else:
        model.load_state_dict(torch.load("weights/2_episode_model", map_location={'cuda:0': 'cpu'}))
except :
    pass
'''

In [40]:
T_horizon = 20

for epoch in range(epochs):
    global_step = 0
    model.intrinsic_queue = queue.Queue()
    model.intrinsic_input_queue = queue.Queue()
    state = env.reset()
    state = np.array(state)/255
            #state = np.transpose(state,(2,0,1))
    state = np.moveaxis(state, -1, 0)
    state = torch.tensor(state).float()
    state = state.unsqueeze(0)
    done = False
    while not done :
        for t in range(T_horizon):
            #env.render()
            global_step +=1

            if gpu:
                action_prob, _ , _ = model.ppo.forward(state.cuda())
            else:
                action_prob, _ , _ = model.ppo.forward(state)
            m = Categorical(action_prob)
            action = m.sample().item()
            
            
            next_state, extrinsic_reward, done, info = env.step(action)
            next_state = np.array(next_state)/255
            next_state = np.moveaxis(next_state,-1,0)
            next_state = torch.tensor(next_state).float()
            next_state = next_state.unsqueeze(0)
            
            
            model.intrinsic_input_queue.put(next_state)
            if len(model.intrinsic_input_queue.queue) > 128:
                model.intrinsic_input_queue.get()
            intrinsic_next_state_mean = \
                    torch.mean(torch.cat(list(model.intrinsic_input_queue.queue)),dim = 0)
            if len(model.intrinsic_input_queue.queue) == 1:
                intrinsic_next_state_std = torch.zeros(1)
            else:
                intrinsic_next_state_std = \
                        torch.std(torch.cat(list(model.intrinsic_input_queue.queue)),dim = 0)
    
            preprocessed_next_state = \
                    torch.clamp(((next_state - intrinsic_next_state_mean) / (intrinsic_next_state_std + 1e-8)), -5,5)
            
            
            #(model.intrinsic_input_queue)
            if gpu:
                predictor,target = model.rnd.forward(preprocessed_next_state.cuda())
            else:
                predictor,target = model.rnd.forward(preprocessed_next_state)
            intrinsic_reward = (predictor - target).pow(2).sum(1) / 2
            if len(model.intrinsic_queue.queue) > 128:
                model.intrinsic_queue.get()
            model.intrinsic_queue.put(intrinsic_reward.item())
            intrinsic_mean = np.mean(model.intrinsic_queue.queue)
            intrinsic_std = np.std(model.intrinsic_queue.queue)
            intrinsic_reward = (intrinsic_reward - intrinsic_mean) / (intrinsic_std+ 1e-8)
            
            if info['time'] == 0 or info['life'] == 1 or info['time'] == 300:
                done = True
                reward = -10.

            model.put_data((state.tolist(), action, extrinsic_reward/100, (intrinsic_reward.item()), next_state.tolist(), action_prob[0][action].item(), done))
            print('global_step : ',global_step,', action : ', action,'action_prob : ',action_prob.tolist()[0])
            print('extrinsic_reward : ',extrinsic_reward,'intrinsic_reward : ',intrinsic_reward.item())
            print('place',info['x_pos'])
            print('info',info)
            if done :
                print('epoch : ',epoch, ', global_step : ',global_step)
                break
            state = next_state
        model.train()
    #env.render()

#env.close()

intrinsic_reward.item() :  0.0
global_step :  1 , action :  3 action_prob :  [0.14285677671432495, 0.14285710453987122, 0.14285744726657867, 0.14285707473754883, 0.14285728335380554, 0.14285729825496674, 0.14285695552825928]
extrinsic_reward :  0 intrinsic_reward :  0.0
place 40
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 400, 'world': 1, 'x_pos': 40, 'x_pos_screen': 40, 'y_pos': 79}
intrinsic_reward.item() :  0.0
global_step :  2 , action :  2 action_prob :  [0.14285677671432495, 0.14285710453987122, 0.14285744726657867, 0.14285707473754883, 0.14285728335380554, 0.14285729825496674, 0.14285695552825928]
extrinsic_reward :  0 intrinsic_reward :  0.0
place 40
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 400, 'world': 1, 'x_pos': 40, 'x_pos_screen': 40, 'y_pos': 79}
intrinsic_reward.item() :  1.9368853569030762
global_step :  3 , action :  1 action_prob :  [0.14285677671432495, 0.

intrinsic_reward.item() :  22939.35546875
global_step :  19 , action :  0 action_prob :  [0.14285677671432495, 0.14285710453987122, 0.14285744726657867, 0.14285707473754883, 0.14285728335380554, 0.14285729825496674, 0.14285694062709808]
extrinsic_reward :  -1 intrinsic_reward :  2.2297821044921875
place 48
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 399, 'world': 1, 'x_pos': 48, 'x_pos_screen': 48, 'y_pos': 79}
intrinsic_reward.item() :  31892.9140625
global_step :  20 , action :  3 action_prob :  [0.14285677671432495, 0.14285710453987122, 0.14285744726657867, 0.14285707473754883, 0.14285728335380554, 0.14285729825496674, 0.14285695552825928]
extrinsic_reward :  1 intrinsic_reward :  2.7246546745300293
place 49
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 399, 'world': 1, 'x_pos': 49, 'x_pos_screen': 49, 'y_pos': 79}
intrinsic_reward.item() :  37568.2734375
global_step :  21 , a

intrinsic_reward.item() :  42682.17578125
global_step :  37 , action :  0 action_prob :  [0.12560036778450012, 0.12549930810928345, 0.1909065693616867, 0.12511563301086426, 0.12487104535102844, 0.184056356549263, 0.1239507645368576]
extrinsic_reward :  1 intrinsic_reward :  1.0110849142074585
place 65
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 399, 'world': 1, 'x_pos': 65, 'x_pos_screen': 65, 'y_pos': 91}
intrinsic_reward.item() :  15310.85546875
global_step :  38 , action :  6 action_prob :  [0.12560328841209412, 0.12550222873687744, 0.19089840352535248, 0.12511862814426422, 0.1248740553855896, 0.1840495616197586, 0.12395390123128891]
extrinsic_reward :  1 intrinsic_reward :  -0.5207064747810364
place 66
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 399, 'world': 1, 'x_pos': 66, 'x_pos_screen': 66, 'y_pos': 86}
intrinsic_reward.item() :  15174.3798828125
global_step :  39 , act

KeyboardInterrupt: 

In [22]:
model.intrinsic_queue.queue

deque([nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan])

In [20]:
preprocessed_next_state

tensor([[[[-5., -5., -5.,  ..., -5., -5., -5.],
          [-5., -5., -5.,  ..., -5., -5., -5.],
          [-5., -5., -5.,  ..., -5., -5., -5.],
          ...,
          [-5.,  5.,  5.,  ...,  5.,  5., nan],
          [-5.,  5.,  5.,  ...,  5., nan, nan],
          [ 5., nan, nan,  ..., nan, nan,  5.]],

         [[-5., -5., -5.,  ..., -5., -5., -5.],
          [-5., -5., -5.,  ..., -5., -5., -5.],
          [-5., -5., -5.,  ..., -5., -5., -5.],
          ...,
          [-5., -5., -5.,  ..., -5., -5., nan],
          [-5., -5., -5.,  ..., -5., nan, nan],
          [-5., nan, nan,  ..., nan, nan, -5.]],

         [[-5., -5., -5.,  ..., -5., -5., -5.],
          [-5., -5., -5.,  ..., -5., -5., -5.],
          [-5., -5., -5.,  ..., -5., -5., -5.],
          ...,
          [nan, nan, nan,  ..., nan, nan, nan],
          [nan, nan, nan,  ..., nan, nan, nan],
          [nan, nan, nan,  ..., nan, nan, nan]]]])

319

In [44]:
env.close()

ValueError: env has already been closed.

In [22]:
if gpu:
    model.load_state_dict(torch.load("weights/3"))
else:
    model.load_state_dict(torch.load("weights/3", map_location={'cuda:0': 'cpu'}))

In [23]:
epochs = 1000
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT



In [34]:
env = gym_super_mario_bros.make('SuperMarioBros-1-3-v0')
env = JoypadSpace(env, SIMPLE_MOVEMENT)

In [35]:
T_horizon = 20

for epoch in range(epochs):
    global_step = 0
    state = env.reset()
    state = np.array(state)/255
            #state = np.transpose(state,(2,0,1))
    state = np.moveaxis(state, -1, 0)
    state = torch.tensor(state).float()
    state = state.unsqueeze(0)
    done = False
    while not done :
        for t in range(T_horizon):
            env.render()
            global_step +=1

            if gpu:
                action_prob, _ , _ = model.ppo.forward(state.cuda())
            else:
                action_prob, _ , _ = model.ppo.forward(state)
            m = Categorical(action_prob)
            action = m.sample().item()
            next_state, extrinsic_reward, done, info = env.step(action)
            next_state = np.array(next_state)/255
            next_state = np.moveaxis(next_state,-1,0)
            next_state = torch.tensor(next_state).float()
            next_state = next_state.unsqueeze(0)
            if gpu:
                predictor,target = model.rnd.forward(next_state.cuda())
            else:
                predictor,target = model.rnd.forward(next_state)
            intrinsic_reward = (predictor - target).pow(2).sum(1) / 2
            if info['time'] == 0 :
                done = True
                reward = -10.
            if done :
                print('epoch : ',epoch, ', global_step : ',global_step)
                break
            print('global_step : ',global_step,', action : ', action,'action_prob : ',action_prob.tolist()[0])
            print('extrinsic_reward : ',extrinsic_reward,'intrinsic_reward : ',intrinsic_reward.item())
            print('place',info['x_pos'])
            print('info',info)
            state = next_state
    #env.render()

#env.close()

global_step :  1 , action :  4 action_prob :  [0.05983036383986473, 0.17653553187847137, 0.0441148541867733, 0.5566235184669495, 0.0725478008389473, 0.042902544140815735, 0.04744540527462959]
extrinsic_reward :  0 intrinsic_reward :  411.56402587890625
place 40
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 300, 'world': 1, 'x_pos': 40, 'x_pos_screen': 40, 'y_pos': 79}
global_step :  2 , action :  2 action_prob :  [0.05983036383986473, 0.17653553187847137, 0.0441148541867733, 0.5566235184669495, 0.0725478008389473, 0.042902544140815735, 0.04744540527462959]
extrinsic_reward :  0 intrinsic_reward :  411.56402587890625
place 40
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 300, 'world': 1, 'x_pos': 40, 'x_pos_screen': 40, 'y_pos': 84}
global_step :  3 , action :  1 action_prob :  [0.05983036383986473, 0.17653553187847137, 0.0441148541867733, 0.5566235184669495, 0.0725478008389473, 0.0

global_step :  21 , action :  3 action_prob :  [0.059839654713869095, 0.17654338479042053, 0.0441221185028553, 0.556575357913971, 0.07255519181489944, 0.04291249439120293, 0.04745174199342728]
extrinsic_reward :  1 intrinsic_reward :  411.8468322753906
place 51
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 299, 'world': 1, 'x_pos': 51, 'x_pos_screen': 51, 'y_pos': 79}
global_step :  22 , action :  3 action_prob :  [0.05985831096768379, 0.1765560656785965, 0.04413874074816704, 0.5564744472503662, 0.07257462292909622, 0.042928896844387054, 0.0474688820540905]
extrinsic_reward :  2 intrinsic_reward :  409.69305419921875
place 53
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 299, 'world': 1, 'x_pos': 53, 'x_pos_screen': 53, 'y_pos': 79}
global_step :  23 , action :  3 action_prob :  [0.05986681208014488, 0.17656221985816956, 0.04414632171392441, 0.5564278364181519, 0.07258360087871552,

global_step :  43 , action :  0 action_prob :  [0.05975869297981262, 0.1764753758907318, 0.04406150057911873, 0.556989848613739, 0.07249054312705994, 0.042827147990465164, 0.04739689826965332]
extrinsic_reward :  3 intrinsic_reward :  419.9659423828125
place 95
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 298, 'world': 1, 'x_pos': 95, 'x_pos_screen': 86, 'y_pos': 97}
global_step :  44 , action :  3 action_prob :  [0.06068625673651695, 0.17709366977214813, 0.04486773908138275, 0.5520557165145874, 0.07341953366994858, 0.04365895688533783, 0.04821815341711044]
extrinsic_reward :  2 intrinsic_reward :  399.4931335449219
place 97
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 298, 'world': 1, 'x_pos': 97, 'x_pos_screen': 87, 'y_pos': 92}
global_step :  45 , action :  3 action_prob :  [0.05847126618027687, 0.17557556927204132, 0.042946331202983856, 0.5638759136199951, 0.07119212299585342

global_step :  64 , action :  3 action_prob :  [0.05714671313762665, 0.174601748585701, 0.041814640164375305, 0.5709572434425354, 0.0698714479804039, 0.040495578199625015, 0.045112691819667816]
extrinsic_reward :  3 intrinsic_reward :  428.06732177734375
place 144
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 297, 'world': 1, 'x_pos': 144, 'x_pos_screen': 106, 'y_pos': 110}
global_step :  65 , action :  3 action_prob :  [0.05670202150940895, 0.17425698041915894, 0.04143271595239639, 0.5733663439750671, 0.06941680610179901, 0.04010433703660965, 0.04472067952156067]
extrinsic_reward :  3 intrinsic_reward :  421.8553466796875
place 147
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 297, 'world': 1, 'x_pos': 147, 'x_pos_screen': 107, 'y_pos': 108}
global_step :  66 , action :  3 action_prob :  [0.057204511016607285, 0.17464035749435425, 0.04186017066240311, 0.5706691145896912, 0.0699200

global_step :  86 , action :  3 action_prob :  [0.05457090958952904, 0.17261597514152527, 0.039589665830135345, 0.584944486618042, 0.06718721985816956, 0.038283947855234146, 0.042807694524526596]
extrinsic_reward :  3 intrinsic_reward :  450.08953857421875
place 210
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 296, 'world': 1, 'x_pos': 210, 'x_pos_screen': 112, 'y_pos': 101}
global_step :  87 , action :  3 action_prob :  [0.05509524047374725, 0.17303602397441864, 0.040041107684373856, 0.582086443901062, 0.06773441284894943, 0.03873194381594658, 0.043274879455566406]
extrinsic_reward :  3 intrinsic_reward :  446.8307189941406
place 213
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 296, 'world': 1, 'x_pos': 213, 'x_pos_screen': 112, 'y_pos': 97}
global_step :  88 , action :  3 action_prob :  [0.05430394411087036, 0.17238500714302063, 0.03936558589339256, 0.5863998532295227, 0.066918

global_step :  106 , action :  3 action_prob :  [0.055427830666303635, 0.1732507199048996, 0.0403253436088562, 0.5803360342979431, 0.06808213889598846, 0.039002031087875366, 0.0435759536921978]
extrinsic_reward :  3 intrinsic_reward :  453.859130859375
place 270
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 295, 'world': 1, 'x_pos': 270, 'x_pos_screen': 112, 'y_pos': 97}
global_step :  107 , action :  3 action_prob :  [0.055407360196113586, 0.17323730885982513, 0.04030415788292885, 0.5804594159126282, 0.0680517703294754, 0.03899035230278969, 0.04354958236217499]
extrinsic_reward :  3 intrinsic_reward :  444.97003173828125
place 273
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 295, 'world': 1, 'x_pos': 273, 'x_pos_screen': 112, 'y_pos': 92}
global_step :  108 , action :  3 action_prob :  [0.055643919855356216, 0.17342299222946167, 0.04050181806087494, 0.5791960954666138, 0.06828563

global_step :  1 , action :  6 action_prob :  [0.05983036383986473, 0.17653553187847137, 0.0441148541867733, 0.5566235184669495, 0.0725478008389473, 0.042902544140815735, 0.04744540527462959]
extrinsic_reward :  0 intrinsic_reward :  411.56402587890625
place 40
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 300, 'world': 1, 'x_pos': 40, 'x_pos_screen': 40, 'y_pos': 79}
global_step :  2 , action :  3 action_prob :  [0.05983036383986473, 0.17653553187847137, 0.0441148541867733, 0.5566235184669495, 0.0725478008389473, 0.042902544140815735, 0.04744540527462959]
extrinsic_reward :  -1 intrinsic_reward :  411.56402587890625
place 39
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 300, 'world': 1, 'x_pos': 39, 'x_pos_screen': 39, 'y_pos': 79}
global_step :  3 , action :  3 action_prob :  [0.05983036383986473, 0.17653553187847137, 0.0441148541867733, 0.5566235184669495, 0.0725478008389473, 0.

global_step :  22 , action :  2 action_prob :  [0.059823594987392426, 0.17653338611125946, 0.04410785809159279, 0.5566612482070923, 0.0725383311510086, 0.042898811399936676, 0.047436781227588654]
extrinsic_reward :  1 intrinsic_reward :  410.3907470703125
place 53
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 299, 'world': 1, 'x_pos': 53, 'x_pos_screen': 53, 'y_pos': 79}
global_step :  23 , action :  3 action_prob :  [0.059806082397699356, 0.17652112245559692, 0.044092871248722076, 0.5567542910575867, 0.07252106815576553, 0.042882952839136124, 0.047421593219041824]
extrinsic_reward :  1 intrinsic_reward :  410.38330078125
place 54
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 299, 'world': 1, 'x_pos': 54, 'x_pos_screen': 54, 'y_pos': 84}
global_step :  24 , action :  1 action_prob :  [0.05984826758503914, 0.17654940485954285, 0.04413039982318878, 0.5565264821052551, 0.0725652500987

global_step :  42 , action :  1 action_prob :  [0.059955403208732605, 0.17662300169467926, 0.04422389715909958, 0.5559524297714233, 0.07267395406961441, 0.043014880269765854, 0.047556452453136444]
extrinsic_reward :  1 intrinsic_reward :  397.001708984375
place 84
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 298, 'world': 1, 'x_pos': 84, 'x_pos_screen': 82, 'y_pos': 79}
global_step :  43 , action :  3 action_prob :  [0.058751221746206284, 0.1757889837026596, 0.04318161681294441, 0.5623828172683716, 0.0714627355337143, 0.04193944111466408, 0.046493206173181534]
extrinsic_reward :  2 intrinsic_reward :  395.2485656738281
place 86
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 298, 'world': 1, 'x_pos': 86, 'x_pos_screen': 83, 'y_pos': 79}
global_step :  44 , action :  0 action_prob :  [0.05969994142651558, 0.17644116282463074, 0.04401605948805809, 0.5572709441184998, 0.072447195649147

global_step :  62 , action :  2 action_prob :  [0.05779640004038811, 0.1750834435224533, 0.04236900061368942, 0.5674819350242615, 0.07051991671323776, 0.04107517749071121, 0.04567404091358185]
extrinsic_reward :  3 intrinsic_reward :  427.889404296875
place 136
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 297, 'world': 1, 'x_pos': 136, 'x_pos_screen': 102, 'y_pos': 92}
global_step :  63 , action :  5 action_prob :  [0.05736677348613739, 0.17476914823055267, 0.04200087860226631, 0.5697803497314453, 0.07008908689022064, 0.04069320857524872, 0.045300524681806564]
extrinsic_reward :  3 intrinsic_reward :  425.51751708984375
place 139
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 297, 'world': 1, 'x_pos': 139, 'x_pos_screen': 103, 'y_pos': 87}
global_step :  64 , action :  3 action_prob :  [0.05691853538155556, 0.1744239628314972, 0.04162256792187691, 0.5721802115440369, 0.069646306335

extrinsic_reward :  3 intrinsic_reward :  451.8390808105469
place 193
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 296, 'world': 1, 'x_pos': 193, 'x_pos_screen': 112, 'y_pos': 95}
global_step :  82 , action :  3 action_prob :  [0.05506665259599686, 0.17299345135688782, 0.04002031311392784, 0.5822502374649048, 0.06771305203437805, 0.03869832679629326, 0.043257951736450195]
extrinsic_reward :  3 intrinsic_reward :  447.21044921875
place 196
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 296, 'world': 1, 'x_pos': 196, 'x_pos_screen': 112, 'y_pos': 90}
global_step :  83 , action :  3 action_prob :  [0.05493781343102455, 0.17290057241916656, 0.03991219773888588, 0.5829293727874756, 0.06758514046669006, 0.03858642280101776, 0.043148498982191086]
extrinsic_reward :  3 intrinsic_reward :  452.8111267089844
place 199
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 's

global_step :  102 , action :  3 action_prob :  [0.05519409477710724, 0.1730620563030243, 0.040130969136953354, 0.5815882682800293, 0.06785078346729279, 0.03879405930638313, 0.043379660695791245]
extrinsic_reward :  3 intrinsic_reward :  439.0389404296875
place 256
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 295, 'world': 1, 'x_pos': 256, 'x_pos_screen': 112, 'y_pos': 104}
global_step :  103 , action :  2 action_prob :  [0.05574695020914078, 0.1735055148601532, 0.040597692131996155, 0.5786048173904419, 0.06840935349464417, 0.03927904739975929, 0.0438566729426384]
extrinsic_reward :  3 intrinsic_reward :  443.80694580078125
place 259
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 295, 'world': 1, 'x_pos': 259, 'x_pos_screen': 112, 'y_pos': 101}
global_step :  104 , action :  1 action_prob :  [0.05549899488687515, 0.17330823838710785, 0.04038805514574051, 0.5799421072006226, 0.06815

global_step :  123 , action :  4 action_prob :  [0.05496842786669731, 0.17289933562278748, 0.03992292657494545, 0.5828518867492676, 0.06758532673120499, 0.038622863590717316, 0.04314921423792839]
extrinsic_reward :  0 intrinsic_reward :  442.68414306640625
place 280
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 294, 'world': 1, 'x_pos': 280, 'x_pos_screen': 115, 'y_pos': 2}
epoch :  1 , global_step :  124
global_step :  1 , action :  3 action_prob :  [0.05983036383986473, 0.17653553187847137, 0.0441148541867733, 0.5566235184669495, 0.0725478008389473, 0.042902544140815735, 0.04744540527462959]
extrinsic_reward :  0 intrinsic_reward :  411.56402587890625
place 40
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 300, 'world': 1, 'x_pos': 40, 'x_pos_screen': 40, 'y_pos': 79}
global_step :  2 , action :  1 action_prob :  [0.05983036383986473, 0.17653553187847137, 0.0441148541867733, 0.556

global_step :  20 , action :  3 action_prob :  [0.059812504798173904, 0.17652513086795807, 0.04409845545887947, 0.556720495223999, 0.07252772897481918, 0.04288829118013382, 0.04742753505706787]
extrinsic_reward :  1 intrinsic_reward :  411.7789611816406
place 49
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 299, 'world': 1, 'x_pos': 49, 'x_pos_screen': 49, 'y_pos': 79}
global_step :  21 , action :  3 action_prob :  [0.059780243784189224, 0.1765032708644867, 0.04407006874680519, 0.5568938851356506, 0.07249433547258377, 0.04286014288663864, 0.04739804565906525]
extrinsic_reward :  1 intrinsic_reward :  411.1164245605469
place 50
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 299, 'world': 1, 'x_pos': 50, 'x_pos_screen': 50, 'y_pos': 79}
global_step :  22 , action :  3 action_prob :  [0.05984208360314369, 0.1765448898077011, 0.04412464797496796, 0.5565611720085144, 0.07255835086107254,

global_step :  40 , action :  3 action_prob :  [0.059946198016405106, 0.17661651968955994, 0.04421427845954895, 0.5560083985328674, 0.07266070693731308, 0.043009210377931595, 0.047544728964567184]
extrinsic_reward :  2 intrinsic_reward :  411.8077697753906
place 82
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 298, 'world': 1, 'x_pos': 82, 'x_pos_screen': 81, 'y_pos': 100}
global_step :  41 , action :  1 action_prob :  [0.05998970940709114, 0.17664526402950287, 0.04425302520394325, 0.5557738542556763, 0.0727061852812767, 0.043046947568655014, 0.047585006803274155]
extrinsic_reward :  1 intrinsic_reward :  401.0556640625
place 83
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 298, 'world': 1, 'x_pos': 83, 'x_pos_screen': 81, 'y_pos': 98}
global_step :  42 , action :  1 action_prob :  [0.05879192799329758, 0.17581626772880554, 0.04321670904755592, 0.5621675252914429, 0.071503095328807

global_step :  62 , action :  4 action_prob :  [0.057391319423913956, 0.17479312419891357, 0.04201708734035492, 0.56966233253479, 0.07010151445865631, 0.040724508464336395, 0.04531010612845421]
extrinsic_reward :  2 intrinsic_reward :  434.8276672363281
place 129
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 297, 'world': 1, 'x_pos': 129, 'x_pos_screen': 101, 'y_pos': 107}
global_step :  63 , action :  3 action_prob :  [0.05783672630786896, 0.17512246966362, 0.04240734130144119, 0.5672383904457092, 0.07056965678930283, 0.04110906645655632, 0.04571628198027611]
extrinsic_reward :  3 intrinsic_reward :  429.40606689453125
place 132
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 297, 'world': 1, 'x_pos': 132, 'x_pos_screen': 102, 'y_pos': 104}
global_step :  64 , action :  3 action_prob :  [0.05716262012720108, 0.17462217807769775, 0.041823599487543106, 0.5708794593811035, 0.0698774531

global_step :  83 , action :  0 action_prob :  [0.054591696709394455, 0.17259454727172852, 0.03962535411119461, 0.5848092436790466, 0.06724532693624496, 0.038271721452474594, 0.04286215454339981]
extrinsic_reward :  3 intrinsic_reward :  451.7796630859375
place 192
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 296, 'world': 1, 'x_pos': 192, 'x_pos_screen': 112, 'y_pos': 104}
global_step :  84 , action :  3 action_prob :  [0.05494478717446327, 0.1729026585817337, 0.03991811349987984, 0.5828953385353088, 0.06759250909090042, 0.038592059165239334, 0.04315464571118355]
extrinsic_reward :  3 intrinsic_reward :  450.0416259765625
place 195
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 296, 'world': 1, 'x_pos': 195, 'x_pos_screen': 112, 'y_pos': 101}
global_step :  85 , action :  3 action_prob :  [0.054663728922605515, 0.17266932129859924, 0.03968221694231033, 0.584415853023529, 0.0673101

global_step :  105 , action :  4 action_prob :  [0.05578188970685005, 0.17352299392223358, 0.04063182696700096, 0.5784144997596741, 0.06845078617334366, 0.03930482640862465, 0.04389321058988571]
extrinsic_reward :  3 intrinsic_reward :  451.19366455078125
place 250
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 295, 'world': 1, 'x_pos': 250, 'x_pos_screen': 112, 'y_pos': 104}
global_step :  106 , action :  0 action_prob :  [0.05580052733421326, 0.17355948686599731, 0.040646836161613464, 0.5782896876335144, 0.06846961379051208, 0.03932600095868111, 0.04390788450837135]
extrinsic_reward :  3 intrinsic_reward :  448.2415771484375
place 253
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 295, 'world': 1, 'x_pos': 253, 'x_pos_screen': 112, 'y_pos': 101}
global_step :  107 , action :  3 action_prob :  [0.05518963932991028, 0.1730562001466751, 0.04012763872742653, 0.5816138386726379, 0.06784

global_step :  126 , action :  3 action_prob :  [0.05485275015234947, 0.17279431223869324, 0.03980395942926407, 0.5835942625999451, 0.06740225106477737, 0.03856537118554115, 0.04298708960413933]
extrinsic_reward :  3 intrinsic_reward :  399.610107421875
place 313
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 294, 'world': 1, 'x_pos': 313, 'x_pos_screen': 112, 'y_pos': 2}
epoch :  2 , global_step :  127
global_step :  1 , action :  4 action_prob :  [0.05983036383986473, 0.17653553187847137, 0.0441148541867733, 0.5566235184669495, 0.0725478008389473, 0.042902544140815735, 0.04744540527462959]
extrinsic_reward :  0 intrinsic_reward :  411.56402587890625
place 40
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 300, 'world': 1, 'x_pos': 40, 'x_pos_screen': 40, 'y_pos': 79}
global_step :  2 , action :  4 action_prob :  [0.05983036383986473, 0.17653553187847137, 0.0441148541867733, 0.556623

global_step :  20 , action :  1 action_prob :  [0.059759192168712616, 0.17648889124393463, 0.04405176267027855, 0.5570061802864075, 0.07247326523065567, 0.042841147631406784, 0.04737953469157219]
extrinsic_reward :  1 intrinsic_reward :  409.74365234375
place 50
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 299, 'world': 1, 'x_pos': 50, 'x_pos_screen': 50, 'y_pos': 79}
global_step :  21 , action :  3 action_prob :  [0.059839654713869095, 0.17654338479042053, 0.0441221185028553, 0.556575357913971, 0.07255519181489944, 0.04291249439120293, 0.04745174199342728]
extrinsic_reward :  1 intrinsic_reward :  411.8468322753906
place 51
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 299, 'world': 1, 'x_pos': 51, 'x_pos_screen': 51, 'y_pos': 79}
global_step :  22 , action :  3 action_prob :  [0.05985831096768379, 0.1765560656785965, 0.04413874074816704, 0.5564744472503662, 0.07257462292909622, 

global_step :  40 , action :  1 action_prob :  [0.059975750744342804, 0.17663626372814178, 0.04424053430557251, 0.5558488368988037, 0.07269176840782166, 0.043034639209508896, 0.047572217881679535]
extrinsic_reward :  2 intrinsic_reward :  411.823974609375
place 81
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 298, 'world': 1, 'x_pos': 81, 'x_pos_screen': 81, 'y_pos': 97}
global_step :  41 , action :  3 action_prob :  [0.06000417470932007, 0.17665478587150574, 0.04426584020256996, 0.5556958317756653, 0.07272151112556458, 0.043059173971414566, 0.047598566859960556]
extrinsic_reward :  1 intrinsic_reward :  411.18255615234375
place 82
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 298, 'world': 1, 'x_pos': 82, 'x_pos_screen': 81, 'y_pos': 93}
global_step :  42 , action :  4 action_prob :  [0.05996882542967796, 0.1766292303800583, 0.04423557221889496, 0.555884838104248, 0.07268681377172

global_step :  61 , action :  3 action_prob :  [0.05781036987900734, 0.17510375380516052, 0.04238405451178551, 0.5673812627792358, 0.07054214179515839, 0.04108612984418869, 0.045692313462495804]
extrinsic_reward :  3 intrinsic_reward :  434.6617431640625
place 132
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 297, 'world': 1, 'x_pos': 132, 'x_pos_screen': 101, 'y_pos': 104}
global_step :  62 , action :  3 action_prob :  [0.05808495730161667, 0.17529669404029846, 0.04262257739901543, 0.565910279750824, 0.07082267105579376, 0.04132528230547905, 0.04593762382864952]
extrinsic_reward :  3 intrinsic_reward :  431.8848876953125
place 135
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 297, 'world': 1, 'x_pos': 135, 'x_pos_screen': 102, 'y_pos': 101}
global_step :  63 , action :  5 action_prob :  [0.05842521786689758, 0.17553554475307465, 0.04291604831814766, 0.5640950202941895, 0.071165129

global_step :  81 , action :  3 action_prob :  [0.05458497628569603, 0.17258691787719727, 0.03962021321058273, 0.5848464369773865, 0.06723939627408981, 0.038264501839876175, 0.042857520282268524]
extrinsic_reward :  3 intrinsic_reward :  453.21240234375
place 192
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 296, 'world': 1, 'x_pos': 192, 'x_pos_screen': 112, 'y_pos': 108}
global_step :  82 , action :  1 action_prob :  [0.054960478097200394, 0.17291127145290375, 0.03993238881230354, 0.5828123092651367, 0.06760989129543304, 0.038603752851486206, 0.04317000135779381]
extrinsic_reward :  3 intrinsic_reward :  453.54638671875
place 195
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 296, 'world': 1, 'x_pos': 195, 'x_pos_screen': 112, 'y_pos': 109}
global_step :  83 , action :  1 action_prob :  [0.054668158292770386, 0.1726752072572708, 0.03968551754951477, 0.5843904614448547, 0.067313961

global_step :  101 , action :  6 action_prob :  [0.055941108614206314, 0.17364652454853058, 0.040764544159173965, 0.577569842338562, 0.06860467791557312, 0.039448291063308716, 0.04402506723999977]
extrinsic_reward :  3 intrinsic_reward :  442.518310546875
place 252
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 295, 'world': 1, 'x_pos': 252, 'x_pos_screen': 112, 'y_pos': 79}
global_step :  102 , action :  2 action_prob :  [0.055630967020988464, 0.1734083890914917, 0.04050486534833908, 0.5792195200920105, 0.06830023974180222, 0.03917165473103523, 0.04376440495252609]
extrinsic_reward :  3 intrinsic_reward :  454.26873779296875
place 255
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 295, 'world': 1, 'x_pos': 255, 'x_pos_screen': 112, 'y_pos': 79}
global_step :  103 , action :  0 action_prob :  [0.05565628036856651, 0.17342279851436615, 0.04052228853106499, 0.5791031122207642, 0.068319

epoch :  3 , global_step :  122
global_step :  1 , action :  1 action_prob :  [0.05983036383986473, 0.17653553187847137, 0.0441148541867733, 0.5566235184669495, 0.0725478008389473, 0.042902544140815735, 0.04744540527462959]
extrinsic_reward :  0 intrinsic_reward :  411.56402587890625
place 40
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 300, 'world': 1, 'x_pos': 40, 'x_pos_screen': 40, 'y_pos': 79}
global_step :  2 , action :  1 action_prob :  [0.05983036383986473, 0.17653553187847137, 0.0441148541867733, 0.5566235184669495, 0.0725478008389473, 0.042902544140815735, 0.04744540527462959]
extrinsic_reward :  0 intrinsic_reward :  411.56402587890625
place 40
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 300, 'world': 1, 'x_pos': 40, 'x_pos_screen': 40, 'y_pos': 79}
global_step :  3 , action :  3 action_prob :  [0.05983036383986473, 0.17653553187847137, 0.0441148541867733, 0.556623518

global_step :  22 , action :  6 action_prob :  [0.05976485833525658, 0.17649368941783905, 0.04405582696199417, 0.5569781064987183, 0.07247728109359741, 0.04284738376736641, 0.04738292843103409]
extrinsic_reward :  1 intrinsic_reward :  410.0460205078125
place 51
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 299, 'world': 1, 'x_pos': 51, 'x_pos_screen': 51, 'y_pos': 96}
global_step :  23 , action :  6 action_prob :  [0.05981658399105072, 0.17652851343154907, 0.04410143941640854, 0.5567000508308411, 0.07253064215183258, 0.04289275035262108, 0.04743005335330963]
extrinsic_reward :  1 intrinsic_reward :  410.88153076171875
place 52
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 299, 'world': 1, 'x_pos': 52, 'x_pos_screen': 52, 'y_pos': 93}
global_step :  24 , action :  3 action_prob :  [0.05985487625002861, 0.17655512690544128, 0.04413506016135216, 0.5564936995506287, 0.0725698322057724

global_step :  42 , action :  1 action_prob :  [0.05988256633281708, 0.17657169699668884, 0.044159796088933945, 0.5563463568687439, 0.07259893417358398, 0.04295012354850769, 0.04749048128724098]
extrinsic_reward :  2 intrinsic_reward :  410.1383972167969
place 80
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 298, 'world': 1, 'x_pos': 80, 'x_pos_screen': 80, 'y_pos': 85}
global_step :  43 , action :  5 action_prob :  [0.05993623286485672, 0.1766081154346466, 0.044206928461790085, 0.5560583472251892, 0.0726538598537445, 0.04299764335155487, 0.04753890633583069]
extrinsic_reward :  2 intrinsic_reward :  409.13525390625
place 82
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 298, 'world': 1, 'x_pos': 82, 'x_pos_screen': 81, 'y_pos': 79}
global_step :  44 , action :  3 action_prob :  [0.059920188039541245, 0.1765977144241333, 0.04419270157814026, 0.556144118309021, 0.07263742387294769, 0

global_step :  64 , action :  0 action_prob :  [0.05790982022881508, 0.17516817152500153, 0.04246940836310387, 0.5668599009513855, 0.07064081728458405, 0.04117347300052643, 0.04577839374542236]
extrinsic_reward :  1 intrinsic_reward :  435.24664306640625
place 118
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 297, 'world': 1, 'x_pos': 118, 'x_pos_screen': 96, 'y_pos': 79}
global_step :  65 , action :  4 action_prob :  [0.05885159224271774, 0.1758505254983902, 0.04328231140971184, 0.5618041753768921, 0.07159284502267838, 0.042009416967630386, 0.04660908132791519]
extrinsic_reward :  2 intrinsic_reward :  417.59832763671875
place 120
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 297, 'world': 1, 'x_pos': 120, 'x_pos_screen': 97, 'y_pos': 79}
global_step :  66 , action :  3 action_prob :  [0.05846035107970238, 0.17557144165039062, 0.04294005408883095, 0.5639169216156006, 0.07118880003

global_step :  84 , action :  3 action_prob :  [0.05674042925238609, 0.1742851436138153, 0.04146696999669075, 0.5731550455093384, 0.06945998221635818, 0.040133386850357056, 0.044759009033441544]
extrinsic_reward :  2 intrinsic_reward :  414.9064636230469
place 153
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 296, 'world': 1, 'x_pos': 153, 'x_pos_screen': 111, 'y_pos': 81}
global_step :  85 , action :  1 action_prob :  [0.05636643245816231, 0.17399361729621887, 0.04114091023802757, 0.5752026438713074, 0.0690656304359436, 0.03981093317270279, 0.044419895857572556]
extrinsic_reward :  2 intrinsic_reward :  424.3379821777344
place 155
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 296, 'world': 1, 'x_pos': 155, 'x_pos_screen': 112, 'y_pos': 79}
global_step :  86 , action :  6 action_prob :  [0.056192975491285324, 0.17385348677635193, 0.041002366691827774, 0.5761081576347351, 0.06890958

global_step :  105 , action :  1 action_prob :  [0.054547686129808426, 0.1725568324327469, 0.039587266743183136, 0.5850530862808228, 0.06719900667667389, 0.038232848048210144, 0.04282330349087715]
extrinsic_reward :  2 intrinsic_reward :  445.72161865234375
place 190
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 295, 'world': 1, 'x_pos': 190, 'x_pos_screen': 112, 'y_pos': 86}
global_step :  106 , action :  1 action_prob :  [0.0545901320874691, 0.17261511087417603, 0.03961312770843506, 0.5848311185836792, 0.06722328811883926, 0.038286641240119934, 0.04284051060676575]
extrinsic_reward :  2 intrinsic_reward :  449.368408203125
place 192
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 295, 'world': 1, 'x_pos': 192, 'x_pos_screen': 112, 'y_pos': 81}
global_step :  107 , action :  4 action_prob :  [0.0550152063369751, 0.1729467362165451, 0.03997647389769554, 0.5825361013412476, 0.06766002

global_step :  126 , action :  1 action_prob :  [0.05523906275629997, 0.17314782738685608, 0.04015856981277466, 0.5813271999359131, 0.06787451356649399, 0.038857731968164444, 0.043395042419433594]
extrinsic_reward :  2 intrinsic_reward :  445.3746337890625
place 227
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 294, 'world': 1, 'x_pos': 227, 'x_pos_screen': 112, 'y_pos': 88}
global_step :  127 , action :  3 action_prob :  [0.05516177415847778, 0.1730823963880539, 0.04009426385164261, 0.5817447304725647, 0.06779816746711731, 0.03878873214125633, 0.04333003982901573]
extrinsic_reward :  1 intrinsic_reward :  460.3038024902344
place 228
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 294, 'world': 1, 'x_pos': 228, 'x_pos_screen': 112, 'y_pos': 83}
global_step :  128 , action :  1 action_prob :  [0.05533274635672569, 0.17318947613239288, 0.0402434766292572, 0.5808427929878235, 0.06797784

extrinsic_reward :  3 intrinsic_reward :  447.98126220703125
place 278
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 293, 'world': 1, 'x_pos': 278, 'x_pos_screen': 112, 'y_pos': 63}
global_step :  148 , action :  3 action_prob :  [0.05510670691728592, 0.1730078160762787, 0.04004213958978653, 0.5821004509925842, 0.06772929430007935, 0.038741689175367355, 0.04327185079455376]
extrinsic_reward :  3 intrinsic_reward :  431.1484375
place 281
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 293, 'world': 1, 'x_pos': 281, 'x_pos_screen': 112, 'y_pos': 59}
global_step :  149 , action :  3 action_prob :  [0.05534571781754494, 0.1731952279806137, 0.04024358093738556, 0.5808191299438477, 0.06796853244304657, 0.03895222768187523, 0.04347562789916992]
extrinsic_reward :  3 intrinsic_reward :  429.13250732421875
place 284
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'stat

global_step :  10 , action :  6 action_prob :  [0.05961605906486511, 0.17639197409152985, 0.043925948441028595, 0.557774543762207, 0.07232643663883209, 0.04271500185132027, 0.04724998399615288]
extrinsic_reward :  1 intrinsic_reward :  413.97406005859375
place 41
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 300, 'world': 1, 'x_pos': 41, 'x_pos_screen': 41, 'y_pos': 100}
global_step :  11 , action :  4 action_prob :  [0.05967261642217636, 0.17643171548843384, 0.0439753383398056, 0.5574703812599182, 0.07238389551639557, 0.0427655465900898, 0.04730050638318062]
extrinsic_reward :  0 intrinsic_reward :  412.70538330078125
place 41
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 300, 'world': 1, 'x_pos': 41, 'x_pos_screen': 41, 'y_pos': 103}
global_step :  12 , action :  1 action_prob :  [0.059678398072719574, 0.17643551528453827, 0.0439801886677742, 0.5574402809143066, 0.072389416396617

global_step :  30 , action :  3 action_prob :  [0.05982242524623871, 0.17653192579746246, 0.04410692676901817, 0.5566681623458862, 0.07253710180521011, 0.042897697538137436, 0.04743576422333717]
extrinsic_reward :  2 intrinsic_reward :  413.2981872558594
place 68
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 299, 'world': 1, 'x_pos': 68, 'x_pos_screen': 68, 'y_pos': 96}
global_step :  31 , action :  3 action_prob :  [0.05980929359793663, 0.17652267217636108, 0.04409528523683548, 0.5567394495010376, 0.07252340763807297, 0.042886052280664444, 0.04742375388741493]
extrinsic_reward :  3 intrinsic_reward :  414.3782958984375
place 71
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 299, 'world': 1, 'x_pos': 71, 'x_pos_screen': 71, 'y_pos': 100}
global_step :  32 , action :  3 action_prob :  [0.05989186838269234, 0.1765792965888977, 0.04416729509830475, 0.5562973022460938, 0.072607189416885

global_step :  51 , action :  1 action_prob :  [0.057804934680461884, 0.1750953048467636, 0.04238089174032211, 0.5674107074737549, 0.07053947448730469, 0.04107797518372536, 0.04569076746702194]
extrinsic_reward :  3 intrinsic_reward :  434.695556640625
place 128
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 298, 'world': 1, 'x_pos': 128, 'x_pos_screen': 97, 'y_pos': 106}
global_step :  52 , action :  1 action_prob :  [0.05807214230298996, 0.17528285086154938, 0.04261305555701256, 0.5659791827201843, 0.07081256061792374, 0.04131057485938072, 0.045929569751024246]
extrinsic_reward :  3 intrinsic_reward :  428.8128662109375
place 131
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 3, 'status': 'small', 'time': 298, 'world': 1, 'x_pos': 131, 'x_pos_screen': 98, 'y_pos': 109}
global_step :  53 , action :  3 action_prob :  [0.058218106627464294, 0.1753910630941391, 0.04273540526628494, 0.5652058720588684, 0.07095336914

KeyboardInterrupt: 

In [36]:
env.close()