In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from torch.nn import init
import copy
import numpy as np

import queue

In [2]:
gpu = False

In [3]:
class RND(nn.Module):
    def __init__(self,width = 240, height =256, channel = 3):
        super(RND,self).__init__()
        
        self.width = width
        self.height = height
        self.channel = channel
        
        self.predictor = nn.Sequential(
            nn.Conv2d(
                in_channels=3,
                out_channels=32,
                kernel_size=8,
                stride=4),
            nn.LeakyReLU(),
            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=4,
                stride=2),
            nn.LeakyReLU(),
            nn.Conv2d(
                in_channels=64,
                out_channels=64,
                kernel_size=3,
                stride=1),
            nn.LeakyReLU(),
            Flatten(),
            nn.Linear(46592, 512), # change
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512)
            )
        self.target = copy.deepcopy(self.predictor)
        for p in self.modules():
            if isinstance(p, nn.Conv2d):
                init.orthogonal_(p.weight, np.sqrt(2))
                p.bias.data.zero_()

            if isinstance(p, nn.Linear):
                init.orthogonal_(p.weight, np.sqrt(2))
                p.bias.data.zero_()

        for param in self.target.parameters():
            param.requires_grad = False
            
    def forward(self, state):
        target_feature = self.target(state)
        predict_feature = self.predictor(state)
        return predict_feature, target_feature

In [4]:
class Flatten(nn.Module):
    def forward(self, input):
        return input.view(input.size(0), -1)
class PPO(nn.Module):
    def __init__(self,width = 240, height =256, channel = 3, action_dim = 7):
        self.width = width
        self.height = height
        self.channel = channel
        self.action_dim = action_dim
        super(PPO,self).__init__()
        self.memory = []
        
        self.basic = nn.Sequential(\
                    nn.Conv2d(in_channels = 3,\
                             out_channels = 32,\
                             kernel_size = 8,\
                              stride = 4),
                    nn.ReLU(),
                                   nn.Conv2d(in_channels = 32,\
                                           out_channels = 64,\
                                           kernel_size = 4,\
                                           stride = 2),\
                    nn.ReLU(),\
                    nn.Conv2d(in_channels = 64,\
                             out_channels = 64,\
                             kernel_size = 3,\
                             stride = 1),
                                   nn.ReLU(),\
                                   Flatten(),
                    nn.Linear(46592,256), #have to change       
                    nn.ReLU(),\
                    nn.Linear(256,448),
                    nn.ReLU()
                    )
        self.actor = nn.Sequential(\
                                  nn.Linear(448,448),\
                                  nn.ReLU(),\
                                  nn.Linear(448,self.action_dim)\
                                  )
        
        self.extrinsic_critic = nn.Linear(448,1)
        self.intrinsic_critic = nn.Linear(448,1)
        
        init.orthogonal_(self.extrinsic_critic.weight, 0.01)
        self.extrinsic_critic.bias.data.zero_()

        init.orthogonal_(self.intrinsic_critic.weight, 0.01)
        self.intrinsic_critic.bias.data.zero_()

        for i in range(len(self.actor)):
            if type(self.actor[i]) == nn.Linear:
                init.orthogonal_(self.actor[i].weight, 0.01)
                self.actor[i].bias.data.zero_()
    def forward(self, x,dim = -1):
        x = self.basic(x)
        action = self.actor(x)
        action_prob = F.softmax(action,dim = dim)
        
        intrinsic = self.intrinsic_critic(x)
        extrinsic = self.extrinsic_critic(x)
        return action_prob,extrinsic,intrinsic    

    


In [41]:
learning_rate = 0.0001
gamma         = 0.98
lmbda         = 0.95
eps_clip      = 0.1
K_epoch       = 4
T_horizon     = 128
critic_coef = 0.5
ent_coef = 0.001
intrinsic_gamma = 0.99
extrinsic_gamma = 0.999
update_proportion = 0.25

extrinsic_advantage_coef = 2
intrinsic_advantage_coef = 1
class Agent(nn.Module):
    def __init__(self,width=240,height=256,channel = 3,action_dim=7,learning_rate=0.0005):
        self.width = width
        self.height = height
        self.channel = channel
        self.action_dim = action_dim
        
        super(Agent,self).__init__()
        
        self.memory = []
        self.intrinsic_queue = queue.Queue()
        self.intrinsic_input_queue = queue.Queue()
        self.ppo = PPO(self.width, self.height, self.channel, self.action_dim)
        self.rnd = RND(self.width, self.height , self.channel)
        self.optimizer = optim.Adam(self.parameters(),lr = learning_rate)
    def put_data(self,data):
        self.memory.append(data)
    def make_batch(self):
        state_list, action_list, extrinsic_reward_list, intrinsic_reward_list, next_state_list, \
        prob_list, extrinsic_done_list,intrinsic_done_list = [],[],[],[],[],[], [],[]
        for data in self.memory:
            state,action,extrinsic_reward, intrinsic_reward,next_state,prob,done = data
            state_list.append(state)
            action_list.append([action])
            extrinsic_reward_list.append([extrinsic_reward])
            intrinsic_reward_list.append([intrinsic_reward])

            next_state_list.append(next_state)
            extrinsic_done_mask = 0 if done else 1
            extrinsic_done_list.append([extrinsic_done_mask])
            intrinsic_done_list.append([1])
            prob_list.append([prob])
        self.memory = []

        s,a,er,ir,next_s,extrinsic_done_list,intrinsic_done_list,prob \
                                        = torch.tensor(state_list,dtype=torch.float),\
                                        torch.tensor(action_list),\
                                        torch.tensor(extrinsic_reward_list,dtype = torch.float),\
                                        torch.tensor(intrinsic_reward_list,dtype = torch.float),\
                                        torch.tensor(next_state_list,dtype=torch.float),\
                                        torch.tensor(extrinsic_done_list,dtype = torch.float),\
                                        torch.tensor(intrinsic_done_list,dtype = torch.float),\
                                        torch.tensor(prob_list,dtype = torch.float)
        if gpu:
            return s.cuda(),a.cuda(),er.cuda(),ir.cuda(),next_s.cuda(),extrinsic_done_list.cuda()\
            ,intrinsic_done_list.cuda(),prob.cuda() 
        else :
            return s,a,er,ir,next_s,extrinsic_done_list,intrinsic_done_list,prob  
    
    def train(self):
        state,action,extrinsic_reward,intrinsic_reward, next_state,extrinsic_done_list,\
        intrinsic_done_list,action_prob = self.make_batch()
        
        for i in range(K_epoch):
            state = state.squeeze()
            next_state = next_state.squeeze()
            predicted_action, predicted_extrinsic, predicted_intrinsic = self.ppo(state)
            predicted_next_action, predicted_next_extrinsic, predicted_next_intrinsic = self.ppo(next_state)

            if gpu:
                intrinsic_next_state_mean = torch.mean(torch.cat(list(self.intrinsic_input_queue.queue)),dim = 0).cuda()
                intrinsic_next_state_std = torch.std(torch.cat(list(model.intrinsic_input_queue.queue)),dim = 0).cuda()
            
                preprocessed_next_state = torch.clamp(((next_state - intrinsic_next_state_mean) / \
                                                   intrinsic_next_state_std + torch.tensor(1e-8).cuda()),-5,5)
            else:
                intrinsic_next_state_mean = torch.mean(torch.cat(list(self.intrinsic_input_queue.queue)),dim = 0)
                intrinsic_next_state_std = torch.std(torch.cat(list(model.intrinsic_input_queue.queue)),dim = 0)
            
                preprocessed_next_state = torch.clamp(((next_state - intrinsic_next_state_mean) / \
                                                   intrinsic_next_state_std + torch.tensor(1e-8)),-5,5)
            
            predict_feature, target_feature = self.rnd(next_state)
            td_error = extrinsic_reward + extrinsic_gamma * predicted_next_extrinsic * extrinsic_done_list
            delta = td_error - predicted_next_extrinsic
            if gpu:
                delta = delta.detach().cpu().numpy()
            else:
                delta = delta.detach().numpy()
            advantage_list = []
            
            advantage = 0.0
            for delta_t in delta[::-1]:
                advantage = gamma * lmbda * advantage + delta_t[0]
                advantage_list.append([advantage])
            advantage_list.reverse()
            if gpu:
                advantage = torch.tensor(advantage_list,dtype = torch.float).cuda()
            else:
                advantage = torch.tensor(advantage_list,dtype = torch.float)
            ##intrinsic_advantage
            intrinsic_td_error = intrinsic_reward + intrinsic_gamma * predicted_next_intrinsic * intrinsic_done_list
            intrinsic_delta = intrinsic_td_error - predicted_next_intrinsic
            if gpu:
                intrinsic_delta = intrinsic_delta.detach().cpu().numpy()
            else:
                intrinsic_delta = intrinsic_delta.detach().numpy()
                
            intrinsic_advantage_list = []
            intrinsic_advantage = 0.0
            
            for intrinsic_delta_t in intrinsic_delta[::-1]:
                intrinsic_advantage = gamma * lmbda * intrinsic_advantage + intrinsic_delta_t[0]
                intrinsic_advantage_list.append([intrinsic_advantage])
            intrinsic_advantage_list.reverse()
            if gpu:
                intrinsic_advantage = torch.tensor(intrinsic_advantage_list,dtype = torch.float).cuda()
            else:
                intrinsic_advantage = torch.tensor(intrinsic_advantage_list,dtype = torch.float)
            #### intrinsic_error
            intrinsic_error = (intrinsic_td_error - predicted_intrinsic.detach()).pow(2)
            if gpu:
                masking = torch.rand(len(intrinsic_error)).cuda()
                masking = (masking < update_proportion).type(torch.FloatTensor).cuda()
            else:
                masking = torch.rand(len(intrinsic_error))
                masking = (masking < update_proportion).type(torch.FloatTensor)
            if gpu:
                intrinsic_error = (intrinsic_error * masking).sum() / torch.max(intrinsic_error.sum(), torch.Tensor([1]).cuda())
            else:
                intrinsic_error = (intrinsic_error * masking).sum() / torch.max(masking.sum(), torch.Tensor([1]))

            
            now_action = predicted_action
            m = Categorical(now_action)
            entropy = m.entropy().mean()
            
            
            now_action = now_action.gather(1,action)
            
            
            ratio = torch.exp(torch.log(now_action) - torch.log(action_prob))
            advantage = extrinsic_advantage_coef * advantage +  intrinsic_advantage_coef * intrinsic_advantage
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio , 1-eps_clip, 1 + eps_clip) * advantage
            loss = - torch.min(surr1,surr2) + critic_coef * (F.smooth_l1_loss(predicted_extrinsic,td_error.detach()) +\
                    intrinsic_error) - ent_coef * entropy + F.mse_loss(predict_feature, target_feature)
            
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

In [6]:
epochs = 1000
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, SIMPLE_MOVEMENT)


In [39]:
if gpu:
    model = Agent().cuda()
else:
    model = Agent()


In [8]:
'''
try:
    if gpu:
        model.load_state_dict(torch.load("weights/2_episode_model"))
    else:
        model.load_state_dict(torch.load("weights/2_episode_model", map_location={'cuda:0': 'cpu'}))
except :
    pass
'''

In [40]:
T_horizon = 20

for epoch in range(epochs):
    global_step = 0
    model.intrinsic_queue = queue.Queue()
    model.intrinsic_input_queue = queue.Queue()
    state = env.reset()
    state = np.array(state)/255
            #state = np.transpose(state,(2,0,1))
    state = np.moveaxis(state, -1, 0)
    state = torch.tensor(state).float()
    state = state.unsqueeze(0)
    done = False
    while not done :
        for t in range(T_horizon):
            #env.render()
            global_step +=1

            if gpu:
                action_prob, _ , _ = model.ppo.forward(state.cuda())
            else:
                action_prob, _ , _ = model.ppo.forward(state)
            m = Categorical(action_prob)
            action = m.sample().item()
            
            
            next_state, extrinsic_reward, done, info = env.step(action)
            next_state = np.array(next_state)/255
            next_state = np.moveaxis(next_state,-1,0)
            next_state = torch.tensor(next_state).float()
            next_state = next_state.unsqueeze(0)
            
            
            model.intrinsic_input_queue.put(next_state)
            if len(model.intrinsic_input_queue.queue) > 128:
                model.intrinsic_input_queue.get()
            intrinsic_next_state_mean = \
                    torch.mean(torch.cat(list(model.intrinsic_input_queue.queue)),dim = 0)
            if len(model.intrinsic_input_queue.queue) == 1:
                intrinsic_next_state_std = torch.zeros(1)
            else:
                intrinsic_next_state_std = \
                        torch.std(torch.cat(list(model.intrinsic_input_queue.queue)),dim = 0)
    
            preprocessed_next_state = \
                    torch.clamp(((next_state - intrinsic_next_state_mean) / (intrinsic_next_state_std + 1e-8)), -5,5)
            
            
            #(model.intrinsic_input_queue)
            if gpu:
                predictor,target = model.rnd.forward(preprocessed_next_state.cuda())
            else:
                predictor,target = model.rnd.forward(preprocessed_next_state)
            intrinsic_reward = (predictor - target).pow(2).sum(1) / 2
            if len(model.intrinsic_queue.queue) > 128:
                model.intrinsic_queue.get()
            print('intrinsic_reward.item() : ', intrinsic_reward.item())
            model.intrinsic_queue.put(intrinsic_reward.item())
            intrinsic_mean = np.mean(model.intrinsic_queue.queue)
            intrinsic_std = np.std(model.intrinsic_queue.queue)
            intrinsic_reward = (intrinsic_reward - intrinsic_mean) / (intrinsic_std+ 1e-8)
            
            if info['time'] == 0 or info['life'] == 1 or info['time'] == 300:
                done = True
                reward = -10.

            model.put_data((state.tolist(), action, extrinsic_reward/100, (intrinsic_reward.item()), next_state.tolist(), action_prob[0][action].item(), done))
            print('global_step : ',global_step,', action : ', action,'action_prob : ',action_prob.tolist()[0])
            print('extrinsic_reward : ',extrinsic_reward,'intrinsic_reward : ',intrinsic_reward.item())
            print('place',info['x_pos'])
            print('info',info)
            if done :
                print('epoch : ',epoch, ', global_step : ',global_step)
                break
            state = next_state
        model.train()
    #env.render()

#env.close()

intrinsic_reward.item() :  0.0
global_step :  1 , action :  3 action_prob :  [0.14285677671432495, 0.14285710453987122, 0.14285744726657867, 0.14285707473754883, 0.14285728335380554, 0.14285729825496674, 0.14285695552825928]
extrinsic_reward :  0 intrinsic_reward :  0.0
place 40
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 400, 'world': 1, 'x_pos': 40, 'x_pos_screen': 40, 'y_pos': 79}
intrinsic_reward.item() :  0.0
global_step :  2 , action :  2 action_prob :  [0.14285677671432495, 0.14285710453987122, 0.14285744726657867, 0.14285707473754883, 0.14285728335380554, 0.14285729825496674, 0.14285695552825928]
extrinsic_reward :  0 intrinsic_reward :  0.0
place 40
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 400, 'world': 1, 'x_pos': 40, 'x_pos_screen': 40, 'y_pos': 79}
intrinsic_reward.item() :  1.9368853569030762
global_step :  3 , action :  1 action_prob :  [0.14285677671432495, 0.

intrinsic_reward.item() :  22939.35546875
global_step :  19 , action :  0 action_prob :  [0.14285677671432495, 0.14285710453987122, 0.14285744726657867, 0.14285707473754883, 0.14285728335380554, 0.14285729825496674, 0.14285694062709808]
extrinsic_reward :  -1 intrinsic_reward :  2.2297821044921875
place 48
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 399, 'world': 1, 'x_pos': 48, 'x_pos_screen': 48, 'y_pos': 79}
intrinsic_reward.item() :  31892.9140625
global_step :  20 , action :  3 action_prob :  [0.14285677671432495, 0.14285710453987122, 0.14285744726657867, 0.14285707473754883, 0.14285728335380554, 0.14285729825496674, 0.14285695552825928]
extrinsic_reward :  1 intrinsic_reward :  2.7246546745300293
place 49
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 399, 'world': 1, 'x_pos': 49, 'x_pos_screen': 49, 'y_pos': 79}
intrinsic_reward.item() :  37568.2734375
global_step :  21 , a

intrinsic_reward.item() :  42682.17578125
global_step :  37 , action :  0 action_prob :  [0.12560036778450012, 0.12549930810928345, 0.1909065693616867, 0.12511563301086426, 0.12487104535102844, 0.184056356549263, 0.1239507645368576]
extrinsic_reward :  1 intrinsic_reward :  1.0110849142074585
place 65
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 399, 'world': 1, 'x_pos': 65, 'x_pos_screen': 65, 'y_pos': 91}
intrinsic_reward.item() :  15310.85546875
global_step :  38 , action :  6 action_prob :  [0.12560328841209412, 0.12550222873687744, 0.19089840352535248, 0.12511862814426422, 0.1248740553855896, 0.1840495616197586, 0.12395390123128891]
extrinsic_reward :  1 intrinsic_reward :  -0.5207064747810364
place 66
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 399, 'world': 1, 'x_pos': 66, 'x_pos_screen': 66, 'y_pos': 86}
intrinsic_reward.item() :  15174.3798828125
global_step :  39 , act

KeyboardInterrupt: 

In [22]:
model.intrinsic_queue.queue

deque([nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan,
       nan])

In [20]:
preprocessed_next_state

tensor([[[[-5., -5., -5.,  ..., -5., -5., -5.],
          [-5., -5., -5.,  ..., -5., -5., -5.],
          [-5., -5., -5.,  ..., -5., -5., -5.],
          ...,
          [-5.,  5.,  5.,  ...,  5.,  5., nan],
          [-5.,  5.,  5.,  ...,  5., nan, nan],
          [ 5., nan, nan,  ..., nan, nan,  5.]],

         [[-5., -5., -5.,  ..., -5., -5., -5.],
          [-5., -5., -5.,  ..., -5., -5., -5.],
          [-5., -5., -5.,  ..., -5., -5., -5.],
          ...,
          [-5., -5., -5.,  ..., -5., -5., nan],
          [-5., -5., -5.,  ..., -5., nan, nan],
          [-5., nan, nan,  ..., nan, nan, -5.]],

         [[-5., -5., -5.,  ..., -5., -5., -5.],
          [-5., -5., -5.,  ..., -5., -5., -5.],
          [-5., -5., -5.,  ..., -5., -5., -5.],
          ...,
          [nan, nan, nan,  ..., nan, nan, nan],
          [nan, nan, nan,  ..., nan, nan, nan],
          [nan, nan, nan,  ..., nan, nan, nan]]]])

319

In [44]:
env.close()

ValueError: env has already been closed.

In [43]:
if gpu:
    model.load_state_dict(torch.load("weights/1_episode_model"))
else:
    model.load_state_dict(torch.load("weights/1_episode_model", map_location={'cuda:0': 'cpu'}))

In [55]:
epochs = 1000
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, SIMPLE_MOVEMENT)


In [56]:
T_horizon = 20

for epoch in range(epochs):
    global_step = 0
    state = env.reset()
    state = np.array(state)/255
            #state = np.transpose(state,(2,0,1))
    state = np.moveaxis(state, -1, 0)
    state = torch.tensor(state).float()
    state = state.unsqueeze(0)
    done = False
    while not done :
        for t in range(T_horizon):
            env.render()
            global_step +=1

            if gpu:
                action_prob, _ , _ = model.ppo.forward(state.cuda())
            else:
                action_prob, _ , _ = model.ppo.forward(state)
            m = Categorical(action_prob)
            action = m.sample().item()
            next_state, extrinsic_reward, done, info = env.step(action)
            next_state = np.array(next_state)/255
            next_state = np.moveaxis(next_state,-1,0)
            next_state = torch.tensor(next_state).float()
            next_state = next_state.unsqueeze(0)
            if gpu:
                predictor,target = model.rnd.forward(next_state.cuda())
            else:
                predictor,target = model.rnd.forward(next_state)
            intrinsic_reward = (predictor - target).pow(2).sum(1) / 2
            if info['time'] == 0 :
                done = True
                reward = -10.
            if done :
                print('epoch : ',epoch, ', global_step : ',global_step)
                break
            print('global_step : ',global_step,', action : ', action,'action_prob : ',action_prob.tolist()[0])
            print('extrinsic_reward : ',extrinsic_reward,'intrinsic_reward : ',intrinsic_reward.item())
            print('place',info['x_pos'])
            print('info',info)
            state = next_state
    #env.render()

#env.close()

global_step :  1 , action :  2 action_prob :  [0.0008799101924523711, 0.017144275829195976, 0.9469815492630005, 0.00012634025188162923, 0.03220308944582939, 0.0026617946568876505, 3.0889298159308964e-06]
extrinsic_reward :  0 intrinsic_reward :  2.895847797393799
place 40
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 400, 'world': 1, 'x_pos': 40, 'x_pos_screen': 40, 'y_pos': 79}
global_step :  2 , action :  2 action_prob :  [0.0008799101924523711, 0.017144275829195976, 0.9469815492630005, 0.00012634025188162923, 0.03220308944582939, 0.0026617946568876505, 3.0889298159308964e-06]
extrinsic_reward :  0 intrinsic_reward :  2.895847797393799
place 40
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 400, 'world': 1, 'x_pos': 40, 'x_pos_screen': 40, 'y_pos': 84}
global_step :  3 , action :  2 action_prob :  [0.0008799101924523711, 0.017144275829195976, 0.9469815492630005, 0.0001263402518816

global_step :  21 , action :  2 action_prob :  [0.000872174627147615, 0.01705988682806492, 0.9472266435623169, 0.0001249100751010701, 0.03207119181752205, 0.0026422275695949793, 3.0392150165425846e-06]
extrinsic_reward :  1 intrinsic_reward :  2.776517868041992
place 54
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 399, 'world': 1, 'x_pos': 54, 'x_pos_screen': 54, 'y_pos': 140}
global_step :  22 , action :  2 action_prob :  [0.0008717708988115191, 0.017055492848157883, 0.9472392797470093, 0.00012483529280871153, 0.03206433728337288, 0.0026412021834403276, 3.0366250030056108e-06]
extrinsic_reward :  1 intrinsic_reward :  2.7469706535339355
place 55
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 399, 'world': 1, 'x_pos': 55, 'x_pos_screen': 55, 'y_pos': 141}
global_step :  23 , action :  2 action_prob :  [0.0008705118671059608, 0.01704166643321514, 0.9472795128822327, 0.00012460377183

global_step :  40 , action :  2 action_prob :  [0.0008265043725259602, 0.016554201021790504, 0.9486960768699646, 0.00011654989793896675, 0.03127767890691757, 0.002526203403249383, 2.7533990305528278e-06]
extrinsic_reward :  2 intrinsic_reward :  4.443122863769531
place 86
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 398, 'world': 1, 'x_pos': 86, 'x_pos_screen': 83, 'y_pos': 99}
global_step :  41 , action :  2 action_prob :  [0.0008534396183677018, 0.01685388572514057, 0.9478251338005066, 0.00012146555673098192, 0.03174838051199913, 0.002594770397990942, 2.920414317486575e-06]
extrinsic_reward :  2 intrinsic_reward :  3.935343027114868
place 88
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 398, 'world': 1, 'x_pos': 88, 'x_pos_screen': 84, 'y_pos': 94}
global_step :  42 , action :  2 action_prob :  [0.0008298245957121253, 0.016591863706707954, 0.9485860466957092, 0.00011714545689756

global_step :  59 , action :  2 action_prob :  [0.0009168441756628454, 0.017543112859129906, 0.945823073387146, 0.00013320992002263665, 0.03282570466399193, 0.0027546973433345556, 3.3312371670035645e-06]
extrinsic_reward :  1 intrinsic_reward :  4.022909164428711
place 119
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 397, 'world': 1, 'x_pos': 119, 'x_pos_screen': 97, 'y_pos': 79}
global_step :  60 , action :  2 action_prob :  [0.000850738724693656, 0.016824044287204742, 0.9479116797447205, 0.00012097017315682024, 0.03170161694288254, 0.002587905852124095, 2.9034567887720186e-06]
extrinsic_reward :  2 intrinsic_reward :  4.999667167663574
place 121
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 397, 'world': 1, 'x_pos': 121, 'x_pos_screen': 98, 'y_pos': 79}
global_step :  61 , action :  2 action_prob :  [0.0009440652211196721, 0.017831934615969658, 0.9449849724769592, 0.000138333867

global_step :  80 , action :  2 action_prob :  [0.0008579960558563471, 0.0169041957706213, 0.9476789832115173, 0.00012230098946020007, 0.031827304512262344, 0.002606323454529047, 2.949099780380493e-06]
extrinsic_reward :  2 intrinsic_reward :  6.551303386688232
place 156
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 396, 'world': 1, 'x_pos': 156, 'x_pos_screen': 112, 'y_pos': 106}
global_step :  81 , action :  2 action_prob :  [0.0008757184259593487, 0.017098763957619667, 0.9471136331558228, 0.0001255619281437248, 0.03213223069906235, 0.0026511538308113813, 3.0618948585470207e-06]
extrinsic_reward :  2 intrinsic_reward :  5.705742835998535
place 158
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 396, 'world': 1, 'x_pos': 158, 'x_pos_screen': 112, 'y_pos': 109}
global_step :  82 , action :  2 action_prob :  [0.0009422510047443211, 0.017813043668866158, 0.9450395703315735, 0.000137986

global_step :  101 , action :  2 action_prob :  [0.0008660602616146207, 0.016992824152112007, 0.9474213719367981, 0.00012378452811390162, 0.03196609392762184, 0.0026267722714692354, 3.000221340698772e-06]
extrinsic_reward :  2 intrinsic_reward :  5.168170928955078
place 193
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 395, 'world': 1, 'x_pos': 193, 'x_pos_screen': 112, 'y_pos': 130}
global_step :  102 , action :  2 action_prob :  [0.0008680933387950063, 0.017014583572745323, 0.9473587870597839, 0.00012416791287250817, 0.03199934959411621, 0.0026320514734834433, 3.0133278414723463e-06]
extrinsic_reward :  1 intrinsic_reward :  6.742884635925293
place 194
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 395, 'world': 1, 'x_pos': 194, 'x_pos_screen': 112, 'y_pos': 126}
global_step :  103 , action :  2 action_prob :  [0.0008605233742855489, 0.016931844875216484, 0.9475986957550049, 0.000

global_step :  120 , action :  2 action_prob :  [0.0009623891673982143, 0.01802452653646469, 0.9444260001182556, 0.00014180202560964972, 0.03357311710715294, 0.0028685270808637142, 3.641750936367316e-06]
extrinsic_reward :  2 intrinsic_reward :  6.703056335449219
place 226
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 394, 'world': 1, 'x_pos': 226, 'x_pos_screen': 112, 'y_pos': 79}
global_step :  121 , action :  2 action_prob :  [0.0009250789298675954, 0.01763083226978779, 0.9455686211585999, 0.00013475648302119225, 0.03296198695898056, 0.0027753785252571106, 3.3864723718579626e-06]
extrinsic_reward :  2 intrinsic_reward :  6.694912910461426
place 228
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 394, 'world': 1, 'x_pos': 228, 'x_pos_screen': 112, 'y_pos': 79}
global_step :  122 , action :  2 action_prob :  [0.0008943183929659426, 0.017300577834248543, 0.9465277791023254, 0.0001290

global_step :  139 , action :  2 action_prob :  [0.0009011506917886436, 0.017374368384480476, 0.9463132619857788, 0.00013028419925831258, 0.03256231173872948, 0.002715349430218339, 3.227312163289753e-06]
extrinsic_reward :  2 intrinsic_reward :  7.033091068267822
place 264
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 393, 'world': 1, 'x_pos': 264, 'x_pos_screen': 112, 'y_pos': 102}
global_step :  140 , action :  2 action_prob :  [0.0008809618302620947, 0.01715564914047718, 0.9469485282897949, 0.00012653600424528122, 0.03222077339887619, 0.002664468949660659, 3.0957376111473422e-06]
extrinsic_reward :  3 intrinsic_reward :  6.881844520568848
place 267
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 393, 'world': 1, 'x_pos': 267, 'x_pos_screen': 112, 'y_pos': 107}
global_step :  141 , action :  2 action_prob :  [0.0009548512171022594, 0.01794598065316677, 0.9446535706520081, 0.0001403

global_step :  160 , action :  2 action_prob :  [0.000866797287017107, 0.01700081117451191, 0.9473983645439148, 0.00012392183998599648, 0.03197844326496124, 0.0026286630891263485, 3.004941845574649e-06]
extrinsic_reward :  1 intrinsic_reward :  7.642158508300781
place 320
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 392, 'world': 1, 'x_pos': 320, 'x_pos_screen': 112, 'y_pos': 143}
global_step :  161 , action :  2 action_prob :  [0.0008715699077583849, 0.01705269329249859, 0.9472480416297913, 0.00012480822624638677, 0.032059043645858765, 0.002640840131789446, 3.035523832295439e-06]
extrinsic_reward :  2 intrinsic_reward :  8.867561340332031
place 322
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 392, 'world': 1, 'x_pos': 322, 'x_pos_screen': 112, 'y_pos': 143}
global_step :  162 , action :  2 action_prob :  [0.0009439582354389131, 0.017831187695264816, 0.944986879825592, 0.00013830

global_step :  179 , action :  2 action_prob :  [0.0009838134283199906, 0.01824774220585823, 0.9437780380249023, 0.0001458791084587574, 0.03391896188259125, 0.0029216669499874115, 3.7920849536021706e-06]
extrinsic_reward :  1 intrinsic_reward :  9.031676292419434
place 354
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 391, 'world': 1, 'x_pos': 354, 'x_pos_screen': 112, 'y_pos': 143}
global_step :  180 , action :  2 action_prob :  [0.0009514391422271729, 0.01791016198694706, 0.9447574615478516, 0.00013971787120681256, 0.033396508544683456, 0.0028411406092345715, 3.5657997159432853e-06]
extrinsic_reward :  2 intrinsic_reward :  10.106260299682617
place 356
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 391, 'world': 1, 'x_pos': 356, 'x_pos_screen': 112, 'y_pos': 143}
global_step :  181 , action :  2 action_prob :  [0.0008932776399888098, 0.01728956773877144, 0.9465593695640564, 0.0001

global_step :  200 , action :  2 action_prob :  [0.0008560953428968787, 0.01688283309340477, 0.9477413892745972, 0.0001219586847582832, 0.03179321438074112, 0.0026016002520918846, 2.937229737653979e-06]
extrinsic_reward :  2 intrinsic_reward :  9.618212699890137
place 397
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 390, 'world': 1, 'x_pos': 397, 'x_pos_screen': 112, 'y_pos': 143}
global_step :  201 , action :  1 action_prob :  [0.0008693295530974865, 0.017029032111167908, 0.9473159909248352, 0.0001243807200808078, 0.032023314386606216, 0.002634960226714611, 3.0209489523258526e-06]
extrinsic_reward :  1 intrinsic_reward :  9.985185623168945
place 398
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 390, 'world': 1, 'x_pos': 398, 'x_pos_screen': 112, 'y_pos': 143}
global_step :  202 , action :  2 action_prob :  [0.0008533557993359864, 0.016852712258696556, 0.9478287100791931, 0.000121

global_step :  221 , action :  2 action_prob :  [0.0009751047473400831, 0.01815689168870449, 0.9440419673919678, 0.00014422566164284945, 0.03377772867679596, 0.002900186227634549, 3.7307713682821486e-06]
extrinsic_reward :  1 intrinsic_reward :  7.057394027709961
place 433
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 389, 'world': 1, 'x_pos': 433, 'x_pos_screen': 112, 'y_pos': 79}
global_step :  222 , action :  2 action_prob :  [0.0009394745575264096, 0.017783552408218384, 0.9451252818107605, 0.00013746546756010503, 0.03319930657744408, 0.002811395563185215, 3.483962700556731e-06]
extrinsic_reward :  1 intrinsic_reward :  9.032930374145508
place 434
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 389, 'world': 1, 'x_pos': 434, 'x_pos_screen': 111, 'y_pos': 79}
global_step :  223 , action :  2 action_prob :  [0.0008578608394600451, 0.016902247443795204, 0.9476850628852844, 0.00012228

global_step :  242 , action :  2 action_prob :  [0.0008705875370651484, 0.017042748630046844, 0.9472760558128357, 0.00012461341975722462, 0.032044727355241776, 0.002638155361637473, 3.0289968435681658e-06]
extrinsic_reward :  1 intrinsic_reward :  9.4586763381958
place 440
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 388, 'world': 1, 'x_pos': 440, 'x_pos_screen': 117, 'y_pos': 136}
global_step :  243 , action :  2 action_prob :  [0.0008705385262146592, 0.017042212188243866, 0.9472777843475342, 0.00012460461584851146, 0.03204386681318283, 0.002638034289702773, 3.0286846595117822e-06]
extrinsic_reward :  1 intrinsic_reward :  9.473649978637695
place 441
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 388, 'world': 1, 'x_pos': 441, 'x_pos_screen': 117, 'y_pos': 137}
global_step :  244 , action :  2 action_prob :  [0.0008701158803887665, 0.017037568613886833, 0.9472911357879639, 0.00012

global_step :  263 , action :  2 action_prob :  [0.0008577351691201329, 0.016900869086384773, 0.9476890563964844, 0.00012226028775330633, 0.031821418553590775, 0.0026057695504277945, 2.9475845622073393e-06]
extrinsic_reward :  1 intrinsic_reward :  8.320381164550781
place 471
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 387, 'world': 1, 'x_pos': 471, 'x_pos_screen': 117, 'y_pos': 111}
global_step :  264 , action :  2 action_prob :  [0.0008400389924645424, 0.016705181449651718, 0.9482574462890625, 0.00011901651305379346, 0.03151478245854378, 0.002560738008469343, 2.8367971935949754e-06]
extrinsic_reward :  2 intrinsic_reward :  7.177379131317139
place 473
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 387, 'world': 1, 'x_pos': 473, 'x_pos_screen': 117, 'y_pos': 111}
global_step :  265 , action :  2 action_prob :  [0.0008699146565049887, 0.017034953460097313, 0.9472993016242981, 0.00

global_step :  284 , action :  2 action_prob :  [0.0009251720039173961, 0.017631884664297104, 0.9455655217170715, 0.0001347728684777394, 0.03296371176838875, 0.002775596920400858, 3.3870812785607995e-06]
extrinsic_reward :  2 intrinsic_reward :  7.406111717224121
place 508
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 386, 'world': 1, 'x_pos': 508, 'x_pos_screen': 117, 'y_pos': 109}
global_step :  285 , action :  2 action_prob :  [0.0008332928991876543, 0.016629938036203384, 0.9484760761260986, 0.00011778785119531676, 0.03139645978808403, 0.002543568843975663, 2.795123691612389e-06]
extrinsic_reward :  2 intrinsic_reward :  8.008354187011719
place 510
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 386, 'world': 1, 'x_pos': 510, 'x_pos_screen': 117, 'y_pos': 113}
global_step :  286 , action :  2 action_prob :  [0.000842279230710119, 0.01672988198697567, 0.9481857419013977, 0.00011942

global_step :  304 , action :  4 action_prob :  [0.000840257853269577, 0.016707241535186768, 0.9482517838478088, 0.00011906247527804226, 0.031517449766397476, 0.002561385976150632, 2.8382578420860227e-06]
extrinsic_reward :  2 intrinsic_reward :  7.859036445617676
place 543
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 385, 'world': 1, 'x_pos': 543, 'x_pos_screen': 117, 'y_pos': 96}
global_step :  305 , action :  2 action_prob :  [0.0008860238594934344, 0.017210578545928, 0.9467890858650208, 0.00012747572327498347, 0.03230641782283783, 0.002677276963368058, 3.128531488982844e-06]
extrinsic_reward :  2 intrinsic_reward :  7.023093223571777
place 545
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 385, 'world': 1, 'x_pos': 545, 'x_pos_screen': 117, 'y_pos': 91}
global_step :  306 , action :  2 action_prob :  [0.0008570082718506455, 0.01689302921295166, 0.9477116465568542, 0.00012212406

global_step :  323 , action :  2 action_prob :  [0.0008961057174019516, 0.01731969602406025, 0.9464723467826843, 0.00012934915139339864, 0.032476671040058136, 0.0027027090545743704, 3.194267492290237e-06]
extrinsic_reward :  1 intrinsic_reward :  8.524702072143555
place 576
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 384, 'world': 1, 'x_pos': 576, 'x_pos_screen': 117, 'y_pos': 79}
global_step :  324 , action :  2 action_prob :  [0.0008743277285248041, 0.017083028331398964, 0.9471597075462341, 0.00012531406537164003, 0.03210679814219475, 0.0026477694045752287, 3.053129603358684e-06]
extrinsic_reward :  2 intrinsic_reward :  9.144312858581543
place 578
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 384, 'world': 1, 'x_pos': 578, 'x_pos_screen': 117, 'y_pos': 79}
global_step :  325 , action :  2 action_prob :  [0.0008317795000039041, 0.016613038256764412, 0.9485253095626831, 0.000117

global_step :  344 , action :  2 action_prob :  [0.0009722962859086692, 0.018127376213669777, 0.9441280364990234, 0.00014369594282470644, 0.03373159095644951, 0.002893290016800165, 3.711146518980968e-06]
extrinsic_reward :  0 intrinsic_reward :  8.140340805053711
place 594
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 383, 'world': 1, 'x_pos': 594, 'x_pos_screen': 116, 'y_pos': 79}
global_step :  345 , action :  2 action_prob :  [0.0009722962859086692, 0.018127376213669777, 0.9441280364990234, 0.00014369594282470644, 0.03373159095644951, 0.002893290016800165, 3.711146518980968e-06]
extrinsic_reward :  0 intrinsic_reward :  8.452532768249512
place 594
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 383, 'world': 1, 'x_pos': 594, 'x_pos_screen': 116, 'y_pos': 79}
global_step :  346 , action :  2 action_prob :  [0.0009717263164930046, 0.018121425062417984, 0.9441452622413635, 0.00014358

global_step :  365 , action :  2 action_prob :  [0.0009721918613649905, 0.01812630146741867, 0.9441310167312622, 0.00014367583207786083, 0.03372994810342789, 0.0028930259868502617, 3.7104077819094528e-06]
extrinsic_reward :  0 intrinsic_reward :  8.281496047973633
place 594
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 382, 'world': 1, 'x_pos': 594, 'x_pos_screen': 116, 'y_pos': 79}
global_step :  366 , action :  2 action_prob :  [0.0009721918613649905, 0.01812630146741867, 0.9441310167312622, 0.00014367583207786083, 0.03372994810342789, 0.0028930259868502617, 3.7104077819094528e-06]
extrinsic_reward :  0 intrinsic_reward :  8.319443702697754
place 594
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 382, 'world': 1, 'x_pos': 594, 'x_pos_screen': 116, 'y_pos': 79}
global_step :  367 , action :  2 action_prob :  [0.0009723942494019866, 0.018128404393792152, 0.9441249966621399, 0.000143

global_step :  386 , action :  2 action_prob :  [0.0009722425602376461, 0.018126843497157097, 0.944129467010498, 0.00014368518895935267, 0.03373081609606743, 0.0028931465931236744, 3.7107558910065563e-06]
extrinsic_reward :  0 intrinsic_reward :  8.293135643005371
place 594
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 381, 'world': 1, 'x_pos': 594, 'x_pos_screen': 116, 'y_pos': 123}
global_step :  387 , action :  2 action_prob :  [0.0009708055295050144, 0.018111836165189743, 0.9441730976104736, 0.00014341226778924465, 0.033707525581121445, 0.0028895866125822067, 3.7006921047577634e-06]
extrinsic_reward :  0 intrinsic_reward :  8.146827697753906
place 594
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 381, 'world': 1, 'x_pos': 594, 'x_pos_screen': 116, 'y_pos': 124}
global_step :  388 , action :  4 action_prob :  [0.0009721393580548465, 0.018125781789422035, 0.9441327452659607, 0.00

global_step :  406 , action :  2 action_prob :  [0.0008501553093083203, 0.016817031428217888, 0.9479326605796814, 0.00012087251525372267, 0.03168975189328194, 0.002586560556665063, 2.8999622827541316e-06]
extrinsic_reward :  2 intrinsic_reward :  8.93833065032959
place 616
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 380, 'world': 1, 'x_pos': 616, 'x_pos_screen': 123, 'y_pos': 141}
global_step :  407 , action :  2 action_prob :  [0.0008430139278061688, 0.016738183796405792, 0.9481615424156189, 0.00011956086382269859, 0.03156651183962822, 0.002568325726315379, 2.855291768355528e-06]
extrinsic_reward :  1 intrinsic_reward :  7.754384517669678
place 617
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 380, 'world': 1, 'x_pos': 617, 'x_pos_screen': 123, 'y_pos': 146}
global_step :  408 , action :  2 action_prob :  [0.0008496512309648097, 0.016811635345220566, 0.9479480981826782, 0.000120

global_step :  427 , action :  2 action_prob :  [0.0009595361771062016, 0.017994865775108337, 0.944511890411377, 0.00014125704183243215, 0.0335274375975132, 0.0028613777831196785, 3.6218680179445073e-06]
extrinsic_reward :  1 intrinsic_reward :  8.166711807250977
place 652
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 379, 'world': 1, 'x_pos': 652, 'x_pos_screen': 123, 'y_pos': 194}
global_step :  428 , action :  2 action_prob :  [0.0008737269090488553, 0.017076922580599785, 0.9471770524978638, 0.0001251955982297659, 0.03209793195128441, 0.0026461391244083643, 3.0491439702018397e-06]
extrinsic_reward :  2 intrinsic_reward :  9.274295806884766
place 654
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 379, 'world': 1, 'x_pos': 654, 'x_pos_screen': 123, 'y_pos': 195}
global_step :  429 , action :  2 action_prob :  [0.0008095027878880501, 0.016362592577934265, 0.9492533802986145, 0.00011

global_step :  448 , action :  2 action_prob :  [0.0007818369776941836, 0.01604689098894596, 0.950171709060669, 0.0001085075200535357, 0.03047679178416729, 0.0024117587599903345, 2.4865275918273255e-06]
extrinsic_reward :  2 intrinsic_reward :  9.563837051391602
place 689
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 378, 'world': 1, 'x_pos': 689, 'x_pos_screen': 123, 'y_pos': 134}
global_step :  449 , action :  2 action_prob :  [0.0009179763728752732, 0.01755472831428051, 0.9457898139953613, 0.00013343039609026164, 0.03284306451678276, 0.0027576584834605455, 3.3389503641956253e-06]
extrinsic_reward :  2 intrinsic_reward :  11.48947811126709
place 691
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 378, 'world': 1, 'x_pos': 691, 'x_pos_screen': 123, 'y_pos': 129}
global_step :  450 , action :  2 action_prob :  [0.0009832059731706977, 0.018240878358483315, 0.9437985420227051, 0.000145

global_step :  467 , action :  2 action_prob :  [0.0009149533580057323, 0.017522137612104416, 0.9458847045898438, 0.00013286902685649693, 0.03279191628098488, 0.0027501399163156748, 3.3188648558279965e-06]
extrinsic_reward :  1 intrinsic_reward :  10.738787651062012
place 722
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 377, 'world': 1, 'x_pos': 722, 'x_pos_screen': 123, 'y_pos': 102}
global_step :  468 , action :  2 action_prob :  [0.000997027033008635, 0.018383417278528214, 0.9433850049972534, 0.00014842320524621755, 0.03412748500704765, 0.0029545712750405073, 3.886520516971359e-06]
extrinsic_reward :  1 intrinsic_reward :  9.238004684448242
place 723
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 377, 'world': 1, 'x_pos': 723, 'x_pos_screen': 122, 'y_pos': 106}
global_step :  469 , action :  2 action_prob :  [0.0008177916170097888, 0.01645525172352791, 0.9489848017692566, 0.0001

global_step :  488 , action :  2 action_prob :  [0.0008285694639198482, 0.016576305031776428, 0.9486328959465027, 0.00011694160639308393, 0.03131089359521866, 0.0025317207910120487, 2.766327952485881e-06]
extrinsic_reward :  0 intrinsic_reward :  9.871922492980957
place 727
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 376, 'world': 1, 'x_pos': 727, 'x_pos_screen': 126, 'y_pos': 143}
global_step :  489 , action :  2 action_prob :  [0.0008257427834905684, 0.016544634476304054, 0.9487248659133911, 0.00011642801109701395, 0.03126107156276703, 0.00252450630068779, 2.7490220873005455e-06]
extrinsic_reward :  1 intrinsic_reward :  9.883054733276367
place 728
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 376, 'world': 1, 'x_pos': 728, 'x_pos_screen': 127, 'y_pos': 143}
global_step :  490 , action :  2 action_prob :  [0.0008273656130768359, 0.016562793403863907, 0.9486721754074097, 0.00011

global_step :  509 , action :  2 action_prob :  [0.0008090627379715443, 0.01635737158358097, 0.9492686986923218, 0.00011339791672071442, 0.030967095866799355, 0.002481699688360095, 2.6477707706362708e-06]
extrinsic_reward :  2 intrinsic_reward :  9.043231010437012
place 758
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 375, 'world': 1, 'x_pos': 758, 'x_pos_screen': 127, 'y_pos': 197}
global_step :  510 , action :  2 action_prob :  [0.0008379759965464473, 0.016681624576449394, 0.9483264088630676, 0.00011864981206599623, 0.03147687017917633, 0.0025556301698088646, 2.8241838663234375e-06]
extrinsic_reward :  1 intrinsic_reward :  9.151090621948242
place 759
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 375, 'world': 1, 'x_pos': 759, 'x_pos_screen': 127, 'y_pos': 199}
global_step :  511 , action :  2 action_prob :  [0.0008430019952356815, 0.0167375635355711, 0.9481639266014099, 0.00011

global_step :  530 , action :  2 action_prob :  [0.0008618060383014381, 0.016945693641901016, 0.9475587606430054, 0.00012300819798838347, 0.031891677528619766, 0.0026160888373851776, 2.973324171762215e-06]
extrinsic_reward :  1 intrinsic_reward :  9.398947715759277
place 794
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 374, 'world': 1, 'x_pos': 794, 'x_pos_screen': 127, 'y_pos': 195}
global_step :  531 , action :  2 action_prob :  [0.0008514386136084795, 0.016831407323479652, 0.9478906989097595, 0.0001211045091622509, 0.031712599098682404, 0.0025897743180394173, 2.9079510568408296e-06]
extrinsic_reward :  2 intrinsic_reward :  9.276586532592773
place 796
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 374, 'world': 1, 'x_pos': 796, 'x_pos_screen': 127, 'y_pos': 190}
global_step :  532 , action :  2 action_prob :  [0.0007984624826349318, 0.016237298026680946, 0.9496176242828369, 0.00

global_step :  551 , action :  1 action_prob :  [0.0007838335004635155, 0.016070080921053886, 0.9501039981842041, 0.00010886046948144212, 0.030513865873217583, 0.002416839823126793, 2.498127059880062e-06]
extrinsic_reward :  2 intrinsic_reward :  10.11160945892334
place 831
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 373, 'world': 1, 'x_pos': 831, 'x_pos_screen': 127, 'y_pos': 90}
global_step :  552 , action :  2 action_prob :  [0.000956073694396764, 0.017958449199795723, 0.9446176290512085, 0.00014060230751056224, 0.03347083553671837, 0.0028527823742479086, 3.5979119274998084e-06]
extrinsic_reward :  2 intrinsic_reward :  11.150381088256836
place 833
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 373, 'world': 1, 'x_pos': 833, 'x_pos_screen': 127, 'y_pos': 85}
global_step :  553 , action :  2 action_prob :  [0.0008349585114046931, 0.016648605465888977, 0.9484217166900635, 0.00011

info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 372, 'world': 1, 'x_pos': 866, 'x_pos_screen': 127, 'y_pos': 131}
global_step :  572 , action :  2 action_prob :  [0.0008152685477398336, 0.01642715372145176, 0.949066162109375, 0.00011452475155238062, 0.031076567247509956, 0.0024976676795631647, 2.6852671908272896e-06]
extrinsic_reward :  2 intrinsic_reward :  9.59532356262207
place 868
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 372, 'world': 1, 'x_pos': 868, 'x_pos_screen': 127, 'y_pos': 133}
global_step :  573 , action :  2 action_prob :  [0.000769858539570123, 0.01590871810913086, 0.950573742389679, 0.00010637181549100205, 0.03025801293551922, 0.0023808632977306843, 2.4170651613530936e-06]
extrinsic_reward :  1 intrinsic_reward :  10.462928771972656
place 869
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 372, 'world': 1, 'x_pos

global_step :  592 , action :  2 action_prob :  [0.0009246349218301475, 0.017625615000724792, 0.9455840587615967, 0.00013468172983266413, 0.03295314311981201, 0.0027743864338845015, 3.3836447528301505e-06]
extrinsic_reward :  2 intrinsic_reward :  7.619814872741699
place 903
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 371, 'world': 1, 'x_pos': 903, 'x_pos_screen': 127, 'y_pos': 143}
global_step :  593 , action :  4 action_prob :  [0.0009760028915479779, 0.01816594786942005, 0.94401615858078, 0.00014440214727073908, 0.033791303634643555, 0.0029024893883615732, 3.7371894450188847e-06]
extrinsic_reward :  1 intrinsic_reward :  7.728752613067627
place 904
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 371, 'world': 1, 'x_pos': 904, 'x_pos_screen': 127, 'y_pos': 143}
global_step :  594 , action :  2 action_prob :  [0.0008787100668996572, 0.01713068224489689, 0.9470215439796448, 0.00012

global_step :  613 , action :  2 action_prob :  [0.0008571844082325697, 0.016895132139325142, 0.9477053284645081, 0.0001221537822857499, 0.03181293234229088, 0.002604292705655098, 2.944013886008179e-06]
extrinsic_reward :  2 intrinsic_reward :  8.155094146728516
place 948
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 370, 'world': 1, 'x_pos': 948, 'x_pos_screen': 127, 'y_pos': 140}
global_step :  614 , action :  2 action_prob :  [0.0008018205408006907, 0.01627589762210846, 0.9495050311088562, 0.00011208203068235889, 0.030839595943689346, 0.0024629798717796803, 2.6042509944090853e-06]
extrinsic_reward :  2 intrinsic_reward :  9.093160629272461
place 950
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 370, 'world': 1, 'x_pos': 950, 'x_pos_screen': 127, 'y_pos': 138}
global_step :  615 , action :  2 action_prob :  [0.0008331058779731393, 0.016627775505185127, 0.9484825134277344, 0.00011

global_step :  634 , action :  2 action_prob :  [0.0009714575717225671, 0.018118923529982567, 0.9441524147987366, 0.00014353118604049087, 0.033718932420015335, 0.0028911361005157232, 3.705166591316811e-06]
extrinsic_reward :  2 intrinsic_reward :  8.345776557922363
place 985
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 369, 'world': 1, 'x_pos': 985, 'x_pos_screen': 127, 'y_pos': 79}
global_step :  635 , action :  2 action_prob :  [0.0008825364639051259, 0.017172593623399734, 0.9468993544578552, 0.00012683073873631656, 0.03224695101380348, 0.002668492030352354, 3.1059682896739105e-06]
extrinsic_reward :  2 intrinsic_reward :  8.290863990783691
place 987
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 369, 'world': 1, 'x_pos': 987, 'x_pos_screen': 127, 'y_pos': 79}
global_step :  636 , action :  2 action_prob :  [0.0008639786974526942, 0.01696968637406826, 0.9474889039993286, 0.000123

global_step :  654 , action :  2 action_prob :  [0.0008745674276724458, 0.017085712403059006, 0.9471518993377686, 0.00012535725545603782, 0.03211111202836037, 0.002648358466103673, 3.0546482321369695e-06]
extrinsic_reward :  2 intrinsic_reward :  9.348663330078125
place 1026
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 368, 'world': 1, 'x_pos': 1026, 'x_pos_screen': 127, 'y_pos': 79}
global_step :  655 , action :  2 action_prob :  [0.0008308945689350367, 0.016602979972958565, 0.9485545754432678, 0.00011735379666788504, 0.03135383874177933, 0.002537492895498872, 2.7804128421848873e-06]
extrinsic_reward :  2 intrinsic_reward :  7.494332790374756
place 1028
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 368, 'world': 1, 'x_pos': 1028, 'x_pos_screen': 127, 'y_pos': 79}
global_step :  656 , action :  1 action_prob :  [0.0008058101520873606, 0.016321001574397087, 0.9493741393089294, 0.00

global_step :  675 , action :  2 action_prob :  [0.0009242743253707886, 0.017622241750359535, 0.945593535900116, 0.00013460576883517206, 0.03294859454035759, 0.0027733666356652975, 3.3810690638347296e-06]
extrinsic_reward :  2 intrinsic_reward :  6.582877159118652
place 1063
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 367, 'world': 1, 'x_pos': 1063, 'x_pos_screen': 127, 'y_pos': 109}
global_step :  676 , action :  2 action_prob :  [0.0009405582677572966, 0.01779472641646862, 0.9450931549072266, 0.0001376749132759869, 0.033216238021850586, 0.0028141741640865803, 3.4914410207420588e-06]
extrinsic_reward :  2 intrinsic_reward :  5.718194961547852
place 1065
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 367, 'world': 1, 'x_pos': 1065, 'x_pos_screen': 127, 'y_pos': 105}
global_step :  677 , action :  2 action_prob :  [0.0008982364670373499, 0.017342982813715935, 0.9464043974876404, 0.

global_step :  696 , action :  2 action_prob :  [0.000853722624015063, 0.01685672625899315, 0.9478170871734619, 0.00012152211274951696, 0.03175240755081177, 0.0025955578312277794, 2.922275598393753e-06]
extrinsic_reward :  2 intrinsic_reward :  6.924394607543945
place 1100
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 366, 'world': 1, 'x_pos': 1100, 'x_pos_screen': 127, 'y_pos': 79}
global_step :  697 , action :  2 action_prob :  [0.0008545588352717459, 0.01686633564531803, 0.9477887153625488, 0.00012166910892119631, 0.03176804631948471, 0.00259758741594851, 2.9274235657794634e-06]
extrinsic_reward :  1 intrinsic_reward :  6.196002006530762
place 1101
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 366, 'world': 1, 'x_pos': 1101, 'x_pos_screen': 127, 'y_pos': 79}
global_step :  698 , action :  2 action_prob :  [0.0008583554299548268, 0.0169079452753067, 0.9476682543754578, 0.00012237

global_step :  717 , action :  2 action_prob :  [0.0009302587131969631, 0.017685597762465477, 0.945409893989563, 0.00013573530304711312, 0.0330466590821743, 0.002788425888866186, 3.4215061077702558e-06]
extrinsic_reward :  1 intrinsic_reward :  5.078805923461914
place 1136
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 365, 'world': 1, 'x_pos': 1136, 'x_pos_screen': 127, 'y_pos': 4}
global_step :  718 , action :  2 action_prob :  [0.0009030529763549566, 0.017394836992025375, 0.9462538957595825, 0.0001306389458477497, 0.03259420022368431, 0.002720137359574437, 3.239842044422403e-06]
extrinsic_reward :  -15 intrinsic_reward :  3.056934118270874
place 1138
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 365, 'world': 1, 'x_pos': 1138, 'x_pos_screen': 127, 'y_pos': 255}
global_step :  719 , action :  2 action_prob :  [0.000879438768606633, 0.01713913306593895, 0.9469964504241943, 0.000126

global_step :  738 , action :  2 action_prob :  [0.0008698486490175128, 0.01703440025448799, 0.9473005533218384, 0.00012448159395717084, 0.03203125670552254, 0.0026363511569797993, 3.0243556921050185e-06]
extrinsic_reward :  1 intrinsic_reward :  2.8043856620788574
place 53
info {'coins': 0, 'flag_get': False, 'life': 1, 'score': 0, 'stage': 1, 'status': 'small', 'time': 399, 'world': 1, 'x_pos': 53, 'x_pos_screen': 53, 'y_pos': 139}
global_step :  739 , action :  2 action_prob :  [0.0008717903983779252, 0.017055679112672806, 0.9472387433052063, 0.0001248393818968907, 0.032064586877822876, 0.0026412573643028736, 3.0367591534741223e-06]
extrinsic_reward :  1 intrinsic_reward :  2.776517868041992
place 54
info {'coins': 0, 'flag_get': False, 'life': 1, 'score': 0, 'stage': 1, 'status': 'small', 'time': 399, 'world': 1, 'x_pos': 54, 'x_pos_screen': 54, 'y_pos': 140}
global_step :  740 , action :  2 action_prob :  [0.0008717708988115191, 0.017055492848157883, 0.9472392797470093, 0.00012483

global_step :  757 , action :  2 action_prob :  [0.0008735475712455809, 0.01707489788532257, 0.9471831321716309, 0.00012516356946434826, 0.032094668596982956, 0.0026457018684595823, 3.048015059903264e-06]
extrinsic_reward :  2 intrinsic_reward :  5.012538433074951
place 84
info {'coins': 0, 'flag_get': False, 'life': 1, 'score': 0, 'stage': 1, 'status': 'small', 'time': 399, 'world': 1, 'x_pos': 84, 'x_pos_screen': 82, 'y_pos': 104}
global_step :  758 , action :  2 action_prob :  [0.0008265043725259602, 0.016554201021790504, 0.9486960768699646, 0.00011654989793896675, 0.03127767890691757, 0.002526203403249383, 2.7533990305528278e-06]
extrinsic_reward :  1 intrinsic_reward :  4.477341651916504
place 86
info {'coins': 0, 'flag_get': False, 'life': 1, 'score': 0, 'stage': 1, 'status': 'small', 'time': 398, 'world': 1, 'x_pos': 86, 'x_pos_screen': 83, 'y_pos': 99}
global_step :  759 , action :  1 action_prob :  [0.0008535117376595736, 0.016854675486683846, 0.9478229284286499, 0.00012147859

global_step :  778 , action :  2 action_prob :  [0.000850738724693656, 0.016824044287204742, 0.9479116797447205, 0.00012097017315682024, 0.03170161694288254, 0.002587905852124095, 2.9034567887720186e-06]
extrinsic_reward :  1 intrinsic_reward :  5.015350818634033
place 121
info {'coins': 0, 'flag_get': False, 'life': 1, 'score': 0, 'stage': 1, 'status': 'small', 'time': 397, 'world': 1, 'x_pos': 121, 'x_pos_screen': 98, 'y_pos': 79}
global_step :  779 , action :  2 action_prob :  [0.0009444362367503345, 0.017835846170783043, 0.9449737071990967, 0.00013840387691743672, 0.03328033536672592, 0.002823813585564494, 3.5178936741431244e-06]
extrinsic_reward :  3 intrinsic_reward :  4.874264717102051
place 124
info {'coins': 0, 'flag_get': False, 'life': 1, 'score': 0, 'stage': 1, 'status': 'small', 'time': 397, 'world': 1, 'x_pos': 124, 'x_pos_screen': 99, 'y_pos': 79}
global_step :  780 , action :  5 action_prob :  [0.0008434404735453427, 0.016743067651987076, 0.9481474161148071, 0.000119636

KeyboardInterrupt: 

In [57]:
env.close()