In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from torch.nn import init
import copy
import numpy as np

import queue

In [2]:
# Notebook-wide device switch: code below checks this flag and calls .cuda() on the
# model and on tensors when it is True; with False everything stays on the CPU.
gpu = False

In [3]:
class RND(nn.Module):
    """Random Network Distillation pair: a trainable predictor and a frozen target.

    Both networks share the same architecture (three convolutions followed by
    three linear layers producing a 512-d feature). The target starts as a
    deep copy of the predictor, after which every Conv2d/Linear layer of both
    networks is re-initialised orthogonally, so the two end up with different
    weights. Target parameters are excluded from gradient computation.

    Args:
        width, height: the two spatial sizes of the input image. They are
            treated symmetrically when sizing the first linear layer, so it
            does not matter which one ends up as the H or W axis.
        channel: number of input channels.

    Raises:
        ValueError: if a spatial size is too small for the convolution stack.
    """

    # (kernel_size, stride) of the three convolutions, in order.
    _CONV_SPEC = ((8, 4), (4, 2), (3, 1))

    @staticmethod
    def _conv_out(size):
        """Return the length of one spatial axis after the three unpadded convolutions."""
        for kernel, stride in RND._CONV_SPEC:
            size = (size - kernel) // stride + 1
            if size < 1:
                raise ValueError("input is too small for the RND convolution stack")
        return size

    def __init__(self,width = 240, height =256, channel = 3):
        super(RND,self).__init__()
        
        self.width = width
        self.height = height
        self.channel = channel

        # Size of the flattened conv output; equals 46592 for the default 240x256 input.
        feature_size = 64 * self._conv_out(width) * self._conv_out(height)
        
        self.predictor = nn.Sequential(
            nn.Conv2d(
                in_channels=channel,
                out_channels=32,
                kernel_size=8,
                stride=4),
            nn.LeakyReLU(),
            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=4,
                stride=2),
            nn.LeakyReLU(),
            nn.Conv2d(
                in_channels=64,
                out_channels=64,
                kernel_size=3,
                stride=1),
            nn.LeakyReLU(),
            Flatten(),
            nn.Linear(feature_size, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512)
            )
        self.target = copy.deepcopy(self.predictor)

        # Re-initialise predictor AND target layers independently, so the copy
        # above only fixes the architecture, not the weights.
        for p in self.modules():
            if isinstance(p, (nn.Conv2d, nn.Linear)):
                init.orthogonal_(p.weight, np.sqrt(2))
                p.bias.data.zero_()

        # The target is a fixed random network: never trained.
        for param in self.target.parameters():
            param.requires_grad = False
            
    def forward(self, state):
        """Return ``(predict_feature, target_feature)`` for a batch of images.

        Args:
            state: tensor of shape (batch, channel, size_a, size_b).
        """
        target_feature = self.target(state)
        predict_feature = self.predictor(state)
        return predict_feature, target_feature

In [4]:
class Flatten(nn.Module):
    """Layer that keeps the leading (batch) axis and merges all remaining axes into one."""

    def forward(self, input):
        """Return ``input`` viewed as a 2-D tensor of shape (input.size(0), -1)."""
        batch_size = input.size(0)
        flattened = input.view(batch_size, -1)
        return flattened
class PPO(nn.Module):
    """Actor-critic network with one policy head and two value heads.

    A shared convolutional trunk (``basic``) produces a 448-d feature that
    feeds the policy head (``actor``) and two scalar critics, one for the
    extrinsic and one for the intrinsic reward stream.

    Args:
        width, height: the two spatial sizes of the input image. They are
            treated symmetrically when sizing the first linear layer, so it
            does not matter which one ends up as the H or W axis.
        channel: number of input channels.
        action_dim: number of discrete actions.

    Raises:
        ValueError: if a spatial size is too small for the convolution stack.
    """

    # (kernel_size, stride) of the three convolutions, in order.
    _CONV_SPEC = ((8, 4), (4, 2), (3, 1))

    @staticmethod
    def _conv_out(size):
        """Return the length of one spatial axis after the three unpadded convolutions."""
        for kernel, stride in PPO._CONV_SPEC:
            size = (size - kernel) // stride + 1
            if size < 1:
                raise ValueError("input is too small for the PPO convolution stack")
        return size

    def __init__(self,width = 240, height =256, channel = 3, action_dim = 7):
        # Initialise nn.Module before touching any attribute.
        super(PPO,self).__init__()
        self.width = width
        self.height = height
        self.channel = channel
        self.action_dim = action_dim

        # Size of the flattened conv output; equals 46592 for the default 240x256 input.
        feature_size = 64 * self._conv_out(width) * self._conv_out(height)

        self.basic = nn.Sequential(
            nn.Conv2d(in_channels = channel,
                      out_channels = 32,
                      kernel_size = 8,
                      stride = 4),
            nn.ReLU(),
            nn.Conv2d(in_channels = 32,
                      out_channels = 64,
                      kernel_size = 4,
                      stride = 2),
            nn.ReLU(),
            nn.Conv2d(in_channels = 64,
                      out_channels = 64,
                      kernel_size = 3,
                      stride = 1),
            nn.ReLU(),
            Flatten(),
            nn.Linear(feature_size,256),
            nn.ReLU(),
            nn.Linear(256,448),
            nn.ReLU()
            )
        self.actor = nn.Sequential(
            nn.Linear(448,448),
            nn.ReLU(),
            nn.Linear(448,self.action_dim)
            )
        
        self.extrinsic_critic = nn.Linear(448,1)
        self.intrinsic_critic = nn.Linear(448,1)
        
        # Small-gain orthogonal init keeps the initial value estimates and
        # policy logits close to zero (i.e. a near-uniform initial policy).
        init.orthogonal_(self.extrinsic_critic.weight, 0.01)
        self.extrinsic_critic.bias.data.zero_()

        init.orthogonal_(self.intrinsic_critic.weight, 0.01)
        self.intrinsic_critic.bias.data.zero_()

        for layer in self.actor:
            if isinstance(layer, nn.Linear):
                init.orthogonal_(layer.weight, 0.01)
                layer.bias.data.zero_()

    def forward(self, x,dim = -1):
        """Return ``(action_prob, extrinsic_value, intrinsic_value)``.

        Args:
            x: image batch of shape (batch, channel, size_a, size_b).
            dim: axis over which the policy logits are soft-maxed.
        """
        x = self.basic(x)
        action = self.actor(x)
        action_prob = F.softmax(action,dim = dim)
        
        intrinsic = self.intrinsic_critic(x)
        extrinsic = self.extrinsic_critic(x)
        return action_prob,extrinsic,intrinsic    

    


In [5]:
# ---- optimisation / rollout schedule ----
learning_rate = 0.0001   # note: Agent's constructor has its own `learning_rate` default
K_epoch = 4              # optimisation passes over each collected batch
T_horizon = 128          # rollout length setting (the training cell reassigns it)

# ---- discounting ----
gamma = 0.98             # generic discount factor (the per-stream discounts below are separate)
lmbda = 0.95             # lambda of the advantage recursion
extrinsic_gamma = 0.999  # discount of the extrinsic (environment) reward stream
intrinsic_gamma = 0.99   # discount of the intrinsic (curiosity) reward stream

# ---- loss weights ----
eps_clip = 0.1           # clipping range of the probability ratio
critic_coef = 0.5        # weight of the value losses
ent_coef = 0.001         # weight of the entropy bonus
update_proportion = 0.25 # probability with which a sample is kept by the random loss mask

# ---- advantage mixing ----
extrinsic_advantage_coef = 2
intrinsic_advantage_coef = 1
class Agent(nn.Module):
    """PPO agent with an RND curiosity bonus and one rollout buffer per environment.

    Attributes:
        memory: ``memory[i]`` is the list of transitions collected for
            environment ``i`` since its last update.
        intrinsic_queue_list: per-environment queues of recent raw intrinsic
            rewards (filled by the rollout code).
        intrinsic_input_queue_list: per-environment queues of recent
            next-state tensors, used to normalise the RND input.
        ppo, rnd: the policy/value network and the RND predictor/target pair.
        optimizer: a single Adam optimiser over all parameters of the agent.

    Args:
        width, height, channel: observation geometry forwarded to PPO and RND.
        action_dim: number of discrete actions.
        learning_rate: Adam step size. This argument (default 0.0005) is what
            the optimiser uses; the module-level ``learning_rate`` constant is
            not read by this class.
        num_envs: number of parallel environments. When omitted, the
            notebook-level ``env_num`` is used, as before.
    """

    def __init__(self,width=240,height=256,channel = 3,action_dim=7,learning_rate=0.0005,num_envs=None):
        # Initialise nn.Module before touching any attribute.
        super(Agent,self).__init__()

        self.width = width
        self.height = height
        self.channel = channel
        self.action_dim = action_dim

        if num_envs is None:
            num_envs = env_num
        
        self.memory = [[] for _ in range(num_envs)]
        self.intrinsic_queue_list = [queue.Queue() for _ in range(num_envs)]
        self.intrinsic_input_queue_list = [queue.Queue() for _ in range(num_envs)]
        self.ppo = PPO(self.width, self.height, self.channel, self.action_dim)
        self.rnd = RND(self.width, self.height , self.channel)
        self.optimizer = optim.Adam(self.parameters(),lr = learning_rate)

    def put_data(self,i,data):
        """Append one transition tuple to the buffer of environment ``i``.

        ``data`` is ``(state, action, extrinsic_reward, intrinsic_reward,
        next_state, action_prob, done)``.
        """
        self.memory[i].append(data)

    def make_batch(self, i):
        """Turn the buffer of environment ``i`` into tensors and clear it.

        Returns:
            ``(state, action, extrinsic_reward, intrinsic_reward, next_state,
            extrinsic_mask, intrinsic_mask, action_prob)``. Every per-step
            scalar becomes a column of shape (T, 1). ``extrinsic_mask`` is 0
            on terminal steps and 1 otherwise; ``intrinsic_mask`` is always 1
            (the intrinsic stream is treated as non-episodic). Tensors are
            moved to CUDA when the notebook-level ``gpu`` flag is set.
        """
        states, actions, extrinsic_rewards, intrinsic_rewards = [], [], [], []
        next_states, probs, extrinsic_masks, intrinsic_masks = [], [], [], []
        for state, action, extrinsic_reward, intrinsic_reward, next_state, prob, done in self.memory[i]:
            states.append(state)
            actions.append([action])
            extrinsic_rewards.append([extrinsic_reward])
            intrinsic_rewards.append([intrinsic_reward])
            next_states.append(next_state)
            extrinsic_masks.append([0 if done else 1])
            intrinsic_masks.append([1])
            probs.append([prob])
        self.memory[i] = []

        batch = (
            torch.tensor(states, dtype=torch.float),
            torch.tensor(actions),
            torch.tensor(extrinsic_rewards, dtype=torch.float),
            torch.tensor(intrinsic_rewards, dtype=torch.float),
            torch.tensor(next_states, dtype=torch.float),
            torch.tensor(extrinsic_masks, dtype=torch.float),
            torch.tensor(intrinsic_masks, dtype=torch.float),
            torch.tensor(probs, dtype=torch.float),
        )
        if gpu:
            batch = tuple(t.cuda() for t in batch)
        return batch

    @staticmethod
    def _discounted_advantage(delta, discount):
        """Backward recursion ``A_t = delta_t + discount * lmbda * A_{t+1}`` over a (T, 1) tensor."""
        advantage = torch.zeros_like(delta)
        running = 0.0
        for t in reversed(range(delta.size(0))):
            running = delta[t] + discount * lmbda * running
            advantage[t] = running
        return advantage

    def _normalize_next_state(self, i, next_state):
        """Standardise ``next_state`` with the recent-observation window of environment ``i``.

        Mirrors the rollout-time preprocessing: subtract the per-pixel mean,
        divide by (std + 1e-8) and clamp to [-5, 5].
        """
        window = torch.cat(list(self.intrinsic_input_queue_list[i].queue)).to(next_state.device)
        mean = window.mean(dim = 0)
        if window.size(0) > 1:
            std = window.std(dim = 0)
        else:
            # A single sample has no defined std; fall back to zero like the rollout does.
            std = torch.zeros_like(mean)
        return torch.clamp((next_state - mean) / (std + 1e-8), -5, 5)

    def train(self,i):
        """Run ``K_epoch`` PPO + RND updates on the data buffered for environment ``i``.

        The loss combines the clipped PPO surrogate (on the weighted sum of
        extrinsic and intrinsic advantages), the two value losses, an entropy
        bonus and the RND predictor regression loss.
        """
        state, action, extrinsic_reward, intrinsic_reward, next_state, \
            extrinsic_mask, intrinsic_mask, old_prob = self.make_batch(i)

        # The rollout stores each observation with a leading singleton axis,
        # giving (T, 1, C, H, W); drop only that axis so T == 1 stays batched.
        state = state.squeeze(1)
        next_state = next_state.squeeze(1)

        # Loop-invariant: the RND input depends only on stored data.
        preprocessed_next_state = self._normalize_next_state(i, next_state)

        for _ in range(K_epoch):
            predicted_action, predicted_extrinsic, predicted_intrinsic = self.ppo(state)
            _, predicted_next_extrinsic, predicted_next_intrinsic = self.ppo(next_state)
            predict_feature, target_feature = self.rnd(preprocessed_next_state)

            # Bootstrapped one-step targets; they are constants for the optimiser.
            extrinsic_target = (extrinsic_reward +
                                extrinsic_gamma * predicted_next_extrinsic * extrinsic_mask).detach()
            intrinsic_target = (intrinsic_reward +
                                intrinsic_gamma * predicted_next_intrinsic * intrinsic_mask).detach()

            # TD residual is target - V(s); each stream is discounted with its own gamma.
            extrinsic_advantage = self._discounted_advantage(
                extrinsic_target - predicted_extrinsic.detach(), extrinsic_gamma)
            intrinsic_advantage = self._discounted_advantage(
                intrinsic_target - predicted_intrinsic.detach(), intrinsic_gamma)
            advantage = extrinsic_advantage_coef * extrinsic_advantage + \
                        intrinsic_advantage_coef * intrinsic_advantage

            # Intrinsic value loss on a random subset of the samples. The mask
            # has the same (T, 1) shape as the error so nothing broadcasts to (T, T).
            # NOTE(review): the reference RND code applies this kind of mask to the
            # predictor loss rather than to the intrinsic value loss - confirm intent.
            intrinsic_error = (intrinsic_target - predicted_intrinsic).pow(2)
            masking = (torch.rand_like(intrinsic_error) < update_proportion).float()
            intrinsic_error = (intrinsic_error * masking).sum() / torch.clamp(masking.sum(), min = 1.0)

            entropy = Categorical(predicted_action).entropy().mean()

            now_action = predicted_action.gather(1,action)
            ratio = torch.exp(torch.log(now_action) - torch.log(old_prob))
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio , 1-eps_clip, 1 + eps_clip) * advantage
            loss = - torch.min(surr1,surr2) \
                   + critic_coef * (F.smooth_l1_loss(predicted_extrinsic, extrinsic_target) + intrinsic_error) \
                   - ent_coef * entropy \
                   + F.mse_loss(predict_feature, target_feature.detach())
            
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

In [7]:
# Run configuration: number of training epochs and of parallel emulator instances.
epochs = 1000
env_num = 4
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

# One Super Mario Bros environment per worker, each restricted to the
# SIMPLE_MOVEMENT action set through the JoypadSpace wrapper.
env_list = [
    JoypadSpace(gym_super_mario_bros.make('SuperMarioBros-v0'), SIMPLE_MOVEMENT)
    for _ in range(env_num)
]

In [9]:
# Build the agent once; move it to CUDA only when the notebook-level `gpu` switch is on.
model = Agent()
if gpu:
    model = model.cuda()


In [None]:
T_horizon = 20


def _frame_to_tensor(frame):
    """Convert one environment frame into a (1, C, H, W) float tensor.

    The frame is divided by 255 and its last axis is moved to the front
    (assumes a channels-last image with 0-255 values - confirm against the env).
    """
    scaled = np.array(frame)/255
    #state = np.transpose(state,(2,0,1))
    return torch.tensor(np.moveaxis(scaled, -1, 0)).float().unsqueeze(0)


for epoch in range(epochs):
    global_step_list = [0 for _ in range(env_num)]
    # Start each epoch with fresh normalisation windows. These are the
    # attributes the rollout below actually reads.
    model.intrinsic_queue_list = [queue.Queue() for _ in range(env_num)]
    model.intrinsic_input_queue_list = [queue.Queue() for _ in range(env_num)]
    state_list = [_frame_to_tensor(env.reset()) for env in env_list]
    done_list = [False for _ in range(env_num)]
    
    action_prob_list = [[] for _ in range(env_num)]
    m_list = [[] for _ in range(env_num)]
    action_list = [[] for _ in range(env_num)]
    next_state_list = [[] for _ in range(env_num)]
    extrinsic_reward_list = [[] for _ in range(env_num)]
    info_list = [[] for _ in range(env_num)]
    while not all(done_list) :
        for i in range(env_num):
            # A finished environment must not be stepped again before the next reset.
            if done_list[i]:
                continue
            for t in range(T_horizon):
                #env.render()
                global_step_list[i] +=1

                # Acting needs no gradients; only plain numbers are stored from here.
                with torch.no_grad():
                    if gpu:
                        action_prob_list[i], _ , _ = model.ppo.forward(state_list[i].cuda())
                    else:
                        action_prob_list[i], _ , _ = model.ppo.forward(state_list[i])
                m_list[i] = Categorical(action_prob_list[i])
                
                action_list[i] = m_list[i].sample().item()


                next_state, extrinsic_reward_list[i], done_list[i], info_list[i] = env_list[i].step(action_list[i])
                next_state = _frame_to_tensor(next_state)
                next_state_list[i] = next_state

                # Sliding window of recent observations used to normalise the RND input.
                model.intrinsic_input_queue_list[i].put(next_state)
                if len(model.intrinsic_input_queue_list[i].queue) > 128:
                    model.intrinsic_input_queue_list[i].get()
                intrinsic_next_state_mean = \
                        torch.mean(torch.cat(list(model.intrinsic_input_queue_list[i].queue)),dim = 0)
                if len(model.intrinsic_input_queue_list[i].queue) == 1:
                    # std of a single sample is undefined; use zero instead.
                    intrinsic_next_state_std = torch.zeros(1)
                else:
                    intrinsic_next_state_std = \
                            torch.std(torch.cat(list(model.intrinsic_input_queue_list[i].queue)),dim = 0)

                preprocessed_next_state = \
                        torch.clamp(((next_state - intrinsic_next_state_mean) / (intrinsic_next_state_std + 1e-8)), -5,5)


                #(model.intrinsic_input_queue)
                with torch.no_grad():
                    if gpu:
                        predictor,target = model.rnd.forward(preprocessed_next_state.cuda())
                    else:
                        predictor,target = model.rnd.forward(preprocessed_next_state)
                # Curiosity bonus: half the squared predictor/target distance,
                # standardised with the recent-reward window of this environment.
                intrinsic_reward = (predictor - target).pow(2).sum(1) / 2
                if len(model.intrinsic_queue_list[i].queue) > 128:
                    model.intrinsic_queue_list[i].get()
                model.intrinsic_queue_list[i].put(intrinsic_reward.item())
                intrinsic_mean = np.mean(model.intrinsic_queue_list[i].queue)
                intrinsic_std = np.std(model.intrinsic_queue_list[i].queue)
                intrinsic_reward = (intrinsic_reward - intrinsic_mean) / (intrinsic_std+ 1e-8)

                # Custom termination: end the episode on these time/life values and
                # store the penalty as the extrinsic reward that goes into the buffer.
                if info_list[i]['time'] == 0 or info_list[i]['life'] == 1 or info_list[i]['time'] == 300:
                    done_list[i] = True
                    extrinsic_reward_list[i] = -10.

                model.put_data(i,((state_list[i].tolist(), \
                                action_list[i], extrinsic_reward_list[i],\
                                (intrinsic_reward.item()), next_state.tolist(), \
                                action_prob_list[i][0][action_list[i]].item(), done_list[i])))
                print('env number : ',i,', global_step : ',global_step_list[i],', action : ', action_list[i],'action_prob : ',action_prob_list[i].tolist()[0])
                print('extrinsic_reward : ',extrinsic_reward_list[i],'intrinsic_reward : ',intrinsic_reward.item())
                print('place',info_list[i]['x_pos'])
                #print('info',info_list[i])
                if done_list[i] :
                    print('epoch : ',epoch, ', global_step : ',global_step_list[i])
                    break
                state_list[i] = next_state_list[i]
        # Update on every environment that has collected more than one transition.
        for i in range(env_num) :
            if len(model.memory[i]) > 1:
                model.train(i)
    #env.render()

#env.close()

env number :  0 , global_step :  1 , action :  5 action_prob :  [0.14285747706890106, 0.14285731315612793, 0.14285649359226227, 0.142857164144516, 0.1428568810224533, 0.1428576111793518, 0.14285707473754883]
extrinsic_reward :  0 intrinsic_reward :  0.7073347568511963
place 40
env number :  0 , global_step :  2 , action :  6 action_prob :  [0.14285747706890106, 0.14285731315612793, 0.14285649359226227, 0.142857164144516, 0.1428568810224533, 0.1428576111793518, 0.14285707473754883]
extrinsic_reward :  0 intrinsic_reward :  0.6968941688537598
place 40
env number :  0 , global_step :  3 , action :  2 action_prob :  [0.14285747706890106, 0.14285731315612793, 0.14285649359226227, 0.142857164144516, 0.1428568810224533, 0.1428576111793518, 0.14285707473754883]
extrinsic_reward :  -1 intrinsic_reward :  0.680672287940979
place 39
env number :  0 , global_step :  4 , action :  4 action_prob :  [0.14285746216773987, 0.14285729825496674, 0.14285649359226227, 0.142857164144516, 0.1428568810224533,

In [23]:
# Ad-hoc inspection cell: show the current `state_list` entry of environment 0.
# NOTE(review): its execution count is out of sequence with the cells above, so the
# saved output may not match a fresh top-to-bottom run - confirm or remove.
state_list[0]

[]