In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from torch.nn import init
import copy
import numpy as np
import math
import queue
from IPython.display import clear_output


In [1]:
class NoisyLinear(nn.Module):
    """Linear layer with an additional factorised-Gaussian noisy branch.

    The output is ``linear(x, weight, bias)`` plus
    ``linear(x, noisy_weight * noise, noisy_bias * out_noise)``, where
    ``noise`` is the outer product of ``out_noise`` and ``in_noise``.

    Args:
        in_features: size of each input sample.
        out_features: size of each output sample.
        sigma0: noise scale; every noise element is drawn with standard
            deviation ``sigma0 / sqrt(in_features)``.
    """

    def __init__(self, in_features, out_features, sigma0=0.5):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        self.bias = nn.Parameter(torch.empty(out_features))
        self.noisy_weight = nn.Parameter(
            torch.empty(out_features, in_features))
        self.noisy_bias = nn.Parameter(torch.empty(out_features))
        self.noise_std = sigma0 / math.sqrt(self.in_features)

        self.reset_parameters()
        self.register_noise()

    def register_noise(self):
        """Register the noise buffers, zero-filled.

        Zeros (rather than uninitialised memory) make a forward pass in eval
        mode well defined before the first training-mode forward: the noisy
        branch then contributes exactly nothing.
        """
        self.register_buffer('in_noise', torch.zeros(self.in_features))
        self.register_buffer('out_noise', torch.zeros(self.out_features))
        self.register_buffer(
            'noise', torch.zeros(self.out_features, self.in_features))

    def sample_noise(self):
        """Draw fresh input/output noise and rebuild the weight noise matrix.

        New tensors are assigned to the buffers instead of overwriting them
        in place: ``forward`` multiplies parameters by these buffers, so
        autograd keeps them for the backward pass, and an in-place redraw
        during a second forward would invalidate the graph of the first one.
        """
        self.in_noise = torch.empty_like(self.in_noise).normal_(
            0, self.noise_std)
        self.out_noise = torch.empty_like(self.out_noise).normal_(
            0, self.noise_std)
        self.noise = torch.mm(
            self.out_noise.view(-1, 1), self.in_noise.view(1, -1))

    def reset_parameters(self):
        """Initialise all four parameter tensors uniformly in ``[-stdv, stdv]``
        with ``stdv = 1 / sqrt(in_features)``."""
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        self.noisy_weight.data.uniform_(-stdv, stdv)
        self.bias.data.uniform_(-stdv, stdv)
        self.noisy_bias.data.uniform_(-stdv, stdv)

    def forward(self, x):
        """Return the sum of the plain and the noisy linear outputs.

        Noise is re-sampled on every call while the module is in training
        mode; in eval mode the most recently sampled noise is reused.
        """
        if self.training:
            self.sample_noise()

        normal_y = nn.functional.linear(x, self.weight, self.bias)
        noisy_weight = self.noisy_weight * self.noise
        noisy_bias = self.noisy_bias * self.out_noise
        noisy_y = nn.functional.linear(x, noisy_weight, noisy_bias)
        return noisy_y + normal_y

    def __repr__(self):
        return (f'{self.__class__.__name__}('
                f'in_features={self.in_features}, '
                f'out_features={self.out_features})')


class Flatten(nn.Module):
    """Collapse every axis after the first into a single axis."""

    def forward(self, input):
        # Keep the leading (batch) axis and let view() infer the rest.
        batch = input.size(0)
        return input.view(batch, -1)


class PPO(nn.Module):
    """Convolutional actor-critic with separate extrinsic/intrinsic value heads.

    The feature extractor is hard-wired to 4 input channels and to a
    flattened conv output of ``7 * 7 * 64`` values (which is what an 84x84
    input produces with these kernel sizes and strides).

    Args:
        input_size: accepted for interface compatibility; not used by the
            network construction.
        output_size: number of discrete actions (width of the policy head).
        use_noisy_net: build the fully connected layers from ``NoisyLinear``
            instead of ``nn.Linear``.
    """

    def __init__(self, input_size, output_size, use_noisy_net=True):
        # The original passed an undefined class name to super(), which
        # raised NameError as soon as the model was constructed.
        super().__init__()

        if use_noisy_net:
            print('use NoisyNet')
            linear = NoisyLinear
        else:
            linear = nn.Linear

        self.feature = nn.Sequential(
            nn.Conv2d(
                in_channels=4,
                out_channels=32,
                kernel_size=8,
                stride=4),
            nn.ReLU(),
            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=4,
                stride=2),
            nn.ReLU(),
            nn.Conv2d(
                in_channels=64,
                out_channels=64,
                kernel_size=3,
                stride=1),
            nn.ReLU(),
            Flatten(),
            linear(
                7 * 7 * 64,
                256),
            nn.ReLU(),
            linear(
                256,
                448),
            nn.ReLU()
        )

        self.actor = nn.Sequential(
            linear(448, 448),
            nn.ReLU(),
            linear(448, output_size)
        )

        self.extra_layer = nn.Sequential(
            linear(448, 448),
            nn.ReLU()
        )

        self.critic_ext = linear(448, 1)
        self.critic_int = linear(448, 1)

        # Orthogonal init for every conv / plain linear layer. NoisyLinear is
        # not an nn.Linear subclass, so noisy layers keep their own init here.
        for module in self.modules():
            if isinstance(module, (nn.Conv2d, nn.Linear)):
                init.orthogonal_(module.weight, np.sqrt(2))
                module.bias.data.zero_()

        # The value heads expose .weight/.bias for both layer types, so they
        # are always re-initialised with a small gain.
        init.orthogonal_(self.critic_ext.weight, 0.01)
        self.critic_ext.bias.data.zero_()

        init.orthogonal_(self.critic_int.weight, 0.01)
        self.critic_int.bias.data.zero_()

        for layer in self.actor:
            if isinstance(layer, nn.Linear):
                init.orthogonal_(layer.weight, 0.01)
                layer.bias.data.zero_()

        for layer in self.extra_layer:
            if isinstance(layer, nn.Linear):
                init.orthogonal_(layer.weight, 0.1)
                layer.bias.data.zero_()

    def forward(self, state, dim=-1):
        """Return ``(policy, value_ext, value_int)`` for a batch of states.

        Args:
            state: input batch fed to the convolutional feature extractor.
            dim: axis over which the policy logits are soft-maxed.
        """
        x = self.feature(state)
        policy = F.softmax(self.actor(x), dim=dim)
        # Both value heads read the same residual features; compute them once
        # instead of running extra_layer twice per forward pass.
        value_features = self.extra_layer(x) + x
        value_ext = self.critic_ext(value_features)
        value_int = self.critic_int(value_features)
        return policy, value_ext, value_int


class RND(nn.Module):
    """Pair of networks for Random Network Distillation.

    ``target`` is a randomly initialised network whose parameters are frozen;
    ``predictor`` is a deeper network over the same kind of convolutional
    trunk. Both map a 1-channel input to a 512-dimensional feature vector.
    """

    def __init__(self, input_size, output_size):
        super(RND, self).__init__()

        # Stored for reference only; the layer sizes below are fixed.
        self.input_size = input_size
        self.output_size = output_size

        flat_dim = 7 * 7 * 64

        def conv_trunk():
            # Fresh, unshared layers on every call.
            return [
                nn.Conv2d(in_channels=1, out_channels=32,
                          kernel_size=8, stride=4),
                nn.LeakyReLU(),
                nn.Conv2d(in_channels=32, out_channels=64,
                          kernel_size=4, stride=2),
                nn.LeakyReLU(),
                nn.Conv2d(in_channels=64, out_channels=64,
                          kernel_size=3, stride=1),
                nn.LeakyReLU(),
                Flatten(),
            ]

        self.predictor = nn.Sequential(
            *conv_trunk(),
            nn.Linear(flat_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
        )

        self.target = nn.Sequential(
            *conv_trunk(),
            nn.Linear(flat_dim, 512),
        )

        # Orthogonal weights and zero biases for every conv / linear layer.
        for module in self.modules():
            if isinstance(module, (nn.Conv2d, nn.Linear)):
                init.orthogonal_(module.weight, np.sqrt(2))
                module.bias.data.zero_()

        # The target network is never trained.
        for frozen in self.target.parameters():
            frozen.requires_grad = False

    def forward(self, next_obs):
        """Return ``(predict_feature, target_feature)`` for ``next_obs``."""
        target_feature = self.target(next_obs)
        predict_feature = self.predictor(next_obs)
        return predict_feature, target_feature

In [None]:
# Global switch read by Agent.make_batch, the model construction cell and the
# rollout loop: when True, the model and its input tensors are moved to CUDA.
gpu = False

In [None]:
# Module-level hyper-parameters. Agent.train reads most of these as globals.
# NOTE(review): Agent() is constructed without arguments in this file, so the
# optimizer uses Agent's own default (0.0005), not this value.
learning_rate = 0.0001
# gamma * lmbda is the decay used when accumulating advantages in Agent.train.
gamma         = 0.98
lmbda         = 0.95
# Half-width of the clipping interval applied to the probability ratio.
eps_clip      = 0.1
# Number of optimisation passes Agent.train makes over one batch.
K_epoch       = 4
# NOTE(review): not referenced in the code visible here; the rollout loop uses
# T_horizon_list instead.
T_horizon     = 128
# Weights of the value/RND term and of the entropy bonus in the loss.
critic_coef = 0.5
ent_coef = 0.001
# Discount factors used in the intrinsic / extrinsic TD targets.
intrinsic_gamma = 0.99
extrinsic_gamma = 0.999
# Threshold for the random mask applied to the RND prediction error.
update_proportion = 0.25

# Weights used to combine the two advantage streams before the PPO surrogate.
extrinsic_advantage_coef = 2
intrinsic_advantage_coef = 1
class Agent(nn.Module):
    """PPO agent with an RND curiosity module and per-environment buffers.

    Relies on module-level names defined elsewhere in this file: ``env_num``,
    ``gpu`` and the hyper-parameters read in ``train`` (``K_epoch``,
    ``gamma``, ``lmbda``, ``eps_clip``, ``critic_coef``, ``ent_coef``,
    ``intrinsic_gamma``, ``extrinsic_gamma``, ``update_proportion`` and the
    two advantage coefficients).

    NOTE: ``train(i)`` shadows ``nn.Module.train(mode)``. ``eval()`` is
    implemented as ``self.train(False)``, so calling ``eval()`` on an Agent
    would start an optimisation step on buffer 0 instead of switching modes.

    NOTE(review): PPO's first convolution expects 4 input channels, RND's
    expects 1, and both hard-code a 7*7*64 flatten size. Frames shaped like
    the defaults here (3 channels, 240x256) will not fit without additional
    preprocessing -- confirm the intended observation pipeline.

    Args:
        width, height, channel: description of the observation; forwarded to
            the sub-networks as their (currently unused) ``input_size``.
        action_dim: number of discrete actions.
        learning_rate: Adam step size.
    """

    def __init__(self, width=240, height=256, channel=3, action_dim=7,
                 learning_rate=0.0005):
        super(Agent, self).__init__()

        self.width = width
        self.height = height
        self.channel = channel
        self.action_dim = action_dim

        # One transition buffer and two bounded statistics queues per env.
        self.memory = [[] for _ in range(env_num)]
        self.intrinsic_queue_list = [queue.Queue() for _ in range(env_num)]
        self.intrinsic_input_queue_list = [
            queue.Queue() for _ in range(env_num)]

        # PPO and RND both take (input_size, output_size); the original call
        # passed width/height/channel as separate positionals and raised
        # TypeError.
        input_size = (self.width, self.height, self.channel)
        self.ppo = PPO(input_size, self.action_dim)
        self.rnd = RND(input_size, self.action_dim)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def put_data(self, i, data):
        """Append one transition tuple to the buffer of environment ``i``."""
        self.memory[i].append(data)

    def make_batch(self, i):
        """Drain buffer ``i`` and return it as a tuple of tensors.

        Each stored transition is ``(state, action, extrinsic_reward,
        intrinsic_reward, next_state, prob, done)``.

        Returns:
            ``(state, action, extrinsic_reward, intrinsic_reward, next_state,
            extrinsic_mask, intrinsic_mask, prob)``. The extrinsic mask is 0
            where ``done`` was set and 1 otherwise; the intrinsic mask is
            always 1. Tensors are moved to CUDA when ``gpu`` is true.
        """
        state_list, action_list = [], []
        extrinsic_reward_list, intrinsic_reward_list = [], []
        next_state_list, prob_list = [], []
        extrinsic_done_list, intrinsic_done_list = [], []

        for (state, action, extrinsic_reward, intrinsic_reward,
             next_state, prob, done) in self.memory[i]:
            state_list.append(state)
            action_list.append([action])
            extrinsic_reward_list.append([extrinsic_reward])
            intrinsic_reward_list.append([intrinsic_reward])
            next_state_list.append(next_state)
            extrinsic_done_list.append([0 if done else 1])
            intrinsic_done_list.append([1])
            prob_list.append([prob])
        self.memory[i] = []

        batch = (
            torch.tensor(state_list, dtype=torch.float),
            torch.tensor(action_list),
            torch.tensor(extrinsic_reward_list, dtype=torch.float),
            torch.tensor(intrinsic_reward_list, dtype=torch.float),
            torch.tensor(next_state_list, dtype=torch.float),
            torch.tensor(extrinsic_done_list, dtype=torch.float),
            torch.tensor(intrinsic_done_list, dtype=torch.float),
            torch.tensor(prob_list, dtype=torch.float),
        )
        if gpu:
            batch = tuple(t.cuda() for t in batch)
        return batch

    @staticmethod
    def _accumulate_advantage(delta):
        """Backward-accumulate (T, 1) TD residuals with decay ``gamma * lmbda``."""
        running = 0.0
        advantages = []
        for delta_t in reversed(delta.detach().reshape(-1).tolist()):
            running = gamma * lmbda * running + delta_t
            advantages.append([running])
        advantages.reverse()
        return torch.tensor(
            advantages, dtype=torch.float, device=delta.device)

    def _normalize_next_state(self, i, next_state):
        """Whiten ``next_state`` with the frame statistics queued for env ``i``."""
        frames = torch.cat(list(self.intrinsic_input_queue_list[i].queue))
        mean = torch.mean(frames, dim=0).to(next_state.device)
        if frames.size(0) < 2:
            # std of a single sample is NaN; fall back to zero like the
            # rollout loop does.
            std = torch.zeros(1, device=next_state.device)
        else:
            std = torch.std(frames, dim=0).to(next_state.device)
        return torch.clamp((next_state - mean) / (std + 1e-8), -5, 5)

    def train(self, i):
        """Run ``K_epoch`` PPO + RND updates on the buffer of environment ``i``.

        Does nothing when the buffer is empty.
        """
        if not self.memory[i]:
            return

        (state, action, extrinsic_reward, intrinsic_reward, next_state,
         extrinsic_done_list, intrinsic_done_list, action_prob) = \
            self.make_batch(i)

        # Rollout stores each frame with a leading singleton batch axis.
        state = state.squeeze(1)
        next_state = next_state.squeeze(1)
        device = state.device

        # The frame queue does not change during the update, so the
        # normalised RND input is the same for every epoch. The original read
        # the std queue from the global `model` rather than from `self`.
        preprocessed_next_state = self._normalize_next_state(i, next_state)

        for k in range(K_epoch):
            predicted_action, predicted_extrinsic, predicted_intrinsic = \
                self.ppo(state)
            _, predicted_next_extrinsic, predicted_next_intrinsic = \
                self.ppo(next_state)
            predict_feature, target_feature = self.rnd(preprocessed_next_state)

            # Extrinsic advantage. The TD residual is target - V(s); the
            # original subtracted V(s') here, which cancels the bootstrap.
            td_error = extrinsic_reward + extrinsic_gamma * \
                predicted_next_extrinsic * extrinsic_done_list
            advantage = self._accumulate_advantage(
                td_error - predicted_extrinsic)

            # Intrinsic advantage (mask is all ones, i.e. non-episodic).
            intrinsic_td_error = intrinsic_reward + intrinsic_gamma * \
                predicted_next_intrinsic * intrinsic_done_list
            intrinsic_advantage = self._accumulate_advantage(
                intrinsic_td_error - predicted_intrinsic)
            # NOTE(review): both accumulations decay with `gamma`, while the
            # TD targets use extrinsic_gamma / intrinsic_gamma -- confirm.

            # RND predictor loss on a random subset of entries.
            # NOTE(review): sum(0) reduces over the batch axis, so the mask
            # selects feature columns rather than samples -- confirm intent.
            intrinsic_error = \
                (predict_feature - target_feature.detach()).pow(2).sum(0)
            masking = torch.rand(len(intrinsic_error), device=device)
            masking = (masking < update_proportion).float()
            intrinsic_error = (intrinsic_error * masking).sum() / \
                masking.sum().clamp(min=1.0)

            entropy = Categorical(predicted_action).entropy().mean()
            now_action = predicted_action.gather(1, action)

            ratio = torch.exp(torch.log(now_action) - torch.log(action_prob))
            advantage = extrinsic_advantage_coef * advantage + \
                intrinsic_advantage_coef * intrinsic_advantage
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1 - eps_clip, 1 + eps_clip) * advantage

            # Both value heads need a regression target; without the second
            # term the intrinsic critic is never trained even though its
            # output drives the intrinsic advantage.
            critic_loss = \
                F.smooth_l1_loss(predicted_extrinsic, td_error.detach()) + \
                F.smooth_l1_loss(predicted_intrinsic,
                                 intrinsic_td_error.detach())
            loss = - torch.min(surr1, surr2).mean() \
                + critic_coef * (critic_loss + intrinsic_error) \
                - ent_coef * entropy

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

In [None]:
# Number of outer training iterations and of parallel emulator instances.
epochs = 1000
env_num = 4
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

# Create env_num copies of world 1-1, then restrict each to SIMPLE_MOVEMENT.
raw_env_list = [gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
                for _ in range(env_num)]
env_list = [JoypadSpace(raw_env, SIMPLE_MOVEMENT) for raw_env in raw_env_list]

# Alternative kept from the original: one different stage per environment.
#env_list = [gym_super_mario_bros.make('SuperMarioBros-1-'+str(i)+'-v0') for i in range(1,env_num+1)]
#env_list = [JoypadSpace(env, SIMPLE_MOVEMENT) for env in env_list]

In [None]:
# Build the agent and, when requested, move it to the GPU.
model = Agent()
if gpu:
    model = model.cuda()

In [None]:
# Rollout length per environment before control passes to the next one.
T_horizon_list = [32,64,96,128]


def _frame_to_tensor(frame):
    """Scale a frame by 1/255, move its last axis first and add a batch axis."""
    scaled = np.array(frame) / 255
    return torch.tensor(np.moveaxis(scaled, -1, 0)).float().unsqueeze(0)


for epoch in range(epochs):
    #clear_output()
    if epoch > 0:
        print('epoch : ',epoch, 'info_list : ',info_list)
    global_step_list = [0 for _ in range(env_num)]
    # Reset the running-statistics queues. The original assigned to
    # `intrinsic_queue` / `intrinsic_input_queue`, attributes nothing reads,
    # so the queues actually in use were never reset.
    model.intrinsic_queue_list = [queue.Queue() for _ in range(env_num)]
    model.intrinsic_input_queue_list = [queue.Queue() for _ in range(env_num)]
    state_list = [_frame_to_tensor(env.reset()) for env in env_list]
    done_list = [False for _ in range(env_num)]

    action_prob_list = [[] for _ in range(env_num)]
    m_list = [[] for _ in range(env_num)]
    action_list = [[] for _ in range(env_num)]
    next_state_list = [[] for _ in range(env_num)]
    extrinsic_reward_list = [[] for _ in range(env_num)]
    info_list = [[] for _ in range(env_num)]

    while not all(done_list) :
        for i in range(env_num):
            if done_list[i]:
                continue
            for t in range(T_horizon_list[i]):
                #env.render()
                global_step_list[i] += 1

                # Acting needs no autograd graph.
                policy_input = state_list[i].cuda() if gpu else state_list[i]
                with torch.no_grad():
                    action_prob_list[i], _, _ = model.ppo(policy_input)
                m_list[i] = Categorical(action_prob_list[i])
                action_list[i] = m_list[i].sample().item()

                next_state, extrinsic_reward_list[i], done_list[i], info_list[i] = \
                    env_list[i].step(action_list[i])
                next_state = _frame_to_tensor(next_state)
                next_state_list[i] = next_state

                # Running frame statistics over (at most) the last 128 frames.
                frame_queue = model.intrinsic_input_queue_list[i]
                frame_queue.put(next_state)
                if len(frame_queue.queue) > 128:
                    frame_queue.get()
                frames = torch.cat(list(frame_queue.queue))
                intrinsic_next_state_mean = torch.mean(frames, dim=0)
                if len(frame_queue.queue) == 1:
                    # std of a single sample is undefined
                    intrinsic_next_state_std = torch.zeros(1)
                else:
                    intrinsic_next_state_std = torch.std(frames, dim=0)

                preprocessed_next_state = torch.clamp(
                    (next_state - intrinsic_next_state_mean)
                    / (intrinsic_next_state_std + 1e-8), -5, 5)

                rnd_input = preprocessed_next_state.cuda() if gpu \
                    else preprocessed_next_state
                with torch.no_grad():
                    predictor, target = model.rnd(rnd_input)
                raw_intrinsic = ((predictor - target).pow(2).sum(1) / 2).item()

                # Normalise the curiosity bonus with its own running stats.
                reward_queue = model.intrinsic_queue_list[i]
                if len(reward_queue.queue) > 128:
                    reward_queue.get()
                reward_queue.put(raw_intrinsic)
                intrinsic_mean = np.mean(reward_queue.queue)
                if len(reward_queue.queue) == 1:
                    intrinsic_std = 0.0
                else:
                    intrinsic_std = np.std(reward_queue.queue)
                intrinsic_reward = float(
                    (raw_intrinsic - intrinsic_mean) / (intrinsic_std + 1e-8))

                if (info_list[i]['time'] == 0) or (global_step_list[i] > 1000):
                    done_list[i] = True
                    # Timeout penalty. The original stored it in an unused
                    # variable `reward`, so it never reached the buffer.
                    extrinsic_reward_list[i] = -10.

                model.put_data(i, (state_list[i].tolist(),
                                   action_list[i],
                                   extrinsic_reward_list[i] / 100,
                                   intrinsic_reward / 100,
                                   next_state.tolist(),
                                   action_prob_list[i][0][action_list[i]].item(),
                                   done_list[i]))
                #print('env number : ',i,', global_step : ',global_step_list[i],', action : ', action_list[i],'action_prob : ',action_prob_list[i].tolist()[0])
                #print('extrinsic_reward : ',extrinsic_reward_list[i],'intrinsic_reward : ',intrinsic_reward)
                #print('place',info_list[i]['x_pos'])
                #print('info',info_list[i])
                if done_list[i] :
                    print('env_num : ',i, ', epoch : ',epoch, ', global_step : ',global_step_list[i])
                    break
                state_list[i] = next_state_list[i]
        for i in range(env_num) :
            if len(model.memory[i]) > 1:
                model.train(i)
    #torch.save(model.state_dict(), 'gdrive/My Drive/supermario_checkpoint/'+str(epoch))
    #env.render()

#env.close()