In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from torch.nn import init
import copy
import numpy as np

In [23]:
gpu = False

In [2]:
class RND(nn.Module):
    def __init__(self,width = 240, height =256, channel = 3):
        super(RND,self).__init__()
        
        self.width = width
        self.height = height
        self.channel = channel
        
        self.predictor = nn.Sequential(
            nn.Conv2d(
                in_channels=3,
                out_channels=32,
                kernel_size=8,
                stride=4),
            nn.LeakyReLU(),
            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=4,
                stride=2),
            nn.LeakyReLU(),
            nn.Conv2d(
                in_channels=64,
                out_channels=64,
                kernel_size=3,
                stride=1),
            nn.LeakyReLU(),
            Flatten(),
            nn.Linear(46592, 512), # change
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512)
            )
        self.target = copy.deepcopy(self.predictor)
        for p in self.modules():
            if isinstance(p, nn.Conv2d):
                init.orthogonal_(p.weight, np.sqrt(2))
                p.bias.data.zero_()

            if isinstance(p, nn.Linear):
                init.orthogonal_(p.weight, np.sqrt(2))
                p.bias.data.zero_()

        for param in self.target.parameters():
            param.requires_grad = False
            
    def forward(self, state):
        target_feature = self.target(state)
        predict_feature = self.predictor(state)
        return predict_feature, target_feature

In [3]:
class Flatten(nn.Module):
    def forward(self, input):
        return input.view(input.size(0), -1)
class PPO(nn.Module):
    def __init__(self,width = 240, height =256, channel = 3, action_dim = 7):
        self.width = width
        self.height = height
        self.channel = channel
        self.action_dim = action_dim
        super(PPO,self).__init__()
        self.memory = []
        
        self.basic = nn.Sequential(\
                    nn.Conv2d(in_channels = 3,\
                             out_channels = 32,\
                             kernel_size = 8,\
                              stride = 4),
                    nn.ReLU(),
                                   nn.Conv2d(in_channels = 32,\
                                           out_channels = 64,\
                                           kernel_size = 4,\
                                           stride = 2),\
                    nn.ReLU(),\
                    nn.Conv2d(in_channels = 64,\
                             out_channels = 64,\
                             kernel_size = 3,\
                             stride = 1),
                                   nn.ReLU(),\
                                   Flatten(),
                    nn.Linear(46592,256), #have to change       
                    nn.ReLU(),\
                    nn.Linear(256,448),
                    nn.ReLU()
                    )
        self.actor = nn.Sequential(\
                                  nn.Linear(448,448),\
                                  nn.ReLU(),\
                                  nn.Linear(448,self.action_dim)\
                                  )
        
        self.extrinsic_critic = nn.Linear(448,1)
        self.intrinsic_critic = nn.Linear(448,1)
        
        init.orthogonal_(self.extrinsic_critic.weight, 0.01)
        self.extrinsic_critic.bias.data.zero_()

        init.orthogonal_(self.intrinsic_critic.weight, 0.01)
        self.intrinsic_critic.bias.data.zero_()

        for i in range(len(self.actor)):
            if type(self.actor[i]) == nn.Linear:
                init.orthogonal_(self.actor[i].weight, 0.01)
                self.actor[i].bias.data.zero_()
    def forward(self, x,dim = -1):
        x = self.basic(x)
        action = self.actor(x)
        action_prob = F.softmax(action,dim = dim)
        
        intrinsic = self.intrinsic_critic(x)
        extrinsic = self.extrinsic_critic(x)
        return action_prob,extrinsic,intrinsic    

    


In [None]:
learning_rate = 0.0001
gamma         = 0.98
lmbda         = 0.95
eps_clip      = 0.1
K_epoch       = 4
T_horizon     = 128
critic_coef = 0.5
ent_coef = 0.001
update_proportion = 0.25
class Agent(nn.Module):
    def __init__(self,width=240,height=256,channel = 3,action_dim=7,learning_rate=0.0005):
        self.width = width
        self.height = height
        self.channel = channel
        self.action_dim = action_dim
        
        super(Agent,self).__init__()
        
        self.memory = []
        
        self.ppo = PPO(self.width, self.height, self.channel, self.action_dim)
        self.rnd = RND(self.width, self.height , self.channel)
        self.optimizer = optim.Adam(self.parameters(),lr = learning_rate)
    def put_data(self,data):
        self.memory.append(data)
    def make_batch(self):
        state_list, action_list, extrinsic_reward_list, intrinsic_reward_list, next_state_list, \
        prob_list, extrinsic_done_list,intrinsic_done_list = [],[],[],[],[],[], [],[]
        for data in self.memory:
            state,action,extrinsic_reward, intrinsic_reward,next_state,prob,done = data
            state_list.append(state)
            action_list.append([action])
            extrinsic_reward_list.append([extrinsic_reward])
            intrinsic_reward_list.append([intrinsic_reward])

            next_state_list.append(next_state)
            extrinsic_done_mask = 0 if done else 1
            extrinsic_done_list.append([extrinsic_done_mask])
            intrinsic_done_list.append([1])
            prob_list.append([prob])
        self.memory = []

        s,a,er,ir,next_s,extrinsic_done_list,intrinsic_done_list,prob \
                                        = torch.tensor(state_list,dtype=torch.float),\
                                        torch.tensor(action_list),\
                                        torch.tensor(extrinsic_reward_list,dtype = torch.float),\
                                        torch.tensor(intrinsic_reward_list,dtype = torch.float),\
                                        torch.tensor(next_state_list,dtype=torch.float),\
                                        torch.tensor(extrinsic_done_list,dtype = torch.float),\
                                        torch.tensor(intrinsic_done_list,dtype = torch.float),\
                                        torch.tensor(prob_list,dtype = torch.float)
        if gpu:
            return s.cuda(),a.cuda(),er.cuda(),ir.cuda(),next_s.cuda(),extrinsic_done_list.cuda()\
            ,intrinsic_done_list.cuda(),prob.cuda() 
        else :
            return s,a,er,ir,next_s,extrinsic_done_list,intrinsic_done_list,prob  
    
    def train(self):
        state,action,extrinsic_reward,intrinsic_reward, next_state,extrinsic_done_list,\
        intrinsic_done_list,action_prob = self.make_batch()
        
        for i in range(K_epoch):
            state = state.squeeze()
            next_state = next_state.squeeze()
            predicted_action, predicted_extrinsic, predicted_intrinsic = self.ppo(state)
            predicted_next_action, predicted_next_extrinsic, predicted_next_intrinsic = self.ppo(next_state)
            predict_feature, target_feature = self.rnd(next_state)
            td_error = extrinsic_reward + gamma * predicted_next_extrinsic * extrinsic_done_list
            delta = td_error - predicted_next_extrinsic
            if gpu:
                delta = delta.detach().cpu().numpy()
            else:
                delta = delta.detach().numpy()
            advantage_list = []
            
            advantage = 0.0
            for delta_t in delta[::-1]:
                advantage = gamma * lmbda * advantage + delta_t[0]
                advantage_list.append([advantage])
            advantage_list.reverse()
            if gpu:
                advantage = torch.tensor(advantage_list,dtype = torch.float).cuda()
            else:
                advantage = torch.tensor(advantage_list,dtype = torch.float)
            
            intrinsic_td_error = intrinsic_reward + gamma * predicted_next_intrinsic * intrinsic_done_list
            
            intrinsic_error = (intrinsic_td_error - predicted_intrinsic.detach()).pow(2)
            if gpu:
                masking = torch.rand(len(intrinsic_error)).cuda()
                masking = (masking < update_proportion).type(torch.FloatTensor).cuda()
            else:
                masking = torch.rand(len(intrinsic_error))
                masking = (masking < update_proportion).type(torch.FloatTensor)
            if gpu:
                intrinsic_error = (intrinsic_error * masking).sum() / torch.max(intrinsic_error.sum(), torch.Tensor([1]).cuda())
            else:
                intrinsic_error = (intrinsic_error * masking).sum() / torch.max(masking.sum(), torch.Tensor([1]))

            
            now_action = predicted_action
            m = Categorical(now_action)
            entropy = m.entropy().mean()
            
            
            now_action = now_action.gather(1,action)
            
            
            ratio = torch.exp(torch.log(now_action) - torch.log(action_prob))
            
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio , 1-eps_clip, 1 + eps_clip) * advantage
            loss = - torch.min(surr1,surr2) + critic_coef * (F.smooth_l1_loss(predicted_extrinsic,td_error.detach()) +\
                    intrinsic_error) + ent_coef * entropy + F.mse_loss(predict_feature, target_feature)
            
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

In [45]:
epochs = 1000
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, SIMPLE_MOVEMENT)


In [21]:
if gpu:
    model = Agent().cuda()
else:
    model = Agent()


In [25]:
import glob

In [26]:
glob.glob("*")

['training_model.ipynb', 'weights']

In [30]:
if gpu:
    model.load_state_dict(torch.load("weights/3_epochs_model"))
else:
    model.load_state_dict(torch.load("weights/3_epochs_model", map_location={'cuda:0': 'cpu'}))

In [22]:
T_horizon = 20

for epoch in range(epochs):
    global_step = 0
    state = env.reset()
    state = np.array(state)/255
            #state = np.transpose(state,(2,0,1))
    state = np.moveaxis(state, -1, 0)
    state = torch.tensor(state).float()
    state = state.unsqueeze(0)
    done = False
    while not done :
        for t in range(T_horizon):
            #env.render()
            global_step +=1

            if gpu:
                action_prob, _ , _ = model.ppo.forward(state.cuda())
            else:
                action_prob, _ , _ = model.ppo.forward(state)
            m = Categorical(action_prob)
            action = m.sample().item()
            next_state, extrinsic_reward, done, info = env.step(action)
            next_state = np.array(next_state)/255
            next_state = np.moveaxis(next_state,-1,0)
            next_state = torch.tensor(next_state).float()
            next_state = next_state.unsqueeze(0)
            if gpu:
                predictor,target = model.rnd.forward(next_state.cuda())
            else:
                predictor,target = model.rnd.forward(next_state)
            intrinsic_reward = (predictor - target).pow(2).sum(1) / 2
            if info['time'] == 0 :
                done = True
                reward = -10.

            model.put_data((state.tolist(), action, extrinsic_reward/100, (intrinsic_reward.item())/10000, next_state.tolist(), action_prob[0][action].item(), done))
            print('global_step : ',global_step,', action : ', action,'action_prob : ',action_prob.tolist()[0])
            print('extrinsic_reward : ',extrinsic_reward,'intrinsic_reward : ',intrinsic_reward.item())
            print('place',info['x_pos'])
            print('info',info)
            if done :
                print('epoch : ',epoch, ', global_step : ',global_step)
                break
            state = next_state
        model.train()
    #env.render()

#env.close()

global_step :  1 , action :  0 action_prob :  [0.14285683631896973, 0.14285725355148315, 0.14285720884799957, 0.14285777509212494, 0.14285695552825928, 0.14285697042942047, 0.14285700023174286]
extrinsic_reward :  0 intrinsic_reward :  410.8612060546875
place 40
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 400, 'world': 1, 'x_pos': 40, 'x_pos_screen': 40, 'y_pos': 79}
global_step :  2 , action :  1 action_prob :  [0.14285683631896973, 0.14285725355148315, 0.14285720884799957, 0.14285777509212494, 0.14285695552825928, 0.14285697042942047, 0.14285700023174286]
extrinsic_reward :  0 intrinsic_reward :  410.8612060546875
place 40
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 400, 'world': 1, 'x_pos': 40, 'x_pos_screen': 40, 'y_pos': 79}
global_step :  3 , action :  3 action_prob :  [0.14285683631896973, 0.14285725355148315, 0.14285720884799957, 0.14285777509212494, 0.14285695552825928

global_step :  21 , action :  0 action_prob :  [0.1459439992904663, 0.14143623411655426, 0.13868798315525055, 0.14606451988220215, 0.14062733948230743, 0.14789240062236786, 0.13934753835201263]
extrinsic_reward :  1 intrinsic_reward :  114.90711975097656
place 45
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 399, 'world': 1, 'x_pos': 45, 'x_pos_screen': 45, 'y_pos': 79}
global_step :  22 , action :  6 action_prob :  [0.14594414830207825, 0.1414361596107483, 0.13868775963783264, 0.14606468379497528, 0.14062722027301788, 0.14789265394210815, 0.1393473595380783]
extrinsic_reward :  0 intrinsic_reward :  116.23658752441406
place 45
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 399, 'world': 1, 'x_pos': 45, 'x_pos_screen': 45, 'y_pos': 79}
global_step :  23 , action :  4 action_prob :  [0.14594340324401855, 0.14143650233745575, 0.1386888176202774, 0.146063894033432, 0.14062777161598206,

global_step :  41 , action :  4 action_prob :  [0.13961932063102722, 0.1379391849040985, 0.1424020677804947, 0.14805886149406433, 0.1457410603761673, 0.1543506383895874, 0.13188888132572174]
extrinsic_reward :  1 intrinsic_reward :  93.4362564086914
place 47
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 398, 'world': 1, 'x_pos': 47, 'x_pos_screen': 47, 'y_pos': 84}
global_step :  42 , action :  4 action_prob :  [0.13961975276470184, 0.13793976604938507, 0.14240212738513947, 0.14805828034877777, 0.14574070274829865, 0.15434928238391876, 0.13189013302326202]
extrinsic_reward :  0 intrinsic_reward :  93.27044677734375
place 47
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 398, 'world': 1, 'x_pos': 47, 'x_pos_screen': 47, 'y_pos': 89}
global_step :  43 , action :  0 action_prob :  [0.1396188586950302, 0.13793857395648956, 0.14240199327468872, 0.14805945754051208, 0.14574141800403595, 0

global_step :  61 , action :  1 action_prob :  [0.14050406217575073, 0.14339980483055115, 0.13978461921215057, 0.14273959398269653, 0.14994189143180847, 0.15096984803676605, 0.1326601356267929]
extrinsic_reward :  2 intrinsic_reward :  61.74724578857422
place 69
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 397, 'world': 1, 'x_pos': 69, 'x_pos_screen': 69, 'y_pos': 79}
global_step :  62 , action :  0 action_prob :  [0.1405046135187149, 0.1433996856212616, 0.13978523015975952, 0.14273971319198608, 0.1499403715133667, 0.15096823871135712, 0.13266213238239288]
extrinsic_reward :  2 intrinsic_reward :  61.314208984375
place 71
info {'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 397, 'world': 1, 'x_pos': 71, 'x_pos_screen': 71, 'y_pos': 79}
global_step :  63 , action :  1 action_prob :  [0.1405055820941925, 0.1433994621038437, 0.13978633284568787, 0.1427398920059204, 0.14993765950202942, 0.15

KeyboardInterrupt: 

{'coins': 0,
 'flag_get': False,
 'life': 2,
 'score': 0,
 'stage': 1,
 'status': 'small',
 'time': 311,
 'world': 1,
 'x_pos': 594,
 'x_pos_screen': 85,
 'y_pos': 79}

319

In [44]:
env.close()

ValueError: env has already been closed.

In [43]:
if gpu:
    model.load_state_dict(torch.load("weights/1_episode_model"))
else:
    model.load_state_dict(torch.load("weights/1_episode_model", map_location={'cuda:0': 'cpu'}))

In [46]:
T_horizon = 20

for epoch in range(epochs):
    global_step = 0
    state = env.reset()
    state = np.array(state)/255
            #state = np.transpose(state,(2,0,1))
    state = np.moveaxis(state, -1, 0)
    state = torch.tensor(state).float()
    state = state.unsqueeze(0)
    done = False
    while not done :
        for t in range(T_horizon):
            env.render()
            global_step +=1

            if gpu:
                action_prob, _ , _ = model.ppo.forward(state.cuda())
            else:
                action_prob, _ , _ = model.ppo.forward(state)
            m = Categorical(action_prob)
            action = m.sample().item()
            next_state, extrinsic_reward, done, info = env.step(action)
            next_state = np.array(next_state)/255
            next_state = np.moveaxis(next_state,-1,0)
            next_state = torch.tensor(next_state).float()
            next_state = next_state.unsqueeze(0)
            if gpu:
                predictor,target = model.rnd.forward(next_state.cuda())
            else:
                predictor,target = model.rnd.forward(next_state)
            intrinsic_reward = (predictor - target).pow(2).sum(1) / 2
            if info['time'] == 0 :
                done = True
                reward = -10.
            if done :
                print('epoch : ',epoch, ', global_step : ',global_step)
                break
            state = next_state
    #env.render()

#env.close()

KeyboardInterrupt: 

In [47]:
env.close()