In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from torch.nn import init
import copy
import numpy as np

In [2]:
class RND(nn.Module):
    def __init__(self,width = 240, height =256, channel = 3):
        super(RND,self).__init__()
        
        self.width = width
        self.height = height
        self.channel = channel
        
        self.predictor = nn.Sequential(
            nn.Conv2d(
                in_channels=3,
                out_channels=32,
                kernel_size=8,
                stride=4),
            nn.LeakyReLU(),
            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=4,
                stride=2),
            nn.LeakyReLU(),
            nn.Conv2d(
                in_channels=64,
                out_channels=64,
                kernel_size=3,
                stride=1),
            nn.LeakyReLU(),
            Flatten(),
            nn.Linear(128, 512), # change
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512)
            )
        self.target = copy.deepcopy(self.predictor)
        for p in self.modules():
            if isinstance(p, nn.Conv2d):
                init.orthogonal_(p.weight, np.sqrt(2))
                p.bias.data.zero_()

            if isinstance(p, nn.Linear):
                init.orthogonal_(p.weight, np.sqrt(2))
                p.bias.data.zero_()

        for param in self.target.parameters():
            param.requires_grad = False
            
    def forward(self, state):
        target_feature = self.target(state)
        predict_feature = self.predictor(state)
        return predict_feature, target_feature

In [3]:
class Flatten(nn.Module):
    def forward(self, input):
        return input.view(input.size(0), -1)
class PPO(nn.Module):
    def __init__(self,width = 240, height =256, channel = 3, action_dim = 7):
        self.width = width
        self.height = height
        self.channel = channel
        self.action_dim = action_dim
        super(PPO,self).__init__()
        self.memory = []
        
        self.basic = nn.Sequential(\
                    nn.Conv2d(in_channels = 3,\
                             out_channels = 32,\
                             kernel_size = 8,\
                              stride = 4),
                    nn.ReLU(),
                                   nn.Conv2d(in_channels = 32,\
                                           out_channels = 64,\
                                           kernel_size = 4,\
                                           stride = 2),\
                    nn.ReLU(),\
                    nn.Conv2d(in_channels = 64,\
                             out_channels = 64,\
                             kernel_size = 3,\
                             stride = 1),
                                   nn.ReLU(),\
                                   Flatten(),
                    nn.Linear(46592,256), #have to change       
                    nn.ReLU(),\
                    nn.Linear(256,448),
                    nn.ReLU()
                    )
        self.actor = nn.Sequential(\
                                  nn.Linear(448,448),\
                                  nn.ReLU(),\
                                  nn.Linear(448,self.action_dim)\
                                  )
        
        self.extrinsic_critic = nn.Linear(448,1)
        self.intrinsic_critic = nn.Linear(448,1)
        
        init.orthogonal_(self.extrinsic_critic.weight, 0.01)
        self.extrinsic_critic.bias.data.zero_()

        init.orthogonal_(self.intrinsic_critic.weight, 0.01)
        self.intrinsic_critic.bias.data.zero_()

        for i in range(len(self.actor)):
            if type(self.actor[i]) == nn.Linear:
                init.orthogonal_(self.actor[i].weight, 0.01)
                self.actor[i].bias.data.zero_()
    def forward(self, x,dim = -1):
        x = self.basic(x)
        action = self.actor(x)
        action_prob = F.softmax(action,dim = dim)
        
        intrinsic = self.intrinsic_critic(x)
        extrinsic = self.extrinsic_critic(x)
        return action_prob,extrinsic,intrinsic    

    


In [4]:
learning_rate = 0.0001
gamma         = 0.98
lmbda         = 0.95
eps_clip      = 0.1
K_epoch       = 4
T_horizon     = 128
critic_coef = 0.5
ent_coef = 0.001

class Agent(nn.Module):
    def __init__(self,width=240,height=256,channel = 3,action_dim=7,learning_rate=0.0005):
        self.width = width
        self.height = height
        self.channel = channel
        self.action_dim = action_dim
        
        super(Agent,self).__init__()
        
        self.memory = []
        
        self.ppo = PPO(self.width, self.height, self.channel, self.action_dim)
        self.rnd = RND(self.width, self.height , self.channel)
    
    def put_data(self,data):
        self.memory.append(data)
    def make_batch(self,episodic):
        state_list, action_list, extrinsic_reward_list, intrinsic_reward_list, next_state_list, \
        prob_list, extrinsic_done_list,intrinsic_done_list = [],[],[],[],[],[], [],[]
        for data in self.memory:
            state,action,extrinsic_reward, intrinsic_reward,next_state,prob,done = data
            state_list.append(state)
            action_list.append([action])
            extrinsic_reward_list.append([extrinsic_reward])
            intrinsic_reward_list.append([intrinsic_reward])
            prob_list.append([prob])
            next_state_list.append(next_state)
            extrinsic_done_mask = 0 if done else 1
            extrinsic_done_mask = 1
            extrinsic_done_list.append([done_mask])
            intrinsic_done_list.append([1])
        self.memory = []
        
        s,a,er,ir,next_s,extrinsic_done_list,intrinsic_done_list,prob \
                                        = torch.tensor(state_list,dtype=torch.float),\
                                        torch.tensor(action_list),\
                                        torch.tensor(extrinsic_reward_list),\
                                        torch.tensor(intrinsic_reward_list),\
                                        torch.tensor(next_state_list,dtype=torch.float),\
                                        torch.tensor(extrinsic_done_list,dtype = torch.float),\
                                        torch.tensor(intrinsic_done_list,dtype = torch.float),\
                                        torch.tensor(prob_list)
        return s,a,er,ir,next_s,extrinsic_done_list,intrinsic_done_list,prob  
    
    def train(self):
        state,action,extrinsic_reward,intrinsic_reward, next_state,extrinsic_done_list,\
        intrinsic_done_list,action_prob = self.make_batch()
        
        for i in range(K_epoch):
            predicted_action, predicted_extrinsic, predicted_intrinsic = self.ppo(state)
            predicted_next_action, predicted_next_extrinsic, predicted_next_intrinsic = self.ppo(next_state)
            td_error = extrinsic_reward + gamma * predicted_next_extrinsic * extrinsic_done_list
            delta = td_error - predicted_next_extrinsic
            delta = delta.detach().numpy()
            advantage_list = []
            advantage = 0.0
            for delta_t in delta[::-1]:
                advantage = gamma * lmbda * advantage + delta_t[0]
                advantage_list.append([advantage])
            advantage_list.reverse()
            advantage = torch.tensor(advantage_list,dtype = torch.float)
            
            intrinsic_td_error = intrinsic_reward + gamma * predicted_next_intrinsic * intrinsic_done_list
            intrinsic_delta = intrinsic_td_error - predicted_next_intrinsic
            intrinsic_delta = intrinsic_delta.detach().numpy()
            intrinsic_advantage_list = []
            intrinsic_advantage = 0.0
            for intrinsic_delta_t in intrinsic_delta[::-1]:
                intrinsic_advantage = gamma * lmbda * intrinsic_advantage + intrinsic_delta_t[0]
                intrinsic_advantage_list.append([intrinsic_advantage])
            intrinsic_advantage_list.reverse()
            intrinsic_advantage_list = torch.tensor(intrinsic_advantage_list,dtype = torch.float)
            
            now_action = predicted_action
            m = Categorical(now_action)
            entropy = m.entropy().mean()
            now_action = now_action.gather(1,action)
            
            ratio = torch.exp(torch.log(now_action) - torch.log(action_prob))
            
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio , 1-eps_clip, 1 + eps_clip) * advantage
            loss = - torch.min(surr1,surr2) + critic_coef * (F.smooth_l1_loss(predicted_extrinsic,td_error.detach()) +\
                    F.smooth_l1_loss(predicted_intrinsic,intrinsic_td_error.detach())) + ent_coef * entropy \
                    + F.mse_loss(predicted_next_extrinsic, predicted_next_intrinsic)
            
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()
        self.ppo.memory = []

In [5]:
epochs = 1000
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, SIMPLE_MOVEMENT)
model = Agent()

In [None]:


for epoch in range(epochs):
    global_step = 0
    state = env.reset()
    done = False
    while not done :
        for t in range(T_horizon):
            env.render()
            global_step +=1
            state = np.array(state)/255
            #state = np.transpose(state,(2,0,1))
            state = np.moveaxis(state, -1, 0)
            state = torch.tensor(state).float()
            state = state.unsqueeze(0)
            action_prob, _ , intrinsic_reward = model.ppo.forward(state,dim = 0)
            m = Categorical(action_prob)
            action = m.sample().item()
                #state,action,extrinsic_reward,intrinsic_reward, next_state,done_mask,action_prob
            action
            next_state, extrinsic_reward, done, info = env.step(action)
            if info['time'] == 0 :
                done = True
                reward = -10.
            model.put_data((state, action, extrinsic_reward, intrinsic_reward, next_state, action_prob[0][action].item(), done))
            #print('global_step : ',global_step,', action : ', action,' reward : ',reward, 'action_prob : ',action_prob)

            if done :
                print('epoch : ',epoch, ', global_step : ',global_step)
                break
            state = next_state
    #env.render()

#env.close()

  return (self.ram[0x86] - self.ram[0x071c]) % 256


In [17]:
env.close()

In [13]:
model.ppo.forward(state,dim = 1)

(tensor([[-1.8816e-07, -1.0887e-07, -5.4678e-07,  6.8075e-07,  2.4049e-06,
           1.6383e-06,  3.7556e-06]], grad_fn=<AddmmBackward>),
 tensor([[1.3079e-06]], grad_fn=<AddmmBackward>),
 tensor([[-9.2827e-05]], grad_fn=<AddmmBackward>))

In [12]:
action_prob[action]

IndexError: index 4 is out of bounds for dimension 0 with size 1

In [219]:
env.close()

In [48]:
'''
#Hyperparameters
learning_rate = 0.0005
gamma         = 0.98
lmbda         = 0.95
eps_clip      = 0.1
K_epoch       = 3
T_horizon     = 128
class Agent(nn.Module):
    def __init__(self,width=240,height=256,channel = 3,action_dim=7,learning_rate=0.0005):
        self.width = width
        self.height = height
        self.channel = channel
        self.action_dim = action_dim
        
        super(Agent,self).__init__()
        self.memory = []

        self.conv_1 = nn.Conv2d(self.channel,16,3) #channel_1, kernel_1
        self.pool_1 = nn.MaxPool2d(2) # channel_1
        self.conv_2 = nn.Conv2d(16,16,3) # channel_1,channel_2, kernel_2
        self.pool_2 = nn.MaxPool2d(2) # channel_2 
        self.middle = nn.Linear(58*62*16,16)
        self.policy = nn.Linear(16, self.action_dim) #바꿔야함
        self.value = nn.Linear(16, 1) #바꿔야함
        
        self.rnd_
        
        
        self.optimizer = optim.Adam(self.parameters(),lr = learning_rate)
        
    def get_action(self,x, softmax_dim = 0):
        x = F.relu(self.conv_1(x))
        x = self.pool_1(x)
        x = F.relu(self.conv_2(x))
        x = self.pool_2(x)
        x = torch.flatten(x,start_dim=1)
        x = self.middle(x)
        x = self.policy(x)
        if softmax_dim == 0 :
            x = x.squeeze()
        prob = F.softmax(x, dim = softmax_dim)
        return prob
    
    def get_value(self,x):
        x = F.relu(self.conv_1(x))
        x = self.pool_1(x)
        x = F.relu(self.conv_2(x))
        x = self.pool_2(x)
        x = torch.flatten(x,start_dim=1)
        x = self.middle(x)
        x = self.value(x)
        return x
    
    def put_data(self,data):
        self.memory.append(data)
        
    def make_batch(self):
        state_list, action_list, reward_list, next_state_list, prob_list, done_list = [],[],[],[],[],[]
        for data in self.memory:
            state,action,reward,next_state,prob,done = data
            state_list.append(state)
            action_list.append([action])
            reward_list.append([reward])
            prob_list.append([prob])
            next_state_list.append(next_state)
            done_mask = 0 if done else 1
            done_list.append([done_mask])
        self.memory = []
        
        s,a,r,next_s,done_mask,prob = torch.tensor(state_list,dtype=torch.float),\
                                        torch.tensor(action_list),torch.tensor(reward_list),\
                                        torch.tensor(next_state_list,dtype=torch.float),\
                                        torch.tensor(done_list,dtype = torch.float),\
                                        torch.tensor(prob_list)
        return s,a,r,next_s,done_mask,prob
    
    def train(self):
        state,action,extrinsic_reward,intrinsic_reward, next_state,done_mask,action_prob = self.make_batch()

        for i in range(K_epoch):
            td_error = reward + gamma * self.get_value(next_state) * done_mask
            delta = td_error - self.get_value(state)
            delta = delta.detach().numpy()
            advantage_list = []
            advantage = 0.0
            for delta_t in delta[::-1]:
                advantage = gamma * lmbda * advantage + delta_t[0]
                advantage_list.append([advantage])
            advantage_list.reverse()
            advantage = torch.tensor(advantage_list,dtype = torch.float)
            
            
            now_action = self.get_action(state,softmax_dim = 1)
            now_action = now_action.gather(1,action)
            
            ratio = torch.exp(torch.log(now_action) - torch.log(action_prob))
            
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio , 1-eps_clip, 1 + eps_clip) * advantage
            loss = - torch.min(surr1,surr2) + F.smooth_l1_loss(self.get_value(state),td_error.detach())
            
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()
    
'''

tensor([[[[0.4078, 0.4078, 0.4078,  ..., 0.4078, 0.4078, 0.4078],
          [0.4078, 0.4078, 0.4078,  ..., 0.4078, 0.4078, 0.4078],
          [0.4078, 0.4078, 0.4078,  ..., 0.4078, 0.4078, 0.4078],
          ...,
          [0.9412, 0.8941, 0.8941,  ..., 0.8941, 0.8941, 0.0000],
          [0.9412, 0.8941, 0.8941,  ..., 0.8941, 0.0000, 0.0000],
          [0.8941, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.8941]],

         [[0.5333, 0.5333, 0.5333,  ..., 0.5333, 0.5333, 0.5333],
          [0.5333, 0.5333, 0.5333,  ..., 0.5333, 0.5333, 0.5333],
          [0.5333, 0.5333, 0.5333,  ..., 0.5333, 0.5333, 0.5333],
          ...,
          [0.8157, 0.3608, 0.3608,  ..., 0.3608, 0.3608, 0.0000],
          [0.8157, 0.3608, 0.3608,  ..., 0.3608, 0.0000, 0.0000],
          [0.3608, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.3608]],

         [[0.9882, 0.9882, 0.9882,  ..., 0.9882, 0.9882, 0.9882],
          [0.9882, 0.9882, 0.9882,  ..., 0.9882, 0.9882, 0.9882],
          [0.9882, 0.9882, 0.9882,  ..., 0

In [47]:
action_prob

tensor([[1., 1., 1., 1., 1., 1., 1.]], grad_fn=<SoftmaxBackward>)

In [125]:
action_prob

tensor([[1., 1., 1., 1., 1., 1., 1.]], grad_fn=<SoftmaxBackward>)

In [96]:
env.observation_space.shape

(240, 256, 3)

In [155]:
state

tensor([[[[104., 104., 104.,  ..., 104., 104., 104.],
          [104., 104., 104.,  ..., 104., 104., 104.],
          [104., 104., 104.,  ..., 104., 104., 104.],
          ...,
          [228., 228., 228.,  ..., 228., 228., 228.],
          [228., 228.,   0.,  ..., 228., 228., 228.],
          [  0.,   0.,   0.,  ...,   0.,   0.,   0.]],

         [[136., 136., 136.,  ..., 136., 136., 136.],
          [136., 136., 136.,  ..., 136., 136., 136.],
          [136., 136., 136.,  ..., 136., 136., 136.],
          ...,
          [ 92.,  92.,  92.,  ...,  92.,  92.,  92.],
          [ 92.,  92.,   0.,  ...,  92.,  92.,  92.],
          [  0.,   0.,   0.,  ...,   0.,   0.,   0.]],

         [[252., 252., 252.,  ..., 252., 252., 252.],
          [252., 252., 252.,  ..., 252., 252., 252.],
          [252., 252., 252.,  ..., 252., 252., 252.],
          ...,
          [ 16.,  16.,  16.,  ...,  16.,  16.,  16.],
          [ 16.,  16.,   0.,  ...,  16.,  16.,  16.],
          [  0.,   0.,   0.,  ...

In [93]:
env.action_space.n

7

In [27]:
import numpy as np

In [11]:
SIMPLE_MOVEMENT

[['NOOP'],
 ['right'],
 ['right', 'A'],
 ['right', 'B'],
 ['right', 'A', 'B'],
 ['A'],
 ['left']]

In [28]:
torch.tensor(state)

ValueError: some of the strides of a given numpy array are negative. This is currently not supported, but will be added in future releases.

In [58]:
state =np.array(state)
#test_state = torch.tensor(np.array(state))

In [104]:
state.shape

(240, 256, 3)

In [66]:

test_state = np.transpose(state,(2,0,1))

In [86]:
test_state = test_state.float()

In [74]:
test_state = torch.tensor(test_state)
test_state = test_state.unsqueeze(0)

In [89]:
test_conv(test_state)

tensor([[[[-1.7245e+02, -1.7245e+02, -1.7245e+02,  ..., -1.7245e+02,
           -1.7245e+02, -1.7245e+02],
          [-1.7245e+02, -1.7245e+02, -1.7245e+02,  ..., -1.7245e+02,
           -1.7245e+02, -1.7245e+02],
          [-1.7245e+02, -1.7245e+02, -1.7245e+02,  ..., -1.7245e+02,
           -1.7245e+02, -1.7245e+02],
          ...,
          [-5.5279e+01, -1.3196e+01, -3.9281e+01,  ...,  1.4265e+01,
           -1.2687e+02, -9.1684e+01],
          [-8.2662e+01, -9.2342e+01, -7.8128e+01,  ...,  4.2180e+00,
           -6.6506e+01, -7.0498e+01],
          [-3.9383e+01, -1.2426e+02, -8.1975e+01,  ...,  2.6676e-01,
           -6.1623e+01, -7.2660e+00]],

         [[-1.9878e+01, -1.9878e+01, -1.9878e+01,  ..., -1.9878e+01,
           -1.9878e+01, -1.9878e+01],
          [-1.9878e+01, -1.9878e+01, -1.9878e+01,  ..., -1.9878e+01,
           -1.9878e+01, -1.9878e+01],
          [-1.9878e+01, -1.9878e+01, -1.9878e+01,  ..., -1.9878e+01,
           -1.9878e+01, -1.9878e+01],
          ...,
     

In [17]:
test_conv = nn.Conv2d(3,64,3)

In [37]:
test_state.shape

torch.Size([240, 256, 3])

In [39]:
test_state.reshape(-1,240,256,3).shape

torch.Size([1, 240, 256, 3])

In [57]:
test_state.shape

torch.Size([240, 256, 3])

In [56]:
test_state.transpose(0,2).shape

torch.Size([3, 256, 240])

In [50]:
test_state.reshape(-1,3,240,256)

tensor([[[[104, 136, 252,  ..., 136, 252, 104],
          [136, 252, 104,  ..., 252, 104, 136],
          [252, 104, 136,  ..., 104, 136, 252],
          ...,
          [104, 136, 252,  ..., 136, 252, 104],
          [136, 252, 104,  ..., 252, 104, 136],
          [252, 104, 136,  ..., 104, 136, 252]],

         [[104, 136, 252,  ..., 136, 252, 104],
          [136, 252, 104,  ..., 252, 104, 136],
          [252, 104, 136,  ..., 104, 136, 252],
          ...,
          [104, 136, 252,  ..., 136, 252, 104],
          [136, 252, 104,  ..., 252, 104, 136],
          [252, 104, 136,  ...,   0,   0,   0]],

         [[104, 136, 252,  ..., 136, 252, 104],
          [136, 252, 104,  ..., 252, 104, 136],
          [252, 104, 136,  ..., 104, 136, 252],
          ...,
          [  0,   0,   0,  ..., 208, 176,   0],
          [  0,   0,   0,  ...,   0,   0,   0],
          [  0, 228,  92,  ...,   0,   0,   0]]]], dtype=torch.uint8)

In [49]:
test_state.permute(2,1,0).shape

torch.Size([3, 256, 240])

In [None]:
test_conv(test_state.reshape(-1,240,256,3))