In [1]:
import torch
from torch import nn
from torch.nn import functional as F
import torch.autograd as autograd
from torch.distributions import Categorical
from torch.utils.tensorboard import SummaryWriter
from torch import optim
from torch import nn
from collections import deque
import numpy as np
import random
import time
import warnings
warnings.simplefilter('ignore')

# Seed all three random number generators imported above (Python's `random`,
# NumPy's global generator and PyTorch), not only PyTorch.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
# Notebook-wide TensorBoard writer. It is created with no arguments, so the
# library's default log location is used. PPOAgent.train logs its per-episode
# scalars through this global name.
writer = SummaryWriter()

# critic

In [2]:
class critic(nn.Module):
    """State-value network: a three-layer conv stack followed by a three-layer MLP.

    The layer sizes are fixed: the first convolution takes 1 input channel and
    the first linear layer takes 3136 features (64 * 7 * 7, which is what the
    conv stack produces for 84x84 inputs).

    Args:
        input_dim: accepted by the constructor but not used by the network.
    """

    def __init__(self,input_dim):
        super().__init__()

        # convolutional feature extractor
        conv_layers = [
            nn.Conv2d(1, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*conv_layers)

        # fully connected head ending in a single scalar value
        self.fc1 = nn.Linear(3136,512)
        self.fc2 = nn.Linear(512,128)
        self.fc3 = nn.Linear(128,1)

    def forward(self,state):
        """Return one value estimate per sample, shape (batch, 1).

        Args:
            state: 4-D tensor laid out as (batch, height, width, channels).
        """
        n_samples = state.shape[0]

        # conv layers expect (batch, channels, height, width)
        features = self.conv(state.permute(0,3,1,2))
        flat = features.reshape(n_samples,-1)  # flatten per sample

        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

# actor

In [3]:
class actor(nn.Module):
    """Policy network: a three-layer conv stack followed by a three-layer MLP
    whose output is a softmax distribution over `output_dim` actions.

    The layer sizes are fixed: the first convolution takes 1 input channel and
    the first linear layer takes 3136 features (64 * 7 * 7, which is what the
    conv stack produces for 84x84 inputs).

    Args:
        input_dim: accepted by the constructor but not used by the network.
        output_dim: number of discrete actions (width of the final layer).
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()

        # convolutional feature extractor
        conv_layers = [
            nn.Conv2d(1, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*conv_layers)

        # fully connected head producing one logit per action
        self.fc1 = nn.Linear(3136,512)
        self.fc2 = nn.Linear(512,128)
        self.fc3 = nn.Linear(128,output_dim)

    def forward(self, state):
        """Return action probabilities, shape (batch, output_dim).

        Args:
            state: 4-D tensor laid out as (batch, height, width, channels).
        """
        n_samples = state.shape[0]

        # conv layers expect (batch, channels, height, width)
        features = self.conv(state.permute(0,3,1,2))
        flat = features.reshape(n_samples,-1)  # flatten per sample

        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        logits = self.fc3(hidden)
        return F.softmax(logits,dim=-1)

# agent

In [4]:
class PPOAgent:
    """PPO-style actor-critic agent for a discrete-action, image-observation env.

    The agent owns a value network (`critic`), a trainable policy (`actor_new`)
    and a frozen copy of the policy (`actor_old`) that supplies the denominator
    of the probability ratio. It relies on three names defined elsewhere in the
    notebook: the `critic` and `actor` network classes and the global
    TensorBoard `writer`.

    Args:
        env: environment exposing `observation_space.shape`, `action_space.n`,
            `reset()`, `step(action)`, `render()` and `close()`.
        gamma: discount factor used for the Monte-Carlo returns.
        clip: clipping range of the probability ratio (ratio is clamped to
            [1 - clip, 1 + clip]).
        lr: learning rate of both Adam optimizers.
        K_epoch: number of gradient steps taken per call to `update`.
    """

    def __init__(self,env,gamma = 0.99,clip = 0.2,lr = 1e-3,K_epoch = 4):
        # common: use the GPU when there is one, otherwise fall back to the CPU
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.env = env
        self.obs_dim = env.observation_space.shape
        self.action_dim = env.action_space.n

        # hyperparameters
        self.gamma = gamma
        self.clip = clip
        self.lr = lr
        self.K_epoch = K_epoch

        # critic
        self.critic = critic(self.obs_dim).to(self.device)
        self.critic.apply(self._weights_init)

        # actor_new (the policy being optimized)
        self.actor_new = actor(self.obs_dim,self.action_dim).to(self.device)
        self.actor_new.apply(self._weights_init)

        # actor_old, synced with actor_new
        self.actor_old = actor(self.obs_dim,self.action_dim).to(self.device)
        self.sync()

        # optimizers
        self.critic_optimizer = optim.Adam(self.critic.parameters(),lr=lr)
        self.actor_optimizer = optim.Adam(self.actor_new.parameters(),lr=lr)

        # recorder: one entry is appended per gradient step in `update`
        self.recorder = {'a_loss':[],
                         'v_loss':[],
                         'e_loss':[],
                         'ratio':[]}

        self.best_score = -np.inf

        # last `info['status']` seen by reward_fn; must exist before the
        # first call, otherwise reward_fn raises AttributeError
        self.last_state = 'small'

    @staticmethod
    def _weights_init(m):
        """Xavier-initialise the weight of any module that has one; bias = 0.1."""
        if hasattr(m,'weight'):
            nn.init.xavier_uniform_(m.weight)
            # a layer built with bias=False has `bias is None`
            if getattr(m,'bias',None) is not None:
                nn.init.constant_(m.bias, 0.1)

    def sync(self):
        """Copy the parameters of actor_new into actor_old."""
        for old_param, new_param in zip(self.actor_old.parameters(),self.actor_new.parameters()):
            old_param.data.copy_(new_param.data)

    def get_action(self,state):
        """Sample an action index from the current policy.

        Args:
            state: either a single observation (3-D, height x width x channels)
                or a batch containing one observation (4-D). A 3-D input gets a
                leading batch axis added, so both call styles work.

        Returns:
            The sampled action as a Python int.
        """
        # np.array first: building a tensor from a list of ndarrays is very slow
        state = torch.FloatTensor(np.array(state)).to(self.device)
        if state.dim() == 3:
            state = state.unsqueeze(0)

        # acting needs no gradients
        with torch.no_grad():
            probs = self.actor_new(state) # softmax probabilities
        dist = Categorical(probs) # categorical distribution over actions
        act = dist.sample() # sample an action from that distribution
        return act.item()

    def get_value(self,state):
        """Return the critic's value estimate for one unbatched observation."""
        state = torch.FloatTensor(np.array([state])).to(self.device)
        with torch.no_grad():
            value = self.critic(state)
        return value.item()

    def compute_returns(self,rewards):
        """Compute discounted returns G_t = r_t + gamma * G_{t+1} for one episode.

        Args:
            rewards: sequence of per-step rewards in chronological order.

        Returns:
            Float tensor of shape (len(rewards), 1) on `self.device`.
        """
        returns = []
        G = 0

        # accumulate from the last step backwards, then restore time order
        for r in reversed(rewards):
            G = r + self.gamma*G
            returns.append(G)
        returns.reverse()

        returns = np.array(returns).ravel()
        return torch.FloatTensor(returns).to(self.device).view(-1, 1)

    def normalize_img(self,x):
        """Scale pixel values by 1/255."""
        return x / 255.0

    def update(self,trajectory):
        """Run K_epoch clipped-surrogate updates on one episode.

        Args:
            trajectory: list of [state, action, reward, next_state, done]
                entries as recorded by `train`. Only state, action and reward
                are used. An empty trajectory is ignored.
        """
        if len(trajectory) == 0:
            return

        print('update...')

        # unpack the trajectory (next_state and done are not needed here)
        state = torch.FloatTensor(np.array([sars[0] for sars in trajectory])).to(self.device)
        action = torch.LongTensor([sars[1] for sars in trajectory]).to(self.device).view(-1, 1)
        rewards = [sars[2] for sars in trajectory]

        # both quantities are constant across the K epochs, so compute them once
        returns = self.compute_returns(rewards)
        with torch.no_grad():
            # actor_old is frozen: no graph is built and no grads accumulate on it
            old_p = torch.gather(self.actor_old(state),1,action)

        # update K_epoch times
        for _ in range(self.K_epoch):

            # critic loss: half mean squared error between returns and values
            values = self.critic(state)
            advantage = returns - values
            critic_loss = (0.5*(advantage**2)).mean()
            self.recorder['v_loss'].append(critic_loss.item())

            # actor loss: clipped surrogate objective minus an entropy bonus
            new_probs = self.actor_new(state) # one forward pass, reused below
            new_p = torch.gather(new_probs,1,action)
            ratio = new_p / old_p
            self.recorder['ratio'].append(ratio.mean().item())

            detached_advantage = advantage.detach()
            surr1 = ratio * detached_advantage
            surr2 = torch.clamp(ratio,1 - self.clip,1 + self.clip) * detached_advantage
            entropy_loss = Categorical(new_probs).entropy().mean()
            self.recorder['e_loss'].append(entropy_loss.item())

            actor_loss = -torch.min(surr1,surr2).mean() - 0.001*entropy_loss
            self.recorder['a_loss'].append(actor_loss.item())

            # update critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # update actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

        # sync actor_old and actor_new
        self.sync()
        print('update ok!')

    def train(self,max_episodes):
        """Play `max_episodes` episodes, updating the agent along the way.

        Returns:
            List with the shaped reward of every episode.
        """
        episode_rewards = []
        for episode in range(max_episodes):

            # initialize all (use the agent's own env, not a notebook global)
            state = self.env.reset()
            state = self.normalize_img(state) # preprocess the frame
            self.last_state = 'small' # forget the previous episode's status
            trajectory = []
            episode_reward = 0
            done = False

            # game loop
            while not done:
                self.env.render()
                action = self.get_action(state)
                next_state, reward, done, info = self.env.step(action)
                reward,done = self.reward_fn(reward,done,info)
                next_state = self.normalize_img(next_state) # preprocess the frame

                # record
                trajectory.append([state, action, reward, next_state, done])
                episode_reward += reward

                # change state
                state = next_state

            # game over
            print("Episode {} reward {} best {}".format(episode,
                                                        episode_reward,
                                                        self.best_score))

            # record this episode's score
            episode_rewards.append(episode_reward)

            # update the agent only when this episode did not beat the best score
            # NOTE(review): PPO is usually updated after every rollout - confirm
            # that skipping record-setting episodes is intentional.
            if episode_reward <= self.best_score:
                self.update(trajectory) #update

            # update the best score so far
            self.best_score = max(self.best_score,episode_reward)

            # tensorboard update
            self._log_episode(episode,episode_reward)

        return episode_rewards

    def _log_episode(self,episode,episode_reward):
        """Write the episode score and the most recent losses to TensorBoard."""
        writer.add_scalar('score',episode_reward,episode)

        # the loss lists stay empty until the first call to `update`
        if self.recorder['v_loss']:
            for key in ('v_loss','a_loss','e_loss','ratio'):
                writer.add_scalar(key,self.recorder[key][-1],episode)

    def reward_fn(self,reward,done,info):
        """Shape the env reward and termination flag from the step's `info`.

        Adds 100 to the reward when `info['status']` is not 'small' and differs
        from the status seen on the previous call; ends the episode as soon as
        `info['life']` is anything other than 2.

        Returns:
            Tuple (reward, done) after shaping.
        """
        # if agent eat something
        if info['status'] != 'small':
            if info['status'] != self.last_state:
                print('eat!')
                reward += 100

        # if agent died
        if info['life'] != 2:
            print('died')
            done = True

        # update agent's status
        self.last_state = info['status']
        return reward,done

    def play(self,max_episodes):
        """Render `max_episodes` episodes with the current policy, then close the env."""
        for episode in range(max_episodes):
            # initialize all
            state = self.env.reset()
            state = self.normalize_img(state) # scale the frame into [0,1]
            episode_reward = 0
            done = False

            # game loop
            while not done:
                self.env.render()
                time.sleep(0.02)
                # get_action adds the batch axis the networks require
                action = self.get_action(state)
                next_state, reward, done, _ = self.env.step(action)
                next_state = self.normalize_img(next_state) # scale the frame into [0,1]
                episode_reward += reward
                state = next_state

            # game over
            print("Episode " + str(episode) + ": " + str(episode_reward))

        self.env.close()

# env wrapper

In [5]:
import gym
from gym import spaces
import cv2

class MaxAndSkipEnv(gym.Wrapper):
    """Repeat each action `skip` times and return the pixel-wise max of the
    last two frames of that run.

    Args:
        env: environment to wrap.
        is_render: when true, the wrapped env is rendered after every inner step.
        skip: number of times each action is repeated.
    """

    def __init__(self, env, is_render, skip = 4):
        super().__init__(env)
        # the buffer only has room for two frames
        buffer_shape = (2,) + env.observation_space.shape
        self._obs_buffer = np.zeros(buffer_shape, dtype=np.uint8)
        self._skip = skip
        self.is_render = is_render

    def step(self, action):
        """Apply `action` up to `skip` times, summing the rewards.

        The loop stops early when the env reports done. In that case the
        buffer slots that were not reached keep whatever an earlier call
        stored in them.
        """
        total_reward = 0.0
        done = None
        second_last = self._skip - 2

        # repeat the action k times
        for i in range(self._skip):
            obs, reward, done, info = self.env.step(action)

            if self.is_render:
                self.env.render()

            # slot 0 <- second-to-last repeat, slot 1 <- last repeat
            slot = i - second_last
            if slot == 0 or slot == 1:
                self._obs_buffer[slot] = obs

            total_reward += reward

            if done:
                break

        # max-pool over the time axis
        max_frame = self._obs_buffer.max(axis=0)
        return max_frame, total_reward, done, info

    def reset(self, **kwargs):
        """Reset the wrapped env and pass its observation through unchanged."""
        return self.env.reset(**kwargs)

In [6]:
class WarpFrame(gym.ObservationWrapper):
    """Resize observations to (height, width) and optionally convert them to
    single-channel grayscale.

    Args:
        env: environment to wrap.
        width: target frame width.
        height: target frame height.
        grayscale: when true, frames are converted with COLOR_RGB2GRAY and the
            result keeps a trailing channel axis of size 1.
        dict_space_key: when given, only the entry stored under this key of a
            dict observation is warped; otherwise the observation itself is
            the frame.
    """

    def __init__(self, env, width=84, height=84, grayscale=True, dict_space_key=None):
        super().__init__(env)
        self._width = width
        self._height = height
        self._grayscale = grayscale
        self._key = dict_space_key

        num_colors = 1 if self._grayscale else 3
        frame_shape = (self._height, self._width, num_colors)
        new_space = gym.spaces.Box(low=0, high=255, shape=frame_shape, dtype=np.uint8)

        # swap in the new space, remembering the one it replaces
        if self._key is not None:
            original_space = self.observation_space.spaces[self._key]
            self.observation_space.spaces[self._key] = new_space
        else:
            original_space = self.observation_space
            self.observation_space = new_space

        # the replaced space must describe 3-D uint8 images
        assert original_space.dtype == np.uint8 and len(original_space.shape) == 3

    def observation(self, obs):
        """Return `obs` with its frame grayscaled (if enabled) and resized."""
        frame = obs if self._key is None else obs[self._key]

        if self._grayscale:
            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)

        target_size = (self._width, self._height)
        frame = cv2.resize(frame, target_size, interpolation=cv2.INTER_AREA)

        if self._grayscale:
            # restore the channel axis on the 2-D grayscale frame
            frame = np.expand_dims(frame, -1)

        if self._key is None:
            return frame

        # dict observation: replace the frame on a copy, leave the input intact
        obs = obs.copy()
        obs[self._key] = frame
        return obs

# main

{'coins': 0, 'flag_get': False, 'life': 2, 'score': 0, 'stage': 1, 'status': 'small', 'time': 400, 'world': 1, 'x_pos': 40, 'x_pos_screen': 40, 'y_pos': 79}

In [None]:
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT,COMPLEX_MOVEMENT

# Create the environment and wrap it, innermost first: the SIMPLE_MOVEMENT
# action set, grayscale frame warping, then action repeat (4x) with rendering on.
env = MaxAndSkipEnv(
    WarpFrame(
        JoypadSpace(gym_super_mario_bros.make('SuperMarioBros-v0'), SIMPLE_MOVEMENT),
        grayscale=True,
    ),
    is_render=True,
    skip=4,
)

# Create the agent with its default hyperparameters
agent = PPOAgent(env)

# Train; `history` receives the list of per-episode rewards
history = agent.train(max_episodes=1000)

died
Episode 0 reward 230.0 best -inf
died
Episode 1 reward 765.0 best 230.0
died
Episode 2 reward 814.0 best 765.0
died
Episode 3 reward 228.0 best 814.0
update...
update ok!
died
Episode 4 reward 197.0 best 814.0
update...
update ok!
died
Episode 5 reward 180.0 best 814.0
update...
update ok!
died
Episode 6 reward 507.0 best 814.0
update...
update ok!
died
Episode 7 reward 801.0 best 814.0
update...
update ok!
died
Episode 8 reward 854.0 best 814.0
died
Episode 9 reward 1239.0 best 854.0
died
Episode 10 reward 718.0 best 1239.0
update...
update ok!
died
Episode 11 reward 591.0 best 1239.0
update...
update ok!
died
Episode 12 reward 1164.0 best 1239.0
update...
update ok!
died
Episode 13 reward 1080.0 best 1239.0
update...
update ok!
died
Episode 14 reward 968.0 best 1239.0
update...
update ok!
eat!
died
Episode 15 reward 1083.0 best 1239.0
update...
update ok!
died
Episode 16 reward 582.0 best 1239.0
update...
update ok!
died
Episode 17 reward 545.0 best 1239.0
update...
update ok!
d

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Per-episode reward, overlaid with its 10-episode rolling mean
smoothed = pd.Series(history).rolling(10).mean()
plt.plot(history)
plt.plot(smoothed)

In [None]:
# Run 10 rendered episodes with the current policy (actions are sampled from it)
agent.play(max_episodes=10)

In [None]:
# Close the environment explicitly. agent.play() also closes it when it
# finishes, so this matters mainly when play() was skipped or interrupted.
agent.env.close()