# Kung_Fu Asynchronous Advantage Actor-Critic (A3C)

In [1]:
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import gymnasium as gym
from gymnasium import ObservationWrapper
from gymnasium.spaces import Box

## Creating the architecture of the Neural Network
![Description](https://gymnasium.farama.org/_images/kung_fu_master.gif)

In [2]:
class Network(nn.Module):

    def __init__(self, action_size):
        super(Network, self).__init__()
        self.conv1 = torch.nn.Conv2d(in_channels = 4,  out_channels = 32, kernel_size = (3,3), stride = 2)
        self.conv2 = torch.nn.Conv2d(in_channels = 32, out_channels = 32, kernel_size = (3,3), stride = 2)
        self.conv3 = torch.nn.Conv2d(in_channels = 32, out_channels = 32, kernel_size = (3,3), stride = 2)
        self.flatten = torch.nn.Flatten()
        self.fc1  = torch.nn.Linear(512, 128)
        self.fc2a = torch.nn.Linear(128, action_size)
        self.fc2s = torch.nn.Linear(128, 1)

    def forward(self, state):
        x = self.conv1(state)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = self.conv3(x)
        x = F.relu(x)
        x = self.flatten(x)
        x = self.fc1(x)
        x = F.relu(x)
        action_values = self.fc2a(x)
        state_value = self.fc2s(x).squeeze(-1)
        return action_values, state_value

## Setting up the environment

In [3]:
class PreprocessAtari(ObservationWrapper):

    def __init__(self, env, height = 42, width = 42, crop = lambda img: img, dim_order = 'pytorch', color = False, n_frames = 4):
        super(PreprocessAtari, self).__init__(env)
        self.img_size = (height, width)
        self.crop = crop
        self.dim_order = dim_order
        self.color = color
        self.frame_stack = n_frames
        n_channels = 3 * n_frames if color else n_frames
        obs_shape = {'tensorflow': (height, width, n_channels), 'pytorch': (n_channels, height, width)}[dim_order]
        self.observation_space = Box(0.0, 1.0, obs_shape)
        self.frames = np.zeros(obs_shape, dtype = np.float32)

    def reset(self):
        self.frames = np.zeros_like(self.frames)
        obs, info = self.env.reset()
        self.update_buffer(obs)
        return self.frames, info

    def observation(self, img):
        img = self.crop(img)
        img = cv2.resize(img, self.img_size)
        if not self.color:
            if len(img.shape) == 3 and img.shape[2] == 3:
                img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        img = img.astype('float32') / 255.
        if self.color:
            self.frames = np.roll(self.frames, shift = -3, axis = 0)
        else:
            self.frames = np.roll(self.frames, shift = -1, axis = 0)
        if self.color:
            self.frames[-3:] = img
        else:
            self.frames[-1] = img
        return self.frames

    def update_buffer(self, obs):
        self.frames = self.observation(obs)

def make_env():
    env = gym.make("ALE/KungFuMaster-v5", render_mode = 'rgb_array')
    env = PreprocessAtari(env, height = 42, width = 42, crop = lambda img: img, dim_order = 'pytorch', color = False, n_frames = 4)
    return env

env = make_env()

state_shape = env.observation_space.shape
number_actions = env.action_space.n
print("State shape:", state_shape)
print("Number actions:", number_actions)
print("Action names:", env.unwrapped.get_action_meanings())

State shape: (4, 42, 42)
Number actions: 14
Action names: ['NOOP', 'UP', 'RIGHT', 'LEFT', 'DOWN', 'DOWNRIGHT', 'DOWNLEFT', 'RIGHTFIRE', 'LEFTFIRE', 'DOWNFIRE', 'UPRIGHTFIRE', 'UPLEFTFIRE', 'DOWNRIGHTFIRE', 'DOWNLEFTFIRE']


A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


In [4]:
learning_rate = 1e-4
discount_factor = 0.99
number_environments = 32

## Implementing the A3C model

In [5]:
class Agent:

    def __init__(self, action_size):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.action_size = action_size
        self.network = Network(action_size).to(self.device)
        self.optimizer = optim.Adam(self.network.parameters(), lr = learning_rate)

    def act(self, state):
        if state.ndim == 3:
            state = [state]
        state = torch.tensor(state, dtype = torch.float32, device=self.device)
        action_values, _ = self.network(state)
        policy = F.softmax(action_values, dim = -1)
        return np.array([np.random.choice(len(p), p = p) for p in policy.cpu().detach().numpy()])

    def step(self, state, action, reward, next_state, done):
        batch_size = state.shape[0]

        # Convert states to numpy arrays if they are lists of arrays
        if isinstance(state, list):
            state = np.array(state)
        if isinstance(next_state, list):
            next_state = np.array(next_state)
        
        state = torch.tensor(state, dtype = torch.float32, device=self.device)
        next_state = torch.tensor(next_state, dtype = torch.float32, device=self.device)
        reward = torch.tensor(reward, dtype = torch.float32, device=self.device)
        done = torch.tensor(done, dtype = torch.bool, device=self.device).to(dtype=torch.float32)
        action_values, state_values = self.network(state)
        _, next_state_values = self.network(next_state)
        target_state_values = reward + discount_factor * next_state_values * (1 - done)

        # Ensure the shapes match
        target_state_values = target_state_values.view(-1)
        state_values = state_values.view(-1)
        
        advantage = target_state_values - state_values
        probs = F.softmax(action_values, dim = -1)
        logprobs = F.log_softmax(action_values, dim = -1)
        entropy = -torch.sum(probs * logprobs, dim = -1)
        batch_idx = np.arange(batch_size)
        logp_actions = logprobs[batch_idx, action]
        actor_loss = -(logp_actions * advantage.detach()).mean() - 0.001 * entropy.mean()
        critic_loss = F.mse_loss(target_state_values.detach(), state_values)
        total_loss = actor_loss + critic_loss
        self.optimizer.zero_grad()
        total_loss.backward()
        self.optimizer.step()

In [6]:
# Initialize the agent
agent = Agent(number_actions)

In [7]:
# Evaluate the agent
def evaluate(agent, env, n_episodes = 1):
    episodes_rewards = []
    for _ in range(n_episodes):
        state, _ = env.reset()
        done = False
        total_reward = 0
        while True:
            action = agent.act(state)
            next_state, reward, done, info, _ = env.step(action[0])
            total_reward += reward
            if done:
                break
        episodes_rewards.append(total_reward)
    return episodes_rewards

In [8]:
# Managing multiple environments
class EnvBatch:
    
    def __init__(self, n_envs = 16):
        self.envs = [make_env() for _ in range(n_envs)]
        
    def reset(self):
        _states = []
        for env in self.envs:
            _states.append(env.reset()[0])
        return np.array(_states)

    def step(self, actions):
        next_states, rewards, dones, infos, _ = map(np.array, zip(*[env.step(a) for env, a in zip(self.envs, actions)]))
        for i in range(len(self.envs)):
            if dones[i]:
                next_states[i] = self.envs[i].reset()[0]
        return next_states, rewards, dones, infos

## Training the agent

In [115]:
import tqdm
import os
import imageio

# Directory to store the videos
video_dir = 'kung_fu_training_videos'
os.makedirs(video_dir, exist_ok=True)

env_batch = EnvBatch(number_environments)
batch_states = env_batch.reset()

total_number_iterations = 300000

with tqdm.trange(0, total_number_iterations) as progress_bar:
    for i in progress_bar:
        batch_actions = agent.act(batch_states)
        batch_next_states, batch_rewards, batch_dones, _ = env_batch.step(batch_actions)
        batch_rewards *= 0.01
        agent.step(batch_states, batch_actions, batch_rewards, batch_next_states, batch_dones)
        batch_states = batch_next_states
        # Capture video for the first 5 iterations and every 500th iteration
        if i in [0, 1000, 5000] or i % 20000 == 0:
            episode_frames = []
            state, _ = env.reset()
            done = False
            while not done:
                frame = env.render()
                frame = frame[:frame.shape[0] - frame.shape[0] % 16, :frame.shape[1] - frame.shape[1] % 16]
                episode_frames.append(frame)
                action = agent.act(np.expand_dims(state, axis=0))[0] 
                state, reward, done, _, _ = env.step(action)
            env.close()

            # Save video
            video_filename = f'kung_fu_training_video_{i}.mp4'
            video_file_loc = os.path.join(video_dir, video_filename)
            imageio.mimsave(video_file_loc, episode_frames, fps=30)

            print("Saved video for iteration:", i)
            print("Average agent reward:", np.mean(evaluate(agent, env, n_episodes=16)))
    
    print("Training completed!")
    # Save the model
    torch.save(agent.network.state_dict(), f'kung_fu_agent_number_training_{total_number_iterations}.pth')

  0%|          | 0/300000 [00:00<?, ?it/s]

Saved video for iteration: 0


  0%|          | 5/300000 [00:28<357:34:34,  4.29s/it] 

Average agent reward: 525.0


  0%|          | 998/300000 [01:01<2:32:16, 32.72it/s]

Saved video for iteration: 1000


  0%|          | 1005/300000 [01:32<145:29:20,  1.75s/it]

Average agent reward: 831.25


  2%|▏         | 4998/300000 [03:36<2:32:58, 32.14it/s]  

Saved video for iteration: 5000


  2%|▏         | 5005/300000 [04:03<123:15:32,  1.50s/it]

Average agent reward: 375.0


  7%|▋         | 19999/300000 [11:46<2:17:43, 33.88it/s] 

Saved video for iteration: 20000


  7%|▋         | 20005/300000 [12:16<141:37:12,  1.82s/it]

Average agent reward: 725.0


 13%|█▎        | 39998/300000 [22:59<2:09:37, 33.43it/s]  

Saved video for iteration: 40000


 13%|█▎        | 40005/300000 [23:49<202:22:23,  2.80s/it]

Average agent reward: 3918.75


 20%|██        | 60000/300000 [33:38<2:03:50, 32.30it/s]  

Saved video for iteration: 60000


 20%|██        | 60005/300000 [34:15<157:31:01,  2.36s/it]

Average agent reward: 1687.5


 27%|██▋       | 79997/300000 [44:27<1:48:11, 33.89it/s]  

Saved video for iteration: 80000


 27%|██▋       | 80005/300000 [45:02<113:32:25,  1.86s/it]

Average agent reward: 2725.0


 33%|███▎      | 99997/300000 [54:58<1:34:35, 35.24it/s]  

Saved video for iteration: 100000


 33%|███▎      | 100005/300000 [55:26<82:01:50,  1.48s/it] 

Average agent reward: 1462.5


 40%|████      | 120000/300000 [1:05:27<1:26:29, 34.69it/s]

Saved video for iteration: 120000


 40%|████      | 120005/300000 [1:05:50<72:34:05,  1.45s/it] 

Average agent reward: 118.75


 47%|████▋     | 139998/300000 [1:16:26<1:21:58, 32.53it/s] 

Saved video for iteration: 140000


 47%|████▋     | 140005/300000 [1:16:49<55:20:56,  1.25s/it]

Average agent reward: 0.0


 53%|█████▎    | 159998/300000 [1:27:44<1:23:19, 28.01it/s] 

Saved video for iteration: 160000


 53%|█████▎    | 160005/300000 [1:28:06<51:20:01,  1.32s/it]

Average agent reward: 0.0


 60%|█████▉    | 179998/300000 [1:39:32<1:15:25, 26.52it/s] 

Saved video for iteration: 180000


 60%|██████    | 180005/300000 [1:39:56<51:01:51,  1.53s/it]

Average agent reward: 0.0


 67%|██████▋   | 199999/300000 [1:50:58<55:07, 30.23it/s]   

Saved video for iteration: 200000


 67%|██████▋   | 200003/300000 [1:51:21<45:54:00,  1.65s/it]

Average agent reward: 0.0


 73%|███████▎  | 220000/300000 [2:02:11<1:00:35, 22.01it/s] 

Saved video for iteration: 220000


 73%|███████▎  | 220005/300000 [2:02:35<36:30:34,  1.64s/it]

Average agent reward: 0.0


 80%|███████▉  | 239999/300000 [2:13:24<32:59, 30.32it/s]   

Saved video for iteration: 240000


 80%|████████  | 240005/300000 [2:13:45<21:01:32,  1.26s/it]

Average agent reward: 0.0


 87%|████████▋ | 259999/300000 [2:24:49<20:53, 31.92it/s]   

Saved video for iteration: 260000


 87%|████████▋ | 260004/300000 [2:25:14<18:12:30,  1.64s/it]

Average agent reward: 0.0


 93%|█████████▎| 279997/300000 [2:36:26<10:40, 31.25it/s]   

Saved video for iteration: 280000


 93%|█████████▎| 280005/300000 [2:36:49<6:43:08,  1.21s/it]

Average agent reward: 0.0


100%|██████████| 300000/300000 [2:47:58<00:00, 29.77it/s]  

Training completed!





In [10]:
import glob
import io
import base64
import imageio
from datetime import datetime
from IPython.display import HTML, display

# Load the model with the best performance
agent.network.load_state_dict(torch.load('kung_fu_agent_number_training_300000.pth', weights_only=False))

# Initialize the environment
def show_video_of_model(agent, env_name):
    # env = gym.make(env_name, render_mode='rgb_array')
    state, _ = env.reset()
    done = False
    frames = []
    while not done:
        frame = env.render()
        frame = frame[:frame.shape[0] - frame.shape[0] % 16, :frame.shape[1] - frame.shape[1] % 16]
        frames.append(frame)
        action = agent.act(state)
        state, reward, done, _, _ = env.step(action[0])
    env.close()
    
    now = datetime.now().strftime("%b%d_%H-%M-%S")
    video_filename = f'video_kungfu_master_{now}.mp4'
    
    imageio.mimsave(video_filename, frames, fps=30, quality=10)
    
# Display the video
env = make_env()
show_video_of_model(agent, env)